#!/usr/bin/perl -w
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

# Main program: parse the command line, read the AARAPDF file, and print one
# line per residue containing the residue label, the RAPDF_spread or RAPDF_dif
# score, and the raw comma-separated score list (tab-delimited).

# command-line switches (set by GetOptions) and positional arguments
our ($verbose, $help, $man);
our ($inputfile, $outtype, $normlength);

GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'normlength'=>\$normlength) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 2 or pod2usage ("Syntax error");

($inputfile, $outtype) = @ARGV;

# the output type is the second positional argument (there is no --outtype option)
$outtype eq 'spread' or $outtype eq 'dif' or pod2usage ("Error in argument: the output type argument can be spread or dif only");

# three-argument open with a lexical handle, so that special characters in the
# file name can never be interpreted as an open mode or a command
open (my $input_fh, '<', $inputfile) or confess "Error: cannot open input AARAPDF file $inputfile: $!";

my $num_residue = 0;			# number of residue records read (used by --normlength)
my (@res, @outscore, @raw_score);	# parallel arrays, one element per residue
while (my $line = <$input_fh>) {
	$line =~ s/[\r\n]+\z//;		# strip the line ending, including DOS-style CR/LF
	$line =~ /\S/ or next;		# tolerate blank lines (e.g. at the end of the file)

	# fields: residue label, (unused field), comma-separated scores, index of wild-type score
	my ($res, undef, $scores, $wt_index) = split (/\t/, $line);
	defined $scores or confess "Error: invalid record at line $. of $inputfile (at least 3 tab-delimited fields expected): <$line>";
	my @scores = split (/,/, $scores);
	@scores or confess "Error: invalid record at line $. of $inputfile (no scores found in the third field): <$line>";

	if ($outtype eq 'spread') {
		push @outscore, sd (@scores);
	} else {
		# the wild-type index must be an integer that selects an existing score,
		# otherwise the difference would silently be computed from the wrong score
		my ($index) = defined $wt_index ? $wt_index =~ /^\s*(-?\d+)\s*\z/ : ();
		defined $index and defined $scores[$index] or confess "Error: invalid record at line $. of $inputfile (the fourth field must be a valid index into the score list): <$line>";
		push @outscore, $scores[$index] - mean (@scores);
	}

	push @res, $res;
	push @raw_score, $scores;
	$num_residue++;
}
close ($input_fh);

$normlength and print STDERR "NOTICE: Scores are normalized by the length of the structure $num_residue residues\n";

# normalization can only be applied here, once the total number of residues is known
for my $i (0 .. $#res) {
	my $value = $normlength ? $outscore[$i]/$num_residue : $outscore[$i];
	print $res[$i], "\t", $value, "\t", $raw_score[$i], "\n";
}



# mean (@values): arithmetic mean of a list of numbers.
#
# Arguments: one or more numeric values.
# Returns:   the sum of the values divided by their count.
# Dies:      (via croak) when called without any value, instead of failing
#            with an opaque "Illegal division by zero" error.
sub mean {
	@_ or croak "Error: mean() requires at least one value";
	my $result = 0;
	$result += $_ for (@_);
	return $result/@_;		# @_ in numeric context is the number of values
}

# sd (@values): sample standard deviation (n-1 denominator) of a list of numbers.
#
# Arguments: one or more numeric values.
# Returns:   the square root of the sum of squared deviations from the mean,
#            divided by (count - 1); 0 when only a single value is given.
# Dies:      (via croak) when called without any value.
sub sd {
	my $count = @_;
	$count or croak "Error: sd() requires at least one value";

	# a single observation has no spread, and the n-1 denominator would be
	# zero, so report 0 rather than dying with a division-by-zero error
	return 0 if $count == 1;

	my $sum = 0;
	$sum += $_ for (@_);
	my $mean = $sum/$count;

	my $result = 0;
	$result += ($_-$mean)*($_-$mean) for (@_);
	return sqrt ($result/($count-1));
}


=head1 SYNOPSIS

generate_rapdf_dif_spread.pl [arguments] <aarapdf-file> <dif|spread>

 Optional arguments:
 	-v, --verbose			use verbose output
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	
 	-n, --normlength		normalize score by protein length

 Function: given an AARAPDF file containing RAPDF scores for each mutation for 
 each residue in a protein structure, generate the RAPDF_dif and the 
 RAPDF_spread score for use in the Meta-Functional Signature (MFS) generation 
 protocol.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--normlength>

normalize the scores by the protein length (the number of residues read from 
the input file). This is an experimental feature: it is of limited use and is 
not recommended.

=back

=head1 DESCRIPTION

This program is used to calculate RAPDF_dif or RAPDF_spread score, given an 
AARAPDF file generated by the scoreaa_by_structure.pl program. These new score 
files can be used by the MFS (meta-functional signature) system for 
interpretation of functional importance of individual amino acid residues.

=cut
              