#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

#command-line options; they are package variables so that they stay visible to the whole script
our ($verbose, $help, $man);
our ($modelfile, $logodds, $multicolumn, $expand, $tolerate);
our (@scorefile);

#'logodds:1' gives --logodds an optional integer value: a bare --logodds stores 1, while
#"--logodds 0" stores 0 and selects normalization on probability values. A plain flag
#specification would leave the "0" in @ARGV, where it would be mistaken for a scoring file.
GetOptions (
	'verbose'		=> \$verbose,
	'help'			=> \$help,
	'man|m'			=> \$man,
	'modelfile=s'		=> \$modelfile,
	'logodds:1'		=> \$logodds,
	'multicolumn'		=> \$multicolumn,
	'expanded_output'	=> \$expand,
	'tolerate'		=> \$tolerate,
) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);		#at least one scoring file must be given
defined $logodds or $logodds = 1;		#we always set --logodds as 1 by default, which means that logodds value (instead of probability value) be normalized, so the MFS score distributions are less skewed

defined $modelfile or pod2usage ("Error in argument: you must specify the --modelfile option for MFS calculation");

#all remaining command-line arguments are scoring function files
@scorefile = @ARGV;

my ($coef_score, $coef_aa, $coef_cons) = readModelFile ($modelfile);	#coefficient for each function, for each amino acid and for the constant
my (@atom_name, @score, @norm_score, @sort_score, @ind_score, @expand_score);
my ($max_score, $min_score);

#the 20 standard amino acids, and the position of each one-letter code within that list
my @aa = split (//, 'ACDEFGHIKLMNPQRSTVWY');
my %aa_index;
@aa_index{@aa} = (0 .. $#aa);

@scorefile == 4 or @scorefile == 2 or pod2usage ("Error in argument: at this point only 4 (for MFS) or 2 (for seqonly MFS) scoring function files can be handled for this program");

#the model must provide exactly one coefficient per scoring file
if (@$coef_score != @scorefile) {
	confess "Error: there are ${\(scalar @$coef_score)} functions in model file $modelfile, but ${\(scalar @scorefile)} scoring files are supplied";
}

#read every scoring file in turn and accumulate the raw MFS score of each residue in @score.
#The first file defines the residue list (@atom_name); every later file must describe the same residues in the same order.
for my $i (0 .. $#scorefile) {
	#three-argument open with a lexical handle: the file name is never interpreted as a mode or a command
	open (my $score_fh, '<', $scorefile[$i]) or confess "Error: cannot read from scoring file $scorefile[$i]: $!";
	my $count = 0;
	while (my $line = <$score_fh>) {
		$line =~ m/^((\d+)_(\w))\t(\S+)/ or confess "Error: invalid score record in $scorefile[$i]: <$line>";
		my ($aa_identity, $position, $aa_type, $aa_score) = ($1, $2, $3, $4);
		
		#do this only once, score the @atom_name, and generate the initial @score array that contain only the coefficients of Amino Acid
		if ($i == 0) {
			#the residues are identical in all files, so warning while reading the first file is sufficient
			if (not defined $coef_aa->{uc $aa_type}) {
				print STDERR "WARNING: The amino acid $aa_type is not included in the model file $modelfile and will be treated as A\n";
			}
			push @atom_name, $aa_identity;
			push @score, ($coef_aa->{uc $aa_type} || 0) + $coef_cons;
		}
		
		#a later file must not describe more residues than the first file
		$count < @atom_name or confess "Error: scorefile $scorefile[$i] contains more records than scorefile $scorefile[0] (${\(scalar @atom_name)} records)";
		
		if ($aa_identity ne $atom_name[$count]) {
			if ($tolerate) {
				#with --tolerate only the residue letter has to agree; the residue numbering may differ
				my ($ref_type) = $atom_name[$count] =~ m/(\w)$/;
				$aa_type eq $ref_type or confess "Error: the amino acid identity in scorefile $scorefile[$i] does not match scorefile $scorefile[0] in line ${\($count+1)}: $aa_identity vs $atom_name[$count]";
			} else {
				confess "Error: the amino acid count or identity in scorefile $scorefile[$i] does not match scorefile $scorefile[0] in line ${\($count+1)}: $aa_identity vs $atom_name[$count]";
			}
		}
		
		#add the weighted contribution of this scoring function to the raw score of the residue
		$score[$count] += $aa_score * $coef_score->[$i];
		$count++;
		
		#the individual scores are only needed for the multi-column output
		if ($multicolumn) {
			push @{$ind_score[$i]}, $aa_score;
		}
	}
	close ($score_fh);
	
	#a shorter file would leave the trailing residues without the contribution of this scoring function
	$count == @atom_name or confess "Error: scorefile $scorefile[$i] contains $count records, but scorefile $scorefile[0] contains ${\(scalar @atom_name)} records";
}

#normalizing raw MFS scores to be between 0 and 1: the logodds values are used directly,
#otherwise they are first converted to probabilities as exp(x)/(1+exp(x))
@norm_score = $logodds ? @score : (map { exp ($_) / (1 + exp ($_)) } @score);
normalizeScore (\@norm_score);

#getting the maximum and minimum raw score, so that the normalized MFS for each mutation can be calculated
if ($expand) {
	@sort_score = sort {$a <=> $b} @score;
	$min_score = $sort_score[0];
	$max_score = $sort_score[-1];
}

#if the --expanded_output argument is set, calculate the MFS scores for each mutation in each residue, and store them in @expand_score array
if ($expand) {
	#bounds used to scale the mutation scores; they are the extremes of the wildtype raw scores.
	#When probabilities are requested the bounds are converted as well (the conversion is monotonic,
	#so the converted extremes are the extremes of the converted wildtype scores).
	my ($scale_min, $scale_max) = @score ? ($min_score, $max_score) : (0, 0);
	if (not $logodds) {
		($scale_min, $scale_max) = map { exp ($_) / (1 + exp ($_)) } ($scale_min, $scale_max);
	}
	my $scale_range = $scale_max - $scale_min;
	
	#convert one raw logodds score of a mutation into its normalized value
	my $rescale = sub {
		my ($raw) = @_;
		my $value = $logodds ? $raw : exp ($raw) / (1 + exp ($raw));
		#make sure the denominator is not zero; the unscaled value is kept in that case
		return $scale_range ? ($value - $scale_min) / $scale_range : $value;
	};
	
	if (@scorefile == 4) {
		#the third column of the last scoring file lists the RAPDF score of every possible mutation
		my $last_file_index = $#scorefile;
		open (my $score_fh, '<', $scorefile[$last_file_index]) or confess "Error: cannot read from scoring file $scorefile[$last_file_index]: $!";
		my $count = 0;
		while (my $line = <$score_fh>) {
			$line =~ m/^((\d+)_(\w))\t(\S+)\t(.+)/ or confess "Error: invalid score record in $scorefile[$last_file_index]: <$line>";
			my ($aa_type, $aarapdf_scores) = (uc $3, $5);
			my @aarapdf_scores = split (/,/, $aarapdf_scores);
			@aarapdf_scores >= @aa or confess "Error: expected ${\(scalar @aa)} comma-separated RAPDF scores in $scorefile[$last_file_index] but found ${\(scalar @aarapdf_scores)}: <$line>";
			
			#a residue that is not one of the 20 standard amino acids is treated as A (first index, coefficient 0)
			my $wt_index = exists $aa_index{$aa_type} ? $aa_index{$aa_type} : 0;
			my $wt_coef = $coef_aa->{$aa_type} || 0;
			
			for my $i (0 .. $#aa) {
				#replace the wildtype RAPDF term and amino acid coefficient by those of the mutant residue
				my $temp_score = $score[$count] + ($aarapdf_scores[$i] - $aarapdf_scores[$wt_index]) * $coef_score->[$last_file_index] + ($coef_aa->{$aa[$i]} || 0) - $wt_coef;
				$expand_score[$count]->[$i] = $rescale->($temp_score);
			}
			$count++;
		}
		close ($score_fh);
	} elsif (@scorefile == 2) {
		#sequence-only model: a mutation only changes the amino acid coefficient
		for my $count (0 .. $#atom_name) {
			my ($wt_atom) = $atom_name[$count] =~ m/(\w)$/;
			my $wt_coef = $coef_aa->{uc $wt_atom} || 0;
			for my $i (0 .. $#aa) {
				my $temp_score = $score[$count] + ($coef_aa->{$aa[$i]} || 0) - $wt_coef;
				$expand_score[$count]->[$i] = $rescale->($temp_score);
			}
		}
	}
}

#handling output formats
if (not defined $multicolumn) {
	#by default we use a tab-delimited output format: residue, normalized MFS, raw score and,
	#with --expanded_output, a comma-separated list of the normalized scores of the 20 mutations
	for my $count (0 .. @atom_name-1) {
		print $atom_name[$count], "\t", $norm_score[$count], "\t", $score[$count];
		if ($expand) {
			print "\t", join (",", map {$expand_score[$count]->[$_]} (0 .. 19));
		}		
		print "\n";
	}
} else {
	my ($head, $body);
	my $tail = '';

	#the cutoff is the tenth highest raw score, or the lowest score when the protein has fewer than ten residues
	@sort_score = sort {$b<=>$a} @score;
	my $tenthscore = $sort_score[@sort_score < 10 ? $#sort_score : 9];
	
	#tail contains the top-10 hits for the protein (the 10 residues with the highest MFS scores); ties at the cutoff are all listed
	$tail .= "The top 10 highest scoring residues are ";
	for my $i (0 .. @score-1) {
		if ($score[$i] >= $tenthscore) {
			$tail .= $atom_name[$i] . " (" . sprintf ("%4.2f", $norm_score[$i]) . "), ";
		}
	}
	$tail =~ s/, $/./;
	print $tail, "\n";
		
	#head contains the appropriate heading for each of the following columns; no closing rule is printed after the body
	if (not $expand) {
		if (@scorefile == 4) {
			$head  = "Residue     MFS | HMM          SSR  RAPDF   RAPDF     \n";
			$head .= "                | relative          spread  differ\n";
			$head .= "                | entropy                               \n";
			$head .= "----------------------------------------------------------------------\n";
		} elsif (@scorefile == 2) {
			$head  = "Residue     MFS | HMM          SSR\n";
			$head .= "                | relative        \n";
			$head .= "                | entropy         \n";
			$head .= "----------------------------------------------------------------------\n";
		}
	} else {
		if (@scorefile == 4) {
			$head  = "Residue     MFS | HMM          SSR  RAPDF   RAPDF  |    " . join ("    ", @aa) . "\n";
			$head .= "                | relative          spread  differ |\n";
			$head .= "                | entropy                          |\n";

			$head .= "--------------------------------------------------------------------------------------------------------------------------------------------------------\n";
		} elsif (@scorefile == 2) {
			$head  = "Residue     MFS | HMM          SSR |    " . join ("    ", @aa) . "\n";
			$head .= "                | relative         |\n";
			$head .= "                | entropy          |\n";
			$head .= "----------------------------------------------------------------------------------------------------------------------------------------\n";
		}
	}
	print $head;

	#body contains the actual individual scores for each of the scoring functions and possibly the MFS score for each of the mutations
	for my $i (0 .. @atom_name-1) {
		$atom_name[$i] =~ s/_//;				#1_P is shown as 1P
		$body = substr ("       $atom_name[$i]", -7, 7);	#right-align the residue name in 7 characters
		$body .= sprintf ("%8.2f", $norm_score[$i]) . " | ";
		for my $j (0 .. @scorefile-1) {
			$body .= sprintf ("%8.2f", $ind_score[$j]->[$i]);
		}
		
		if ($expand) {
			$body .= " |";
			for my $j (0 .. @aa-1) {
				#negative values get one decimal less so that the minus sign still fits in the 5-character column
				if ($expand_score[$i]->[$j] < 0) {
					$body .= sprintf ("%5.1f", $expand_score[$i]->[$j]);
				} else {
					$body .= sprintf ("%5.2f", $expand_score[$i]->[$j]);
				}
			}
		}
		
		print $body, "\n";
	}
}






#rescale the numbers in the referenced array in place, so that the smallest becomes 0 and the largest becomes 1.
#When all numbers are identical, every element is set to 0.
#argument: a reference to an array of numbers
sub normalizeScore {
	my ($refscore) = @_;
	my ($min_score, $max_score) = ($refscore->[0], $refscore->[0]);
	
	#single pass to find both extremes
	foreach my $value (@$refscore) {
		if ($value > $max_score) {
			$max_score = $value;
		}
		if ($value < $min_score) {
			$min_score = $value;
		}
	}
	
	#no spread at all: avoid the division by zero
	if ($max_score == $min_score) {
		$_ = 0 foreach @$refscore;
		return;
	}
	
	#the loop variable aliases the array elements, so the assignment updates the caller's array
	$_ = ($_ - $min_score) / ($max_score - $min_score) foreach @$refscore;
	return;
}

#read the regression model (a coefficient table printed by the STATA software) from a file.
#argument: the path of the model file
#returns:  a reference to the array of scoring function coefficients (in the order they are listed),
#          a reference to a hash that maps a one-letter amino acid code to its coefficient, and
#          the coefficient of the constant term (undef when the file has no _cons line)
#dies (confess) when the file cannot be opened or when its three header lines are missing or invalid
sub readModelFile {
	my ($modelfile) = @_;
	#the numeric suffix N of an amino acid line (for example _Ires2_N) is the 1-based position in this alphabet
	my @aa = split (//, 'ACDEFGHIKLMNPQRSTVWXY');
	#$coef_cons is private to this subroutine, so no variable of the caller is modified
	my (@coef_score, %coef_aa, $coef_cons);
	open (my $model_fh, '<', $modelfile) or confess "Error: cannot read from model file $modelfile: $!";

	#fetch one header line without its line ending, and fail cleanly on a truncated file
	my $read_header = sub {
		my ($ordinal) = @_;
		my $header = <$model_fh>;
		defined $header or confess "Error: model file $modelfile ends before the $ordinal header line";
		chomp $header;
		return $header;
	};

	#read the first three lines to make sure that the model file is indeed generated by the STATA software
	my $line = $read_header->('first');
	$line =~ m/^-+$/ or confess "Error: invalid first line in model file: <$line>";
	$line = $read_header->('second');
	$line eq '        real |      Coef.   Std. Err.      z    P>|z|     [95% Conf. Interval]' or confess "Error: invalid second line in model file: <$line>";
	$line = $read_header->('third');
	$line =~ m/^[\-\+]+$/ or confess "Error: invalid third line in model file: <$line>";
		
	#the order of the tests matters: the constant and the amino acid lines would also match the generic pattern
	while (defined ($line = <$model_fh>)) {
		chomp $line;
		if ($line =~ m/^-+$/) {
			last;					#reach the end of the model
		} elsif ($line =~ m/^\s*_cons \|\s*(\S+)/) {
			$coef_cons = $1;			#the constant term
		} elsif ($line =~ m/^\s+_\S+_(\d+) \|\s*(\S+)/) {
			$coef_aa{$aa[$1-1]} = $2;		#the coefficient of a particular amino acid
		} elsif ($line =~ m/^\s*\S+ \|\s*(\S+)/) {
			push @coef_score, $1;			#the coefficient of a scoring function
		}
	}
	close ($model_fh);
	$coef_aa{A} = 0;					#the Alanine is not listed in model file
	return (\@coef_score, \%coef_aa, $coef_cons);	
}

=head1 SYNOPSIS

 generate_meta_sig.pl [arguments]

 Arguments:
 	-v, --verbose		use verbose output
 	-h, --help		print help message
 	-m, --man		print complete documentation
 	    --modelfile <string>	the file containing MFS model (required)
 	    --multicolumn	output contains individual score for each function
 	-e, --expanded_output	expand MFS calculation for each mutation
 	-l, --logodds		normalization on logodds (default)
 	-t, --tolerate		tolerate different residue numbering between scoring files
 	

 Function: calculate MFS scores based on several scoring files and a model file 
 generated by regression

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--modelfile>

a file generated by the STATA program that contains the coefficients for each 
component of the MFS model

=item B<--multicolumn>

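use the multi-column output format, where each individual score has one column. 
A line listing the top 10 highest scoring residues is printed first. In the 
table that follows, the calculated MFS score (normalized to 0-1) is shown right 
after the residue name, followed by the raw score of each scoring function.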

=item B<--expanded_output>

the output contains not only the MFS score for wildtype protein, but also the 
MFS score for each possible mutation for each residue in the protein. This 
argument requires that the input scoring function file contains the detailed 
RAPDF scores for each mutant structure.

=item B<--logodds>

the MFS normalization is based on logodds scores, rather than probability 
values, calculated from the regression model. This option is set by default. Use 
"--logodds 0" to unset this option.

=item B<--tolerate>

tolerate the situation where two scoring files use different amino acid indexes. 
For example, one file has 500_R, but the other has 1_R as the residue identifier. 
Normally an error will be thrown, unless this option is set.

=back

=head1 DESCRIPTION

This program is used to calculate meta-functional signature scores based on 
several scoring files and a model file generated by regression.

Currently two types of models are supported: one generated from four scoring 
functions (including two structure-based functions), and one generated from two 
sequence-only scoring functions.  In turn, two model files can be used, one for 
default MFS (with four scoring functions) and one for seqonly MFS (with two 
scoring functions).

The current version of the model file for structure-based default MFS contains the 
following contents:


	------------------------------------------------------------------------------
	        real |      Coef.   Std. Err.      z    P>|z|     [95% Conf. Interval]
	-------------+----------------------------------------------------------------
	   hmmrelent |   5.192328    .123639    42.00   0.000         4.95    5.434656
	         ssr |   1.311577   .0777918    16.86   0.000     1.159108    1.464046
	    aarapdf4 |   .0187633   .0006957    26.97   0.000     .0173997    .0201269
	    aarapdf5 |   .0037688   .0003684    10.23   0.000     .0030466    .0044909
	    _Ires2_2 |   2.069348   .2699081     7.67   0.000     1.540338    2.598359
	    _Ires2_3 |   3.174386   .2500509    12.69   0.000     2.684295    3.664476
	    _Ires2_4 |   3.450401    .251067    13.74   0.000     2.958319    3.942483
	    _Ires2_5 |   .8066618   .2911906     2.77   0.006     .2359388    1.377385
	    _Ires2_6 |   .7257259   .2657338     2.73   0.006     .2048973    1.246555
	    _Ires2_7 |   3.128285   .2542768    12.30   0.000     2.629912    3.626658
	    _Ires2_8 |   .1617555   .3562298     0.45   0.650    -.5364421    .8599532
	    _Ires2_9 |   3.448015    .254365    13.56   0.000     2.949469    3.946561
	   _Ires2_10 |   .3742268   .3158052     1.18   0.236    -.2447401    .9931936
	   _Ires2_11 |    .561105   .3547015     1.58   0.114    -.1340972    1.256307
	   _Ires2_12 |   1.958035   .2651344     7.39   0.000     1.438381    2.477689
	   _Ires2_13 |  -.5058271   .3890479    -1.30   0.194    -1.268347    .2566928
	   _Ires2_14 |   1.653717   .2896534     5.71   0.000     1.086007    2.221427
	   _Ires2_15 |   3.018166   .2538173    11.89   0.000     2.520693    3.515638
	   _Ires2_16 |   2.566068   .2573003     9.97   0.000     2.061769    3.070368
	   _Ires2_17 |   2.251352   .2628584     8.56   0.000     1.736159    2.766545
	   _Ires2_18 |  -.4445313   .4133514    -1.08   0.282    -1.254685    .3656226
	   _Ires2_19 |   .5752178   .3073324     1.87   0.061    -.0271426    1.177578
	   _Ires2_21 |   2.099594   .2603405     8.06   0.000     1.589336    2.609852
	       _cons |   -10.4728   .2516399   -41.62   0.000      -10.966   -9.979592
	------------------------------------------------------------------------------

While the current version of the seqonly MFS model file contains the following 
contents:

	------------------------------------------------------------------------------
	        real |      Coef.   Std. Err.      z    P>|z|     [95% Conf. Interval]
	-------------+----------------------------------------------------------------
	   hmmrelent |   5.572188   .1222287    45.59   0.000     5.332624    5.811752
	         ssr |   1.404021   .0767352    18.30   0.000     1.253622    1.554419
	    _Ires2_2 |    1.91072   .2681098     7.13   0.000     1.385234    2.436205
	    _Ires2_3 |    2.88515   .2489968    11.59   0.000     2.397125    3.373175
	    _Ires2_4 |   3.139968    .250056    12.56   0.000     2.649868    3.630069
	    _Ires2_5 |   .7114236   .2887629     2.46   0.014     .1454588    1.277389
	    _Ires2_6 |     .62648    .264875     2.37   0.018     .1073346    1.145625
	    _Ires2_7 |   2.737804   .2527959    10.83   0.000     2.242333    3.233275
	    _Ires2_8 |   .1466863   .3550922     0.41   0.680    -.5492816    .8426543
	    _Ires2_9 |   3.129898   .2531095    12.37   0.000     2.633812    3.625984
	   _Ires2_10 |   .3129215   .3149862     0.99   0.320    -.3044401    .9302831
	   _Ires2_11 |   .3511931   .3531785     0.99   0.320     -.341024     1.04341
	   _Ires2_12 |   1.733744   .2641333     6.56   0.000     1.216052    2.251436
	   _Ires2_13 |  -.8529155   .3884978    -2.20   0.028    -1.614357   -.0914738
	   _Ires2_14 |   1.364789   .2887798     4.73   0.000     .7987914    1.930787
	   _Ires2_15 |   2.567989   .2522995    10.18   0.000     2.073492    3.062487
	   _Ires2_16 |   2.566792   .2566021    10.00   0.000     2.063861    3.069723
	   _Ires2_17 |   2.107711   .2623259     8.03   0.000     1.593562    2.621861
	   _Ires2_18 |   -.460659   .4128253    -1.12   0.264    -1.269782    .3484638
	   _Ires2_19 |    .240424   .3051129     0.79   0.431    -.3575863    .8384344
	   _Ires2_21 |   1.815767   .2592885     7.00   0.000      1.30757    2.323963
	       _cons |  -9.471974    .247213   -38.32   0.000    -9.956502   -8.987445
	------------------------------------------------------------------------------

These files are generated by the STATA software. To facilitate future updates of 
the model files, we keep the output of the software as is in the model file.

The function files should be generated by the generate_meta_sig.pl program, and 
they have suffixes like *.atom_hmmrelent, *.atom_ssr, *.atom_rapdf_spread and 
*.atom_rapdf_dif. Each scoring function file contains one residue per line, and 
each line consists of several tab-delimited columns. The first two columns 
represent the residue and the corresponding score, respectively. The remaining 
columns may also be useful: for example, when the --expanded_output argument is 
set, the individual RAPDF scores for each mutation (as opposed to the wildtype 
residue only) will be used for MFS calculation. For example, the first five 
lines of a *.atom_rapdf_spread file are:


	1_P	13.1329999999998	-4399.28,-4412.21,-4406.00,-4389.93,-4407.25,-4397.69,-4458.08,-4406.42,-4400.92,-4395.89,-4423.20,-4411.24,-4398.05,-4419.69,-4426.15,-4406.47,-4411.41,-4410.83,-4421.02,-4421.93
	2_Q	-16.2444999999998	-4352.63,-4357.23,-4381.31,-4390.42,-4352.25,-4363.63,-4415.19,-4352.73,-4385.76,-4353.12,-4367.88,-4393.33,-4372.03,-4392.96,-4401.43,-4373.17,-4383.83,-4367.51,-4386.43,-4391.47
	3_I	-40.637999999999	-4344.05,-4372.65,-4308.22,-4282.46,-4425.07,-4338.12,-4344.24,-4398.94,-4311.27,-4400.52,-4376.59,-4335.29,-4365.41,-4327.56,-4282.97,-4351.74,-4362.14,-4384.96,-4450.69,-4403.15
	4_T	4.51499999999942	-4395.04,-4378.07,-4401.08,-4417.83,-4379.08,-4394.09,-4398.52,-4347.09,-4414.49,-4383.11,-4393.74,-4404.21,-4444.03,-4413.49,-4422.37,-4410.19,-4394.65,-4365.69,-4417.89,-4408.64
	5_L	3.56199999999999	-4382.32,-4375.24,-4398.67,-4411.46,-4437.28,-4388.92,-4404.64,-4367.46,-4393.46,-4397.35,-4402.30,-4398.18,-4416.60,-4422.32,-4395.27,-4396.69,-4391.93,-4375.49,-4424.94,-4437.72

Several arguments in the program can be used to control the output format. By 
default, the output has a tab-delimited format similar to that of the input 
scoring function files. For example, the first five lines of an output MFS file 
are:

	1_P	0.223342343234234	-10.5330190345129
	2_Q	0.307952281541531	-8.15561221947953
	3_I	0.184599844578446	-9.10789912191466
	4_T	0.408178645987139	-7.38185973845797
	5_L	0.159126839053199	-9.30455198200051

In the above output, the second column is the calculated MFS score (normalized 
between 0 and 1), while the third column is the raw logodds score, given by the 
regression model.

If the --multicolumn argument is set, then the output will contain detailed 
information on every scoring function in a multiple column format. For example, 
the first few lines of an output MFS file are:

                                                                      
	Residue     MFS | HMM          SSR  RAPDF   RAPDF                     
	                | relative          spread  differ                    
	                | entropy                                             
	----------------------------------------------------------------------
	     1P    0.00 |     0.00    0.09   14.92   13.13                    
	     2Q    0.31 |     0.00    0.29   18.27  -16.24                    
	     3I    0.18 |     0.00    0.39   44.86  -40.64                    
	     4T    0.41 |     0.00    0.32   21.72    4.51                    
	     5L    0.16 |     0.00    0.32   19.48    3.56                    

If the --expanded_output argument is set, the output becomes:

	Residue     MFS | HMM          SSR  RAPDF   RAPDF  |    A    C    D    E    F    G    H    I    K    L    M    N    P    Q    R    S    T    V    W    Y
	                | relative          spread  differ |
	                | entropy                          |
	--------------------------------------------------------------------------------------------------------------------------------------------------------
	     1P    0.00 |     0.00    0.09   14.92   13.13 | 0.06 0.33 0.47 0.52 0.17 0.16 0.44 0.08 0.51 0.12 0.13 0.31 0.00 0.27 0.44 0.39 0.35 0.00 0.13 0.33
	     2Q    0.31 |     0.00    0.29   18.27  -16.24 | 0.11 0.38 0.51 0.54 0.22 0.20 0.49 0.13 0.54 0.16 0.18 0.35 0.04 0.31 0.48 0.44 0.39 0.05 0.17 0.37
	     3I    0.18 |     0.00    0.39   44.86  -40.64 | 0.19 0.44 0.62 0.67 0.26 0.29 0.60 0.18 0.65 0.21 0.25 0.45 0.11 0.41 0.61 0.52 0.47 0.11 0.21 0.43
	     4T    0.41 |     0.00    0.32   21.72    4.51 | 0.12 0.39 0.52 0.55 0.23 0.21 0.52 0.16 0.55 0.17 0.19 0.37 0.03 0.32 0.49 0.44 0.41 0.07 0.18 0.38
	     5L    0.16 |     0.00    0.32   19.48    3.56 | 0.12 0.39 0.52 0.55 0.20 0.21 0.51 0.15 0.56 0.16 0.18 0.36 0.04 0.31 0.50 0.44 0.40 0.06 0.17 0.36

Therefore, the MFS score for each possible mutation is also calculated, in 
addition to the wildtype MFS score. The --expanded_output argument uses the 
information contained in one of the columns in the RAPDF_dif file to determine 
the RAPDF score for individual mutations.

The --logodds argument is set by default. It means that the normalization of MFS 
is based on the logodds score from the regression model, rather than the 
probability value calculated from the regression model. The predicted 
probability is calculated as exp(logodds)/(1+exp(logodds)). This treatment 
allows the distribution of the MFS score to be less skewed after normalization.

