#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Getopt::Long;
use Pod::Usage;

#command-line options; every variable stays undefined unless the corresponding option is given
my ($verbose, $help, $man);
my ($prefix, $slow_scwrl, $scoring_function_file, $old_potential);
my ($encad, $encad_lib);
my ($residue_start, $residue_end, $segment);

GetOptions (
	'verbose'			=> \$verbose,
	'help'				=> \$help,
	'man'				=> \$man,
	'prefix=s'			=> \$prefix,
	'slow_scwrl'			=> \$slow_scwrl,
	'scoring_function_file=s'	=> \$scoring_function_file,
	'old_potential'			=> \$old_potential,
	'encad'				=> \$encad,
	'encad_lib=s'			=> \$encad_lib,
	'residue_start=i'		=> \$residue_start,
	'residue_end=i'			=> \$residue_end,
	'segment=s'			=> \$segment,
) or pod2usage ();

#--help prints the synopsis and options, --man prints the whole manual; both terminate the program
if ($help) {
	pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
}
if ($man) {
	pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
}

#no positional argument at all: print the short usage; any count other than two is a syntax error
if (not @ARGV) {
	pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
}
if (@ARGV != 2) {
	pod2usage ("Syntax error");
}

#the two positional arguments are the input structure file and the output score file
my $inputpdbfile = $ARGV[0];
my $outputfile = $ARGV[1];

#setting up default arguments for the program
$prefix ||= $outputfile;		#temporary files are named after the output file unless --prefix is given
if (not defined $scoring_function_file) {
	#without --scoring_function_file, the file is located under $RAMP_ROOT and depends on which potential program is used
	defined $ENV{RAMP_ROOT} or pod2usage ("Error in argument: please specify --scoring_function_file argument or specify the \$RAMP_ROOT environmental variable");
	if ($old_potential) {
		$scoring_function_file = "$ENV{RAMP_ROOT}/lib/scoring_functions/astral_159_e4_allatoms_xray_scores";
	} else {
		$scoring_function_file = "$ENV{RAMP_ROOT}/lib/scoring_functions/astral_169_e4_allatoms_xray_lt2.0_37bins_scores";
	}
	print STDERR "NOTICE: the --scoring_function_file argument is automatically set as $scoring_function_file based on the RAMP_ROOT environmental variable\n";
}
-r $scoring_function_file or confess "Error: the scoring_function_file $scoring_function_file is not readable. Please use the --scoring_function_file argument to specify one";

#sanity check on the scoring function file: 6216 lines are required for the 37-bin potential and 3024 lines for the old potential.
#The newline characters are counted within Perl (giving the same number as 'wc -l'), so that the file name is never
#interpolated into a shell command and file names with spaces or shell metacharacters are handled correctly
{
	open (my $scoring_fh, '<', $scoring_function_file) or confess "Error: cannot read from the scoring function file $scoring_function_file: $!";
	my $num_line = 0;
	while (my $line = <$scoring_fh>) {
		$line =~ m/\n\z/ and $num_line++;		#a final line without newline is not counted, exactly as 'wc -l' does
	}
	close ($scoring_fh);

	if (not $old_potential) {
		$num_line == 6216 or confess "Error: the scoring function file $scoring_function_file should contain 6216 lines for 37-bin RAPDF scoring function";
	} else {
		$num_line == 3024 or confess "Error: the scoring function file $scoring_function_file should contain 3024 lines for RAPDF scoring function";
	}
}

if ($encad) {
	#the ENCAD library file is only needed when energy minimization is requested by --encad
	if (not defined $encad_lib) {
		defined $ENV{ENCAD_LIB} or pod2usage ("Error in argument: please specify --encad_lib argument or specify the \$ENCAD_LIB environmental variable");
		$encad_lib = "$ENV{ENCAD_LIB}/aamodel.lib";
		print STDERR "NOTICE: the --encad_lib argument is automatically set as $encad_lib based on the ENCAD_LIB environmental variable\n";
	}
	-r $encad_lib or confess "Error: the encad_lib file $encad_lib is not readable. Please use the --encad_lib argument to specify one";
}

#make sure that the required external programs can be executed in the current computer:
#each program is invoked without meaningful arguments and its output is searched for an expected banner
#NOTE(review): with --old_potential this check probes 'potential_rapdf', while the scoring loop later in this file executes 'potential'; confirm which program name is intended
my $output;
if (not $old_potential) {
	$output = qx/potential_rapdf_37bins 2>&1/;
	if ($output !~ m/Usage: potential_rapdf_37bins/) {
		confess "Error: cannot execute the potential_rapdf_37bins program. Please intall the RAMP program suite (http://software.compbio.org/ramp)";
	}
} else {
	$output = qx/potential_rapdf 2>&1/;
	if ($output !~ m/Usage: potential_rapdf/) {
		confess "Error: cannot execute the potential_rapdf program. Please intall the RAMP program suite (http://software.compbio.org/ramp)";
	}
}

#only the standard output of scwrl3 is examined here (the standard error is not captured)
$output = qx/scwrl3/;
if ($output !~ m/Usage: scwrl3/) {
	confess "Error: cannot execute the scwrl3 program for side-chain re-arrangement. Please install the SCWRL3 program (http://dunbrack.fccc.edu/SCWRL3.php)";
}

#encadv5.exe is needed only when --encad is used; it is fed one empty line from the standard input
if ($encad) {
	$output = qx/echo | encadv5.exe/;
	if ($output !~ m/ENCAD/) {
		confess "Error: cannot execute the encadv5.exe program for energy minimization. Please install the ENCAD program";
	}
}



#@aa is the complete set of amino acids (not including gap and not including "X")
#@score is used to store all scores for each amino acid position (in ATOM record)
#@raw_mut_score is used to store all raw scores (joined by comma) for 20 mutations for each amino acid position
#@native_index is used to store all native amino acid index (0-19) for each amino acid position
my @aa = split (//, 'ACDEFGHIKLMNPQRSTVWY');
my ($seq, @seq, @score, @raw_mut_score, @native_index);

#run the pdbseq.pl program on the PDB file and capture its output, which is handled below as FASTA-formatted text.
#The list form of the pipe open is used so that the file name is passed to pdbseq.pl directly rather than through a shell
#(file names with spaces or shell metacharacters are therefore handled correctly)
{
	open (my $pdbseq_fh, '-|', 'pdbseq.pl', $inputpdbfile) or confess "Error: cannot execute the pdbseq.pl program on the structure file $inputpdbfile: $!";
	local $/;			#slurp the entire output at once
	$seq = <$pdbseq_fh>;
	close ($pdbseq_fh);
}
defined $seq or $seq = '';		#no output at all is reported by the FASTA check below

#each FASTA record starts with a ">" description line: no record is an error, more than one record triggers a warning
my $num_fasta_record = ($seq =~ tr/>//);
$num_fasta_record == 0 and confess "Error: unable to retrieve FASTA sequence from the structure file $inputpdbfile\n";
$num_fasta_record > 1 and print STDERR "\n--------\nWARNING: Multiple sequences found in the structure file $inputpdbfile! The program still runs but it has not been comprehensively evaluated for multi-chain structures\n--------\n\n";

$seq = uc $seq;			#all calculation used capital letters
$seq =~ s/^>.+\n//gm;		#get rid of all lines starting with ">" which are description line in FASTA file
$seq =~ s/\s//g;		#get rid of all return characters (multiple records are thereby concatenated into one sequence)
@seq = split (//, $seq);	#store individual residues into an array
@seq or confess "Error: unable to retrieve protein sequence from the PDB file $inputpdbfile";

#the --segment argument (current/total) divides the sequence into equally sized segments and overrides --residue_start and --residue_end
if (defined $segment) {
	$segment =~ m/^(\d+)\/(\d+)$/ or pod2usage ("Error: the --segment argument should be specified as two integers separated by '/'");
	my ($curseg, $totalseg) = ($1, $2);
	#segments are numbered from 1; this also guarantees that $totalseg is at least 1 so the division below cannot be a division by zero
	$curseg >= 1 and $curseg <= $totalseg or pod2usage ("Error: the --segment argument should contain two integers, representing desired segment and the total number of segments");
	#with more segments than residues, all but the last segment would be empty
	$totalseg <= @seq or pod2usage ("Error in argument: the total number of segments ($totalseg) in the --segment argument should not exceed the total length of structure ${\(scalar @seq)}");
	my $numperseg = int (@seq/$totalseg);
	$residue_start = $numperseg * ($curseg-1) + 1;
	$residue_end = $numperseg * $curseg;
	if ($curseg == $totalseg) {
		$residue_end = scalar (@seq);		#the last segment also takes the residues left over by the integer division
	}
	print STDERR "NOTICE: automatically setting up the --residue_start as $residue_start and --residue_end as $residue_end based on --segment argument of $segment\n";	
}

#residue positions are 1-based and inclusive; by default the entire sequence is scored
if (defined $residue_start) {
	$residue_start >= 1 and $residue_start <= @seq or pod2usage ("Error in argument: the --residue_start argument should be more than or equal to 1 and less than or equal to total length of structure ${\(scalar @seq)}");
} else {
	$residue_start = 1;
}
if (defined $residue_end) {
	$residue_end >= 1 and $residue_end <= @seq or pod2usage ("Error in argument: the --residue_end argument should be more than or equal to 1 and less than or equal to total length of structure ${\(scalar @seq)}");
} else {
	$residue_end = scalar (@seq);
}
$residue_start <= $residue_end or pod2usage ("Error in argument: the --residue_start ($residue_start) should be less than or equal to --residue_end ($residue_end)");
print STDERR "NOTICE: Totally ", $residue_end-$residue_start+1, " residues in $inputpdbfile will be scored and written to $outputfile\n";

#when --encad is used, write the option file that is later fed to encadv5.exe through its standard input.
#It names the library file, the SCWRL output structure as input coordinates and the final mutant structure as output coordinates
if ($encad) {
	my $encad_option_file = "$prefix.encadoption";
	#three-argument open with a lexical file handle, so that unusual characters in the prefix cannot change the open mode
	open (my $encad_option_fh, '>', $encad_option_file) or confess "Error: cannot write to file $prefix.encadoption: $!";
	print $encad_option_fh ">input    styles='coord'  stylec='format'\n          ulib='$encad_lib'\n          uxyz='$prefix.mutant.scwrl.pdb'\n          uotxyz='$prefix.mutant.pdb' \n>minimize nstepm=100 ibad=1 nbcycl=10 rmswgt=1 rmslim=1.0 rijmin=20 \n>output   styleo='format' titleo='$prefix.mutant.scwrl, Minimized 100+2*200' \n>stop --------------------------------------------------------------- \n::  \n";
	#buffered write errors (for example a full disk) only show up when the file is closed
	close ($encad_option_fh) or confess "Error: cannot finish writing to file $prefix.encadoption: $!";
}

# generate all the mutant structures, re-arrange side-chain conformations, evaluate structure stability and calculate RAPDF score for each mutation in each residue

#the external commands do not change between iterations, so they are composed only once.
#Output is discarded with the portable Bourne shell form '> /dev/null 2>&1': system() and the pipe open() run these commands
#through /bin/sh, and the csh/bash-style '>& /dev/null' shortcut is a syntax error in strictly POSIX shells such as dash
my $scwrl_output_file = $encad ? "$prefix.mutant.scwrl.pdb" : "$prefix.mutant.pdb";		#with --encad, the SCWRL output is further minimized into $prefix.mutant.pdb
my $scwrl_command = "scwrl3 -i $inputpdbfile -o $scwrl_output_file -s $prefix.seq > /dev/null 2>&1";
my $encad_command = "encadv5.exe < $prefix.encadoption > /dev/null 2>&1";
my $potential_command = ($old_potential ? 'potential' : 'potential_rapdf_37bins') . " $prefix.mutant.pdb $scoring_function_file 2> /dev/null";

for my $i ($residue_start-1 .. $residue_end-1) {
	my @potential_score;		#stores 20 potential scores for 20 mutant structures for each residue
	my $native_j;			#the j index for the native amino acid (range from 0 to 19)

	print STDERR "NOTICE: calculating scores for residue ${\($i+1)} of ${\(scalar @seq)} in $inputpdbfile\n";
	for my $j (0 .. $#aa) {
		$aa[$j] eq $seq[$i] and $native_j = $j;

		#compose the mutant sequence: residue $i is replaced by amino acid $j (always in upper case).
		#Unless --slow_scwrl is set, all other residues are written in lower case
		#(presumably this tells SCWRL to keep their side chains unchanged; see the --slow_scwrl documentation)
		my $seq_before = substr ($seq, 0, $i);
		my $seq_after = substr ($seq, $i+1);
		if (not $slow_scwrl) {
			$seq_before = lc $seq_before;
			$seq_after = lc $seq_after;
		}
		open (my $seq_fh, '>', "$prefix.seq") or confess "Error: cannot write to temporary sequence file <$prefix.seq>: $!";
		print $seq_fh $seq_before, $aa[$j], $seq_after, "\n";
		close ($seq_fh) or confess "Error: cannot finish writing to temporary sequence file <$prefix.seq>: $!";

		#delete the structures from the previous mutation: the exit status of SCWRL cannot be checked, so without this
		#a failed run would go unnoticed and the previous mutant structure would be silently scored again
		unlink ("$prefix.mutant.pdb", "$prefix.mutant.scwrl.pdb");

		$verbose and print STDERR "NOTICE: running SCWRL <$scwrl_command>\n";
		system ($scwrl_command);		#the return value is not examined: SCWRL does not return true when successfully executed
		if ($encad) {
			$verbose and print STDERR "NOTICE: running energy minimization by $encad_command\n";
			system ($encad_command) and confess "Error: cannot execute the system command $encad_command";
		}

		$verbose and print STDERR "NOTICE: running potential to calculate RAPDF scores <$potential_command>\n";

		#use the open command (instead of generating a temporary file), to reduce system overhead caused by disk reading and writing
		open (my $potential_fh, '-|', $potential_command) or confess "Error: cannot execute the system command $potential_command: $!";
		my $potential_line = <$potential_fh>;		#only the first output line is used
		close ($potential_fh);				#the status is deliberately ignored since the remaining output is discarded
		defined $potential_line or confess "Error: unable to calculate the RAPDF score for $prefix.mutant.pdb";
		#the character class accepts an explicit sign and an upper-case E so that scores such as 1.5e+03 are not truncated
		$potential_line =~ m/^S \(probability score\) for \S+ is\s+([-+\d.eE]+)/ or confess "Error: invalid record in potential output: <$potential_line>";
		my $rapdf_score = $1;
		push @potential_score, $rapdf_score;
		$verbose and print STDERR "NOTICE: New RAPDF Score for structure $inputpdbfile substitution ${\($j+1)} ($aa[$j]) for residue ${\($i+1)} ($seq[$i]) is $rapdf_score\n";
	}

	#a residue that is not one of the 20 standard amino acids (including "X") is treated as Alanine (index 0)
	if (not defined $native_j) {
		$seq[$i] eq 'X' or print STDERR "Error: non-standard amino acid identifier found ($seq[$i]) in PDB sequence and were treated as Alanine\n";
		$native_j = 0;
	}

	#the score of a residue is the score of the structure carrying the native amino acid at this position
	push @score, $potential_score[$native_j];
	push @raw_mut_score, join (',', @potential_score);
	push @native_index, $native_j;
	$verbose and print STDERR "NOTICE: calculation of mutation score for residue ${\($i+1)} in $inputpdbfile is done\n";
}

#delete temporary files generated by various programs (unlink silently skips files that do not exist, such as the ENCAD files when --encad is not used)
unlink ("$prefix.seq", "$prefix.mutant.pdb", "$prefix.mutant.scwrl.pdb", "$prefix.encadoption");

outputScoreFile ($outputfile, \@seq, \@score, \@raw_mut_score, \@native_index);
#report the number of residues that were actually scored, which is smaller than the sequence length when --residue_start, --residue_end or --segment is used
print STDERR "NOTICE: Done writting score file $outputfile for ${\(scalar @score)} residues in structure file $inputpdbfile successfully!\n";
#write the scores to the outputfile, one tab-delimited line for each scored residue
#arguments:
#	$outputfile	name of the file to write (an existing file is overwritten)
#	$ref_seq	reference to the array of all residues in the sequence (indexed by 0-based sequence position)
#	$score		reference to the array of scores for the scored residues only (its first element belongs to residue --residue_start)
#	@other_info	any number of additional array references, indexed like $score; each contributes one extra column
#the range of residues is taken from the file-scoped $residue_start and $residue_end variables (1-based, inclusive).
#The first column of each line is the 1-based residue position and the residue itself, joined by "_".
#The subroutine dies (confess) if the file cannot be opened or completely written
sub outputScoreFile {
	my ($outputfile, $ref_seq, $score, @other_info) = @_;
	#three-argument open with a lexical file handle, so that unusual characters in the file name cannot change the open mode
	open (my $output_fh, '>', $outputfile) or confess "Error: cannot write to outputfile $outputfile: $!";
	for my $i ($residue_start-1 .. $residue_end-1) {
		my $score_index = $i - $residue_start + 1;		#position within the arrays that cover the scored residues only
		my @field = (($i+1) . "_" . $ref_seq->[$i], $score->[$score_index]);
		push @field, map {$_->[$score_index]} @other_info;
		print $output_fh join ("\t", @field), "\n";
	}
	#buffered write errors (for example a full disk) only show up when the file is closed
	close ($output_fh) or confess "Error: cannot finish writing to outputfile $outputfile: $!";
}

=head1 SYNOPSIS

 scoreaa_by_structure.pl [argument] <input-PDB-file> <output-score-file>

 Optional arguments:
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-v, --verbose			use verbose output
 	    --prefix <string>		prefix of temporary files (default is outputfilename)
 	    --old_potential		use old 19-bin potential program for RAPDF calculation
 	    --slow_scwrl		change side chain for all instead of one residue
 	    --encad			use ENCAD energy minimization
 	    --encad_lib <string>	library file for ENCAD computation
 	    --residue_start <int>	start scoring from this residue
 	    --residue_end <int>		end scoring at this residue
 	    --segment <int1/int2>	automatically set residue start and end for int1 segment with totally int2 segments
 	    --scoring_function_file <file>	scoring function file for the potential program

 Function: given a PDB file, for every residue, calculate its structure-based 
 functional importance score through mutating to 20 alternative amino acids and 
 evaluating the resulting structure stability scores.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--prefix>

the prefix for temporary files, which are used during the execution process of 
scwrl and potential. By default the prefix is the same as the outputfile name.

=item B<--slow_scwrl>

use SCWRL to calculate side chain conformations for all residues, rather than 
the single mutated residue. This makes the calculation much slower, and it is 
unlikely that this argument gives better results. Therefore, this argument is 
NOT recommended to use, unless for special purposes.

=item B<--scoring_function_file>

the location of the scoring function file used by the potential program in the 
RAMP program suite. By default the program will search the environment variable 
RAMP_ROOT to figure out the path to this file, but users can specify a different 
one if they want.

=item B<--old_potential>

this argument tells the program to use the old version of the potential program 
in the RAMP program suite for RAPDF calculation, rather than the default 
potential_rapdf_37bins program in the newer version of RAMP. These two 
programs generate different RAPDF scores, but the new one has better performance.

=item B<--encad>

after reconstructing side chains by SCWRL, use the ENCAD energy minimization on 
the structure. Generally speaking, the resulting structures will be more stable 
(and more similar to real structure), but the computational cost for ENCAD is 
prohibitive in many cases.

=item B<--encad_lib>

the encad program requires a file called aamodel.lib for its computation. This 
file is typically stored in the directory specified by the ENCAD_LIB 
environmental variable, but can be supplied by command line as well.

=item B<--residue_start>

the starting residue to analyze in the structure. this argument is very useful 
for processing large structure files, since different segments of the structure 
can be processed in a cluster of CPUs/machines.

=item B<--residue_end>

the ending residue to process in the structure.

=item B<--segment>

this argument has the form of int1/int2 to automatically set up the 
--residue_start and --residue_end arguments. For example, when you want to divide 
the structure into 10 segments each processed separately, you can use 1/10, 2/10 
through 10/10 after the --segment argument for each invocation of the program.

=back

=head1 DESCRIPTION

This program is used to give structural stability scores to a complete mutation 
spectrum for amino acid residues in a given structure file in PDB format. The 
outputfile contains one line for each residue, and each line is composed of the 
following tab-delimited fields: the residue position and amino acid identity 
(joined by an underscore), the RAPDF score, the RAPDF scores for 20 mutations 
(separated by commas), and the index of the native residue among 20 amino acid 
types (range from 0 to 19, corresponding to ACDEFGHIKLMNPQRSTVWY, respectively).

The method used for computation is briefly described here: For each residue in 
the structure file, we mutate it to the 20 possible amino acids, then run the 
SCWRL3 program to calculate the side chain orientation for the mutated structure, 
then optionally apply the ENCAD energy minimization protocol to perturb the 
structure globally to achieve minimum energy, then run the 
potential_rapdf_37bins program in the RAMP program suite to calculate the 37-bin 
RAPDF score for the mutated structure. Once we have 20 RAPDF scores for a given 
position, we record them in the outputfile.

This program depends on several external programs, so please make sure that the 
paths to these external programs are in your PATH environmental variable and that 
you have privileges to execute these external programs. These include: the 
potential_rapdf_37bins program in the RAMP program suite (alternatively, the 
potential program in earlier versions of the RAMP program suite can be used by 
setting the --old_potential argument), the scwrl3 program in the SCWRL program, 
the pdbseq.pl program written by me, and optionally the encadv5.exe program in the 
ENCAD package.

The resulting scores are useful in interpreting the functional/structural 
importance of individual amino acid positions. In addition, using the 
generate_rapdf_dif_spread.pl program, the output from this program can be 
transformed to RAPDF_SPREAD and RAPDF_DIF scores, which are two components of 
the MFS function prediction system (http://protinfo.org/mfs).

Some advanced topics for this program are further discussed below:

=over 8

=item * 1

The --segment argument can be used for splitting the job into multiple 
parallel jobs running on different machines. Typically for a 500-residue 
structure, it may take a whole day to run this program, so it is essential 
in many cases to split the job in parallel. When you have access to, for example, 
10 computational nodes in a computer cluster, you can run the program 
simultaneously on 10 machines with different output file names, and with 
different --segment arguments, ranging from 1/10, 2/10 to 10/10. The resulting 
files can be directly concatenated to generate a final output file containing 
scores for each residue in the structure.

=item * 2

Although this program was originally designed to analyze single-domain protein 
structures, it can be used to analyze multi-chain protein complexes. In fact, I 
believe that it is bad practice to separate a complex of two proteins into 
two chains and analyze them separately, because we lose the spatial constraints 
imposed by protein interactions in the RAPDF calculation. However, this should 
be handled on a case-by-case basis: a lot of times protein crystal structures 
are composed of two identical chains/units, but in living cells the protein does 
not form homodimers. Therefore, it makes more sense to extract one chain from 
the structure file and then run this program in this case.

When a structure complex (such as a heterodimer) is processed by this program, the 
entire structure is treated as having a single large chain. Presumably the SCWRL 
program still works correctly but I am not sure about that. The resulting output 
file will contain scores for all residues but the residues are consecutively 
numbered as if they are in a single chain.

=item * 3

The use of the --encad argument significantly increases the running time of this 
program, yet has no clear performance advantage in my benchmarking experiments. 
Therefore, it is generally not recommended, but it may be useful 
for predicted protein structures, where the energy has not been properly 
minimized, or for highly flexible structures, where the structure tends to 
change globally upon a single mutation.

=item * 4

The output file of this program can be further analyzed by the 
generate_rapdf_dif_spread.pl program to generate a RAPDF_DIF score and a 
RAPDF_SPREAD score. These scores can be used to interpret the structural 
importance of individual residues, as well as the structural destabilization 
effects of individual mutations in each residue. Whether these scores correlate 
with functional effects should be carefully examined and benchmarked in future 
studies, which is an area that I have been actively working on and thinking about 
for the past few years.

=back

=cut
                    
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
