#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

# Command-line state. Each variable is filled in by GetOptions below and stays
# undefined when the corresponding option is absent. $system_command is scratch
# space for the shell command line of the external program being launched.
my ($verbose, $help, $man);
my ($inputfastafile, $inputpdbfile, $prefix, $scoring_function_file, $blastdb, $blastmat, $path, $encad, $muscle);
my ($system_command);

GetOptions (
	'verbose'                 => \$verbose,
	'help'                    => \$help,
	'man|m'                   => \$man,
	'fastafile=s'             => \$inputfastafile,
	'p|pdbfile=s'             => \$inputpdbfile,
	'prefix=s'                => \$prefix,
	'scoring_function_file=s' => \$scoring_function_file,
	'blastdb=s'               => \$blastdb,
	'blastmat=s'              => \$blastmat,
	'path=s'                  => \$path,
	'encad'                   => \$encad,
	'muscle'                  => \$muscle,
) or pod2usage ();

# --help prints the synopsis and option list, --man the complete manual; both go to STDOUT and exit.
if ($help) {
	pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
}
if ($man) {
	pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
}

# No positional arguments are accepted, and at least one kind of input must be named.
if (@ARGV) {
	pod2usage ("Syntax error");
}
unless ($inputpdbfile or $inputfastafile) {
	pod2usage ("Syntax error: please specify either the --fastafile or the --pdbfile argument");
}

# Turn the boolean flag into the text that is later appended to a command line.
if ($encad) {
	$encad = ' --encad';
} else {
	$encad = '';
}

# Optionally prepend user-supplied directories to the executable search path,
# so that the external helper programs launched later can be found.
if (defined $path) {
	# Reject any whitespace character that is preceded by something other than a backslash.
	if ($path =~ m/[^\\]\s/) {
		pod2usage ("Error in argument: the --path argument <$path> cannot contain any space");
	}
	$ENV{PATH} = "${path}:$ENV{PATH}";
	print STDERR "system executable path is now $ENV{PATH}\n\n";
}

# A structure was given without an explicit scoring function file: fall back to
# the file under $RAMP_ROOT, and stop with a usage error when that is not possible.
if (defined $inputpdbfile and not defined $scoring_function_file) {
	unless ($ENV{RAMP_ROOT}) {
		pod2usage ("Error in arguments: when structure is supplied by the --pdbfile argument, please also specify the location of the 37-bin RAPDF scoring function file using the --scoring_function_file argument or by setting up the RAMP_ROOT environment variable");
	}

	#the file name below was modified in the new version of RAMP
	$scoring_function_file = "$ENV{RAMP_ROOT}/lib/scoring_functions/astral_169_e4_allatoms_xray_lt2.0_37bins_scores";
	unless (-r $scoring_function_file) {
		pod2usage ("Error in argument: please specify --scoring_function_file argument for the 37-bin RAPDF computation");
	}
	print STDERR "NOTICE: The default scoring function file for RAPDF score generation is $scoring_function_file\n";
}

# Without a --fastafile, derive the query sequence from the PDB file by running
# pdbseq.pl and writing its output to "<pdbfile>.fasta".
unless (defined $inputfastafile) {
	$inputfastafile = "$inputpdbfile.fasta";
	$system_command = "pdbseq.pl $inputpdbfile -n query -c first > $inputfastafile";
	if ($verbose) {
		print STDERR "NOTICE: generating the ATOM-based FASTA file for structure file $inputpdbfile\n";
	}
	if (system ($system_command)) {
		confess "Error running <$system_command>: $?";
	}
}

# Output files are named after the FASTA file unless a non-empty --prefix was given.
$prefix = $inputfastafile unless $prefix;

#We assume that the BLAST program is already correctly installed and configured in the system, so the following paragraph is not strictly necessary.
#Modern BLAST installations use the ~/.ncbirc file for settings, so the environment variables are probably no longer important;
#the checks below therefore run only when no readable ~/.ncbirc file is found.
#HOME is not guaranteed to be set, so test it first rather than interpolating an undefined value into the path.
my $ncbirc_found = (defined $ENV{HOME} && -r "$ENV{HOME}/.ncbirc");
if (not $ncbirc_found) {
	#the database comes from --blastdb, otherwise from the 'nr' database under $BLASTDB
	if (not defined $blastdb) {
		if ($ENV{BLASTDB}) {
			$blastdb = "$ENV{BLASTDB}/nr";		#the default nr should be a protein sequence database generated by the formatdb program
		} else {
			pod2usage ("Error in arguments: please specify the location of the BLAST database using the --blastdb argument or the 'BLASTDB' environmental variable");
		}
	}
	
	#The --blastmat argument is treated differently, since it is less important:
	#it is exported through the BLASTMAT environment variable instead of being passed on a command line
	if (not defined $blastmat) {
		if (not $ENV{BLASTMAT}) {
			pod2usage ("Error in argument: please specify the location of BLAST substitution matrices using the --blastmat argument or the 'BLASTMAT' environmental variable");
		}
	} else {
		$ENV{BLASTMAT} = $blastmat;
	}
}


#first generate multiple alignments
#the '-d <database>' option is forwarded through --arg3 only when a BLAST database location is known
$system_command = "generate_multi_align.pl $inputfastafile " . ($blastdb?" --arg3 '-d $blastdb'":"") . " -o $prefix.klist --name query --prefix $prefix";
$muscle and $system_command .= " -method psiblast-muscle";		#when --muscle argument is present use muscle to refine alignment
$verbose and print STDERR "NOTICE: Generating multiple sequence alignments by running command <$system_command>\n";
if (system ($system_command)) {
	#According to the original note here, generate_multi_align.pl uses error code 1000 to signal that no alignment was found.
	#$? is a wait status, not the exit code: the exit code sits in its high byte and is only 8 bits wide,
	#so exit(1000) arrives as 1000 & 0xFF (232). Comparing the raw $? with 1000 could never succeed.
	my $child_exited = ($? > 0 && ($? & 127) == 0);		#excludes failure to launch (-1) and death by signal
	if ($child_exited and ($? >> 8) == (1000 & 0xFF)) {
		confess "Error running <$system_command>: No alignment found in PSI-BLAST output, so MFS cannot be applied on the input sequence.\n";
	} else {
		confess "Error running <$system_command>: $?";
	}
}

#then generate HMM_rel_ent based functional signature
#(--method is spelled with two dashes, the same way as in the SSR invocation of the same program below)
$system_command = "scoreaa_from_align.pl $prefix.klist --detail --method hmmentropy --seqid query --relative_entropy -o $prefix.hmmrelent --normalize";
$verbose and print STDERR "NOTICE: Generating HMM_rel_ent score by running command <$system_command>\n";
system ($system_command) and confess "Error running <$system_command>: $?";

#then generate SSR based functional signature (normalized score)
$system_command = "scoreaa_from_align.pl $prefix.klist --detail --method ssr --seqid query -o $prefix.ssr --normalize";
$verbose and print STDERR "NOTICE: Generating SSR score by running command <$system_command>\n";
if (system ($system_command)) {
	#$? is a wait status: the child's exit code is in its high byte and is only 8 bits wide, so an
	#exit(1000) in the child arrives here as 1000 & 0xFF (232). Comparing the raw $? with 1000 could never succeed.
	my $child_exited = ($? > 0 && ($? & 127) == 0);		#excludes failure to launch (-1) and death by signal
	if ($child_exited and ($? >> 8) == (1000 & 0xFF)) {
		print STDERR "Error running <$system_command>: Cannot generate phylogenetic trees from given multiple alignments\n";
		exit (1000);		#pass the same code on; the calling process will likewise see it truncated to 8 bits
	} else {
		confess "Error running <$system_command>: $?";
	}
}

if (defined $inputpdbfile) {
	#The structure-based part of the pipeline: each entry pairs the wording of the verbose notice with
	#the shell command to run. The commands are executed in this order and the first failure is fatal.
	my @steps = (
		#generate the ATOM-based sequence for the PDB file
		["Generating ATOM-based FASTA file for $inputpdbfile by running",
		 "pdbseq.pl $inputpdbfile > $inputpdbfile.atom_fasta"],

		#make the ATOM-SEQRES correspondence for both sequence-based scores
		["Converting HMM_rel_ent score so that it corresponds to ATOM records in PDB file by running",
		 "modify_signature.pl $prefix.hmmrelent $prefix.atom_hmmrelent -f $inputpdbfile.atom_fasta"],
		["Converting SSR score so that it corresponds to ATOM records in PDB file by running",
		 "modify_signature.pl $prefix.ssr $prefix.atom_ssr -f $inputpdbfile.atom_fasta"],

		#generate the AARAPDF score, which contains all the RAPDF scores for each individual mutation
		#($encad holds the text ' --encad' when that option was given, otherwise an empty string)
		["Generating AARAPDF scores for each mutation by running",
		 "scoreaa_by_structure.pl $inputpdbfile $prefix.atom_aarapdf --scoring_function_file $scoring_function_file $encad"],

		#convert the AARAPDF score to the RAPDF_dif and RAPDF_spread scores
		["Generating RAPDF_dif score by running",
		 "generate_rapdf_dif_spread.pl $prefix.atom_aarapdf dif > $prefix.atom_rapdf_dif"],
		["Generating RAPDF_spread score by running",
		 "generate_rapdf_dif_spread.pl $prefix.atom_aarapdf spread > $prefix.atom_rapdf_spread"],
	);

	for my $step (@steps) {
		my ($notice, $command) = @$step;
		$system_command = $command;
		$verbose and print STDERR "NOTICE: $notice <$system_command>\n";
		system ($system_command) and confess "Error running <$system_command>: $?";
	}

	#report the files that were actually written above (the RAPDF_dif output is $prefix.atom_rapdf_dif)
	print STDERR "NOTICE: All scoring files generated successfully: $prefix.atom_hmmrelent $prefix.atom_ssr $prefix.atom_rapdf_spread $prefix.atom_rapdf_dif!\n";
} else {
	#sequence-only mode: only the two alignment-based scores exist
	print STDERR "NOTICE: All scoring files generated successfully: $prefix.hmmrelent $prefix.ssr!\n";
}



=head1 SYNOPSIS

 generate_meta_sig.pl [arguments]

 Optional arguments:
 	-v, --verbose			use verbose output
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-f, --fastafile <file>		FASTA file for the sequence
 	-p, --pdbfile <file>		PDB file for the structure
 	    --prefix <string>		the prefix of output files
 	    --blastdb <file>		the location of the BLAST database files
 	    --blastmat <file>		the location of the BLAST data directory
 	    --muscle			use MUSCLE to refine PSIBLAST-generated alignment
 	    --encad			apply ENCAD energy minimization after side-chain replacement
 	-s, --scoring_function_file <file>	the location of the RAPDF scoring function file
 	    --path <string>		add the specified path to search for executables

Function: generate individual scoring files required for constructing 
meta-functional signature for a given protein sequence and/or 
structure

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--fastafile>

a FASTA formatted file containing the sequence of the protein. When 
this option is not given, the sequence of the protein will be 
extracted from the ATOM record of the PDB file. It is always 
recommended to use this option, since many PDB files contain chain 
breaks in the ATOM records.

=item B<--pdbfile>

a PDB formatted file containing the atomic coordinates of the protein. 
When this option is not given, the sequence-only meta functional 
signature will be generated. Our tests show that the performance of 
seqonly metasig is about 10% less than that of the structure-based 
metasig, using top 10 hits as the evaluation methodology.

=item B<--prefix>

this argument specifies the prefix of output files. By default the 
prefix is the name of the input-fasta-file. When the input fasta file is 
not specified, the prefix is the input-pdb-file suffixed by ".fasta".

=item B<--blastdb>

the location of the BLAST database file. The query sequence will be searched 
against this database. When the environment variable BLASTDB is not present and 
when the ~/.ncbirc file is missing, this argument can be used.

=item B<--blastmat>

the location of the BLAST data directory containing the substitution matrices. 
This is usually under the 'data' directory in the installation directory of 
the BLAST program. When the environment variable BLASTMAT is not set and when 
the ~/.ncbirc file is missing, this argument can be used.

=item B<--scoring_function_file>

the location of the scoring function file for the RAPDF calculation. 
When the environment variable RAMP_ROOT is not set, this argument can 
be used.

=item B<--encad>

specify whether the ENCAD energy minimization protocol should be applied after 
side-chain replacement by SCWRL. Generally speaking, running ENCAD will generate 
more stable structures, but it requires more computational time.

=item B<--path>

add the specified path to search for executables. This program will call several 
external programs (such as HMMER, PSIBLAST, RAMP, SCWRL, PHYLIP, etc) and all of 
them should be already placed in PATH of the system. If not, the appropriate 
path can be specified here. Multiple directories can be separated by ":".

=item B<--muscle>

use MUSCLE to refine the PSI-BLAST-generated alignment (rather than using the 
default PSI-BLAST alignment as is). Our benchmarking results show that MUSCLE 
alignments are generally tighter, and the results are better.

=back

=head1 DESCRIPTION

This program is used to generate the individual scoring files that constitute 
the meta-functional signature (MFS) for a given protein sequence or protein 
structure. Once these scoring files have been generated, you can use the 
calculate_meta_sig.pl program to calculate the actual MFS for your query 
protein.

When protein structure is given as a PDB file and its sequence is given as a 
FASTA file, the MFS is most accurate. When only protein sequence is known, the 
seqonly MFS will be used. Their differences are explained in more detail below.

When a structure file is specified by the --pdbfile (or -p) argument, this 
program works in the following steps:

=over 8

=item 1

It will check whether the --fastafile argument is set. If not, it will 
call an external program pdbseq.pl to extract the sequence information from the 
given PDB file and write to a FASTA file.

=item 2

It will then generate a multiple sequence alignment for the FASTA file, 
by querying a sequence database using the PSI-BLAST program. It will reformat 
the output files so that the final alignment file has one sequence per line, and 
each line contains sequence name and its alignment separated by tab character.

=item 3

It will then calculate the HMM_rel_ent score using the generated multiple 
sequence alignments. This score is a sequence conservation based score that 
incorporates amino acid background frequencies.

=item 4

It will then calculate the SSR score using the generated multiple sequence 
alignments. This score is an evolutionary change based score that analyzes a 
maximum parsimony tree to give estimates of how amino acid substitutions 
correlate with the topology change of the tree. The tree is generated by the 
protpars program in the PHYLIP program package. It is reasonably fast, but for a 
protein of moderate size (100 amino acid residues) and 250 homologous sequences 
in the alignments, it still takes dozens of minutes to run on a typical modern 
desktop computer.

=item 5

It will then calculate the AARAPDF score for each amino acid change in the 
protein structure, using the supplied PDB file. RAPDF was originally developed 
for scoring decoy protein structures, but it can be considered as a pseudo-
energy score. For each amino acid residue in the protein structure, we mutate 
it to each of the 19 other amino acids, then apply the side-chain rearrangement 
program SCWRL to assign the conformation for side chains of this residue, then 
(when the --encad argument is given) apply the ENCAD energy minimization 
protocol to slightly perturb the whole structure to achieve lower energy, then 
calculate the resulting RAPDF scores. 
For a protein with 100 amino acid residues, this means performing the SCWRL and 
ENCAD calculation for 100*19 times and performing RAPDF calculations for 100*20 
times; therefore, this step takes a lot of time. Since the calculation of RAPDF 
(and probably SCWRL and ENCAD also) is linearly correlated with the square of 
the protein length, it is recommended to split large proteins into domains, 
before calculating the MFS.

=item 6

Based on the AARAPDF scores calculated in the above step, this program will then, 
for each amino acid residue, calculate the RAPDF_spread and RAPDF_dif scores. The 
first score evaluates how the 20 RAPDF scores differ from each other, while the 
second score evaluates how the pseudo-energy of the protein with the native amino 
acid differs from the mean of all the 19 mutated proteins.

=item 7

The program will then exit. You can then use the calculate_meta_sig.pl program 
and a pre-computed model file to combine all the individual scores together and 
give a composite MFS score for the query protein.

=back

When only the --fastafile argument is supplied, the program will perform 
steps 2, 3 and 4 as described above. You can then use the calculate_meta_sig.pl 
program and a pre-computed model file (specifically computed for seqonly-MFS) to 
combine all the individual scores together.

=cut                                                                                                                                                                                                            
                                                                                                                                                                                