#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

#command-line options, filled in by GetOptions below; these are file-level lexicals, so the code and subroutines further down in this file can read them
my ($verbose, $help, $man);
my ($inputfastafile, $similar, $method, $prefix, $path, $pdbfile, $keep_temp, $outputfile, $arg1, $arg2, $arg3, $name);
GetOptions('verbose'=>\$verbose, 'help'=>\$help, 'man|m'=>\$man, 'similar=s'=>\$similar, 'method|a=s'=>\$method, 'prefix=s'=>\$prefix, 'path=s'=>\$path,
	'pdbfile=s'=>\$pdbfile, 'keep_temp'=>\$keep_temp, 'outputfile=s'=>\$outputfile, 'arg1=s'=>\$arg1, 'arg2=s'=>\$arg2, 'arg3=s'=>\$arg3, 'name=s'=>\$name) or pod2usage ();

#print the requested level of documentation and exit; exactly one positional argument (the input FASTA file) is required
$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");

#fill in defaults and validate the option values
($inputfastafile) = @ARGV;
$method ||= 'psiblast';
$method =~ m/^(clustalw|sam|psiblast|psiblast\-muscle)$/ or pod2usage ("Error in argument: --method can be only 'sam' or 'clustalw' or 'psiblast' or 'psiblast-muscle'");
#default prefix is 'rand' followed by four random digits; sprintf always yields exactly four digits, whereas taking a substring of the
#stringified rand() value misbehaves when that value is printed in exponent notation or is 0
$prefix ||= "rand" . sprintf ("%04d", int (rand (10000)));
$path and $ENV{PATH} .= ":$path";		#let the external programs be found in the user-supplied directories
#the extra arguments are interpolated into command lines below, so make sure they are never undefined
$arg1 ||= '';
$arg2 ||= '';
$arg3 ||= '';

#read the first sequence from the FASTA file, which is the sequence to be analyzed
$verbose and print STDERR "\nNOTICE: reading the first sequence in fasta file $inputfastafile as the target sequence ...\n";
my ($targetid, $targetseq) = readTargetSeq ($inputfastafile);

#generate multiple alignment containing the target sequence and its homologs, using various methods (default is PSI-BLAST, which is fast and well-tested)
#every branch leaves its result in $prefix.klist; unless --keep_temp is set, the intermediate files are deleted afterwards
if ($method eq 'sam') {
	#with --similar the homolog file is handed to target2k through '-homologs ... -tuneup'; otherwise target2k is run on the seed sequence alone
	my $target2k_command = $similar
		? "target2k -seed $inputfastafile -out $prefix -homologs $similar -tuneup $arg1"
		: "target2k -seed $inputfastafile -out $prefix $arg1";
	$verbose and print STDERR "\nNOTICE: generaing multiple alignments using the SAM-T2K programs <$target2k_command>\n";
	system ($target2k_command) and confess "Error running target2k: $?";

	#a2m -> pretty-printed FASTA -> FASTA with '.' replaced by '-' -> klist
	$verbose and print STDERR "\nNOTICE: post-processing alignment files generated by the target2k program ...\n";
	system ("prettyalign $prefix.a2m -f -i > $prefix.a2m.prettyfasta") and confess "Error";
	system (q{grep_fasta.pl -c '$$=~s/\./-/g' } . "$prefix.a2m.prettyfasta > $prefix.a2m.fasta") and confess "Error";
	system ("convert_align_to_klist.pl -f fasta $prefix.a2m.fasta > $prefix.klist") and confess "Error";
	$keep_temp or unlink ("$prefix.a2m", "$prefix.a2m.prettyfasta", "$prefix.a2m.fasta", "$prefix.cst");
} elsif ($method eq 'clustalw') {
	#this branch only aligns sequences given by the user, so --similar is mandatory (pod2usage prints the message and exits)
	$similar or pod2usage ("Error in argument: when the --method argument is 'clustalw', you must specify all homologs to align using the --similar option");

	#concatenate the target and the homologs into one gap-free FASTA file, then align them
	system (q{grep_fasta.pl -c '$$=~s/\.|\-//g' } . "$inputfastafile > $prefix.fasta") and confess "Error";		#eliminate gaps from FASTA file
	system (q{grep_fasta.pl -c '$$=~s/\.|\-//g' } . "$similar >> $prefix.fasta") and confess "Error";		#eliminate gaps from FASTA file
	$verbose and print STDERR "\nNOTICE: generating multiple alignments using the clustalw programs <clustalw -infile=$prefix.fasta -align -outfile=$prefix.aln $arg2>\n";
	system ("clustalw -infile=$prefix.fasta -align -outfile=$prefix.aln $arg2") and confess "Error";		#run CLUSTALW to generate multiple alignments

	$verbose and print STDERR "\nNOTICE: post-processing alignment files generated by the clustalw program ...\n";
	system ("convert_align_to_klist.pl -f clustalw $prefix.aln> $prefix.klist") and confess "Error";
	$keep_temp or unlink ("$prefix.aln");
} elsif ($method eq 'psiblast') {
	buildKlistWithPsiblast ();
} elsif ($method eq 'psiblast-muscle') {
	buildKlistWithPsiblast ();

	#after PSI-BLAST output is generated, use MUSCLE to refine the alignment
	my $aln_length = qx/head -n 1 $prefix.klist | cut -f 2 | wc -m/;
	chomp $aln_length;
	print STDERR "NOTICE: Before applying muscle, the alignment length for $prefix.klist is $aln_length\n";
	system ("convert_align_from_klist.pl -format fasta $prefix.klist > $prefix.klist_fasta") and confess "Error running conversion";
	system ("muscle -in $prefix.klist_fasta -out $prefix.klist_afa 2> /dev/null") and confess "error running muscle";
	system ("convert_align_to_klist.pl -format fasta $prefix.klist_afa > $prefix.klist") and confess "Error converting muscle output <$prefix.klist_afa> to klist format";
	$aln_length = qx/head -n 1 $prefix.klist | cut -f 2 | wc -m/;
	chomp $aln_length;
	print STDERR "NOTICE: After applying muscle, the alignment length for $prefix.klist is $aln_length\n";
	#remove the two intermediate files; the muscle output is named $prefix.klist_afa (with an underscore)
	$keep_temp or unlink ("$prefix.klist_fasta", "$prefix.klist_afa");
}

#Run blastpgp (PSI-BLAST) for the target sequence and turn its output into $prefix.klist.
#Shared by the 'psiblast' and 'psiblast-muscle' methods. Takes no arguments and returns nothing useful; it works on the
#file-level option variables ($inputfastafile, $similar, $prefix, $arg3, $name, $keep_temp, $verbose) and on $targetid/$targetseq.
#Dies (confess) when an external command or a file operation fails, and exits the program when no QUERY alignment
#line can be read from the blastpgp output.
sub buildKlistWithPsiblast {
	if ($similar) {
		#the user-supplied homologs are turned into a BLAST database which is then searched instead of the default database
		$verbose and print STDERR "\nNOTICE: generating a BLAST database for all sequences in $similar ...\n";
		system ("formatdb -i $similar -o T") and confess "Error running formatdb";
		$verbose and print STDERR "\nNOTICE: generating multiple alignments using the blastpgp program <blastpgp -i $inputfastafile -o $prefix.psiblast_output -d $similar -j 3 -m 6 $arg3>...\n";
		system ("blastpgp -i $inputfastafile -o $prefix.psiblast_output -d $similar -j 3 -m 6 $arg3") and confess "Error running blastpgp";
	} else {
		$verbose and print STDERR "\nNOTICE: generating multiple alignments using the blastpgp programs <blastpgp -i $inputfastafile -o $prefix.psiblast_output -j 3 -m 6 $arg3> ...\n";
		system ("blastpgp -i $inputfastafile -o $prefix.psiblast_output -j 3 -m 6 $arg3") and confess "Error running <blastpgp -i $inputfastafile -o $prefix.psiblast_output -j 3 -m 6 $arg3>";
	}

	$verbose and print STDERR "\nNOTICE: post-processing alignment files generated by the blastpgp program ...\n";
	system ("convert_align_to_klist.pl -f psiblastm6 $prefix.psiblast_output > $prefix.klist.temp") and confess "Error";

	#the PSI-BLAST program output renames the target name as 'QUERY', so we have to manually change it to the name of the query (by default it is the targetid, unless the --name argument is set)
	renameQueryInKlist ("$prefix.klist.temp", defined $name ? $name : $targetid);

	my ($align_start, $align_end) = readPsiblastAlignLength ("$prefix.psiblast_output");
	if (not defined $align_start) {
		print STDERR "\nNOTICE: UNABLE TO READ PSI-BLAST ALIGNMENTS FROM PSIBLAST OUTPUT FILE $prefix.psiblast_output\n";
		exit (1000);
	} elsif ($align_start == 1 and $align_end == length ($targetseq)) {
		$verbose and print STDERR "NOTICE: PSI-BLAST output contains complete sequence for target <$targetid> ...\n";
		rename ("$prefix.klist.temp", "$prefix.klist") or confess "Error renaming file from <$prefix.klist.temp> to <$prefix.klist>";
	} else {
		#The PSI-BLAST output sometimes only contains partial sequence for the target. We have to compensate for the lost amino acids in the head or tail of alignments.
		$verbose and print STDERR "NOTICE: completing the partial alignments generated by PSI-BLAST for target <$targetid> (alignment starts at $align_start and ends at $align_end) ...\n";
		completePartialKlist ("$prefix.klist.temp", "$prefix.klist", $align_start, $align_end);
	}
	$keep_temp or unlink ("$prefix.klist.temp", "$prefix.psiblast_output");
	#also remove the files that are presumably left next to the --similar file by formatdb
	$similar and $keep_temp || unlink ("$similar.cst", "$similar.phr", "$similar.pin", "$similar.psd", "$similar.psi", "$similar.psq");
}

#Replace the record id 'QUERY' by $label in a klist file (tab-separated '<id>\t<alignment>' records).
#  $klistfile: klist file to be rewritten; the new content is written to "${klistfile}1", which then replaces $klistfile
#  $label    : new id for the QUERY record
#The rewrite is done inside this process rather than through a shell one-liner, so ids containing characters such as
#/ ' $ @ cannot corrupt the command or the substitution. Dies (confess) when a file cannot be read, written or renamed.
sub renameQueryInKlist {
	my ($klistfile, $label) = @_;
	my $renamedfile = "${klistfile}1";
	open (my $in, '<', $klistfile) or confess "Error: cannot read from klist file $klistfile: $!";
	open (my $out, '>', $renamedfile) or confess "Error: cannot write to klist file $renamedfile: $!";
	while (my $record = <$in>) {
		$record =~ s/^QUERY\t/$label\t/;		#$label is inserted literally: the replacement part is not a pattern
		print $out $record;
	}
	close ($in);
	close ($out) or confess "Error: cannot finish writing klist file $renamedfile: $!";
	rename ($renamedfile, $klistfile) or confess "Error renaming file from <$renamedfile> to <$klistfile>";
}

#final step: squeeze out the alignment columns that are gaps in every sequence, switch to upper case, and put the result at its final location
if ($verbose) {
	print STDERR "\nNOTICE: compressing alignment files and converting alignments to upper case ...\n";
}

#the compressed alignment is written to a side file first and then moved over $prefix.klist
if (system ("convert_align_from_klist.pl -f klist -o compress $prefix.klist -u > $prefix.klist.compress")) {
	confess "Error";
}
unless (rename ("$prefix.klist.compress", "$prefix.klist")) {
	confess "Error renaming file <$prefix.klist.compress>";
}

#when --outputfile is given the alignment is moved there, otherwise it stays in $prefix.klist
if ($outputfile) {
	unless (rename ("$prefix.klist", $outputfile)) {
		confess "Error renaming file <$prefix.klist> to $outputfile: $!";
	}
}
print STDERR "\nNOTICE: OUTPUT ALIGNMENT FILE " . ($outputfile || "$prefix.klist") . " GENERATED SUCCESSFULLY!\n";

#--------------------------------------------------------------------------------------------------------------
#SUBROUTINE SECTION
#--------------------------------------------------------------------------------------------------------------

#this subroutine is used to read the target sequence from the provided FASTA file. It is the first sequence encountered in the FASTA file.
#  argument: name of the FASTA file
#  returns : a two-element list (id, sequence); the id is the first whitespace-delimited word after '>' on the first header line,
#            and the sequence has white space and the gap characters '.' and '-' removed and is converted to upper case
#  dies (confess) when the file cannot be opened, when it has no header line or no sequence for the first record, or when the
#            cleaned sequence contains a character other than A-Z
sub readTargetSeq {
	my ($fastafile) = @_;
	my ($id, $seq) = (undef, '');
	#three-argument open: the file name is taken literally, whatever characters it contains
	open (my $fh, '<', $fastafile) or confess "Error opening fasta file $fastafile: $!";

	#skip everything up to the first header line; the id is tested for definedness, not truth, so that an id such as '0' is accepted
	while (my $line = <$fh>) {
		if ($line =~ m/^>(\S+)/) {
			$id = $1;
			last;
		}
	}
	#collect the sequence lines up to the next header line (or the end of the file)
	while (my $line = <$fh>) {
		$line =~ m/^>/ and last;
		$seq .= $line;
	}
	close ($fh);
	defined $id or confess "Error: cannot find sequence ID from fasta file $fastafile\n";

	$seq =~ s/\s//g;		#eliminate space and return from the sequence
	$seq =~ s/\.//g;		#eliminate gap characters from the sequence
	$seq =~ s/\-//g;		#eliminate gap characters from the sequence
	$seq = uc $seq;			#change the sequence to upper case
	#the emptiness test comes after the clean-up, so a record made only of blank lines or gap characters is rejected as well
	length ($seq) or confess "Error: cannot find sequence for <$id> in fasta file $fastafile\n";
	$seq =~ m/([^A-Z])/ and confess "Error: invalid character in target sequence $id: <$1> in $seq";
	return ($id, $seq);
}

#PSI-BLAST alignments with the '-m 6' option do not always contain the full length alignment for the target sequence, so this subroutine is used to read the align_start and align_end
#  argument: name of the blastpgp output file
#  returns : a two-element list (align_start, align_end): the first number of the first QUERY line and the largest end
#            number over all QUERY lines of the last round; both are undef when no QUERY line is found
#  dies (confess) when the file cannot be opened
sub readPsiblastAlignLength {
	my ($inputfile) = @_;
	my ($max_round, $align_start, $align_end);

	#sometimes PSIBLAST output contains alignments for many rounds (when '-j' option is set in blastpgp command line), and we are interested in only the last round
	#three-argument open with a lexical handle: the file name is taken literally and the handle cannot clash with a global one
	open (my $fh, '<', $inputfile) or confess "Error: cannot read from PSIBLAST output file $inputfile: $!";
	while (my $line = <$fh>) {
		$line =~ m/^Results from round (\d+)\s+/ and $max_round = $1;		#first read tells us the number of the last round reported
	}
	close ($fh);

	#second pass: start again from the top and skip to the section of that round (no skipping when the file has no round headers)
	open ($fh, '<', $inputfile) or confess "Error: cannot read from PSIBLAST output file $inputfile: $!";
	if ($max_round) {
		while (my $line = <$fh>) {
			$line =~ m/^Results from round $max_round\s+/ and last;		#reach section for a particular (the last) round
		}
	}

	#read the start and end of the alignment (sometimes alignments start from the middle of the target sequence and end before the tail of the target sequence).
	while (my $line = <$fh>) {
		if ($line =~ m/^QUERY\s+(\d+)\s+\S+\s+(\d+)/) {
			my ($block_start, $block_end) = ($1, $2);
			$align_start ||= $block_start;				#keep the start of the first QUERY line
			$align_end ||= $block_end;
			$align_end < $block_end and $align_end = $block_end;	#keep the largest end seen so far
		}
	}
	close ($fh);
	return ($align_start, $align_end);
}

#now process the generated klist file, and add missing amino acid information into the alignment
#  $inputfile  : klist file (one '<id>\t<alignment>' record per line) covering only positions $align_start..$align_end of the target
#  $outputfile : klist file to be written
#  $align_start, $align_end : 1-based first and last target position covered by the alignment
#The target record is extended with the residues of $targetseq that are missing at its head and tail; every other record is
#extended with the same number of '-' characters, so that all records keep the same length.
#Reads the file-level variables $targetid, $targetseq and $name. Dies (confess) on unreadable/unwritable files and on malformed records.
sub completePartialKlist {
	my ($inputfile, $outputfile, $align_start, $align_end) = @_;
	my $head_length = $align_start - 1;
	my $tail_length = length ($targetseq) - $align_end;
	$tail_length > 0 or $tail_length = 0;		#never read beyond the end of the target sequence

	#The caller replaces the id 'QUERY' by the --name value (or by the target id) before calling this subroutine, so the target
	#record has to be recognised under that label as well; testing for 'QUERY' alone would pad the target with gaps like a homolog.
	my $target_label = defined $name ? $name : $targetid;
	#Only the first matching record is treated as the target, so that a homolog carrying the same id is still padded with gaps.
	#NOTE(review): this assumes that the query record precedes the records of the hits in the klist file - confirm against convert_align_to_klist.pl
	my $seen_target = 0;

	open (my $klist, '<', $inputfile) or confess "Error: cannot read from input file $inputfile: $!";
	open (my $newklist, '>', $outputfile) or confess "Error: cannot write to output file $outputfile: $!";
	while (my $record = <$klist>) {
		$record =~ s/[\r\n]+$//;
		$record =~ m/^([^\t]+)\t([^\t]+)$/ or confess "Error: invalid record found in klist file $inputfile: <$record>";
		my ($id, $alignment) = ($1, $2);
		my ($head, $tail);
		if (not $seen_target and ($id eq 'QUERY' or $id eq $target_label)) {
			$seen_target = 1;
			$id eq 'QUERY' and $id = $targetid;		#a record still called QUERY is written under the target id
			$head = substr ($targetseq, 0, $head_length);
			$tail = $tail_length ? substr ($targetseq, $align_end, $tail_length) : '';
		} else {
			$head = '-' x $head_length;
			$tail = '-' x $tail_length;
		}
		print $newklist $id, "\t", $head, $alignment, $tail, "\n";
	}
	close ($klist);
	close ($newklist) or confess "Error: cannot finish writing output file $outputfile: $!";		#buffered write errors only show up here
}

=head1 SYNOPSIS

 generate_multi_align.pl [arguments] <input-fasta-file>

 Optional arguments:
 	-v, --verbose			use verbose output
 	-h, --help			print help message
 	-m, --man			print complete documentation
	-s, --similar <string>		fasta file containing sequences similar to query
	-o, --outputfile <string>	name of output file (default is $prefix.klist)
	-n, --name <string>		name of the query in the final alignment file
	    --method <string>		alignment method: 'sam', 'clustalw', 'psiblast' (default) or 'psiblast-muscle'
	    --prefix <string>		prefix for the temporary files
	    --path <string>		add the path to search for executable
 	    --pdbfile <string>		the PDB file for the query sequence
 	    --keep_temp			keep the temporary files
	    --arg1 <string>		optional argument for the 'target2k' program
	    --arg2 <string>		optional argument for the 'clustalw' program
	    --arg3 <string>		optional argument for the 'blastpgp' program

Function: given a sequence in FASTA format and optionally a sequence 
file containing similar sequences, generate multiple alignments 
containing this sequence using various methods.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--outputfile>

name of output file. By default a name such as "rand8912.klist" will 
be used.

=item B<--name>

name of the query in the final alignment file. By default the name is the string 
after ">" in FASTA file, but you can change it by this argument so that the 
final alignment file contains the query sequence under another name.

=item B<--similar>

a FASTA file containing similar sequences to the target sequence. When 
this option is set, this program will not request SAM or PSI-BLAST to 
search sequence database for similar sequences. Instead, all sequences 
in this FASTA file are assumed to be homologous to the target, and 
their alignment will be built.

=item B<--method>

the alignment method to be used: 'sam', 'clustalw', 'psiblast' 
(default) or 'psiblast-muscle'. SAM is an HMM-based method for alignment 
that is relatively slow, but it can produce more accurate alignments and 
can be more sensitive in finding homologs than other methods. Clustalw is 
a method based on pairwise comparison and tree building, and it is 
reasonably fast for a small number (<300) of sequences. For the clustalw 
method, you have to specify a group of sequences similar to the target 
sequence, since the method cannot be used to search sequence 
databases. psiblast is a fast iterative database searching method that 
finds homologs through a Position-Specific Scoring Matrix (PSSM) and 
builds a multiple alignment by stacking pairwise alignments of the 
target and all other proteins. psiblast-muscle runs the same PSI-BLAST 
procedure and then refines the resulting alignment with the muscle 
program.

=item B<--prefix>

the prefix for temporary files used in the execution of this program. 
By default the prefix is something like "rand2892" in current directory. In 
certain cases one cannot write temporary files to the current 
directory so you can specify "-prefix 'other_directory/other_prefix'" 
argument. The "--keep_temp" argument can be used to specify whether 
keep or delete these temporary files after the program finishes.

=item B<--path>

specify additional path to be searched for external programs. This 
program requires several external programs to run, such as the 
clustalw program when --method is specified as 'clustalw'. Sometimes 
these external programs are not in the system $PATH variable, and you 
can use this argument so that the program can still execute the 
external programs without generating "No such file or directory" 
error. For example, you can use "--path 
'/usr/clustalw_dir:/usr/blast_dir'" argument to specify the path to 
the clustalw and blastpgp programs.

=item B<--pdbfile>

specify the PDB file to be used for generating multiple structure 
alignment. If the PDB file contains chain breaks (in other word, if 
the ATOM records in the PDB file does not exactly correspond to the 
sequence in the inputfastafile), then the resulting alignments should 
be handled with caution!

=item B<--keep_temp>

keep temporary files and do not delete them after program finishes. 
These temporary files can be used to debug or diagnose any potential 
problem during the program execution process.

=item B<--arg1>

optional argument for the 'target2k' program in the SAM program suite 
when the --method is specified as "sam". Some of the highly 
recommended arguments include '-db_size', which significantly speeds up 
execution when a large sequence database is used. You can also 
specify '-family' argument so the homologs have a similarity level to 
target that is in SCOP family level rather than SCOP superfamily 
level. For example, "--arg1 '-db_size 1000 -family'" can be used in 
the command line.

=item B<--arg2>

optional argument for the "clustalw" program when the --method is 
specified as "clustalw". This is usually not necessary to specify.

=item B<--arg3>

optional argument for the "blastpgp" program when the --method is 
specified as "psiblast" or "psiblast-muscle". For example, you can use 
'-e 0.01' to specify the statistical significance of hits found by 
PSI-BLAST, or use '-d /lib/est' to specify the BLAST database to be 
searched (by default PSI-BLAST uses the 'nr' database in a path 
specified by the environment variable BLASTDB).

=back

=head1 DESCRIPTION

This program generates multiple alignments for a given sequence or structure 
(referred to as "target") using various methods, by calling external alignment 
programs and reformatting their outputs. The homologs of the target can be found 
by searching a sequence or a structure database, or can be specified by the user in 
a FASTA file through the --similar argument. Use of this program assumes that 
the external programs are installed and configured correctly in the system. This 
program will generate an output file in klist alignment format. Klist is an 
extremely simple format that contains one record per line, and each record 
consists of an ID and an alignment separated by a tab.

Several hints and tips that might be helpful for users are explained:

(1) When the --method argument is set to "sam", this program will call an 
external program called target2k (a program in the SAM-T2K program suite) to 
find homologous sequences from a sequence database and generate multiple 
alignments. We assume that you have the SAM program suite installed in your system 
and everything is configured correctly. When the --similar argument is set, the 
target2k program will simply generate alignments using the FASTA file specified 
by the --similar argument. Otherwise, the target2k program will search the 
default 'nr' BLAST database file (or another database file specified by the 
--arg1 '-db other_database' argument) and find homologous sequences, which takes 
quite a lot of time. Personally, I recommend using the UniRef90 (UniProt 
Non-redundant Reference databases) database, which can be downloaded from 
ftp://ftp.expasy.org/databases/uniprot/uniref/. As of UniProt Knowledgebase 
Release 6.7 (20-Dec-2005), this database contains about 1,879,637 sequences 
(collected from the Swiss-Prot, TrEMBL and PIR-PSD databases and filtered by 
90% sequence identity).

(2) When using the "sam" alignment method, please notice that "the default 
parameters (for target2k) have been adjusted to give good performance at 
recognizing sequences related at the superfamily level of the SCOP database". If 
your goal is to build alignments from very homologous sequences (rather than 
distantly related sequences), then this is not a good method to use. Instead, 
you can try to use the --similar argument and provide a FASTA file with very 
similar sequences.

(3) When using the PSI-BLAST method, we also assume that the system is configured 
correctly. Try to run a blastpgp command yourself and see what happens. You can 
set up the BLASTDB environment variable so that you do not have to enter the nr 
database every time. Otherwise, you can always specify the location of the nr 
database or a different database when you run the program, using the 
--arg3 '-d /usr/lib/ecoli.aa' option.

  An example usage:
generate_multi_align.pl --method sam -arg1 '-family -db_size 5000 -db /usr/lib/ecoli.aa' oneseq.fasta

The above command requests to generate multiple alignments for oneseq.fasta with 
all its very homologous sequences (similarities in the SCOP family level) found 
by SAM in the /usr/lib/ecoli.aa database.

  Another example usage:
generate_multi_align.pl --method clustalw -similar tim30.fasta oneseq.fasta

The above command requests to generate multiple alignments for the combined 
sequences in the oneseq.fasta and tim30.fasta files using the clustalw program.
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
=cut