#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

my ($verbose, $help, $man);
my ($na_score, $fastafile, $pdbfile, $chain);
GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'na_score=f'=>\$na_score, 'fastafile=s'=>\$fastafile, 'pdbfile=s'=>\$pdbfile, 'chain=s'=>\$chain) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 2 or pod2usage ("Syntax error");

my ($inputsigfile, $outputsigfile) = @ARGV;
$na_score ||= 'N/A';
defined $chain or $chain = 'first';
$fastafile or $pdbfile or pod2usage ("Error in argument: you must specify either --fastafile or --pdbfile");
$fastafile and $pdbfile and pod2usage ("Error in argument: you must specify either --fastafile or --pdbfile but not both");

my ($seq2, $signature, $otherinfo) = readSignatureFile ($inputsigfile);
my $fh;				#file handle to read sequence from
if ($pdbfile) {
	open ($fh, "pdbseq.pl $pdbfile -c $chain |") or confess "Error: unable to retrieve sequence from PDB file $pdbfile: $!";
} else {
	open ($fh, $fastafile) or confess "Error: unable to read from fasta file $fastafile: $!";
}
my ($seq1_id, $seq1) = readTargetSeq ($fh);
close ($fh);
my ($index, $posaa) = extractNongapFromSeq ($seq1, $seq2);	#we need to extract index to seq2, so that we can generate a signature for seq1

open (OUTPUT, ">$outputsigfile") or confess "Error: cannot write to output signature file $outputsigfile: $!";
for my $i (0 .. @$index-1) {
	print OUTPUT $i+1, '_', $posaa->[$i], "\t", $index->[$i] eq 'N/A'?$na_score:$signature->[$index->[$i]], $otherinfo?($index->[$i] eq 'N/A'?'':$otherinfo->[$index->[$i]]):'', "\n";
}
close (OUTPUT);

sub readSignatureFile {
	my ($scorefile) = @_;
	my ($seq, @score, @otherinfo);
	open (SCORE, $scorefile) or confess "Error: cannot read from score file $scorefile: $!";
	while (<SCORE>) {
		s/[\r\n]+$//;
		/^\-?\d*\w?_?(\w)\t([\d\.\-e]+|N\/A)(.*)/ or confess "Error: score file contains unrecognizable records: <$_>";	#sometimes the line could be "188A_K  -153.13", or "-1_A  3.2", due to alternative residue numbering
		my ($aa, $score, $otherinfo) = ($1, $2, $3);
		$seq .= $aa;
		$score eq 'N/A' and $score = $na_score;		#when 'N/A' annotation is given for a position, use the $na_score as the score
		push @score, $score;
		push @otherinfo, $otherinfo;
	}
	close (SCORE);
	return ($seq, \@score, \@otherinfo);
}

#this subroutine is used to read the target sequence from the provided FASTA file. It is the first sequence encountered in the FASTA file.
sub readTargetSeq {
	my ($fh) = @_;
	my ($targetid, $targetseq);
	while (<$fh>) {
		/^>(\S+)/ and $targetid = $1 and last;
	}
	while (<$fh>) {
		/^>/ and last;
		$targetseq .= $_;
	}
	$targetid and $targetseq or confess "Error: cannot find sequence ID from fasta file $fastafile\n";
	$targetseq =~ s/\s//g;		#eliminate space and return from targetseq
	$targetseq =~ s/\.//g;		#eliminate gap characters from targetseq
	$targetseq =~ s/\-//g;		#eliminate gap characters from targetseq
	$targetseq = uc $targetseq;	#change targetseq to upper case
	$targetseq =~ m/([^A-Z])/ and confess "Error: invalid character in target sequence $targetid: <$1> in $targetseq";
	return ($targetid, $targetseq);
}

###########################################################
#THE FOLLOWING SUBROUTINE IS DIRECTLY COPIED FROM SCOREAA_FROM_ALIGN.PL WITHOUT ANY MODIFICATION
###########################################################

#given a partial sequence for a protein (may contain breaks) and the real full sequence for the protein, extract indexes that correspond to non-gap characters and extract amino acid for each index
#these are actually common situations that needs extra care: 
#(1) many times PDB files does not contain structures for all amino acids for a given protein, so structural alignments do not contain these amino acids as well.
#(2) many times PSI-BLAST output does not contain all amino acids for a given sequence and misses either head or tail.
#(3) many times multiple alignments for a given protein is available, but we are interested in a variant of the give protein that has several amino acid differences.
#to solve this problem, I think the best way is to generate a Needleman-Wuntch alignment between the two very similar sequences, then analyze the alignment.
sub extractNongapFromSeq {
	my ($orig_seq1, $orig_seq2) = @_;
	my ($seq1, $seq2) = ($orig_seq1, $orig_seq2);
	$verbose and print STDERR "NOTICE: generating alignments between <$seq1> and <$seq2>\n";

	#both orig_seq1 and orig_seq2 are extracted from certain alignments, so they may contain many gap characters
	#we are interested in adjusting the alignments in orig_seq2 (THIS IS IMPORTANT TO USE THIS SUBROUTINE!)
	#process seq1 and seq2 so that they do not contain non-alphabetic characters
	$seq1 =~ s/\s//g;		#discard space and return characters
	$seq2 =~ s/\s//g;
	$seq1 =~ s/\.|\-//g;		#discard gap characters
	$seq2 =~ s/\.|\-//g;
	$seq1 = uc $seq1;		#change to upper case
	$seq2 = uc $seq2;
	$seq1 =~ m/([^A-Z])/ and confess "Error: Non-alphabetic character <$1> found in sequence";
	$seq2 =~ m/([^A-Z])/ and confess "Error: Non-alphabetic character <$1> found in sequence";

	#generate two FASTA file and call the external program 'needle' to align these two sequences
	my $prefix = 'rand' . substr (rand (), 2, 4);
	open (SEQ, ">$prefix.seq1") or confess "Error: cannot write to $prefix.seq1: $!";
	print SEQ ">seq1\n$seq1\n";
	open (SEQ, ">$prefix.seq2") or confess "Error: cannot write to $prefix.seq2: $!";
	print SEQ ">seq2\n$seq2\n";
	close (SEQ);
	system ("needle -asequence $prefix.seq1 -bsequence $prefix.seq2 -aformat markx10 -auto 1 -stdout 1 > $prefix.needle_output") and confess "Error running 'needle' program: $?";

	#analyze the needle ealignment, which has a pseudo-FASTA format (by the '-aformat markx10' option in command line
	(my $FastaStream = $0) =~ s|[^\\\/]+$|FastaStream.pm|;
	eval {require $FastaStream};
	$@ and confess "Error loading the FastaStream module required for this program:\n\tPlease make sure that the file $FastaStream exists!\n";
	my (@alignseq);
	my $alignio = FastaStream->new (-file=>"$prefix.needle_output");
	while (my $alignobj = $alignio->next_seq) {
		$alignobj->{desc} =~ m/^>seq\d/ or next;
		$alignobj->{orig_seq} =~ m/^; sq_len: \d+\n; sq_type: p\n; al_start: (\d+)\n; al_stop: (\d+)\n; al_display_start: (\d+)\n(.+)/s or confess "Error: invalid pattern $alignobj->{orig_seq}";
		my ($al_start, $al_stop, $al_seq) = ($1, $2, $4);
		$al_seq =~ s/\n\n.*//s;		#there are some comment two lines after the alignment for the second protein, so delete the comments
		$al_seq =~ s/\n//g;		#discard any blank or return characters in the alignment
		push @alignseq, $al_seq;
	}
	@alignseq == 2 or confess "Error: needle alignment file $prefix.needle_output contains alignments for more than two sequences";
	length ($alignseq[0]) == length ($alignseq[1]) or confess "Error: length discordance found in alignment file $prefix.needle_output: <$alignseq[0]> and <$alignseq[1]>";
	
	#process alignment and extract indexes for interested positions
	my @align1 = split (//, $alignseq[0]);		#real full sequence
	my @align2 = split (//, $alignseq[1]);		#sequence extracted from multiple alignments and may miss some amino acids
	my @orig_seq2 = split (//, $orig_seq2);		#we are interested in extracting alignment indexes from orig_seq2, which is a line from multiple alignments
	my $pointer = -1;				#pointer points to alignment positions in the orig_alignment
	my @index;					#stores the important indexes in the orig_seq2 alignment (-1 for insertion)
	my @posaa;					#stores the amino acid identity for the corresponding position

	for my $i (0 .. @align1-1) {
		if ($align1[$i] ne '-' and $align2[$i] ne '-') {		#a normal aligned position
			$pointer++;
			$pointer++ while ($orig_seq2[$pointer] eq '-');		#move pointer to a non-gap position in sequence in orig_alignment
			push @index, $pointer;
			push @posaa, $align1[$i];
		} elsif ($align1[$i] ne '-' and $align2[$i] eq '-') {		#break found in sequence in orig_alignment
			push @index, "N/A";
			push @posaa, $align1[$i];
		} elsif ($align1[$i] eq '-' and $align2[$i] ne '-') {		#rare situation (extra amino acid in sequence in orig_alignment): do nothing
			$pointer++;
			$pointer++ while ($orig_seq2[$pointer] eq '-');
		} else {
			confess "FATAL ERROR: both aligned position in $prefix.needle_output is gap";
		}
	}
	@index == length ($seq1) or confess "Error: length discordance between <$seq1> and size of \@index ${\(scalar @index)}";
	unlink ("$prefix.seq1", "$prefix.seq2", "$prefix.needle_output");
	return (\@index, \@posaa);
}




=head1 SYNOPSIS

 modify_signature.pl [arguments] <input-sig-file> <output-sig-file>

 Optional arguments:
 	-v, --verbose			verbose output level 0(default), 1, 2 or 3
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-n, --na_score <float>		the score for N/A residue (residue without score)
 	-f, --fastafile <string>	the FASTA file that contains the sequence for new signature
 	-p, --pdbfile <string>		the PDB file that contains the sequence for new signature
 	-c, --chain <letter>		the PDB chain (default: first) that contains the sequence

Function: given a functional signature file, based on a given FASTA file or PDB 
file (the protein sequence in them may be slightly different from that in the 
signature file), generate a new signature file that correspond to the amino acid 
sequence in either the FASTA file or the PDB file.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--na_score>

the scores for residues that do not have a score (because such residues cannot 
find a match in the given signature file). By default 'N/A' will be used in the 
new signature file for these residues

=item B<--fastafile>

specify a given sequence file in FASTA format, and the new signature file will 
be based on the first sequence in the fastafile (each residue in the new 
sequence will have a score in the output signature file.

=item B<--pdbfile>

specify a given structure file in PDB format, and the new signature file will be 
based on the ATOM records in the pdbfile (each ATOM residues in the new 
structure will have a score in the output signature file). By default the 
sequence for the first chain in the structure file will be used, unless the --
chain argument is used.

=item B<--chain>

specify the chain in the pdbfile from which the new signature file will be based 
on. By default the 'first' chain encountered in the PDB file will be used.

=back

=head1 DESCRIPTION

This program is used to modify functional signature file and generate an output 
signature file that corresponds to the sequence in a given sequence file or 
structure file. A functional signature file contains one line for each residue, 
and each line is composed of tab-delimited fields: a residue identifier (such as 
100_A), a score, and optionally other information.

Usually the sequences in the input signature file is different from the 
sequences in the given sequence or structure file. For example, the input 
signature file may contain the scores for 200 residues in a certain protein. 
However, the structure for the protein only contains 180 residues (the first M 
residue is missing, and the structure contains a 19-residue chain break). In 
such cases, we can use the program to generate a new signature file that 
contains 180 scores for each of the ATOM residues in the pdbfile.

In some other times, we may have the functional signature for one protein, but 
we want to know the signature for another related protein. This program will try 
to align the two proteins via global sequence alignment, and then analyze the 
residue correspondence between the two proteins and output a new signature file. 
We will then gain some insights into which residues in the new protein might be 
functionally important.

The output of this program may be coupled with modify_pdb.pl program to map the 
new structure-based signature to a given structure. You can then use various 
molecular graphics software to visualize the functionally important residues in 
the new structure, and such analysis has been proven to be quite useful.

=cut
                                                                                                                                                                                                                                     