#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

my ($verbose, $help, $man);
my ($method, $seqfile, $seqid, $outputfile, $keep_temp, $normalize, $detail, $exclude_gap, $relative_entropy, $background_frequency_file, $facindex);
my ($alignfile);
GetOptions('verbose'=>\$verbose, 'help'=>\$help, 'man|m'=>\$man, 'method=s'=>\$method, 'seqfile=s'=>\$seqfile, 'seqid=s'=>\$seqid,
	'outputfile=s'=>\$outputfile, 'keep_temp'=>\$keep_temp, 'normalize'=>\$normalize, 'detail'=>\$detail, 'exclude_gap!'=>\$exclude_gap,
	'relative_entropye'=>\$relative_entropy, 'background_frequency_file=s'=>\$background_frequency_file, 'facindex=i'=>\$facindex) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");

($alignfile) = @ARGV;
if (not defined $method) {
	print STDERR "NOTICE: the default scoring method is the 'entropy' method (use --method to change)\n";
	$method = 'entropy';
}


if (not defined $exclude_gap) {
	$exclude_gap = 1;
}
$relative_entropy and $exclude_gap || pod2usage ("Error: the --exclude_gap argument must be set for --relative_entropy calculation");


$facindex ||= 1;
$facindex >= 1 and $facindex <= 5 or pod2usage ("Error in argument: the --facindex must be between 1 and 5");

#extract score indexes from alignment positions that correspond to a given protein (i.e., discard alignment columns that have gaps for a given protein)
my ($index, $posaa);		#index: non-gap index in alignment for target sequence; posaa: amino acid sequence of target sequence
if ($seqfile) {
	my ($targetid, $targetseq) = readTargetSeq ($seqfile);
	my $targetalign = readTargetAlign ($alignfile, $targetid) or confess "Error: target ID <$targetid> not found in alignment file $alignfile";
	($index, $posaa) = extractNongapFromSeq ($targetseq, $targetalign);
} elsif ($seqid) {
	my $targetalign = readTargetAlign ($alignfile, $seqid) or confess "Error: sequence ID <$seqid> not found in alignment file $alignfile";
	($index, $posaa) = extractNongapFromAlign ($targetalign);
}


#calculate scores for each alignment positions.
#For SSR method, scores are stored in $state and $step variables, which are both references to an array.
#For entropy method, scores are stored in entropy variable, which is reference to an array.
#alignaa is a string containing all the aligned amino acid in a given column
my ($state, $step, $score, $alignaa, $allprob);
if ($method eq 'ssr') {
	$verbose and print STDERR "NOTICE: generating trees and calculating SSR scores for $alignfile\n";
	($state, $step) = scoreSSR ($alignfile);
	$score = [];
	for my $i (0 .. @$state-1) {
		$score->[$i] = $state->[$i]/($step->[$i]+1);
	}
	$normalize and $score = normalizeScore ($score);
	$outputfile ||= "$alignfile.ssr";
	if ($detail) {
		outputScore ($index, $posaa, $score, $state, $step);		#when --detail argument is set, print additional information on state and step
	} else {
		outputScore ($index, $posaa, $score);
	}
} elsif ($method eq 'entropy') {
	$verbose and print STDERR "NOTICE: calculating entropy scores for $alignfile\n";
	($score, $state, $alignaa) = scoreEntropy ($alignfile);
	$normalize and $score = normalizeScore ($score, $relative_entropy?1:(-1));			#-1 make sure that higher score indicating functionally more important residue
	if ($relative_entropy) {
		$outputfile ||= "$alignfile.relent";
	} else {
		$outputfile ||= "$alignfile.entropy";
	}
	if ($detail) {
		outputScore ($index, $posaa, $score, $state, $alignaa);		#when --detail argument is set, print additional information on number of types of amino acid in each column
	} else {
		outputScore ($index, $posaa, $score);
	}
} elsif ($method eq 'scorecons') {
	$verbose and print STDERR "NOTICE: calculating positional scores for $alignfile using the scorecons program\n";
	($score) = scoreScorecons ($alignfile);
	$normalize and $score = normalizeScore ($score, 1);
	$outputfile ||= "$alignfile.scorecons";
	outputScore ($index, $posaa, $score);
} elsif ($method eq 'hmmentropy') {
	$verbose and print STDERR "NOTICE: generating HMM model and calculating the entropy scores for the emission probability\n";
	($score, $allprob) = scoreHmmentropy ($alignfile);
	$normalize and $score = normalizeScore ($score, $relative_entropy?1:(-1));			# -1 make sure that higher score indicating functionally more important
	if ($relative_entropy) {
		$outputfile ||= "$alignfile.hmmrelent";
	} else {
		$outputfile ||= "$alignfile.hmmentropy";
	}
	if ($detail) {
		outputScore ($index, $posaa, $score, $allprob);
	} else {
		outputScore ($index, $posaa, $score);
	}
	$keep_temp or unlink ("$alignfile.fasta", "$alignfile.hmmer");			#delete temporary files
} elsif ($method eq 'quantauc') {
	$verbose and print STDERR "NOTICE: calculating quantitative distribution differences based on area-under-curve measure\n";
	($score) = scoreConservation ($alignfile);
	$normalize and $score = normalizeScore ($score);
	$outputfile ||= "$alignfile.quantauc$facindex";
	outputScore ($index, $posaa, $score);
} elsif ($method eq 'ksmirnov') {
	1;
} else {
	pod2usage ("Error in argument: the specified --method argument '$method' is not supported by the current program.");
}

print STDERR "NOTICE: SCORING FILE $outputfile WRITTEN SUCCESSFULLY!\n";

#Given a fasta file, read the first protein and return the protein ID and the sequence
sub readTargetSeq {
	my ($fastafile) = @_;
	my ($targetid, $targetseq);
	open (FH, $fastafile) or confess "Error opening fasta file $fastafile: $!";
	while (<FH>) {
		/^>(\S+)/ and $targetid = $1 and last;
	}
	while (<FH>) {
		/^>/ and last;
		$targetseq .= $_;
	}
	close (FH);
	$targetid and $targetseq or confess "Error: cannot find sequence ID from fasta file $fastafile\n";
	$targetseq =~ s/\s//g;		#eliminate space and return from targetseq
	$targetseq =~ s/\.//g;		#eliminate gap characters from targetseq
	$targetseq =~ s/\-//g;		#eliminate gap characters from targetseq
	$targetseq = uc $targetseq;	#change targetseq to upper case
	$targetseq =~ m/([^A-Z])/ and confess "Error: invalid character in target sequence $targetid: <$1> in $targetseq";
	return ($targetid, $targetseq);
}

#Given a klist alignmen file and an ID, return the corresponding alignment for the ID
sub readTargetAlign {
	my ($alignfile, $seqid) = @_;
	my ($found_target, $targetalign);
	open (KLIST, $alignfile) or confess "Error: cannot read from alignfile $alignfile: $!";
	while (<KLIST>) {
		s/[\r\n]+$//;
		m/^([^\t]+)\t(\S+)$/ or confess "Error: invalid record in alignfile $alignfile: <$_>";
		if ($seqid eq $1) {
			$found_target and confess "Error: seqid $seqid occurs twice in alignfile $alignfile";
			$found_target = 1;
			$targetalign = $2;
		}
	}
	return $targetalign;
}

#print out the scores in a tab-delimited format with the first two column as position and score, respectively
sub outputScore {
	my ($index, $posaa, $score, @other_info) = @_;		#index and posaa may be 'undef', when --seqfile and --seqid are not specified
	my @index_print;
	if (defined $index) {
		@index_print = @$index;
	} else {
		@index_print = (0 .. @$score-1);
	}
	$verbose and print STDERR "NOTICE: writting output file $outputfile with two columns representing alignment position and score\n";
	open (OUTPUT, ">$outputfile") or confess "Error: cannot write to outputfile $outputfile: $!";
	for my $i (0 .. @index_print-1) {
		if ($posaa) {
			print OUTPUT $i+1, '_', $posaa->[$i];
		} else {
			print OUTPUT $i+1;
		}
		print OUTPUT "\t", ($index_print[$i] eq 'N/A')?'N/A':$score->[$index_print[$i]];
		if (@other_info) {
			for my $j (0 .. @other_info-1) {
				print OUTPUT "\t", ($index_print[$i] eq 'N/A')?'N/A':$other_info[$j]->[$index_print[$i]];
			}
		}
		print OUTPUT "\n";
	}
	close (OUTPUT);
}

#calculate the SSR score for a given klist alignment file and return the state and step for all alignment positions.
sub scoreSSR {
	my ($alignfile) = @_;
	my (@step, @state, @newstate);
	my (@alignaa, $num_align_pos);

	#reading alignment file (in klist format) for number of states in each alignment position
	open (ALIGN, $alignfile) or confess "Error: cannot open alignment file\n";
	while (<ALIGN>) {
		s/[\r\n]+$//;
		/^[^\t]+\t([^\t]+)$/ or confess "Error: invalid alignment record: <$_>";
		@alignaa = split (//, $1);
		$num_align_pos ||= scalar (@alignaa);
		$num_align_pos == @alignaa or confess "Error: different number of alignment positions in alignment file $alignfile:  $num_align_pos versus ${\(scalar @alignaa)}\n";
		for my $i (0 .. @alignaa-1) {
			$state[$i]->{$alignaa[$i]} ++;
		}
	}
	close (ALIGN);	

	#the PHYLIP programs need fixed file names, so we cannot execute several programs simultaneously in the same directory
	#to circumvent this, we have to resort to temporary directories.
	-d "$alignfile.tempdir" or mkdir ("$alignfile.tempdir") or confess "Error: cannot make a temporary directory $alignfile.tempdir";

	system (q{convert_align_from_klist.pl -f phylip -n -o compress -u -c '$$=~s/\./\-/g' } . "$alignfile > $alignfile.tempdir/infile") and confess "Error";

	#change to the lower level directory and perform the tree construction using PHYLIP
	chdir ("$alignfile.tempdir") or confess "Error: cannot change current working directory to $alignfile.tempdir";

	unlink ("outfile", "outtree");						#PHYLIP won't run when these files already exist
	system (qq{echo "3\n4\n6\ny" | protpars >& /dev/null}) and confess "Error running protpars on alignfile $alignfile";		#run the 'protpars' program in PHYLIP program suite


	
	#reading PHYLIP outfile for number of steps in each alignment position
	open (PHYLIP, "outfile") or confess "Error: cannot open phylip outfile: $!";
	while (<PHYLIP>) {
		/^steps in each position/ and last;
	}
	$_ = <PHYLIP> for (1 .. 2);
	while (<PHYLIP>) {
		/\S/ or last;
		m/^\s*\d+!\s*(.*\d)/ or confess "Error: invalid record: <$_>";	#EXTREMELY IMPORTANT TO USE .* INSTEAD OF .+, BECAUSE SOMETIMES THERE IS ONLY ONE DIGIT NUMBER IN THE LINE
		my $tempsteps = $1;						#some times there is no space between steps"   40!   31012 798   3 630   6 332 342 542 306"
		1 while ($tempsteps =~ s/(\d+)(\d{4})/$1 $2/g);			#split long numbers, since the maximum step can be four-digits. The 31012 is actually two numbers: 3 and 1012
		push @step, split (/\s+/, $tempsteps);
	}
	close (PHYLIP);
	
	if (@step == 0) {
		print STDERR "ERROR: Unable to construct phylogenetic trees\n";
		exit (1000);							#exit value of 1000 means that there is no phylogenetic tree
	}
	
	#calculating state-to-step ratio
	@step == @state or confess "Error: different number of step ${\(scalar @step)} and state ${\(scalar @state)}";
	for my $i (0 .. @state-1) {
		push @newstate, scalar (keys %{$state[$i]});
	}

	$keep_temp or unlink ("infile", "outfile", "outtree");
	
	chdir ("..") or confess "Error: cannot change current working directory to .. (upper level)";
	$keep_temp or rmdir ("$alignfile.tempdir") or confess "Error: cannot remove temporary directory";
	
	return (\@newstate, \@step);
}

sub scoreScorecons {
	my ($alignfile) = @_;
	my $prefix = 'rand' . substr (rand (), 2, 4);
	my @score;
	system ("convert_align_from_klist.pl $alignfile -f fasta > $prefix.fasta") and confess "Error running convert_align_from_klist.pl";
	system ("scorecons $prefix.fasta > $prefix.scorecons_output 2> /dev/null") and confess "Error running scorecons";
	open (SCORECONS, "$prefix.scorecons_output") or confess "Error: cannot read from SCORECONS output file: $!";
	while (<SCORECONS>) {
		s/[\r\n]+$//;
		m/^([\d\.]+)\s+\#\s+\S+$/ or confess "Error: invalid record in SCORECONS output: <$_>";
		push @score, $1;
	}
	close (SCORECONS);
	$keep_temp or unlink ("$prefix.fasta", "$prefix.scorecons_output");
	return (\@score);
}

#calculate the entropy score for a given klist alignment file and returns the scores for all alignment positions
sub scoreEntropy {
	my ($alignfile) = @_;
	my (@aamatrix);			#@aamatrix stores amino acid for whole alignment.
	my (@entropy, @state);		#@entropy and @state store entropy score or number of types of amino acid for each column
	my (@alignaa);			#@alignaa store the aligned amino acid of each column in the alignments
	my (%background_frequency);	#the background frequency is used in relative entropy calculation

	if ($background_frequency_file) {
		open (BACK, $background_frequency_file) or confess "Error: cannot read from background frequency file $background_frequency_file: $!";
		my $total_fre = 0;
		while (<BACK>) {
			chomp;
			/\S/ or next;
			/^(\w)\s+([\d\.e\-]+)$/ or confess "Error: invaild record in background frequency file: <$_>";
			$background_frequency{$1} = $2+0;	#I use "$2+0" to convert it to a number. If this is not a number (for exampel: 1e2e3),  then a warning message will be printed.
			$total_fre += $2;
		}
		print STDERR "NOTICE: finished reading background frequencies with total frequencies as $total_fre\n";
		close (BACK);
	} else {
		#from karlin.c as default background frequency	
		%background_frequency = (A=>0.0780474700897585, R=>0.0512953149316987, N=>0.0448725775979007, D=>0.0536397361638076, C=>0.0192460110427568, Q=>0.0426436013507063, E=>0.0629485981204668, G=>0.0737715654561964, H=>0.0219922696262025, I=>0.0514196403000682, L=>0.090191394464413, K=>0.0574383201866657, M=>0.0224251883196316, F=>0.0385564048655621, P=>0.0520279465667327, S=>0.0711984743501224, T=>0.0584129422708473, W=>0.013298374223799, Y=>0.0321647488738564, V=>0.0644094211988074);
	}

	#reading alignment file (in klist format) for number of states in each alignment position
	my ($num_align_pos);
	open (ALIGN, $alignfile) or confess "Error: cannot open alignment file\n";
	while (<ALIGN>) {
		s/[\r\n]+$//;
		/^([^\t]+)\t([^\t]+)$/ or confess "Error: invalid alignment record: <$_>";
		push @aamatrix, [split (//, $2)];
		$num_align_pos ||= length ($2);
		$num_align_pos == length ($2) or confess "Error: different number of alignment positions in alignment file $alignfile:  $num_align_pos versus ${\(length $2)}\n";
	}
	close (ALIGN);
	
	for my $i (0 .. $num_align_pos-1) {
		my @column = ();
		my %count;
		my $entropy;
		for my $j (0 .. @aamatrix-1) {
			if ($aamatrix[$j]->[$i] eq '-') {
				$exclude_gap and next;	#do not consider gap as an character (when --relative_entropy is set, this option should be set, too)
			} else {
				$background_frequency{$aamatrix[$j]->[$i]} or print STDERR "WARNING: Skipping unknown amino acid encountered in alignment file $alignfile: <$aamatrix[$j]->[$i]>\n" and next;
			}
			push @column, $aamatrix[$j]->[$i];			#@column contains all amino acid for a given aligned column
		}
		@column or print STDERR "WARNING: No valid amino acid found for an aligned column ${\($i+1)}. Entropy is N/A\n";
		foreach my $aa (@column) {
			$count{$aa}++;						#the occurance of each amino acid
			$alignaa[$i] .= $aa;					#the aligned amino acid of this column
		}
		foreach my $aa (keys %count) {
			if ($relative_entropy) {
				if ($aa ne '-') {
					$entropy += ($count{$aa}/@column) * log ($count{$aa}/@column/$background_frequency{$aa}) / log (2);
				} else {
					1;					#there is nothing we can do here for gaps (there is no background frequency for gaps)
				}
			} else {
				$entropy += - ($count{$aa}/@column) * log ($count{$aa}/@column) / log (2);		#entropy = - sum (p * log p / log 2)
			}
		}
		push @entropy, (defined $entropy)? $entropy : 'N/A';
		push @state, scalar (keys %count);
	}
	return (\@entropy, \@state, \@alignaa);
}

#calculate conservation scores by several different methods that share the some protocols
sub scoreConservation {
	my ($alignfile) = @_;
	my (@aamatrix);			#@aamatrix stores amino acid for whole alignment.
	my (@entropy, @state);		#@entropy and @state store entropy score or number of types of amino acid for each column
	my (@alignaa);			#@alignaa store the aligned amino acid of each column in the alignments
	my (%background_frequency);	#the background frequency is used in relative entropy calculation

	if ($background_frequency_file) {
		open (BACK, $background_frequency_file) or confess "Error: cannot read from background frequency file $background_frequency_file: $!";
		my $total_fre = 0;
		while (<BACK>) {
			chomp;
			/\S/ or next;
			/^(\w)\s+([\d\.e\-]+)$/ or confess "Error: invaild record in background frequency file: <$_>";
			$background_frequency{$1} = $2+0;	#I use "$2+0" to convert it to a number. If this is not a number (for exampel: 1e2e3),  then a warning message will be printed.
			$total_fre += $2;
		}
		print STDERR "NOTICE: finished reading background frequencies with total frequencies as $total_fre\n";
		close (BACK);
	} else {
		#from karlin.c as default background frequency	
		%background_frequency = (A=>0.0780474700897585, R=>0.0512953149316987, N=>0.0448725775979007, D=>0.0536397361638076, C=>0.0192460110427568, Q=>0.0426436013507063, E=>0.0629485981204668, G=>0.0737715654561964, H=>0.0219922696262025, I=>0.0514196403000682, L=>0.090191394464413, K=>0.0574383201866657, M=>0.0224251883196316, F=>0.0385564048655621, P=>0.0520279465667327, S=>0.0711984743501224, T=>0.0584129422708473, W=>0.013298374223799, Y=>0.0321647488738564, V=>0.0644094211988074);
	}

	#reading alignment file (in klist format) for number of states in each alignment position
	my ($num_align_pos);
	open (ALIGN, $alignfile) or confess "Error: cannot open alignment file\n";
	while (<ALIGN>) {
		s/[\r\n]+$//;
		/^([^\t]+)\t([^\t]+)$/ or confess "Error: invalid alignment record: <$_>";
		push @aamatrix, [split (//, $2)];
		$num_align_pos ||= length ($2);
		$num_align_pos == length ($2) or confess "Error: different number of alignment positions in alignment file $alignfile:  $num_align_pos versus ${\(length $2)}\n";
	}
	close (ALIGN);
	
	for my $i (0 .. $num_align_pos-1) {
		my @column = ();
		my %count;
		my $score;
		for my $j (0 .. @aamatrix-1) {
			if ($aamatrix[$j]->[$i] eq '-') {
				$exclude_gap and next;	#do not consider gap as an character (when --relative_entropy is set, this option should be set, too)
			} else {
				$background_frequency{$aamatrix[$j]->[$i]} or print STDERR "WARNING: Skipping unknown amino acid encountered in alignment file $alignfile: <$aamatrix[$j]->[$i]>\n" and next;
			}
			push @column, $aamatrix[$j]->[$i];			#@column contains all amino acid for a given aligned column
		}
		@column or print STDERR "WARNING: No valid amino acid found for an aligned column ${\($i+1)}. Entropy is N/A\n";
		foreach my $aa (@column) {
			$count{$aa}++;						#the occurance of each amino acid
			$alignaa[$i] .= $aa;					#the aligned amino acid of this column
		}
		
		if ($method eq 'entropy') {
			if ($relative_entropy) {
				$score = scoreEntropy (\%count, \%background_frequency);
			} else {
				$score = scoreEntropy (\%count);
			}
		} elsif ($method eq 'quantauc') {
			$score = scoreQuantAUC (\%count, \%background_frequency);
		} elsif ($method eq 'ksmirnov') {
			$score = scoreKSmirnov (\%count, \%background_frequency);
		}
		
		push @entropy, (defined $score)? $score : 'N/A';
		push @state, scalar (keys %count);
	}
	return (\@entropy, \@state, \@alignaa);
}

sub scoreEnt {
	my ($count, $background_frequency) = @_;
	my $num_type = scalar (keys %$count);
	my $score;
	for my $aa (keys %$count) {
		if ($background_frequency) {
			$score += ($count->{$aa}/$num_type) * log ($count->{$aa}/$num_type/$background_frequency->{$aa}) / log (2);
		} else {
			$score += - ($count->{$aa}/$num_type) * log ($count->{$aa}/$num_type) / log (2);
		}
	}
	return $score;
}

sub scoreQuantAUC {
	my ($aa_count) = @_;
	my @aa20 = qw/       A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y/;
	my @factor1 = qw/-0.591  -1.343  1.050   1.357   -1.006  -0.384  0.336   -1.239  1.831   -1.019  -0.663  0.945   0.189   0.931   1.538   -0.228  -0.032  -1.337  -0.595  0.260/;
	my @factor2 = qw/-1.302  0.465   0.302   -1.453  -0.590  1.652   -0.417  -0.547  -0.561  -0.987  -1.524  0.828   2.081   -0.179  -0.055  1.399   0.326   -0.279  0.009   0.830/;
	my @factor3 = qw/-0.733  -0.862  -3.656  1.477   1.891   1.330   -1.673  2.131   0.533   -1.505  2.219   1.299   -1.628  -3.005  1.502   -4.760  2.213   -0.544  0.672   3.097/;
	my @factor4 = qw/1.570   -1.020  -0.259  0.113   -0.397  1.045   -1.474  0.393   -0.277  1.266   -1.005  -0.169  0.421   -0.503  0.440   0.670   0.908   1.242   -2.128  -0.838/;
	my @factor5 = qw/-0.146  -0.255  -3.242  -0.837  0.412   2.064   -0.078  0.816   1.648   -0.912  1.212   0.933   -1.392  -1.853  2.897   -2.647  1.313   -1.262  -0.184  1.512/;
	my @factorall = (\@factor1, \@factor2, \@factor3, \@factor4, \@factor5);
	
	my (%factor);
	for my $i (0 .. @aa20-1) {
		$factor{$aa20[$i]} = $factorall[$facindex-1]->[$i];
	}
	
	my $score;
	my $num_type = scalar (keys %$aa_count);
	my (@auc);
	for my $aa (keys %$aa_count) {
		push @auc, [$factor{$aa}, $aa_count->{$aa}/$num_type];
	}
	@auc = sort {$a->[0] <=> $b->[0]} @auc;
	
	my $area = 0;
	for my $i (1 .. @auc-1) {
		my ($freq1, $freq2) = ($auc[$i-1]->[1], $auc[$i]->[1]);
		my $dist = $auc[$i]->[0]-$auc[$i-1]->[0];
		$area += ($freq1+$freq2)*$dist/2;
	}
	return $area;		
}

#given an alignment sequence containing gaps for a protein, extract indexes that correspond to non-gap characters, and extract amino acid for each index
sub extractNongapFromAlign {
	my ($targetalign) = @_;
	my @targetalign = split (//, $targetalign);
	my (@index, @posaa);
	for my $i (0 .. @targetalign-1) {
		$targetalign[$i] eq '-' and next;	#skip gap character
		$targetalign[$i] eq '.' and next;	#skip gap character
		push @index, $i;
		push @posaa, $targetalign[$i];
	}
	return (\@index, \@posaa);
}

#given an alignment sequence for a protein (may contain breaks) and the real full sequence for the protein, extract indexes that correspond to non-gap characters and extract amino acid for each index
#these are actually common situations that needs extra care: 
#(1) many times PDB files does not contain structures for all amino acids for a given protein, so structural alignments do not contain these amino acids as well.
#(2) many times PSI-BLAST output does not contain all amino acids for a given sequence and misses either head or tail.
#(3) many times multiple alignments for a given protein is available, but we are interested in a variant of the give protein that has several amino acid differences.
#to solve this problem, I think the best way is to generate a Needleman-Wuntch alignment between the two very similar sequences, then analyze the alignment.
sub extractNongapFromSeq {
	my ($orig_seq1, $orig_seq2) = @_;
	my ($seq1, $seq2) = ($orig_seq1, $orig_seq2);
	$verbose and print STDERR "NOTICE: generating alignments between <$seq1> and <$seq2>\n";

	#both orig_seq1 and orig_seq2 are extracted from certain alignments, so they may contain many gap characters
	#we are interested in adjusting the alignments in orig_seq2 (THIS IS IMPORTANT TO USE THIS SUBROUTINE!)
	#process seq1 and seq2 so that they do not contain non-alphabetic characters
	$seq1 =~ s/\s//g;		#discard space and return characters
	$seq2 =~ s/\s//g;
	$seq1 =~ s/\.|\-//g;		#discard gap characters
	$seq2 =~ s/\.|\-//g;
	$seq1 = uc $seq1;		#change to upper case
	$seq2 = uc $seq2;
	$seq1 =~ m/([^A-Z])/ and confess "Error: Non-alphabetic character <$1> found in sequence";
	$seq2 =~ m/([^A-Z])/ and confess "Error: Non-alphabetic character <$1> found in sequence";

	#generate two FASTA file and call the external program 'needle' to align these two sequences
	my $prefix = 'rand' . substr (rand (), 2, 4);
	open (SEQ, ">$prefix.seq1") or confess "Error: cannot write to $prefix.seq1: $!";
	print SEQ ">seq1\n$seq1\n";
	open (SEQ, ">$prefix.seq2") or confess "Error: cannot write to $prefix.seq2: $!";
	print SEQ ">seq2\n$seq2\n";
	close (SEQ);
	system ("needle -asequence $prefix.seq1 -bsequence $prefix.seq2 -aformat markx10 -auto 1 -stdout 1 > $prefix.needle_output") and confess "Error running 'needle' program: $?";

	#analyze the needle ealignment, which has a pseudo-FASTA format (by the '-aformat markx10' option in command line
	(my $FastaStream = $0) =~ s|[^\\\/]+$|FastaStream.pm|;
	eval {require $FastaStream};
	$@ and confess "Error loading the FastaStream module required for this program:\n\tPlease make sure that the file $FastaStream exists!\n";
	my (@alignseq);
	my $alignio = FastaStream->new (-file=>"$prefix.needle_output");
	while (my $alignobj = $alignio->next_seq) {
		$alignobj->{desc} =~ m/^>seq\d/ or next;
		$alignobj->{orig_seq} =~ m/^; sq_len: \d+\n; sq_type: p\n; al_start: (\d+)\n; al_stop: (\d+)\n; al_display_start: (\d+)\n(.+)/s or confess "Error: invalid pattern $alignobj->{orig_seq}";
		my ($al_start, $al_stop, $al_seq) = ($1, $2, $4);
		$al_seq =~ s/\n\n.*//s;		#there are some comment two lines after the alignment for the second protein, so delete the comments
		$al_seq =~ s/\n//g;		#discard any blank or return characters in the alignment
		push @alignseq, $al_seq;
	}
	@alignseq == 2 or confess "Error: needle alignment file $prefix.needle_output contains alignments for more than two sequences";
	length ($alignseq[0]) == length ($alignseq[1]) or confess "Error: length discordance found in alignment file $prefix.needle_output: <$alignseq[0]> and <$alignseq[1]>";
	
	#process alignment and extract indexes for interested positions
	my @align1 = split (//, $alignseq[0]);		#real full sequence
	my @align2 = split (//, $alignseq[1]);		#sequence extracted from multiple alignments and may miss some amino acids
	my @orig_seq2 = split (//, $orig_seq2);		#we are interested in extracting alignment indexes from orig_seq2, which is a line from multiple alignments
	my $pointer = -1;				#pointer points to alignment positions in the orig_alignment
	my @index;					#stores the important indexes in the orig_seq2 alignment (-1 for insertion)
	my @posaa;					#stores the amino acid identity for the corresponding position

	for my $i (0 .. @align1-1) {
		if ($align1[$i] ne '-' and $align2[$i] ne '-') {		#a normal aligned position
			$pointer++;
			$pointer++ while ($orig_seq2[$pointer] eq '-');		#move pointer to a non-gap position in sequence in orig_alignment
			push @index, $pointer;
			push @posaa, $align1[$i];
		} elsif ($align1[$i] ne '-' and $align2[$i] eq '-') {		#break found in sequence in orig_alignment
			push @index, "N/A";
			push @posaa, $align1[$i];
		} elsif ($align1[$i] eq '-' and $align2[$i] ne '-') {		#rare situation (extra amino acid in sequence in orig_alignment): do nothing
			$pointer++;
			$pointer++ while ($orig_seq2[$pointer] eq '-');
		} else {
			confess "FATAL ERROR: both aligned position in $prefix.needle_output is gap";
		}
	}
	@index == length ($seq1) or confess "Error: length discordance between <$seq1> and size of \@index ${\(scalar @index)}";
	$keep_temp or unlink ("$prefix.seq1", "$prefix.seq2", "$prefix.needle_output");
	return (\@index, \@posaa);
}

#normalize scores to be within 0-1 region. when $multiplier=-1, the scores are reversed so higher score indicates more important function
sub normalizeScore {
	my ($score, $multiplier) = @_;
	ref $score eq 'ARRAY' or confess "Error: argument type mismatch for <$score>";
	my ($min_score, $max_score);
	my @norm_score = @$score;
	
	for my $i (0 .. @norm_score-1) {
		$norm_score[$i] eq 'N/A' and next;			#there is no information for this position so the score is marked as 'N/A';
		$norm_score[$i] =~ m/^[\d\.\-\e]+$/ or confess "Error: invalid score found: <$norm_score[$i]>";
		$multiplier and $norm_score[$i] *= $multiplier;
		defined $min_score or $min_score = $norm_score[$i];
		defined $max_score or $max_score = $norm_score[$i];
		$min_score > $norm_score[$i] and $min_score = $norm_score[$i];
		$max_score < $norm_score[$i] and $max_score = $norm_score[$i];
	}

	$verbose and print STDERR "NOTICE: score normalization for $alignfile begins with max_score=$max_score and min_score=$min_score\n";
	if ($max_score == $min_score) {
		print STDERR "WARNING: all positions in $alignfile has the same and non-informative score!!!\n";
		for my $i (0 .. @norm_score-1) {
			if ($norm_score[$i] eq 'N/A') {
				print STDERR "NOTICE: The N/A score is treated as 0 (least important) in normalization process\n";
			}
			$norm_score[$i] = 0;
		}
	} else {
		for my $i (0 .. @norm_score-1) {
			if ($norm_score[$i] eq 'N/A') {
				print STDERR "NOTICE: The N/A score is treated as 0 (least important) in normalization process\n";
				$norm_score[$i] = 0;
			} else {
				$norm_score[$i] = ($norm_score[$i] - $min_score) / ($max_score - $min_score);
			}
		}
	}
	return \@norm_score;
}

#the following subroutine read a HMMER file that contain a single HMMER model, and calculate the log2 entropy for each alignment position
#the entropy calculation does not consider any gaps (i.e., only consider the emission probability to 20 amino acids)
sub scoreHmmentropy {
	my ($alignfile) = @_;
	$verbose and print STDERR "NOTICE: converting alignment file $alignfile to FASTA format $alignfile.fasta for use by hmmbuild\n";
	system ("convert_align_from_klist.pl -f fasta $alignfile > $alignfile.fasta") and confess "Error running convert_align";
	$verbose and print STDERR "NOTICE: building HMMER model $alignfile.hmmer from the FASTA alignment file $alignfile.fasta\n";
	system ("hmmbuild -F $alignfile.hmmer $alignfile.fasta >& /dev/null") and confess "Error running hmmer";

	my ($hmmerfile) = "$alignfile.hmmer";
	my (@null, @aa, @entropy, @allprob);					#store null score, amini acid, entropy score, probability of 20 amino acids (joined by ',')

	#from karlin.c
	my %background_frequency = (A=>0.0780474700897585, R=>0.0512953149316987, N=>0.0448725775979007, D=>0.0536397361638076, C=>0.0192460110427568, Q=>0.0426436013507063, E=>0.0629485981204668, G=>0.0737715654561964, H=>0.0219922696262025, I=>0.0514196403000682, L=>0.090191394464413, K=>0.0574383201866657, M=>0.0224251883196316, F=>0.0385564048655621, P=>0.0520279465667327, S=>0.0711984743501224, T=>0.0584129422708473, W=>0.013298374223799, Y=>0.0321647488738564, V=>0.0644094211988074);

	open (HMMER, $hmmerfile) or confess "Error: cannot open hmmer file $hmmerfile: $!";
	$_ = <HMMER>;
	$_ =~ /^HMMER2/ or confess "Error: invalid file format for HMMER in $hmmerfile: <$_>";
	while (<HMMER>) {
		if (/^NULE/) {
			@null = split (/\s+/, $_);
			shift @null;
			@null == 20 or confess "FATAL ERROR: wrong number of NULL probabilities in $hmmerfile: ${\(scalar @null)}\n";
		} elsif (/^HMM/) {
			@aa = split (/\s+/, $_);
			shift @aa;
			@aa == 20 or confess "FATAL ERROR: wrong number of amino acids in $hmmerfile: ${\(scalar @aa)}\n";
		} elsif (/^\s+\d+\s+/) {
			my ($total_prob, $total_entropy, @score, @prob) = (0, 0);
			@score = split (/\s+/, $_);
			shift @score;					#discard the first undef
			my $pos_hmm = shift @score;			#this is the HMM model position, not the alignment column position
			my $pos_align = pop @score;			#this is the alignment column
			@score == 20 or confess "FATAL ERROR: wrong number of amino acids in $hmmerfile: ${\(scalar @score)}\n";
			for my $aa_index (0 .. 19) {
				$prob[$aa_index] = (1/20*2**($null[$aa_index]/1000)) * 2**($score[$aa_index]/1000);
				$total_prob += $prob[$aa_index];
				if ($relative_entropy) {
					$total_entropy += $prob[$aa_index] * log ($prob[$aa_index] / $background_frequency{$aa[$aa_index]});
				} else {
					$total_entropy += -$prob[$aa_index]*log($prob[$aa_index])/log(2);
				}
			}
			abs ($total_prob - 1) >= 0.01 and confess "FATAL ERROR: HMMER file $hmmerfile cannot be parsed correctly: HMM model position $pos_hmm has total probability of $total_prob";
			$entropy[$pos_align-1] = $total_entropy;		#it is only an approximation since it is not normalized by total_prob!!!
			$allprob[$pos_align-1] = join (',', @prob);
		}
	}
	
	#the HMM model does not contain all columns of the alignment, so we have to add zero scores to these alignments
	open (ALIGN, $alignfile) or confess "ERROR: cannot read from alignment file $alignfile: $!";
	$_ = <ALIGN>;
	m/^[^\t]+\t([^\t]+)\n/ or confess "ERROR: wrong format for klist alignment file: <$_>";
	my $total_column = length ($1);
	
	for my $i (0 .. $total_column-1) {
		if ($relative_entropy) {
			defined $entropy[$i] or $entropy[$i] = 0;
		} else {
			defined $entropy[$i] or $entropy[$i] = log (20) / log (2);
		}
		defined $allprob[$i] or $allprob[$i] = join (',', 0.0780474700897585, 0.0192460110427568, 0.0536397361638076, 0.0629485981204668, 0.0385564048655621, 0.0737715654561964, 0.0219922696262025, 0.0514196403000682, 0.0574383201866657, 0.090191394464413,  0.0224251883196316, 0.0448725775979007, 0.0520279465667327, 0.0426436013507063, 0.0512953149316987, 0.0711984743501224, 0.0584129422708473, 0.0644094211988074, 0.013298374223799, 0.0321647488738564);
	}
	return (\@entropy, \@allprob);
}


=head1 SYNOPSIS

 scoreaa_from_align.pl [arguments] <klist-alignment-file>

 Optional arguments:
 	-v, --verbose			use verbose output
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-o, --outputfile <string>	outputfile name
 	-k, --keep_temp			keep temporary file for debugging purposes
	-n, --normalize			normalize score to be within 0-1 range
	-d, --detail			output detailed information after scores column
	    --(no)exclude_gap		exclude gap in aligned position when scoring
	    --method <string>		scoring method: 'entropy' (default) or 'ssr' or 'scorecons' or 'hmmentropy'
	    --seqfile <string>		calculate score for this sequence whose variant is in alignment 
	    --seqid <string>		calculate score for this sequence ID in alignment
	    --relative_entropy		calculate relative entropy using background frequency
	    --background_frequency_file <string>	a file containing residue background frequencies
	

Function: given a multiple alignment in klist format, score each 
position of the alignment (or a specified seuqence in the alignment) 
using the specified scoring method. The first and second 
tab-delimited columns of output is the amino acid position and its 
corresponding importance score.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--outputfile>

specify the name for the output file. By default the name will be the 
alignment file name appended by ssr or entropy or hmmentropy or scorecons

=item B<--keep_temp>

do not delete temporary files used during the program running. This 
option is useful for debugging purposes.

=item B<--normalize>

normalize scores to be within 0-1 range, with higher score indicating 
more important positions. Please notice that for different scoring 
method there might be different normalizing algorithms. Check the 
manual for more information.

=item B<--detail>

this option output additional details after the position 
column (first column) and scoring column (second column) in the output 
file. These details may include information about the number of amino acid 
types in a particulra column, etc.

=item B<--exclude_gap>

by default gap is treated as one amino acid type. Use of this option 
exclude gap from score calculation. For example, for a column with 
half Alanine and half gaps, using this option may treat the column as 
an extremely conservative one.

=item B<--method>

scoring method can be 'ssr' or 'scorecons' or 'entropy' (in log2 base) or 
'hmmentropy' (in log2 base). See manual for more details.

=item B<--seqfile>

sequence file in FASTA format. This sequence should also appear in the 
alignment file (though the actual sequence may have very slight 
differences). When this option is set, this program will output the 
scores only for those positions in the protein in this sequence file 
rather than all alignment positions. In addition, the actual amino 
acid identity will be printed in the first column of the outputfile in 
a format like "50_A" and "88_C". This program requires the external 
program 'needle' from the EMBOSS program package to perform 
Needleman-Wunsch pairwise sequence alignments.

=item B<--seqid>

sequence ID in the alignment file. When this option is set, this 
pogram will output the scores only for those positions in this 
sequence, rather than all alignment positions. In addition, the actual 
amino acid identity will be printed in the first column of the 
outputfile in a format like "50_A" and "88_C".

=item B<--relative_entropy>

calcualte the relative entropy that incorporates amino acid background 
frequencies in protein databases. I found that relative entropy is 
better than entropy in predicting functional sites.

=item B<--background_frequency_file>

a file containing background frequencies of amino acid residues that is used in 
relative entropy calculation. It contains two tab-delimited columns per line, 
with the amino acid and its corresponding frequencies. When this option is 
omitted, the default frequency from karlin.c of the BLAST program is used.

=back

=head1 DESCRIPTION

This program is used to score individual aligned position in given sequence or 
structure multiple alignments (in klist format) using various methods. The klist 
format has one sequence per line, and each line is composed of an ID and an 
alignment separated by tab. Gaps are usually represented by '-' in the 
alignment. When a target seqence is given by the --seqid argument or --seqfile 
argument, this program can also score each position in the target sequence (in 
other word, skip all positions that are gaps in the specified target sequence).

This program currently handles two scoring method: the SSR method, the SCORECONS 
method, the entropy method and the hmmentropy method. They are briefly described 
below:

The SSR (state-to-step ratio) method is proposed by Kai Wang and David Nickle in 
December 2004 but never published or documented in any written form due to 
laziness. The underlying hypothesis for the method is that functionally 
important position tend to have the same evolutionary pressure as the whole 
protein sequence, so their evolutionary patterns should approximate those of the 
whole sequence. Based on this hypothesis, a score called SSR can be calculated 
to measure the correlation of evolution of individual amino acid position with 
that of whole sequence evolution using an evolutionary tree. The method is 
composed of the following steps: (1) For a given protein, first we identify its 
homologous proteins from a protein sequence database via the SAM-T2K method (or 
other similar method), and generate multiple alignments for them. (2) Using the 
multiple alignments, next we generate a phylogenetic tree for these sequences 
using the PROTPARS algorithm (a fast yet accurate tree-building algorithm based 
on maximum parsimony) from the PHYLIP program suite. (3) We then analyze the 
best tree generated, and calculate the state-to-step (SSR=state/(step+1)) ratio 
for each position of the target sequence. (4) Optionally we can do other tricks 
with the SSR score; for example, if the protein has known structure, we can try 
to identify the secondary structure region that has the highest average SSR 
scores.

The SCORECONS method implements several variants, including the method proposed 
by Valdar and Thornton (Proteins 2001;42:108-124). There are arguments in the 
scorecons program that control the exact algorithm used. This method is not 
particularly developed for scoring residue conservation and is not recommended.

The entropy method is very commonly used and its variants have appeared in 
numerous publications. The general hypothesis is that the more divergent an 
amino acid position among multiple alignments, the less functionally important 
this position is. Entropy can be used to measure the divergence of a given 
aligned positions in multiple alignments. It is calculated as SUM(p * logP/log2) 
for each column.

The hmmentropy method is one of the variants of the default entropy method. It 
calculate the actual "expected" amino acid frequencies in each aligned position 
through the use of a HMM model (which means that sequence weighting are 
automatically accounted for, and unobserved amino acids can be assigned 
frequencies). The regular entropy score can then be calculated based on HMM-
estimated frequencies.

The entropy method has lots of other variants, and I will try to implement some 
of them in the future. In addition, there are lots of ways to apply weights to 
different sequences in the alignments, and I will explore ways to add a --weight 
option to the program.

In addition to entropy calculatoin by either the entropy or the hmmentropy 
method, the --relative_entropy argument can be used to calculate relative 
entropy to score sequence conservation in each aligned position. It has been 
shown that relative entropy better predict functional sites (presumably more 
conserved sites) than the default entropy method, by accounting for different 
background frequencies of amino acids in nature. By default, the relative 
entropy use the background frequencies given in the BLAST program, but users can 
supply a custome background frequency file using the --background_frequency_file 
argument, too.

In most cases, users are interested in conservation patterns of all amino acid 
in a given query protein, rather than the entire alignments that contain this 
protein (in cases that this protein contains gaps in the alignments). For 
example, for a query protein with 100 amino acids, one can probably generates 
multiple alignments that contain 150 aligned columsn, but usually we are only 
interested in the 100 aligned columns that corresponds to the 100 amino acids in 
the query protein. Users can use the --seqid argument to specify the particular 
sequence that they are interested in (the seqid is the first tab-delimited 
column in each line in the klist alignment file). In addition, users can supply 
a FASTA-formatted sequence file via the --seqfile argument, so that this program 
can automatically extract the interested aligned positions from the alignment 
file. There are many cases where the --seqfile argument is useful: for example, 
users can generate a multiple sequence alignments for a protein, whose 
structures contains chain breaks. To map the conservation score of this protein 
sequence to protein structure, one can just supply a FASTA file that contains 
amino acid sequence from the ATOM records of the PDB file.

=cut                                                                      
