#!/usr/bin/perl -w
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

my ($verbose, $help, $man, $format, $change, $operation, $nice, $upper, $inputfile);
our (@alignid, %align);

GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'format=s'=>\$format, 'change=s'=>\$change, 'operation=s'=>\$operation, 'nice'=>\$nice, 'upper'=>\$upper) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");

($inputfile) = @ARGV;
defined $format or pod2usage ("Error in parameter: you should specify `format' parameter");
$format eq 'klist' or $format eq 'phylip' or $format eq 'clustalw' or $format eq 'fasta' or pod2usage ("Error in parameter: `format' parameter should be only klist, phylip or clustalw");
defined $change or $change = '';
defined $operation or $operation = '';
$operation eq '' or $operation eq 'compress' or pod2usage ("Error in parameter: `operation' can only be 'compress'");

readKlist ($inputfile);

if ($change) {
	my $command;
	for (@alignid) {
		$command = $change;
		$command =~ s/\$\$/\$align{"$_"}/g;
		eval $command;
		$@ and confess "Error executing command `$command'\n";
	}
	
	#if output format is klist, then we should not change the alignment length after executing the --change command
	if ($format eq 'klist') {
		my $len_align;
		for (@alignid) {
			$len_align ||= length ($align{$_});
			$len_align == length ($align{$_}) or confess "Error: after execution of <$command>, length of alignments are no longer the same: <$align{$_}> should be $len_align";
		}
	}
}

if ($operation eq 'compress') {
	compressAlign ();
}

if ($format eq 'klist') {
	foreach (@alignid) {
		print $_, "\t", $align{$_}, "\n";
	}
} elsif ($format eq 'phylip') {
	print @alignid+0, "\t", length ($align{$alignid[0]}), "\n";
	foreach (@alignid) {
		#PHYLIP does not allow any non-standard amino acid in the alignment (but other databases, such as UniRef, does contain some), so we convert them to X!
		$align{$_} =~ s/[BJOUXZ]/X/gi;
		
		if ($nice) {
			my $nicename = $_;
			$nicename =~ s/_//g;
			$nicename =~ s/^(\w+).*/$1/;
			print substr ($nicename.(' 'x10), 0, 10), $align{$_}, "\n";
		} else {
			print substr ($_.(' 'x10), 0, 10), $align{$_}, "\n";
		}
	}
} elsif ($format eq 'clustalw') {
	print "CLUSTAL W generated by third party program\n\n\n";
	my $max_len_name = 0;
	for (@alignid) {
		$max_len_name < length ($_) and $max_len_name = length ($_) ;
	}
	for (@alignid) {
		print substr ("$_".(' 'x$max_len_name), 0, $max_len_name+1), $align{$_}, "\n";
	}
} elsif ($format eq 'fasta') {
	for (@alignid) {
		print ">$_\n$align{$_}\n";
	}
}




sub readKlist {
	my ($inputfile) = @_;
	my $len_align;
	open (INPUT, $inputfile) or confess "Error: cannot open input file $inputfile: $!";
	while (<INPUT>) {
		/^([^\t]+)\t([^\t]+)\r?\n$/ or confess "Error: invalid record in klist file: <$_>";
		$align{$1} = $upper?(uc $2):$2;
		push @alignid, $1;
		$len_align ||= length ($2);
		$len_align == length ($2) or confess "Error: length of <$2> should be $len_align";
	}
}

sub compressAlign {
	my (@alignaa, %discard);
	foreach (@alignid) {
		push @alignaa, [split (//, $align{$_})];
	}
	for my $i (0 .. @{$alignaa[0]}-1) {
		my $keep = 0;
		for my $j (0 .. @alignaa-1) {
			$alignaa[$j]->[$i] =~ /^\w$/ and $keep = 1 and last;
		}
		$keep or $discard{$i}++;
	}
	for my $j (0 .. @alignaa-1) {
		my $seq;
		for my $i (0 .. @{$alignaa[0]}-1) {
			$discard{$i} or $seq .= $alignaa[$j]->[$i];
		}
		$align{$alignid[$j]} = $seq;
	}
	my $len_align;
	foreach (@alignid) {
		$len_align ||= length ($align{$_});
		$len_align == length ($align{$_}) or confess "Error: different length of alignment: length of <$align{$_}> should be $len_align";
	}
}
			








=head1 SYNOPSIS

convert_align_to_klist.pl [arguments] <inputfile>

 Optional arguments:
 	-v, --verbose			verbose output level 0(default), 1, 2 or 3
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-f, --format			our file format can be 'clustalw', 'fasta' or `klist'
 	-c, --change			change the alignment according to this Perl command
 	-o, --operation			operation can be "compress"
 	-n, --nice			use nice output for sequence name in PHYLIP output
 	-u, --upper			residues will be converted to upper case letters

 Function: convert format of alignment file into klist format, or perform operations on alignments

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--format>

currently this program can handle input file format as 'clustalw' or 'fasta' (which contain gaps in alignment)

=item B<--change>

a function being devloped and highly discouraged. use $$ to substitute each 
aligned sequence and use a perl command to change the alignment. (example: 
$$=~s/\-//g can be used to eliminate gaps in the alignment

=item B<--operation>

operation can be "compress" and other being developed

=item B<--nice>

use nice output for sequence name in PHYLIP output. (some software do not recognize underline(_) or slash(/) in sequence name, so use this option to get rid of them)

=back

=head1 DESCRIPTION

This program is used to convert alignment file from klist format to other format 
or even change the alignments themselves.

=cut
