#!/usr/bin/perl -w
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

# Command-line state: option flags plus the single positional input file.
my ($verbose, $help, $man, $format, $inputfile);

# Alignment store filled by the read* subroutines of this file:
#   @align  sequence names in the order they were first read
#   %align  sequence name => aligned sequence string
our (@align, %align);

GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'format=s'=>\$format) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");

($inputfile) = @ARGV;

# One table drives both validation and dispatch, so the list of accepted
# formats and the error message can never disagree.
my %reader_for = (
	fasta      => \&readFasta,
	clustalw   => \&readClustalw,
	blastpgpm6 => \&readBlastpgp,
	psiblastm6 => \&readBlastpgp,		#alias of blastpgpm6
);

$format ||= 'fasta';
my $reader = $reader_for{$format}
	or pod2usage ("Error in parameter: `format' option can only be one of: " . join (', ', sort keys %reader_for));
$reader->($inputfile);

# Print every entry as "name<TAB>sequence" and insist that all sequences have
# the length of the first one. The reference length starts out undefined (not
# 0) so that an empty first sequence is still compared against the others.
my $len_align;
foreach my $name (@align) {
	print $name, "\t", $align{$name}, "\n";
	my $len = length ($align{$name});
	defined $len_align or $len_align = $len;
	$len_align == $len or confess "Error: different length of alignment: sequence $name is <$align{$name}> (length $len is not $len_align)";
}



# Read a CLUSTAL W alignment file into the global alignment store.
#
# Arguments:
#   $inputfile  path of the alignment file; its first line must start with "CLUSTAL"
#
# Side effects:
#   every sequence name is appended to @align the first time it is seen, and
#   the fragments found for that name in successive blocks are concatenated
#   into $align{name}.
#
# Dies (confess) when the file cannot be opened, is empty, does not start with
# a CLUSTAL header, or contains a non-blank line that is neither a
# conservation line nor "name fragment [residue-count]".
sub readClustalw {
	my ($inputfile) = @_;

	# three-argument open on a lexical handle: the file name is never
	# interpreted as a mode or a pipe, and the handle is private to this sub
	open (my $fh, '<', $inputfile) or confess "Error: cannot open input file $inputfile: $!";

	my $header = <$fh>;
	defined $header or confess "Error: input file $inputfile is empty";
	$header =~ /^CLUSTAL/ or confess "Error: file header does not seems like a clustalw file: <$header>";

	while (my $line = <$fh>) {
		$line =~ s/\r?\n//;
		$line =~ /\S/ or next;			#blank line between blocks
		$line =~ /^\s+/ and next;		#line starting with white space (conservation marks)

		# name, fragment, then optionally a cumulative residue count and trailing blanks
		if ($line =~ /^(\S+)\s+(\S+)(?:\s+\d+)?\s*$/) {
			my ($name, $fragment) = ($1, $2);
			exists $align{$name} or push @align, $name;
			$align{$name} .= $fragment;
		} else {
			confess "Weird records found in $inputfile:<$line>";
		}
	}
	close ($fh);
	return;
}

# Read the multiple alignment section of a blastpgp '-m 6' report into the
# global alignment store.
#
# Arguments:
#   $inputfile  path of the blastpgp output file
#
# Side effects:
#   'QUERY' and then every other row identifier are appended to @align in the
#   order they are first seen; the fragments of each identifier are
#   concatenated block by block into $align{identifier}.
#
# Dies (confess) when the file cannot be opened or when a non-blank line inside
# the alignment section does not look like "id [start] fragment [end]".
sub readBlastpgp {
	my ($inputfile) = @_;
	my ($max_round, %align_block_id);

	# First pass: remember the number of the last "Results from round N" header.
	# Three-argument open on a lexical handle keeps the file name from being
	# interpreted as a mode or a pipe.
	open (my $fh, '<', $inputfile) or confess "Error: cannot open inputfile $inputfile: $!";
	while (my $line = <$fh>) {
		$line =~ /^Results from round (\d+)\s+/ and $max_round = $1;
	}
	close ($fh);

	# Second pass: parse the file from the beginning again.
	open ($fh, '<', $inputfile) or confess "Error: cannot open inputfile $inputfile: $!";

	#when the -j option is used in blastpgp, multiple round of search is performed so we have to use the results from the last round.
	if ($max_round) {
		while (my $line = <$fh>) {
			$line =~ /^Results from round $max_round\s+/ and last;			#reach section for a particular round
		}
	}

	# Skip to the hit table: find its header line, then the first blank line after it.
	MAINLOOP: while (my $line = <$fh>) {
		if ($line =~ /^Sequences producing significant alignments/) {
			while (my $next_line = <$fh>) {
				$next_line =~ /^\s*$/ and last MAINLOOP;
			}
		}
	}

	# Skip the hit table itself; it ends at the next blank line.
	while (my $line = <$fh>) {
		$line =~ /^\s*$/ and last;
	}

	# The first QUERY row opens the multiple alignment section.
	while (my $line = <$fh>) {
		if ($line =~ /^(QUERY)\s+(?:\d+\s+)?(\S+)(?:\s+\d+)?\s*$/) {		#reach section for multiple alignments
			push @align, $1;
			$align{$1} = $2;
			%align_block_id = ($1=>1);
			last;
		}
	}

	# Remaining rows: every QUERY row starts a new block; within a block only
	# the first row of each identifier is used.
	while (my $line = <$fh>) {
		$line =~ /\S/ or next;
		$line =~ /^Searching/ and last;
		$line =~ /^\s+Database:/ and last;
		if ($line =~ /^(\S+)\s+(?:\d+\s+)?(\S+)(?:\s+\d+)?\s*$/) {
			my ($id, $fragment) = ($1, $2);
			if ($id eq 'QUERY') {
				%align_block_id = ("QUERY"=>1);
				$align{$id} .= $fragment;
			} else {
				$align_block_id{$id} and next;				#only consider the first instance of an ID in a block
				$align_block_id{$id} = 1;
				exists $align{$id} or push @align, $id;
				$align{$id} .= $fragment;
			}
		} else {
			confess "Error: invalid record in $inputfile: <$line>";
		}
	}
	close ($fh);
	return;
}
				

# Read a FASTA alignment file into the global alignment store, using the
# FastaStream.pm module that is expected next to this program.
#
# Arguments:
#   $inputfile  path of the FASTA file
#
# Side effects:
#   for every record the first word after '>' in the description is appended
#   to @align and the record's sequence is stored in $align{name}.
#
# Dies (confess) when FastaStream.pm cannot be loaded or when a record
# description does not start with '>' followed by a name.
sub readFasta {
	my ($inputfile) = @_;

	# Replace the program's file name in $0 by FastaStream.pm, then make the
	# path absolute: a relative path handed to require would be searched for
	# in @INC, which no longer contains the current directory since perl 5.26.
	require File::Spec;
	(my $FastaStream = $0) =~ s|[^\/\\]+$|FastaStream.pm|;
	$FastaStream = File::Spec->rel2abs ($FastaStream);
	eval {require $FastaStream};
	$@ and confess "Error loading FastaStream.pm: please make sure that FastaStream.pm file is in the same directory as the current program ($@)";

	# NOTE(review): FastaStream is not visible here; this assumes next_seq returns
	# hash-based records with 'desc' and 'seq' entries and a false value at the end - confirm
	my $seqio = FastaStream->new (-file=>$inputfile);
	while (my $seqobj = $seqio->next_seq) {
		$seqobj->{desc} =~ m/^>(\S+)/ or confess "Error: invalid record in fasta file $inputfile: <$seqobj->{desc}>";
		my $name = $1;
		$align{$name} = $seqobj->{seq};
		push @align, $name;
	}
}

=head1 SYNOPSIS

convert_align_to_klist.pl [arguments] <inputfile>

 Optional arguments:
 	-v, --verbose			use verbose output
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-f, --format			input file format can be 'clustalw' or 'fasta' or 'blastpgpm6' (same as 'psiblastm6')

 Function: convert format of alignment file into klist format

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--format>

currently this program can handle input file format as 'clustalw' or 
'fasta' (default) or 'blastpgpm6' (same as 'psiblastm6'). The 
'blastpgpm6' is the format generated by the blastpgp program in BLAST 
program suite using the '-m 6' option.

=back

=head1 DESCRIPTION

This program is used to convert alignment file formats. It also checks 
whether all alignment entries have the same length.

=cut
