#!/usr/bin/perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;

my ($verbose, $help, $man);
my ($record, $chain, $name, $length);
GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'record=s'=>\$record, 'chain=s'=>\$chain, 'name=s'=>\$name, 'length=i'=>\$length) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");

my ($inputfile) = @ARGV;
$verbose and not $record and print STDERR "NOTICE: using the ATOM records in PDB file <$inputfile> to generate sequence\n";
$record ||= 'ATOM';
$record = uc $record;
$record eq 'ATOM' or $record eq 'SEQRES' or pod2usage ("Error in parameter: record can be set as `atom' or `seqres' only");

my %aa_1letter_3letter = (A=>'ALA', R=>'ARG', N=>'ASN', D=>'ASP', C=>'CYS', Q=>'GLN', E=>'GLU', G=>'GLY', H=>'HIS', I=>'ILE', L=>'LEU', K=>'LYS', M=>'MET', F=>'PHE', P=>'PRO', S=>'SER', T=>'THR', W=>'TRP', Y=>'TYR', V=>'VAL');
my %aa_3letter_1letter = (
	    ALA => "A", CYS => "C", ASP => "D", GLU => "E", PHE => "F", GLY => "G",
	    HIS => "H", ILE => "I", LYS => "K", LEU => "L", MET => "M", ASN => "N",
	    PRO => "P", GLN => "Q", ARG => "R", SER => "S", THR => "T", VAL => "V", 
	    TRP => "W", TYR => "Y");

#the non-standard amino acid alphabet will be updated constantly
my %nsaa_3letter_1letter = (
	    AAR => "R", ACY => "G", AEI => "T", AGM => "R", ASQ => "D", ASX => "N", 
	    AYA => "A", BHD => "D", CAS => "C", CAY => "C", CEA => "C", CGU => "E",
	    CME => "C", CMT => "C", CSB => "C", CSD => "C", CSE => "C", CSO => "C",
	    CSP => "C", CSS => "C", CSW => "C", CSX => "C", CXM => "M", CYG => "C",
	    CYM => "C", DOH => "D", EHP => "F", FME => "M", FTR => "W", GL3 => "G", GLX => "Q",
	    H2P => "H", HIC => "H", HIP => "H", HTR => "W", HYP => "P", KCX => "K",
	    LLP => "K", LLY => "K", LYZ => "K", M3L => "K", MEN => "N", MGN => "Q",
	    MHO => "M", MHS => "H", MIS => "S", MLY => "K", MLZ => "K",
	    NEP => "H", NPH => "C", OCS => "C", OCY => "C", OMT => "M", OPR => "R",
	    PAQ => "Y", PCA => "Q", PHD => "D", PRS => "P", PTH => "Y", PYX => "C",
	    SEP => "S", SMC => "C", SME => "M", SNC => "C", SNN => "D", SVA => "S",
	    TPO => "T", TPQ => "Y", TRF => "W", TRN => "W", TRO => "W", TYI => "Y",
	    TYN => "Y", TYQ => "Y", TYS => "Y", TYY => "Y", YOF => "Y", FOR => "X",
	    "5HP"=>"Q", UNK => "X",
	    FGL => "C",		#example PDB: 1auk                                                 
	    PVL => "S", 	#example PDB: 1aw8                                                 
	    SEC => "C", 	#example PDB: 1gp1                                                 
	    PTR => "T", 	#example PDB: 1ir3, 1k4t (PHOSPHOTYROSINE, O-PHOSPHOTYROSINE)      
	    SCY => "C", 	#example PDB: 1qfl (S-ACETYL-CYSTEINE)                             
	    IAS => "D", 	#example PDB: 1rtu (ASP  BETA CARBOXY LINKED TO NEXT RESIDUE)
	    MSE => "M",		#example PDB: 2hdh (SELENOMETHIONINE)
	    ACE => "X", 	#example PDB: 1pad                                                 
	    RHA => "X", 	#example PDB: 1tlp (ALPHA-L-RHAMNOPYRANOSYLOXYHYDROXYPHOSPHINYL)   
	    PVL => "X", 	#example PDB: 3jen (PYRUVOYL GROUP)                                
	    ALM => "X", 	#example PDB: 1pad (ALANINE,METHYLENE C BOUND TO CARBOXY C)        
	    );

my (%chain_seq);					#this hash stores the amino acid sequence of the PDB file for each chain
my (%bad_chain);					#this hash stores the chains that contain non-standard amino acids

#try to get the description line from the PDB file, if the PDB file is retrieved from the ASTRAL database
open (PDB, $inputfile) or confess "Error: cannot open input PDB file $inputfile: $!";
while (<PDB>) {
	/^REMARK  99 ASTRAL SCOP-sid: (\S+)/ and $name = ">$1 ";
	/^REMARK  99 ASTRAL SCOP-sccs: (\S+)/ and $name .= $1 and last;
}

#try to get a name for the sequence, if not specified in the command line
if (not $name) {
	($name = $inputfile) =~ s|.+/||;
	$name =~ s|\..*||;
}

if ($record eq 'ATOM') {
	my %res_id;									#key is the residue ID, value is 1

	open (PDB, $inputfile) or confess "Error: cannot open input PDB file $inputfile: $!";
	while (<PDB>) {
		chomp;
		/^MODEL/ and %chain_seq and last;					#encounter the next model, but we only need the first one
		/^ENDMDL/ and %chain_seq and last;					#encounter end of model (should be first model)
		/^ATOM  / or next;
		my ($chainid, $res_id, $res_name) = (substr ($_, 21, 1), substr ($_, 21, 6), substr ($_, 17, 3));	#res_id is the CHAINID, RESIDUE_NUMBER plus INSERTION_CODE

		if (defined $chain) {
			if ($chain eq 'first') {
				$chain = $chainid;					#this is the first encountered chain
			} else {
				$chainid eq $chain or next;				#a different chain than specified
			}
		}
		
		$res_id{$res_id} and next;						#this residue has been processed
		$res_id{$res_id} = 1;							#mark this residue as being processed

		#sometimes non-standard amino acid occur in PDB file as 3-letter residue name
		if ($aa_3letter_1letter{$res_name}) {
			$chain_seq{$chainid} .= $aa_3letter_1letter{$res_name};	#store the sequence for the corresponding chain
		} elsif ($nsaa_3letter_1letter{substr ($_, 17, 3)}) {
			$chain_seq{$chainid} .= $nsaa_3letter_1letter{$res_name};	#store the sequence for the corresponding chain
			print STDERR "WARNING: non-standard amino acid <$res_name> found in chain <$chainid> in $inputfile and was translated to <$nsaa_3letter_1letter{$res_name}>\n";
			$bad_chain{$chainid} = 1;
		} else {
			$chain_seq{$chainid} .= 'X';
			$bad_chain{$chainid} or print STDERR "WARNING: marking chain <$chainid> as potentially non-amino acid chain due to unrecognizable amino acid <$res_name>\n";
			$bad_chain{$chainid} = 1;
		}
	}
} elsif ($record eq 'SEQRES') {
	open (PDB, $inputfile) or confess "Error: cannot open input PDB file $inputfile: $!";
	while (<PDB>) {
		chomp;
		/^SEQRES/ or next;
		my ($chainid) = substr ($_, 11, 1);					#the 12th letter is the chain identifier in SEQRES record

		if (defined $chain) {
			if ($chain eq 'first') {
				$chain = $chainid;					#this is the first encountered chain
			} else {
				$chainid eq $chain or next;				#a different chain than specified
			}
		}
		
		#sometimes non-standard amino acid occur in PDB file as 3-letter residue name
		my $res_line = substr ($_, 19, 51);					#the residue part of this line
		$res_line =~ s/^\s+|\s+$//g;						#sometimes the start of the line is "  A", or the end of the line is before position 51
		my @res = split (/\s+/, $res_line);
		foreach my $res_name (@res) {
			if ($aa_3letter_1letter{$res_name}) {
				$chain_seq{$chainid} .= $aa_3letter_1letter{$res_name};
			} elsif ($nsaa_3letter_1letter{$res_name}) {
				$chain_seq{$chainid} .= $nsaa_3letter_1letter{$res_name};
				print STDERR "WARNING: non-standard amino acid <$res_name> found in chain <$chainid> in $inputfile and was translated to <$nsaa_3letter_1letter{$res_name}>\n";
				$bad_chain{$chainid} = 1;
			} else {
				$chain_seq{$chainid} .= 'X';
				$bad_chain{$chainid} or print STDERR "WARNING: marking chain <$chainid> as potentially non-amino acid chain due to unrecognizable amino acid <$res_name>\n";
				$bad_chain{$chainid} = 1;
			}
		}
	}
}

#delete those chains that are not composed of amino acids (as judged by the frequency of "X" higher than 10%)
for my $chainid (sort keys %bad_chain) {
	my $seq = $chain_seq{$chainid};
	my $num_x = ($seq =~ tr/X/X/);
	if ($num_x / length ($seq) >= 0.1) {
		print STDERR "WARNING: skipping chain <$chainid> in $inputfile since it may not be composed of amino acids\n";
		delete $chain_seq{$chainid};
	}
}

#print out the sequence for each chain
if (defined $chain) {
	not $chain_seq{$chain} and confess "FATAL ERROR: unable to write sequence for chain <$chain> in PDB file <$inputfile>";
	$length and length ($chain_seq{$chain}) < $length and print STDERR "WARNING: sequence for chain <$chain> in $inputfile is shorter than specified minimum length <$length>\n" and exit;
	outputChain ($name, $chain_seq{$chain});
} else {
	$verbose and print STDERR "NOTICE: ${\(scalar keys %chain_seq)} chains found in PDB file: <${\(join (' and ', keys %chain_seq))}>\n";
	for my $chainid (sort keys %chain_seq) {
		$length and length ($chain_seq{$chain}) < $length and print STDERR "WARNING: sequence for chain <$chain> in $inputfile is shorter than specified minimum length <$length>\n" and next;
		outputChain ($name . ($chainid || '_'), $chain_seq{$chainid});
	}
}

#print out a single chain
sub outputChain {
	my ($name, $seq) = @_;
	
	$seq =~ s/(.{80})/$1\n/g;
	$seq =~ s/\s+$//;		#get rid of the trailing "\n" if there is any

	print ">$name\n$seq\n";
}

=head1 SYNOPSIS

 pdbseq.pl [arguments] <inputfile>

 Optional arguments:
 	-v, --verbose			verbose output level 0(default), 1, 2 or 3
 	-h, --help			print help message
 	-m, --man			print complete documentation
 	-r, --record <string>		use either "ATOM" (default) or "SEQRES" record
 	-c, --chain <letter>		retrieve sequence for the specified PDB chain
 	-n, --name <string>		name of the sequence (after > in FASTA file)
 	-l, --length <int>		minimum length of sequence in a chain

Function: extract the SEQRES- or ATOM-based protein sequence of 
specified chain from the PDB file as a FASTA-formatted file.

=head1 OPTIONS

=over 8

=item B<--help>

print a brief help message and exit

=item B<--man>

print the complete manual of how to use the program

=item B<--verbose>

use verbose output

=item B<--record>

use either "ATOM" (default) or "SEQRES" record in the PDB file to 
generate the sequence

=item B<--chain>

specify the chain for which the sequence is to be retrieved. A special chain 
identifier 'first' can be used to specify the first encountered chain in the PDB 
file.

=item B<--name>

the name of the output sequence file, i.e., the word after the ">" 
character in the description line of the FASTA file. When multiple 
chains exist in the PDB file, the name of each sequence will be 
appended by the chain ID.

=item B<--length>

the minimum length of a sequence in a given PDB file to be written in output 
file. Some PDB files (such as 12as) contains extremely short chains so you may 
want to exclude them in the output.

=back

=head1 DESCRIPTION

This program is used to extract amino acid sequences from a given PDB file. The 
sequence information can retrieved from either the SEQRES or the ATOM records of 
the PDB file. Compared to the SEQRES record, the ATOM record may contain some 
chain breaks. Some PDB files, however, do not have a SEQRES record so we have to 
use the ATOM record.

You can specify a name for the sequence, or the name will be automatically 
assigned based on the inputfile name. For example, if the inputfile is 
"/usr/data/sample.pdb", then the name of the sequence will be assigned as 
"sample" automatically, and the first line of the output will be ">sample". 
However, if there are multiple chains in the PDB file, then there will be 
multiple sequences in the output, with names like ">sampleA" and ">sampleB". 
When chain ID is blank character, the name will be ">sample_".

Both the ATOM and the SEQRES records may contain non-standard amino acids (for 
example, amino acids that have been chemically modified or phosphorylated). 
Usually these non-standard amino acids are still annotated in the PDB file using 
a 3-letter code. I tried to list some of these non-standard code in the program 
and translate them into the corresponding one-letter code, but I cannot be 
comprehensive about it. For unrecognizable amino acids, I will just use "X" to 
represent them. If a chain contains too many "X", then it is probably a non-
amino acid chain (normally these chains should be represented by HETATOM 
records, but it is really the choice of the structure data depositors). 
Currently, if the frequency of "X" in a chain is higher or equal to 10%, then 
this chain is regarded as non-amino acid chain, and its sequence will not be 
shown in the output.

=cut
