package Bio::FASTASequence; use 5.006; use strict; use warnings; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = (); our @EXPORT_OK = (); our @EXPORT = qw(); our $VERSION = '0.04'; # # new creates a new instance of Bio::FASTASequence # sub new{ my ($class, $string_text) = @_; my ($description,$accession_nr); my %f_db_nr = (); unless($string_text){ $string_text = ""; } my $self = {}; bless($self,$class); my $sequencer; #print STDERR "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text)); die "Object doesn't contain fasta-sequence" if($string_text && !_is_fasta($string_text)); if($string_text){ $string_text =~ s/^(\r?\n)+//; $string_text =~ s/^>+/>/; my ($description_line,$sequence) = split(/\n/,$string_text,2); $description_line =~ s/\s+$//; $description_line =~ s/\r?\n/\n/g; $sequence =~ s/\r?\n//g; unless($description_line =~ /^>/){ $self->{accession_nr} = ""; } else{ # parsing the description line if($description_line =~ /^>gi\|/){ my ($gi,$number,$db); $description_line =~ s/^>//; ($gi,$number,$db,$accession_nr,$description) = split(/\|/,$description_line); } elsif($description_line =~ /^>sp\|/ || $description_line =~ /^>sptrembl\|/){ $description_line =~ s/^>//; my $desc; ($desc,$description) = split(/\s/,$description_line,2); $accession_nr = (split(/\|/,$desc))[1]; } elsif($description_line =~ /^>tr\|/){ $description_line =~ s/^>//; my $desc; ($desc,$description) = split(/\s/,$description_line,2); $accession_nr = (split(/\|/,$desc))[1]; } elsif($description_line =~ /^>[XY\d+]/){ $description_line =~ s/>//; chomp $description_line; $description = (split(/\s/,$description_line,3))[-1]; $accession_nr = (split(/\s/,$description_line,3))[0]; } elsif($description_line =~ /^>[0-9A-Za-z_]+\s?/){ $description_line =~ s/^>//; #-------------------------------------------------# # IPI-Sequences # #-------------------------------------------------# if($description_line =~ /^IPI:/){ # split only at first whitespace and take first element my $foreign_numbers = (split(/\s/,$description_line,2))[0]; $description = (split(/\s/,$description_line,3))[2]; my @foreign_acs = split(/\|/,$foreign_numbers); # cross-references to other databases foreach my $f_ac(@foreign_acs){ my ($key, $value) = split(/:/,$f_ac); $f_db_nr{$key} = $value; } unless($f_db_nr{'SWISS-PROT'}){ $f_db_nr{'SWISS-PROT'} = "NULL"; } unless($f_db_nr{'ENSEMBL'}){ $f_db_nr{'ENSEMBL'} = "NULL"; } unless($f_db_nr{'REFSEQ_XP'}){ $f_db_nr{'REFSEQ_XP'} = "NULL"; } unless($f_db_nr{'TREMBL'}){ $f_db_nr{'TREMBL'} = "NULL"; } $accession_nr = $f_db_nr{'IPI'}; delete $f_db_nr{IPI}; } #-----------------------------------------# # format begins with accession-nr # #-----------------------------------------# elsif($description_line =~ /^[A-Za-z][0-9][A-Z0-9a-z]{3}[0-9][\s\|]/){ if($description_line =~ /\|/){ ($accession_nr, $description) = split(/\|/,$description_line,2); } else{ ($accession_nr, $description) = split(/\s/,$description_line,2); } } elsif($description_line =~ /^[A-Za-z0-9_]+\s*?$/){ chomp $description_line; $accession_nr = $description_line; $description_line = ''; } else{ ($accession_nr,$description)= split(/\s/,$description_line,2); } } } $accession_nr =~ s/^>//; $accession_nr =~ s/[^\w\d]*?$//; $accession_nr =~ s/\.\d$//; $sequence =~ s/[^A-Z]//g; $sequencer = $sequence; } $self->{text} = $sequencer; $self->{accession_nr} = $accession_nr; $self->{description} = $description; $self->{seq_length} = length($sequencer); $self->{dbrefs} = \%f_db_nr; $self->{crc64} = $self->_crc64(); return $self; }#end new # # getSequence returns the sequence itself. # sub getSequence{ my ($class) = @_; return $class->{text}; }# end getText # # _is_fasta checks whether the given Sequence is in fasta-format or not # sub _is_fasta{ my ($sequence) = @_; my @lines = split(/\r?\n/,$sequence); my $desc = shift(@lines); my $seq = join("",@lines); $seq =~ s/\s+//g; if($desc =~ /^>/ && $seq !~ /[^A-NP-Z\*\-]/i && length($seq) > 0){ return 1; } return 0; }# end _is_fasta # # getSequenceLength returns how man aminoacids the sequence contains # sub getSequenceLength{ my ($class) = @_; return $class->{seq_length}; }# end getSequenceLength # # getAccessionNr returns the parsed accession number # sub getAccessionNr{ my ($class) = @_; return $class->{accession_nr}; }# end of getAccessionNr # # getDescription returns the description # sub getDescription{ my ($class) = @_; return $class->{description} }# end getDescription # # getCrc64 returns the crc64-checksum of the sequence # sub getCrc64{ my ($class) = @_; return $class->{crc64}; }# end getCrc64 # # getDBRefs returns an anonymous hash containing all references to foreign databases # sub getDBRefs{ my ($class) = @_; return $class->{dbrefs}; }# end getDBRefs # # allIndexesOf returns a reference to an array containing all positions of the requested Substring # sub allIndexesOf{ my ($self,$search) = @_; my $i = 1; my $index = 0; my @indices = (); while($i != -1){ $index = index($self->{text},$search,$index); push(@indices,$index) unless ($index == -1); $i = $index; $index++; } return \@indices; }# end allIndicesOf # # _crc64 calculates the crc64-checksum of the sequence. It's the crc64-checksum like at swiss-prot # the code is mainly adapted from SWISS::CRC64 # sub _crc64 { my ($self) = @_; my $text = $self->{text}; use constant EXP => 0xd8000000; my @highCrcTable = 256; my @lowCrcTable = 256; my $initialized = (); my $low = 0; my $high = 0; unless($initialized) { $initialized = 1; for my $i(0..255) { my $low_part = $i; my $high_part = 0; for my $j(0..7) { my $flag = $low_part & 1; # rflag ist für alle ungeraden zahlen 1 $low_part >>= 1;# um ein bit nach rechts verschieben $low_part |= (1 << 31) if $high_part & 1; # bitweises oder mit 2147483648 (), wenn $parth ungerade $high_part >>= 1; # um ein bit nach rechtsverschieben $high_part ^= EXP if $flag; } $highCrcTable[$i] = $high_part; $lowCrcTable[$i] = $low_part; } } foreach (split '', $text) { my $shr = ($high & 0xFF) << 24; my $tmph = $high >> 8; my $tmpl = ($low >> 8) | $shr; my $index = ($low ^ (unpack "C", $_)) & 0xFF; $high = $tmph ^ $highCrcTable[$index]; $low = $tmpl ^ $lowCrcTable[$index]; } return sprintf("%08X%08X", $high, $low); }# end crc64 # # seq2file prints the sequence into a file in fasta-file # sub seq2file{ my ($self,$file,$args_ref) = @_; # open the file to write open(W_SEQUENCE,">$file") or die "Can't open $file: $!\n"; print W_SEQUENCE ">",$self->{accession_nr}; # add the references foreach my $dbkey(keys(%{$self->{dbrefs}})){ print W_SEQUENCE "|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne 'NULL'); } # add description print W_SEQUENCE " ",$self->{description},"\n"; # add the sequence print W_SEQUENCE $self->{text},"\n"; close W_SEQUENCE; }# end seq2file # # getFASTA return a string in fasta-format # sub getFASTA{ my ($self) = @_; my $fasta = ">".$self->{accession_nr}; # add the references to foreign databases foreach my $dbkey(keys(%{$self->{dbrefs}})){ $fasta .= "|".$dbkey.":".$self->{dbrefs}->{$dbkey} if($self->{dbrefs}->{$dbkey} ne "NULL"); } # add description $fasta .= " ".$self->{description} if($self->{description}); $fasta .= "\n"; # add sequence $fasta .= $self->{text}."\n"; return $fasta; }# end getFASTA # # addDBRef adds a reference to a foreign database to the anonymous hash # sub addDBRef{ my ($self,$db,$dbref) = @_; # if a reference to the requested database already exists, append the new reference if($self->{dbrefs}->{$db} && ($self->{dbrefs}->{$db} ne 'NULL')){ $self->{dbrefs}->{$db} .= ";".$dbref; } # if no reference exists, add the reference to the hash else{ $self->{dbrefs}->{$db} = $dbref; } }# end addDBRef # # seq2xml creates an xml-string containing all information about the given sequence. # sub seq2xml{ my ($self) = @_; # create the tags representing the references to foreign databases, e.g. SWISS-Prot my $dbrefs = " "; foreach(keys(%{$self->{dbrefs}})){ my $key = uc($_); $dbrefs .= "\n<".$key.'>'.${$self->{dbrefs}}->{$_}.'' if(${$self->{dbrefs}}->{$_} ne 'NULL'); } $dbrefs = "\n$dbrefs" if($dbrefs ne " "); # create the xml-string my $xml = qq~ $self->{description}$dbrefs $self->{seq_length} $self->{crc64} $self->{text} ~; return $xml; }# end seq2xml 1; __END__ # Below is stub documentation for your module. You'd better edit it! =head1 NAME Bio::FASTASequence - Perl extension for Bioinformatics. Parsing sequence informations. =head1 SYNOPSIS use Bio::FASTASequence; my $fasta = qq~>sp|P01815|HV2B_HUMAN Ig heavy chain V-II region COR - Homo sapiens (Human). QVTLRESGPALVKPTQTLTLTCTFSGFSLSSTGMCVGWIRQPPGKGLEWLARIDWDDDKY YNTSLETRLTISKDTSRNQVVLTMDPVDTATYYCARITVIPAPAGYMDVWGRGTPVTVSS ~; my $seq = Bio::FASTASequence->new($fasta); =head1 ABSTRACT Bio::FASTASequence is a perl module to parse information out off a Fasta-Sequence. =head1 DESCRIPTION This perl module is a simple utility to simplify the job of bioinformatics. It parses several information about a given FASTA-Sequence such as: =over 10 =item * accession number =item * description =item * sequence itself =item * length of sequence =item * crc64 checksum (as it is used by SWISS-PROT) =item * seq2xml =back =head2 METHODS =head3 new =head3 getAccessionNr my $accession = $seq->getAccessionNr(); returns the AccessionNr of the FASTA-Sequence =head3 getDescription my $description = $seq->getDescription(); returns the description standing in the first line of the FASTA-format (without the accession number) =head3 getSequence my $sequence = $seq->getSequence(); returns the sequence =head3 getCrc64 my $crc64_checksum = $seq->getCrc64(); returns the crc64 checksum of the sequence. This checksum corresponds with the crc64 checksum of SWISS-PROT =head3 addDBRef $seq->addDBRef(DB, REFERENCE_AC); DB is the name of the referenced database REFERENCE_AC is the accession number in the referenced database =head3 seq2file $seq->seq2file(FILENAME); FILENAME is the path of the file where the sequence has to be stored. =head3 allIndexesOf my $indexes = $seq->allIndexesOf(EXPR); returns a reference on an array, which contains all indexes of EXPR in the sequence =head3 getSequenceLength my $length = $seq->getSequenceLength(); returns the length of the sequence =head3 getDBRefs my $hashref = $seq->getDBRefs(); returns a hashreference. The hash contains all references hashref = {'SWISS-PROT' => 'P01815'}, =head3 getFASTA my $fasta_sequence = $seq->getFASTA(); returns the sequence in FASTA-format =head2 EXAMPLE use Bio::FASTASequence; my $fasta = qq~>sp|P01815|HV2B_HUMAN Ig heavy chain V-II region COR - Homo sapiens (Human). QVTLRESGPALVKPTQTLTLTCTFSGFSLSSTGMCVGWIRQPPGKGLEWLARIDWDDDKY YNTSLETRLTISKDTSRNQVVLTMDPVDTATYYCARITVIPAPAGYMDVWGRGTPVTVSS ~; my $seq = Bio::FASTASequence->new($fasta); print 'The sequence of '.$seq->getAccessionNr().' is '.$seq->getSequence(),"\n"; print 'This sequence contains '.scalar($seq->allIndexesOf('C').' times Cystein at the following positions:'; print $_+1.', ' for(@{$seq->allIndexesOf('C')}); =head1 ADDITIONAL INFORMATION =head3 accepted formats This module can parse the following formats: =over 4 =item >P02656 APC3_HUMAN Apolipoprotein C-III precursor (Apo-CIII). =item >IPI:IPI00166553|REFSEQ_XP:XP_290586|ENSEMBL:ENSP00000331094|TREMBL:Q8N3H0 T Hypothetical protein =item >sp|P01815|HV2B_HUMAN Ig heavy chain V-II region COR - Homo sapiens (Human). =back =head3 structure The structure of the hash for the example is: $VAR1 = { 'seq_length' => 120, 'accession_nr' => 'P01815', 'text' => 'QVTLRESGPALVKPTQTLTLTCTFSGFSLSSTGMCVGWIRQPPGKGLEWLARIDWDDDKYYNTSLETRLTISKDTSRNQVVLTMDPVDTATYYCARITVIPAPAGYMDVWGRGTPVTVSS', 'crc64' => '158A8B29AE7EEB98', 'dbrefs' => {}, 'description' => 'Ig heavy chain V-II region COR - Homo sapiens (Human).' } if you miss something please contact me. =head1 BUGS There is no bug known. If you experienced any problems, please contact me. =head1 SEE ALSO http://modules.renee-baecker.de # not available yet - this site is under construction the crc64-routine is based on the SWISS::CRC64 module. =head1 MODIFICATIONS More FASTA-Description lines are accepted. =head1 AUTHOR Renee Baecker, Emodule@renee-baecker.deE feel free to contact me. =head1 COPYRIGHT AND LICENSE Copyright 2004 by Renee Baecker This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut