# -*-Perl-*- Test Harness script for Bioperl # $Id$ use strict; BEGIN { use lib '.'; use Bio::Root::Test; test_begin(-tests => 561); use_ok('Bio::SeqIO'); } my $verbose = test_debug(); ################################## GenBank ################################## my $ast = Bio::SeqIO->new(-format => 'gbdriver' , -verbose => $verbose, -file => test_input_file("roa1.genbank")); $ast->verbose($verbose); my $as = $ast->next_seq(); is $as->molecule, 'mRNA',$as->accession_number; is $as->alphabet, 'dna'; is($as->primary_id, 3598416); my @class = $as->species->classification; is $class[$#class],'Eukaryota'; $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("NT_021877.gbk")); $ast->verbose($verbose); $as = $ast->next_seq(); is $as->molecule, 'DNA',$as->accession_number; is $as->alphabet, 'dna'; is($as->primary_id, 37539616); is($as->accession_number, 'NT_021877'); my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures(); is(($cds->get_tag_values('transl_except'))[1], '(pos:complement(4224..4226),aa:OTHER)'); # test for a DBSOURCE line $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("BAB68554.gb")); $ast->verbose($verbose); $as = $ast->next_seq(); is $as->molecule, 'linear',$as->accession_number;; is $as->alphabet, 'protein'; # Though older GenBank releases indicate SOURCE contains only the common name, # this is no longer true. In general, this line will contain an abbreviated # form of the full organism name (but may contain the full length name), # as well as the optional common name and organelle. There is no get/set # for the abbreviated name but it is accessible via name() ok defined($as->species->name('abbreviated')->[0]); is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise'; is($as->primary_id, 15824047); my $ac = $as->annotation; ok defined $ac; my @dblinks = $ac->get_Annotations('dblink'); is(scalar @dblinks,1); is($dblinks[0]->database, 'GenBank'); is($dblinks[0]->primary_id, 'AB072353'); is($dblinks[0]->version, '1'); is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated'); # test for multi-line SOURCE $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("NC_006346.gb")); $as = $ast->next_seq; is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;; @class = $as->species->classification; is($class[$#class],'Eukaryota'); is($as->species->common_name,'mushroomtongue salamander'); $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("U71225.gb")); $as = $ast->next_seq; @class = $as->species->classification; is($class[$#class],'Eukaryota',$as->accession_number); is $as->species->common_name,'black-bellied salamander'; # test for unusual common name $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("AB077698.gb")); $as = $ast->next_seq; # again, this is not a common name but is in name('abbreviated') ok defined($as->species->name('abbreviated')->[0]),$as->accession_number; is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA'; # test for common name with parentheses $ast = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file("DQ018368.gb")); $as = $ast->next_seq; is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata', $as->accession_number;; # test secondary accessions my $seqio = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('D10483.gbk')); my $seq = $seqio->next_seq; my @kw = $seq->get_keywords; is(scalar @kw, 118, $seq->accession_number); is($kw[-1], 'yabO'); my @sec_acc = $seq->get_secondary_accessions(); is(scalar @sec_acc,14); is($sec_acc[-1], 'X56742'); # bug #1487 my $str = Bio::SeqIO->new(-verbose => $verbose, -file => test_input_file('D12555.gbk')); eval { $seq = $str->next_seq; }; ok(! $@, 'bug 1487'); # bug 1647 rpt_unit sub-feature with multiple parens $str = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('mini-AE001405.gb')); ok($seq = $str->next_seq); my @rpts = grep { $_->primary_tag eq 'repeat_region' } $seq->get_SeqFeatures; is $#rpts, 2, 'bug 1647'; my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts; is $#rpt_units, 0; is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7'); # test bug #1673 , RDB-II genbank files $str = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('Mcjanrna_rdbII.gbk') ); ok($seq = $str->next_seq, 'bug 1673'); my @refs = $seq->annotation->get_Annotations('reference'); is(@refs, 1); is($seq->display_id,'Mc.janrrnA'); is($seq->molecule ,'RNA'); $str = Bio::SeqIO->new(-format => 'gbdriver', -file => test_input_file("AF165282.gb"), -verbose => $verbose); $seq = $str->next_seq; my @features = $seq->all_SeqFeatures(); is(@features, 5, $seq->accession_number); is($features[0]->start, 1); is($features[0]->end, 226); my $location = $features[1]->location; ok($location->isa('Bio::Location::SplitLocationI')); my @sublocs = $location->sub_Location(); is(@sublocs, 29); # version and primary ID - believe it or not, this wasn't working is ($seq->version, 1); is ($seq->seq_version, 1); is ($seq->primary_id, "5734104"); # streaming and Bio::RichSeq creation my $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank"), -verbose => $verbose, -format => 'gbdriver'); $stream->verbose($verbose); my $seqnum = 0; my $species; my @cl; my $lasts; my @ids = qw(DDU63596 DDU63595 HUMBDNF); my @tids = (44689, 44689, 9606); my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum", "Homo sapiens"); while($seq = $stream->next_seq()) { if($seqnum < 3) { is $seq->display_id(), $ids[$seqnum]; $species = $seq->species(); @cl = $species->classification(); is( $species->binomial(), $tnames[$seqnum], 'species parsing incorrect for genbank'); is( $cl[3] ne $species->genus(), 1, 'genus duplicated in genbank parsing'); is( $species->ncbi_taxid, $tids[$seqnum] ); } $seqnum++; $lasts = $seq; } is($seqnum, 5,'streaming'); is $lasts->display_id(), "HUMBETGLOA"; my ($ref) = $lasts->annotation->get_Annotations('reference'); is($ref->medline, 94173918); $stream->close(); $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank.noseq"), -verbose => $verbose, -format => 'gbdriver' ); $seqnum = 0; while($seq = $stream->next_seq()) { if($seqnum < 3) { is $seq->display_id(), $ids[$seqnum]; } elsif( $seq->display_id eq 'M37762') { is( ($seq->get_keywords())[0], 'neurotrophic factor'); } $seqnum++; } is $seqnum, 5, "Total number of sequences in test file"; # fuzzy $seq = Bio::SeqIO->new( -format => 'gbdriver', -verbose => $verbose, -file =>test_input_file("testfuzzy.genbank")); $seq->verbose($verbose); ok(defined($as = $seq->next_seq())); @features = $as->all_SeqFeatures(); is(@features,21,'Fuzzy in'); my $lastfeature = pop @features; # this is a split location; the root doesn't have strand is($lastfeature->strand, undef); $location = $lastfeature->location; #$location->verbose(-1); # silence the warning of undef seq_id() # see above; splitlocs roots do not have a strand really is($location->strand, undef); is($location->start, 83202); is($location->end, 84996); @sublocs = $location->sub_Location(); is(@sublocs, 2); my $loc = shift @sublocs; is($loc->start, 83202); is($loc->end, 83329); is($loc->strand, -1); $loc = shift @sublocs; is($loc->start, 84248); is($loc->end, 84996); is($loc->strand,1); my $outfile = test_output_file(); $seq = Bio::SeqIO->new(-format => 'genbank', -verbose => $verbose, -file=> ">$outfile"); $seq->verbose($verbose); ok($seq->write_seq($as),'Fuzzy out'); ## now genbank ## $str = Bio::SeqIO->new(-format =>'gbdriver', -verbose => $verbose, -file => test_input_file('BK000016-tpa.gbk')); $seq = $str->next_seq; ok(defined $seq, $seq->accession_number); ok(defined $seq->seq); is($seq->accession_number, 'BK000016',$seq->accession_number); is($seq->alphabet, 'dna'); is($seq->display_id, 'BK000016'); is($seq->length, 1162); is($seq->division, 'ROD'); is($seq->get_dates, 1); is($seq->keywords, 'Third Party Annotation; TPA'); is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.'); is($seq->seq_version, 1); is($seq->feature_count, 2); my $spec_obj = $seq->species; is ($spec_obj->common_name, 'house mouse'); is ($spec_obj->species, 'musculus'); is ($spec_obj->genus, 'Mus'); is ($spec_obj->binomial, 'Mus musculus'); $ac = $seq->annotation; my $reference = ($ac->get_Annotations('reference') )[0]; is ($reference->pubmed, '11479594'); is ($reference->medline, '21372465',$seq->accession_number); # validate that what is written is what is read my $testfile = test_output_file(); my $out = Bio::SeqIO->new(-file => ">$testfile", -format => 'genbank'); $out->write_seq($seq); $out->close(); $str = Bio::SeqIO->new(-format =>'gbdriver', -file => $testfile); $seq = $str->next_seq; ok(defined $seq,'roundtrip'); ok(defined $seq->seq); is($seq->accession_number, 'BK000016'); is($seq->alphabet, 'dna'); is($seq->display_id, 'BK000016'); is($seq->length, 1162); is($seq->division, 'ROD'); is($seq->get_dates, 1); is($seq->keywords, 'Third Party Annotation; TPA'); is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.'); is($seq->seq_version, 1); is($seq->feature_count, 2); $spec_obj = $seq->species; is ($spec_obj->common_name, 'house mouse'); is ($spec_obj->species, 'musculus'); is ($spec_obj->genus, 'Mus'); is ($spec_obj->binomial, 'Mus musculus'); $ac = $seq->annotation; $reference = ($ac->get_Annotations('reference') )[0]; is ($reference->pubmed, '11479594'); is ($reference->medline, '21372465'); # write revcomp split location my $gb = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('revcomp_mrna.gb')); $seq = $gb->next_seq(); $gb = Bio::SeqIO->new(-format => 'genbank', -file => ">$testfile"); $gb->write_seq($seq); undef $gb; ok(! -z $testfile, 'revcomp split location'); # bug 1925, continuation of long ORGANISM line ends up in @classification: # ORGANISM Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC # 9150 # Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; # Enterobacteriaceae; Salmonella. $gb = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('NC_006511-short.gbk')); $seq = $gb->next_seq; is $seq->species->common_name, undef, "Bug 1925"; is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150"; @class = $seq->species->classification; is $class[$#class], "Bacteria"; # WGS tests $gb = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('O_sat.wgs')); $seq = $gb->next_seq; my @tests = ('wgs' => 'AAAA02000001-AAAA02050231', 'wgs_scafld' => 'CM000126-CM000137', 'wgs_scafld' => 'CH398081-CH401163'); my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD)); my $ct=0; for my $wgs (@wgs) { my ($tagname, $value) = (shift @tests, shift @tests); is($wgs->tagname, $tagname, $tagname); is($wgs->value, $value); $ct++; } is ($ct, 3); # make sure we can retrieve a feature with a primary tag of 'misc_difference' $gb = Bio::SeqIO->new(-format => 'gbdriver', -verbose => $verbose, -file => test_input_file('BC000007.gbk')); $seq = $gb->next_seq; ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures; my @vals = $cds->get_tag_values('gene'); is $vals[0], 'PX19', $seq->accession_number; # Check that the source,organism section is identical between input and output. # - test an easy one where organism is species, then two different formats of # subspecies, then a species with a format that used to be mistaken for # subspecies, then a bacteria with no genus, and finally a virus with a genus. # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon- # based system for verifying taxonomic information. Right now they just verify # changes so are really useless; I will change them to verify common name, # organelle, scientific name, etc. # output always adds a period (GenBank std), but two of these files do not use them. foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') { my $infile = test_input_file($in); $outfile = test_output_file(); $str = Bio::SeqIO->new(-format =>'genbank', -verbose => $verbose, -file => $infile); $seq = $str->next_seq; $out = Bio::SeqIO->new(-file => $outfile, -format => 'genbank'); $out->write_seq($seq); $out->close(); open (IN, $infile); my @in = ; close(IN); open (RESULT, $outfile); my $line = 0; my $check = 0; my $is = 1; FILECHECK: while (my $result = ) { if ($result =~ /^KEYWORDS/) { $check = 1; next; } if ($result =~ /^REFERENCE/) { last FILECHECK; } if ($check) { # end periods don't count (not all input files have them) $result =~ s{\.$}{}; $in[$line] =~ s{\.$}{}; if ($result ne $in[$line]) { $is = 0; last; } } } continue { $line++ } close(RESULT); ok $is, $in; } # NB: there should probably be full testing on all lines to ensure that output # matches input. # 20061117: problem with *double* colon in some annotation-dblink values $ct = 0; foreach my $in ('P35527.gb') { my $infile = test_input_file($in); $str = Bio::SeqIO->new(-format =>'genbank', -verbose => $verbose, -file => $infile); $seq = $str->next_seq; my $ac = $seq->annotation(); # Bio::AnnotationCollection foreach my $key ($ac->get_all_annotation_keys() ) { my @values = $ac->get_Annotations($key); foreach my $ann (@values) { my $value = $ann->display_text; $ct++; if ($key eq 'dblink') { ok (index($value,'::') < 0); # this should never be true ok ($value, $value); # check value is not empty # print " ann/", sprintf('%12s ',$key), '>>>', $value , '<<<', "\n"; # print " index double colon: ",index($value ,'::'), "\n"; # check db name: my @parts = split(/:/,$value); if ( $parts[0] =~ /^(?: # not an exhaustive list of databases; # just the db's referenced in P35527.gb: swissprot | GenBank | GenPept | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress | GO | InterPro | Pfam| PRINTS | PROSITE )$/x ) { ok 1; } else { ok 0; } ok ( $parts[1], "$parts[0]" ); } # elsif ($key eq 'reference') { } } } } is($ct, 46); # bug 2195 $str = Bio::SeqIO->new(-format =>'gbdriver', -verbose => $verbose, -file => test_input_file('AF305198.gb') ); $species = $str->next_seq->species; is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195'); is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '. '16SrV (Elm yellows group), Candidatus Phytoplasma, '. 'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '. 'Firmicutes, Bacteria', 'Bug 2195'); # bug 2569, PROJECT line support, read and write, round-tripping $str = Bio::SeqIO->new(-format =>'gbdriver', -verbose => $verbose, -file => test_input_file('NC_008536.gb')); $seq = $str->next_seq; my $project = ($seq->annotation->get_Annotations('project'))[0]; isa_ok($project, 'Bio::Annotation::SimpleValue'); if ($project) { is($project->value, 'GenomeProject:12638'); } else { ok(0, "PROJECT not parsed"); } $outfile = test_output_file(); $gb = Bio::SeqIO->new(-format => 'genbank', -verbose => $verbose, -file=> ">$outfile"); $gb->write_seq($seq); $str = Bio::SeqIO->new(-format =>'gbdriver', -verbose => $verbose, -file => $outfile); $seq = $str->next_seq; $project = ($seq->annotation->get_Annotations('project'))[0]; isa_ok($project, 'Bio::Annotation::SimpleValue'); if ($project) { is($project->value, 'GenomeProject:12638'); } else { ok(0, "Roundtrip test failed"); } ################################## EMBL ################################## # Set to -1 for release version, so warnings aren't printed $ast = Bio::SeqIO->new( -format => 'embldriver', -verbose => $verbose, -file => test_input_file("roa1.dat")); $ast->verbose($verbose); $as = $ast->next_seq(); ok defined $as->seq; is($as->display_id, 'HSHNCPA1'); is($as->accession_number, 'X79536'); is($as->seq_version, 1); is($as->version, 1); is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1'); is($as->molecule, 'RNA'); is($as->alphabet, 'rna'); is(scalar $as->all_SeqFeatures(), 4); is($as->length, 1198); is($as->species->binomial(), 'Homo sapiens'); is($as->get_dates, 2); # EMBL Release 87 changes (8-17-06) $ast = Bio::SeqIO->new( -format => 'embldriver', -verbose => $verbose, -file => test_input_file("roa1_v2.dat")); $ast->verbose($verbose); $as = $ast->next_seq(); ok defined $as->seq; # accession # same as display name now is($as->display_id, 'X79536'); is($as->get_dates, 2); is($as->accession_number, 'X79536'); is($as->seq_version, 1); is($as->version, 1); is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1'); # mRNA instead of RNA is($as->molecule, 'mRNA'); is($as->alphabet, 'rna'); is(scalar $as->all_SeqFeatures(), 4); is($as->length, 1198); is($as->species->binomial(), 'Homo sapiens'); my $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), -format => 'embldriver'); $seq = $ent->next_seq(); is(defined $seq->seq(), 1, 'success reading Embl with ^ location and badly split double quotes'); is(scalar $seq->annotation->get_Annotations('reference'), 3); is($seq->get_dates, 0); $out = Bio::SeqIO->new(-file=> ">$outfile", -format => 'embl'); is($out->write_seq($seq),1, 'success writing Embl format with ^ < and > locations'); # embl with no FT $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), -format => 'embldriver'); $seq = $ent->next_seq(); ok($seq); is(lc($seq->subseq(1,10)),'gatcagtaga'); is($seq->length, 4870); # embl with no FH my $noFH = Bio::SeqIO->new(-file => test_input_file("no_FH.embl"), -format => 'embldriver'); $seq = $noFH->next_seq; is(scalar($seq->get_SeqFeatures), 4); is($seq->display_id, 'AE000001'); is($seq->get_dates, 0); # bug 1571 $ent = Bio::SeqIO->new(-format => 'embldriver', -file => test_input_file('test.embl2sq')); $seq = $ent->next_seq; is($seq->length,4870); is($seq->get_dates, 0); # embl repbase $ent = Bio::SeqIO->new(-file => test_input_file("BEL16-LTR_AG.embl"), -format => 'embldriver'); $seq = $ent->next_seq; is($seq->display_id,'BEL16-LTR_AG'); is($seq->get_dates, 2); # test secondary accessions in EMBL (bug #1332) $seqio = Bio::SeqIO->new(-format => 'embldriver', -file => test_input_file('ECAPAH02.embl')); $seq = $seqio->next_seq; is($seq->accession_number, 'D10483'); is($seq->seq_version, 2); my @accs = $seq->get_secondary_accessions(); is($accs[0], 'J01597'); is($accs[-1], 'X56742'); is($seq->get_dates, 2); ### TPA TESTS - Thanks to Richard Adams ### # test Third Party Annotation entries in EMBL/Gb format # to ensure compatability with parsers. $str = Bio::SeqIO->new(-verbose => $verbose, -format =>'embldriver', -file => test_input_file('BN000066-tpa.embl')); $seq = $str->next_seq; ok(defined $seq); is($seq->accession_number, 'BN000066'); is($seq->alphabet, 'dna'); is($seq->display_id, 'AGA000066'); is($seq->length, 5195); is($seq->division, 'INV'); is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA'); is($seq->seq_version, 1); is($seq->feature_count, 15); is($seq->get_dates, 2); $spec_obj = $seq->species; is ($spec_obj->common_name, 'African malaria mosquito'); is ($spec_obj->species, 'gambiae'); is ($spec_obj->genus, 'Anopheles'); is ($spec_obj->binomial, 'Anopheles gambiae'); $ac = $seq->annotation; $reference = ($ac->get_Annotations('reference') )[1]; is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"'); is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.'); my $cmmnt = ($ac->get_Annotations('comment') )[0]; is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org)'); $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), -format => 'embldriver'); $ent->verbose($verbose); $seq = $ent->next_seq(); $species = $seq->species(); @cl = $species->classification(); is( $cl[3] ne $species->genus(), 1, 'genus duplication test'); $ent->close(); # ## read-write - test embl writing of a PrimarySeq # my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA', -id => 'myid', -desc => 'mydescr', -alphabet => 'DNA', -accession_number => 'myaccession'); $verbose = -1 unless $ENV{'BIOPERLDEBUG'}; # silence warnings unless we are debuggin my $embl = Bio::SeqIO->new(-format => 'embl', -verbose => $verbose, -file => ">$outfile"); ok($embl->write_seq($primaryseq)); # this should generate a warning my $scalar = "test"; eval { $embl->write_seq($scalar); }; ok ($@); ############################## Swiss/UniProt ############################## $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => test_input_file('test.swiss')); isa_ok($seqio, 'Bio::SeqIO'); $seq = $seqio->next_seq; my @gns = $seq->annotation->get_Annotations('gene_name'); $outfile = test_output_file(); $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swiss', -file => ">$outfile"); $seqio->write_seq($seq); # reads it in once again $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => $outfile); $seq = $seqio->next_seq; isa_ok($seq->species, 'Bio::Species'); is($seq->species->ncbi_taxid, 6239); # version, seq_update, dates (5 tests) is($seq->version, 40); my ($ann) = $seq->annotation->get_Annotations('seq_update'); eval {is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated')}; ok(!$@); my @dates = $seq->get_dates; my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001); for my $date (@dates) { my $expdate = shift @date_check; is($date, $expdate,'dates'); } my @gns2 = $seq->annotation->get_Annotations('gene_name'); # check gene name is preserved (was losing suffix in worm gene names) ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value); # test swissprot multiple RP lines $str = Bio::SeqIO->new(-file => test_input_file('P33897')); $seq = $str->next_seq; isa_ok($seq, 'Bio::Seq::RichSeqI'); @refs = $seq->annotation->get_Annotations('reference'); is( @refs, 23); is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.'); # version, seq_update, dates (5 tests) is($seq->version, 44); ($ann) = $seq->annotation->get_Annotations('seq_update'); is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated'); @dates = $seq->get_dates; @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004); for my $date (@dates) { is($date, shift @date_check); } $ast = Bio::SeqIO->new(-verbose => $verbose, -format => 'swissdriver' , -file => test_input_file("roa1.swiss")); $as = $ast->next_seq(); ok defined $as->seq; is($as->id, 'ROA1_HUMAN', "id is ".$as->id); like($as->primary_id, qr(Bio::PrimarySeq)); is($as->length, 371); is($as->alphabet, 'protein'); is($as->division, 'HUMAN'); is(scalar $as->all_SeqFeatures(), 16); is(scalar $as->annotation->get_Annotations('reference'), 11); # version, seq_update, dates (5 tests) is($as->version, 35); ($ann) = $as->annotation->get_Annotations('seq_update'); is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated'); @dates = $as->get_dates; @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997); for my $date (@dates) { is($date, shift @date_check); } ($ent,$out) = undef; ($as,$seq) = undef; $seqio = Bio::SeqIO->new(-format => 'swissdriver' , -verbose => $verbose, -file => test_input_file("swiss.dat")); $seq = $seqio->next_seq; isa_ok($seq, 'Bio::Seq::RichSeqI'); # more tests to verify we are actually parsing correctly like($seq->primary_id, qr(Bio::PrimarySeq)); is($seq->display_id, 'MA32_HUMAN'); is($seq->length, 282); is($seq->division, 'HUMAN'); is($seq->alphabet, 'protein'); my @f = $seq->all_SeqFeatures(); is(@f, 2); is($f[1]->primary_tag, 'CHAIN'); is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN'); # version, seq_update, dates (5 tests) is($seq->version, 40); ($ann) = $seq->annotation->get_Annotations('seq_update'); is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated'); @dates = $seq->get_dates; @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000); for my $date (@dates) { is($date, shift @date_check); } my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP); ($ann) = $seq->annotation->get_Annotations('gene_name'); # use Data::Stag findval and element name to get values/nodes foreach my $gn ( $ann->findval('Name') ) { ok ($gn, shift(@genenames)); } foreach my $gn ( $ann->findval('Synonyms') ) { ok ($gn, shift(@genenames)); } like($ann->value, qr/Name: GC1QBP/); # test for feature locations like ?..N $seq = $seqio->next_seq(); isa_ok($seq, 'Bio::Seq::RichSeqI'); like($seq->primary_id, qr(Bio::PrimarySeq)); is($seq->display_id, 'ACON_CAEEL'); is($seq->length, 788); is($seq->division, 'CAEEL'); is($seq->alphabet, 'protein'); is(scalar $seq->all_SeqFeatures(), 5); foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) { ok ($gn->value, 'F54H12.1'); } # test species in swissprot -- this can be a n:n nightmare $seq = $seqio->next_seq(); isa_ok($seq, 'Bio::Seq::RichSeqI'); like($seq->primary_id, qr(Bio::PrimarySeq)); @sec_acc = $seq->get_secondary_accessions(); is($sec_acc[0], 'P29360'); is($sec_acc[1], 'Q63631'); is($seq->accession_number, 'P42655'); @kw = $seq->get_keywords; is( $kw[0], 'Brain'); is( $kw[1], 'Neurone'); is($kw[3], 'Multigene family'); is($seq->display_id, '143E_HUMAN'); # hybrid names from old sequences are no longer valid, these are chopped # off at the first organism is($seq->species->binomial, "Homo sapiens"); is($seq->species->common_name, "Human"); is($seq->species->ncbi_taxid, 9606); $seq = $seqio->next_seq(); isa_ok($seq, 'Bio::Seq::RichSeqI'); like($seq->primary_id, qr(Bio::PrimarySeq)); is($seq->species->binomial, "Bos taurus"); is($seq->species->common_name, "Bovine"); is($seq->species->ncbi_taxid, 9913); # multiple genes in swissprot $seq = $seqio->next_seq(); isa_ok($seq, 'Bio::Seq::RichSeqI'); like($seq->primary_id, qr(Bio::PrimarySeq)); ($ann) = $seq->annotation->get_Annotations("gene_name"); @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC); my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)"; my @names = @genenames; # copy array my @ann_names = $ann->get_all_values(); is(scalar(@ann_names), scalar(@names)); # do this in a layered way (nested tags) for my $node ($ann->findnode('gene_name')) { for my $name ($node->findval('Name')) { is($name, shift(@names)); } for my $name ($node->findval('Synonyms')) { is($name, shift(@names)); } } is(scalar(@names),0); # same entry as before, but with the new gene names format $seqio = Bio::SeqIO->new(-format => 'swissdriver', -verbose => $verbose, -file => test_input_file("calm.swiss")); $seq = $seqio->next_seq(); isa_ok($seq, 'Bio::Seq::RichSeqI'); like($seq->primary_id, qr(Bio::PrimarySeq)); ($ann) = $seq->annotation->get_Annotations("gene_name"); @names = @genenames; # copy array my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array is(scalar(@ann_names2), scalar(@names)); for my $node ($ann->findnode('gene_name')) { for my $name ($node->findval('Name')) { is($name, shift(@names)); } for my $name ($node->findval('Synonyms')) { is($name, shift(@names)); } } is(scalar(@names),0); # test proper parsing of references my @litrefs = $seq->annotation->get_Annotations('reference'); is(scalar(@litrefs), 17); my @titles = ( '"Complete amino acid sequence of human brain calmodulin."', '"Multiple divergent mRNAs code for a single human calmodulin."', '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."', '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."', '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."', undef, '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."', '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."', '"The DNA sequence and analysis of human chromosome 14."', '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."', '"Alpha-helix nucleation by a calcium-binding peptide loop."', '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."', '"Calmodulin structure refined at 1.7 A resolution."', '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."', '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."', '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."', '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."', ); my @locs = ( "Biochemistry 21:2565-2569(1982).", "J. Biol. Chem. 263:17055-17062(1988).", "J. Biol. Chem. 262:16663-16670(1987).", "Biochem. Int. 9:177-185(1984).", "Eur. J. Biochem. 225:71-82(1994).", "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.", "Cell Calcium 23:323-338(1998).", "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.", "Nature 421:601-607(2003).", "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).", "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).", "Nat. Struct. Biol. 8:990-997(2001).", "J. Mol. Biol. 228:1177-1192(1992).", "Biochemistry 33:15259-15265(1994).", "Nature 415:396-402(2002).", "EMBO J. 21:6721-6732(2002).", "Nat. Struct. Biol. 10:226-231(2003).", ); my @positions = ( undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, 94, 103, 1, 76, undef, undef, undef, undef, 5, 148, 1, 148, undef, undef, ); foreach my $litref (@litrefs) { is($litref->title, shift(@titles)); is($litref->location, shift(@locs)); is($litref->start, shift(@positions)); is($litref->end, shift(@positions)); } # format parsing changes (pre-rel 9.0) $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => test_input_file('pre_rel9.swiss')); ok($seqio); $seq = $seqio->next_seq; isa_ok($seq->species, 'Bio::Species'); is($seq->species->ncbi_taxid, "6239"); # version, seq_update, dates (5 tests) is($seq->version, 44); ($ann) = $seq->annotation->get_Annotations('seq_update'); is($ann->display_text, 1); @dates = $seq->get_dates; @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 ); for my $date (@dates) { is($date, shift @date_check); } my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090 IPR006092 IPR009075 IPR009100 IPR013764 PF00441 PF02770 PF02771 PS00072 PS00073); for my $dblink ( $seq->annotation->get_Annotations('dblink') ) { is($dblink->primary_id, shift @idcheck); } $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => test_input_file('pre_rel9.swiss')); my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL); while (my $seq = $seqio->next_seq) { is($seq->namespace, shift @namespaces); } # format parsing changes (rel 9.0, Oct 2006) $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => test_input_file('rel9.swiss')); ok($seqio); $seq = $seqio->next_seq; isa_ok($seq->species, 'Bio::Species'); is($seq->species->ncbi_taxid, 6239); is($seq->version, 47); ($ann) = $seq->annotation->get_Annotations('seq_update'); is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated'); @dates = $seq->get_dates; @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 ); for my $date (@dates) { is($date, shift @date_check); } @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090 IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770 PF02771 PS00072 PS00073 ); for my $dblink ( $seq->annotation->get_Annotations('dblink') ) { is($dblink->primary_id, shift @idcheck); } $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swissdriver', -file => test_input_file('rel9.swiss')); @namespaces = qw(Swiss-Prot TrEMBL TrEMBL); while (my $seq = $seqio->next_seq) { is($seq->namespace, shift @namespaces); } # bug 2288 # Q8GBD3.swiss $seqio = Bio::SeqIO->new( -verbose => $verbose, -format => 'swiss', -file => test_input_file('Q8GBD3.swiss')); while (my $seq = $seqio->next_seq) { my $lineage = join(';', $seq->species->classification); is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'. 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'. 'Proteobacteria;Bacteria'); } # test for GenBank swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536) $ast = Bio::SeqIO->new(-format => 'genbank', -verbose => $verbose, -file => test_input_file('P39765.gb')); $ast->verbose($verbose); $as = $ast->next_seq(); is $as->molecule, 'PRT',$as->accession_number;; is $as->alphabet, 'protein'; # Though older GenBank releases indicate SOURCE contains only the common name, # this is no longer true. In general, this line will contain an abbreviated # form of the full organism name (but may contain the full length name), # as well as the optional common name and organelle. There is no get/set # for the abbreviated name but it is accessible via name() ok defined($as->species->name('abbreviated')->[0]); is $as->species->name('abbreviated')->[0], 'Bacillus subtilis'; is($as->primary_id, 20141743); $ac = $as->annotation; ok defined $ac; @dblinks = $ac->get_Annotations('dblink'); is(scalar @dblinks,31); is($dblinks[0]->database, 'UniProtKB'); is($dblinks[0]->primary_id, 'PYRR_BACSU'); is($dblinks[0]->version, undef); is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');