# -*-Perl-*- Test Harness script for Bioperl
# $Id$
#
# Exercises Bio::SeqIO::entrezgene against the 'entrezgene.dat' test input.
# Each record returned by next_seq is checked section by section (T0..T17),
# and the number of iterations of every annotation loop is compared with a
# per-record table of expected counts (@loop_counts).  A second pass (T18)
# re-reads the same file with -locuslink=>'convert'.

use strict;

BEGIN {
    use lib '.';
    use Bio::Root::Test;

    test_begin(-tests => 1422,
               -requires_module => 'Bio::ASN1::EntrezGene');

    use_ok('Bio::SeqIO::entrezgene');
}

# Binomial names accepted by the T1 organism check.
my @species=('Homo sapiens','Mus musculus', 'Caenorhabditis elegans');

# Medline ids accepted by the T12 reference check (only run for accession 1).
my @pubmed=qw(15461460 15221005 14702039 12477932 8889549 3610142 3458201 2591067);

# Accession number => expected number of 'Reference' annotations (T9).
my %pmed=(1=>8, 2=>55, 3=>1, 4=>0, 5=>0, 6=>0, 7=>0, 8=>1, 9=>32, 10=>58,
          11=>1, 12=>76, 13=>7, 14=>5, 15=>13, 9996=>0, 11286=>0, 11287=>5,
          11288=>0, 11289=>0, 11293=>0, 11294=>0, 11295=>0, 11296=>0,
          11297=>0, 11298=>3, 11299=>0, 11300=>0, 11301=>0, 11302=>9,
          11303=>54, 11304=>11, 11305=>3, 11306=>9, 171590=>0, 171591=>0,
          171592=>0, 171593=>0, 171594=>0);

# Accession number => alias symbols accepted by the T7 check.
my %asym=(1=>['A1B', 'ABG', 'GAB', 'HYST2477', 'DKFZp686F0970'],
          2=>['FWP007','S863-7','DKFZp779B086'],
          4=>['A12M1'],
          5=>['A12M2'],
          6=>['A12M3'],
          7=>['A12M4'],
          9=>['AAC1'],
          10=>['AAC2'],
          11=>['NATP'],
          12=>['ACT','AACT','MGC88254'],
          13=>['DAC'],
          15=>['SNAT','AA-NAT'],
          14=>[''],
          11287=>['A1m','A2m','MAM'],
          11298=>['Nat4','SNAT','Nat-2'],
          11302=>['AATYK','mKIAA0641'],
          11303=>['Abc1'],
          11304=>['RmP','Abcr','Abc10','D430003I15Rik'],
          11305=>['Abc2','mKIAA1062','D2H0S1474E'],
          11306=>['Abc7'],
          171590=>['Y74C9A.3','CELK05052'],
          171591=>['Y74C9A.2','CELK01753'],
          171592=>['Y74C9A.4a','Y74C9A.4b','CELK08126'],
          171593=>['Y74C9A.5','CELK09643'],
          171594=>['Y48G1C.4','CELK05819']);

# Accession numbers every record is expected to fall into.
my @ids=qw(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 9996 11286 11287 11288 11289
           11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303
           11304 11305 11306 171590 171591 171592 171593 171594);

# Expected loop counts, one row per record in the order the records are read
# (row index == $num_of_seqs at the time the record is fetched).  Inside the
# main loop the row is copied to @lc and consumed left to right by the
# "correct number of loops" checks via 'shift @lc'.
my @loop_counts = ([1,1,1,1,5,1,12,1,1,1,14,8,16],
                   [1,1,1,1,3,1,40,1,1,1,14,31],
                   [1,1,1,1,0,1,4,1,10,5],
                   [1,1,0,1,1,1,1,7,0],
                   [1,1,0,1,1,1,1,7,0],
                   [1,1,0,1,1,1,1,7,0],
                   [1,1,0,1,1,1,1,7,0],
                   [1,0,1,1,0,1,4,1,1,1,13,0],
                   [1,1,1,1,1,1,33,1,1,1,14,41],
                   [1,1,1,1,1,1,51,1,1,1,14,51],
                   [1,0,1,1,1,1,1,1,10,1],
                   [1,1,1,1,3,1,28,1,1,1,14,33],
                   [1,1,1,1,1,1,17,1,1,1,14,10],
                   [1,1,1,1,0,1,11,1,1,1,13,20],
                   [1,1,1,1,2,1,16,1,1,1,14,23],
                   [1,0,0,0,0,0,0,2,0],
                   [1,0,0,0,0,0,0,3,0],
                   [1,1,1,1,3,1,10,1,13,10],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,2,0],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,2,0],
                   [1,0,0,0,0,0,0,2,0],
                   [1,1,1,1,3,1,9,1,13,5,16],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,3,0],
                   [1,0,0,0,0,0,0,2,0],
                   [1,1,1,1,2,1,10,1,12,19],
                   [1,1,1,1,1,1,50,1,13,14],
                   [1,1,1,1,4,1,9,1,13,12],
                   [1,1,1,1,3,1,9,1,13,8,8],
                   [1,1,1,1,1,1,11,1,13,12],
                   [1,1,0,1,2,0,0,1,1,1,1,9,4],
                   [1,1,0,1,2,0,0,1,1,1,1,9,4],
                   [1,1,0,1,3,0,0,1,1,1,1,9,8],
                   [1,1,0,1,2,0,0,1,1,1,1,9,4],
                   [1,1,0,1,2,0,0,1,1,1,1,9,4]);

my $fs='!';

# Annotation keys that have their own dedicated sections; T11 skips them.
my @revkeys=('Entrez Gene Status','RefSeq status','Official Full Name','chromosome','cyto','Reference','dblink',
             'ALIAS_SYMBOL','OntologyTerm','Index terms','Official Symbol','cM','Property');

ok my $eio=Bio::SeqIO->new(-file=>test_input_file('entrezgene.dat'),
                           -format=>'entrezgene',
                           -debug=>'on',-service_record=>'yes');

my ($seq,$struct,$uncapt);
my $num_of_seqs = 0;
while (1) {
    # next_seq is called in list context here and yields three values;
    # the loop ends when the first one (the sequence) is false.
    my $seq;
    ($seq,$struct,$uncapt)=$eio->next_seq;
    last unless ($seq);
    my @lc = @{$loop_counts[$num_of_seqs]};
    $num_of_seqs++;

    #T0: GENERAL TESTS
    ok $seq;
    is ref($struct),'Bio::Cluster::SequenceFamily';
    my $acc=$seq->accession_number;

    #T1: ORGANISM
    my $org=$seq->species->binomial;
    is grep(/\b$org\b/,@species),1;

    #T2: SUMMARY test
    ok $seq->desc if ($acc eq '1')||($acc eq '2')||($acc eq '11304');
    ok !defined $seq->desc if ($acc eq '171592')||($acc eq '11306'); #Are we supposed to have this in our test?
    ok grep(/\b$acc\b/,@ids);
    my $ann=$seq->annotation();
    my $tcount;

    #T3: ENTREZGENE STATUS TESTS
    my @egstatus=$ann->get_Annotations('Entrez Gene Status');
    my $loop_count = 0;
    foreach my $status (@egstatus) {
        $loop_count++;
        STATUS: {
            if ($acc==1) {is $status->value,'live'; last STATUS;}
            if ($acc==2) {is $status->value,'live'; last STATUS;}
            if ($acc==4) {is $status->value,'discontinued'; last STATUS;}
            if ($acc==6) {is $status->value,'discontinued'; last STATUS;}
            if ($acc==11288) {is $status->value,'secondary'; last STATUS;}
            if ($acc==11293) {is $status->value,'secondary'; last STATUS;}
            if ($acc==171594) {is $status->value,'live'; last STATUS;}
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T3";
    $loop_count = 0;

    #T4: REFSEQ STATUS TESTS
    my @refstatus=$ann->get_Annotations('RefSeq status');
    foreach my $status (@refstatus) {
        $loop_count++;
        STATUS: {
            if ($acc==1) {is $status->value,'REVIEWED'; last STATUS;}
            if ($acc==2) {is $status->value,'REVIEWED'; last STATUS;}
            if ($acc==3) {is $status->value,'PROVISIONAL'; last STATUS;}
            if ($acc==4) {is $status->value,'WITHDRAWN'; last STATUS;}
            if ($acc==9) {is $status->value,'VALIDATED'; last STATUS;}
            if ($acc==11300) {is $status->value,''; last STATUS;}
            if ($acc==11306) {is $status->value,'MODEL'; last STATUS;}
            if ($acc==11293) {is $status->value,'secondary'; last STATUS;}
            if ($acc==171594) {is $status->value,'Reviewed'; last STATUS;}
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T4";
    $loop_count = 0;

    #T5: GENE NAME TESTS
    my @ofname=$ann->get_Annotations('Official Full Name');
    foreach my $name (@ofname) {
        $loop_count++;
        STATUS: {
            if ($acc==10) {is $name->value,'N-acetyltransferase 2 (arylamine N-acetyltransferase)'; last STATUS;}
            if ($acc==13) {is $name->value,'arylacetamide deacetylase (esterase)'; last STATUS;}
            if ($acc==14) {is $name->value,'angio-associated, migratory cell protein'; last STATUS;}
            if ($acc==11287) {is $name->value,'pregnancy zone protein'; last STATUS;}
            if ($acc==11298) {is $name->value,'arylalkylamine N-acetyltransferase'; last STATUS;}
            if ($acc==11304) {is $name->value,'ATP-binding cassette, sub-family A (ABC1), member 4'; last STATUS;}
            if ($acc==11306) {is $name->value,'ATP-binding cassette, sub-family B (MDR/TAP), member 7'; last STATUS;}
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T5";
    $loop_count = 0;

    #T6: CHROMOSOME TESTS
    my @chr=$ann->get_Annotations('chromosome');
    foreach my $chr (@chr) {
        $loop_count++;
        STATUS: {
            if ($acc==5) {is $chr->value,1; last STATUS;}
            if ($acc==6) {is $chr->value,1; last STATUS;}
            if ($acc==7) {is $chr->value,17; last STATUS;}
            if ($acc==11306) {is $chr->value,'X'; last STATUS;}
            if ($acc==11304) {is $chr->value,3; last STATUS;}
            if ($acc==171590) {is $chr->value,'I'; last STATUS;}
            if ($acc==171592) {is $chr->value,'I'; last STATUS;}
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T6";
    $loop_count = 0;

    #T7: GENE SYMBOL ALIAS TESTS
    my @sym=$ann->get_Annotations('ALIAS_SYMBOL');
    foreach my $sym (@sym) {
        $loop_count++;
        my $val = $sym->display_text;
        # Empty aliases are counted by the loop counter but not asserted on.
        next if (($val eq '')||!defined($val));
        is grep(/\b$val\b/,@{$asym{$acc}}),1;
    }
    is $loop_count, shift @lc, "correct number of loops for T7";
    $loop_count = 0;

    #T8: CYTO LOCATION TESTS
    my @map=$ann->get_Annotations('cyto');
    foreach my $map (@map) {
        $loop_count++;
        STATUS: {
            if ($acc==10) {is $map->value,'8p22'; last STATUS;}
            if ($acc==11) {is $map->value,'8p22'; last STATUS;}
            if ($acc==13) {is $map->value,'3q21.3-q25.2'; last STATUS;}
            if ($acc==11306) {is $map->value,'X C-D'; last STATUS;}
            if ($acc==11305) {is $map->value,'2 A2-B'; last STATUS;}
            if ($acc==11304) {is $map->value,'3 G1'; last STATUS;}
            if ($acc==11303) {is $map->value,'4 A5-B3'; last STATUS;}
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T8";
    $loop_count = 0;

    #T9: REFERENCE NUMBER TEST
    my @refs=$ann->get_Annotations('Reference');
    my $refs=$#refs+1||0;
    is $pmed{$acc},$refs;
    my @dblinks=$ann->get_Annotations('dblink');
    my @keys=$ann->get_all_annotation_keys;

    #T10: GENERIF AND OTHER DBLINK TESTS
    my @url=qw(HGMD Ensembl KEGG Homologene);#Only validate the URL
    foreach my $dblink (@dblinks) {
        $loop_count++;
        my $dbname=$dblink->database||'';
        DB: {
            if ( $dbname eq 'generif') {#Should have ID and text
                ok $dblink->primary_id;
                ok $dblink->comment->text;
                last DB;
            }
            # Detailed per-database checks are only made for accession 2.
            if ($acc==2) {
                if (($dbname eq 'MIM')&&($dblink->authority)&&($dblink->authority eq 'phenotype')) {
                    ok $dblink->optional_id;
                    last DB;
                }
                if ($dbname eq 'Evidence viewer') {
                    ok $dblink->url; #We may even validate the urls?
                    is $dblink->primary_id,2;
                    last DB;
                }
                if ($dbname eq 'Model maker') {
                    ok $dblink->url; #We may even validate the urls?
                    is $dblink->primary_id,2;
                    last DB;
                }
                if ($dbname eq 'AceView') {
                    ok $dblink->url; #We may even validate the urls?
                    is $dblink->primary_id,2;
                    last DB;
                }
                # NOTE(review): $dbname can be '' (see the ||'' default
                # above); per perlop an empty pattern reuses the last
                # successfully matched regex.  Left unchanged so the planned
                # test count is not altered -- confirm whether intended.
                if (grep(/$dbname/,@url)) {
                    ok $dblink->url; #We may even validate the urls?
                    last DB;
                }
                if ($dbname eq 'GDB') {
                    is $dblink->primary_id,'GDB:119639'; #We may even validate the urls?
                    last DB;
                }
                if ($dbname eq 'UniGene') {
                    ok $dblink->url; #We may even validate the urls?
                    is $dblink->primary_id,'Hs.212838';
                    last DB;
                }
                if ($dbname eq 'PharmGKB') {
                    is $dblink->primary_id,'PA24357';
                    last DB;
                }
                if ($dbname eq 'MGC') {
                    ok $dblink->url; #We may even validate the urls?
                    is $dblink->primary_id,'BC040071';
                    last DB;
                }
            }
        }
    }
    is $loop_count, shift @lc, "correct number of loops for T10";
    $loop_count = 0;

    #T11: SOME EXTERNAL DATABASE IDS TESTS
    foreach my $key (@keys) {
        $loop_count++;
        # Keys handled by the dedicated sections are counted but skipped
        # (so they consume no T11a entry from @lc).
        next if grep(/\b$key\b/, @revkeys);
        my @all=$ann->get_Annotations($key);
        #Checking xref to some databases- OMIM, Wormbase and HGNC, others later
        my $loop_count_internal = 0;
        foreach my $pid (@all) {
            $loop_count_internal++;
            DBID: {
                if (($acc==8)&&($key eq 'MIM')) {is $pid->value,'108985'; last DBID;}
                if (($acc==9)&&($key eq 'HGNC')) {is $pid->value,'7645'; last DBID;}
                if (($acc==11298)&&($key eq 'MGI')) {is $pid->value,'1328365'; last DBID;}
                if (($acc==171593)&&($key eq 'AceView/WormGenes')) {is $pid->value,'1A502'; last DBID;}
                if (($acc==171594)&&($key eq 'WormBase')) {is $pid->value,'Y48G1C.4'; last DBID;}
            }
        }
        is $loop_count_internal, shift @lc, "correct number of loops for T11a";
    }
    is $loop_count, shift @lc, "correct number of loops for T11";
    $loop_count = 0;

    #T12: REFERENCE RECORD TEST
    if ($acc==1) {
        foreach my $ref (@refs) {
            $loop_count++;
            my $pmed=$ref->medline;
            is grep(/\b$pmed\b/,@pubmed),1;
        }
        is $loop_count, shift @lc, "correct number of loops for T12";
        $loop_count = 0;
    }

    #T13/14: STS Markers and Gene Ontology
    my @syn=('MGI:707739','MPC786');
    my @evid=qw(IEA TAS ISS);
    my (%pmeds,%go);
    $go{11305}=['5524', '16887', '5215', '8203', '6810', '16021' ,'5765'];
    $go{11298}=['8080', '8415', '4060', '16740'];
    $pmeds{11305}=['12466851'];
    my @types=qw(Function Component Process);
    if (($acc==11305)||($acc==11298)) { #Let's check just this two...
        foreach my $ot ($ann->get_Annotations('OntologyTerm')) {
            $loop_count++;
            if (($ot->term->authority)&&($ot->term->authority eq 'STS marker')) {
                if ($acc==11305) {
                    is $ot->name,'AI413825';
                    is $ot->term->namespace,'UniSTS';
                    is $ot->identifier,158928;
                }
                else {
                    is $ot->name,'D11Mit102';
                    is $ot->term->namespace,'UniSTS';
                    is $ot->identifier,126289;
                    foreach my $syn ($ot->get_synonyms) {
                        is grep(/\b$syn\b/,@syn),1;
                    }
                }
                next;
            }
            my $evid=$ot->comment;
            $evid=~s/evidence: //i;
            my $type=$ot->ontology->name;
            my @ref=$ot->term->get_references;
            my $id=$ot->identifier;
            # Declared unconditionally: 'my ... if COND' has undefined
            # behaviour (perlsyn).  The value is still undef when the term
            # carries no references.
            my $thispmed = @ref ? $ref[0]->medline : undef;
            is grep(/\b$type\b/,@types),1;
            is grep(/\b$id\b/,@{$go{$acc}}),1;
            is grep(/\b$thispmed\b/,@{$pmeds{$acc}}),1 if ($thispmed);
            ok $ot->name;
        }
        is $loop_count, shift @lc, "correct number of loops for T13/14";
        $loop_count = 0;
    }

    #T15/16/17: GENOMIC LOCATION TESTS/SEQUENCE TYPES TESTS/CONSERVED DOMAINS TESTS
    # NOTE(review): GFF is normally tab-delimited; these literals are
    # reproduced exactly as found (space-separated) -- confirm the separators
    # against the gff_string output, since a mismatch silently skips ok(1).
    my @gffs=('SEQ entrezgene gene location 63548355 63556668 . + .',
              'SEQ entrezgene genestructure 63548355 63556668 . + .',
              'SEQ entrezgene gene location 31124733 31133046 . + .',
              'SEQ entrezgene genestructure 31124733 31133046 . + .',
              'SEQ entrezgene gene location 8163589 8172398 . + .',
              'SEQ entrezgene genestructure 8163589 8172398 . + .');
    my @contigs=$struct->get_members;
    my @auth=('mrna','genomic','product','mrna sequence','protein','peptide');#Known types....
    foreach my $contig (@contigs) {
        $loop_count++;
        my $stype=$contig->authority;
        is grep(/^$stype$/i,@auth),1;
        if ($acc==1) {#Do just 1?
            if (($contig->authority eq 'genomic')||($contig->authority eq 'Genomic')) {
                foreach my $sf ($contig->get_SeqFeatures) {
                    $sf->source_tag('entrezgene');
                    my $gff=$sf->gff_string;
                    $gff=~s/[\t\s]+$//g;
                    # A test is recorded only on a match; a non-matching
                    # feature produces no test at all (pass or fail).
                    foreach my $gffstr (@gffs) {
                        if ($gffstr eq $gff) {
                            ok(1);
                            last;
                        }
                    }
                }
            }
            if ($contig->authority eq 'Product') {
                is $contig->id,'NP_570602';
                is $contig->accession_number,21071030;
                foreach my $sf ($contig->get_SeqFeatures) {
                    foreach my $dblink ($sf->annotation->get_Annotations('dblink')) {
                        my $key=$dblink->{_anchor}?$dblink->{_anchor}:$dblink->optional_id;
                        my $db=$dblink->database;
                        next unless (($db =~/cdd/i)||($sf->primary_tag=~ /conserved/i));
                        my $desc;
                        if ($key =~ /:/) {
                            ($key,$desc)=split(/:/,$key);
                        }
                        $desc=~s/^\s+//;#THIS SHOULD GO IN entrezgene.pm!!!
                        is $desc,'IGc2; Immunoglobulin C-2 Type';
                        is $key,'smart00408';
                        is $sf->score,103;
                        is $db,'CDD';
                        is $sf->start,223;
                        is $sf->end,282;
                    }
                }
            }
        }
    }
    # Lower bound only: the member count may exceed the recorded value.
    cmp_ok( $loop_count,'>=', shift @lc, "correct number of loops for T15");
    $loop_count = 0;
}

is $num_of_seqs, 39, 'looped through correct number of sequences';

#, -locuslink=>'convert');

#See if we can convert to locuslink
#T18: BACKCOMPATIBILITY TESTS
my @llsp =('OFFICIAL_GENE_NAME','CHR','MAP','OFFICIAL_SYMBOL');
ok my $eio_b=Bio::SeqIO->new(-file=>test_input_file('entrezgene.dat'),-format=>'entrezgene',
                             -debug=>'on',-service_record=>'yes',-locuslink=>'convert');
my $loop_count = 0;
while (my $seq=$eio_b->next_seq) {
    $loop_count++;
    ok $seq;
    my $acc=$seq->accession_number;
    is grep(/\b$acc\b/,@ids),1;
    my $ann=$seq->annotation;
    last if ($acc==4);#3 is enough? and 4 does not have gene name, so....
    foreach my $key (@llsp) {
        my @vals=$ann->get_Annotations($key);
        ok @vals;
    }
}
is $loop_count, 4, "correct number of loops for T18";