#!/usr/bin/perl use strict; use warnings; use diagnostics; use Test; BEGIN { plan tests => 7991 }; # File : GO-TermFinder.t # Author : Gavin Sherlock # Date Begun : September 1st 2003 # $Id: GO-TermFinder-Obo.t,v 1.2 2007/11/15 18:34:39 sherlock Exp $ # This file forms a set of tests for the GO::TermFinder class use GO::TermFinder; use GO::AnnotationProvider::AnnotationParser; use GO::OntologyProvider::OboParser; $|=1; # turn off warnings from the TermFinder $GO::TermFinder::WARNINGS = 0; my $ontologyFile = "t/gene_ontology_edit.obo"; my $annotationFile = "t/gene_association.sgd"; my $aspect = "P"; # we'll check that all the public methods still exist in the interface my @methods = qw (findTerms); my $ontology = GO::OntologyProvider::OboParser->new(ontologyFile => $ontologyFile, aspect => $aspect); my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile); my $termFinder = GO::TermFinder->new(annotationProvider=> $annotation, ontologyProvider => $ontology, aspect => $aspect); ok($termFinder->isa("GO::TermFinder")); # check the object returns a code reference when asked it it can do a # method that should exist foreach my $method (@methods){ ok(ref($termFinder->can($method)), "CODE", "TermFinder can $method"); } # now check that the findTerms method actually returns the correct # answers for a selected list of genes (actually the methionine # cluster from Spellman et al, 1998). my @pvalues = $termFinder->findTerms(genes=>[qw(YPL250C MET11 MXR1 MET17 SAM3 MET28 STR3 MMP1 MET1 YIL074C MHT1 MET14 MET16 MET3 MET10 ECM17 MET2 MUP1 MET6)]); &testHypotheses(@pvalues); # now let's run exactly the same test again, but with a different # casing of the genes my @newpvalues = $termFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)]); # and compare that the stuff returned looks exactly the same &compareHypotheses(\@pvalues, \@newpvalues, 1); # now let's test the functionality of using a defined population # to create the background distribution. If we simply say that # the defined population is every gene from the annotation parser # then we should get the same result my $newTermFinder = GO::TermFinder->new(annotationProvider=> $annotation, ontologyProvider => $ontology, population => [$annotation->allDatabaseIds], aspect => $aspect); my @poppvalues = $newTermFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)]); # again, check that the stuff returned looks exactly the same &compareHypotheses(\@pvalues, \@poppvalues, 1); # now try using a TermFinder with a limited population of just a few genes. # All of the returned nodes should have a probability of 1 my $nextTermFinder = GO::TermFinder->new(annotationProvider=> $annotation, ontologyProvider => $ontology, population => [qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)], aspect => $aspect); @pvalues = $nextTermFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)]); foreach my $pvalue (@pvalues){ # round the pvalue to 2 decimal places, to avoid precision problems my $val = sprintf("%.2f", $pvalue->{PVALUE}); ok($val, "1.00"); } # Now we need a test to see what happens when we create a term finder, # and find terms for three 'nonsense' genes, to check that it doesn't # cause a problem - we'll defined the population as being of a size # of 3 more than exist in the annotation file, to accommodate these # 3 extra nonsense genes my @genes = qw (foo bar baz); my $numGenes = scalar(@genes); my $populationSize = $annotation->numAnnotatedGenes + $numGenes; my $nonsenseTester = GO::TermFinder->new(annotationProvider=> $annotation, ontologyProvider => $ontology, aspect => $aspect, totalNumGenes => $populationSize); @pvalues = $nonsenseTester->findTerms(genes=>\@genes); # grab best node, which should be the 'unannotated' node my $hypothesis = shift(@pvalues); # check attributes ok($hypothesis->{NODE}->term, "unannotated"); ok($hypothesis->{NODE}->goid, "GO:XXXXXXX"); # all our tested genes should be annotated to the node ok($hypothesis->{NUM_ANNOTATIONS}, $numGenes); # the total number of annotations to this node out of all genes should # be the total number of genes minus those which have an annotation to # this aspect. ok($hypothesis->{TOTAL_NUM_ANNOTATIONS}, ($populationSize - $annotation->numAnnotatedGenes($aspect))); # all of the above tests have been using the default setting for # multiple hypothesis correction, which should default to bonferroni. # We now need to test that using bonferroni as the supplied argument # gives the same answers as no argument, and also check that it gives # the expected correction factor, and run the termfinder with the # 'simulation' argument, and with the 'none' argument. # first try with an explicit bonferroni argument my @bonferroni = $termFinder->findTerms(genes => [qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)], correction => 'bonferroni'); # and compare the results to previously generated pvalues &compareHypotheses(\@newpvalues, \@bonferroni, 1); # we can also check that the correction value was correct - it should # be the same as the number of hypotheses we got back; also, in this # case, we know that should be 37. Can only test if the corrected # p_value is less than 1, as we have a ceiling placed on it at 1. ok(scalar(@newpvalues), 37); foreach my $hypothesis (@newpvalues){ if ($hypothesis->{CORRECTED_PVALUE} < 1){ ok ($hypothesis->{CORRECTED_PVALUE}/$hypothesis->{PVALUE}, scalar(@newpvalues)); } } # now let's test the termFinder when we ask for no correction - we # should get identical results as we got above, except there are no # corrected p-values. my @noCorrection = $termFinder->findTerms(genes => [qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)], correction => 'none'); &compareHypotheses(\@newpvalues, \@noCorrection, 0); # as our final test of multiple hypothesis correction, we want to # see if the simulation method works correctly my @simulation = $termFinder->findTerms(genes => [qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)], correction => 'simulation'); # and compare the results to previously generated pvalues, but ignore # the corrected pvalues &compareHypotheses(\@newpvalues, \@simulation, 0); # not sure what tests we'll do for the FDR calculations, but we should # at least make sure that they don't throw an error when generated, # and that the pvalues are the same: my @fdr = $termFinder->findTerms(genes => [qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6)], calculateFDR => 1); &compareHypotheses(\@fdr, \@bonferroni, 1); # now let's test that if we say that we're looking for significant # terms when we simply have a list of all genes, that we get none - # indeed the uncorrected p-values should all be equal to 1. my @nonsignificant = $termFinder->findTerms(genes=>[$annotation->allDatabaseIds]); foreach my $hypothesis (@nonsignificant){ ok($hypothesis->{PVALUE}, 1); } # now we want to test what happens when we use a TermFinder with a # defined population, and ask if to findTerms for a list of genes, # some of which are not in the population. # above, we generated @poppvalues, which were the pvalues generated # with a list of genes, with all databaseIds as the background. Now # we will generate some new pvalues with that same TermFinder object, # but add in a few bogus genes at the end. The bogus genes should be # ignored, and we should get exactly the same result. my @poppvalues2 = $newTermFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6 BLAH BLAH2 XXXZZZ CDCDCDC)]); my @discardedGenes = $newTermFinder->discardedGenes; # 4 genes should have been discarded ok(scalar(@discardedGenes), 4); # now check that the nodes and pvalues returned look exactly the same # as we saw before, when there were no genes to be discarded &compareHypotheses(\@poppvalues, \@poppvalues2, 1); # also need to test that the genes are correctly discarded when we # are doing a correction or calculating the FDR my @poppvalues3 = $newTermFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6 BLAH BLAH2 XXXZZZ CDCDCDC)], calculateFDR => 1); my @discardedGenes2 = $newTermFinder->discardedGenes; # 4 genes should have been discarded ok(scalar(@discardedGenes), 4); my @poppvalues4 = $newTermFinder->findTerms(genes=>[qw(ypl250c Met11 mxr1 Met17 SAM3 met28 Str3 MMp1 mET1 YIl074c Mht1 mEt14 Met16 Met3 mET10 ecm17 Met2 MuP1 MeT6 BLAH BLAH2 XXXZZZ CDCDCDC)], correction => 'simulation'); my @discardedGenes3 = $newTermFinder->discardedGenes; # 4 genes should have been discarded ok(scalar(@discardedGenes), 4); # now let's test that if we have a background population defined, and # that if none of the genes provided to find terms for are in the # background, that a fatal error is thrown eval { $newTermFinder->findTerms(genes=>[qw(BLAH BLAH2 XXXZZZ CDCDCDC)]); }; ok($@, qr/None of the genes provided for analysis are found in the background population/, "should die if genes not in background"); # now some tests that check that we have GO::TermFinder working # correctly with respect to the aspect node - in this case, test the # biological_process node, using a bunch of genes that are all # annotated directly to this node. Note, this is to accommodate the # changed behaviour, required by the change Ontologies, where they # eliminated the unannotated nodes my @unannotatedGenes = qw(YPR108W-A YPR109W YPR114W YPR115W YPR116W YPR117W YPR127W YPR145C-A YPR147C YPR148C YPR153W YPR157W YPR158W YPR159C-A YPR172W YPR174C YPR196W YPR202W YPR203W YPR204W); my @unannotatedListPValues = $newTermFinder->findTerms(genes=>\@unannotatedGenes); ok(scalar(@unannotatedListPValues), 1, "unannotated genes return a single term."); my $topHypothesis = $unannotatedListPValues[0]; ok($topHypothesis->{NODE}->goid, "GO:0008150"); ok($topHypothesis->{NODE}->term, "biological_process"); ok($topHypothesis->{NUM_ANNOTATIONS}, scalar(@unannotatedGenes), "all in list should be in this node"); ok($topHypothesis->{TOTAL_NUM_ANNOTATIONS}, 1505, "total num directly annotated to biological_process, hand checked"); # now add a single annotated gene, and make sure that we still get the # same number of total annotatations @unannotatedListPValues = $newTermFinder->findTerms(genes=>[(@unannotatedGenes, "CLB2", "CDC28")]); ok(scalar(@unannotatedListPValues), 15, "many terms are now tested."); $topHypothesis = $unannotatedListPValues[0]; ok($topHypothesis->{NODE}->goid, "GO:0008150"); ok($topHypothesis->{NODE}->term, "biological_process"); ok($topHypothesis->{NUM_ANNOTATIONS}, scalar(@unannotatedGenes), "should still be the same size as the list"); ok($topHypothesis->{TOTAL_NUM_ANNOTATIONS}, 1505, "total num directly annotated to biological_process, hand checked"); ###################################################################################### sub testHypotheses{ ###################################################################################### # the following are what should be the 11 most significant goids for # the test set of genes using this frozen dataset. Note if two nodes # result in the same p-value, they will be sorted by GOID, using a # text sort. my @pvalues = @_; my @topGoids = ("GO:0006790", "GO:0000096", "GO:0006555", "GO:0000097", "GO:0006520", "GO:0006519", "GO:0009066", "GO:0009308", "GO:0006807", "GO:0044272", "GO:0000103"); # now check that these are returned by the TermFinder for (my $i = 0; $i< @topGoids; $i++){ ok($pvalues[$i]->{NODE}->goid, $topGoids[$i], "$topGoids[$i] is ${i}th in the list of top hypotheses"); } return; } ###################################################################################### sub compareHypotheses{ ###################################################################################### # This subroutine expects to receive two arrays (by reference) of # hypotheses generated by GO::TermFinder. It will check whether they # are identical. An third arguments indicates if corrected p-values # should be compared. my ($ref1, $ref2, $shouldCompareCorrectedPValues) = @_; for (my $i = 0; $i < @{$ref1}; $i++){ ok($ref1->[$i]->{PVALUE}, $ref2->[$i]->{PVALUE}, $i."th p-value"); # Sometimes we don't want to compare the corrected p-values, # as a different method of multiple hypothesis correction may # have been used between two different runs if ($shouldCompareCorrectedPValues){ ok($ref1->[$i]->{CORRECTED_PVALUE}, $ref2->[$i]->{CORRECTED_PVALUE}, $i."th corrected p-value"); } ok($ref1->[$i]->{NUM_ANNOTATIONS}, $ref2->[$i]->{NUM_ANNOTATIONS}, $i."th NUM_ANNOTATIONS"); ok($ref1->[$i]->{TOTAL_NUM_ANNOTATIONS}, $ref2->[$i]->{TOTAL_NUM_ANNOTATIONS}, $i."th TOTAL_NUM_ANNOTATIONS"); ok($ref1->[$i]->{NODE}->goid, $ref2->[$i]->{NODE}->goid, $i."th GOID"); ok($ref1->[$i]->{NODE}->term, $ref2->[$i]->{NODE}->term, $i."th TERM"); # now check the genes # same number ok(scalar keys (%{$ref1->[$i]->{ANNOTATED_GENES}}), scalar keys (%{$ref2->[$i]->{ANNOTATED_GENES}}), $i."th number of annotated genes"); foreach my $gene (keys (%{$ref1->[$i]->{ANNOTATED_GENES}})){ # each one exists ok (exists $ref2->[$i]->{ANNOTATED_GENES}{$gene}, 1, $i."th ANNOTATED_GENE exists"); # and has the same name - note has to be done case-insensitively ok(uc($ref1->[$i]->{ANNOTATED_GENES}{$gene}), uc($ref2->[$i]->{ANNOTATED_GENES}{$gene}), $i."th ANNOTATED_GENE name"); } } return; } =head1 Modifications List them here. CVS information: # $Author: sherlock $ # $Date: 2007/11/15 18:34:39 $ # $Log: GO-TermFinder-Obo.t,v $ # Revision 1.2 2007/11/15 18:34:39 sherlock # Tweaked test to be regex instead of equality, as different Perls might # add different things to exceptions. This fixes a failure I was seeing # on Solaris. # # Revision 1.1 2007/03/18 01:33:14 sherlock # Adding new test files # # Revision 1.10 2006/07/28 00:02:46 sherlock # added new tests to make sure discarded genes are not lost when # calculating FDR or running simulations. # # Revision 1.9 2006/07/23 21:27:19 sherlock # forgot to turn warnings back off # # Revision 1.8 2006/07/23 00:41:40 sherlock # uncommented tests that were previously commented out for performance # reasons, as they seem to run fine. Also, added test for discarded # genes when using a population. # # Revision 1.7 2004/10/14 22:32:04 ihab # Removed tests of the internal P value computation; these are now in # GO-TermFinder-Native.t and are testing the C++ version. # # Revision 1.6 2004/05/06 01:58:27 sherlock # Added in tests to check that simulation, bonferroni and FDR options # work correctly. # # Revision 1.5 2003/12/11 19:47:35 sherlock # added in some tests to check that TermFinder behaves correctly in the # case that unrecognized gene names are passed in, which was not working # properly previously. # # Revision 1.4 2003/12/03 02:30:25 sherlock # added in a bunch of tests to be more precise in the testing of the # term finder, and to test the functionality of providing a population # of genes from which to calculate the background distribution # # Revision 1.3 2003/11/22 00:09:12 sherlock # added some new tests to see that differently cased versions of the # gene names still give the same result. # =cut