# $Id: Parser.pm,v 1.15 2006/04/20 22:48:23 cmungall Exp $ # # # see also - http://www.geneontology.org # - http://www.godatabase.org/dev # # You may distribute this module under the same terms as perl itself =head1 NAME GO::Parser - parses all GO file formats and types =head1 SYNOPSIS fetch L objects using a parser: # Scenario 1: Getting objects from a file use GO::Parser; my $parser = new GO::Parser({handler=>'obj',use_cache=>1}); $parser->parse("function.ontology"); # ontology $parser->parse("GO.defs"); # definitions $parser->parse("ec2go"); # external refs $parser->parse("gene-associations.sgd"); # gene assocs # get GO::Model::Graph object my $graph = $parser->handler->graph; # Scenario 2: Getting OBO XML from a file use GO::Parser; my $parser = new GO::Parser({handler=>'xml'}); $parser->handler->file("output.xml"); $parser->parse("gene_ontology.obo"); # Scenario 3: Using an XSL stylesheet to convert the OBO XML use GO::Parser; my $parser = new GO::Parser({handler=>'xml'}); # xslt files are kept in $ENV{GO_ROOT}/xml/xsl # (if $GO_ROOT is not set, defaults to install directory) $parser->xslt("oboxml_to_owl"); $parser->handler->file("output.owl-xml"); $parser->parse("gene_ontology.obo"); # Scenario 4: via scripts my $cmd = "go2xml gene_ontology.obo | xsltproc my-transform.xsl -"; my $fh = FileHandle->new("$cmd |") || die("problem initiating $cmd"); while(<$fh>) { print $_ } $fh->close || die("problem running $cmd"); =cut =head1 DESCRIPTION Module for parsing GO flat files; for examples of GO/OBO flatfile formats see: L L For a description of the various file formats, see: L L This module will generate XML events from a correctly formatted GO/OBO file =head1 SEE ALSO This module is a part of go-dev, see: L for more details =head1 PUBLIC METHODS =head2 new Title : new Usage : my $p = GO::Parser->new({format=>'obo_xml',handler=>'obj'}); $p->parse("go.obo-xml"); my $g = $p->handler->graph; Synonyms: Function: creates a parser object Example : Returns : GO::Parser 
Args : a hashref of arguments: format: a format for which a parser exists handler: a format for which a perl handler exists use_cache: (boolean) see caching below =head2 parse Title : parse Usage : $p->parse($file); Synonyms: Function: parses a file Example : Returns : Args : str filename =head2 handler Title : handler Usage : my $handler = $p->handler; Synonyms: Function: gets/sets a GO::Handler object Example : Returns : L Args : L =head1 FORMATS This module is a front end wrapper for a number of different GO/OBO formats - see the relevant module documentation below for details. The full list of parsers can be found in the go-perl/GO/Parsers/ directory =over =item obo_text Files with suffix ".obo" This is a new file format replacement for the existing GO flat file formats. It handles ontologies, definitions and xrefs (but not associations) =item go_ont Files with suffix ".ontology" These store the ontology DAGs =item go_def Files with suffix ".defs" =item go_xref External database references for GO terms Files with suffix "2go" (eg ec2go, metacyc2go) =item go_assoc Annotations of genes or gene products using GO Files with prefix "gene-association." =item obo_xml Files with suffix ".obo.xml" or ".obo-xml" This is the XML version of the OBO flat file format above See L =item obj_yaml A YAML dump of the perl L object. You need L from CPAN for this to work =item obj_storable A dump of the perl L object. You need L from CPAN for this to work. This is intended to cache objects on the filesystem, for fast access. The obj_storable representation may not be portable =head2 PARSING ARCHITECTURE Each parser fires XML B. The XML events are known as B. These XML events can be B by a handler written in perl, or they can be caught by an XML parser written in some other language, or by using XSL stylesheets. 
go-dev comes with a number of stylesheets in the go-dev/xml/xsl directory Anything that catches these XML events is known as a B go-perl comes with some standard perl XML handlers, in addition to some standard XSL stylesheets. These can be found in the B directory If you are interested in getting perl B from files then you will want the B handler, which gives back L objects The parsing architecture gives you the option of using the go-perl object model, or just parsing the XML events directly If you are using the go-db-perl library, the load-go-into-db.pl script will perform the following processes when loading files into the database =over =item Obo-XML events fired using GO::Parser::* classes =item Obo-XML transformed into godb xml using oboxml_to_godb_prestore.xsl =item godb_prestore.xml stored in database using generic loader =back =head2 Obo-XML The Obo-XML format DTD is stored in the go-dev/xml/dtd directory =head2 HOW IT WORKS Currently the various parsers and perl event handlers use the B module for this - see L for more details, or http://stag.sourceforge.net =head2 NESTED EVENTS nested events can be thought of as xml, without attributes; nested events can easily be turned into xml events have a start, a body and an end event handlers can *catch* these events and do something with them. an object handler can turn the events into objects, centred around the GO::Model::Graph object; see GO::Handlers::obj other handlers can catch the events and convert them into other formats, eg OWL or OBO Or you can bypass the handler and get output as an XML stream - to do this, just run the go2xml script a database loading event handler can catch the events and turn them into SQL statements, loading a MySQL or postgres database (see the go-db-perl library) the advantage of an event based parsing architecture is that it is easy to build lightweight parsers, and heavy weight object models can be bypassed if preferred. 
=head2 EXAMPLES To see examples of the events generated by the GO::Parser class, run the script go2xml; for example go2xml function.ontology on any GO-formatted flatfile This also works on OBO-formatted files: go2xml gene_ontology.obo You can also use the script "stag-parse.pl" which comes with the L distribution. for example stag-parse.pl -p GO::Parsers::go_assoc_parser gene-association.fb =head2 XSLT HANDLERS The full list can be found in the go-dev/xml/xsl directory =head2 PERL HANDLERS see GO::Handlers::* for all the different handlers possible; more can be added dynamically. you can either create the handler object yourself, and pass it as an argument, e.g. my $apph = new GO::AppHandle(-db=>"go"); my $handler = new GO::Handlers::godb({apph=>$apph}); my $parser = new GO::Parser({handler=>$handler}); $parser->parse(@files); or you can use one of the registered handlers: my $parser = new GO::Parser({handler=>'db', handler_args=>{apph=>$apph}}); or you can just do things from the command line go2fmt.pl -w oboxml function.ontology the registered perl handlers are as follows: =over =item obo_xml writes out OBO-XML (which is basically a straightforward conversion of the event stream into XML) =item obo_text =item go_ont legacy GO-ontology file format =item go_xref GO xref file, for linking GO terms to terms and dbxrefs in other ontologies =item go_defs legacy GO-definitions file format =item go_assoc GO association file format =item rdf GO XML-RDF file format =item owl OWL format (default: OWL-DL) OWL is a W3C standard format for ontologies You will need the XSL files from the full go-dev distribution to run this; see the XML section in L =item prolog prolog facts - you will need a prolog compiler/interpreter to use these. You can reason over these facts using Obol or the forthcoming Bio-LP project =item sxpr lisp style S-Expressions, conforming to the OBO-XML schema; you will need lisp to make full use of these. 
you can also do some nice stuff just within emacs (use lisp-mode and load an sxpr file into your buffer) =item godb this is actually part of the go-db-perl library, not the go-perl library catches events and loads them into a database conforming to the GO database schema; see the directory go-dev/sql, as part of the whole go-dev distribution; or www.godatabase.org/dev/database =item obj_yaml A YAML dump of the perl L object. You need L from CPAN for this to work =item obj_storable A dump of the perl L object. You need L from CPAN for this to work. This is intended to cache objects on the filesystem, for fast access. The obj_storable representation may not be portable =back =head1 EXAMPLES OF DATATYPE TEXT FORMATS =head2 go_ont format eg format: go_ont for storing graphs and metadata; for example: !version: $Revision: 1.15 $ !date: $Date: 2006/04/20 22:48:23 $ !editors: Michael Ashburner (FlyBase), Midori Harris (SGD), Judy Blake (MGD) $Gene_Ontology ; GO:0003673 $cellular_component ; GO:0005575 %extracellular ; GO:0005576 file z.ontology 1075164285 GO:0003673 Gene_Ontology root GO:0005575 cellular_component root GO:0003673 GO:0005576 extracellular root GO:0005575 GO:0005577 fibrinogen root part_of GO:0005576 GO:0005972 fibrinogen alpha chain root part_of GO:0005577 GO:0005973 fibrinogen beta chain root part_of GO:0005577 =head2 go_def format eg format: go_defs for storing definitions: !Gene Ontology definitions ! term: 'de novo' protein folding goid: GO:0006458 definition: Processes that assist the folding of a nascent peptide chain into its correct tertiary structure. 
=pod

  definition_reference: Sanger:mb

See GO::Parsers::go_def_parser for more details

=head2 go_xref format

eg format: go_xrefs for storing links between GO IDs and IDs for terms
in other DBs:

  EC:1.-.-.- > GO:oxidoreductase ; GO:0016491
  EC:1.1.-.- > GO:1-phenylethanol dehydrogenase ; GO:0018449

See GO::Parsers::go_xref_parser for more details

=head2 go_assoc format

eg format: go-assocs for storing gene-associations:

  SGD S0004660 AAC1 GO:0005743 SGD:12031|PMID:2167309 TAS C ADP/ATP translocator YMR056C gene taxon:4932 20010118
  SGD S0004660 AAC1 GO:0006854 SGD:12031|PMID:2167309 IDA P ADP/ATP translocator YMR056C gene taxon:4932 20010118

See GO::Parsers::go_assoc_parser for more details

=head2 obo_text format

L

=cut

package GO::Parser;

use strict;
use warnings;

use Exporter;
use Carp;
use GO::Model::Term;
use FileHandle;
use base qw(GO::Model::Root);

# Exceptions

# Constructor

=head2 new

  Usage   - my $parser = GO::Parser->new();
            my $parser = GO::Parser->new({format=>'obo_xml', handler=>'obj'});
            my $parser = GO::Parser->new(format=>'obo_xml', handler=>'obj');
  Returns - the format-specific parser object (see get_parser_impl)
  Args    - a hashref, or a flat key/value list, of options:

            format (or fmt)
                parser format. Lower-cased unless it contains '::'.
                When empty, 'unknown_format' is used.
            handler
                a handler object, a GO::AppHandle object (wrapped in a
                GO::Handlers::DbStoreHandler), or a handler name.
                A plain name NAME is resolved to GO::Handlers::NAME; a
                name containing '::' is used as the class name; a
                leading '::' is stripped first.
                Defaults to 'base'.
            handler_args
                passed to the handler constructor when the handler is
                given by name
            use_cache
                passed to the parser's use_cache method

creates a new parser

Every option is also copied onto the returned parser object. The
caller's option hash is not modified.

=cut

sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;

    # Scratch instance: only used to call get_parser_impl() and throw().
    # The object handed back to the caller is the parser implementation.
    my $self = bless {}, $class;

    # Accept a hashref or a flat key/value list.
    my $args = $_[0] || {};
    $args = {@_} unless ref $args;

    # Shallow copy, so that nothing done here leaks into the caller's hash.
    my %opt = %$args;

    my $fmt = $opt{format} || $opt{fmt} || '';
    $fmt = lc $fmt unless $fmt =~ /::/;

    # With no format given, use the parser that guesses/defers on what
    # type it is parsing.
    $fmt ||= 'unknown_format';

    my $p = $self->get_parser_impl($fmt);
    $p->{$_} = $opt{$_} for keys %opt;

    my $handler = $opt{handler} || 'base';

    # Only an object can be an application handle; the ref() guard keeps a
    # handler *name* from ever being mistaken for one.
    if (ref($handler) && UNIVERSAL::isa($handler, 'GO::AppHandle')) {
        require GO::Handlers::DbStoreHandler;
        $handler = GO::Handlers::DbStoreHandler->new({ apph => $handler });
    }

    unless (ref $handler) {
        my $hclass;
        if ($handler =~ /^::/) {
            # "::Some::Class" names a class outside GO::Handlers.
            # This test must come first: it was previously nested inside
            # the "contains no ::" branch and could never match.
            ($hclass = $handler) =~ s/^:://;
        }
        elsif ($handler =~ /::/) {
            $hclass = $handler;
        }
        else {
            $hclass = "GO::Handlers::$handler";
        }

        # Test the eval's own result rather than $@ afterwards.
        my $loaded = eval { $class->load_module($hclass); 1 };
        if (!$loaded) {
            print STDERR $@, "\n\n\n";
            $self->throw("No such handler: $handler");
        }
        $handler = $hclass->new($opt{handler_args});
    }

    $p->handler($handler);
    $p->use_cache($opt{use_cache});

    return $p;
}

# get_parser_impl
#
#   Usage   - my $p = $self->get_parser_impl($fmt);
#   Returns - a new object of the parser class for $fmt
#   Args    - format name [str]; a value containing '::' is taken as the
#             parser class name itself, anything else is mapped to
#             GO::Parsers::<fmt>_parser
#
# The class is loaded with load_module() before it is instantiated.
sub get_parser_impl {
    my ($self, $fmt) = @_;

    my $mod = $fmt =~ /::/ ? $fmt : "GO::Parsers::${fmt}_parser";
    $self->load_module($mod);

    return $mod->new();
}

=head2 create_handler

  Usage   - my $handler = GO::Parser->create_handler('obj');
  Returns - the handler object of a newly created parser
  Args    - handler type [str]; defaults to 'obj'

=cut

sub create_handler {
    my $self = shift;
    my $type = shift || 'obj';

    return $self->new({ handler => $type })->handler;
}

# load_module
#
#   Usage   - $self->load_module('GO::Handlers::obj');
#   Returns - nothing useful
#   Args    - class name [str]
#
# Loads the .pm file for the given class. Failures are reported through
# throw(), which is not defined in this file (presumably inherited from
# GO::Model::Root - confirm there how it raises the error).
sub load_module {
    my ($self, $classname) = @_;
    $classname = '' unless defined $classname;

    (my $file = "$classname.pm") =~ s{::}{/}g;

    # Skip the require if the main symbol table already has a "_<FILE"
    # entry for this file (see perldebguts).
    return if $main::{"_<$file"};

    my $loaded = eval { require $file; 1 };
    if (!$loaded) {
        $self->throw("No such module: $classname;;\n$@");
    }
    return;
}

1;