# $Id: obo_text.pm,v 1.16 2008/01/22 23:54:45 cmungall Exp $ # # This GO module is maintained by Chris Mungall # # see also - http://www.geneontology.org # - http://www.godatabase.org/dev # # You may distribute this module under the same terms as perl itself =head1 NAME GO::Handlers::obo_text - =head1 SYNOPSIS use GO::Handlers::obo_text =cut =head1 DESCRIPTION transforms OBO XML events into OBO Text L =head1 PUBLIC METHODS - =cut # makes objects from parser events package GO::Handlers::obo_text; use Data::Stag qw(:all); use GO::Parsers::ParserEventNames; use base qw(GO::Handlers::base); use strict qw(vars refs); sub s_obo { my $self = shift; #$self->SUPER::s_obo(@_); return; } sub e_header { my $self = shift; my $hdr = shift; my $fmt = stag_get($hdr,'format-version'); $self->tag("format-version"=> (stag_sget($hdr,'format-version') || '1.2')); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); $self->tag('date'=>sprintf("%02d:%02d:%04d %02d:%02d", $mday,$mon+1,$year+1900,$hour,$min)); $self->tag('autogenerated-by'=>$0); foreach (stag_tnodes($hdr)) { $self->tag(stag_name($_), _obo_escape($_->data)); } my @sts = stag_get($hdr,'synonymtypedef'); foreach (@sts) { my $scope = stag_sget($_,'scope'); $self->tag(synonymtypedef => sprintf("%s \"%s\" %s", stag_sget($_,ID), stag_sget($_,NAME) || '', ($scope ? uc($scope) : ''))); } my @ssdefs = stag_get($hdr,'subsetdef'); foreach (@ssdefs) { $self->tag(subsetdef => sprintf("%s \"%s\"", stag_sget($_,ID), stag_sget($_,NAME))); } $self->{__emitted_header} = 1; $self->print("\n"); return; } sub e_typedef { my $self = shift; my $t = shift; $self->stanza('Typedef', $t); } sub e_term { my $self = shift; my $t = shift; if (!$self->{__emitted_header}) { $self->e_header(stag_new(HEADER,[])); } $self->stanza('Term', $t); } sub e_annotation { my $self = shift; my $t = shift; if (!$self->{__emitted_header}) { $self->e_header(stag_new(HEADER,[])); } $self->stanza('Annotation', $t); } sub e_instance { my $self = shift; my $t = shift; if (!$self->{__emitted_header}) { $self->e_header(stag_new(HEADER,[])); } $self->stanza('Instance', $t); } sub stanza { my $self = shift; my $stanza = shift; my $t = shift; $self->print("[$stanza]\n"); my @BOOLEAN_TAGS = ( IS_ANONYMOUS, IS_OBSOLETE, IS_CYCLIC, IS_TRANSITIVE, IS_SYMMETRIC, IS_ANTI_SYMMETRIC, IS_REFLEXIVE, IS_METADATA_TAG, ); my @TAGS = (ID, NAME, ALT_ID, NAMESPACE, DEF, COMMENT, SUBSET, IS_A , RELATIONSHIP, UNION_OF, INTERSECTION_OF, SYNONYM, PROPERTY_VALUE, XREF_ANALOG, XREF_UNKNOWN, 'object', @BOOLEAN_TAGS, ); my %IS_BOOLEAN = map { ($_=>1) } @BOOLEAN_TAGS; my @IGNORE = qw(is_root); foreach my $tag (@IGNORE) { stag_unset($t, $tag); } foreach my $tag (@TAGS) { my @vals = stag_get($t, $tag); next unless @vals; if ($tag eq DEF) { my $def = shift @vals; my $defstr = $def->get_defstr; my $qn = stag_sget($t, "$tag/@"); $self->tag(def => _obo_escape($defstr), [$def->get_dbxref], $qn); } elsif ($tag eq RELATIONSHIP) { $self->tag(relationship => sprintf("%s %s", $_->sget_type, $_->sget_to), undef, $_->sget('@')) foreach @vals; } elsif ($tag eq INTERSECTION_OF) { $self->tag(intersection_of => sprintf("%s %s", $_->sget_type, $_->sget_to), undef, $_->sget('@')) foreach @vals; } elsif ($tag eq UNION_OF) { $self->tag(union_of => sprintf("%s %s", $_->sget_type, $_->sget_to), undef, $_->sget('@')) foreach @vals; } elsif ($tag eq SYNONYM) { foreach my $syn (@vals) { my $type = $syn->sget('@/synonym_type'); my $scope = $syn->sget('@/scope'); my @vals = (quote($syn->sget_synonym_text)); push(@vals,uc($scope)) if $scope; push(@vals,$type) if $type; $self->tag($tag, join(' ',@vals), [$syn->get_dbxref]); } } elsif ($tag eq XREF_ANALOG) { $self->tag('xref', dbxref($_),undef,$_->sget('@')) foreach @vals; } elsif ($tag eq PROPERTY_VALUE) { foreach (@vals) { my $dt = $_->sget_datatype; if ($dt) { $self->tag('property_value' => sprintf("%s %s %s", $_->sget_type, quote($_->sget_value), $dt)); } else { $self->tag('property_value' => sprintf("%s %s", $_->sget_type, $_->sget_to)); } } } elsif ($tag eq 'object') { # experimental: obof1.3 $self->tag('object' => $self->obo_id(@vals)); } elsif ($IS_BOOLEAN{$tag}) { $self->tag($tag, $vals[0] ? "true" : "false"); } else { foreach (@vals) { if (ref($_)) { $self->tag($tag, $_->sget('.'),undef,$_->sget('@')) } else { $self->tag($tag, _obo_escape($_)); } } } stag_unset($t, $tag); } my @tnodes = stag_tnodes($t); $self->tag($_->name, _obo_escape($_->data)) foreach @tnodes; my @ntnodes = stag_ntnodes($t); if (@ntnodes) { print STDERR $_->xml foreach @ntnodes; $self->throw( "unknown elements"); } $self->print("\n"); } sub obo_id { my $self = shift; my $v = shift; if (ref($v)) { my $isect = $v->sget_intersection; if ($isect) { my @links = $isect->get_link; my @genus = grep {!$_->get_type} @links; my @diffs = grep {$_->get_type} @links; my $s = join('^', (map {$self->obo_id($_->sget_to)} @genus), (map { sprintf("%s(%s)",$_->sget_type,$self->obo_id($_->sget_to)) } @diffs)); return $s; } else { } } else { return $v; } } sub tag { my $self = shift; my ($t, $v, $xrefsr, $qualsr) = @_; my @xrefs = @{$xrefsr || []}; return unless defined $v; if ($t eq DEF) { $v=quote($v); } my $xrefl = ''; if ($xrefsr) { $xrefl = ' ['.join(', ', map { dbxref($_); } @xrefs).']'; } my $ql = ''; if ($qualsr) { my %qh = stag_pairs($qualsr); $ql = ' {'.join( ', ', map { "$_=".quote($qh{$_}) } keys %qh ).'}'; } $self->printf("%s: %s$xrefl$ql\n", $t, $v); return; } sub _obo_escape { my $s=shift; $s =~ s/\\/\\\\/; $s =~ s/([\{\}])/\\$1/g; $s; } sub dbxref { my $x = shift; if (ref($x)) { my $xref = $x->sget_dbname . ':' . $x->sget_acc; my $name = $x->sget_name; if (defined($name)) { $name =~ s/\"/\\\"/g; $xref." \"$name\""; } else { $xref; } } else { $x; } } sub safe { my $word = shift; $word =~ s/ /_/g; $word =~ s/\-/_/g; $word =~ s/\'/prime/g; $word =~ tr/a-zA-Z0-9_//cd; $word =~ s/^([0-9])/_$1/; $word; } sub quote { my $word = shift; #$word =~ s/,/\\,/g; ## no longer required $word =~ s/\"/\\\"/g; "\"$word\""; } # -- EXPERIMENTAL CODE -- # obo format for gene_assocs # we are hardcoding aspects here; this is OK, only for # gene_assoc file which is GO specific our %ASPECT_IDX = (F => 'has_activity', P => 'involved_in', C => 'localised_to' ); sub e_prod { my $self = shift; my $prod = shift; my $proddb = $self->up_to('dbset')->get_proddb; my $acc = $prod->get_prodacc; my $id = "$proddb:$acc"; my $type = $prod->get_prodtype || 'gene_product'; $self->print("!! ***************************** \n"); $self->print("!! Gene Product: $id \n"); $self->print("!! ***************************** \n"); $self->print("[$type]\n"); $self->tag(id=>$id); $self->tag(dbname=>$proddb); $self->tag(acc=>$acc); $self->tag(symbol=>$prod->sget_prodsymbol); $self->tag(name=>$prod->sget_prodname); $self->tag(synonym=>$_) foreach $prod->sget_prodsyn; $self->tag(has_taxon=>'NCBI:'.$prod->sget_prodtaxa); $self->print("\n"); my @assocs = $prod->get_assoc; foreach my $assoc (@assocs) { my $termacc = $assoc->get_termacc; my $aspect = $assoc->get_aspect; my $ns = $ASPECT_IDX{$aspect}; $self->print("[gene_product_annotation]\n"); $self->tag(involves_gene_product=>$id); $self->tag($ns=>$termacc); $self->tag($_=>'true') foreach $assoc->get_qualifier; $self->tag(date=>$assoc->sget_assocdate); $self->tag(source_db=>$assoc->sget_source_db); my @evs = $assoc->get_evidence; foreach my $ev (@evs) { $self->tag(has_evidence=>$ev->sget_evcode, $ev->get_ref); $self->tag(with=>$_) foreach $ev->get_with; } $self->print("\n"); } $self->print("!! //\n\n"); } sub dbxrefstr { } 1;