# $Id: obo_text_parser.pm,v 1.50 2009/08/12 20:58:25 cmungall Exp $ # # # see also - http://www.geneontology.org # - http://www.godatabase.org/dev # # You may distribute this module under the same terms as perl itself package GO::Parsers::obo_text_parser; =head1 NAME GO::Parsers::obo_text_parser - OBO Flat file parser object =head1 SYNOPSIS do not use this class directly; use GO::Parser =cut =head1 DESCRIPTION =cut use Exporter; use Text::Balanced qw(extract_quotelike extract_bracketed); use base qw(GO::Parsers::base_parser); use GO::Parsers::ParserEventNames; use Carp; use FileHandle; use strict qw(vars refs); sub dtd { 'obo-parser-events.dtd'; } sub parse_fh { my ($self, $fh) = @_; $self->start_event(OBO); $self->parse_fh_inner($fh); $self->pop_stack_to_depth(0); $self->parsed_ontology(1); } sub parse_fh_inner { my ($self, $fh) = @_; my $file = $self->file; my $litemode = $self->litemode; my $is_go; local($_); # latest perl is more strict about modification of $_ $self->fire_source_event($file); $self->start_event(HEADER); my $stanza_count; my $in_hdr = 1; my $is_root = 1; # default my $namespace_set; my $id; my $namespace = $self->force_namespace; # default my $force_namespace = $self->force_namespace; my $usc = $self->replace_underscore; my %id_remap_h = (); my @imports = (); my $is_utf8; # temporary hack... if ($ENV{OBO_IDMAP}) { my @parts = split(/\;/,$ENV{OBO_IDMAP}); foreach (@parts) { if (/(.*)=(.*)/) { $id_remap_h{$1} = $2; } } } my $default_id_prefix; while(<$fh>) { chomp; if (/^encoding:\s*utf/) { $is_utf8 = 1; } if (!$is_utf8) { tr [\200-\377] [\000-\177]; # see 'man perlop', section on tr/ # weird ascii characters should be excluded tr/\0-\10//d; # remove weird characters; ascii 0-8 # preserve \11 (9 - tab) and \12 (10-linefeed) tr/\13\14//d; # remove weird characters; 11,12 # preserve \15 (13 - carriage return) tr/\16-\37//d; # remove 14-31 (all rest before space) tr/\177//d; # remove DEL character } s/^\!.*//; s/[^\\]\!.*//; #s/[^\\]\#.*//; s/^\s+//; s/\s+$//; next unless $_; next if ($litemode && $_ !~ /^(\[|id:|name:|is_a:|relationship:|namespace:|is_obsolete:)/ && !$in_hdr); if (/^\[(\w+)\]\s*(.*)/) { # new stanza # we are at the beginning of a new stanza # reset everything and make sure everything from # previous stanza is exported my $stanza = lc($1); my $rest = $2; if ($in_hdr) { $in_hdr = 0; $self->end_event(HEADER); } else { if (!$namespace_set) { if (!$namespace) { if ($stanza ne 'instance') { #$self->parse_err("missing namespace for ID: $id"); } } else { $self->event(NAMESPACE, $namespace); } } $self->event(IS_ROOT,1) if $is_root; $is_root = 1; # assume root by default; override if parents found $namespace_set = 0; $self->end_event; } $is_root = 0 unless $stanza eq 'term'; $self->start_event($stanza); $id = undef; $stanza_count++; } elsif ($in_hdr) { # we are in the header section if (/^([\w\-]+)\:\s*(.*)/) { # tag-val pair my ($tag, $val) = ($1,$2); if ($tag eq 'subsetdef') { if ($val =~ /(\S+)\s+(.*)/) { my $subset_id = $1; $val = $2; my ($subset_name, $parts) = extract_qstr($val); $val = [[ID,$subset_id], [NAME,$subset_name], map {dbxref($_)} @$parts]; } else { $self->parse_err("subsetdef: expect ID \"NAME\", got: $val"); } } if ($tag eq 'synonymtypedef') { if ($val =~ /(\S+)\s+\"(.*)\"\s*(.*)/) { my $stname = $1; my $stdef = $2; my $scope = $3; $val = [[ID,$stname], [NAME,$stdef], ($scope ? ['scope', $scope] : ())]; } else { $self->parse_err("synonymtypedef: expect ID \"NAME\", got: $val"); } } if ($tag eq 'idspace') { my ($idspace,$global,@rest) = split(' ',$val); if (!$global) { $self->parse_err("idspace requires two columns"); } $val = [['local',$idspace], ['global',$global], (@rest ? [COMMENT,join(' ',@rest)] : ()), ]; } if ($tag eq 'local-id-mapping') { if ($val =~ /(\S+)\s+(.*)/) { # with a local ID mapping we delay binding $val = [['local',$1], ['to',$2]]; } else { $self->parse_err("id-mapping requires two columns"); } } if ($tag eq 'import') { if ($ENV{OBO_FOLLOW_IMPORTS}) { push(@imports, $val); } else { # handled below #$self->event(import=>$val); } } $self->event($tag=>$val); # post-processing if ($tag eq 'default-namespace') { $namespace = $val unless $namespace; } if ($tag eq 'id-mapping') { if ($val =~ /(\S+)\s+(.*)/) { # bind at parse time if ($id_remap_h{$1}) { $self->parse_err("remapping $1 to $2"); } $id_remap_h{$1} = $2; } else { $self->parse_err("id-mapping requires two columns"); } } if ($tag eq 'default-id-prefix') { $default_id_prefix = $val; } } else { $self->parse_err("illegal header entry: $_"); } } # END OF IN-HEADER elsif (/^([\w\-]+)\:\s*(.*)/) { # tag-val pair my ($tag, $val) = ($1,$2); my $qh; ($val, $qh) = extract_quals($val); #$val =~ s/\\//g; my $val2 = $val; $val2 =~ s/\\,/,/g; $val2 =~ s/\\//g; if ($tag eq ID) { if ($id_remap_h{$val}) { $val = $id_remap_h{$val}; } if ($val !~ /:/) { if ($default_id_prefix) { $val = "$default_id_prefix:$val"; } } } elsif ($tag eq NAME) { # replace underscore in name $val = $val2; if ($usc) { $val =~ s/_/$usc/g; } } elsif ($tag eq RELATIONSHIP) { my ($type, @ids) = split(' ', $val2); my $id = shift @ids; if ($id_remap_h{$type}) { $type = $id_remap_h{$type}; } if ($type !~ /:/) { if ($default_id_prefix) { $type = "$default_id_prefix:$type"; } } $val = [[TYPE,$type],[TO,$id]]; push(@$val,map {['additional_argument',$_]} @ids); } elsif ($tag eq INTERSECTION_OF || $tag eq UNION_OF) { my ($type, $id) = split(' ', $val2); if ($id_remap_h{$type}) { $type = $id_remap_h{$type}; } if ($type !~ /:/) { if ($default_id_prefix) { $type = "$default_id_prefix:$type"; } } if ($id) { $val = [[TYPE,$type],[TO,$id]]; } else { $id = $type; $val = [[TO,$id]]; } } elsif ($tag eq INVERSE_OF || $tag eq TRANSITIVE_OVER || $tag eq IS_A) { if ($id_remap_h{$val}) { $val = $id_remap_h{$val}; } if ($val !~ /:/) { if ($default_id_prefix) { $val = "$default_id_prefix:$val"; } } } elsif ($tag eq DISJOINT_FROM) { if ($id_remap_h{$val}) { $val = $id_remap_h{$val}; } if ($val !~ /:/) { if ($default_id_prefix) { $val = "$default_id_prefix:$val"; } } } elsif ($tag eq XREF) { $tag = XREF_ANALOG; my $dbxref = dbxref($val); $val = $dbxref->[1]; } elsif ($tag eq XREF_ANALOG) { my $dbxref = dbxref($val); $val = $dbxref->[1]; } elsif ($tag eq XREF_UNKNOWN) { my $dbxref = dbxref($val); $val = $dbxref->[1]; } elsif ($tag eq PROPERTY_VALUE) { if ($val =~ /^(\S+)\s+(\".*)/) { # first form # property_value: relation "literal value" xsd:datatype my $type = $1; my $rest = $2; my ($to, $datatype) = extract_quotelike($rest); $to =~ s/^\"//; $to =~ s/\"$//; $datatype =~ s/^\s+//; $val = [[TYPE,$type], [VALUE,$to], [DATATYPE,$datatype]]; } else { # second form # property_value: relation ToID my ($type,$to) = split(' ',$val); $val = [[TYPE,$type], [TO,$to]]; } } elsif ($tag eq NAMESPACE) { if ($force_namespace) { # override whatever namespace was provided $val = $force_namespace; } else { # do nothing - we will export later } $namespace_set = $val; } elsif ($tag eq DEF) { my ($defstr, $parts) = extract_qstr($val); $val = [[DEFSTR,$defstr], map {dbxref($_)} @$parts]; } elsif ($tag =~ /(\w*)synonym/) { my $scope = $1 || ''; if ($scope) { $tag = SYNONYM; if ($scope =~ /(\w+)_$/) { $scope = $1; } else { $self->parse_err("bad synonym type: $scope"); $scope = ''; } } my ($syn, $parts, $extra_quals) = extract_qstr($val2); if (@$extra_quals) { $scope = shift @$extra_quals; $scope = lc($scope); $qh->{synonym_type} = shift @$extra_quals if @$extra_quals; } if ($qh->{scope}) { if ($scope) { if ($scope ne $qh->{scope}) { $self->parse_err("inconsistent scope: $scope/$qh->{scope}"); } else { $self->parse_err("redundant scope: $scope"); } } } else { $qh->{scope} = $scope; } $val = [[SYNONYM_TEXT,$syn], (map {dbxref($_)} @$parts)]; } elsif ($tag =~ /formula/) { my ($formula, $parts, $extra_quals) = extract_qstr($val2); my $lang = 'CLIF'; if (@$extra_quals) { $lang = shift @$extra_quals; } $qh->{format} = $lang; $val = [['formula_text',$formula], (map {dbxref($_)} @$parts)]; } elsif ($tag eq 'holds_temporally_between' || # experimental support for obof1.3 $tag eq 'holds_atemporally_between' || $tag eq 'holds_on_class_level_between') { my ($sub, $ob) = split(' ', $val2); if ($id_remap_h{$sub}) { $sub = $id_remap_h{$sub}; } if ($id_remap_h{$ob}) { $ob = $id_remap_h{$ob}; } $val = [[subject=>$sub],[object=>$ob]]; } elsif ($tag eq 'holds_over_chain' || $tag eq 'equivalent_to_chain') { # obof1.3 my @rels = split(' ', $val2); @rels = map { my $rel = $_; if ($id_remap_h{$_}) { $rel = $id_remap_h{$_} } if ($rel !~ /:/) { if ($default_id_prefix) { $rel = "$default_id_prefix:$rel"; } } $rel; } @rels; $val = [map {[relation=>$_]} @rels]; } else { $val = $val2; # normal tag:val } if (!ref($val) && $val eq 'true') { $val = 1; } if (!ref($val) && $val eq 'false') { $val = 0; } if (%$qh) { # note that if attributes are used for # terminal nodes then we effectively have # to 'push the node down' a level; # eg # x # ==> [is_a=>'x'] # x # ==> [is_a=>[[@=>[[t=>v]]],[.=>x]]] my $data = ref $val ? $val : [['.'=>$val]]; my @quals = map {[$_=>$qh->{$_}]} keys %$qh; $self->event($tag=>[['@'=>[@quals]], @$data, ]); } else { $self->event($tag=>$val); } if ($tag eq IS_A || $tag eq RELATIONSHIP) { $is_root = 0; } if ($tag eq IS_OBSOLETE && $val) { $is_root = 0; } if ($tag eq ID) { $id = $val; } if ($tag eq NAME) { if (!$id) { $self->parse_err("missing id!") } else { $self->acc2name_h->{$id} = $val; } } } else { $self->throw("uh oh: $_"); } } # duplicated code! check final event if (!$namespace_set) { if (!$namespace && $stanza_count) { #$self->parse_err("missing namespace for ID: $id"); } else { $self->event(NAMESPACE, $namespace); } } $self->event(IS_ROOT,1) if $is_root; foreach my $import_file (@imports) { $import_file = $self->download_file_if_required($import_file); $self->file($import_file); $self->pop_stack_to_depth(1); #$self->end_event(HEADER); my $ifh = FileHandle->new($import_file); $self->parse_fh_inner($ifh); #$self->pop_stack_to_depth(1); $ifh->close(); } return; } # each tag line can have trailing qualifiers in {}s at the end sub extract_quals { my $str = shift; my %q = (); if ($str =~ /(.*)\s+(\{.*)\}\s*$/) { my $return_str = $1; my $extr = $2; if ($extr) { my @qparts = split_on_comma($extr); foreach (@qparts) { if (/(\w+)=\"(.*)\"/) { $q{$1} = $2; } elsif (/(\w+)=\'(.*)\'/) { $q{$1} = $2; } elsif (/(\w+)=(\S+)/) { # current 1.2 standard; non-quoted $q{$1} = $2; } else { warn("$_ in $str"); } } } return ($return_str, \%q); } else { return ($str, {}); } } sub extract_qstr { my $str = shift; my ($extr, $rem, $prefix) = extract_quotelike($str); my $txt = $extr; $txt =~ s/^\"//; $txt =~ s/\"$//; if ($prefix) { warn("illegal prefix: $prefix in: $str"); } my @extra = (); # synonyms can have two words following quoted part # before dbxref section # - two if ($rem =~ /(\w+)\s+(\w+)\s+(\[.*)/) { $rem = $3; push(@extra,$1,$2); } elsif ($rem =~ /(\w+)\s+(\[.*)/) { $rem = $2; push(@extra,$1); } else { } my @parts = (); while (($extr, $rem, $prefix) = extract_bracketed($rem, '[]')) { last unless $extr; $extr =~ s/^\[//; $extr =~ s/\]$//; push(@parts, $extr) if $extr; } @parts = map {split_on_comma($_)} @parts; $txt =~ s/\\//g; return ($txt, \@parts, \@extra); } sub split_on_comma { my $str = shift; my @parts = (); while ($str =~ /(.*[^\\],\s*)(.*)/) { $str = $1; my $part = $2; unshift(@parts, $part); $str =~ s/,\s*$//; } unshift(@parts, $str); return map {s/\\//g;$_} @parts; } # turns a DB:ACC string into an obo-xml dbxref element sub dbxref { my $str = shift; $str =~ s/\\//g; my $name; if ($str =~ /(.*)\s+\"(.*)\"$/) { $str = $1; $name = $2; } my ($db, @rest) = split(/:/, $str); my $acc = join(':',@rest); $db =~ s/^\s+//; if ($db eq 'http' && $acc =~ /^\/\//) { # dbxref is actually a URI $db = 'URL'; $acc = simple_escape($acc); $acc =~ s/\s/\%20/g; $acc = "http:$acc"; } else { # $db=escape($db); # $acc=escape($acc); } $db =~ s/\s+/_/g; # HumanDO.obo has spaces in xref $acc =~ s/\s+/_/g; $db = 'NULL' unless $db; $acc = 'NULL' unless $acc; [DBXREF,[[ACC,$acc], [DBNAME,$db], defined $name ? [NAME,$name] : () ]]; } sub parse_term_expression { my $self = shift; my $expr = shift; my ($te,$rest) = $self->parse_term_expression_with_rest($expr); if ($rest) { $self->parse_err("trailing: $rest"); } return Data::Stag->nodify($te); } sub parse_term_expression_with_rest { my $self = shift; my $expr = shift; if ($expr =~ /^\((.*)/) { my $genus_expr = $1; my ($genus,$diff_expr) = $self->parse_term_expression_with_rest($genus_expr); my $next_c = substr($diff_expr,0,1,''); if ($next_c eq ')') { my ($diffs,$rest) = $self->parse_differentia_with_rest($diff_expr); my $stag = [intersection=>[ [link=>[[to=>[$genus]]]], @$diffs]]; return ($stag,$rest); } else { $self->parse_err("expected ) at end of genus. Got: $next_c followed by $diff_expr"); } } elsif ($expr =~ /^([\w\:]+)\^(.*)/) { my $genus = $1; my $diff_expr = $2; my ($diffs,$rest) = $self->parse_differentia_with_rest($diff_expr); my $stag = [intersection=>[ [link=>[[to=>$genus]]], @$diffs]]; return ($stag,$rest); } elsif ($expr =~ /^([\w\:]+)(.*)/) { return ($1,$2); } else { $self->parse_err("could not parse: $expr"); } } sub parse_differentia { my $self = shift; my $expr = shift; my ($diffs,$rest) = $self->parse_differentia_with_rest($expr); if ($rest) { $self->parse_err("trailing: $rest"); } Data::Stag->nodify($_) foreach @$diffs; return $diffs; } sub parse_differentia_with_rest { my $self = shift; my $expr = shift; if ($expr =~ /^(.+?)\((.*)/) { my $rel = $1; my $term_expr = $2; my ($term,$rest) = $self->parse_term_expression_with_rest($term_expr); my $diff = [link=>[[type=>$rel], [to=>(ref($term) ? [$term] : $term)]]]; if ($rest) { my $next_c = substr($rest,0,1,''); if ($next_c eq ')') { $next_c = substr($rest,0,1); if ($next_c eq '^' || $next_c eq ',') { my ($next_diffs,$next_rest) = $self->parse_differentia_with_rest(substr($rest,1)); return ([$diff,@$next_diffs],$next_rest); } elsif ($next_c eq '') { return ([$diff],$rest); } elsif ($next_c eq ')') { return ([$diff],$rest); } else { $self->parse_err("expected ^ or ). Got: $next_c followed_by: $rest"); } } else { $self->parse_err("exprected ). Got: $next_c followed by: $rest"); } } else { $self->parse_err("expected ). Got: \"\""); } } else { $self->parse_err("expect relation(...). Got: $expr"); } } # lifted from CGI::Util our $EBCDIC = "\t" ne "\011"; # (ord('^') == 95) for codepage 1047 as on os390, vmesa our @E2A = ( 0, 1, 2, 3,156, 9,134,127,151,141,142, 11, 12, 13, 14, 15, 16, 17, 18, 19,157, 10, 8,135, 24, 25,146,143, 28, 29, 30, 31, 128,129,130,131,132,133, 23, 27,136,137,138,139,140, 5, 6, 7, 144,145, 22,147,148,149,150, 4,152,153,154,155, 20, 21,158, 26, 32,160,226,228,224,225,227,229,231,241,162, 46, 60, 40, 43,124, 38,233,234,235,232,237,238,239,236,223, 33, 36, 42, 41, 59, 94, 45, 47,194,196,192,193,195,197,199,209,166, 44, 37, 95, 62, 63, 248,201,202,203,200,205,206,207,204, 96, 58, 35, 64, 39, 61, 34, 216, 97, 98, 99,100,101,102,103,104,105,171,187,240,253,254,177, 176,106,107,108,109,110,111,112,113,114,170,186,230,184,198,164, 181,126,115,116,117,118,119,120,121,122,161,191,208, 91,222,174, 172,163,165,183,169,167,182,188,189,190,221,168,175, 93,180,215, 123, 65, 66, 67, 68, 69, 70, 71, 72, 73,173,244,246,242,243,245, 125, 74, 75, 76, 77, 78, 79, 80, 81, 82,185,251,252,249,250,255, 92,247, 83, 84, 85, 86, 87, 88, 89, 90,178,212,214,210,211,213, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,179,219,220,217,218,159 ); sub escape { shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass)); my $toencode = shift; return undef unless defined($toencode); # force bytes while preserving backward compatibility -- dankogai $toencode = pack("C*", unpack("C*", $toencode)); if ($EBCDIC) { $toencode=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg; } else { $toencode=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg; } return $toencode; } sub simple_escape { return unless defined(my $toencode = shift); $toencode =~ s{&}{&}gso; $toencode =~ s{<}{<}gso; $toencode =~ s{>}{>}gso; $toencode =~ s{\"}{"}gso; # Doesn't work. Can't work. forget it. # $toencode =~ s{\x8b}{‹}gso; # $toencode =~ s{\x9b}{›}gso; $toencode; } 1;