# $Id: mgi_assoc_parser.pm,v 1.3 2006/08/05 20:26:12 cmungall Exp $ # # # see also - http://www.geneontology.org # - http://www.godatabase.org/dev # # You may distribute this module under the same terms as perl itself package GO::Parsers::mgi_assoc_parser; =head1 NAME GO::Parsers::mgi_assoc_parser - parses MGI gene assoc stanza files =head1 SYNOPSIS do not use this class directly; use GO::Parser =head1 DESCRIPTION This is an EXPERIMENTAL module for converting MGIs in-house stanza format this is richer than go_assoc files as it allows combinatorial assocs these are represented with the go-assoc xml element =head1 AUTHOR =cut use Exporter; use base qw(GO::Parsers::base_parser Exporter); use GO::Parsers::ParserEventNames; use Carp; use FileHandle; use strict; sub dtd { 'go_assoc-parser-events.dtd'; } sub ev_filter { my $self = shift; $self->{_ev_filter} = shift if @_; return $self->{_ev_filter}; } sub skip_uncurated { my $self = shift; $self->{_skip_uncurated} = shift if @_; return $self->{_skip_uncurated}; } sub _parse_ids { my $txt = shift; $txt =~ s/^:+//; $txt =~ s/^\s+//; my @parts = split(/\|/,$txt); return grep {$_} map {if (/(\w+:\S+)/){$1}else {()}} @parts; } sub parse_fh { my ($self, $fh) = @_; my $file = $self->file; $self->start_event(ASSOCS); $self->start_event(DBSET); $self->event(PRODDB, 'mgi'); my $curr_gene = ''; while (<$fh>) { # UNICODE causes problems for XML and DB # delete 8th bit tr [\200-\377] [\000-\177]; # see 'man perlop', section on tr/ # weird ascii characters should be excluded tr/\0-\10//d; # remove weird characters; ascii 0-8 # preserve \11 (9 - tab) and \12 (10-linefeed) tr/\13\14//d; # remove weird characters; 11,12 # preserve \15 (13 - carriage return) tr/\16-\37//d; # remove 14-31 (all rest before space) tr/\177//d; # remove DEL character chomp; if (/^\!/) { next; } if (!$_) { next; } if (/^MGI gene:\s(\S+);\s+(\S+)/) { if ($1 ne $curr_gene) { $self->pop_stack_to_depth(2); $self->start_event(PROD); $self->event(PRODACC, $1); $self->event(PRODSYMBOL, $2); $self->event(PRODTYPE, 'gene'); $curr_gene = $1; } } if (/^GO term:\s(.*);\s+(\S+)/) { $self->start_event(ASSOC); $self->event(TERMACC, $2); while (<>) { chomp; last unless $_; if (/^GO evidence:\s*(.*)/) { my $evtxt = $1; foreach (split(/\s+\|\s+/,$evtxt)) { $self->start_event(EVIDENCE); $self->event(EVCODE, $_); $self->end_event(EVIDENCE); } } elsif (/^anatomy:(.*)/) { my @ids = _parse_ids($1); $self->event(property_value=>[[type=>'located_in'], [to=>$_]]) foreach @ids; } elsif (/^cell type:(.*)/) { my @ids = _parse_ids($1); $self->event(property_value=>[[type=>'located_in'], [to=>$_]]) foreach @ids; } else { } } $self->end_event(ASSOC); } } $fh->close; $self->pop_stack_to_depth(0); } 1;