# $Id: NCBIParser.pm 2113 2010-09-29 Erick Antezana $
#
# Module  : NCBIParser.pm
# Purpose : Parse NCBI files: names and nodes
# License : Copyright (c) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved.
#           This program is free software; you can redistribute it and/or
#           modify it under the same terms as Perl itself.
#
package OBO::Parser::NCBIParser;

use OBO::Core::Term;
use OBO::Parser::OBOParser;
use OBO::Core::Ontology;

use strict;
use warnings;
use Carp;

$Carp::Verbose = 0;

=head2 new

  Usage    - OBO::Parser::NCBIParser->new()
  Returns  - a new parser object (a blessed, empty hash)
  Args     - none
  Function - constructor; the parser keeps no state between calls

=cut

sub new {
    my $class = shift;
    my $self  = {};
    bless( $self, $class );
    return $self;
}

=head2 parse

  Usage    - $ncbi_parser->parse ( $ncbi_nodes_path )
  Returns  - ref to a hash { child_id => parent_id }
  Args     - nodes.dmp path ( string )

  Usage    - $ncbi_parser->parse ( $ncbi_names_path, $name_type )
  Returns  - ref to a hash { ncbi_id => ncbi_name }
  Args     - 1. names.dmp path or nodes.dmp path, string
             2. ncbi name type ( string, e.g. 'scientific name' ) if the
                first arg is names.dmp, otherwise none
  Function - parses the complete NCBI taxonomy
  Croaks   - if no path is given or the file cannot be opened

=cut

sub parse {
    my (
        $self,
        $input_path,
        $name_type    # optional
    ) = @_;

    # @_ always holds $self on a method call, so the path is the 2nd element.
    # An undefined path must be rejected explicitly: three-arg open() treats
    # an undef file name as a request for an anonymous temporary file.
    croak "Not enough arguments! " if ( @_ < 2 || !defined $input_path );

    # 'or' (not '||') so that the failure of open() itself is what is tested.
    open my $IN, '<', $input_path
        or croak "Can't open file '$input_path': $! ";

    # Fields in the dump files are separated by "\t|", hence every field but
    # the first one still carries a leading tab after the split.
    my %map;
    while ( my $line = <$IN> ) {    # stream: the dump files are large
        my @fields = split /\t\|/, $line;
        next if ( @fields < 2 );    # blank or truncated line

        if ($name_type) {
            # parsing names.dmp, %map: ncbi_id => name of the requested type
            next if ( !defined $fields[3] || $fields[3] ne "\t$name_type" );
        }

        # parsing nodes.dmp (no name type), %map: child_id => parent_id
        $map{ $fields[0] } = substr $fields[1], 1;    # drop the leading tab
    }
    close($IN);

    return \%map;
}

=head2 work

  Usage    - $NCBIParser->work ( $onto, $nodes, $names, $parent, $ncbi_ids )
  Returns  - map of added terms { NCBI ID => OBO::Core::Term object }
  Args     - 1. input ontology, OBO::Core::Ontology object
             2. ref to a hash { child_id => parent_id }
             3. ref to a hash { ncbi_id => scientific_name }
             4. parental ontology term for the root of the taxonomy,
                OBO::Core::Term object
             5. ref to a list of NCBI taxon ids ( \d+ )
  Function - adds NCBI taxonomy to the input ontology for the specified taxa
  Croaks   - if a selected taxon or its parent has no term, or if the
             taxonomy root ( NCBI:1 ) is not in the ontology afterwards

=cut

sub work {
    my ( $self, $ontology, $nodes, $names, $parent, $ncbi_ids ) = @_;

    my %selected_nodes = ();    # taxon id => parent id
    my %selected_names = ();    # taxon id => taxon name

    # the hashes %selected_nodes and %selected_names are being populated:
    foreach my $ncbi_id ( @{$ncbi_ids} ) {
        getParentsRecursively( $ncbi_id, $nodes, $names, \%selected_nodes, \%selected_names );
    }

    # the terms are created and added to the ontology and %map
    my %map;                    # NCBI_ID => taxon_term
    foreach my $ncbi_id ( keys %selected_nodes ) {
        my $taxon = OBO::Core::Term->new();
        $taxon->id("NCBI:$ncbi_id");
        $taxon->name( $selected_names{$ncbi_id} );
        $ontology->add_term($taxon);
        $map{$ncbi_id} = $taxon;
    }

    # Connect children to parents by 'is_a' relationships but not if the
    # child is root ( the root is its own parent: that would be a cyclic is_a ).
    foreach my $ncbi_id ( keys %selected_nodes ) {
        my $parent_id = $selected_nodes{$ncbi_id};

        my $child_term = $map{$ncbi_id}
            or croak "No term for '$ncbi_id' in the map";
        my $parent_term = $map{$parent_id}
            or croak "No term for '$parent_id' in the map";

        $ontology->create_rel( $child_term, 'is_a', $parent_term ) if ( $ncbi_id != 1 );
    }

    # Finally hook the taxonomy root under the term supplied by the caller.
    my $root = $ontology->get_term_by_id('NCBI:1')
        or croak "No term in the ontology for 'root'";
    $ontology->create_rel( $root, 'is_a', $parent );

    return \%map;
}

########################################################################
# Subroutines
########################################################################

# Walks from $ncbi_id up to the taxonomy root ( id 1 ) and records every
# taxon met on the way ( the start taxon and the root included ):
#   $selected_nodes->{taxon id} = parent id   ( taken from $nodes )
#   $selected_names->{taxon id} = taxon name  ( taken from $names )
# Plain function, not a method. Croaks if the start id is undefined, if a
# taxon on the path has no entry in $nodes, or if the lineage loops without
# reaching the root. The name is historical: the walk is iterative so that
# broken input cannot cause unbounded recursion.
sub getParentsRecursively {
    my ( $ncbi_id, $nodes, $names, $selected_nodes, $selected_names ) = @_;

    croak "No taxon id given" if ( !defined $ncbi_id );

    my %visited;    # ids seen during this walk, for cycle detection
    my $child_id = $ncbi_id;
    while (1) {
        my $parent_id = $nodes->{$child_id};
        croak "No parent found for taxon '$child_id' in the nodes map"
            if ( !defined $parent_id );
        croak "Cyclic lineage detected at taxon '$child_id'"
            if ( $visited{$child_id}++ );

        $selected_nodes->{$child_id} = $parent_id;
        $selected_names->{$child_id} = $names->{$child_id};

        last if ( $child_id == 1 );    # the root has been recorded: done
        $child_id = $parent_id;
    }
    return;
}

1;

__END__

=head1 NAME

OBO::Parser::NCBIParser - A NCBI taxonomy to OBO translator.

=head1 DESCRIPTION

This parser converts chosen parts of the NCBI taxonomy-tree into an OBO file.
A taxon ID is given to the parser and the whole tree up to the root is
reconstructed in the given OBO ontology, using scientific names.

The dump files ( nodes.dmp and names.dmp ) should be obtained from:

  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

TODO: include ranks and disjoints only in correlating ranks.

=head1 AUTHOR

Mikel Egana Aranguren and Vladimir Mironov

http://www.mikeleganaranguren.com, vladimir.n.mironov@gmail.com

=head1 COPYRIGHT AND LICENSE

Copyright ( C ) 2006 by Mikel Egana Aranguren

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut