package OBO::APO::NCBIToRDF;

=head1 NAME

OBO::APO::NCBIToRDF - An NCBI taxonomy dump to RDF converter.

=head1 SYNOPSIS

    my $converter = OBO::APO::NCBIToRDF->new();
    open my $rdf_fh, '>', 'taxonomy.rdf' or die $!;
    $converter->work('names.dmp', 'nodes.dmp', $rdf_fh,
                     'http://www.semantic-systems-biology.org/', 'SSB');
    close $rdf_fh or die $!;

=head1 DESCRIPTION

Converts NCBI taxonomy dump files (names and nodes) to an RDF graph.

NCBI taxonomy dump files can be obtained from:

    ftp://ftp.ncbi.nih.gov/pub/taxonomy/

The method 'work' gets the names file, the nodes file, a file handle
for the RDF graph, and (optionally) the base URI and the name space.

=head1 AUTHOR

Mikel Egana Aranguren
mikel.egana.aranguren@gmail.com

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2008 by Mikel Egana Aranguren

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut

use strict;
use warnings;
use Carp;

# Fallbacks used by work() when the caller omits the base URI / name space.
# They are the example values given in the original argument notes.
my $DEFAULT_BASE_URI  = 'http://www.semantic-systems-biology.org/';
my $DEFAULT_NAMESPACE = 'SSB';

# Characters rewritten by char_hex_http() and their percent-encoded forms.
my %HEX_FOR = (
    ':' => '%3A',
    ';' => '%3B',
    '<' => '%3C',
    '=' => '%3D',
    '>' => '%3E',
    '?' => '%3F',
    '/' => '%2F',
    '&' => '%26',
);

=head2 new

  Usage    - my $NCBIToRDF = OBO::APO::NCBIToRDF->new();
  Returns  - a new (stateless) converter object
  Args     - none

=cut

sub new {
    my ($class) = @_;
    return bless {}, $class;
}

=head2 work

  Usage    - $NCBIToRDF->work($NCBINamesFilePath, $NCBINodesFilePath, $RDF_file_handler, $base, $namespace)
  Returns  - RDF file handler
  Args     - 1. Full path to the names.dmp file
             2. Full path to the nodes.dmp file
             3. File handle for writing RDF
             4. base URI (e.g. 'http://www.semantic-systems-biology.org/');
                that value is used when the argument is omitted
             5. name space (e.g. 'SSB'); that value is used when omitted
  Function - Converts NCBI nodes and NCBI names to an RDF graph.
  Throws   - croaks if the file handle is missing or a dump file cannot
             be opened.

=cut

sub work {
    my ($self, $NCBInamesFileName, $NCBInodesFileName, $file_handle, $base, $namespace) = @_;

    # TODO: have a thorough look into: ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_readme.txt

    croak('a file handle for the RDF output is required')
        unless defined $file_handle;
    $base      = $DEFAULT_BASE_URI  unless defined $base;
    $namespace = $DEFAULT_NAMESPACE unless defined $namespace;

    my $ns               = lc $namespace;             # XML prefix of our own elements
    my $rdf_subnamespace = 'taxon';                   # element name of one taxon
    my $ncbi_ns          = $base . $namespace . '#';  # URI bound to the prefix

    # Preamble of RDF file.
    # NOTE(review): the literal markup of the preamble, the rdfs:label
    # element, the closing tags and the trailing comment had been stripped
    # from this file; what is printed here is reconstructed from what was
    # left (two preamble prints, the surviving opening tags, the otherwise
    # unused __date helper). Confirm against a known-good output.
    print {$file_handle} qq{<?xml version="1.0"?>\n};
    print {$file_handle}
          qq{<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
        . qq{ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"}
        . qq{ xmlns:${ns}="${ncbi_ns}">\n};

    # Scientific names, keyed by taxon id.
    my $name_of = _read_scientific_names($NCBInamesFileName);

    # Stream the nodes file: one RDF element per taxon.
    my $nodes_fh = _open_dump($NCBInodesFileName);
    while (my $line = <$nodes_fh>) {
        my @fields = _split_dmp_line($line);
        next if @fields < 3;    # need tax_id, parent tax_id and rank

        my ($child, $parent, $rank) = @fields[0 .. 2];

        # All white space is removed (not just trimmed), so a rank such as
        # "no rank" is emitted as "norank"; this matches the historic output.
        s/\s//g for $child, $parent, $rank;
        next if $child eq '';

        # A taxon without a scientific name gets empty label/name elements.
        my $name = defined $name_of->{$child} ? $name_of->{$child} : '';

        # char_hex_http() edits its argument in place; $name and $rank are
        # private copies, so the lookup table is left untouched.
        my $label     = char_hex_http($name);
        my $rank_text = char_hex_http($rank);

        print {$file_handle} qq{\t<${ns}:${rdf_subnamespace} rdf:about="#NCBI_${child}">\n};
        print {$file_handle} qq{\t\t<rdfs:label xml:lang="en">${label}</rdfs:label>\n};
        print {$file_handle} qq{\t\t<${ns}:name xml:lang="en">${label}</${ns}:name>\n};
        print {$file_handle} qq{\t\t<${ns}:rank>${rank_text}</${ns}:rank>\n};

        # Taxon 1 is listed as its own parent in the dump; skip that self link.
        unless ($child eq '1') {
            print {$file_handle} qq{\t\t<${ns}:is_a rdf:resource="#NCBI_${parent}"/>\n};
        }
        print {$file_handle} qq{\t</${ns}:${rdf_subnamespace}>\n};
    }
    close $nodes_fh;

    print {$file_handle} "</rdf:RDF>\n\n";

    # "--" is not allowed inside an XML comment, so collapse it in $0.
    (my $script = $0) =~ s/-{2,}/-/g;
    print {$file_handle} "<!--\nGenerated with ONTO-PERL: " . $script . ', ' . __date() . "\n-->";

    return $file_handle;
}

# Opens a dump file for reading and returns the lexical handle; croaks with
# the file name on failure. Three-argument open, so the name is never
# interpreted as a mode or a command.
sub _open_dump {
    my ($path) = @_;
    croak("can't open file: no file name given")
        unless defined $path && length $path;
    open my $fh, '<', $path
        or croak("can't open file '$path': $!");
    return $fh;
}

# Splits one row of an NCBI *.dmp file into its trimmed column values.
# Columns are separated by "|" (surrounded by tabs) and each row is closed
# by a final "|"; empty columns are preserved.
sub _split_dmp_line {
    my ($line) = @_;
    $line =~ s/\s*\|\s*\z//;    # drop the row terminator
    my @fields = split /\s*\|\s*/, $line, -1;
    for my $field (@fields) {
        $field =~ s/\A\s+//;
        $field =~ s/\s+\z//;
    }
    return @fields;
}

# Reads the names file and returns a hash ref mapping taxon id to its name,
# keeping only rows whose name class (4th column) is "scientific name".
sub _read_scientific_names {
    my ($path) = @_;

    my %name_of;
    my $names_fh = _open_dump($path);
    while (my $line = <$names_fh>) {
        my @fields = _split_dmp_line($line);
        next if @fields < 4;

        my ($tax_id, $name, $name_class) = @fields[0, 1, 3];
        s/\s//g for $tax_id, $name_class;
        next unless $name_class eq 'scientificname';

        $name_of{$tax_id} = $name;    # already trimmed by _split_dmp_line
    }
    close $names_fh;

    return \%name_of;
}

# Returns the local time as "DD:MM:YYYY HH:MM", e.g. 11:05:2008 12:52
sub __date {
    my ($min, $hour, $mday, $mon, $year) = (localtime(time))[1 .. 5];
    return sprintf '%02d:%02d:%4d %02d:%02d', $mday, $mon + 1, $year + 1900, $hour, $min;
}

=head2 char_hex_http

  Usage    - OBO::APO::NCBIToRDF::char_hex_http($seq)
  Returns  - the sequence with the hexadecimal representation for the http special characters
  Args     - the sequence of characters; it is a plain function (not a
             method) and it also rewrites the variable passed in
  Function - Transforms a http character to its equivalent one in hexadecimal. E.g. : -> %3A

=cut

sub char_hex_http {
    # An undefined value is handed back untouched.
    return $_[0] unless defined $_[0];

    # Encoded: : ; < = > ? / &
    # The number sign (0x23), dollar sign (0x24) and percent sign (0x25)
    # are not encoded. Because "%" is left alone, encoding an already
    # encoded string does not change it again.
    # $_[0] is edited on purpose: existing callers may rely on the
    # in-place update as well as on the return value.
    $_[0] =~ s{([:;<=>?/&])}{$HEX_FOR{$1}}g;

    return $_[0];
}

1;