# $Id: GoaToRDF.pm 2165 2010-09-29 erick.antezana $ # # Module : GoaToRDF.pm # Purpose : A GOA associations to RDF converter # License : Copyright (c) 2008 Application Ontology. All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # package OBO::APO::GoaToRDF; =head1 NAME OBO::APO::GoaToRDF - A GOA associations to RDF converter. =head1 DESCRIPTION Converts a GOA association file to a RDF graph. The RDF graph is very simple, containing a node for each line from the association file (called GOA_ASSOC_n), and several triples for the fields (e.g. obj_symb). GOA associations files can be obtained from http://www.ebi.ac.uk/GOA/proteomes.html The method 'work' gets an assoc file path and a file handler for the RDF graph. =head1 AUTHOR Mikel Egana Aranguren mikel.egana.aranguren@gmail.com =head1 COPYRIGHT AND LICENSE Copyright (C) 2008 by Mikel Egana Aranguren This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.7 or, at your option, any later version of Perl 5 you may have available. =cut use OBO::Parser::GoaParser; use strict; use warnings; use Carp; sub new { my $class = shift; my $self = {}; bless ($self, $class); return $self; } =head2 work Usage - $GoaToRDF->work($RDF_file_handler,$path_to_assoc_file) Returns - RDF file handler Args - a file handler for the new RDF file and the path to the assoc. file. Function - converts an assoc. file to an RDF graph =cut sub work { my $self = shift; # Get the arguments my ($file_handle, $path_to_assoc_file) = @_; # # Hard-coded evidence codes # my %evidence_code_by_id = ( 'IEA' => 'ECO_00000067', 'ND' => 'ECO_0000035', 'IDA' => 'ECO_0000002', 'IPI' => 'ECO_0000021', 'TAS' => 'ECO_0000033', 'NAS' => 'ECO_0000034', 'ISS' => 'ECO_0000041', 'IMP' => 'ECO_0000015', 'IC' => 'ECO_0000001', 'IGI' => 'ECO_0000011', 'IEP' => 'ECO_0000008', 'RCA' => 'ECO_0000053', 'IGC' => 'ECO_0000177', 'EXP' => 'ECO_0000006' ); # # Aspects # my %aspect = ( 'P' => 'participates_in', 'C' => 'located_in', 'F' => 'has_function' ); # For the ID $path_to_assoc_file =~ /.*\/(.*)/; # get what is after the slash in the path... my $f_name = $1; (my $prefix_id = $f_name) =~ s/\.goa//; $prefix_id =~ s/\./_/g; # TODO set all the NS and URI via arguments my $default_URL = "http://www.semantic-systems-biology.org/ontology/rdf/"; my $NS = "GOA"; my $ns = lc ($NS); my $rdf_subnamespace = "assoc"; # Preamble of RDF file print $file_handle "\n"; print $file_handle "\n"; my $GoaParser = OBO::Parser::GoaParser->new; my $goaAssocSet = $GoaParser->parse($path_to_assoc_file); my %prot_duplicated; # to add only one copy of the protein # Chunk of RDF file foreach ($goaAssocSet->get_set()) { my %assoc = %{$_}; # # the protein: (this should come from uniprot.rdf) # #print $file_handle "\t<",$ns,":".$prot_space." rdf:about=\"#".$prefix_id."_".$assoc{ASSC_ID}."\">\n"; if (!$prot_duplicated{$assoc{OBJ_ID}}) { my $prot_space = "prot"; print $file_handle "\t<".$ns.":".$prot_space." rdf:about=\"".$uniprot_ns.$assoc{OBJ_ID}."\">\n"; print $file_handle "\t\t".&char_hex_http($assoc{OBJ_SYMB})."\n"; print $file_handle "\t\t<".$ns.":annot_src>".&char_hex_http($assoc{ANNOT_SRC})."\n"; my $t = $assoc{TAXON}; $t =~ s/taxon:/NCBI_/; # clean it print $file_handle "\t\t<".$ns.":taxon>".$t."\n"; print $file_handle "\t\t\n"; print $file_handle "\t\t<".$ns.":type>".&char_hex_http($assoc{TYPE})."\n"; print $file_handle "\t\t<".$ns.":description>".&char_hex_http($assoc{DESCRIPTION})."\n"; print $file_handle "\t\t<".$ns.":obj_src>".&char_hex_http($assoc{OBJ_SRC})."\n"; print $file_handle "\t\n"; $prot_duplicated{$assoc{OBJ_ID}} = 1; } my $triple_prefix_id_assoc_id = $goa_ns."triple_".$prefix_id."_".$assoc{ASSC_ID}; my $goa_ns_prefix_id_assoc_id = $goa_ns."GOA_".$prefix_id."_".$assoc{ASSC_ID}; # # ASSOC: # print $file_handle "\t<",$ns,":".$rdf_subnamespace." rdf:about=\"".$goa_ns_prefix_id_assoc_id."\">\n"; print $file_handle "\t\t<".$ns.":date>".$assoc{DATE}."\n"; print $file_handle "\t\t<".$ns.":refer>".&char_hex_http($assoc{REFER})."\n"; print $file_handle "\t\t<".$ns.":sup_ref>".&char_hex_http($assoc{SUP_REF})."\n"; print $file_handle "\t\t\n"; print $file_handle "\t\n"; # # TRIPLE: # print $file_handle "\t\n"; print $file_handle "\t\t\n"; print $file_handle "\t\t\n"; print $file_handle "\t\t\n\n"; print $file_handle "\t\t\n"; print $file_handle "\t\n"; } print $file_handle "\n\n"; print $file_handle ""; return $file_handle; } sub __date { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); my $result = sprintf "%02d:%02d:%4d %02d:%02d", $mday,$mon+1,$year+1900,$hour,$min; # e.g. 11:05:2008 12:52 } =head2 char_hex_http Usage - $ontology->char_hex_http($seq) Returns - the sequence with the hexadecimal representation for the http special characters Args - the sequence of characters Function - Transforms a http character to its equivalent one in hexadecimal. E.g. : -> %3A =cut sub char_hex_http { $_[0] =~ s/:/_/g; # originally: $_[0] =~ s/:/%3A/g; but changed to get eh GO IDs properly: GO_0000001 $_[0] =~ s/;/%3B/g; $_[0] =~ s//%3E/g; $_[0] =~ s/\?/%3F/g; #number sign # 23 # --> # # --> # #dollar sign $ 24 $ --> $ $ --> $ #percent sign % 25 % --> % % --> % $_[0] =~ s/\//%2F/g; $_[0] =~ s/&/%26/g; return $_[0]; } 1;