# $Id: UniProtParser.pm 2030 2008-04-18 11:58:36Z vlmir $
#
# Module  : UniProtParser.pm
# Purpose : Parse UniProt files and add data to an ontology
# License : Copyright (c) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved.
#           This program is free software; you can redistribute it and/or
#           modify it under the same terms as Perl itself.
# Contact : CCO
#
package OBO::CCO::UniProtParser;

=head1 NAME

OBO::CCO::UniProtParser - A UniProt to OBO translator.

=head1 DESCRIPTION

Includes methods for adding information from UniProt files to ontologies.

UniProt files can be obtained from:
ftp://ftp.expasy.org/databases/uniprot/knowledgebase/

The method 'work' incorporates relevant data from a UniProt file into the
input ontology, writes the ontology into an OBO file, writes map files.

This method assumes:

- the input ontology already contains the terms 'cell cycle gene',
  'cell cycle protein' and 'cell cycle modified protein'

- the input ontology already contains relevant protein terms.

- the input ontology already contains the NCBI taxonomy.

- the input ontology already contains the relationship types 'is_a',
  'encoded_by', 'codes_for', 'originates_from', 'source_of',
  'transformation_of', 'transforms_into'

- the input UniProt file contains entries for one species only and for
  protein terms present in the input ontology only

- the full map file ($long_file_name, the UNION of the species specific map
  files ($short_file_name)) contains all the proteins to be processed by the
  UniProtParser

=head1 AUTHOR

Vladimir Mironov
vlmir@psb.ugent.be

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Vladimir Mironov

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut

use strict;
use warnings;
use Carp;

use OBO::Parser::OBOParser;
use OBO::Core::Relationship;
use OBO::Core::Dbxref;
use OBO::Util::DbxrefSet;
use OBO::Core::Term;
use OBO::Util::Set;
use OBO::CCO::CCO_ID_Term_Map;
use SWISS::Entry;

# Suffix appended to every gene name, keyed by the taxon term name.
my %GENE_NAME_SUFFIX_BY_TAXON_NAME = (
    'Schizosaccharomyces pombe organism' => '_schpo',
    'Saccharomyces cerevisiae organism'  => '_yeast',
    'Arabidopsis thaliana organism'      => '_arath',
    'Homo sapiens organism'              => '_human',
);

# Relationship types that must be defined in the input ontology.
my @REQUIRED_REL_TYPES = (
    'is_a',       'originates_from',   'source_of', 'encoded_by',
    'codes_for',  'transformation_of', 'transforms_into',
);

# Databases whose DR cross-references are copied onto gene terms.
# 'Ensembl' is the database name as written in UniProt DR lines; the spelling
# 'Ensemble' would never compare equal to it.
my @GENE_DBS = ( 'EMBL', 'Ensembl', 'GeneDB_Spombe', 'HGNC', 'SGD', 'TAIR', 'UniGene' );

# Gene name types of a gene group, in order of preference.
my @GENE_NAME_TYPES = ( 'Names', 'OLN', 'ORFNames' );

# Constructor: returns an empty object blessed into the given class.
sub new {
    my ($class) = @_;
    return bless( {}, $class );
}

=head2 work

  Usage    - $UniProtParser->work($ref_file_names, 'Arabidopsis thaliana organism')
  Returns  - updated OBO::Core::Ontology object
  Args     - 1. reference to a list of filenames:
                - input OBO file,
                - output OBO file,
                - UniProt file,
                - CCO_id/protein_name map file one taxon only,
                - CCO_id/protein_name map file all taxa,
                - CCO_id/gene_name map file one taxon only,
                - CCO_id/gene_name map file all taxa,
             2. taxon_name
  Function - parses a Uniprot file, adds relevant information to the input
             ontology, writes OBO and map files
  Dies     - if a required relationship type or term is missing from the
             input ontology, or if the UniProt file / output OBO file cannot
             be opened

=cut

sub work {
    my ( $self, $file_names, $taxon_name ) = @_;

    # Get the arguments
    my (
        $old_OBO_file,     $new_OBO_file,  $uniprot_file,
        $short_map_file,   $long_map_file,
        $short_map_g_file, $long_map_g_file
    ) = @{$file_names};

    # An unknown taxon yields unsuffixed gene names; say so once instead of
    # emitting an 'uninitialized value' warning for every gene.
    my $current_sufix = $GENE_NAME_SUFFIX_BY_TAXON_NAME{$taxon_name};
    unless ( defined $current_sufix ) {
        carp "No gene name suffix is known for taxon '$taxon_name'; gene names will not be suffixed";
        $current_sufix = '';
    }

    # Initialize the OBO parser, load the OBO file, check the assumptions
    my $my_parser = OBO::Parser::OBOParser->new();
    my $ontology  = $my_parser->work($old_OBO_file);

    foreach my $rel_type (@REQUIRED_REL_TYPES) {
        confess "Not a valid relationship type: '", $rel_type, "' (valid values: ", join( ", ", @REQUIRED_REL_TYPES ), ")"
          unless ( $ontology->{RELATIONSHIP_TYPES}->{$rel_type} );
    }

    my $taxon = $ontology->get_term_by_name($taxon_name)
      || die "No term for $taxon_name is defined in file '$old_OBO_file'";

    # TODO Connect the core cell cycle genes to 'core cell cycle gene'
    my $onto_gene = $ontology->get_term_by_name('cell cycle gene')
      || die "No term for 'cell cycle gene' is defined in file '$old_OBO_file'";

    # Parent term of all post-translationally modified proteins; looked up
    # once because it does not depend on the entry being processed.
    my $modified_protein_term = $ontology->get_term_by_name('cell cycle modified protein')
      || die "No term for 'cell cycle modified protein' is defined in file '$old_OBO_file'";

    # Looked up lazily: only needed for proteins that are not already linked
    # to 'core cell cycle protein'.
    my $cell_cycle_protein_term;

    # Initialize CCO_ID_Term_Map objects
    my $short_map   = OBO::CCO::CCO_ID_Term_Map->new($short_map_file);
    my $long_map    = OBO::CCO::CCO_ID_Term_Map->new($long_map_file);      # Set of [B]iomolecules IDs
    my $short_map_g = OBO::CCO::CCO_ID_Term_Map->new($short_map_g_file);
    my $long_map_g  = OBO::CCO::CCO_ID_Term_Map->new($long_map_g_file);    # Set of Gene IDs

    # Parse the UniProt file
    open( my $uniprot_fh, '<', $uniprot_file )
      || die "Cannot read UniProt file ($uniprot_file): ", $!;

    {
        # One record per UniProt entry; entries are terminated by a '//' line.
        # Scoped to this block so the record separator is restored before the
        # ontology and the maps are written.
        local $/ = "\n//\n";

        ENTRY: while ( my $record = <$uniprot_fh> ) {
            my $entry = SWISS::Entry->fromText($record);
            my ( $accession, @accs ) = @{ $entry->ACs->{list} };
            my ( $def,       @syns ) = @{ $entry->DEs->{list} };
            my $definition   = $def->{text};
            my $protein_name = $entry->ID;

            # Find the protein term through its primary or secondary accessions
            my @all_acs = ( $accession, @accs );
            my $protein;
            foreach my $ac (@all_acs) {
                $protein = $ontology->get_term_by_xref( 'UniProtKB', $ac );
                last if ( defined $protein );
            }

            # Added by ERICK to overcome the orthoMCL issues: orthoMCL does not provide AC's!
            if ( !defined $protein ) {
                $protein = $ontology->get_term_by_name($protein_name);
            }

            # Report the non-existing ACs in the OBO file (e.g. cco_I_$organism.obo)
            if ( !defined $protein ) {
                warn "None of the UniProt AC's (", join( ", ", @all_acs ), ") were found in file '$old_OBO_file'";
                next ENTRY;
            }

            $protein->def_as_string( $definition, "UniProtKB:$accession" );    # add protein definition
            foreach my $ac (@all_acs) {
                $protein->xref_set_as_string("[UniProtKB:$ac]");               # add secondary accessions
            }

            # add DB cross references to the protein
            my $dbxrefs = $entry->DRs;       # an object containing all DB cross-references
            my @pids    = $dbxrefs->pids;    # an array containing EMBL protein accessions
            foreach my $pid (@pids) {
                $protein->xref_set_as_string("[EMBL:$pid]");
            }

            # add synonyms to the protein
            foreach my $syn (@syns) {
                $protein->synonym_as_string( $syn->{text}, "[UniProtKB:$accession]", 'EXACT' );
            }

            # Link the protein to 'cell cycle protein' unless it is already a
            # 'core cell cycle protein'
            my @heads = @{ $ontology->get_head_by_relationship_type( $protein, $ontology->get_relationship_type_by_id('is_a') ) };
            my $is_core = grep { $_->name() eq 'core cell cycle protein' } @heads;
            unless ($is_core) {
                $cell_cycle_protein_term ||= $ontology->get_term_by_name("cell cycle protein")    # CCO:U0000007
                  || die "No term for 'cell cycle protein' is defined in file '$old_OBO_file'";
                $ontology->create_rel( $protein, 'is_a', $cell_cycle_protein_term );
            }

            $ontology->create_rel( $protein, 'originates_from', $taxon );
            #$ontology->create_rel( $taxon, 'source_of', $protein);

            # add post-translationally modified derivatives of the protein
            # (each element is a reference to an array describing one FT line)
            FEATURE: foreach my $ft ( @{ $entry->FTs->{list} } ) {

                # select only lines for modified residues
                next FEATURE unless $ft->[0] eq 'MOD_RES';
                my ( $from_position, $to_position, $description, $qualifier ) = @{$ft}[ 1 .. 4 ];

                # this feature concerns only a single residue
                next FEATURE unless $from_position eq $to_position;

                my $has_qualifier = ( defined $qualifier && length $qualifier );
                my ( $mod_prot_name, $mod_prot_comment, $mod_prot_def );
                if ( $description =~ /(\S+);\s*?(\S+.*)/ ) {

                    # description contains the name of the modified residue
                    # separated by a semicolon from the rest
                    my ( $mod_residue, $comment ) = ( $1, $2 );
                    $mod_prot_name    = $protein_name . '-' . $mod_residue . $from_position;
                    $mod_prot_def     = "Protein $protein_name with the residue $from_position substituted with $mod_residue";
                    $mod_prot_comment = $has_qualifier ? "$comment; $qualifier" : $comment;
                } else {

                    # $description contains only the name of the modified residue
                    $mod_prot_name    = $protein_name . '-' . $description . $from_position;
                    $mod_prot_def     = "Protein $protein_name with the residue $from_position substituted with $description";
                    $mod_prot_comment = $qualifier if $qualifier;
                }

                # assign modified protein ID (also updates the species specific map)
                my $mod_prot_id = _cco_id_for( $short_map, $long_map, "B", $mod_prot_name );

                # create protein terms for modified proteins and add to ontology
                my $mod_prot_obj = OBO::Core::Term->new();
                $mod_prot_obj->name($mod_prot_name);
                $mod_prot_obj->id($mod_prot_id);
                $mod_prot_obj->def_as_string( $mod_prot_def, "[UniProtKB:$accession]" );
                $mod_prot_obj->xref_set_as_string("[UniProtKB:$accession]");
                $mod_prot_obj->comment($mod_prot_comment);
                $ontology->add_term($mod_prot_obj);
                $ontology->create_rel( $mod_prot_obj, 'is_a',            $modified_protein_term );
                $ontology->create_rel( $mod_prot_obj, 'originates_from', $taxon );
                #$ontology->create_rel( $taxon, 'source_of', $mod_prot_obj );
                $ontology->create_rel( $mod_prot_obj, 'transformation_of', $protein );
                $ontology->create_rel( $protein,      'transforms_into',   $mod_prot_obj );
            }

            # create or retrieve gene terms
            my @gene_groups = @{ $entry->GNs->{list} };

            # DB xrefs are only added when a single gene is associated with
            # the protein; with several genes they cannot be attributed.
            my $add_gene_xrefs = ( scalar @gene_groups == 1 );

            GENE_GROUP: foreach my $gene_group (@gene_groups) {

                # only one gene name is used  # TODO add gene synonyms
                my $gene_name = _first_gene_name($gene_group);
                unless ($gene_name) {

                    # without a name no gene ID can be assigned
                    warn "Skipping a gene group without any gene name in UniProt entry '$accession'";
                    next GENE_GROUP;
                }
                $gene_name .= $current_sufix;

                my $gene = $ontology->get_term_by_name($gene_name)
                  || new_gene( $gene_group, $short_map_g, $long_map_g, $accession, $definition, $current_sufix );

                if ($add_gene_xrefs) {
                    foreach my $db (@GENE_DBS) {
                        foreach my $xref ( @{ $dbxrefs->{list} } ) {
                            $gene->xref_set_as_string( '[' . $db . ':' . $xref->[1] . ']' )
                              if $xref->[0] eq $db;
                        }
                    }
                }
                $ontology->add_term($gene);

                # add relationships
                $ontology->create_rel( $gene,    'is_a',            $onto_gene );
                $ontology->create_rel( $protein, 'encoded_by',      $gene );
                $ontology->create_rel( $gene,    'codes_for',       $protein );    # inverse of 'encoded_by'
                $ontology->create_rel( $gene,    'originates_from', $taxon );
                #$ontology->create_rel( $taxon, 'source_of', $gene );
            }
        }
    }
    close($uniprot_fh);

    # Write the new ontology and maps to disk
    open( my $obo_fh, '>', $new_OBO_file )
      || die "Cannot write OBO file ($new_OBO_file): ", $!;
    $ontology->export($obo_fh);
    close($obo_fh)
      || die "Cannot close OBO file ($new_OBO_file): ", $!;

    $short_map->write_map();
    $long_map->write_map();
    $short_map_g->write_map();
    $long_map_g->write_map();

    return $ontology;
}

#############################################################################################################
#
# sub new_gene generates a new gene term with ID, name, definition, synonyms
#
# Args    - gene group object, species specific gene map, full gene map,
#           UniProt accession, protein definition, gene name suffix
# Returns - a new OBO::Core::Term; its name is the first non-empty name found
#           in 'Names', 'OLN', 'ORFNames' (in that order) plus the suffix, all
#           remaining names become EXACT synonyms
#
#############################################################################################################
sub new_gene {
    my ( $gene_group, $short_map, $long_map, $accession, $definition, $current_sufix ) = @_;
    $current_sufix = '' unless defined $current_sufix;

    my $gene = OBO::Core::Term->new();
    my $gene_name;

    # gene group object must contain at least one of the three types of names
    NAME_TYPE: foreach my $name_type (@GENE_NAME_TYPES) {

        # list of gene name objects of one particular type
        my @all_names = @{ _gene_names_of( $gene_group, $name_type ) };
        next NAME_TYPE unless @all_names;
        my ( $name, @names ) = @all_names;

        if ( !$gene_name ) {

            # this is the first existing name type to process
            # a bug in Swissknife (which version? 1.65?) - a list with a
            # reference to an empty hash is returned instead of an empty array
            # if the field 'Names' is empty, hence the check on the text
            $gene_name = ( ref $name && $name->{text} ) || next NAME_TYPE;
            $gene_name .= $current_sufix;
            $gene->name($gene_name);

            # get CCO id for the gene
            my $gene_id = _cco_id_for( $short_map, $long_map, "G", $gene_name );
            $gene->id($gene_id);

            # other names of this type, if any
            _add_gene_synonyms( $gene, $accession, @names );
        } else {

            # the name has already been assigned from another name type
            _add_gene_synonyms( $gene, $accession, $name, @names );
        }
    }

    # Add gene definition
    # Remark: if the gene is associated with multiple proteins the definition is derived from the first one
    # TODO take gene definitions from the original gene databases
    if ( $definition =~ /^(\w+.* gene) protein/ ) {
        $gene->def_as_string( $1, "UniProtKB:$accession" );
    } else {
        $gene->def_as_string( $definition . ' gene', "UniProtKB:$accession" );
    }

    return $gene;
}

# Returns the CCO id of $term_name from the species specific map, or draws a
# new id from the full map and records it in the species specific map.
sub _cco_id_for {
    my ( $short_map, $long_map, $subnamespace, $term_name ) = @_;

    my $cco_id;
    if ( $short_map->contains_value($term_name) ) {
        $cco_id = $short_map->get_cco_id_by_term($term_name);
    } else {
        $cco_id = $long_map->get_new_cco_id( "CCO", $subnamespace, $term_name );
        $short_map->put( $cco_id, $term_name );    # updates the species specific map
    }
    return $cco_id;
}

# Returns a reference to the list of name objects of one type ('Names', 'OLN'
# or 'ORFNames') of a gene group; an empty list if the type is absent.
sub _gene_names_of {
    my ( $gene_group, $name_type ) = @_;

    my $container = $gene_group->{$name_type};
    return [] unless ref $container;
    my $list = $container->{list};
    return ( ref $list eq 'ARRAY' ) ? $list : [];
}

# Returns the text of the first name of the first name type that has one
# (without suffix), or nothing if the gene group has no usable name.
sub _first_gene_name {
    my ($gene_group) = @_;

    foreach my $name_type (@GENE_NAME_TYPES) {
        my $first = _gene_names_of( $gene_group, $name_type )->[0];
        next unless ref $first;
        return $first->{text} if $first->{text};
    }
    return;
}

# Adds every name object that has a text as an EXACT synonym of the gene.
sub _add_gene_synonyms {
    my ( $gene, $accession, @names ) = @_;

    foreach my $name (@names) {
        next unless ref $name && $name->{text};
        $gene->synonym_as_string( $name->{text}, "[UniProtKB:$accession]", 'EXACT' );
    }
    return;
}

1;