The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: OrthoMCLParser.pm 2116 2008-05-23 12:12:21Z Erick Antezana $
#
# Module  : OrthoMCLParser.pm
# Purpose : Parse OrthoMCL files and add data to an ontology
# License : Copyright (c) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved.
#           This program is free software; you can redistribute it and/or
#           modify it under the same terms as Perl itself.
# Contact : CCO <ccofriends@psb.ugent.be>
#

package OBO::CCO::OrthoMCLParser;

=head1 NAME

OBO::CCO::OrthoMCLParser - An orthoMCL data to OBO translator

=head1 DESCRIPTION

Includes methods for adding information from orthoMCL data files to ontologies

orthoMCL can be obtained from http://orthomcl.cbil.upenn.edu/cgi-bin/OrthoMclWeb.cgi

The method 'parse()' parses an orthoMCL output data file into a data structure (hash of arrays)
The method 'work()' incorporates data from an orthoMCL data file into an ontology, writes the ontology into an OBO file, writes map files.

=head1 AUTHOR

Vladimir Mironov
vlmir@psb.ugent.be

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006, 2007, 2008 by Vladimir Mironov

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut

use OBO::Core::RelationshipType;
use OBO::Core::Relationship;
use OBO::Core::Term;
use OBO::CCO::CCO_ID_Term_Map;
use OBO::Core::Ontology;

use strict;
use warnings;
use Carp;

sub new {
    my $class = $_[0];
    my $self  = {};

    bless( $self, $class );
    return $self;
}

=head2 parse

  Usage    - $OrthoMCLParser->parse()
  Returns  - a reference to a data structure (hash of hashes) containing orthoMCL output data
  Args     - orthoMCL output data file
  Function - parses an orthoMCL output data file into a data structure
  
=cut

sub parse {
	my $self = shift;
	# get orthoMCL data file
	my $omclDataFile = shift;	
	open(my $FH, '<', $omclDataFile) || die "Cannot open file '$omclDataFile': $!";
	
	# parse orthoMCL output
	my %clusters; # %clusters{protein_name}{taxon_label}
	while(<$FH>){
		my ($cluster, $proteins) = split /:\s+/xms;
		my $cluster_num = $.-1;
		$cluster = $cluster_num;
		my @proteins = split /\s/xms, $proteins;
		foreach ( @proteins) {
			$_ =~/\A(\w+?)\((\w+?)\)/xms; # $1 is protein name, $2 is taxon label (e.g Hsa)
			$clusters{$cluster}->{$1} =  $2;			
		}
	}		
	close $FH;
	return \%clusters;
}

=head2 work

  Usage    - $OrthoMCLParser->work($omcl_data_ref, $file_names_ref, $taxa_ref)
  Returns  - OBO::Core::Ontology object 
  Args     - 1. reference to an orthoMCL data structure (output of parse()), 
           - 2. reference to a list of filenames (output OBO file, 4 map files (U,T,O,B, see the test file), 
           - 3. reference to a with taxons and taxon specific maps (see the test file)
  Function - converts orthoMCL data into an ontology, writes OBO and map files 
  
=cut

sub work {
	my ($self, $clust, $files,  $tax) = @_;
	my %clusters = %{$clust};
	my %taxa = %{$tax};
	# input/output files
	my ($new_OBO_file, $u_map_file, $t_map_file, $o_map_file, $b_map_file) = @{$files}; 
	
	# Initialize  maps (OBO::CCO::CCO_ID_Term_Map objects)
	my $u_map = OBO::CCO::CCO_ID_Term_Map->new($u_map_file);  # map for Upper Level Ontology terms
	my $t_map  = OBO::CCO::CCO_ID_Term_Map->new($t_map_file); # map for taxonomy terms
	my $o_map  = OBO::CCO::CCO_ID_Term_Map->new($o_map_file); # map for orthologous groups terms
	my $b_map  = OBO::CCO::CCO_ID_Term_Map->new($b_map_file); # map for biomolecule terms
	# taxon specific maps for biolmolecule terms
	foreach (keys %taxa) {
		push @{$taxa{$_}}, OBO::CCO::CCO_ID_Term_Map->new($taxa{$_}->[1]);
	}
	# @{$taxa{taxon_label}} contains now taxon name, taxon specific map file, taxon specific map object
	
	my $ontology = OBO::Core::Ontology->new();
	
	# populate ontology
	$ontology->add_relationship_type_as_string ('is_a',       'is_a');
	$ontology->add_relationship_type_as_string ('has_source', 'has_source');
	#$ontology->add_relationship_type_as_string ('source_of', 'source_of');
	
	my $protein = OBO::Core::Term->new();# upper level ontology term
	$protein->name('protein');
	$protein->id(assign_term_id($u_map, 'U', 'protein'));
	if ($u_map->contains_value('protein')) {
		$protein->id($u_map->get_cco_id_by_term('protein'));
	}else{
		$protein->id(assign_term_id($u_map, 'U', 'protein'));
	}
	$ontology->add_term($protein);
	
	foreach (keys %taxa) {
		my $taxon = OBO::Core::Term->new();
		my $taxon_lab = $_;
		my $tax_name = $taxa{$taxon_lab}->[0];
		$taxon->name($tax_name);
		$taxon->id(assign_term_id($t_map, 'T', $tax_name));
		$ontology->add_term($taxon);
		push @{$taxa{$taxon_lab}}, $taxon;
		#@{$taxa{$taxon_lab}} contains now taxon name, taxon specific map file, taxon specific map object, taxon term object
	}
	
	foreach (keys %clusters) {
		my $cluster = OBO::Core::Term->new();
		my $clust_num = $_;
		my $clust_name = "Type $_ protein";
		$cluster->name($clust_name);
		$cluster->id(assign_term_id($o_map, 'O', $clust_name));
		$cluster->def_as_string("A protein belonging to the orthological type $_ produced by orthoMCL", "[CCO:vm]");
		$ontology->create_rel($cluster, 'is_a', $protein);
		foreach (keys %{$clusters{$clust_num}}) {   # for each protein in the cluster
			my $prot_name = $_;
			my $taxon_lab = $clusters{$clust_num}{$prot_name};
			my $clust_protein = OBO::Core::Term->new();
			$clust_protein->name($prot_name);
			$clust_protein->id(assign_biomol_id($taxa{$taxon_lab}[2], $b_map, $prot_name));
			my $short_map = $taxa{$taxon_lab}[2];
			if ($short_map->contains_value($prot_name)) {
				$clust_protein->id($short_map->get_cco_id_by_term($prot_name));
			} else {
				$clust_protein->id(assign_biomol_id($short_map, $b_map, $prot_name));
			}
			$ontology->add_term($clust_protein);
			$ontology->create_rel($clust_protein, 'is_a', $cluster);
			
			$ontology->create_rel($clust_protein,         'has_source',   $taxa{$taxon_lab}->[3]);
			#$ontology->create_rel($taxa{$taxon_lab}->[3], 'source_of',         $clust_protein);
		}
	}
	
	# Write the new ontology and maps to disk
	open (my $FH, ">".$new_OBO_file) || die "Cannot write OBO file: ", $!;
	$ontology->export(\*$FH);
	close $FH;
	foreach ((keys %taxa)) {
		$taxa{$_}[2] -> write_map();
	}
	$b_map -> write_map();
	$o_map -> write_map();
	$t_map -> write_map();
	$u_map -> write_map();
	return $ontology;
}
#############################################################################################################
#
# sub assign_biomol_id
#
#############################################################################################################
sub assign_biomol_id {
	# used to obtain IDs for biomolecule terms 
	# arguments: taxon specific map (OBO::CCO::CCO_ID_Term_Map object), cumulative map for all taxa, protein name
	my ($short_map, $long_map, $term_name) = @_;
	my $term_id;
	if ($short_map->contains_value($term_name)) {#look up first in the species specific map
		$term_id = $short_map->get_cco_id_by_term($term_name);
		return $term_id;
	} else {
		$term_id = $long_map->get_new_cco_id( "CCO", "B", $term_name );
		$short_map->put( $term_id, $term_name ); # updates the species specific maps
		return  $term_id;
	}
}
#############################################################################################################
#
# sub assign_term_id
#
#############################################################################################################
sub assign_term_id {
	# used to obtain IDs for terms other than biomolecules
	# arguments: map (OBO::CCO::CCO_ID_Term_Map object), subdomain (a single character [BOTU]), term name
	my ($map, $subdomain, $term_name) = @_;
	my $term_id;
	if ($map->contains_value($term_name)) {
		$term_id = $map->get_cco_id_by_term($term_name);
		return $term_id;
	}
	else {
		$term_id = $map->get_new_cco_id( "CCO", $subdomain, $term_name );
	}
}
1;