# Copyright 2008-2010 Tim Rayner
#
# This file is part of Bio::MAGETAB.
#
# Bio::MAGETAB is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Bio::MAGETAB is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bio::MAGETAB.  If not, see <http://www.gnu.org/licenses/>.
#
# $Id: DataMatrix.pm 368 2012-05-28 15:49:02Z tfrayner $

package Bio::MAGETAB::Util::Reader::DataMatrix;

use Moose;
use MooseX::FollowPBP;

use Carp;

BEGIN { extends 'Bio::MAGETAB::Util::Reader::Tabfile' };

# The DataMatrix object being populated by this reader. Optional; if
# it is not supplied, parse() creates one.
has 'magetab_object'      => ( is         => 'rw',
                               isa        => 'Bio::MAGETAB::DataMatrix',
                               required   => 0 );

# This is used purely as a cache.
has '_array_design'       => ( is         => 'rw',
                               isa        => 'Bio::MAGETAB::ArrayDesign',
                               required   => 0 );

# Parse the data matrix file: the first non-ignored row is the node
# heading, the second is the quantitation type heading, and every
# subsequent row contributes one MatrixRow. Returns the populated
# DataMatrix object; croaks if the headings are missing or disagree
# in length.
sub parse {

    my ( $self ) = @_;

    # Find or create the target DataMatrix object.
    my $data_matrix;
    unless ( $data_matrix = $self->get_magetab_object() ) {

        # This is typically a stand-alone DM. FIXME consider type as
        # another attribute or argument to this reader object?
        my $type = $self->get_builder()->find_or_create_controlled_term({
            category => 'DataType',    # FIXME hard-coded.
            value    => 'unknown',
        });
        $data_matrix = $self->get_builder()->find_or_create_data_matrix({
            uri      => $self->get_uri(),
            dataType => $type,
        });
        $self->set_magetab_object( $data_matrix );
    }

    my $ad = $self->_determine_array_design( $data_matrix );
    $self->_set_array_design( $ad ) if $ad;

    # This has to be set for Text::CSV_XS.
    local $/ = $self->get_eol_char();

    my $qts;
    my $nodes;
    my $row_identifier_type;
    my $de_authority = q{};
    my $de_namespace = q{};
    my @matrix_rows;
    my $row_number = 1;

    FILE_LINE:
    while ( my $larry = $self->getline() ) {

        # Skip empty lines, comments.
        next FILE_LINE if $self->can_ignore( $larry );

        # Strip surrounding whitespace from each element.
        $larry = $self->strip_whitespace( $larry );

        if ( $row_number == 1 ) {
            $nodes = $self->_parse_node_heading( $larry );
        }
        elsif ( $row_number == 2 ) {
            ( $qts, $row_identifier_type, $de_namespace )
                = $self->_parse_qt_heading( $larry );

            # If namespace isn't explicitly given in the matrix
            # header, set it to the name of the enclosing array
            # design.
            if ( my $cached_design = $self->_get_array_design() ) {
                $de_authority = $cached_design->get_authority();
                unless ( defined $de_namespace ) {
                    $de_namespace = $cached_design->get_name();
                }
            }
        }
        else {
            my $element = $self->_parse_row_index(
                $larry->[0],
                $row_identifier_type,
                $de_namespace,
                $de_authority,
            );
            push @matrix_rows, $self->get_builder()->find_or_create_matrix_row({
                rowNumber     => $row_number,
                designElement => $element,
                data_matrix   => $data_matrix,
            });
        }

        # Only rows which were not ignored are counted.
        $row_number++;
    }

    # Confirm we've read to the end of the file.
    $self->confirm_full_parse();

    # Both heading rows must have been seen; without this guard the
    # array dereferences below would die with an opaque "undefined
    # value as an ARRAY reference" error on a truncated file.
    unless ( defined $nodes && defined $qts ) {
        croak(qq{Error: Data matrix must contain both a node heading row and a qt heading row.\n});
    }

    # Sanity check.
    unless ( scalar @{ $qts } == scalar @{ $nodes } ) {
        croak(qq{Error: Mismatch in number of nodes and qt column headings.\n});
    }

    # Create the MatrixColumn objects.
    my @matrix_columns;
    for my $col_number ( 0 .. $#{ $qts } ) {
        push @matrix_columns, $self->get_builder()->find_or_create_matrix_column({
            columnNumber     => $col_number,
            quantitationType => $qts->[ $col_number ],
            referencedNodes  => $nodes->[ $col_number ],
            data_matrix      => $data_matrix,
        });
    }

    $data_matrix->set_rowIdentifierType( $row_identifier_type );
    $data_matrix->set_matrixColumns( \@matrix_columns );
    $data_matrix->set_matrixRows( \@matrix_rows );
    $self->get_builder()->update( $data_matrix );

    return $data_matrix;
}

# Walk back along the input edges of $object looking for an Assay,
# and return the first array design found that way. Every input edge
# is tried in turn, so a branch which leads nowhere does not prevent
# a later branch from being searched. Returns nothing (undef in
# scalar context) if no array design can be found.
sub _determine_array_design {

    my ( $self, $object ) = @_;

    # Back out of the recursion quickly once we've found an
    # array. FIXME consider throwing an exception if we just continue
    # looking and find more than one.
    foreach my $inputEdge ( $object->get_inputEdges() ) {
        my $previous = $inputEdge->get_inputNode();

        my $array_design
            = UNIVERSAL::isa( $previous, 'Bio::MAGETAB::Assay' )
            ? $previous->get_arrayDesign()
            : $self->_determine_array_design( $previous );

        return $array_design if $array_design;
    }

    return;
}

# Parse the first heading row of the matrix. The first cell names the
# node type, which selects the builder getter used for the remaining
# cells; each remaining cell is a semicolon-delimited list of node
# identifiers. Returns a reference to an array holding one array
# reference of node objects per data column. Croaks if the node type
# is not recognised.
sub _parse_node_heading {

    my ( $self, $larry ) = @_;

    my @nodes;

    # The qr// keys are stringified by the hash and re-interpolated
    # into the match below. Note that under /x the trailing spaces in
    # these patterns are not significant.
    my %type_map = (
        qr/Normalization/ixms                               => 'get_normalization',
        qr/Scan /ixms                                       => 'get_data_acquisition',
        qr/Hybridization/ixms                               => 'get_assay',
        qr/Assay /ixms                                      => 'get_assay',
        qr/LabeledExtract/ixms                              => 'get_labeled_extract',
        qr/Extract /ixms                                    => 'get_extract',
        qr/Sample /ixms                                     => 'get_sample',
        qr/Source /ixms                                     => 'get_source',
        qr/(?:Derived)? [ ]? Array [ ]? Data [ ]? File/ixms => 'get_data_file',
    );

    my $getter;

    TYPE:
    while ( my ( $regex, $method ) = each %type_map ) {
        if ( $larry->[0] =~ /\A $regex [ ]? (?:REF)? \z/ixms ) {
            $getter = $method;
            last TYPE;
        }
    }

    unless ( $getter ) {
        croak(qq{Error: Unable to parse Node type from header "$larry->[0]".\n});
    }

    # Data files are internally identified by "uri"; all other nodes by "name".
    my $id_field = $getter eq 'get_data_file' ? 'uri' : 'name';

    foreach my $node_list ( @{ $larry }[ 1 .. $#{ $larry } ] ) {
        push @nodes, [
            map { $self->get_builder()->$getter({ $id_field => $_ }) }
                split /\s*;\s*/, $node_list,
        ];
    }

    return \@nodes;
}

# Parse the second heading row of the matrix. The first cell gives
# the row identifier type, optionally followed by a namespace tag
# (e.g. "Reporter REF:my_namespace" or "Term Source REF:embl"); the
# remaining cells are quantitation type names. Returns a three
# element list: a reference to an array of QuantitationType
# controlled terms, the row identifier type string, and the
# namespace (undef if none was given). Croaks if the row identifier
# type is not recognised.
sub _parse_qt_heading {

    my ( $self, $larry ) = @_;

    my %type_map = (
        qr/Reporter/ixms               => 'Reporter',
        qr/Composite [ ]? Element/ixms => 'Composite Element',
        qr/Term [ ]? Source/ixms       => 'Term Source',
        qr/Coordinates/ixms            => 'Coordinates',
    );

    my $row_identifier_type;

    TYPE:
    while ( my ( $regex, $id_type ) = each %type_map ) {

        # The optional ":namespace" tail must be allowed here,
        # otherwise a heading which carries a namespace could never
        # be recognised.
        if ( $larry->[0] =~ /\A $regex [ ]? (?: REF (?: : .* )? )? \z/ixms ) {
            $row_identifier_type = $id_type;
            last TYPE;
        }
    }

    unless ( $row_identifier_type ) {
        croak(qq{Error: Unable to parse rowIdentifierType from header "$larry->[0]".\n});
    }

    my ( $namespace ) = ( $larry->[0] =~ / REF:(.*) \z/ixms );

    # An empty tag ("Reporter REF:") is treated as no namespace at
    # all, so that the caller's defaulting still applies.
    if ( defined $namespace && ! length $namespace ) {
        $namespace = undef;
    }

    my @qts;
    foreach my $qt ( @{ $larry }[ 1 .. $#{ $larry } ] ) {
        push @qts, $self->get_builder()->find_or_create_controlled_term({
            category => 'QuantitationType',    # FIXME hard-coded
            value    => $qt,
        });
    }

    return ( \@qts, $row_identifier_type, $namespace );
}

# Convert the first cell of a data row into a design element object
# by calling the appropriate builder method. $row_identifier_type is
# one of the strings returned by _parse_qt_heading. Croaks if a
# Coordinates identifier cannot be parsed, or if a Term Source
# identifier is used without a Term Source name in $de_namespace.
sub _parse_row_index {

    my ( $self, $index, $row_identifier_type, $de_namespace, $de_authority ) = @_;

    my %type_map = (
        'Reporter'          => 'reporter',
        'Composite Element' => 'composite_element',
        'Term Source'       => 'reporter',
        'Coordinates'       => 'reporter',
    );

    # We constrain how objects are created depending on
    # $row_identifier_type; for Term Source or Coordinates we don't
    # require that the parser already know about the objects. Note
    # that the Builder object may still override strict parsing for
    # Reporter/CompositeElement.
    my %creates_element = (
        'Coordinates' => 1,
        'Term Source' => 1,
    );
    my $method_prefix = $creates_element{ $row_identifier_type }
                      ? 'find_or_create_'
                      : 'get_';

    my $method = $method_prefix . $type_map{ $row_identifier_type };

    # Start off with just Reporter/CompositeElement; we add different
    # arguments for Coordinates/Term Source style design elements.
    my $args = {
        name      => $index,
        namespace => $de_namespace,
        authority => $de_authority,
    };

    if ( $row_identifier_type eq 'Coordinates' ) {
        my ( $chr, $start, $end )
            = ( $index =~ /([^:-]+) : (\d+) - (\d+)/ixms );
        unless ( defined $chr && defined $start && defined $end ) {
            croak(qq{Error: unable to parse Coordinates "$index"; must be in chrX:123-456 format.\n});
        }
        $args->{'chromosome'}    = $chr;
        $args->{'startPosition'} = $start;
        $args->{'endPosition'}   = $end;
    }
    elsif ( $row_identifier_type eq 'Term Source' ) {

        # Term Source Name must be in the $de_namespace (this is in
        # the MAGE-TAB spec).
        unless ( $de_namespace ) {
            croak(qq{Error: Data Matrix Term Source must include Name, e.g. "Term Source REF:embl".\n});
        }
        my $ts_obj = $self->get_builder()->get_term_source({
            'name' => $de_namespace,
        });

        # FIXME consider supporting semicolon-delimited accessions
        # here (not currently part of MAGE-TAB spec).
        my $term = $self->get_builder()->find_or_create_database_entry({
            accession  => $index,
            termSource => $ts_obj,
        });
        $args->{'databaseEntries'} = [ $term ];

        # In such cases this isn't really a namespace, so we get rid of it.
        delete $args->{'namespace'};
    }

    # Reporter and CompositeElement need nothing beyond the name,
    # namespace and authority already set above.

    my $element = $self->get_builder()->$method( $args );

    return $element;
}

# Make the classes immutable. In theory this speeds up object
# instantiation for a small compilation time cost.
__PACKAGE__->meta->make_immutable();

no Moose;

=head1 NAME

Bio::MAGETAB::Util::Reader::DataMatrix - Data matrix parser class.

=head1 SYNOPSIS

 use Bio::MAGETAB::Util::Reader::DataMatrix;
 my $parser = Bio::MAGETAB::Util::Reader::DataMatrix->new({
     uri => $dm_filename,
 });
 my $data_matrix = $parser->parse();

=head1 DESCRIPTION

This class is used to parse data matrix files. It can be used on its
own, but more often you will want to use the main
Bio::MAGETAB::Util::Reader class which handles extended parsing
options more transparently.

=head1 ATTRIBUTES

See the L<Bio::MAGETAB::Util::Reader::Tabfile> class for superclass
attributes.

=over 2

=item magetab_object

A Bio::MAGETAB::DataMatrix object. This can either be set upon
instantiation, or a new object will be created for you. It can be
retrieved at any time using C<get_magetab_object>.

=back

=head1 METHODS

=over 2

=item parse

Parse the data matrix pointed to by C<$self-E<gt>get_uri()>. Returns
the Bio::MAGETAB::DataMatrix object updated with the data matrix
contents.

=back

=head1 SEE ALSO

L<Bio::MAGETAB::Util::Reader>
L<Bio::MAGETAB::Util::Reader::Tabfile>
L<Bio::MAGETAB::DataMatrix>

=head1 AUTHOR

Tim F. Rayner

=head1 LICENSE

This library is released under version 2 of the GNU General Public
License (GPL).

=cut

1;