######### # Author: rpettett@cpan.org # Maintainer: rpettett@cpan.org # Created: 2005-08-23 # Last Modified: $Date: 2011-05-06 11:18:40 +0100 (Fri, 06 May 2011) $ $Author: zerojinx $ # Source: $Source: /var/lib/cvsd/cvsroot/Bio-DasLite/Bio-DasLite/lib/Bio/Das/Lite.pm,v $ # Id: $Id: Lite.pm 53 2011-05-06 10:18:40Z zerojinx $ # $HeadURL $ # package Bio::Das::Lite; use strict; use warnings; use WWW::Curl::Multi; use WWW::Curl::Easy; # CURLOPT imports use HTTP::Response; use Carp; use English qw(-no_match_vars); use Readonly; our $DEBUG = 0; our $VERSION = '2.11'; Readonly::Scalar our $TIMEOUT => 5; Readonly::Scalar our $REG_TIMEOUT => 15; Readonly::Scalar our $LINKRE => qr{]*?>([^<]*)|]*?/>}smix; Readonly::Scalar our $NOTERE => qr{]*>([^<]*)}smix; Readonly::Scalar our $DAS_STATUS_TEXT => { 200 => '200 OK', 400 => '400 Bad command (command not recognized)', 401 => '401 Bad data source (data source unknown)', 402 => '402 Bad command arguments (arguments invalid)', 403 => '403 Bad reference object', 404 => '404 Requested object unknown', 405 => '405 Coordinate error', 500 => '500 Server error', 501 => '501 Unimplemented feature', }; ######### # $ATTR contains information about document structure - tags, attributes and subparts # This is split up by call to reduce the number of tag passes for each response # our %COMMON_STYLE_ATTRS = ( zindex => [], height => [], fgcolor => [], bgcolor => [], label => [], bump => [], ); our %SCORED_STYLE_ATTRS = ( min => [], max => [], steps => [], color1 => [], color2 => [], color3 => [], height => [], ); our $ATTR = { '_segment' => { 'segment' => [qw(id start stop version label)], }, # feature notes and links are special cases and taken care of elsewhere 'feature' => { 'feature' => [qw(id label)], 'method' => [qw(id cvId)], 'type' => [qw(id category reference subparts superparts cvId)], 'target' => [qw(id start stop)], 'start' => [], 'end' => [], 'orientation' => [], 'phase' => [], 'score' => [], 'parent' => { 'parent' => [qw(id)] }, 'part' => { 'part' => [qw(id)] }, }, 'sequence' => { 'sequence' => [qw(id start stop version label)], }, # NOTE: The dna command is deprecated: 'dna' => { 'sequence' => { 'sequence' => [qw(id start stop version)], 'dna' => [qw(length)], }, }, 'entry_points' => { 'entry_points' => [qw(href total start end)], 'segment' => { 'segment' => [qw(id start stop type orientation subparts version)], }, }, # NOTE: The dsn command is deprecated: 'dsn' => { 'dsn' => [], 'source' => [qw(id)], 'mapmaster' => [], 'description' => [], }, 'type' => { 'type' => [qw(id category cvId)], 'segment' => [qw(id start stop version label)], }, 'alignment' => { 'alignment' => [qw(name alignType max)], 'alignobject' => { 'alignobject' => [qw(objVersion intObjectId type dbSource dbVersion dbAccessionId dbCoordSys)], 'alignobjectdetail' => { 'alignobjectdetail' => [qw(dbSource property)], }, 'sequence' => [], }, 'score' => [qw(score)], 'block' => { 'block' => [qw(blockOrder)], 'segment' => { 'segment' => [qw(intObjectId start end orientation)], 'cigar' => [], }, }, }, 'structure' => { 'object' => [qw(dbAccessionId inObjectId objectVersion type dbSource dbVersion dbCoordSys)], 'chain' => { 'chain' => [qw(id SwissprotId model)], 'group' => { 'group' => [qw(name type groupID)], 'atom' => { 'atom' => [qw(atomID occupancy tempFactor altLoc atomName x y z)] }, }, }, 'het' => { 'group' => { 'group' => [qw(name type groupID)], 'atom' => { 'atom' => [qw(atomId occupancy tempFactor altLoc atomName x y z)] }, }, }, 'connect' => { 'connect' => [qw(atomSerial type)], 'atomID' => { 'atomID' => [qw(atomID)], }, }, }, 'sources' => { 'source' => { 'source' => [qw(uri title doc_href description)], 'maintainer' => { 'maintainer' => [qw(email)], }, 'version' => { 'version' => [qw(uri created)], 'coordinates' => { 'coordinates' => [qw(uri source authority taxid test_range version)], }, 'capability' => { 'capability' => [qw(type query_uri)], }, 'prop' => { 'prop' => [qw(name value)], }, }, }, }, 'stylesheet' => { 'stylesheet' => [qw(version)], 'category' => { 'category' => [qw(id)], 'type' => { 'type' => [qw(id)], 'glyph' => { 'glyph' => [qw(zoom)], 'arrow' => { 'parallel' => [], 'southwest' => [], 'northeast' => [], %COMMON_STYLE_ATTRS, }, 'anchored_arrow' => { 'parallel' => [], %COMMON_STYLE_ATTRS, }, 'box' => { 'linewidth' => [], 'pattern' => [], # WTSI extension %COMMON_STYLE_ATTRS, }, 'cross' => { %COMMON_STYLE_ATTRS, }, 'dot' => \%COMMON_STYLE_ATTRS, 'ex' => { %COMMON_STYLE_ATTRS, }, 'hidden' => {}, 'line' => { 'style' => [], %COMMON_STYLE_ATTRS, }, 'span' => { %COMMON_STYLE_ATTRS, }, 'text' => { 'font' => [], 'fontsize' => [], 'string' => [], #'style' => [], HANDLED SEPARATELY 'fgcolor' => [], 'bgcolor' => [], 'label' => [], 'bump' => [], }, 'primers' => \%COMMON_STYLE_ATTRS, 'toomany' => { 'linewidth' => [], %COMMON_STYLE_ATTRS, }, 'triangle' => { 'linewidth' => [], 'direction' => [], %COMMON_STYLE_ATTRS, }, 'gradient' => \%SCORED_STYLE_ATTRS, 'histogram' => \%SCORED_STYLE_ATTRS, 'lineplot' => \%SCORED_STYLE_ATTRS, }, }, }, }, }; ######### # $OPTS contains information about parameters to use for queries # our $OPTS = { 'feature' => [qw(segment type category categorize feature_id maxbins)], 'type' => [qw(segment)], 'sequence' => [qw(segment)], 'dna' => [qw(segment)], 'entry_points' => [qw(rows)], 'dsn' => [], 'sources' => [], 'stylesheet' => [], 'alignment' => [qw(query rows subject subjectcoordsys)], 'structure' => [qw(query)], }; sub new { my ($class, $ref) = @_; $ref ||= {}; my $self = { 'dsn' => [], 'timeout' => $TIMEOUT, 'data' => {}, 'caching' => 1, 'registry' => [qw(http://www.dasregistry.org/das)], '_registry_sources' => [], }; bless $self, $class; if($ref && ref $ref) { for my $arg (qw(dsn timeout caching callback registry user_agent http_proxy proxy_user proxy_pass no_proxy)) { if(exists $ref->{$arg} && $self->can($arg)) { $self->$arg($ref->{$arg}); } } } elsif($ref) { $self->dsn($ref); } return $self; } sub new_from_registry { my ($class, $ref) = @_; my $user_timeout = defined $ref->{timeout} ? 1 : 0; my $self = $class->new($ref); # If the user specifies a timeout, use it. # But if not, temporarily increase the timeout for the registry request. if (!$user_timeout) { $self->timeout($REG_TIMEOUT); } my $sources = $self->registry_sources($ref); # And reset it back to the "normal" non-registry timeout. if (!$user_timeout) { $self->timeout($TIMEOUT); } $self->dsn([map { $_->{'url'} } @{$sources}]); return $self; } # We implement this method because LWP does not parse user/password sub http_proxy { my ($self, $proxy) = @_; if($proxy) { $self->{'http_proxy'} = $proxy; } if(!$self->{'_checked_http_proxy_env'}) { $self->{'http_proxy'} ||= $ENV{'http_proxy'} || q(); $self->{'_checked_http_proxy_env'} = 1; } if($self->{'http_proxy'} =~ m{^(https?://)(\S+):(.*?)\@(.*?)$}smx) { ######### # http_proxy contains username & password - we'll set them up here: # $self->proxy_user($2); $self->proxy_pass($3); $self->{'http_proxy'} = "$1$4"; } return $self->{'http_proxy'}; } sub no_proxy { my ($self, @args) = @_; if (scalar @args) { if ($args[0] && ref $args[0] && ref $args[0] eq 'ARRAY') { $self->{'no_proxy'} = $args[0]; } else { $self->{'no_proxy'} = \@args; } } if(!$self->{'_checked_no_proxy_env'}) { $self->{'no_proxy'} ||= [split /\s*,\s*/smx, $ENV{'no_proxy'} || q()]; $self->{'_checked_no_proxy_env'} = 1; } return $self->{'no_proxy'} || []; } sub _get_set { my ($self, $key, $value) = @_; if(defined $value) { $self->{$key} = $value; } return $self->{$key}; } sub proxy_user { my ($self, $val) = @_; return $self->_get_set('proxy_user', $val); } sub proxy_pass { my ($self, $val) = @_; return $self->_get_set('proxy_pass', $val); } sub user_agent { my ($self, $val) = @_; return $self->_get_set('user_agent', $val) || "Bio::Das::Lite v$VERSION"; } sub timeout { my ($self, $val) = @_; return $self->_get_set('timeout', $val); } sub caching { my ($self, $val) = @_; return $self->_get_set('caching', $val); } sub max_hosts { my ($self, $val) = @_; carp 'WARNING: max_hosts method is decprecated and has no effect'; return $self->_get_set('_max_hosts', $val); } sub max_req { my ($self, $val) = @_; carp 'WARNING: max_req method is decprecated and has no effect'; return $self->_get_set('_max_req', $val); } sub callback { my ($self, $val) = @_; return $self->_get_set('callback', $val); } sub basename { my ($self, $dsn) = @_; $dsn ||= $self->dsn(); my @dsns = (ref $dsn)?@{$dsn}:$dsn; my @res = (); for my $service (@dsns) { $service =~ m{(https?://.*/das)/?}smx; if($1) { push @res, $1; } } return \@res; } sub dsn { my ($self, $dsn) = @_; if($dsn) { if(ref $dsn eq 'ARRAY') { $self->{'dsn'} = $dsn; } else { $self->{'dsn'} = [$dsn]; } } return $self->{'dsn'}; } sub dsns { my ($self, $query, $opts) = @_; $opts ||= {}; $opts->{'use_basename'} = 1; return $self->_generic_request($query, 'dsn', $opts); } sub entry_points { my ($self, $query, $opts) = @_; return $self->_generic_request($query, 'entry_points', $opts); } sub types { my ($self, $query, $opts) = @_; return $self->_generic_request($query, 'type(s)', $opts); } sub features { my ($self, $query, $callback, $opts) = @_; if(ref $callback eq 'HASH' && !defined $opts) { $opts = $callback; undef $callback; } if($callback) { $self->{'callback'} = $callback; } return $self->_generic_request($query, 'feature(s)', $opts); } sub sequence { my ($self, $query, $opts) = @_; return $self->_generic_request($query, 'sequence', $opts); } sub dna { my ($self, $query, $opts) = @_; return $self->_generic_request($query, 'dna', $opts); } sub alignment { my ($self, $opts) = @_; return $self->_generic_request($opts, 'alignment'); } sub structure { my ($self, $opts) = @_; return $self->_generic_request($opts, 'structure'); } sub sources { my ($self, $opts) = @_; return $self->_generic_request($opts, 'sources'); } sub stylesheet { my ($self, $callback, $opts) = @_; if(ref $callback eq 'HASH' && !defined $opts) { $opts = $callback; undef $callback; } if($callback) { $self->{'callback'} = $callback; } return $self->_generic_request(undef, 'stylesheet', $opts); } ######### # Private methods # ######### # Build the query URL; perform an HTTP fetch; drop into the recursive parser; apply any post-processing # sub _generic_request { my ($self, $query, $fname, $opts) = @_; $opts ||= {}; delete $self->{'currentsegs'}; my $results = {}; my $reqname = $fname; $reqname =~ s/(?:[(]|[)])//smxg; ($fname) = $fname =~ /^([[:lower:]_]+)/smx; my $ref = $self->build_requests({ query => $query, fname => $fname, reqname => $reqname, opts => $opts, results => $results }); $self->_fetch($ref, $opts->{'headers'}); $DEBUG and print {*STDERR} qq(Content retrieved\n); $self->postprocess($fname, $results); ######### # deal with caching # if($self->{'caching'}) { $DEBUG and print {*STDERR} qq(Performing cache handling\n); for my $s (keys %{$results}) { if($DEBUG && !$results->{$s}) { print {*STDERR} qq(CACHE HIT for $s\n); ## no critic (InputOutput::RequireCheckedSyscalls) } $results->{$s} ||= $self->{'_cache'}->{$s}; $self->{'_cache'}->{$s} ||= $results->{$s}; } } return $results; } sub build_queries { my ($self, $query, $fname) = @_; my @queries; if($query) { if(ref $query eq 'HASH') { ######### # If the query param was a hashref, stitch the parts together # push @queries, join q(;), map { "$_=$query->{$_}" } grep { $query->{$_} } @{$OPTS->{$fname}}; } elsif(ref $query eq 'ARRAY') { ######### # If the query param was an arrayref # if(ref $query->[-1] eq 'CODE') { ######### # ... and the last arg is a code-block, set up the callback for this run and remove the arg # $self->callback($query->[-1]); pop @{$query}; } if(ref $query->[0] eq 'HASH') { ######### # ... or if the first array arg is a hash, stitch the series of queries together # push @queries, map { ## no critic (ProhibitComplexMappings) my $q = $_; join q(;), map { "$_=$q->{$_}" } grep { $q->{$_} } @{$OPTS->{$fname}}; } @{$query}; } else { ######### # ... but otherwise assume it's a plain segment string # push @queries, map { "segment=$_"; } @{$query}; } } else { ######### # and if it wasn't a hashref or an arrayref, then assume it's a plain segment string # push @queries, "segment=$query"; } } else { ######### # Otherwise we've no idea what you're trying to do # push @queries, q(); } return \@queries; } sub _hack_fname { my ($self, $fname) = @_; ######### # Sucky hacks # if($fname eq 'structure') { $fname = 'dasstructure'; } elsif($fname eq 'dna') { $fname = 'sequence'; } return $fname; } sub build_requests { my ($self, $args) = @_; my $query = $args->{query}; my $fname = $args->{fname}; my $reqname = $args->{reqname}; my $opts = $args->{opts}; my $results = $args->{results}; my $queries = $self->build_queries($query, $fname); my $attr = $ATTR->{$fname}; my $dsn = $opts->{'use_basename'}?$self->basename():$self->dsn(); my @bn = @{$dsn}; my $ref = {}; for my $bn (@bn) { ######### # loop over dsn basenames # $bn =~ s/\/+$//smx; for my $request (map { $_ ? "$bn/$reqname?$_" : "$bn/$reqname" } @{$queries}) { ######### # and for each dsn, loop over the query request # if($self->{'caching'} && $self->{'_cache'}->{$request}) { ######### # the key has to be present, but the '0' callback will be ignored by _fetch # $results->{$request} = 0; next; } $results->{$request} = []; $ref->{$request} = sub { my $data = shift || q(); $self->{'data'}->{$request} .= $data; if(!$self->{'currentsegs'}->{$request}) { ######### # If we haven't yet found segment information for this request # Then look for some. This one is a non-destructive scan. # my $matches = $self->{'data'}->{$request} =~ m{(]*>)}smix; if($matches) { my $seginfo = []; $self->_parse_branch({ request => $request, seginfo => $seginfo, attr => $ATTR->{'_segment'}, blk => $1, addseginfo => 0, }); $self->{'currentsegs'}->{$request} = $seginfo->[0]; } } if($DEBUG) { print {*STDERR} qq(invoking _parse_branch for $fname\n) or croak $ERRNO; } ######### # Sucky hacks # if($fname eq 'dna') { $attr = $attr->{'sequence'}; } $fname = $self->_hack_fname($fname); my $pat = qr{(<$fname.*?/$fname>|<$fname[^>]+/>)}smix; while($self->{'data'}->{$request} =~ s/$pat//smx) { $self->_parse_branch({ request => $request, seginfo => $results->{$request}, attr => $attr, blk => $1, addseginfo => 1, }); } if($DEBUG) { print {*STDERR} qq(completed _parse_branch\n) or croak $ERRNO; } return; }; } } return $ref; } sub postprocess { my ($self, $fname, $results) = @_; $fname = $self->_hack_fname($fname); ######### # Add in useful segment information for empty segments # In theory there should only ever be one element in @{$self->{'seginfo'}} # as requests are parallelised by segment # for my $req (keys %{$results}) { if(!$results->{$req} || scalar @{$results->{$req}} == 0) { $results->{$req} = $self->{'currentsegs'}->{$req}; } } ######### # fix ups # if($fname eq 'entry_points') { $DEBUG and print {*STDERR} qq(Running postprocessing for entry_points\n); for my $s (keys %{$results}) { my $res = $results->{$s} || []; for my $r (@{$res}) { delete $r->{'segment_id'}; } } } elsif($fname eq 'sequence') { $DEBUG and print {*STDERR} qq(Running postprocessing for dna\n); for my $s (keys %{$results}) { my $res = $results->{$s} || []; for my $r (@{$res}) { if(exists $r->{'dna'}) { $r->{'dna'} =~ s/\s+//smgx; } elsif(exists $r->{'sequence'}) { $r->{'sequence'} =~ s/\s+//smgx; } } } } return; } ######### # Set up the parallel HTTP fetching # This uses our LWP::Parallel::UserAgent subclass which handles DAS statuses # sub _fetch { my ($self, $url_ref, $headers) = @_; $self->{'statuscodes'} = {}; $self->{'specversions'} = {}; if(!$headers) { $headers = {}; } if($ENV{HTTP_X_FORWARDED_FOR}) { $headers->{'X-Forwarded-For'} ||= $ENV{'HTTP_X_FORWARDED_FOR'}; } $headers->{'X-DAS-Version'} ||= '1.6'; # Convert header pairs to strings my @headers; for my $h (keys %{ $headers }) { push @headers, "$h: " . $headers->{$h}; } # We will now issue the actual requests. Due to insufficient support for error # handling and proxies, we can't use WWW::Curl::Simple. So we generate a # WWW::Curl::Easy object here, and register it with WWW::Curl::Multi. my $curlm = WWW::Curl::Multi->new(); my %reqs; my $i = 0; # First initiate the requests for my $url (keys %{$url_ref}) { if(ref $url_ref->{$url} ne 'CODE') { next; } $DEBUG and print {*STDERR} qq(Building WWW::Curl::Easy for $url [timeout=$self->{'timeout'}] via $url_ref->{$url}\n); $i++; my $curl = WWW::Curl::Easy->new(); $curl->setopt( CURLOPT_NOPROGRESS, 1 ); $curl->setopt( CURLOPT_FOLLOWLOCATION, 1 ); $curl->setopt( CURLOPT_USERAGENT, $self->user_agent ); $curl->setopt( CURLOPT_URL, $url ); if (scalar @headers) { $curl->setopt( CURLOPT_HTTPHEADER, \@headers ); } my ($body_ref, $head_ref); open my $fileb, q[>], \$body_ref or croak 'Error opening data handle'; ## no critic (RequireBriefOpen) $curl->setopt( CURLOPT_WRITEDATA, $fileb ); open my $fileh, q[>], \$head_ref or croak 'Error opening header handle'; ## no critic (RequireBriefOpen) $curl->setopt( CURLOPT_WRITEHEADER, $fileh ); # we set this so we have the ref later on $curl->setopt( CURLOPT_PRIVATE, $i ); $curl->setopt( CURLOPT_TIMEOUT, $self->timeout || $TIMEOUT ); #$curl->setopt( CURLOPT_CONNECTTIMEOUT, $self->connection_timeout || 2 ); $self->_fetch_proxy_setup($curl); $curlm->add_handle($curl); $reqs{$i} = { 'uri' => $url, 'easy' => $curl, 'head' => \$head_ref, 'body' => \$body_ref, }; } $DEBUG and print {*STDERR} qq(Requests submitted. Waiting for content\n); $self->_receive($url_ref, $curlm, \%reqs); return; } sub _fetch_proxy_setup { my ($self, $curl) = @_; if ( my $proxy = $self->http_proxy ) { if ( defined $Bio::Das::Lite::{CURLOPT_PROXY} ) { $curl->setopt( &CURLOPT_PROXY, $proxy ); ## no critic (ProhibitAmpersandSigils) } else { croak 'Trying to set a proxy, but your version of libcurl does not support this feature'; } } if ( my $proxy_user = $self->proxy_user ) { if ( defined $Bio::Das::Lite::{CURLOPT_PROXYUSERNAME} ) { $curl->setopt( &CURLOPT_PROXYUSERNAME, $proxy_user ); ## no critic (ProhibitAmpersandSigils) } else { croak 'Trying to set a proxy username, but your version of libcurl does not support this feature'; } } if ( my $proxy_pass = $self->proxy_pass ) { if ( defined $Bio::Das::Lite::{CURLOPT_PROXYPASSWORD} ) { $curl->setopt( &CURLOPT_PROXYPASSWORD, $proxy_pass ); ## no critic (ProhibitAmpersandSigils) } else { croak 'Trying to set a proxy password, but your version of libcurl does not support this feature'; } } my @no_proxy = @{ $self->no_proxy }; if ( scalar @no_proxy ) { if ( defined $Bio::Das::Lite::{CURLOPT_NOPROXY} ) { $curl->setopt( &CURLOPT_NOPROXY, join q(,), @no_proxy ); ## no critic (ProhibitAmpersandSigils) } else { croak 'Trying to set proxy exclusions, but your version of libcurl does not support this feature'; } } return; } sub _receive { my ($self, $url_ref, $curlm, $reqs) = @_; # Now check for results as they come back my $i = scalar keys %{ $reqs }; while ($i) { my $active_transfers = $curlm->perform; if ($active_transfers != $i) { while (my ($id,$retcode) = $curlm->info_read) { $id || next; $i--; my $req = $reqs->{$id}; my $uri = $req->{'uri'}; my $head = ${ $req->{'head'} } || q(); my $body = ${ $req->{'body'} } || q(); # We got a response from the server: if ($retcode == 0) { my $res = HTTP::Response->parse( $head . "\n" . $body ); my $msg; # Workaround for redirects, which result in multiple headers: while ($res->content =~ /^HTTP\/\d+\.\d+\s\d+/mxs) { # check for status line like "HTTP/1.1 200 OK" $res = HTTP::Response->parse( $res->content ); } $self->{specversions}->{$uri} = $res->header('X-DAS-Version'); # Prefer X-DAS-Status my ($das_status) = ($res->header('X-DAS-Status') || q()) =~ m/^(\d+)/smx; if ($das_status) { $msg = $self->{statuscodes}->{$uri} = $DAS_STATUS_TEXT->{$das_status}; # just in case we get a status we don't understand: $msg ||= $das_status . q( ) . ($res->message || 'Unknown status'); } # Fall back to HTTP status else { $msg = $res->status_line; # workaround for bug in HTTP::Response parse method: $msg =~ s/\r//gsmx; } $self->{statuscodes}->{$uri} = $msg; $url_ref->{$uri}->($res->content); # run the content handling code } # A connection error, timeout etc (NOT an HTTP status): else { $self->{statuscodes}->{$uri} = '500 ' . $req->{'easy'}->strerror($retcode); } delete($reqs->{$id}); # put out of scope to free memory } } } return; } sub statuscodes { my ($self, $url) = @_; $self->{'statuscodes'} ||= {}; return $url?$self->{'statuscodes'}->{$url}:$self->{'statuscodes'}; } sub specversions { my ($self, $url) = @_; $self->{'specversions'} ||= {}; return $url ? $self->{'specversions'}->{$url} : $self->{'specversions'}; } ######### # Using the $attr structure describing the structure of this branch, # recursively parse the XML blocks and build the corresponding response data structure # sub _parse_branch { my ($self, $args) = @_; my $dsn = $args->{request}; my $ar_ref = $args->{seginfo}; my $attr = $args->{attr}; my $blk = $args->{blk}; my $addseginfo = $args->{addseginfo}; my $depth = $args->{depth} || 0; my $ref = {}; my (@parts, @subparts); while(my ($k, $v) = each %{$attr}) { if(ref $v eq 'HASH') { push @subparts, $k; } else { push @parts, $k; } } ######### # recursive child-node handling, usually for s # for my $subpart (@subparts) { my $subpart_ref = []; my $pat = qr{(<$subpart[^>]*/>|<$subpart[^>]*?(?!/)>.*?/$subpart>)}smix; while($blk =~ s/$pat//smx) { $self->_parse_branch({ request => $dsn, seginfo => $subpart_ref, attr => $attr->{$subpart}, blk => $1, addseginfo => 0, depth => $depth+1, }); } if(scalar @{$subpart_ref}) { $ref->{$subpart} = $subpart_ref; } ######### # To-do: normalise group data across features here - mostly for 'group' tags in feature responses # i.e. merge links, use cached hashrefs (keyed on group id) describing groups to reduce the parsed tree footprint # NOTE: groups are now deprecated # } ######### # Attribute processing for tags in blocks # my $tmp; for my $tag (@parts) { my $opts = $attr->{$tag}||[]; for my $a (@{$opts}) { ($tmp) = $blk =~ m{<$tag[^>]*\s+$a="([^"]+?)"}smix; if(defined $tmp) { $ref->{"${tag}_$a"} = $tmp; } } ($tmp) = $blk =~ m{<$tag[^>]*>([^<]+)}smix; if(defined $tmp) { $tmp =~ s/^\s+$//smgx; if(length $tmp) { $ref->{$tag} = $tmp; } } if($tmp && $DEBUG) { print {*STDERR} q( )x($depth*2), qq( $tag = $tmp\n); ## no critic (InputOutput::RequireCheckedSyscalls) } } $self->_parse_twig($dsn, $blk, $ref, $addseginfo); push @{$ar_ref}, $ref; $DEBUG and print {*STDERR} q( )x($depth*2), qq(leaving _parse_branch\n); ######### # only perform callbacks if we're at recursion depth zero # if($depth == 0 && $self->{'callback'}) { $DEBUG and print {*STDERR} q( )x($depth*2), qq(executing callback at depth $depth\n); $ref->{'dsn'} = $dsn; my $callback = $self->{'callback'}; &{$callback}($ref); } return q(); } sub _parse_twig { my ($self, $dsn, $blk, $ref, $addseginfo) = @_; ######### # handle multiples of twig elements here # $blk =~ s/$LINKRE/{ $ref->{'link'} ||= []; push @{$ref->{'link'}}, { 'href' => $1 || $3, 'txt' => $2, }; q() }/smegix; $blk =~ s/$NOTERE/{ $ref->{'note'} ||= []; push @{$ref->{'note'}}, $1; q() }/smegix; if($addseginfo && $self->{'currentsegs'}->{$dsn}) { while(my ($k, $v) = each %{$self->{'currentsegs'}->{$dsn}}) { $ref->{$k} = $v; } } return; } sub registry { my ($self, @reg) = @_; if((scalar @reg == 1) && (ref $reg[0]) && (ref$reg[0] eq 'ARRAY')) { push @{$self->{'registry'}}, @{$reg[0]}; } else { push @{$self->{'registry'}}, @reg; } return $self->{'registry'}; } sub registry_sources { my ($self, $filters, $flush) = @_; $filters ||= {}; my $category = $filters->{'category'} || []; my $capability = $filters->{'capability'} || $filters->{'capabilities'} || []; if(!ref $category) { $category = [$category]; } if(!ref $capability) { $capability = [$capability]; } $flush and $self->{'_registry_sources'} = []; ######### # Populate the list of sources if this is the first call or we're flushing # if (scalar @{$self->{'_registry_sources'}} == 0) { $self->_fetch_registry_sources() or return []; } ######### # Jump out if there's no filtering to be done # if(!scalar keys %{$filters}) { return $self->{'_registry_sources'}; } my $sources = $self->{'_registry_sources'}; ######### # Apply capability filter # if((ref $capability eq 'ARRAY') && (scalar @{$capability})) { my $str = join q(|), @{$capability}; my $match = qr/$str/smx; $sources = [grep { $self->_filter_capability($_, $match) } @{$sources}]; } ######### # Apply coordinatesystem/category filter # if((ref $category eq 'ARRAY') && (scalar @{$category})) { $sources = [grep { $self->_filter_category($_, $category) } @{$sources}]; } return $sources; } sub _fetch_registry_sources { my $self = shift; my $reg_urls = $self->registry(); if (!scalar @{ $reg_urls }) { return; } my $old_dsns = $self->dsn(); my $old_statuses = $self->{'statuscodes'}; $self->dsn($reg_urls); ######### # Run the DAS sources command # my $sources_ref = $self->sources(); my $statuses = $self->{'statuscodes'}; $self->dsn($old_dsns); $self->{'statuscodes'} = $old_statuses; for my $url (keys %{ $sources_ref || {} }) { my $status = $statuses->{$url} || 'Unknown status'; if ($status !~ m/^200/mxs) { carp "Error fetching sources from '$url' : $status"; next; } my $ref = $sources_ref->{$url} || []; ######### # Some basic checks # (ref $ref eq 'ARRAY') || return; $ref = $ref->[0] || {}; (ref $ref eq 'HASH') || return; $ref = $ref->{'source'} || []; (ref $ref eq 'ARRAY') || return; ######### # The sources command has sources (really groups of sources) and # versions (really individual sources). For compatibility with the # old SOAP way of doing things, we must: # 1. throw away this source grouping semantic # 2. convert the hash format to the old style # for my $sourcegroup (@{ $ref }) { $self->_fetch_registry_sources_sourcegroup($sourcegroup); } } return 1; } sub _fetch_registry_sources_sourcegroup { my ($self, $sourcegroup) = @_; my $versions = $sourcegroup->{'version'} || []; (ref $versions eq 'ARRAY') || next; for my $source (@{ $versions }) { my $caps = $source->{'capability'} || []; my $dsn; my $object = { capabilities => [], coordinateSystem => [], description => $sourcegroup->{source_description}, id => $source->{version_uri}, }; ######### # Some sources have 'more info' URLs # if ( my $doc_href = $sourcegroup->{source_doc_href} ) { $object->{helperurl} = $doc_href; } ######### # Add the capabilties # for my $cap (@{ $caps }) { ######### # Extract the DAS URL from one of the capabilities # NOTE: in DAS 1 we assume all capability query URLs for one # source are the same. Anything else would need the data # model to be redesigned. # if (!$dsn) { $dsn = $cap->{'capability_query_uri'} || q(); ($dsn) = $dsn =~ m{(.+/das\d?/[^/]+)}mxs; $object->{'url'} = $dsn; } my $cap_type = $cap->{'capability_type'} || q(); ($cap_type) = $cap_type =~ m/das\d:(.+)/mxs; $cap_type || next; push @{ $object->{'capabilities'} }, $cap_type; } ######### # If none of the capabilities have query URLs, we can't query them! # $object->{'url'} || next; ######### # Add the coordinates # my $coords = $source->{'coordinates'} || []; for my $coord (@{ $coords }) { ######### # All coordinates have a name and category # my $coord_ob = { name => $coord->{coordinates_authority}, category => $coord->{coordinates_source}, }; ######### # Some coordinates have a version # if ( my $version = $coord->{'coordinates_version'} ) { $coord_ob->{'version'} = $version; } ######### # Some coordinates have a species (taxonomy ID and name) # if ( my $taxid = $coord->{'coordinates_taxid'} ) { $coord_ob->{'NCBITaxId'} = $taxid; my $desc = $coord->{'coordinates'}; my ($species) = $desc =~ m/([^,]+)$/mxs; $coord_ob->{'organismName'} = $species; } ######### # Add the coordinate system # push @{ $object->{'coordinateSystem'} }, $coord_ob; } ######### # Add the actual source object # push @{ $self->{'_registry_sources'} }, $object; } return 1; } sub _filter_capability { my ($self, $src, $match) = @_; for my $scap (@{$src->{'capabilities'}}) { if($scap =~ $match) { return 1; } } return 0; }; sub _filter_category { my ($self, $src, $match) = @_; for my $scoord (@{$src->{'coordinateSystem'}}) { for my $m (@{$match}) { if ($m =~ m/,/mxs) { # regex REQUIRES "authority,type", and handles optional version (with proper underscore handling) and species my ($auth, $ver, $cat, $org) = $m =~ m/^ (.+?) (?:_([^_,]+))? ,([^,]+) (?:,(.+))? /mxs; if (lc $cat eq lc $scoord->{'category'} && $auth eq $scoord->{'name'} && (!$ver || lc $ver eq lc $scoord->{'version'}) && (!$org || lc $org eq lc $scoord->{'organismName'})) { return 1; } } else { return 1 if(lc $scoord->{'category'} eq lc $m); } } } return 0; } 1; __END__ =head1 NAME Bio::Das::Lite - Perl extension for the DAS (HTTP+XML) Protocol (http://biodas.org/) =head1 VERSION See $Bio::Das::Lite::VERSION =head1 SYNOPSIS use Bio::Das::Lite; my $bdl = Bio::Das::Lite->new_from_registry({'category' => 'GRCh_37,Chromosome,Homo sapiens'}); my $results = $bdl->features('22'); =head1 SUBROUTINES/METHODS =head2 new : Constructor my $das = Bio::Das::Lite->new('http://das.ensembl.org/das/ensembl1834'); my $das = Bio::Das::Lite->new({ 'timeout' => 60, 'dsn' => 'http://user:pass@das.ensembl.org/das/ensembl1834', 'http_proxy' => 'http://user:pass@webcache.local.com:3128/', }); Options can be: dsn (optional scalar or array ref, URLs of DAS services) timeout (optional int, HTTP fetch timeout in seconds) http_proxy (optional scalar, web cache or proxy if not set in %ENV) no_proxy (optional list/ref, non-proxiable domains if not set in %ENV) caching (optional bool, primitive caching on/off) callback (optional code ref, callback for processed XML blocks) registry (optional array ref containing DAS registry service URLs defaults to 'http://das.sanger.ac.uk/registry/services/das') proxy_user (optional scalar, username for authenticating forward-proxy) proxy_pass (optional scalar, password for authenticating forward-proxy) user_agent (optional scalar, User-Agent HTTP request header value) =head2 new_from_registry : Constructor Similar to 'new' above but supports 'capability' and 'category' in the given hashref, using them to query the DAS registry and configuring the DSNs accordingly. my $das = Bio::Das::Lite->new_from_registry({ 'capability' => ['features'], 'category' => ['Protein Sequence'], }); Options are as above, plus capability OR capabilities (optional arrayref of capabilities) category (optional arrayref of categories) For a complete list of capabilities and categories, see: http://das.sanger.ac.uk/registry/ The category can optionally be a full coordinate system name, allowing further restriction by authority, version and species. For example: 'Protein Sequence' OR 'UniProt,Protein Sequence' OR 'GRCh_37,Chromosome,Homo sapiens' =head2 http_proxy : Get/Set http_proxy $das->http_proxy('http://user:pass@squid.myco.com:3128/'); =head2 proxy_user : Get/Set proxy username for authenticating forward-proxies This is only required if the username wasn't specified when setting http_proxy $das->proxy_user('myusername'); =head2 proxy_pass : Get/Set proxy password for authenticating forward-proxies This is only required if the password wasn't specified when setting http_proxy $das->proxy_pass('secretpassword'); =head2 no_proxy : Get/Set domains to not use proxy for $das->no_proxy('ebi.ac.uk', 'localhost'); OR $das->no_proxy( ['ebi.ac.uk', 'localhost'] ); Always returns an arrayref =head2 user_agent : Get/Set user-agent for request headers $das->user_agent('GroovyDAS/1.0'); =head2 timeout : Get/Set timeout $das->timeout(30); =head2 caching : Get/Set caching $das->caching(1); =head2 callback : Get/Set callback code ref $das->callback(sub { }); =head2 basename : Get base URL(s) of service $das->basename(optional $dsn); =head2 dsn : Get/Set DSN $das->dsn('http://das.ensembl.org/das/ensembl1834/'); # give dsn (scalar or arrayref) here if not specified in new() Or, if you want to add to the existing dsn list and you're feeling sneaky... push @{$das->dsn}, 'http://my.server/das/additionalsource'; =head2 dsns : Retrieve information about other sources served from this server. Note this call is 'dsns', as differentiated from 'dsn' which is the current configured source my $src_data = $das->dsns(); =head2 entry_points : Retrieve the list of entry_points for this source e.g. chromosomes and associated information (e.g. sequence length and version) my $entry_points = $das->entry_points(); =head2 Types of argument for 'types', 'features', 'sequence' calls: Segment Id: '1' Segment Id with range: '1:1,1000' Segment Id with range and type: { 'segment' => '1:1,1000', 'type' => 'exon', } Multiple Ids with ranges and types: [ { 'segment' => '1:1,1000', 'type' => 'exon', }, { 'segment' => '2:1,1000', 'type' => 'exon', }, ] See DAS specifications for other parameters =head2 types : Find out about different data types available from this source my $types = $das->types(); # takes optional args - see DAS specs Retrieve the types of data available for this source e.g. 32k_cloneset, karyotype, swissprot =head2 features : Retrieve features from a segment e.g. clones on a chromosome ######### # Different ways to fetch features - # my $feature_data1 = $das->features('1:1,100000'); my $feature_data2 = $das->features(['1:1,100000', '2:20435000,21435000']); my $feature_data3 = $das->features({ 'segment' => '1:1,1000', 'type' => 'karyotype', # optional args - see DAS Spec }); my $feature_data4 = $das->features([ {'segment' => '1:1,1000000','type' => 'karyotype',}, {'segment' => '2:1,1000000',}, {'group_id' => 'OTTHUMG00000036084',}, ]); ######### # Feature fetch with callback # my $callback = sub { my $struct = shift; print {*STDERR} Dumper($struct); }; # then: $das->callback($callback); $das->features('1:1,1000000'); # or: $das->features('1:1,1000000', $callback); # or: $das->features(['1:1,1000000', '2:1,1000000', '3:1,1000000'], $callback); # or: $das->features([{'group_id' => 'OTTHUMG00000036084'}, '2:1,1000000', '3:1,1000000'], $callback); =head2 alignment : Retrieve protein alignment data for a query. This can be a multiple sequence alignment or pairwise alignment. Note - this has not been tested for structural alignments as there is currently no Das source avialable. my $alignment = $das->alignment({query => 'Q01234'}); =head2 structure : Retrieve known structure (i.e. PDB) for a query my $structure = $das->structure({ query => 'pdb_id'}); =head2 sources : Retrieves the list of sources form the DAS registry, via a DAS call. my $sources = $das->source; =head2 sequence : Retrieve sequence data for a segment (probably dna or protein) my $sequence = $das->sequence('2:1,1000'); # segment:start,stop (e.g. chromosome 2, bases 1 to 1000) =head2 stylesheet : Retrieve stylesheet data my $style_data = $das->stylesheet(); my $style_data2 = $das->stylesheet($callback); =head2 statuscodes : Retrieve HTTP status codes for request URLs my $code = $das->statuscodes($url); my $code_hashref = $das->statuscodes(); =head2 specversions : Retrieve a server's DAS specification version for a request URL my $version = $das->specversions($url); # e.g. 1.53, 1.6, 1.6E my $version_hashref = $das->specversions(); =head2 max_hosts set number of running concurrent host connections THIS METHOD IS NOW DEPRECATED AND HAS NO EFFECT $das->max_hosts(7); print $das->max_hosts(); =head2 max_req set number of running concurrent requests per host THIS METHOD IS NOW DEPRECATED AND HAS NO EFFECT $das->max_req(5); print $das->max_req(); =head2 registry : Get/Set accessor for DAS-Registry service URLs $biodaslite->registry('http://www.dasregistry.org/das'); my $registry_arrayref = $biodaslite->registry(); =head2 registry_sources : Arrayref of dassource objects from the configured registry services my $sources_ref = $biodaslite->registry_sources(); my $sources_ref = $biodaslite->registry_sources({ 'capability' => ['features','stylesheet'], }); my $sources_ref = $biodaslite->registry_sources({ 'category' => ['Protein Sequence'], }); =head2 build_queries Constructs an arrayref of DAS requests including parameters for each call =head2 build_requests Constructs the WWW::Curl callbacks =head2 postprocess Applies processing to the result set, e.g. removal of whitespace from sequence responses. =head1 DESCRIPTION This module is an implementation of a client for the DAS protocol (XML over HTTP primarily for biological-data). =head1 DEPENDENCIES =over =item strict =item warnings =item WWW::Curl =item HTTP::Response =item Carp =item English =item Readonly =back =head1 DIAGNOSTICS Set $Bio::Das::Lite::DEBUG = 1; =head1 CONFIGURATION AND ENVIRONMENT =head1 INCOMPATIBILITIES =head1 BUGS AND LIMITATIONS The max_req and max_hosts methods are now deprecated and have no effect. =head1 SEE ALSO DAS Specifications at: http://biodas.org/documents/spec.html ProServer (A DAS Server implementation also by the author) at: http://www.sanger.ac.uk/proserver/ The venerable Bio::Das suite (CPAN and http://www.biodas.org/download/Bio::Das/). The DAS Registry at: http://das.sanger.ac.uk/registry/ =head1 AUTHOR Roger Pettett, Erpettett@cpan.orgE =head1 LICENSE AND COPYRIGHT Copyright (C) 2007 GRL, by Roger Pettett This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available. =cut