package WWW::Search::PubMedLite; use base 'WWW::Search'; use warnings; use strict; our $VERSION = '0.05'; our $DEBUG = 0; use WWW::Search::PubMedLite::Lang; use HTML::Entities; use WWW::SearchResult; use XML::LibXML; =head1 NAME WWW::Search::PubMedLite - Access PubMed's database of journal articles =head1 SYNOPSIS use WWW::Search; my $search = new WWW::Search('PubMedLite'); $search->native_query( 126941 ); my $article = $search->next_result; my @fields = qw( pmid journal journal_abbreviation volulme issue title page year month affiliation abstract language doi text_url pmc_id ); foreach my $field ( @fields ) { printf "%s: %s\n", $field, $article->{$field}; } =cut sub native_setup_search { my( $self, $query ) = @_; my @ids = ref $query eq 'ARRAY' ? @$query : ( $query ); $self->{_ids} = \@ids; $self->{_idx} = 0; $self->agent_name('WWW-Search-PubMedLite'); $self->user_agent(1); $self->_debug(""); } sub native_retrieve_some { my $self = shift; $self->_debug( "native_retrieve_some(): starting..." ); my $id = $self->{_ids}->[$self->{_idx}++] or return; $self->{_current_url} = $self->_main_url($id); my %data = ( ); my @fields = $self->_fields; $self->_debug( "native_retrieve_some(): extracting fields" ); foreach my $field ( @fields ) { my $xml_url = $field->{xml_url}->( $self, $id ); my @keys = ref $field->{key} ? @{$field->{key}} : ($field->{key}); my @xpath = ref $field->{xpath} ? @{$field->{xpath}} : ($field->{xpath}); my $doc = $self->_doc( $xml_url ); my $value = undef; foreach my $xpath ( @xpath ) { $value = $self->_field_value( $doc, $xpath ); last if $value; } $data{$_} = $value foreach @keys; } $self->_debug( "native_retrieve_some(): done extracting fields" ); $data{year} =~ s/^(\d{4}).*/$1/ if $data{year}; $data{authors} = $self->_authors( $self->_doc( $self->{_current_url} ) ); $data{author} = join ', ', @{ $data{authors} }; $data{language_name} = WWW::Search::PubMedLite::Lang->abbr2name( $data{language} ); #$data{page} =~ s/-/decode_entities('–')/ge; my $hit = new WWW::SearchResult(); $hit->{$_} = $data{$_} for keys %data; $hit->url( $self->{_current_url} ); push @{$self->{cache}}, $hit; $self->{_current_url} = undef; $self->_debug( "native_retrieve_some(): done." ); return 1; } sub _authors { my( $self, $doc ) = @_; my @author_text; my $authors = $doc->findnodes('//AuthorList/Author'); for my $n ( 1 .. $authors->size ) { my $auth = $authors->get_node($n); my $lastname = $auth->findnodes('LastName') ? ($auth->findnodes('LastName'))[0]->to_literal : ''; my $initials = $auth->findnodes('Initials') ? ($auth->findnodes('Initials'))[0]->to_literal : ''; my $firstname = $auth->findnodes('ForeName') ? ($auth->findnodes('ForeName'))[0]->to_literal : ''; my $fname = $initials ? $initials : $firstname ? substr( $firstname, 0, 1 ) : ''; my $name = sprintf '%s %s', $lastname, $fname if $lastname and $fname; $name ||= $lastname if $lastname; push @author_text, $name if $name; } return \@author_text; } sub _main_url { my( $self, $id ) = @_; return sprintf 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml', $id; } # text_url sub _tu_xml_url { my( $self, $pmid ) = @_; return sprintf 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&cmd=prlinks', $pmid; } # pmc_id sub _pmc_xml_url { my( $self, $pmid ) = @_; return sprintf 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&db=pmc', $pmid; } sub _fields { ( { key => 'pmid', xml_url => \&_main_url, xpath => '//PMID' }, { key => 'journal', xml_url => \&_main_url, xpath => '//Journal/Title' }, { key => 'journal_abbreviation', xml_url => \&_main_url, xpath => [ '//Journal/ISOAbbreviation', '//MedlineJournalInfo/MedlineTA', '//Journal/Title' ] }, { key => 'volume', xml_url => \&_main_url, xpath => '//JournalIssue/Volume' }, { key => 'issue', xml_url => \&_main_url, xpath => '//JournalIssue/Issue' }, { key => 'title', xml_url => \&_main_url, xpath => '//Article/ArticleTitle' }, { key => 'page', xml_url => \&_main_url, xpath => '//Pagination/MedlinePgn' }, { key => 'year', xml_url => \&_main_url, xpath => [ '//JournalIssue/PubDate/Year', '//PubDate/MedlineDate' ] }, { key => 'month', xml_url => \&_main_url, xpath => '//JournalIssue/PubDate/Month' }, { key => 'affiliation', xml_url => \&_main_url, xpath => '//Affiliation' }, { key => 'abstract', xml_url => \&_main_url, xpath => '//Abstract/AbstractText' }, { key => 'language', xml_url => \&_main_url, xpath => '//Article/Language' }, { key => 'doi', xml_url => \&_main_url, xpath => '//PubmedData/ArticleIdList/ArticleId[@IdType="doi"]' }, { key => 'text_url', xml_url => \&_tu_xml_url, xpath => '//IdUrlSet/ObjUrl/Url' }, { key => 'pmc_id', xml_url => \&_pmc_xml_url, xpath => '/eLinkResult/LinkSet/LinkSetDb[LinkName="pubmed_pmc"]/Link/Id' }, # author ) } sub _field_value { my( $self, $doc, $xpath ) = @_; $self->_debug( "_field_value(): searthing for $xpath" ); $self->_debug( "_field_value(): \$doc is undef" ), return undef unless $doc; my $node = ($doc->findnodes($xpath))[0]; $self->_debug( "_field_value(): no node, returning undef" ), return undef unless $node; my $value = $node->to_literal; $self->_debug( "_field_value(): got $value" ); return $value; } # # Returns the content from $url, warning on error. # sub _fetch_data { my( $self, $url ) = @_; $self->user_agent->timeout(10); $self->_debug( "_fetch_data(): fetching $url" ); my $res = $self->user_agent->get($url); if( ! $res->is_success ) { $self->_debug( sprintf "_fetch_data(): could not fetch %s: error %s (agent: %s)", $url, $res->status_line, $self->user_agent->agent ); return undef; } return $res->content; } # # Given an $xml_url, returns a parsed XML document. Uses caching. # my %Doc; sub _doc { my( $self, $xml_url ) = @_; if( exists $Doc{$xml_url} ) { $self->_debug( "_doc(): cache for $xml_url" ); return $Doc{$xml_url}; } $self->_debug( "_doc(): no cache for $xml_url" ); my $xml = $self->_fetch_data($xml_url); $self->_debug( "_doc(): empty page" ), return undef unless $xml; my $parser = new XML::LibXML(); $Doc{$xml_url} = $parser->parse_string($xml); return $Doc{$xml_url}; } sub _debug { my( $self, @msg ) = @_; warn @msg, "\n" if $DEBUG; } =head1 AUTHOR David J. Iberri, C<< >> =head1 BUGS =head2 NCBI error 803/temporarily unavailable As of November 2008, the NCBI/PubMed servers have been a bit unpredictable in returning results to queries issued by W::S::PubMedLite. The only queries that appear to be failing are those that request the PubMed Central ID; for example: L The error returned by NCBI is "error 803/temporarily unavailable", which prevents WWW::Search::PubMedLite from filling the C field in results. This impacts C; specifically, if this error is encountered, the "pmc_id" test will be skipped and a brief explanation will be given. It should be safe to continue with the installation provided that the other tests are working. Again, it appears that this problem is confined to the C field. =head2 Bug reports Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc WWW::Search::PubMedLite You can also look for information at: =over 4 =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * RT: CPAN's request tracker L =item * Search CPAN L =back =head1 COPYRIGHT & LICENSE Copyright 2007 David J. Iberri, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1;