# $Id: Yahoo.pm,v 2.380 2009/05/02 13:28:41 Martin Exp $ =head1 NAME WWW::Search::Yahoo - backend for searching www.yahoo.com =head1 SYNOPSIS use WWW::Search; my $oSearch = new WWW::Search('Yahoo'); my $sQuery = WWW::Search::escape_query("sushi restaurant Columbus Ohio"); $oSearch->native_query($sQuery); while (my $oResult = $oSearch->next_result()) print $oResult->url, "\n"; =head1 DESCRIPTION This class is a Yahoo specialization of L. It handles making and interpreting Yahoo searches F. This class exports no public interface; all interaction should be done through L objects. =head1 NOTES The default search is: Yahoo's web-based index (not Directory). =head1 PRIVATE METHODS If you just want to write Perl code to search Yahoo, you do NOT need to read any further here. Instead, just read the L documentation. If you want to write a subclass of this module (e.g. create a backend for another branch of Yahoo) then please read about the private methods here: =cut package WWW::Search::Yahoo; use strict; use warnings; use Carp (); use Data::Dumper; # for debugging only use HTML::TreeBuilder; use WWW::Search; use WWW::SearchResult; use URI; use URI::Escape; use vars qw( $iMustPause ); use base 'WWW::Search'; our $VERSION = do { my @r = (q$Revision: 2.380 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; our $MAINTAINER = 'Martin Thurn '; # Thanks to the hard work of Gil Vidals and his team at # positionresearch.com, we know the following: In early 2004, # yahoo.com implemented new robot-blocking tactics that look for # frequent requests from the same client IP. One way around these # blocks is to slow down and randomize the timing of our requests. We # therefore insert a random sleep before every request except the # first one. This variable is equivalent to a "first-time" flag for # this purpose: $iMustPause = 0; =head2 gui_query Yes, Virginia, we do try to emulate stupid-human queries. =cut sub gui_query { my ($self, $sQuery, $rh) = @_; $self->{'_options'} = { 'p' => $sQuery, # 'hc' => 0, # 'hs' => 0, 'ei' => 'UTF-8', }; # print STDERR " + Yahoo::gui_query() is calling native_query()...\n"; $rh->{'search_base_url'} = 'http://search.yahoo.com'; $rh->{'search_base_path'} = '/bin/query'; return $self->native_query($sQuery, $rh); } # gui_query sub _native_setup_search { my ($self, $native_query, $rhOptsArg) = @_; # print STDERR " + This is Yahoo::native_setup_search()...\n"; # print STDERR " + _options is ", $self->{'_options'}, "\n"; $self->{'_hits_per_page'} = 100; # $self->{'_hits_per_page'} = 10; # for debugging # www.yahoo.com refuses robots. $self->user_agent('non-robot'); # www.yahoo.com completely changes the HTML output depending on the # browser! # $self->{'agent_name'} = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)'; # $self->{agent_e_mail} = 'mthurn@cpan.org'; $self->{_next_to_retrieve} = 1; $self->{'search_base_url'} ||= 'http://search.yahoo.com'; $self->{'search_base_path'} ||= '/search'; if (! defined($self->{'_options'})) { # We do not clobber the existing _options hash, if there is one; # e.g. if gui_search() was already called on this object $self->{'_options'} = { 'vo' => $native_query, 'h' => 'w', # web sites 'n' => $self->{_hits_per_page}, # 'b' => $self->{_next_to_retrieve}-1, }; } # if my $rhOptions = $self->{'_options'}; if (defined($rhOptsArg)) { # Copy in new options, promoting special ones: foreach my $key (keys %$rhOptsArg) { # print STDERR " + inspecting option $key..."; if (WWW::Search::generic_option($key)) { # print STDERR "promote & delete\n"; $self->{$key} = $rhOptsArg->{$key} if defined($rhOptsArg->{$key}); delete $rhOptsArg->{$key}; } else { # print STDERR "copy\n"; $rhOptions->{$key} = $rhOptsArg->{$key} if defined($rhOptsArg->{$key}); } } # foreach # print STDERR " + resulting rhOptions is ", Dumper($rhOptions); # print STDERR " + resulting rhOptsArg is ", Dumper($rhOptsArg); } # if # Finally, figure out the url. $self->{'_next_url'} = $self->{'search_base_url'} . $self->{'search_base_path'} .'?'. $self->hash_to_cgi_string($rhOptions); $self->{_debug} = $self->{'search_debug'} || 0; $self->{_debug} = 2 if ($self->{'search_parse_debug'}); } # _native_setup_search =head2 need_to_delay This method tells the L controller code whether we need to pause and give the yahoo.com servers a breather. =cut sub need_to_delay { # print STDERR " + this is Yahoo::need_to_delay()\n"; return $iMustPause; } # need_to_delay =head2 user_agent_delay This method tells the L controller code how many seconds we should pause. =cut sub user_agent_delay { my $self = shift; my $iSecs = int(30 + rand(30)); print STDERR " + sleeping $iSecs seconds, to make yahoo.com think we're NOT a robot...\n" if (0 < $self->{_debug}); sleep($iSecs); } # user_agent_delay =head2 preprocess_results_page Clean up the Yahoo HTML before we attempt to parse it. =cut sub preprocess_results_page { my $self = shift; my $sPage = shift; if ($self->{_debug} == 77) { # For debugging only. Print the page contents and abort. print STDERR $sPage; exit 88; } # if # Delete the tag that appears BEFORE the tag (because # it causes HTML::TreeBuilder to NOT be able to parse it!) $sPage =~ s!]+>!!; return $sPage; } # preprocess_results_page =head2 _result_list_tags Returns a list, which will be passed as arguments to HTML::Element::look_down() in order to return a list of HTML::Element which contain the query results. =cut sub _result_list_tags { return (_tag => 'div', class => 'res', ); } # _result_list_tags =head2 _result_list_items Given an HTML::TreeBuilder tree, returns a list of HTML::Element, which contain the query results. =cut sub _result_list_items { my $self = shift; my $oTree = shift || die; my @ao = $oTree->look_down($self->_result_list_tags); return @ao; } # _result_list_items my $WS = q{[\t\r\n\240\ ]}; sub _parse_tree { my $self = shift; my $oTree = shift; print STDERR " + ::Yahoo got a tree $oTree\n" if (2 <= $self->{_debug}); # Every time we get a page from yahoo.com, we have to pause before # fetching another. $iMustPause++; my $hits_found = 0; # Only try to parse the hit count if we haven't done so already: print STDERR " + start, approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug}); if ($self->approximate_hit_count() < 1) { my $rh = $self->_where_to_find_count; my @aoDIV = $oTree->look_down(%$rh); DIV_TAG: foreach my $oDIV (@aoDIV) { next unless ref $oDIV; print STDERR " + try DIV ==", $oDIV->as_HTML if (2 <= $self->{_debug}); my $s = $oDIV->as_text; print STDERR " + TEXT ==$s==\n" if (2 <= $self->{_debug}); my $iCount = $self->_string_has_count($s); $iCount =~ tr!,\.!!d; if (0 <= $iCount) { $self->approximate_result_count($iCount); last DIV_TAG; } # if } # foreach DIV_TAG } # if print STDERR " + found approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug}); my @aoLI = $self->_result_list_items($oTree); print STDERR " DDD aoLI has ", scalar(@aoLI), " items...\n" if (2 <= $self->{_debug}); LI_TAG: foreach my $oLI (@aoLI) { # Sanity check: next LI_TAG unless ref($oLI); print STDERR " DDD found oLI is ==", $oLI->as_HTML, "==\n" if (2 <= $self->{_debug}); my $oA = $oLI->look_down(_tag => 'a'); next LI_TAG unless ref($oA); print STDERR " DDD found oA is ==", $oA->as_HTML, "==\n" if (2 <= $self->{_debug}); my $sTitle = $oA->as_text || ''; my $sURL = $oA->attr('href') || ''; next LI_TAG if ($sURL eq ''); print STDERR " + raw URL is ==$sURL==\n" if (2 <= $self->{_debug}); # Throw out various unwanted Yahoo links: next LI_TAG if ($sURL =~ m!\.yahoo\.com/(about|jobseeker|preferences|search)/!); next LI_TAG if ($sURL =~ m!//((answers|cgi|cn|de|docs|europe|help|local|myweb\d?|search|searchmarketing|video)\.)+yahoo\.com!); # Strip off the yahoo.com redirect part of the URL: $sURL =~ s!\A.*?\*-!!; $sURL =~ s!\Ahttp%3A!http:!i; print STDERR " + cooked URL is ==$sURL==\n" if (2 <= $self->{_debug}); my $hit = new WWW::SearchResult; $hit->description(q{}); $self->parse_details($oLI, $hit); $hit->add_url($sURL); $sTitle = $self->strip($sTitle); $hit->title($sTitle); push(@{$self->{cache}}, $hit); $hits_found++; } # foreach LI_TAG # Now try to find the "next page" link: my @aoA = $oTree->look_down('_tag' => 'a'); NEXT_A: foreach my $oA (reverse @aoA) { next NEXT_A unless ref($oA); my $sAhtml = $oA->as_HTML; printf STDERR (" + next A ==%s==\n", $sAhtml) if (2 <= $self->{_debug}); if ($self->_a_is_next_link($oA)) { # Here is an example of a raw next URL: # http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101 # http://rds.yahoo.com/;_ylt=AutpqXFv9tv2eTXen2Mw_c1XNyoA;_ylu=X3oDMTExN2UzODg3BGNvbG8DdwRzZWMDcGFnaW5hdGlvbgR2dGlkA0RGWDJfOQ--/SIG=19e131ad9/EXP=1130207429/**http%3A%2F%2Fsearch.yahoo.com%2Fsearch%3Fn%3D100%26vo%3Dpokemon%26ei%3DUTF-8%26xargs%3D12KPjg1hVSt4GmmvmnCOObHb%255F%252Dvj0Zlpi3g5UzTYR6a9RL8nQJDqADN%255F2aP%255FdLHL9y7XrQ0JOkvqV2HOs3qODiIxkSdWH8UbKsmJS5%255FIp9DLfdaXlzsbIu0%252Djv3NcQZy8nLl2qbeONz73ZI6L5Hk57%26pstart%3D6%26b%3D101 my $sURL = $oA->attr('href'); print STDERR " + raw next URL ==$sURL==\n" if (2 <= $self->{_debug}); # Delete Yahoo-redirect portion of URL: $sURL =~ s!\A.+?[-*]+(?=http)!!; print STDERR " + poached next URL ==$sURL==\n" if (2 <= $self->{_debug}); $sURL = WWW::Search::unescape_query($sURL); $self->{_next_url} = $self->absurl($self->{'_prev_url'}, $sURL); print STDERR " + cooked next URL ==$self->{_next_url}==\n" if (2 <= $self->{_debug}); last NEXT_A; } # if } # foreach NEXT_A return $hits_found; } # _parse_tree =head2 parse_details Given a (portion of an) HTML::TreeBuilder tree and a L object, parses one result out of the tree and populates the SearchResult. =cut sub parse_details { my $self = shift; # Required arg1 = (part of) an HTML parse tree: my $oLI = shift; # Required arg2 = a WWW::SearchResult object to fill in: my $hit = shift; my $oDIV = $oLI->look_down(_tag => 'div', class => 'abstr', ); if (ref($oDIV)) { my $sDesc = $oDIV->as_text; $hit->description($self->strip($sDesc)); } # if # Delete the useless human-readable restatement of the URL (first # tag we come across): my $oEM = $oLI->look_down(_tag => 'em'); if (ref($oEM)) { my $sSize = ''; $sSize = $1 if ($oLI->as_text =~ m!(\d+[kb]?)!gx); $hit->size($sSize); } # if return; # Delete any remaining tags: my @aoA = $oLI->look_down(_tag => 'a'); A_TAG: foreach my $oA (@aoA) { $oA->detach; $oA->delete; } # foreach A_TAG $oDIV = $oLI->look_down(_tag => 'div'); if (ref $oDIV) { $oDIV->detach; $oDIV->delete; } # if my $sDesc = $oLI->as_text; print STDERR " + raw sDesc is ==$sDesc==\n" if (2 <= $self->{_debug}); # Grab stuff off the end of the description: print STDERR " + cooked sDesc is ==$sDesc==\n" if (2 <= $self->{_debug}); $hit->description($self->strip($sDesc)); } # parse_details =head2 _where_to_find_count Returns a list, which will be passed as arguments to HTML::Element::look_down() in order to return an HTML::Element which contains the approximate result count. =cut sub _where_to_find_count { my %hash = ( _tag => 'div', # 'class' => 'ygbody', id => 'info', ); return \%hash; } # _where_to_find_count =head2 _string_has_count Given a string, returns the approximate result count if that string contains the approximate result count. =cut sub _string_has_count { my $self = shift; my $s = shift; # print STDERR " DDD Yahoo::string_has_count($s)?\n"; return $1 if ($s =~ m!\bof\s+(?:about\s+)?([,0-9]+)!i); return -1; } # _string_has_count =head2 _a_is_next_link Given an HTML::Element, returns true if it seems to contain the clickable "next page" widget. =cut sub _a_is_next_link { my $self = shift; my $oA = shift; return 0 if (! ref $oA); my $sID = $oA->attr('id') || ''; return 1 if ($sID eq 'pg-next'); my $s = $oA->as_text; print STDERR " + next A as_text ==$s==\n" if (2 <= $self->{_debug}); return ($s =~ m!\A$WS*Next$WS+>$WS*\z!i); } # _a_is_next_link =head2 strip Given a string, strips leading and trailing whitespace off of it. =cut sub strip { my $self = shift; my $s = &WWW::Search::strip_tags(shift); $s =~ s!\A$WS+ !!x; $s =~ s! $WS+\Z!!x; return $s; } # strip 1; __END__ GUI search: http://ink.yahoo.com/bin/query?p=sushi+restaurant+Columbus+Ohio&hc=0&hs=0 Advanced search: http://search.yahoo.com/search?h=w&fr=op&va=&vp=&vo=Martin+Thurn&ve=&bbase=Search&vl=&vc=&vd=all&vt=any&vss=i&vs=&vr=&vk= http://ink.yahoo.com/bin/query?o=1&p=LSAm&d=y&za=or&h=c&g=0&n=20 actual next link from page: http://google.yahoo.com/bin/query?p=%22Shelagh+Fraser%22&b=21&hc=0&hs=0&xargs= _next_url : http://google.yahoo.com/bin/query?%0Ap=%22Shelagh+Fraser%22&b=21&hc=0&hs=0&xargs= http://rds.yahoo.com/_ylt=A0Je5ra.FlVEwsQA1RhXNyoA/SIG=13517q7d2/EXP=1146513470/**http%3a//search.yahoo.com/search%3fn=100%26vo=pokemon%26ei=UTF-8%26pstart=1%26b=101 =head1 SEE ALSO To make new back-ends, see L. =head1 BUGS Please tell the maintainer if you find any! =head1 AUTHOR As of 1998-02-02, C is maintained by Martin Thurn (mthurn@cpan.org). C was originally written by Wm. L. Scheding, based on C. =head1 LEGALESE THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. =head1 LICENSE Copyright (C) 1998-2009 Martin 'Kingpin' Thurn This software is released under the same license as Perl itself. =cut