#################################################################################### #################################################################################### #################################################################################### #################################################################################### package WWW::Scraper; use strict; require Exporter; use vars qw($VERSION $MAINTAINER @ISA @EXPORT @EXPORT_OK $PRINT_VERSION); $VERSION = '3.05'; my $CVS_VERSION = sprintf("%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/); $MAINTAINER = 'Glenn Wood http://search.cpan.org/search?mode=author&query=GLENNWOOD'; $PRINT_VERSION = 0; sub import { my $package = shift; for my $opts (grep { "HASH" eq ref($_) } @_) { for my $opt ( keys %$opts ) { my $optfunc = { 'PRINT_VERSION' => sub { $WWW::Scraper::PRINT_VERSION = 1; } }->{$opt}; if ( $optfunc ) { &$optfunc($opts->{$opt}); } else { warn "Unknown option '$opt' in $package\n"; } } } # Prints "WWW::Scraper v3.01", as appropriate to the sub-class of Scraper. eval "print \"$package v\$$package\::VERSION\\\n\"" if ( $WWW::Scraper::PRINT_VERSION ); @_ = ($package, grep { "HASH" ne ref($_) } @_); goto &Exporter::import; } use Carp (); use URI::URL; # some Unix boxes simply won't load this one via WWW:Search, so . . . use WWW::Search( 2.28 ); use WWW::Scraper::Request; use WWW::Scraper::Response; use WWW::Scraper::Response::generic; use WWW::Scraper::TidyXML; use WWW::Scraper::Opcode; @EXPORT_OK = qw( generic_option testParameters trimTags trimLFs trimLFLFs trimComments @ENGINES_WORKING addURL trimXPathAttr trimXPathHref findNextForm findNextFormInXML removeScriptsInHTML cleanupHeadBody); #------------------------------------------------------------------# # Here we begin our gradual migration from "can-o-worms" to # Class::Struct structured Scraper. { package WWW::Scraper::_struct_; use Class::Struct; struct ( 'WWW::Scraper::_struct_' => { 'response' => '$' ,'searchEngineHome' => '$' ,'searchEngineLogo' => '$' ,'errorMessage' => '$' ,'_artifactFolder' => '$' # Folder into which certain Scraper artifacts will be gathered. ,'_responseClass' => '$' ,'_wantsNativeRequest' => '$' ,'_wwwSearchBackend' => '$' ,'_forInterator' => '$' ,'_retryGetCount' => '$' ,'_tidyXmlObject' => '$' ,'pageNumber' => '$' # Page number of the current 'response' object. ,'_scraperRequest' => '$' ,'_scraperFrame' => '$' ,'_scraperDetail' => '$' } ); } use base qw( WWW::Scraper::_struct_ WWW::Search Exporter ); my @HitStack = (); #------------------------------------------------------------------# sub new { my ($class, $subclass, $native_query, $native_options) = @_; my ($self, $wantsNativeRequest); $subclass = '' unless $subclass; $wantsNativeRequest = $subclass =~ s/^NativeRequest\:\:(.*)$/$1/; if ( $subclass =~ m-^\.\.[\/](.*)$- ) { # Allow the form "../name" to indicate die "The '..\\backend' form is deprecated in favor of scraperFrame = 'WWW::Search::backend' - see HeadHunter.pm for an example.\n"; } else { if ( $subclass =~ s/^(.*)\((.*)\)$/$1/ ) { my $subclassVersion = $2; eval "use WWW::Scraper::$subclass($subclassVersion)"; if ( $@ ) { print "Can't use engine $subclass($subclassVersion): $@\n"; return undef; } } ## > > > > > > ############################################################################################# ############################################################################################# ############################################################################################# ############################################################################################# # THIS STUFF IS A FACTORING OF WWW::Search::new() - we must track any of Martin's changes! # $self = new WWW::Search("../Scraper::$subclass"); my $newclass = "${class}::$subclass"; if (!defined(&$newclass)) { eval "use $newclass"; Carp::croak("unknown Scraper interface '$newclass': $@") if ($@); } $self = bless { engine => $newclass, maximum_to_retrieve => 500, # both pages and hits interrequest_delay => 0.25, # in seconds agent_name => "WWW::Scraper/$VERSION", agent_e_mail => 'glenwood@alumni.caltech.edu;MartinThurn@iname.com', env_proxy => 0, http_method => 'GET', http_proxy => '', http_proxy_user => undef, http_proxy_pwd => undef, timeout => 60, _debug => 0, _parse_debug => 0, search_from_file => undef, search_to_file => undef, search_to_file_index => 0, @_, # variable initialization goes here }, $newclass; $self->reset_search(); } ############################################################################################# ############################################################################################# $self->_wantsNativeRequest($wantsNativeRequest); $self->{'agent_name'} = 'Mozilla/4.0 (compatible; MSIE 4.01; Windows 95)';#"Mozilla/WWW::Scraper/$VERSION"; $self->{'scraperQF'} = 0; # Explicitly declare 'scraperQF' as the deprecated mode. $self->{'scraperName'} = $subclass; $self->_init(); # Some property initializations, mostly to eliminate useless diagnostic warnings. # Finally, call the sub-scraper's init() method. return $self->init($subclass, $native_query, $native_options); } sub _init { my $self = shift; $self->{cache} = []; # Eliminate some useless "warnings" from WWW::Search(lines 544-549) during make test, and elsewhere. $self->pageNumber(0); # Use of uninitialized value in addition (+) at lib/WWW/Search/Scraper.pm line 609 if ( $self->scraperFrame ) { my @scfld = @{$self->scraperFrame}; my $next_scaffold = $scfld[$#scfld] if ref($scfld[$#scfld]) eq 'ARRAY'; WWW::Scraper::Opcode::InitiateScaffold($next_scaffold) if $next_scaffold; #$self->GetScraperFrame)}; } if ( $self->scraperDetail ) { my @scfld = @{$self->scraperDetail}; my $next_scaffold = $scfld[$#scfld] if ref($scfld[$#scfld]) eq 'ARRAY'; WWW::Scraper::Opcode::InitiateScaffold($next_scaffold) if $next_scaffold; #$self->GetScraperDetail); } } # The Scraper module should override this. sub init { my ($self, $subclass, $native_query, $native_options) = @_; my $scraperFrame = $self->scraperFrame(); if ( ${$scraperFrame}[0] =~ m{^WWW::Search::(.*)$} ) { $self->_wwwSearchBackend(new WWW::Search($1, $native_query, $native_options)); # Uses a WWW::Search backend. } else { if ( ref($native_query) && !$native_options ) { $native_options = $native_query; $native_query = undef; } $self->native_query($native_query, $native_options); } return $self; } # To help avoid embarrassment when he inadvertently releases test, debug or tracing code to CPAN, Glenn uses this. sub isGlennWood { return $ENV{'VSROOT'} and ($ENV{'USERNAME'} eq 'Glenn') and ($ENV{'USERDOMAIN'} eq 'ORCHID'); } # Access methods for the structural declarations of this Scraper engine. use vars qw($scraperRequest $scraperFrame $scraperDetail); sub SetScraperRequest { my ($self,$rqst) = @_; $self->_scraperRequest($rqst); } sub SetScraperFrame { my ($self,$frame) = @_; $self->_scraperFrame($frame); WWW::Scraper::Opcode::InitiateScaffold($frame->[1]) if $frame; #$self->GetScraperFrame)}; } sub SetScraperDetail { my ($self,$frame) = @_; $self->_scraperDetail($frame); WWW::Scraper::Opcode::InitiateScaffold($frame->[1]) if $frame; #$self->GetScraperFrame)}; } sub GetScraperRequest { $_[0]->_scraperRequest; } sub GetScraperFrame { $_[0]->_scraperFrame; } sub GetScraperDetail { $_[0]->_scraperDetail; } sub scraperFrameX { $_[0]->{'_options'}{'scrapeFrame'} = $_[1] if $_[1]; return $_[0]->{'_options'}{'scrapeFrame'} } # backward compatible, pre v3.01 ( gdw.2003.03.14 ) sub scraperRequest { return $_[0]->GetScraperRequest } sub scraperFrame { $_[0]->SetScraperFrame($_[1]) if $_[1]; return $_[0]->GetScraperFrame } sub scraperDetail { $_[0]->SetScraperDetail($_[1]) if $_[1]; return $_[0]->GetScraperDetail } # Return empty testFrame for sub-scrapers that choose not to provide one. sub testParameters { my ($self) = @_; if ( ref $self ) { $self->{'isTesting'} = 1; } my $isNotTestable = WWW::Scraper::isGlennWood()?0:'No testParameters provided.'; return { 'SKIP' => $isNotTestable ,'testNativeQuery' => 'search scraper' ,'expectedOnePage' => 9 ,'expectedMultiPage' => 11 ,'expectedBogusPage' => 0 }; } sub artifactFolder { my ($self, $fldr) = @_; if ( $fldr ) { mkdir $fldr unless -d $fldr; $self->_artifactFolder($fldr); } return $self->_artifactFolder(); } sub generic_option { my ($option) = @_; return 1 if $option =~ /^scrape/; return WWW::Search::generic_option($option); } # A generalize get/set method for object attributes. sub _attr { my ($self, $attr, $value) = @_; my $rtn = $self->{$attr}; $self->{$attr} = $value if defined $value; # neat idea, but we've got to rewrite a lot of method invocations to make this ok. gdw.2001.07.04 # if ( wantarray ) { # return $rtn if 'ARRAY' eq ref $rtn; # return [$rtn]; # } return $rtn; } # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## ### # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # sub query { $_[0]->_attr('_query', $_[1]) } sub queryDefaults { $_[0]->_attr('_queryDefaults', $_[1]) } sub queryOptions { $_[0]->_attr('_queryOptions', $_[1]) } sub fieldTranslations { $_[0]->_attr('_fieldTranslations', $_[1]) } # Some tracing options - # U - lists URLs as they are generated/accessed. # T - lists progress of each TidyXML tree-walking operation. # d - excruciating details about parsing the results and details pages. sub ScraperTrace { return undef unless defined $_[0]->{'_traceFlags'}; return $_[0]->{'_traceFlags'} unless $_[1]; # default traceFlags if no match string sent. return ( $_[0]->{'_traceFlags'} =~ m-$_[1]- ); } sub setScraperTrace { $_[0]->{'_traceFlags'} = $_[1]; } # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## ### # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # sub SetRequest { my ($self, $rqst) = @_; my $nonBlankWWWSearchNativeQuery = 'nonBlankWWWSearchNativeQuery'; if ( $rqst ) { # Make sure the request object is ready for us. $rqst->prepare($self); # Move the debug option from the request to the Scraper module. $self->{'_debug'} = $rqst->_Scraper_debug() if defined $rqst->_Scraper_debug(); $self->{'_scraperRequest'} = $rqst; $nonBlankWWWSearchNativeQuery = $rqst->_native_query() || $nonBlankWWWSearchNativeQuery; } # WWW::Search(2.26) required native_query to be non-blank, even before it hands it off to Scraper! $self->{'native_query'} = $nonBlankWWWSearchNativeQuery unless $self->{'native_query'}; return $self->{'_scraperRequest'}; } sub GetRequest { return $_[0]->{'_scraperRequest'} } sub SetResponseClass { $_[0]->_responseClass($_[1]) } sub GetResponseClass { $_[0]->_responseClass() } sub native_setup_search { my $self = shift; my ($native_query, $native_options) = @_; $native_query = WWW::Search::unescape_query($native_query); # Thanks, but no thanks, Search.pm! $self->{'_first_url'} = undef; $self->{'_first_url_method'} = undef; my $scraperRequest = $self->scraperRequest; # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # # This pecular set of code translates old interface mode into 'canonical request' mode, # # NOTE THAT IF THE CANONICAL REQUEST HAS BEEN SET, ALL native_setup_search() PARAMETERS ARE IGNORED! # # otherwise, they get picked up here. # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # Get the scraperRequest declaration of the Scraper module, or fake one (as in when using a WWW::Search module). unless ( $scraperRequest ) { $scraperRequest = { 'type' => 'SEARCH' # This is a WWW::Search module - notify native_setup_search_NULL() of that. # This is the basic URL on which to build the query. ,'url' => 'http://' # names the native input field to recieve the query string. ,'nativeQuery' => 'query' # specify defaults, by native field names ,'nativeDefaults' => { } ,'fieldTranslations' => undef # This gives us a null %inputsHash, so WWW::Scraper will ignore that functionality (hopefully) , 'cookies' => 0 # The WWW::Search module must maintain its own cookies. }; $self->scraperRequest($scraperRequest, $native_query, $native_options); } # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## # ## #$self->SetRequest( new WWW::Scraper::Request($self, $native_query, $native_options) ) unless ( $self->GetRequest()); # These traceFlags will ultimately come from many places . . . #$self->setScraperTrace($self->{'_debug'}) unless $self->{'_traceFlags'}; for ( $self->scraperRequest()->{'type'} ) { m/^SHERLOCK$/ && do { return $self->native_setup_search_SHERLOCK(@_); }; m/^FORM$/ && do { return $self->native_setup_search_FORM(@_); }; m/^QUERY$|^GET$/ && do { return $self->native_setup_search_QUERY(@_); }; m/^POST$/ && do { $self->{'_http_method'} = 'POST'; return $self->native_setup_search_QUERY(@_); }; m/^SEARCH$/ && do { return $self->native_setup_search_NULL(@_); }; m/^WSDL$/ && do { return $self->native_setup_search_WSDL(@_); }; m/^WWW::Search/ && do { return $self->native_setup_search_WWW_Search(@_); }; die "Invalid mode in WWW::Scraper - '$_'\n"; } } sub native_setup_search_SHERLOCK { my ($self, $native_query, $native_options) = @_; $self->SetRequest( new WWW::Scraper::Request($self, $native_query, $native_options) ) unless ( $self->GetRequest()); die "Unimplemented mode in WWW::Scraper - 'SHERLOCK'\n"; } sub native_setup_search_FORM { my ($self, $native_query, $native_options) = @_; $self->SetRequest( new WWW::Scraper::Request($self, $native_query, $native_options) ) unless ( $self->GetRequest()); $self->user_agent('user'); $self->{_next_to_retrieve} = 0; # $scraperForm = [ 'url', 'formIndex' (or formName, NYI), 'submitButtonName' or undef ] my $url = $self->scraperRequest($native_query, $native_options)->{'url'}; if ( ref $url ) { $self->{'_base_url'} = &$url($self, $self->GetRequest()->_native_query(), $self->{'native_options'}); } else { $self->{'_base_url'} = $url; } unless ( $self->{'_base_url'} ) { print STDERR "No base url was specified by ".ref($self).".pm, so no search is possible.\n"; undef $self->{'_next_url'}; return undef; } $self->{'_http_method'} = 'GET' unless $self->{'_http_method'}; print STDERR 'FORM URL: '.$self->{'_base_url'} . "\n" if ($self->ScraperTrace('U')); my $response = $self->http_request($self->{'_http_method'}, $self->{'_base_url'}); unless ( $response->is_success ) { print STDERR "Request for FORM failed in Scraper.pm: ".$response->message() if $self->ScraperTrace(); return undef ; } my @forms = HTML::Form->parse($response->content(), $response->base()); return undef unless @forms; my $formNameOrNumber = $self->scraperRequest->{'formNameOrNumber'}; my $form; if ( $formNameOrNumber =~ m{^\d+$} ) { # is formNameOrNumber a number? $form = $forms[$self->scraperRequest->{'formNameOrNumber'} or 0]; } else { # it is a name, not a number. # Unfortunately, HTML::Form->parse() does not stash the forms' names, so we use # this inperfect method to get to them (inperfect? what if "