package WWW::Scraper::ZIPplus4;

#####################################################################
# WWW::Scraper engine for the USPS ZIP+4 lookup form.
#
# This file declares:
#   - $scraperRequest : how to build the lookup request (URL, defaults,
#                       canonical-to-native field translations);
#   - $scraperFrame   : the frame handed to WWW::Scraper to parse results;
#   - the field post-processors cleanUpUsps() and parseCity();
#   - package AddressDedup, a small helper for de-duplicating address lists.
#####################################################################

# These package globals are assigned before 'use strict' appears below, so
# the unqualified names are permitted.
require Exporter;
@EXPORT    = qw();
@EXPORT_OK = qw(trimTags);
@ISA       = qw(WWW::Scraper Exporter);
$VERSION   = sprintf("%d.%02d", q$Revision: 1.9 $ =~ /(\d+)\.(\d+)/);

use Carp ();
use WWW::Scraper(qw(3.03 generic_option addURL trimTags trimLFs trimComments));

use strict;

# Request description returned by scraperRequest().
my $scraperRequest = {
    'type'             => 'GET',
    'formNameOrNumber' => '1',
    'submitButton'     => 'Submit',

    # This is the basic URL on which to get the form to build the query.
    # 'url' => 'http://www.usps.com/ncsc/lookups/lookup_zip+4.html',
    # _OLD 'url' => 'http://www.usps.com/cgi-bin/zip4/zip4inq2?',
    'url' => 'http://www.usps.com/zip4/zip4_response.jsp?',

    # specify defaults, by native field names
    # 'nativeQuery' => 'Delivery+Address',
    'nativeDefaults' => {
        'Selection'    => '1',
        'urbanization' => '',
        'firm'         => '',
        'address2'     => '',
        'Submit.x'     => '1',
        'Submit.y'     => '1',
    },

    # specify translations from canonical fields to native fields
    'defaultRequestClass' => 'ZIPplus4',
    'fieldTranslations'   => {
        '*' => {
            'City'            => 'city',
            'State'           => 'state',
            'ZipCode'         => 'zipcode',
            'DeliveryAddress' => 'address',
            'address1'        => 'address',    # Weird but true!
            '*'               => '*',
            # Thanks to Klemens Schmid (klemens.schmid@gmx.de)!
        },
        # See FormSniffer at http://www.wap2web.de/formsniffer2.aspx
    },

    # Miscellaneous options for the Scraper operation.
    'cookies' => 0,
};

# Frame passed to WWW::Scraper::scraperFrame() by this engine's
# scraperFrame() method.  Each REGEX entry lists the pattern, the
# post-processing callbacks, and finally the result field name.
#
# NOTE(review): all seven patterns below are textually identical as they
# stand in this file, so every REGEX step would match the same text.  The
# markup that told them apart appears to be missing; restore the patterns
# from the upstream distribution before relying on this frame.
my $scraperFrame = [
    'HTML',
    [
        [
            'BODY', '', '',
            [
                [
                    'HIT*',
                    [
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, 'firm' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, 'address' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, \&parseCity, 'city' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, \&cleanUpUsps, 'carrierRoute' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, 'county' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, \&cleanUpUsps, 'deliveryPoint' ],
                        [ 'REGEX', '(].*?.*?)', \&trimComments, \&trimLFs, \&cleanUpUsps, 'checkDigit' ],

                        # this regex never matches; just lets us declare fields.
                        [ 'REGEX', 'neverMatch', 'state', 'zipcode' ],
                    ]
                ]
            ]
        ]
    ]
];

# Retired frame, kept for reference only.  Nothing in this file refers to
# $scraperFrame_OLD, and its text as it stands has more closing than opening
# brackets, so it is left commented out (verbatim) instead of being compiled.
#
# my $scraperFrame_OLD = [ 'HTML', [ [ 'BODY', 'The standardized address is:', '(.*?(
# )?.*?)
# \s*(.*?)\s(..)\s(\d\d\d\d\d-\d\d\d\d)
# .*?(.*?).*?(.*?).*?(.*?).*?(.*?)' ,'address', undef, 'city', 'state', 'zip', 'carrierRoute', 'county', 'deliveryPoint' , 'checkDigit' ] ] ] ,[ 'HIT*' , [ [ 'REGEX', '(.*?).*?(.*?)\s(..)\s(\d\d\d\d\d-\d\d\d\d).*?(.*?).*?(.*?).*?(.*?).*?(.*?)' ,'address', 'city', 'state', 'zip', 'carrierRoute', 'county', 'deliveryPoint' , 'checkDigit' ] ] ] ] ] ] ];

# testParameters()
#   Returns a hash ref of test settings for this engine.  When invoked on an
#   object (rather than on the class name) it also sets the object's
#   'isTesting' entry to 1.
sub testParameters {
    my ($self) = @_;

    if ( ref $self ) {
        $self->{'isTesting'} = 1;
    }

    return {
        'SKIP'              => '',    #'ZIPplus4 test parameters have not yet been fixed'
        'testNativeQuery'   => '20500',
        'testNativeOptions' => {
            'address' => '1600 Pennsylvannia Ave',
            'city'    => 'Washington',
            'state'   => 'DC',
            'zipcode' => '',
        },
        'expectedOnePage'   => 1,
        'expectedMultiPage' => 1,
        'expectedBogusPage' => 1,
    };
}

# Access methods for the structural declarations of this Scraper engine.

# scraperRequest() - returns the request description hash ref.
sub scraperRequest { $scraperRequest }

# scraperFrame() - hands this engine's frame to the parent implementation
# and returns whatever that returns.
sub scraperFrame { $_[0]->SUPER::scraperFrame($scraperFrame); }

# cleanUpUsps($hit, $dat)
#   Field post-processor: runs $dat through trimLFs, strips one leading
#   field label ("County:", "Carrier Route:", "Delivery Point:",
#   "Check Digit:") and removes every '-->' together with the whitespace
#   before it.  Returns the cleaned string.
sub cleanUpUsps {
    my ($self, $hit, $dat) = @_;

    $dat = $self->trimLFs($hit, $dat);

    # Leading labels; each pattern is anchored at the start of the string.
    $dat =~ s/^County://gs;
    $dat =~ s/^Carrier Route://gs;
    $dat =~ s/^Delivery Point://gs;
    $dat =~ s/^Check Digit://gs;

    # Left-over comment terminators.
    $dat =~ s/\s*-->//gs;

    return $dat;
}

# parseCity($hit, $dat)
#   Field post-processor for the "city state zip" line.  After cleanUpUsps,
#   a trailing "<state> <5 digits>[ ]-<4 digits>" is split off: the state and
#   the ZIP+4 are plugged into $hit as 'state' and 'zipcode', and the
#   remaining city text is returned.
#
#   If the line does not have that shape, $dat is returned as cleaned and
#   nothing is plugged into $hit.  Previously $2/$3/$4 were read
#   unconditionally; after a failed substitution they still hold the captures
#   of an earlier successful match, so unrelated text could end up as the
#   state and ZIP code.
sub parseCity {
    my ($self, $hit, $dat) = @_;

    $dat = $self->cleanUpUsps($hit, $dat);

    if ( $dat =~ s/^(.*)\s+(\w+)\s+(\d\d\d\d\d)\s?(-\d\d\d\d)$/$1/s ) {
        # Copy the captures right away, before calling anything else.
        my ($state, $zip5, $plus4) = ($2, $3, $4);
        $hit->plug_elem('state',   $state);
        $hit->plug_elem('zipcode', "$zip5$plus4");
    }

    return $dat;
}

{
    package AddressDedup;
    # This package helps ZipPlus4.pl to de-duplicate the address list.
    # With minor or no modification, it might be useful to others, too.

    use Class::Struct;

    # Scalar fields.  _allColumns holds an array ref of the split input line
    # and _zipColumn the index of the ZIP column within it (see setValue).
    struct(
        'AddressDedup' => [
            'Address'     => '$',
            'City'        => '$',
            'State'       => '$',
            'Zip'         => '$',
            'Name'        => '$',
            '_allColumns' => '$',
            '_zipColumn'  => '$',
        ]
    );

    # isEqual($other)
    #   Returns 1 when Address, City, State and Zip of $other are all
    #   string-equal to this record's, otherwise 0.  Name is not compared.
    sub isEqual {
        my ($self, $other) = @_;
        return 0 unless ($self->_isEqualAddress($other->Address));
        return 0 unless ($self->_isEqualCity($other->City));
        return 0 unless ($self->_isEqualState($other->State));
        return 0 unless ($self->_isEqualZip($other->Zip));
        # return 0 unless ($self->_isEqualName($other->Name));
        return 1;
    }

    # Per-field string comparisons used by isEqual(); each takes the other
    # record's field value and compares it with 'eq'.
    sub _isEqualAddress {
        my ($self, $str) = @_;
        return ($self->Address eq $str);
    }

    sub _isEqualCity {
        my ($self, $str) = @_;
        return ($self->City eq $str);
    }

    sub _isEqualState {
        my ($self, $str) = @_;
        return ($self->State eq $str);
    }

    sub _isEqualZip {
        my ($self, $str) = @_;
        return ($self->Zip eq $str);
    }

    sub _isEqualName {
        my ($self, $str) = @_;
        return ($self->Name eq $str);
    }

    # setValue($colNums, $fullLine)
    #   Loads this record from one comma-separated line.  $colNums is a hash
    #   ref giving the column indexes under the keys 'colAddress', 'colCity',
    #   'colState' and 'colZip'.  All columns are remembered so that
    #   asString() can rebuild the line.  Name is not set here.
    sub setValue {
        my ($self, $colNums, $fullLine) = @_;
        chomp $fullLine;

        # Negative LIMIT keeps trailing empty fields; without it split drops
        # them and asString() would emit fewer columns than were read.
        my @cols = split /,/, $fullLine, -1;

        $self->_allColumns(\@cols);
        $self->Address($cols[$colNums->{'colAddress'}]);
        $self->City($cols[$colNums->{'colCity'}]);
        $self->State($cols[$colNums->{'colState'}]);
        $self->Zip($cols[$colNums->{'colZip'}]);
        $self->_zipColumn($colNums->{'colZip'});
    }

    # isEmpty()
    #   Returns 1 when none of Address, City, State, Zip and Name holds a
    #   true value, otherwise 0.
    sub isEmpty {
        my ($self) = @_;
        return 0 if $self->Address;
        return 0 if $self->City;
        return 0 if $self->State;
        return 0 if $self->Zip;
        return 0 if $self->Name;
        return 1;
    }

    # asString()
    #   Writes the current Zip into the remembered column list at the ZIP
    #   column index (modifying that stored list) and returns the columns
    #   joined with commas.
    sub asString {
        my ($self) = @_;
        my $allColumns = $self->_allColumns();
        $$allColumns[$self->_zipColumn] = $self->Zip;
        return join ',', @$allColumns;
    }
}

1;

__END__

=pod

=head1 NAME

WWW::Scraper::ZIPplus4 - Get ZIP+4 code, given street address, from www.usps.com.
Also helps de-duplicate a mailing list.

=head1 SYNOPSIS

=over 1

=item Simple

    use WWW::Scraper(qw(2.25));
    use WWW::Scraper::Request::ZIPplus4;

    my $ZIPplus4 = new WWW::Scraper(
         'ZIPplus4',
        ,{   'address1' => '1600 Pennsylvannia Ave'
            ,'city'     => 'Washington'
            ,'state'    => 'DC'
            ,'zipcode'  => '20500'
         } );

    while ( my $response = $ZIPplus4->next_response() ) {
        print $response->zipcode()."\n";
    }

=item Complete

    use WWW::Scraper(qw(2.25));
    use WWW::Scraper::Request::ZIPplus4;

    my $ZIPplus4 = new WWW::Scraper( 'ZIPplus4' );

    my $request = new WWW::Scraper::Request::ZIPplus4;
    # Note: Delivery_Address(), and either Zip_Code(), or City() and State(), are required.
    $request->address1('1600 Pennsylvannia Ave');
    $request->city('Washington');
    $request->state('DC');
    $request->zipcode('20500');

    $ZIPplus4->scraperRequest($request);
    while ( my $response = $ZIPplus4->next_response() ) {
        for ( qw(address city state zipcode county carrierRoute checkDigit deliveryPoint) ) {
            print "$_: ".${$response->$_()}."\n";
        }
    }

=back

=head1 DESCRIPTION

This class is a ZIPplus4 specialization of WWW::Scraper.
It handles making and interpreting ZIPplus4 searches on www.usps.com.

=head1 SPECIAL THANKS

=over 8

=item To Klemens Schmid (klemens.schmid@gmx.de), for FormSniffer.

This tool is an excellent complement to Scraper to almost instantly discover form and CGI parameters for configuring new Scraper modules.
It instantly revealed what I was doing wrong in the new ZIPplus4 format one day (after hours of my own clumsy attempts).
See FormSniffer at http://www.wap2web.de/formsniffer2.aspx (Win32 only).

=back

=head1 AUTHOR and CURRENT VERSION

C<WWW::Scraper::ZIPplus4> is written and maintained by Glenn Wood, http://search.cpan.org/search?mode=author&query=GLENNWOOD.

=head1 COPYRIGHT

Copyright (c) 2001 Glenn Wood  All rights reserved.

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut