package WWW::Search::Pagesjaunes; use strict; use Carp qw(carp croak); use HTML::Form; use WWW::Mechanize; use HTML::TokeParser; use HTTP::Request::Common; use LWP::UserAgent; $WWW::Search::Pagesjaunes::VERSION = '0.14'; sub ROOT_URL() { 'http://www.pagesjaunes.fr' } sub new { my $class = shift; my $self = {}; my $ua = shift() || WWW::Mechanize->new( env_proxy => 1, keep_alive => 1, timeout => 30, agent => "WWW::Search::Pagesjaunes/$WWW::Search::Pagesjaunes::VERSION", ); $self->{ua} = $ua; $self->{limit} = 50; $self->{fast} = 0; $self->{error} = 1; $self->{lang} = 'FR'; bless( $self, $class ); } sub agent { my $self = shift; if ( $_[0] ) { my $old = $self->{ua}; $self->{ua} = $_[0]; return $old; } else { return $self->{ua}; } } sub find { my $self = shift; my %opt = @_; my $p = $opt{activite} ? 'j' : 'b'; # Make the first request to pagesjaunes.fr $self->{URL} = ROOT_URL . "/p$p.cgi"; if ( $self->{fast} ) { $self->{req} = POST( $self->{URL}, [ faire => 'decode_input_image', DEFAULT_ACTION => $p . 'f_inscriptions_req', lang => $self->{lang}, pays => 'FR', srv => uc("p$p"), TYPE_RECHERCHE => 'ZZZ', input_image => '', FRM_ACTIVITE => $p eq 'j' ? $opt{activite} : undef, FRM_NOM => $opt{nom}, FRM_PRENOM => $p eq 'b' ? $opt{prenom} : undef, FRM_ADRESSE => $opt{adresse}, FRM_LOCALITE => $opt{localite}, FRM_DEPARTEMENT => $opt{departement}, #'${p}F_INSCRIPTIONS_REQ.x' => 1, #'${p}F_INSCRIPTIONS_REQ.y' => 1, ]); } else { my $req = $self->{ua}->get($self->{URL}); if ( !$req->content || !$req->is_success ) { croak('Error while retrieving the HTML page'); } my @forms = HTML::Form->parse( $req->content, $self->{URL} ); # BooK finds the form by grepping thru all of them, instead # of limiting ourselves to the first and second form. my ($form) = grep { $_->find_input('lang') } @forms; eval { # HTML::Form complains when you change hidden fields values. local $^W; $form->value( 'lang', $self->{lang} ); $form->value( 'FRM_ACTIVITE', $opt{activite} ) if $opt{activite}; $form->value( 'FRM_NOM', $opt{nom} ); $form->value( 'FRM_PRENOM', $opt{prenom} ) if !$opt{activite}; $form->value( 'FRM_ADRESSE', $opt{adresse} ); $form->value( 'FRM_LOCALITE', $opt{localite} ); $form->value( 'FRM_DEPARTEMENT', $opt{departement} ); }; croak "Cannot fill the pagesjaunes request form. try with the 'fast' option\n" if $@; $self->{limit} = $opt{limit} || $self->{limit}; $self->{req} = $form->click; } return $self; } sub results { my $self = shift; my $result_page = $self->{ua}->request( $self->{req} )->content; my $parser = HTML::TokeParser->new( \$result_page ); # All the
tags are transformed to '§¤§', to separate # multiple phone numbers $parser->{textify} = { 'br' => sub() { '§¤§' } }; my @results; if ( $self->{limit} == 0 ) { $self->{has_more} = 0; return @results; } # XXX This is a really crude parsing of the data, but it seems to # get the job done. # # # # # #
# # # # # # # # # #
Name 
Address(télécopie)? Phone
#
# $self->{has_more} = 0; while ( my $token = $parser->get_tag("table") ) { next unless $token->[1] && $token->[1]{class} && $token->[1]{class} eq 'fdinscr'; { # We're inside an entry table $parser->get_tag("td"); # The first is the name my $name = _trim( $parser->get_trimmed_text('/td') ); $parser->get_tag("td"); # The second is ignored $parser->get_tag("td"); # The third is the address my $address = _trim( $parser->get_trimmed_text('/td') ); $address =~ s/\W*\|.*$//g; $parser->get_tag("td"); # The fourth is the phone number my $phone = _trim( $parser->get_trimmed_text('/td') ); my @phones = map { _trim($_); s/\.(\s*\d)/$1/; $_ } split(/§¤§/, $phone); # The fifth tag is either the mail or the descr, depending # on the class my @emails = (''); my $tag = $parser->get_tag("td"); if ( $tag->[1]{class} && $tag->[1]{class} eq 'txtinscr'){ my $email = _trim( $parser->get_trimmed_text('/td') ); @emails = map { _trim($_); s/Mail\s*:\s*//; $_ } split(/§¤§/, $email); } push( @results, WWW::Search::Pagesjaunes::Entry->new( $name, $address, [ @phones ], [ @emails ] ) ); return @results if --$self->{limit} == 0; } } foreach my $form ( HTML::Form->parse( $result_page, $self->{URL} ) ) { if ( $form->find_input('faire') && $form->value('faire') eq 'decode_input_image' ) { $self->{has_more} = 1; $self->{req} = $form->click(); } } # If there was no result, we look for an error message in the HTML page if ( !@results && $self->{error} ) { $parser = HTML::TokeParser->new( \$result_page ); while ( my $token = $parser->get_tag("font") ) { next unless $token->[1] && $token->[1]{color} && $token->[1]{color} eq '#ff0000'; $parser->{textify} = { 'br' => sub() { " " } }; carp _trim( $parser->get_trimmed_text('/font') ) . "\n"; } } wantarray ? @results : $results[0]; } sub _trim { $_[0] =~ s/\xa0/ /g; # Transform the   into whitespace $_[0] =~ s/^\s*|\s*$//g; $_[0] =~ s/\s+/ /g; $_[0]; } sub limit { my $self = shift; $self->{limit} = $_[0] || $self->{limit}; } sub has_more { $_[0]->{has_more} } package WWW::Search::Pagesjaunes::Entry; # The entry object is a blessed array with the following indices: # 0 - Name # 1 - Address # 2 - Arrayref of phone numbers # 3 - E-mail (pj) # 4 - Notes (pj) sub new { my $class = shift; bless [ @_ ], $class } sub name { $_[0]->[0] } sub address { $_[0]->[1] } sub phone { $_[0]->[2] } sub email { $_[0]->[3] } sub entry { # Name Address First email Phones $_[0]->[0], $_[0]->[1], $_[0]->[3]->[0], @{ @{ $_[0] }[2] }, } 1; __END__ =pod =head1 NAME WWW::Search::Pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr =head1 SYNOPSIS use WWW::Search::Pagesjaunes; my $pj = new WWW::Search::Pagesjaunes; $pj->find( activite => "Plombier", localite => "Paris" ); do { print $_->entry . "\n" foreach ($pj->results); } while $pj->has_more; =head1 DESCRIPTION The WWW::Search::Pagesjaunes provides name, phone number and addresses of French telephone subscribers by using the L directory. =head1 METHODS Two classes are used in this module, a first one (WWW::Search::Pagesjaunes) to do the fetching and parsing, and the second one and a second one (WWW::Search::Pagesjaunes::Entry) holding the entry infos. Here are the methods for the main WWW::Search::Pagesjaunes module: =over 4 =item new() The constructor accept an optional LWP::UserAgent as argument, if you want to provide your own. =item find( %request ) Here are the values for the %request hash that are understood. They each have two name, the first is the french one and the second is the english one: =over 4 =item nom / name Name of the person you're looking for. =item activite / business Business type of the company you're looking for. Note that if this field is filled, the module searches in the yellow pages. =item localite / town Name of the town you're searching in. =item prenom / firstname First name of the person you're looking for. It is not set if you set the 'activite' field. =item departement / district Name or number of the Département or Région you're searching in. =back =item results() Returns an array of WWW::Search::Pagesjaunes::Entry containing the first matches of the query. =item limit($max_number_of_entries) Set the maximum number of entries returned. Default to 50. =item has_more() If the query leads to more than a few results, the field has_more is set. You can then call the results() method again to fetch the datas. =back The WWW::Search::Pagesjaunes::Entry class has six methods: =over 4 =item new($name, $address, $phone, $fax) Returns a new WWW::Search::Pagesjaunes::Entry. =item name Returns the name of the entry. =item address Returns the address of the entry. =item phone Returns the phone number of the entry. =item is_fax Returns true if the phone number is a fax one, false otherwise. Note that currently, this method always returns 0. =item entry($separator) Returns the concatenation of the name and the phone number, separated by " - ". You can specify your own separator as first argument. =back =head1 BUGS The phone numbers are sometimes not correctly parsed, esp. when one entry has several phone numbers. If you found a bug and want to report it or send a patch, you are encouraged to use the CPAN Request Tracker interface: L =head1 COPYRIGHT Please read the Publisher information of L available at the following URL: L WWW::Search::Pagesjaunes is Copyright (C) 2002, Briac Pilpré This module is free software; you can redistribute it or modify it under the same terms as Perl itself. =head1 AUTHOR Briac Pilpré =cut