package WWW::Scraper::CraigsList;
#####################################################################
use strict;
use vars qw($VERSION @ISA);
@ISA = qw(WWW::Scraper);
use WWW::Scraper(qw(3.02 generic_option addURL trimTags));
use WWW::Scraper::FieldTranslation;
$VERSION = sprintf("%d.%02d", q$Revision: 1.16 $ =~ /(\d+)\.(\d+)/);
# Craigs List differs from other search engines in a few ways.
# One of them is that the results page is not tabulated or data-lined.
# It returns each job listing on a single line.
# This line can be parsed with a single regular expression, which is what we do.
#
# SAMPLE :
#
#
# Apr 24 - Senior Software Engineer (San Francisco) (internet engineering jobs)
#
#
# private
# NOTE: sometimes the response may read:
#
# craigslist: online community
# craigslist
# online community
# The requested function is offline for maintenance.
# Please try again a little later.
#
# NEW: by 2002.09.27
# Jobs - web-dev
# http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=eng&cat=14&group=J&type_search=&query=Perl&new_cat=14
# Jobs - software/QA/DBA/etc jobs
# http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=sof&cat=21&group=J&type_search=&query=Quality&new_cat=21
# Car stuff
# http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=car&cat=6&group=S&type_search=&query=honda+accord&new_cat=6&maxAsk=11000
# http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=car&cat=6&group=S&type_search=&query=Honda&new_cat=6&maxAsk=
# Request description returned by scraperRequest(): how to reach the
# CraigsList search form, the native form fields with their default values,
# and the field-name translations. How each key is interpreted is decided by
# WWW::Scraper, which is not visible in this file.
my $scraperRequest =
{
    # Type of query generation is 'POST'.
    'type' => 'POST'
    # This is the basic URL on which to build the query.
    # NOTE(review): the URL ends in '?' (GET style) although 'type' is 'POST' -- confirm.
    ,'url' => 'http://www.craigslist.org/cgi-bin/search?'
    # This is the Scraper attributes => native input fields mapping.
    ,'nativeQuery' => 'query'
    ,'nativeDefaults' =>
        {    'areaID'          => '1'
            ,'subAreaID'       => '0'
            ,'cat'             => 'all'
            ,'new_cat'         => '6'
            ,'catAbbreviation' => 'car'
            # This hash used to list 'group' twice ('S', then 'J'). Perl keeps
            # the last value for a repeated key, so 'J' has always been the
            # effective default; the dead 'S' entry has been removed.
            # NOTE(review): the car-category sample URLs use group=S -- confirm
            # which value is actually wanted with catAbbreviation 'car'.
            ,'group'           => 'J'
            ,'type_search'     => ''
            # NOTE(review): the sample URLs spell these minAsk/maxAsk -- confirm
            # the field names the search form really expects.
            ,'min_ask'         => '' # catAbbreviation='car'
            ,'max_ask'         => '' # catAbbreviation='car'
            ,'query'           => ''
        }
    # ,'defaultRequestClass' => 'Job'
    ,'fieldTranslations' =>
        { '*' =>
            {    '*'         => '*'
                ,'skills'    => 'query'
                # ,'payrate' => \&translatePayrate
                # Direct method call instead of the indirect-object form
                # "new Class(...)", which perlobj discourages.
                ,'locations' => WWW::Scraper::FieldTranslation->new('CraigsList', 'Job', 'locations')
            }
        }
    # Some more options for the Scraper operation.
    ,'cookies' => 0
};
# Parsing frame returned (through SUPER) by scraperFrame(). It is a nested
# array of [ TYPE, args..., [ sub-frames ] ] entries; the exact meaning of
# each TYPE is defined by WWW::Scraper, not by this file.
#
# NOTE(review): this section had been through an HTML stripper: the sample
# lines below had lost their '#' and the REGEX had lost every "<...>" piece,
# leaving an unbalanced ')'. The pattern has been reconstructed so that it
# has exactly five capture groups, matching the five-entry field list that
# follows it. The 'NEXT' and 'BODY' strings may have lost markup the same
# way but cannot be reconstructed from what is left -- confirm against a
# pristine copy of the module.
my $scraperFrame =
[ 'HTML',
    [
        [ 'NEXT', 'Next ' ]
       ,[ 'BODY', '', '' ,
            [
                [ 'COUNT', 'Found: (\d+)' ]
               ,[ 'HIT*' ,
                    [
                        # NEW: by 2002.09.27
                        # Sample hit lines as rendered (markup not shown):
                        #   May-18 =====>>2001 Honda XR 650L - $3500 (vallejo) <<cycles
                        #   Sep-26 Plugged In Enterprises Web Producer (East Palo Alto)
                        #
                        # Capture groups, in order: leading &nbsp; padding,
                        # the "Mon-DD" date, the link target, the link text
                        # and the text trailing the link. The names after the
                        # pattern are presumably assigned to the captures
                        # positionally, with undef discarding the first.
                        [ 'REGEX', '<p>\s*(&nbsp;)*(\w*?-\d+)[^<]*<a href=([^>]+)>(.*?)</a>([^<]*)',
                            undef, 'date', 'url', 'title', 'location'
                        ]
                    ]
                ]
            ]
        ]
    ]
];
# Returns a hash reference of test settings for this engine: a sample native
# query, the expected hit counts and the native option values to submit.
# Presumably consumed by the WWW::Scraper test harness -- verify there.
sub testParameters {
    # 'POST' style scraperFrames can't be tested because of a bug in WWW::Search(2.2[56]) !
    # NOTE(review): both arms of this ternary are 0, so SKIP is always 0 no
    # matter what isGlennWood() returns. Kept as-is to preserve behaviour;
    # confirm whether the false arm was meant to be 1.
    my $isNotTestable = WWW::Scraper::isGlennWood() ? 0 : 0;

    # Sample query URL:
    # http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=sof&cat=21&group=J&type_search=&query=Quality&new_cat=21
    return {
         'SKIP'              => $isNotTestable
        ,'testNativeQuery'   => 'Honda'
        ,'expectedOnePage'   => 50
        ,'expectedMultiPage' => 100
        ,'expectedBogusPage' => 0
        # http://www.craigslist.org/cgi-bin/search?areaID=1&subAreaID=0&catAbbreviation=car&group=S&type_search=&query=Honda&cat=6&minAsk=min&maxAsk=max
        # 'group' was listed twice here with the same value 'S'; the
        # redundant second entry has been dropped.
        ,'testNativeOptions' => {
             'areaID'          => '1'
            ,'subAreaID'       => '0'
            ,'group'           => 'S'
            ,'cat'             => 'all'
            ,'new_cat'         => '6'
            ,'catAbbreviation' => 'car'
            ,'type_search'     => ''
            ,'min_ask'         => 'min'
            ,'max_ask'         => 'max'
        }
        ,'usesPOST' => 1
    };
}
# Records the CraigsList home URL and logo name on the scraper object via
# its searchEngineHome/searchEngineLogo methods, then returns the object.
sub init {
    my $self = shift;

    $self->searchEngineHome('http://www.CraigsList.org');
    $self->searchEngineLogo('craigslist');

    return $self;
}
# Access methods for the structural declarations of this Scraper engine.
# Accessor: hands back the file-level $scraperRequest structure.
sub scraperRequest {
    return $scraperRequest;
}
# Accessor: passes the file-level $scraperFrame structure to the parent
# class's scraperFrame method and returns whatever that call returns.
sub scraperFrame {
    my $self = shift;
    return $self->SUPER::scraperFrame($scraperFrame);
}
# This engine declares no detail frame: always yields undef. In list
# context that is a one-element list holding undef, as before.
sub scraperDetail {
    return undef;
}
1;
__END__
=pod
=head1 NAME
WWW::Scraper::CraigsList - Scrapes CraigsList
=head1 SYNOPSIS
require WWW::Scraper;
$search = new WWW::Scraper('CraigsList');
=head1 DESCRIPTION
This class is a CraigsList specialization of WWW::Search.
It handles making and interpreting CraigsList searches
F