package WWW::Google::News;

use strict;
use warnings;

require Exporter;
our @ISA       = qw(Exporter);
our @EXPORT_OK = qw(get_news get_news_greg_style get_news_for_topic);
our $VERSION   = '0.12';

use Carp;
use LWP;
use URI::Escape;

# Constructor: blesses an empty hash into the class and hands all arguments
# to init(). Returns the object, or undef if init() reports failure.
sub new {
    my $pkg  = shift;
    my $self = {};
    bless $self, $pkg;
    if (! $self->init(@_)) {
        return undef;
    }
    return $self;
}

# Stores the search options on the object. Accepts either a hash reference
# or a flat key/value list with the keys topic, start_date, end_date, sort
# and max. A false or missing max falls back to 20. Always returns 1.
sub init {
    my $self = shift;
    my $args = (ref($_[0]) eq "HASH") ? shift : {@_};

    $self->{'_topic'}      = $args->{'topic'};
    $self->{'_start_date'} = $args->{'start_date'};
    $self->{'_end_date'}   = $args->{'end_date'};
    $self->{'_sort'}       = $args->{'sort'};
    $self->{'_max'}        = $args->{'max'} || 20;

    return 1;
}

# The five accessors below are combined getters/setters. The stored value is
# replaced only when an argument is actually passed; a plain read such as
# $news->max() must not wipe the value (it previously assigned the result of
# shift unconditionally, storing undef on every read). Each returns the
# current value.

# Get or set the search term.
sub topic {
    my $self = shift;
    $self->{'_topic'} = shift if @_;
    return $self->{'_topic'};
}

# Get or set the start of the date range.
sub start_date {
    my $self = shift;
    $self->{'_start_date'} = shift if @_;
    return $self->{'_start_date'};
}

# Get or set the end of the date range.
sub end_date {
    my $self = shift;
    $self->{'_end_date'} = shift if @_;
    return $self->{'_end_date'};
}

# Get or set the sort order.
sub sort {
    my $self = shift;
    $self->{'_sort'} = shift if @_;
    return $self->{'_sort'};
}

# Get or set the maximum number of stories to return.
sub max {
    my $self = shift;
    $self->{'_max'} = shift if @_;
    return $self->{'_max'};
}

# Runs the search by passing the stored topic, date range, sort order and
# maximum to get_news_for_topic(), and returns whatever that returns.
sub search {
    my $self = shift;
    return get_news_for_topic(
        $self->{'_topic'},
        $self->{'_start_date'},
        $self->{'_end_date'},
        $self->{'_sort'},
        $self->{'_max'},
    );
}

# Fetches the Google News "lite" front page with an LWP user agent and
# returns nothing if the request fails.
#
# NOTE(review): from the $re1 literal onward this file appears to have lost
# text. The pattern below has no opening tag fragments, $re2, %URL, @results,
# $max, $flag and the MAIN loop label are used without any visible
# declaration, and get_news_greg_style() and the head of get_news_for_topic()
# are absent. Everything from the $re1 literal to the end of this section is
# left byte-for-byte untouched; restore it from the upstream distribution
# rather than guessing.
sub get_news {
    my $url = 'http://news.google.com/news/gnmainlite.html';
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/5.0');
    my $response = $ua->get($url);
    return unless $response->is_success;
    my $content = $response->content;
    my $results = {};
    my $re1 = '
]+)"?[^>]*>(.+?)
]+>]+>([^<]*?)(.*?)
]+>(.+?)\s*...\s*';
my @page_links = split /(\&start=\d+>)/mi,$content;
foreach my $pl (@page_links) {
if ($pl =~ /\&start=(\d+)>/) {
if (!exists($URL{$1})) {
$URL{$1} = 1;
}
}
}
my( $section ) = ( $content =~ m/$re1/s ) or next;
$section =~ s/\n//g;
my @stories = split /($re2)/mi,$section;
foreach my $story (@stories) {
if ($story =~ m/$re2/i) {
my $story_h = {};
my( $url, $headline, $source, $date, $summary ) = ( $1, $2, $3, $4, $5 );
_clean_string($source);
_clean_string($headline);
_clean_string($date);
_clean_string($summary);
$story_h->{url} = $url;
$story_h->{headline} = $headline;
$story_h->{source} = $source;
$story_h->{date} = $date;
$story_h->{description} = "$source: $summary";
$story_h->{summary} = $summary;
push(@results,$story_h);
last MAIN if $max>0 && scalar(@results)>=$max;
}
}
}
last MAIN unless $flag;
}
return \@results;
}
# Cleans a scraped text field IN PLACE: the argument is modified through the
# @_ alias, so callers pass the variable itself and ignore the return value.
#
# Steps: decode the common HTML entities (&nbsp; &quot; &#39; &amp;), turn
# line breaks into spaces, strip any remaining tags, then trim leading
# whitespace and a trailing " - " separator.
#
# The entity patterns previously replaced each character with itself
# (e.g. s/&/&/), so entities were never decoded.
# NOTE(review): the entity names are inferred from those identity
# replacements, and the <br> rule from the pattern that was split across two
# lines; confirm against the upstream distribution.
sub _clean_string {
    return unless defined $_[0];

    $_[0] =~ s/&nbsp;/ /ig;
    $_[0] =~ s/&quot;/"/ig;
    $_[0] =~ s/&#39;/'/g;
    # Decode &amp; last so "&amp;quot;" becomes "&quot;" and is not decoded twice.
    $_[0] =~ s/&amp;/&/ig;

    # Line breaks, whether markup or literal newlines, become single spaces.
    $_[0] =~ s/<br\s*\/?>/ /ig;
    $_[0] =~ s/\n/ /g;

    # Drop any remaining markup.
    $_[0] =~ s/<[^>]+>//g;

    # Trim trailing whitespace plus an optional dash, then leading whitespace.
    $_[0] =~ s/\s*-?\s*$//;
    $_[0] =~ s/^\s+//;

    return;
}
1;
__END__
=head1 NAME
WWW::Google::News - Access to Google's News Service (Not Usenet)
=head1 SYNOPSIS
# OO search interface
use WWW::Google::News;
my $news = WWW::Google::News->new();
$news->topic("Frank Zappa");
my $results = $news->search();
# original news functions
use WWW::Google::News qw(get_news get_news_for_topic);
my $results = get_news();
my $results = get_news_for_topic('impending asteroid impact');
=head1 DESCRIPTION
This module provides a couple of methods to scrape results from Google News, returning
a data structure similar to the following (which happens to be suitable for feeding into XML::RSS).
{
'Top Stories' =>
[
{
'url' => 'http://www.washingtonpost.com/wp-dyn/articles/A9707-2002Nov19.html',
'headline' => 'Amendment to Homeland Security Bill Defeated'
},
{
'url' => 'http://www.ananova.com/news/story/sm_712444.html',
'headline' => 'US and UN at odds as Iraq promises to meet deadline'
}
],
'Entertainment' =>
[
{
'url' => 'http://abcnews.go.com/sections/entertainment/DailyNews/Coburn021119.html',
'headline' => 'James Coburn Dies'
},
{
'url' => 'http://www.cbsnews.com/stories/2002/11/15/entertainment/main529532.shtml',
'headline' => '007s On Parade At \'Die\' Premiere'
}
]
}
=head1 METHODS
=over 4
=item search()
Perform search on Google News. Options for search term (topic), sort, date range, and maximum results. Scraper will maximize results per page, and will page through results until it gets enough stories. Internally uses get_news_for_topic().
use WWW::Google::News;
my $news = WWW::Google::News->new();
# these methods will get or set their values
$news->topic("Frank Zappa"); # search term
$news->sort("date"); # relevance or date, relevance is default
$news->start_date("2005-04-20"); # must provide start and end date,
$news->end_date("2005-04-20"); # changes default sort to date
$news->max(2); # max stories, default 20. -1 => all stories.
my $results = $news->search();
foreach (@{$results}) {
print "Source: " . $_->{source} . "\n";
print "Date: " . $_->{date} . "\n";
print "URL: " . $_->{url} . "\n";
print "Summary: " . $_->{summary} . "\n";
print "Headline: " . $_->{headline} . "\n";
print "\n";
}
=item get_news()
Scrapes L