package WWW::Google::News;

use strict;
use warnings;

require Exporter;

our @ISA       = qw(Exporter);
our @EXPORT_OK = qw(get_news get_news_greg_style get_news_for_topic);
our $VERSION   = '0.07';

use Carp;
use LWP;
use URI::Escape;

# get_news()
#
# Fetches the Google News "lite" front page and returns a reference to a
# hash keyed on section name.  Each value is a reference to an array of
# story hashes with the keys 'url' and 'headline'.  Returns nothing (undef
# in scalar context) when the HTTP request is not successful.
sub get_news {
    my $url      = 'http://news.google.com/news/gnmainlite.html';
    my $ua       = LWP::UserAgent->new;
    my $response = $ua->get($url);
    my $results  = {};
    return unless $response->is_success;

    # NOTE(review): $re2 below has a single capture group, yet the loop reads
    # both $1 and $2.  The HTML tag literals of these patterns look like they
    # were stripped from this copy of the file -- TODO restore them from the
    # upstream distribution.  The strings are kept exactly as found.
    my $re1 = '\s* (.*?)\s*';
    my $re2 = '
\s*(.*?)
';

    # split() keeps the separators because the pattern is wrapped in a
    # capture group, so section headings and section bodies alternate in
    # the resulting list.
    my @sections        = split /($re1)/m, $response->content;
    my $current_section = '';

    foreach my $section (@sections) {
        if ( $section =~ m/$re1/m ) {
            # A heading: remember it for the stories that follow.
            $current_section = $1;
        }
        else {
            my @stories = split /($re2)/mi, $section;
            foreach my $story (@stories) {
                if ( $story =~ m/$re2/mi ) {
                    my $story_h = {};
                    $story_h->{url}      = $1;
                    $story_h->{headline} = $2;

                    # push autovivifies the per-section array on first use,
                    # so a section only appears once it has a story.
                    push @{ $results->{$current_section} }, $story_h;
                }
            }
        }
    }

    return $results;
}

# get_news_greg_style()
#
# Same data as get_news(), but each section maps to a hash keyed on the
# 1-based story number instead of an array.  Returns an empty hash reference
# when get_news() returns nothing.
sub get_news_greg_style {
    # Fall back to an empty hash explicitly rather than relying on the
    # dereference below to autovivify an undefined result.
    my $results      = get_news() || {};
    my $greg_results = {};

    foreach my $section ( keys %$results ) {
        $greg_results->{$section} = {};
        my $cnt = 0;
        foreach my $story_h ( @{ $results->{$section} } ) {
            $cnt++;
            $greg_results->{$section}->{$cnt} = $story_h;
        }
    }

    return $greg_results;
}

# get_news_for_topic( $topic )
#
# Queries Google News for $topic (URI-escaped before being placed in the
# query string) and returns a reference to an array of story hashes with the
# keys 'url', 'headline', 'source', 'date' and 'description'.  Returns
# nothing when the HTTP request is not successful, and a reference to an
# empty array when the results section cannot be located in the page.
sub get_news_for_topic {
    my $topic   = uri_escape( $_[0] );
    my @results = ();

    my $url = "http://news.google.com/news?hl=en&edition=us&q=$topic";
    my $ua  = LWP::UserAgent->new();
    $ua->agent('Mozilla/5.0');
    my $response = $ua->get($url);
    return unless $response->is_success;

    # NOTE(review): $re2 has four capture groups but five captures are read
    # below, and it contains literal newlines although every newline is
    # removed from $section before matching.  The HTML tag literals of these
    # patterns look like they were stripped from this copy of the file --
    # TODO restore them from the upstream distribution.  The strings are
    # kept exactly as found.
    my $re1 = '
(.+)<.*br>
';
    my $re2 = '(.*?)
]+>]+>([^<]*?)([^<]*?)
]+>\s*...\s*(.*?)';

    my ($section) = ( $response->content =~ m/$re1/s );

    # Nothing to parse if the page layout did not match; avoid operating on
    # an undefined value below.
    return \@results unless defined $section;

    $section =~ s/\n//g;

    my @stories = split /($re2)/mi, $section;
    foreach my $story (@stories) {
        if ( $story =~ m/$re2/i ) {
            my $story_h = {};
            my ( $url, $headline, $source, $date, $summary )
                = ( $1, $2, $3, $4, $5 );

            # Normalise whitespace in the source and date fields.
            $source =~ s/ / /g;
            $source =~ s/\s+/ /g;
            $date   =~ s/ / /g;
            $date   =~ s/\s+/ /g;
            $date   =~ s/-//g;

            # The first substitution's pattern is a literal newline, kept
            # exactly as found in the file; the second strips markup.
            $summary =~ s#
# #gi;
            $summary =~ s#<.+?>##gi;

            $story_h->{url}         = $url;
            $story_h->{headline}    = $headline;
            $story_h->{source}      = $source;
            $story_h->{date}        = $date;
            $story_h->{description} = "$source: $summary";
            push( @results, $story_h );
        }
    }

    return \@results;
}

1;

__END__

=head1 NAME

WWW::Google::News - Access to Google's News Service (Not Usenet)

=head1 SYNOPSIS

  use WWW::Google::News qw(get_news get_news_for_topic);

  my $results       = get_news();
  my $topic_results = get_news_for_topic('impending asteroid impact');

=head1 DESCRIPTION

This module provides a couple of methods to scrape results from Google News,
returning a data structure similar to the following (which happens to be
suitable to feeding into XML::RSS).

  {
    'Top Stories' => [
      {
        'url'      => 'http://www.washingtonpost.com/wp-dyn/articles/A9707-2002Nov19.html',
        'headline' => 'Amendment to Homeland Security Bill Defeated'
      },
      {
        'url'      => 'http://www.ananova.com/news/story/sm_712444.html',
        'headline' => 'US and UN at odds as Iraq promises to meet deadline'
      }
    ],
    'Entertainment' => [
      {
        'url'      => 'http://abcnews.go.com/sections/entertainment/DailyNews/Coburn021119.html',
        'headline' => 'James Coburn Dies'
      },
      {
        'url'      => 'http://www.cbsnews.com/stories/2002/11/15/entertainment/main529532.shtml',
        'headline' => '007s On Parade At \'Die\' Premiere'
      }
    ]
  }

=head1 METHODS

=over 4

=item get_news()

Scrapes L<http://news.google.com/news/gnmainlite.html> and returns a reference
to a hash keyed on News Section, which points to an array of hashes keyed on
URL and Headline.

=item get_news_for_topic( $topic )

Queries L<http://news.google.com/news> for results on a particular topic, and
returns a pointer to an array of hashes containing result data.

An RSS feed can be constructed from this very easily:

  use WWW::Google::News qw(get_news_for_topic);
  use XML::RSS;

  my $results = get_news_for_topic( $topic );
  my $rss = XML::RSS->new;
  $rss->channel(title => "Google News -- $topic");
  for (@{$results}) {
    $rss->add_item(
      title       => $_->{headline},
      link        => $_->{url},
      description => $_->{description},
    );
  }
  print $rss->as_string;

=item get_news_greg_style()

It also provides a method called get_news_greg_style() which returns the same
data, only using a hash keyed on story number instead of the array described
in the above.

=back

=head1 TODO

* Implement an example RSS feed. -- Done, see above

* Seek out a good psychologist so we can work through Greg's obsession with
hashes.

=head1 AUTHORS

Greg McCarroll, Bowen Dwelle

=head1 KUDOS

Darren Chamberlain for rss_alternate.pl

Leon Brocard for pulling me up on my obsessive compulsion to use hashes.

=head1 SEE ALSO

L L

=cut