#!/usr/local/bin/perl
# FILE %usr/unixonly/hp200lx/ps/getst.pl
#
# VERY simple script which can be used to fetch the Palm Pilot
# edition of "Der Standard"
#
# written:       1998-10-19
# latest update: 1998-10-30 18:14:20
#
# Usage: getst.pl [subdir]
#
#   Without an argument the crawl starts at $BASE/Titel.htm.
#   With an argument, the argument is appended to $BASE as a path
#   component and the crawl starts at $BASE/<subdir>/Seite1.htm.
#
# Every fetched article is stored as one record in a copy of
# template/standard.gdb, which is then saved as ps/<YYYYMMDD>.gdb.
#
# NOTE: this script intentionally keeps its package globals ($BASE,
# $today, $url, $txt, $date, $page, @URLS, ...) undeclared and does not
# enable "use strict": subs defined further down in this file may read
# these globals.
#

use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use lib '.';
use HP200LX::DB;

# Plain-text replacements for the HTML entities handled by ent2txt().
%ENT = (
    'auml'  => 'ae',
    'ouml'  => 'oe',
    'uuml'  => 'ue',
    'Auml'  => 'Ae',
    'Ouml'  => 'Oe',
    'Uuml'  => 'Ue',
    'szlig' => 'ss',
    'nbsp'  => ' ',
    'gt'    => '>',
    'lt'    => '<',
    'amp'   => '&',
    'quot'  => '"',
);

$ua = LWP::UserAgent->new;
# $ua->proxy ('http', 'http://falbala.wu-wien.ac.at');

# $BASE is both the prefix of the start URL and the crawl boundary
# enforced in get_standard().
$BASE = 'http://derstandard.at/Palm';
if (defined ($arg = shift (@ARGV))) {
    $BASE .= '/' . $arg;
    $WHAT = $BASE . '/Seite1.htm';
}
else {
    $WHAT = $BASE . '/Titel.htm';
}

@arts = get_standard ($ua, $WHAT);

# $today is set by get_standard() from the first dated article URL it
# fetches.  Without it the output file name would degenerate to
# "ps/.gdb", so stop here instead of silently writing that file.
unless (defined ($today) && length ($today)) {
    die "no dated article page could be fetched starting at $WHAT; nothing saved\n";
}

$db = HP200LX::DB::openDB ('template/standard.gdb');
tie (@db, 'HP200LX::DB', $db);

foreach $art (@arts) {
    # print "\n\n", '-' x 72, "\n", join (':', %$art), "\n";
    $db[$i++] = $art;
}

# Turn YYYY-MM-DD into YYYYMMDD for the output file name.
$today =~ s/-//g;
$db->refresh_viewpt (-1);
$db->saveDB ("ps/$today.gdb");

exit (0);

# ----------------------------------------------------------------------
# get_standard ($ua, $URL)
#
# Crawls pages starting at $URL, using the user agent $ua, and returns
# the list of article records that were collected.
#
# URLs are processed first-in first-out from the global @URLS queue.
# A URL is skipped when it was already visited or is pre-marked in
# %URLS, when it does not start with $BASE, or when it matches
# "Sport.htm".  Every successfully fetched page is handed to
# parse_page() and whatever that returns is appended to the queue.
# Pages whose URL ends in <YYYYMMDD>/<number>.htm are additionally
# passed to get_article(); the resulting record gets the fields Date
# (YYYY-MM-DD), Page (number, right-aligned to width 3) and Source.
#
# Side effects: prints each visited URL and any HTTP error page to
# STDOUT; sets the global $today to the date of the first article page
# if it is not set yet.
sub get_standard {
    my $ua  = shift;
    my $URL = shift;

    # URLs that must never be fetched, plus every URL already visited.
    my %URLS = (
        'http://www.comyan.com'                            => 'ignore',
        'http://derstandard.at/Palm/19981020/standard.htm' => 'ignore',
    );
    my (@rec, $art);

    push (@URLS, $URL);
    while ($url = shift (@URLS)) {
        next if (defined ($URLS{$url}));
        # \Q..\E: $BASE is a literal URL prefix (and contains the raw
        # command line argument), not a pattern.
        next unless ($url =~ /^\Q$BASE\E/);
        next if ($url =~ /Sport\.htm/);
        $URLS{$url} = '1';

        print ">>> $url\n";
        my $req = HTTP::Request->new ('GET', $url);
        my $res = $ua->request ($req);

        if ($res->is_success) {
            $txt = $res->content;
            push (@URLS, parse_page ($url, $txt));

            if ($url =~ /(\d{4})(\d{2})(\d{2})\/(\d+)\.htm$/) {
                $date = "$1-$2-$3";
                $page = $4;
                $today = $date unless ($today);

                $art = get_article ($txt);
                $art->{Date}   = $date;
                $art->{Page}   = sprintf ("%3d", $page);
                $art->{Source} = 'Standard';
                push (@rec, $art);
                # print ">>> txt=$txt\n";
                # last;
            }
        }
        else {
            print $res->error_as_HTML;
        }
    }

    @rec;
}

# ----------------------------------------------------------------------
# ent2txt ($ent)
#
# Returns the replacement text from %ENT for the entity name $ent
# (given without the surrounding "&" and ";").  Names without a true
# entry in %ENT are returned in their original "&name;" form.
sub ent2txt {
    my $ent = shift;

    $ENT{$ent} || "&$ent;";
}

sub get_article {
    my $html= shift;
    if ($html
=~ m#