#############################################################################
## $Id: WaybackMachine.pm 6702 2006-07-25 01:43:27Z spadkins $
#############################################################################

use strict;
use warnings;

package WWW::WebArchive::WaybackMachine;

use WWW::WebArchive::Agent;

our $VERSION = "0.50";
our @ISA     = ("WWW::WebArchive::Agent");

use WWW::Mechanize;

# restore(): mirror every document the Internet Archive Wayback Machine has
# cached for a site into a local directory tree.
#
# Options hashref:
#   url     - site URL to restore (required; "http://" is prepended if no
#             scheme is given; a trailing slash is stripped)
#   dir     - base directory for the mirror (default: $self->{dir}, else ".");
#             the agent's {name} is appended as a subdirectory
#   verbose - verbosity level 0..3 (default: $self->{verbose}, else 0)
sub restore {
    &App::sub_entry if ($App::trace);
    my ($self, $options) = @_;

    # Resolve the target directory: option > object attribute > current dir.
    my $dir = $options->{dir};
    $dir = $self->{dir} if (!defined $dir);
    $dir = "."          if (!defined $dir);
    $dir .= "/$self->{name}";

    # Normalize the URL: strip a trailing slash, default the scheme to http.
    my $url = $options->{url} || die "restore(): URL not provided";
    $url =~ s!/$!!;
    if ($url !~ /^[a-z]+:/) {
        $url = "http://$url";
    }

    # Extract the host name, and from it the second-level domain (e.g.
    # "example.com" from "www.example.com"), used below to recognize which
    # archive links belong to this site.
    my $domain = $url;
    $domain =~ s!^[a-z]+://!!;
    $domain =~ s!/.*!!;
    my $seclvl_domain = $domain;
    if ($seclvl_domain =~ /([^\.]+\.[^\.]+)$/) {
        $seclvl_domain = $1;
    }

    my $verbose = $options->{verbose};
    $verbose = $self->{verbose} if (!defined $verbose);
    $verbose = 0                if (!defined $verbose);

    ###################################################################
    # Initialize User Agent
    ###################################################################
    my $ua = WWW::Mechanize->new();
    $ua->agent_alias("Windows IE 6");
    $ua->stack_depth(1);  # limit the number of pages we remember to 1 (one back() allowed)

    ###################################################################
    # Search Internet Archive Wayback Machine for cached documents
    ###################################################################
    my (%link, @links, $link);
    my ($link_text, $link_url);
    my $next_url = "";    # initialized so the first if ($next_url) test is warning-free
    my $done     = 0;

    print "Restoring [$url]\n" if ($verbose);
    $ua->get("http://web.archive.org/web/*sr_1nr_100/$url*");
    $self->check_status($ua);

    while (!$done) {
        @links = $ua->links();
        $done  = 1;
        foreach $link (@links) {
            $link_text = $link->text();
            $link_url  = $link->url_abs();
            printf("> Link: %-40s %s\n", $link_text, $link_url) if ($verbose >= 3);
            # \Q...\E: the domain is data, not a pattern.  Without it the
            # dots would match any character and could yield false matches.
            if ($link_url  =~ m!^http://web.archive.org/web/.*\Q$seclvl_domain\E! &&
                $link_text =~ m!\Q$seclvl_domain\E!) {
                printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose);
                $link{$link_text} = $link;
            }
            if ($link_text eq "Next") {
                $next_url = $link->url_abs();
            }
        }
        # Follow the "Next" pagination link, if one was seen, and rescan.
        if ($next_url) {
            #print "Next: $next_url\n";
            $ua->get($next_url);
            $self->check_status($ua);
            $done     = 0;
            $next_url = "";
        }
    }

    ###################################################################
    # Mirror cached documents to local file system
    ###################################################################
    my ($action, $file);
    foreach $link_text (sort keys %link) {
        $link     = $link{$link_text};
        $link_url = $link->url_abs();
        if ($link_url =~ m!^http://web.archive.org/web/([^/]+)/(.*)$!) {
            $action = $1;    # a numeric timestamp, or a special tag like "*hh_"
            $file   = $2;
            # Skip links that are almost certainly directory indexes rather
            # than real documents.
            if ($file =~ m!/$!) {
                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
            }
            elsif ($file =~ m!/[^/\\\.]+$!) {
                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
            }
            elsif ($file =~ m!/\?[DMNS]=[DA]$!) {
                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
            }
            else {
                if ($action eq "*hh_") {
                    $self->mirror($ua, "http://web.archive.org/http://$file", $file, $dir, $domain);
                    #print "Getting historical versions [$link_url] ...\n" if ($verbose >= 1);
                    #$ua->get($link_url);
                    #$self->check_status($ua);
                    #if ($ua->success()) {
                    #    @links = $ua->links();
                    #    foreach $link (@links) {
                    #        $link_text2 = $link->text();
                    #        $link_url2 = $link->url_abs();
                    #        if ($link_url2 =~ m!^http://web.archive.org/web/.*$domain! &&
                    #            $link_text2 =~ m!$domain!) {
                    #            #printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose);
                    #            printf("> Link: %-40s %s\n", $link_text2, $link_url2);
                    #            #$link{$link_text} = $link;
                    #        }
                    #    }
                    #}
                    #else {
                    #    print "Can't get URL [$link_url]\n";
                    #}
                }
                elsif ($action =~ /^[0-9]+$/) {
                    $self->mirror($ua, $link_url, $file, $dir, $domain);
                }
                else {
                    print "Unknown link type [$link_url]\n";
                }
            }
        }
        else {
            print "Unknown link type [$link_url]\n";
        }
    }
    &App::sub_exit() if ($App::trace);
}

# mirror(): fetch one archived URL and write it to "$basedir/$file", unless
# that file already exists (and $App::options{clobber} is not set).  HTML
# responses are passed through clean_html() before being written.
sub mirror {
    &App::sub_entry if ($App::trace);
    my ($self, $ua, $url, $file, $basedir, $domain) = @_;
    if (! -f "$basedir/$file" || $App::options{clobber}) {
        $ua->get($url);
        $self->check_status($ua);
        if ($ua->success()) {
            my $content      = $ua->content();
            my $content_type = $ua->ct();
            if ($content_type eq "text/html") {
                $content = $self->clean_html($content, $file, $domain);
            }
            my $len = length($content);
            $self->write_file("$basedir/$file", $content);
            print "Wrote file [$file] ($len bytes)\n";
        }
        else {
            print "Missed file [$file]\n";
        }
    }
    else {
        print "File exists [$file]\n";
    }
    &App::sub_exit() if ($App::trace);
}

# clean_html(): strip Wayback Machine additions from an archived HTML page.
#
# NOTE(review): the substitution patterns below appear to have been garbled
# in transit -- the HTML-tag text inside them (e.g. <head>, <html>, <body>,
# the BASE tag) seems to have been stripped, and the sub is truncated at the
# end of the visible source.  The tokens are preserved here exactly as found;
# verify/restore them against the upstream WWW::WebArchive distribution
# before relying on this sub.
sub clean_html {
    &App::sub_entry if ($App::trace);
    my ($self, $html, $file, $domain) = @_;
    # Unix files. No CR's allowed.
    $html =~ s/\r//g;
    # clean up weird additions to . Unfortunately, this wipes out real uses of the tag in the original doc.
    $html =~ s###;
    $html =~ s#]*>\s*##si;  # the first one was put in by Internet Archive
    $html =~ s#<(BASE [^<>]*)>\s*##si;  # there may be a real tag. keep in comment. all URL's must be relative.
    #$html =~ s##\n#;
    # clean up the spacing to get rid of extraneous lines
    $html =~ s#\s*#\n#si;
    $html =~ s#\s*#\n#si;
    $html =~ s#\s*#\n#si;
    $html =~ s#\s*#\n#;
    # remove a really odd background="foo.html" attribute from the 
    $html =~ s#]*) background="[^"]*.html?"#]>\s*##s;
    $html =~ s#\s*##s;
    $html =~ s#]*>\s*