#! /usr/bin/perl use LWP::UserAgent; require HTML::Parse; require HTML::FormatText; use HTML::Entities; use Getopt::Long; use File::Path; use HTTP::Date qw(time2str str2time); #use LWP::Debug qw(+); =head1 NAME webmirror - Simple WWW mirror program =head1 SYNOPSIS webmirror [--from URL] [--to directory] [--proxy] [--checklinks] [--verbose] =head1 DESCRIPTION This program can be used to mirror parts of WWW servers to a local directory. The traversal of the remote pages is recursive, that is, linked pages are mirrored, too, if they start with the same prefix. For example, if you mirror C, then C would be mirrored, too, but C would not. These are the meanings of the options: =over 4 =item --from Where to mirror from. This has to be an HTTP URL. =item --to Where to put the mirrored files. =item --proxy Use the WWW proxy settings from the environment (http_proxy, no_proxy). You should not use this if your proxy is a caching proxy, only if it is a firewall proxy. Otherwise you may mirror old versions of the pages. =item --checklinks Check not only links that have the same prefix but one level of other links, too. This is very useful for checking your own pages for invalid outbound links. =item --verbose Print debugging information. =back =head1 DIAGNOSTICS webmirror normally prints only errors, for example: Getting http://www.math.fu-berlin.de/~leitner/mutt/idnex.html Error: 404 not found Linked from: http://www.math.fu-berlin.de/~leitner/index.html =head1 SEE ALSO L, L, L, L =head1 COPYRIGHT webmirror is Copyright (c) 1996 Felix von Leitner. All rights reserved. libwww-perl is Copyright (c) 1995, 1996 Gisle Aas. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut &GetOptions( "from=s" => \$from, "to=s" => \$to, "proxy!" => \$proxy, "checklinks!" => \$checklinks, "verbose!" => \$verbose); if ($#ARGV==1) { $from = shift(@ARGV); $to = shift(@ARGV); } if ($#ARGV==0) { if (length($from)) { $to = shift(@ARGV); } else { $from = shift(@ARGV); } } if (not defined $from) { print "Syntax: $0 [--from URL] [--to DIRECTORY] [--proxy] [--verbose] [--checklinks]\n"; exit 0; } $to =~ s/^\~/$ENV{HOME}/; #$baseurl=$from; $ua = new LWP::UserAgent; $ua->agent("Fefe-Mini-Mirror/0.1 " . $ua->agent); $ua->env_proxy if ($proxy); push @links,$from; push @linkedfrom,"command line"; $num=0; while ($url = shift @links) { $linkedfrom = shift @linkedfrom; next if (exists $history{$url}); $history{$url}=1; print STDERR "Getting $url\n" if ($verbose); $req = new HTTP::Request 'GET' => $url; if (not defined $baseurl) { $baseurl = $url; $baseurl =~ s,[^/]+$,, if ($baseurl =~ m,[^/]$,); print "Setting baseurl to $baseurl\n" if ($verbose); } if ($url =~ m,^$baseurl,) { my $rel=$url; $rel=~s/^$baseurl//; $rel .= "/index.html" if ($rel=~m,/$, or $rel eq ""); # print STDERR " relative URL is $rel\n"; $dest = "$to/$rel"; $dest =~ s,//,/,g; # print STDERR " destination is $dest\n"; if (($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat($dest)) { $req->header('If-Modified-Since', time2str($mtime)); print " If-Modified-Since: ",time2str($mtime),"\n" if ($verbose); } } $res = $ua->request($req); $success=$res->is_success; if (not $url =~ m,^$baseurl,) { if ($res->content =~ m/\<(moved|please update)\>/) { print STDERR "Getting $url\n" if (not $verbose); print STDERR " Error: page moved!\n"; print STDERR " Linked from: $linkedfrom\n"; next; } next if ($success); } if ($res->code == 500 or $res->code == 304) { # Kludge! $success=1; $needtosave=0; open FILE,$dest || die; read FILE,$content,10*1024*1024; close FILE; $base="$baseurl$rel"; #print " Base: $base\n"; $mimetype="unknown"; $mimetype="text/html" if ($dest =~ m/\.html?$/); # print STDERR "Content-Type: $mimetype\n"; } else { $content=$res->content; $needtosave=1; $mimetype=$res->header('Content-type'); # print STDERR "Content-Type: $mimetype\n"; $time=$res->header('Last-Modified'); if (defined $time) { $time=str2time($time); # print " Last-Modified: $time\n" if ($verbose); } } if ($success) { my $rel=$url; $rel=~s/^$baseurl//; $rel .= "/index.html" if ($rel=~m,/$, or $rel eq ""); $dest = "$to/$rel"; $dest =~ s,//,/,g; #print STDERR " relative URL is $rel\n"; { my $localdest=$dest; $localdest =~ s,^$ENV{HOME},~,; print STDERR " Saving to $localdest\n" if ($verbose and $needtosave); } { my $path=$dest; $path =~ s,/[^/]+$,,; mkpath($path); } if ($needtosave) { { my @tags = split(/),=index.html$1,; } $content = join('<',@tags); } open FILE,">$dest" || die; print FILE $content; close FILE; utime $time,$time,$dest; $base = $res->base; } else { print " Not modified, not saving\n" if ($verbose); } my $html = HTML::Parse::parse_html($content); # print " Base: $base\n"; print STDERR " Content-Type: $mimetype\n" if ($verbose); if ($mimetype =~ m,^text/html$,) { for ( @{ $html->extract_links } ) { my($link, $elem) = @$_; my $tag = uc $elem->tag; # print " Link: $link -> \n"; $link = new URI::URL $link, $res->base; # print $link->abs->as_string,"\n"; # print STDERR " found ",$link->abs->as_string,"\n"; my $Link = $link->abs->as_string; $Link =~ s/#.*$//; if ($Link =~ m/$baseurl/ or $checklinks) { next if ($Link =~ m/^(mailto|news):/i); push @links,$Link; push @linkedfrom,$url; } } } # print content; } else { print STDERR "Getting $url\n" if (not $verbose); print STDERR " Error: " . $res->code . " " . $res->message,"\n"; print STDERR " Linked from: $linkedfrom\n"; } }