#! /usr/bin/perl
use LWP::UserAgent;
require HTML::Parse;
require HTML::FormatText;
use HTML::Entities;
use Getopt::Long;
use File::Path;
use HTTP::Date qw(time2str str2time);
#use LWP::Debug qw(+);
=head1 NAME
webmirror - Simple WWW mirror program
=head1 SYNOPSIS
webmirror [--from URL] [--to directory] [--proxy] [--checklinks]
[--verbose]
=head1 DESCRIPTION
This program can be used to mirror parts of WWW servers to a local
directory. The traversal of the remote pages is recursive, that is,
linked pages are mirrored, too, if they start with the same prefix. For
example, if you mirror C,
then C would be
mirrored, too, but C would not.
These are the meanings of the options:
=over 4
=item --from
Where to mirror from. This has to be an HTTP URL.
=item --to
Where to put the mirrored files.
=item --proxy
Use the WWW proxy settings from the environment (http_proxy, no_proxy).
You should not use this if your proxy is a caching proxy, only if it is
a firewall proxy. Otherwise you may mirror old versions of the pages.
=item --checklinks
Check not only links that have the same prefix but one level of other
links, too. This is very useful for checking your own pages for invalid
outbound links.
=item --verbose
Print debugging information.
=back
=head1 DIAGNOSTICS
webmirror normally prints only errors, for example:
Getting http://www.math.fu-berlin.de/~leitner/mutt/idnex.html
Error: 404 not found
Linked from: http://www.math.fu-berlin.de/~leitner/index.html
=head1 SEE ALSO
L, L, L, L
=head1 COPYRIGHT
webmirror is Copyright (c) 1996 Felix von Leitner. All rights reserved.
libwww-perl is Copyright (c) 1995, 1996 Gisle Aas. All rights reserved.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
&GetOptions(
"from=s" => \$from,
"to=s" => \$to,
"proxy!" => \$proxy,
"checklinks!" => \$checklinks,
"verbose!" => \$verbose);
if ($#ARGV==1) {
$from = shift(@ARGV);
$to = shift(@ARGV);
}
if ($#ARGV==0) {
if (length($from)) {
$to = shift(@ARGV);
} else {
$from = shift(@ARGV);
}
}
if (not defined $from) {
print "Syntax: $0 [--from URL] [--to DIRECTORY] [--proxy] [--verbose] [--checklinks]\n";
exit 0;
}
$to =~ s/^\~/$ENV{HOME}/;
#$baseurl=$from;
$ua = new LWP::UserAgent;
$ua->agent("Fefe-Mini-Mirror/0.1 " . $ua->agent);
$ua->env_proxy if ($proxy);
push @links,$from;
push @linkedfrom,"command line";
$num=0;
while ($url = shift @links) {
$linkedfrom = shift @linkedfrom;
next if (exists $history{$url});
$history{$url}=1;
print STDERR "Getting $url\n" if ($verbose);
$req = new HTTP::Request 'GET' => $url;
if (not defined $baseurl) {
$baseurl = $url;
$baseurl =~ s,[^/]+$,, if ($baseurl =~ m,[^/]$,);
print "Setting baseurl to $baseurl\n" if ($verbose);
}
if ($url =~ m,^$baseurl,) {
my $rel=$url; $rel=~s/^$baseurl//;
$rel .= "/index.html" if ($rel=~m,/$, or $rel eq "");
# print STDERR " relative URL is $rel\n";
$dest = "$to/$rel"; $dest =~ s,//,/,g;
# print STDERR " destination is $dest\n";
if (($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat($dest)) {
$req->header('If-Modified-Since', time2str($mtime));
print " If-Modified-Since: ",time2str($mtime),"\n" if ($verbose);
}
}
$res = $ua->request($req); $success=$res->is_success;
if (not $url =~ m,^$baseurl,) {
if ($res->content =~ m/\<(moved|please update)\>/) {
print STDERR "Getting $url\n" if (not $verbose);
print STDERR " Error: page moved!\n";
print STDERR " Linked from: $linkedfrom\n";
next;
}
next if ($success);
}
if ($res->code == 500 or $res->code == 304) { # Kludge!
$success=1; $needtosave=0;
open FILE,$dest || die;
read FILE,$content,10*1024*1024;
close FILE;
$base="$baseurl$rel";
#print " Base: $base\n";
$mimetype="unknown";
$mimetype="text/html" if ($dest =~ m/\.html?$/);
# print STDERR "Content-Type: $mimetype\n";
} else {
$content=$res->content;
$needtosave=1;
$mimetype=$res->header('Content-type');
# print STDERR "Content-Type: $mimetype\n";
$time=$res->header('Last-Modified');
if (defined $time) {
$time=str2time($time);
# print " Last-Modified: $time\n" if ($verbose);
}
}
if ($success) {
my $rel=$url; $rel=~s/^$baseurl//;
$rel .= "/index.html" if ($rel=~m,/$, or $rel eq "");
$dest = "$to/$rel"; $dest =~ s,//,/,g;
#print STDERR " relative URL is $rel\n";
{ my $localdest=$dest;
$localdest =~ s,^$ENV{HOME},~,;
print STDERR " Saving to $localdest\n" if ($verbose and $needtosave);
}
{
my $path=$dest; $path =~ s,/[^/]+$,,;
mkpath($path);
}
if ($needtosave) {
{
my @tags = split(/,$content);
foreach $i (@tags) {
$i =~ s,$baseurl/*,/,;
$i =~ s,="",="index.html",;
$i =~ s,=(\s|>),=index.html$1,;
}
$content = join('<',@tags);
}
open FILE,">$dest" || die;
print FILE $content;
close FILE;
utime $time,$time,$dest;
$base = $res->base;
} else {
print " Not modified, not saving\n" if ($verbose);
}
my $html = HTML::Parse::parse_html($content);
# print " Base: $base\n";
print STDERR " Content-Type: $mimetype\n" if ($verbose);
if ($mimetype =~ m,^text/html$,) {
for ( @{ $html->extract_links } ) {
my($link, $elem) = @$_;
my $tag = uc $elem->tag;
# print " Link: $link -> \n";
$link = new URI::URL $link, $res->base;
# print $link->abs->as_string,"\n";
# print STDERR " found ",$link->abs->as_string,"\n";
my $Link = $link->abs->as_string;
$Link =~ s/#.*$//;
if ($Link =~ m/$baseurl/ or $checklinks) {
next if ($Link =~ m/^(mailto|news):/i);
push @links,$Link;
push @linkedfrom,$url;
}
}
}
# print content;
} else {
print STDERR "Getting $url\n" if (not $verbose);
print STDERR " Error: " . $res->code . " " . $res->message,"\n";
print STDERR " Linked from: $linkedfrom\n";
}
}