#!/bin/env perl -w require 5.004; use strict; #------------------------------------------------------------------------------ # # Start of POD # #------------------------------------------------------------------------------ =head1 NAME sitemapper.pl - script for generating site maps =head1 SYNOPSIS sitemapper.pl [ -verbose ] [ -help ] [ -doc ] [ -depth ] [ -proxy ] [ -[no]envproxy ] [ -agent ] [ -authen ] [ -format ] [ -summary ] [ -title ] [ -email ] [ -gui ] -url =cut =head1 DESCRIPTION B generates site maps for a given site. It traverses a site from the root URL given as the -site option and generates an HTML page consisting of a bulleted list which reflects the structure of the site. The structure reflects the distance from the home page of the pages listed; i.e. the first level bullets are pages accessible directly from the home page, the next level, pages accessible from those pages, etc. Obviously, pages that are linked from "higher" up pages may appear in the "wrong place" in the tree, than they "belong". The -format option can be used to specify alternative options for formating the site map. Currently the options are html (as described above - the default), js, which uses Jef Pearlman's (jef@mit.edu) Javascript Tree class to display the site map as a collapsable tree, and text (plain text). =head1 OPTIONS =head2 -depth Option to specify the depth of the site map generated. If no specified, generates a sitemap of unlimited depth. =head2 -email Option to specify the e-mail address which is reported by the robot to the site it gets pages from. =head2 -url Option to specify a root URL to generate a site map for. =head2 -proxy Specify an HTTP proxy to use. =head2 -[no]envproxy If -envproxy is set, the proxy specified by the $http_proxy environment variable will be used (this is the default behaviour). Use -noenvproxy to suppress this. -proxy takes precedence over -envproxy. =head2 -agent Allows the user to specify an agent for the robot to pretend to be (e.g. 'Mozilla/4.5'). This can be necessary for sites that do browser sniff for serving particular content, etc. =head2 -format Option for specifying the for the site map. Possible values are: =over 4 =item html Plain old HTML bulleted list. =item js A collapsable DHTML tree, generated using Jef Pearlman's (jef@mit.edu) Javascript Tree class. =item text Plain text. =item xml An XML graph of linkage between pages. =back =head2 -summary Automatically extract a summary to display with the title. This will be truncated at the specified number of characters. =head2 -title Option to specify a page title for the site map. =head2 -authen Option to use LWP::AuthenAgent to get HTML pages. This allows the user to type username / password for pages that are access controlled. =head2 -gui Use a Tk GUI to run sitemapper. =head2 -help Display a short help message to standard output, with a brief description of purpose, and supported command-line switches. =head2 -doc Display the full documentation for the script, generated from the embedded pod format doc. =head2 -version Print out the current version number. =head2 -verbose Turn on verbose error messages. =head1 ENVIRONMENT B makes use of the C<$http_proxy> environment variable, if it is set. =head1 PREREQUISITES Date::Format HTML::Entities Getopt::Long IO::File LWP::AuthenAgent LWP::UserAgent Pod::Usage URI::URL WWW::Sitemap =head1 OSNAMES hpux 10 PA-RISC1.1 linux 2.2.1 ppc-linux linux 2.2.2 i686-linux MSWin32 4.0 MSWin32-x86 sunos 4.1.4 sun4-sunos sunos 5.6 sun4-solaris =head1 SEE ALSO Jef Pearlman's Javascript Tree class (http://developer.netscape.com/docs/examples/dynhtml/tree.html) =head1 BUGS The Javascript sitemap has only been tested on Netscape 4.05. =head1 AUTHOR Ave Wrigley EAve.Wrigley@itn.co.ukE =head1 COPYRIGHT Copyright (c) 1998 Canon Research Centre Europe. All rights reserved. This script is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SCRIPT CATEGORIES Web =cut #------------------------------------------------------------------------------ # # End of POD # #------------------------------------------------------------------------------ use Date::Format; use Getopt::Long; use HTML::Entities; use URI::URL; require WWW::Sitemap; require IO::File; #------------------------------------------------------------------------------ # # Public global variables # #------------------------------------------------------------------------------ use vars qw( $NAME $VERSION $CONTACT $WHEN $HEADER $FOOTER %DOC2POD %FORMATS ); # command line options - see pod use vars qw ( $opt_verbose $opt_version $opt_help $opt_doc $opt_authen $opt_depth $opt_title $opt_summary $opt_format $opt_url $opt_email $opt_proxy $opt_envproxy $opt_agent $opt_output $opt_gui ); #------------------------------------------------------------------------------ # # Initialize global variables # #------------------------------------------------------------------------------ ( $NAME ) = $0 =~ m{([^/]+)$}; $CONTACT = 'Ave.Wrigley@itn.co.uk'; $VERSION = '1.019'; $WHEN = time2str( "on %A the %o of %B %Y at %r", time ); $HEADER = sub { my $title = shift; return < $title

$title


HTML_HEADER }; $FOOTER = <
$NAME version $VERSION $CONTACT
Generated $WHEN
FOOTER %DOC2POD = ( doc => [ 2, 0 ], help => [ 1, 0 ], usage => [ 0, 1 ], ); %FORMATS = ( html => [ 'HTML Files', [ '.html', '.htm' ] ], text => [ 'Text Files', [ '.txt', '.text' ] ], js => [ 'HTML Files', [ '.html', '.htm' ] ], xml => [ 'XML Files', [ '.xml' ] ], ); #------------------------------------------------------------------------------ # # Set command line option defaults # #------------------------------------------------------------------------------ $opt_verbose = 0; $opt_authen = 0; $opt_envproxy = 1; #------------------------------------------------------------------------------ # # Display hashes - these hashes are used to print out sitemap, using # $opt_format as a key # #------------------------------------------------------------------------------ my %print_start_all_lists = ( 'js' => sub { print '"[' }, 'html' => sub { print < START_LIST }, 'text' => sub { }, ); my %print_end_all_lists = ( 'js' => sub { print ']"' }, 'html' => sub { print < END_LIST }, 'text' => sub { }, ); my %print_start_list = ( 'js' => sub { print '[' }, 'html' => sub { print '
    ' }, 'text' => sub { }, ); my %print_end_list = ( 'js' => sub { print '],' }, 'html' => sub { print '
' }, 'text' => sub { }, ); my %print_node = ( 'js' => sub { my $url = shift; my $depth = shift; my $title = shift || "[No Title]"; my $summary = shift || "[No Summary]"; # ditch the funny stuff $title = encode_entities( $title, "^a-z0-9A-Z " ); $summary = encode_entities( $summary, "^a-z0-9A-Z " ); print "'
$title
$summary
',"; }, 'html' => sub { my $url = shift; my $depth = shift; my $title = shift || "[No Title]"; my $summary = shift || "[No Summary]"; print <
$title
$summary
HTML_NODE }, 'text' => sub { my $url = shift; my $depth = shift; my $title = shift; print " " x $depth, $url, "::", $title, "\n"; return; }, ); my %print_page_start = ( 'js' => sub { my $title = shift; print $HEADER->( $title ); print join( '', ); print < firstTree = new Tree ( { id: "sitemap", items: JS ; }, 'html' => sub { print $HEADER->( shift ); }, 'text' => sub { my $title = shift; print "$title\n", "-" x 80, "\n"; }, ); my %print_page_end = ( 'js' => sub { print <
+ Click to expand sub-pages
- Click to contract sub-pages
o No sub-pages
$FOOTER
JAVASCRIPT_FOOTER }, 'html' => sub { print $FOOTER, < HTML_FOOTER }, 'text' => sub { my $title = shift; print "-" x 80, "\n"; print "Generated ", $WHEN, "\n"; print "$NAME version $VERSION $CONTACT\n"; }, ); #============================================================================== # # Some utility functions # #============================================================================== #------------------------------------------------------------------------------ # # verbose - print a message to STDERR, if the -verbose flag is set # #------------------------------------------------------------------------------ sub verbose { print STDERR @_, "\n" if $opt_verbose; }; #------------------------------------------------------------------------------ # # autoloader for documentation stuff # #------------------------------------------------------------------------------ sub AUTOLOAD { use vars qw( $AUTOLOAD ); require 'Pod/Usage.pm'; import Pod::Usage; my ( $function ) = $AUTOLOAD =~ m/.*::(.*)/; return unless exists( $DOC2POD{ $function } ); pod2usage( 'verbose' => $DOC2POD{ $function }[ 0 ], 'exitval' => $DOC2POD{ $function }[ 1 ] ); } #------------------------------------------------------------------------------ # # check_options - check that a command line option conforms for a specified # format # #------------------------------------------------------------------------------ sub check_options { my $option_name = shift; my $options = shift; my $default = shift; eval "\$opt_$option_name ||= '$default'"; my $regex = '(' . join( '|', @$options ) . ')' ; eval <$opt_output" ); select OUTPUT_FH; } usage( '-url argument is required' ) unless $opt_url; my $ua = create_useragent(); my $sitemap = new WWW::Sitemap EMAIL => $opt_email || 'your@email.address', USERAGENT => $ua, ROOT => $opt_url, SUMMARY_LENGTH => $opt_summary || 200, DEPTH => $opt_depth, VERBOSE => $opt_verbose, or die "new WWW::Sitemap failed\n"; if ( defined $callback ) { $sitemap->url_callback( sub { my ( $url, $depth, $title, $summary ) = @_; $callback->( "$url ...\n" ); } ); } $sitemap->generate(); print_sitemap( $sitemap ); } #------------------------------------------------------------------------------ # # create_useragent() - create the useragent, and set the http proxy, if # necessary # #------------------------------------------------------------------------------ sub create_useragent { my $ua; if ( $opt_authen ) { require LWP::AuthenAgent; $ua = new LWP::AuthenAgent; } else { require LWP::UserAgent; $ua = new LWP::UserAgent; } # Set the proxy from the environment or the proxy option if ( defined( $opt_proxy ) ) { verbose( "proxy = $opt_proxy ..." ); $ua->proxy( [ 'http' ], $opt_proxy ); } elsif ( $opt_envproxy and exists( $ENV{ http_proxy } ) ) { verbose( "getting proxy from environment ..." ); verbose( "proxy = $ENV{ http_proxy } ..." ); $ua->env_proxy(); } else { verbose( "no proxy ..." ); $ua->no_proxy(); } if ( $opt_agent ) { verbose( "Setting agent to $opt_agent ..." ); $ua->agent( $opt_agent ); } return $ua; } #------------------------------------------------------------------------------ # # print_sitemap() - print the sitemap, according to $opt_format # #------------------------------------------------------------------------------ sub print_sitemap() { my $sitemap = shift; # Print out the link graph, if $opt_format is 'xml' ... if ( $opt_format eq 'xml' ) { print_xml_link_graph( $sitemap ); return; } $print_page_start{ $opt_format }->( defined( $opt_title ) ? $opt_title : "Site map for $opt_url" ); $print_start_all_lists{ $opt_format }->( ); $sitemap->traverse( sub { my ( $sitemap, $url, $depth, $flag ) = @_; if ( $flag == 0 ) { $print_start_list{ $opt_format }->( ); } elsif( $flag == 1 ) { my $title = $sitemap->title( $url ); my $summary = $sitemap->summary( $url ); $print_node{ $opt_format }->( $url, $depth, $title, $summary ); } elsif( $flag == 2 ) { $print_end_list{ $opt_format }->( ); } } ); $print_end_all_lists{ $opt_format }->( ); $print_page_end{ $opt_format }->( ); } #------------------------------------------------------------------------------ # # print_xml_link_graph - print an XML format graph of all the URLs and links # #------------------------------------------------------------------------------ sub print_xml_link_graph { my $sitemap = shift; printf <root(); ROOT for my $from_url ( $sitemap->urls() ) { for my $to_url ( $sitemap->links( $from_url ) ) { print < LINK } } for my $url ( $sitemap->urls() ) { my $title = $sitemap->title( $url ); my $summary = $sitemap->summary( $url ); $title = encode_entities( $title ); $summary = encode_entities( $summary ); print < URL } } #------------------------------------------------------------------------------ # # report_error - pop up a Tk error dialog box # #------------------------------------------------------------------------------ sub report_error { my $mw = shift; my $msg = shift; my $errorBox = $mw->Dialog( -title => 'Error: ', ); $errorBox->configure( -wraplength => '4i', -text => $msg, ); $errorBox->Show; } #------------------------------------------------------------------------------ # # get_options_from_gui() # #------------------------------------------------------------------------------ sub get_options_from_gui { require Tk; import Tk; require Tk::ROText; require Tk::Dialog; # the main window my $mw = MainWindow->new; # the about diolog my $diAbout = $mw->Dialog( -title => 'About: ', ); $diAbout ->configure( -wraplength => '4i', -text => < Tk GUI version by Nicholas Marshall This script is free software; you can redistribute it and/or modify it under the same terms as Perl itself. ABOUT_TEXT ); # the menu bar my $menu = $mw->Frame( -relief => 'raised', -borderwidth => 2 ); $menu->pack( -fill => 'x' ); my $menuFile = $menu->Menubutton( -text => 'File' , -underline => 0 ); $menuFile->command( -label =>'Exit', -command => [ sub{ exit }, 'Exit' ] ); $menuFile->pack( -side => 'left' ); my $menuHelp = $menu->Menubutton( -text => 'Help', -underline => 0 ); $menuHelp->command( -label =>'About', -command => [ sub{ $diAbout->Show } ] ); $menuHelp->pack( -side => 'right' ); # the url widget ... my $urlFrame = $mw->Frame; $urlFrame->pack( -fill => 'x' ); my $urlLabel = $urlFrame->Label( -text => 'Enter target URL:' ); $urlLabel->pack( -side => 'left'); my $urlEntry = $urlFrame->Entry( -width => 40, -textvariable => \$opt_url, ); $urlEntry->pack( -side => 'right'); # the email widget ... my $emailFrame = $mw->Frame; $emailFrame->pack( -fill => 'x'); my $emailLabel = $emailFrame->Label( -text => 'Enter your email:' ); $emailLabel->pack( -side => 'left'); my $emailEntry = $emailFrame->Entry( -textvariable => \$opt_email, ); $emailEntry->pack( -side => 'right'); # the depth widget ... my $depthFrame = $mw->Frame; $depthFrame->pack( -fill => 'x'); my $depthLabel = $depthFrame->Label( -text => 'Enter how deep to go:' ); $depthLabel->pack( -side => 'left'); my $depthEntry = $depthFrame->Entry( -textvariable => \$opt_depth, ); $depthEntry->pack( -side => 'right'); # the format widget ... my $formatRB = $mw->Frame; $formatRB->pack( -fill => 'x' ); my $formatLabel = $formatRB->Label( -text => 'Output Format?' ); foreach my $format ( keys %FORMATS ) { $formatRB->Radiobutton( -text => $format, -variable => \$opt_format, -relief => 'flat', -value => lc( $format ), )->pack(-side => 'right' ); } $formatLabel->pack( -side => 'left'); # the output widget ... my $outputFrame = $mw->Frame( -relief => 'flat', -borderwidth => 0 ); $outputFrame->pack( -fill => 'x'); my $outputLabel = $outputFrame->Label( -text => 'Select the output file:' ); $outputLabel->pack( -side => 'left'); my $outputEntry = $outputFrame->Entry( -width => 20, -textvariable => \$opt_output, ); $outputEntry->pack( -side => 'left'); my $outputButton = $outputFrame->Button( -text => 'Browse..', -command => sub{ my $file = $mw->getSaveFile( -filetypes => [ $FORMATS{ $opt_format }, [ 'All files', '*' ] ], -initialfile => 'Untitled', -defaultextension => $FORMATS{ $opt_format }->[ 1 ][ 0 ] ); if ( defined $file and $file ne '' ) { $outputEntry->delete( 0, 'end' ); $outputEntry->insert( 0, $file ); $outputEntry->xview( 'end' ); } }, ); $outputButton->pack( -side => 'left'); my $status_text; my $pid; my $statusBox = $mw->Scrolled( 'ROText' ); my $attackButton = $mw->Button( -text => 'Generate Sitemap', -command => sub { report_error( $mw, "No URL Specified" ) and return unless $opt_url ; report_error( $mw, "No format specified" ) and return unless $opt_format ; # report_error( $mw, "No depth specified" ) and return # unless $opt_depth # ; report_error( $mw, "No output specified" ) and return unless $opt_output ; $statusBox->insert( 'end', "Generating sitemap of $opt_url\n" ); $statusBox->yview( 'end' ); create_sitemap( sub { my $text = shift; $statusBox->insert( 'end', $text ); $statusBox->yview( 'end' ); $mw->update(); } ); $statusBox->insert( 'end', "Sitemap of $opt_url written to $opt_output\n" ); $statusBox->yview( 'end' ); $mw->Dialog( -text => 'Sitemap generation complete', -buttons => [ 'Exit' ], )->Show(); exit; } ); $attackButton->pack; $statusBox->pack; MainLoop(); } #============================================================================== # # End of subroutines # #============================================================================== #============================================================================== # # JavaScript Code - Jef Pearlman's (jef@mit.edu) Tree class # http://developer.netscape.com/docs/examples/dynhtml/tree.html # #============================================================================== __END__