#!/usr/bin/perl

use strict;
use warnings;

use WWW::Mechanize;
use Getopt::Long;
use Pod::Usage;
use OpenGuides::RDF::Reader;
use XML::RSS;
use Data::Dumper;
use OpenGuides;
use OpenGuides::Config;
use CGI::Wiki::Plugin::Locator::UTM;

our $VERSION = 0.04;

my $help         = 0;
my $list_version = 0;
my $days         = 0;
my $scrape       = 0;
my ( $site, $config_file );

GetOptions(
    'site=s'   => \$site,
    'config=s' => \$config_file,
    'days=i'   => \$days,
    'scrape!'  => \$scrape,
    'help+'    => \$help,
    'version!' => \$list_version,
) || pod2usage( -verbose => 0 );

if ($list_version) {
    print STDERR "$0 version $VERSION\n\n";
    exit 0;
}

# Both --site and --config are mandatory; show usage when either is missing.
# --help may be repeated ('help+'), which raises the pod2usage verbosity.
pod2usage( -verbose => $help ) if $help || !( $site && $config_file );

=head1 NAME

og_mirror - Replicate an OpenGuides site

=head1 SYNOPSIS

  og_mirror --site http://from.site.url/ --config /path/to/wiki.conf [--days 1] [--scrape]

=head1 DESCRIPTION

This is a script to mirror the contents from another OpenGuides website.
It can be run from a cron job to update periodically.

To initially load the wiki, run the script without the --days option.
Then, the script can be run periodically with the --days option, to keep
the site in line.

=head1 OPTIONS

=over 4

=item C<--site>

Specify the guide website to mirror from.

=item C<--config | -c>

Path to the config file for the wiki on the localhost.

=item C<--days | -d>

Number of days back to look at in the RSS feed. Omit this option to work
in "hoover" mode.

=item C<--scrape>

If the OpenGuides site is prior to 0.51, it will not support format=raw.
Specify the option --scrape to use HTML scraping of the edit form of
action=edit instead.

=item C<--help | -h>

Show this list of options.

=item C<--help --help | -h -h>

Display man page.

=item C<--version>

Show the mirror script's version number

=back

=head1 HISTORY

  0.01  18-Oct-2005  Initial version
  0.02  19-Oct-2005  Exclude updates for pages that haven't changed
  0.03  29-Oct-2005  Add HTML scraping option for guides that don't have format=raw
                     Check source URL against database for mirroring multiple guides

=head1 BUGS

Please report any bugs in this package using http://rt.cpan.org/ or posting to
bugs-openguides-rdf-reader (at) rt.cpan.org.

=head1 SUPPORT

For discussion of all matters relating to OpenGuides, there is a mailing list
http://openguides.org/mm/listinfo/openguides-dev.

=head1 AUTHOR

    Ivor Williams
    CPAN ID: IVORW

    ivorw-openguides (at) xemaps.com
    http://openguides.org/

=head1 COPYRIGHT

Copyright (C) Ivor Williams, 2005

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

=cut

my $agent  = WWW::Mechanize->new();
my $config = OpenGuides::Config->new( file => $config_file );
my $guide  = OpenGuides->new( config => $config );
my $wiki   = $guide->wiki;

my $locator = CGI::Wiki::Plugin::Locator::UTM->new;
$wiki->register_plugin( plugin => $locator );
$locator->og_config($config);

# With --days only the pages named in the RSS feed are processed; without it
# every page listed in the remote index is processed.
my @pagelist = $days
    ? get_recent_changes( $agent, $site, $days )
    : get_all_pages( $agent, $site );

# Unbuffer STDOUT so each page name shows up before its (slow) fetch starts.
$| = 1;

PAGE:
for my $page (@pagelist) {
    chomp $page;
    print $page, ":";

    my %meta = eval { get_page_metadata( $agent, $site, $page ) };
    if ($@) {
        print "Failed to parse metadata\n$@\n";
        next PAGE;
    }

    # print Dumper \%meta;

    # A failed content fetch must only skip this page, not abort the run.
    my $text = eval {
        $scrape
            ? scrape_page_content( $agent, $site, $page )
            : get_page_content( $agent, $site, $page );
    };
    if ($@) {
        print "Failed to fetch content\n$@\n";
        next PAGE;
    }
    if ( !defined $text ) {

        # The scraper found no 'content' input; never write undef to the wiki.
        print "No content found\n";
        next PAGE;
    }

    # print $text;

    populate_local_wiki( $wiki, $page, $text, \%meta, $locator );
}

# _fetch( $ua, $request_url )
#
# GET $request_url with the WWW::Mechanize object $ua and return the response
# body. Dies with the HTTP status line when the request is unsuccessful, so
# that an error page is never mistaken for guide data. (If the agent has
# autocheck enabled, get() itself dies first.)
#
# NOTE(review): callers interpolate page names into the query string without
# URI-escaping them; names containing '&', ';', '#' or '+' may be sent
# incorrectly - confirm against the page names the remote guide emits.
sub _fetch {
    my ( $ua, $request_url ) = @_;

    my $response = $ua->get($request_url);
    die "GET $request_url failed: " . $response->status_line . "\n"
        unless $response->is_success;

    return $ua->content;
}

# _first_value( $value )
#
# Return the first element when $value is an array ref, otherwise $value
# itself; undef becomes the empty string so callers can compare the result
# without "uninitialized" warnings.
sub _first_value {
    my ($value) = @_;

    $value = $value->[0] if ref($value) eq 'ARRAY';
    return defined $value ? $value : '';
}

# get_all_pages( $ua, $url )
#
# Fetch the plain-text index of the guide at $url and return the list of page
# names, one per non-empty line of the response.
sub get_all_pages {
    my ( $ua, $url ) = @_;

    my $index = _fetch( $ua, "$url?action=index;format=plain" );

    # Accept CRLF as well as LF line endings and ignore blank lines.
    return grep { length } split /\r?\n/, $index;
}

# get_recent_changes( $ua, $url, $days )
#
# Fetch the RSS feed of the guide at $url for the last $days days and return
# the item titles in the reverse of feed order.
sub get_recent_changes {
    my ( $ua, $url, $days ) = @_;

    my $rss = XML::RSS->new;
    $rss->parse( _fetch( $ua, "$url?action=rss;days=$days" ) );

    return reverse map { $_->{title} } @{ $rss->{items} };
}

# get_page_metadata( $ua, $url, $page )
#
# Fetch the RDF representation of $page and return whatever parse_rdf makes
# of it (the caller stores the result in a hash). parse_rdf is not defined in
# this file; presumably it is exported by OpenGuides::RDF::Reader.
# Dies if the fetch fails or parse_rdf dies.
sub get_page_metadata {
    my ( $ua, $url, $page ) = @_;

    my $rdf = _fetch( $ua, "$url?id=$page;format=rdf" );
    return parse_rdf($rdf);
}

# get_page_content( $ua, $url, $page )
#
# Return the body of $page as served with format=raw.
# Dies if the fetch fails.
sub get_page_content {
    my ( $ua, $url, $page ) = @_;

    return _fetch( $ua, "$url?id=$page;format=raw" );
}

# scrape_page_content( $ua, $url, $page )
#
# Fallback for guides without format=raw: load the edit form of $page and
# return the value of the first form input named 'content'. Returns undef
# (in scalar context) when no form has such an input.
# Dies if the fetch fails.
sub scrape_page_content {
    my ( $ua, $url, $page ) = @_;

    _fetch( $ua, "$url?id=$page;action=edit" );

    for my $form ( $ua->forms ) {
        my $input = $form->find_input('content');
        return $input->value if $input;
    }

    return;
}

# populate_local_wiki( $wiki, $page, $content, $metadata, $pl )
#
# Create or update the local node for $page.
#
#   $wiki     - local wiki object
#   $page     - page name as used in the remote URL
#   $content  - node text to store
#   $metadata - hash ref of metadata fetched for the remote page
#   $pl       - locator plugin whose pre_write hook runs before the write
#
# An existing node is left alone when its stored 'source' differs from the
# remote page's source (it belongs to a different guide, or was not mirrored),
# or when its stored 'version' equals the remote version (nothing changed).
# Progress is reported on STDOUT.
sub populate_local_wiki {
    my ( $wiki, $page, $content, $metadata, $pl ) = @_;

    # Wrap plain scalar values in array refs; existing references pass through.
    my %metadata;
    for my $key ( keys %$metadata ) {
        my $value = $metadata->{$key};
        $metadata{$key} = ref($value) ? $value : [$value];
    }

    my $node     = $wiki->formatter->node_param_to_node_name($page);
    my %old_data = $wiki->retrieve_node($node);

    if ( $old_data{version} ) {
        my $from          = _first_value( $old_data{metadata}{source} );
        my $remote_source = _first_value( $metadata->{source} );
        if ( $from ne $remote_source ) {
            print "Skipping as source URL is $from\n";
            return;
        }

        # Missing versions count as 0 so the numeric comparison never warns.
        my $stored_version = _first_value( $old_data{metadata}{version} ) || 0;
        my $remote_version = _first_value( $metadata->{version} )         || 0;
        if ( $stored_version == $remote_version ) {
            print "Unchanged\n";
            return;
        }

        print "Updating... ";
    }
    else {
        print "Creating... ";
    }

    $pl->pre_write( $node, $content, \%metadata );
    my $written =
        $wiki->write_node( $node, $content, $old_data{checksum}, \%metadata );
    print $written ? "Done\n" : "Failed\n";

    return;
}