#!/usr/bin/env perl # ABSTRACT: analyze several HTML documents based on the same template # PODNAME: untemplate use 5.010; use strict; use utf8::all; use warnings qw(all); use Carp qw(croak); use File::Basename; use File::Temp; use Getopt::Long; use HTML::Linear; use IO::Interactive qw(is_interactive); use Class::Load qw(try_load_class); use Pod::Usage; use Term::ANSIColor qw(:constants); use Tie::IxHash; ## no critic (ProhibitDeepNests, ProhibitPackageVars) our $VERSION = '0.016'; # VERSION GetOptions( q(help) => \my $help, q(color!) => \my $color, q(html!) => \my $html, q(encoding=s) => \my $encoding, q(partial!) => \my $partial, q(shrink!) => \my $shrink, q(strict!) => \my $strict, q(unmangle=s) => \my @unmangle, ) or pod2usage(q(-verbose) => 1); pod2usage(q(-verbose) => 1) if $help or $#ARGV < 1; $color //= is_interactive(*STDOUT); if ($html) { (%HTML::Linear::Path::xpath_wrap) = (%{$HTML::Linear::Path::Colors::scheme{html}}); $color = 0; print $HTML::Linear::Path::Colors::html[0]; } elsif ($color) { (%HTML::Linear::Path::xpath_wrap) = (%{$HTML::Linear::Path::Colors::scheme{terminal}}); $html = 0; } try_load_class('YADA') and fetch_documents(); tie my %elem, 'Tie::IxHash'; parse_files(\%elem); tie my %xpath, 'Tie::IxHash'; build_xpath(\%elem, \%xpath); for my $xpath (keys %xpath) { dump_diffs($xpath, \%xpath); } print $HTML::Linear::Path::Colors::html[1] if $html; sub fetch_documents { my (@local, @remote); for (@ARGV) { if (m{^https?://}x) { push @remote, $_; } else { push @local, $_; } } return unless @remote; ## no critic (RequireLocalizedPunctuationVars) @ARGV = @local; my $q = YADA->new; for (@remote) { my $tmp = File::Temp->new( SUFFIX => '.html', TEMPLATE => 'doc-XXXX', TMPDIR => 1, ); $q->append(sub { YADA::Worker->new({ initial_url => $_, on_init => sub { $_[0]->setopt(writedata => $tmp); }, on_finish => sub { $tmp->flush; push @ARGV, $tmp unless $_[0]->has_error; }, }) }); } $q->wait; return; } sub parse_files { my ($elem) = @_; for my $file (@ARGV) { my $hl = HTML::Linear->new; $hl->set_shrink if $shrink // 1; $hl->set_strict if $strict // 0; open(my $fh, '<:' . ($encoding ? "encoding($encoding)" : 'utf8' ), $file) or croak "Can't open $file: $!"; $hl->parse_file($fh); close $fh; push @{$elem->{$_}}, [ $_ => basename($file) ] for $hl->as_list; } return; } sub build_xpath { my ($elem, $xpath) = @_; while (my ($key, $list) = each %$elem) { for (@{$list}) { my ($el, $file) = @{$_}; if (@unmangle) { for my $path (@{$el->path}) { for my $attr (keys %{$path->attributes}) { ## no critic (ProtectPrivateSubs) next unless HTML::Linear::Path::_isgroup($el->path->[-1], $attr); for my $unmangle (@unmangle) { $path->attributes->{$attr} =~ s/$unmangle//x; } } } } my $hash = $el->as_hash; ++$xpath->{$_}->{$hash->{$_}}{$file} for keys %{$hash}; } } return; } sub dump_diffs { my ($xpath, $xpath_ref) = @_; my %file; my $m = 0; my $n = 0; for my $p (keys %{$xpath_ref->{$xpath}}) { for my $q (keys %{$xpath_ref->{$xpath}->{$p}}) { push @{$file{$q}}, $p; ++$m; } ++$n; } my $flag = 0; $flag = 1 if $n == $m / scalar @ARGV; $flag = 1 if not ($partial // 0) and scalar keys %file != scalar @ARGV; return if $flag; if (1 < scalar keys %file) { if ($html) { say '' . HTML::Linear::Path::Colors::wrap_xpath($xpath) . ''; } else { say $xpath; } for my $file (sort keys %file) { for (@{$file{$file}}) { if ($html) { say '' . $file . '' . HTML::Linear::Path::Colors::wrap_content($_, 1) . ''; } else { if ($color) { print GREEN . $file . RESET; $_ = HTML::Linear::Path::Colors::wrap_content($_); } else { print $file; } say "\t${_}"; } } } if ($html) { say ''; } else { say ''; } } return; } __END__ =pod =encoding utf8 =head1 NAME untemplate - analyze several HTML documents based on the same template =head1 VERSION version 0.016 =head1 SYNOPSIS untemplate [options] HTML1 HTML2 [HTML3] [...] =head1 DESCRIPTION Takes multiple HTML documents generated using the same template and attempts to extract only the data inserted into original template. Accepts URL if L is present. =head1 OPTIONS =over 4 =item --help This. =item --encoding=name Specify the HTML document encoding (C, C). UTF-8 is assumed by default. =item --[no]color Enable syntax highlight for XPath. By default, enabled automatically on interactive terminals. =item --[no]html Disables the C<--color> option and highlights using HTML/CSS. =item --[no]partial Enable the display of "partial" templates, that is, nodes present in B documents. By default, only the nodes present in B documents are displayed. =item --[no]shrink Shrink the XPath to the minimal unique identifier. For example: /html/body[@id='cpansearch']/form[@class='searchbox']/input[@name='query'] Could be shortened as: //input[@name='query'] The shrinking is enabled by default. =item --[no]strict Strict mode disables grouping by C, C or C attributes. The grouping is enabled by default. =item --unmangle=regex Specify regex(es) to unmangle C/C attributes. Some CMS (WordPress) insert unique identifiers into HTML elements, like: This tend to break HTML tree analysis. To fix the above case, use C<--unmangle 'post-id-\d+'>. Multiple unmanglers are accepted (C<--unmangle a --unmangle b>). =back =head1 EXAMPLES untemplate --color http://bash.org/?1839 http://bash.org/?2486 | less -R =head1 CAVEATS Trying to I HTML documents B based on the same template, the results will be empty. Unfortunately, employing any kind of document identifier as part of element class/id (common practice in L themes) is enough to constitute "not same template". See the C<--unmangle> option for a work-around. =head1 AUTHOR Stanislaw Pusep =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2013 by Stanislaw Pusep. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut