#!/usr/bin/perl package HTML::Diff; $VERSION = '0.55'; use strict; use Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(line_diff word_diff html_word_diff); # This list of tags is taken from the XHTML spec and includes # all those for which no closing tag is expected. In addition # the pattern below matches any tag which ends with a slash / our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link col colgroup frame input isindex area embed img bgsound marquee); use Algorithm::Diff 'sdiff'; sub member { my ($item, @list) = @_; return scalar(grep {$_ eq $item} @list); } sub html_word_diff { my ($left, $right) = @_; # Split the two texts into words and tags. my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm; my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm; @leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } @leftchks; @rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } @rightchks; # Remove blanks; maybe the above regexes could handle this? @leftchks = grep { $_ ne '' } @leftchks; @rightchks = grep { $_ ne '' } @rightchks; # Now we process each segment by turning it into a pair. The first element # is the text as we want it to read in the result. The second element is # the value we will to use in comparisons. It contains an identifier # for each of the balanced tags that it lies within. # This subroutine holds state in the tagstack variable my $tagstack = []; my $smear_tags = sub { if ($_ =~ /^<.*>/) { if ($_ =~ m|^]*)|; $tag = lc $tag; # If we found the closer for the tag on top # of the stack, pop it off. if ($$tagstack[-1] eq $tag) { my $stacktag = pop @$tagstack; } return [$_, $tag]; } else { my ($tag) = m|^<\s*([^\s>]*)|; $tag = lc $tag; if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#) { # (tags without correspond closer tags) return [$_, $tag]; } else { push @$tagstack, $tag; } return [$_, $_]; } } else { my $result = [$_, (join "!!!", (@$tagstack, $_)) ]; return $result; } }; # Now do the "smear tags" operation across each of the chunk-lists $tagstack = []; @leftchks = map { &$smear_tags } @leftchks; # TBD: better modularity would preclude having to reset the stack $tagstack = []; @rightchks = map { &$smear_tags } @rightchks; # Now do the diff, using the "comparison" half of the pair to # compare two chuncks. my $chunks = sdiff(\@leftchks, \@rightchks, sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ }); # Finally, process the output of sdiff by concatenating # consecutive chunks that were "unchanged." my $lastsignal = ''; my ($lbuf, $rbuf); my @result; my $ch; foreach $ch (@$chunks) { my ($signal, $left, $right) = @$ch; if ($signal eq 'u' && $lastsignal ne 'u') { push @result, [$lastsignal, $lbuf, $rbuf] unless $lastsignal eq ''; $lbuf = ""; $rbuf = ""; } elsif ($signal ne 'u' && $lastsignal eq 'u') { push @result, [$lastsignal, $lbuf, $rbuf]; $lbuf = ""; $rbuf = ""; } $lbuf .= elem_mkp($left) || ''; $rbuf .= elem_mkp($right) || ''; $lastsignal = $signal; } push @result, [$lastsignal, $lbuf, $rbuf]; return \@result; } # these are like "accessors" for the two halves of the diff-chunk pairs sub elem_mkp { my ($e) = @_; return undef unless ref $e eq 'ARRAY'; my ($mkp, $cmp) = @$e; return $mkp; } sub elem_cmprsn { my ($e) = @_; return undef unless ref $e eq 'ARRAY'; my ($mkp, $cmp) = @$e; return $cmp; } # Finally a couple of non-HTML diff routines sub line_diff { my ($left, $right) = @_; my (@leftchks) = $left =~ m/(.*\n?)/gm; my (@rightchks) = $right =~ m/(.*\n?)/gm; my $result = sdiff(\@leftchks, \@rightchks); # my @result = map { [ $_->[1], $_->[2] ] } @$result; return $result; } sub word_diff { my ($left, $right) = @_; my (@leftchks) = $left =~ m/([^\s]*\s?)/gm; my (@rightchks) = $right =~ m/([^\s]*\s?)/gm; my $result = sdiff(\@leftchks, \@rightchks); my @result = (map { [ $_->[1], $_->[2] ] } @$result); return $result; } 1; =pod =head1 HTML::Diff This module compares two strings of HTML and returns a list of a chunks which indicate the diff between the two input strings, where changes in formatting are considered changes. =head1 SYNOPSIS $result = html_word_diff($left_text, $right_text); =head1 DESCRIPTION Returns a reference to a list of triples [, , ]. Concatenating all the members from the return value should produce the input $left_text, and likewise for the members. The is either 'u', '+', '-', or 'c', indicating whether the two chunks are the same, the $right_text contained this chunk and the left chunk didn't, or vice versa, or the two chunks are simply different. This follows the usage of Algorithm::Diff. The difference is computed on a word-by-word basis, "breaking" on visible words in the HTML text. If a tag only is changed, it will not be returned as an independent chunk but will be shown as a change to one of the neighboring words. For balanced tags, such as , it is intended that a change to the tag will be treated as a change to all words in between. =head1 AUTHOR Whipped up by Ezra elias kilty Cooper, ezra@ezrakilty.net =head1 SEE ALSO Algorithm::Diff =cut