#--------------------------------------------------------------------- package HTML::Embellish; # # Copyright 2007 Christopher J. Madsen # # Author: Christopher J. Madsen # Created: October 8, 2006 # $Id: Embellish.pm 1838 2007-07-07 20:07:06Z cjm $ # # This program is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the # GNU General Public License or the Artistic License for more details. # # Typographically enhance HTML trees #--------------------------------------------------------------------- use 5.008; # Need good Unicode support use warnings; use strict; use Carp qw(croak); use Exporter (); #===================================================================== # Package Global Variables: our $VERSION = '0.02'; # Also update VERSION section in documentation our @ISA = qw(Exporter); our @EXPORT = qw(embellish); my $mdash = chr(0x2014); my $lsquo = chr(0x2018); my $rsquo = chr(0x2019); my $ldquo = chr(0x201C); my $rdquo = chr(0x201D); my $notQuote = qq/[^\"$ldquo$rdquo]/; my $balancedQuoteString = qq/(?: $notQuote | $ldquo $notQuote* $rdquo)*/; #===================================================================== # Constants: #--------------------------------------------------------------------- BEGIN { my $i = 0; for (qw(textRefs fixQuotes fixDashes fixEllipses totalFields)) { ## no critic (ProhibitStringyEval) eval "sub $_ () { $i }"; ++$i; } } # end BEGIN #===================================================================== # Exported functions: #--------------------------------------------------------------------- sub embellish { my $html = shift @_; croak "First parameter of embellish must be an HTML::Element" unless ref $html and $html->can('content_refs_list'); my $e = HTML::Embellish->new(@_); $e->process($html); } # end embellish #===================================================================== # Class Methods: #--------------------------------------------------------------------- sub new { my $class = shift; croak "Odd number of parameters passed to HTML::Embellish->new" if @_ % 2; my %parms = @_; my $self = [ (undef) x totalFields ]; bless $self, $class; my $def = (exists $parms{default} ? $parms{default} : 1); $self->[textRefs] = undef; $self->[fixDashes] = (exists $parms{dashes} ? $parms{dashes} : $def); $self->[fixEllipses] = (exists $parms{ellipses} ? $parms{ellipses} : $def); $self->[fixQuotes] = (exists $parms{quotes} ? $parms{quotes} : $def); return $self; } # end new #--------------------------------------------------------------------- # Convert quotes & apostrophes into curly quotes: # # Input: # self: The HTML::Embellish object # refs: Arrayref of stringrefs to the text of this paragraph sub curlyquote { my ($self, $refs) = @_; local $_ = join('', map { $$_ } @$refs); s/^([\xA0\s]*)"/$1$ldquo/; s/(?<=[\s\pZ])"(?=[^\s\pZ])/$ldquo/g; s/(?<=\pP)"(?=\w)/$ldquo/g; s/(?<=[ \t\n\r])"(?=\xA0)/$ldquo/g; s/\("/($ldquo/g; s/"[\xA0\s]*$/$rdquo/; s/(?process must be passed an HTML::Element" unless ref $elt and $elt->can('content_refs_list'); my $isP = ($elt->tag =~ /^(?: p | h\d | d[dt] | div | blockquote )$/x); $self->[textRefs] = [] if $isP; my @content = $elt->content_refs_list; if ($self->[fixQuotes] and $self->[textRefs] and @content) { # A " that opens a tag can be assumed to be a left quote ${$content[ 0]} =~ s/^"/$ldquo/ unless ref ${$content[ 0]}; # A " that ends a tag can be assumed to be a right quote ${$content[-1]} =~ s/"$/$rdquo/ unless ref ${$content[-1]}; } foreach my $r (@content) { if (ref $$r) { # element node my $tag = $$r->tag; next if $tag =~ /^(?: ~comment | script | style )$/x; if ($self->[textRefs] and $tag eq 'br') { my $break = "\n"; push @{$self->[textRefs]}, \$break; } $self->process($$r); } else { # text node # Convert -- to em-dash: if ($self->[fixDashes]) { $$r =~ s/(?[fixEllipses]) { $$r =~ s/(?[textRefs]}, $r if $self->[textRefs]; } # end else text node } # end foreach $r if ($isP and $self->[textRefs]) { $self->curlyquote($self->[textRefs]) if $self->[fixQuotes]; $self->[textRefs] = undef; } # end if this was a paragraph-like element } # end process #===================================================================== # Package Return Value: 1; __END__ =head1 NAME HTML::Embellish - Typographically enhance HTML trees =head1 VERSION This document describes HTML::Embellish version 0.02 =head1 SYNOPSIS use HTML::Embellish; use HTML::TreeBuilder; my $html = HTML::TreeBuilder->new_from_file(...); embellish($html); =head1 DESCRIPTION HTML::Embellish adds typographical enhancements to HTML text. It converts certain ASCII characters to Unicode characters. It converts quotation marks and apostrophes into curly quotes. It converts hyphens into em-dashes. It inserts non-breaking spaces between the periods of an ellipsis. (It doesn't use the HORIZONTAL ELLIPSIS character (U+2026), because I like more space in my ellipses.) =head1 INTERFACE =over =item C This subroutine (exported by default) is the main entry point. It's a shortcut for C<< HTML::Embellish->new(...)->process($html) >>. If you're going to process several trees with the same parameters, the object-oriented interface will be slightly more efficient. =item C<< $emb = HTML::Embellish->new(flag => value, ...) >> This creates an HTML::Embellish object that will perform the specified enhancements. These are the (optional) flags that you can pass: =over =item dashes If true, converts sequences of hyphens into em-dashes. Two or 3 hyphens become one em-dash. Four hyphens become two em-dashes. Any other sequence of hyphens is not changed. =item ellipses If true, inserts non-breaking spaces between the periods making up an ellipsis. Also converts the space before an ellipsis that appears to end a sentence to a non-breaking space. =item quotes If true, converts quotation marks and apostrophes into curly quotes. =item default This is the default value used for flags that you didn't specify. It defaults to 1 (enabled). The main reason for using this flag is to disable any enhancements that might be introduced in future versions of HTML::Embellish. =back =item C<< $emb->process($html) >> The C method enhances the content of the HTML::Element you pass in. You can pass the root element to process the entire tree, or any sub-element to process just that part of the tree. The tree is modified in-place; the return value is not meaningful. =back =head1 DIAGNOSTICS =over =item C<< First parameter of embellish must be an HTML::Element >> You didn't pass a valid HTML::Element object to embellish. =item C<< HTML::Embellish->process must be passed an HTML::Element >> You didn't pass a valid HTML::Element object to embellish. =item C<< Odd number of parameters passed to HTML::Embellish->new >> C<< HTML::Embellish->new >> takes parameters in C<< KEY => VALUE >> style, so there must always be an even number of them. =back =head1 CONFIGURATION AND ENVIRONMENT HTML::Embellish requires no configuration files or environment variables. =head1 DEPENDENCIES Requires the L distribution from CPAN (or some other module that implements the L interface). Versions of HTML::Tree prior to 3.21 had some bugs involving Unicode characters and non-breaking spaces. =head1 INCOMPATIBILITIES None reported. =head1 BUGS AND LIMITATIONS No bugs have been reported. =head1 AUTHOR Christopher J. Madsen C<< >> Please report any bugs or feature requests to S<< C<< >> >>, or through the web interface at L =head1 LICENSE AND COPYRIGHT Copyright 2007 Christopher J. Madsen This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See L. =head1 DISCLAIMER OF WARRANTY BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.