package HTML::Hyphenate; # -*- cperl; cperl-indent-level: 4 -*- use strict; use warnings; # $Id: Hyphenate.pm 387 2010-12-21 19:41:17Z roland $ # $Revision: 387 $ # $HeadURL: svn+ssh://ipenburg.xs4all.nl/srv/svnroot/elaine/trunk/HTML-Hyphenate/lib/HTML/Hyphenate.pm $ # $Date: 2010-12-21 20:41:17 +0100 (Tue, 21 Dec 2010) $ use 5.006000; use utf8; use charnames qw(:full); our $VERSION = '0.05'; use Log::Log4perl qw(:easy get_logger); use Set::Scalar; use TeX::Hyphen; use TeX::Hyphen::Pattern; use HTML::Entities; use HTML::TreeBuilder; use Readonly; ## no critic qw(ProhibitCallsToUnexportedSubs) Readonly::Scalar my $EMPTY => q{}; Readonly::Scalar my $HYPHEN => q{-}; Readonly::Scalar my $SOFT_HYPHEN => qq{\N{SOFT HYPHEN}}; Readonly::Scalar my $ONE_LEVEL_UP => -1; Readonly::Scalar my $DEFAULT_MIN_LENGTH => 10; Readonly::Scalar my $DEFAULT_MIN_PRE => 2; Readonly::Scalar my $DEFAULT_MIN_POST => 2; Readonly::Scalar my $DEFAULT_LANG => q{en_us}; Readonly::Scalar my $HTML_ESCAPE => undef; Readonly::Scalar my $DEFAULT_INCLUDED => 1; Readonly::Scalar my $DEFAULT_XML => 1; Readonly::Scalar my $LANG => q{lang}; Readonly::Scalar my $TEXT => q{text}; Readonly::Scalar my $NOBR => q{nobr}; Readonly::Scalar my $PRE => q{pre}; Readonly::Scalar my $NOWRAP => q{nowrap}; Readonly::Scalar my $STYLE => q{style}; Readonly::Scalar my $CLASS => q{class}; Readonly::Scalar my $LOG_TRAVERSE => q{Traversing HTML element '%s'}; Readonly::Scalar my $LOG_LANGUAGE_SET => q{Language changed to '%s'}; Readonly::Scalar my $LOG_PATTERN_FILE => q{Using pattern file '%s'}; Readonly::Scalar my $LOG_TEXT_NODE => q{Text node value '%s'}; Readonly::Scalar my $LOG_HYPHEN_TEXT => q{Hyphenating text '%s'}; Readonly::Scalar my $LOG_HYPHEN_WORD => q{Hyphenating word '%s' to '%s'}; Readonly::Scalar my $LOG_LOOKING_UP => q{Looking up for %d class(es)}; Readonly::Scalar my $LOG_HTML_METHOD => q{Using HTML passed to method '%s'}; Readonly::Scalar my $LOG_HTML_PROPERTY => q{Using HTML property '%s'}; Readonly::Scalar my $LOG_HTML_UNDEF => q{HTML to hyphenate is undefined}; Readonly::Scalar my $LOG_NOT_HYPHEN => q{No pattern found for '%s'}; ## use critic my $ANYTHING = qr/.*/xsm; # HTML %Text attributes my $text_attr = Set::Scalar->new(qw/abbr alt label standby summary title/); # Strip document root tags from a fragment added by HTML::TreeBuilder: my $ROOT = qr{ ^(?:)??(.*)\s*$ }ixsm; # Match inline style requesting not to wrap anyway: my $STYLE_NOWRAP = qr/\bwhite-space\s*:\s*nowrap\b/xsm; Log::Log4perl->easy_init($ERROR); my $log = get_logger(); use Class::Meta::Express qw(class ctor has meta method); use HTML::Hyphenate::TypeDef; class { meta 'html-hyphenate'; ctor 'new'; has html => ( is => 'string' ); has style => ( is => 'string' ); has min_length => ( is => 'integer', default => $DEFAULT_MIN_LENGTH ); has min_pre => ( is => 'integer', default => $DEFAULT_MIN_PRE ); has min_post => ( is => 'integer', default => $DEFAULT_MIN_POST ); has output_xml => ( is => 'integer', default => $DEFAULT_XML ); has default_lang => ( is => 'string', default => $DEFAULT_LANG ); has default_included => ( is => 'integer', default => $DEFAULT_INCLUDED ); has classes_included => ( is => 'arrayref', default => [] ); has classes_excluded => ( is => 'arrayref', default => [] ); has _hyphenators => ( is => 'hashref', default => {}, view => 'PRIVATE' ); has _lang => ( is => 'string', view => 'PRIVATE' ); has _tree => ( is => 'tree', view => 'PRIVATE' ); method hyphenated => \&__hyphenated; method _traverse_html => ( code => \&__traverse_html, view => 'PRIVATE', ); method _clean_html => ( code => \&__clean_html, view => 'PRIVATE', ); method _hyphen => ( code => \&__hyphen, view => 'PRIVATE', ); method _hyphen_word => ( code => \&__hyphen_word, view => 'PRIVATE', ); method _configure_lang => ( code => \&__configure_lang, view => 'PRIVATE', ); method _add_tex_hyphen_to_cache => ( code => \&__add_tex_hyphen_to_cache, view => 'PRIVATE', ); method _hyphenable_by_class => ( code => \&__hyphenable_by_class, view => 'PRIVATE', ); method _hyphenable => ( code => \&__hyphenable, view => 'PRIVATE', ); method _get_nearest_ancestor_level_by_classname => ( code => \&__get_nearest_ancestor_level_by_classname, view => 'PRIVATE', ); method _reset_tree => ( code => \&__reset_tree, view => 'PRIVATE', ); }; sub __hyphenated { my ( $self, $html ) = @_; if ( defined $html ) { $log->debug( sprintf $LOG_HTML_METHOD, $html ); $self->html($html); } else { $log->debug( sprintf $LOG_HTML_PROPERTY, $self->html ); } if ( defined $self->html ) { $self->_traverse_html(); return $self->_clean_html(); } $log->warn($LOG_HTML_UNDEF); return; } sub __traverse_html { my ($self) = @_; $self->_reset_tree; $self->_tree->parse_content( $self->html ); $self->_tree->objectify_text(); $self->_tree->traverse( [ sub { my $element = $_[0]; $log->debug( sprintf $LOG_TRAVERSE, $element->tag() ); $self->_configure_lang($element); if ( $element->attr($TEXT) ) { if ( $self->_hyphenable($element) ) { $log->debug( sprintf $LOG_TEXT_NODE, $element->attr($TEXT) ); $element->attr( $TEXT, $self->_hyphen( $element->attr($TEXT) ) ); } } else { foreach my $attr ( $element->all_external_attr_names() ) { if ( $text_attr->has($attr) ) { $element->attr( $attr, $self->_hyphen( $element->attr($attr) ) ); } } } ## no critic qw(RequireExplicitInclusion ProhibitCallsToUnexportedSubs) return HTML::Element::OK; ## use critic }, undef ] ); return; } sub __clean_html { my ($self) = @_; $self->_tree->deobjectify_text(); my $html = $self->output_xml ? $self->_tree->as_XML() : $self->_tree->as_HTML( $HTML_ESCAPE, $EMPTY, {} ); $self->_reset_tree; $html =~ s/$ROOT/$1/xgism; return $html; } sub __hyphen { my ( $self, $text ) = @_; $log->debug( sprintf $LOG_HYPHEN_TEXT, $text ); $text =~ s/(\w{@{[$self->min_length]},})/$self->_hyphen_word($1)/xsmeg; return $text; } sub __hyphen_word { my ( $self, $word ) = @_; if ( defined $self->_hyphenators->{ $self->_lang } ) { $log->debug( sprintf $LOG_HYPHEN_WORD, $word, $self->_hyphenators->{ $self->_lang }->visualize($word) ); my $number = 0; foreach my $pos ( $self->_hyphenators->{ $self->_lang }->hyphenate($word) ) { substr $word, $pos + $number, 0, $SOFT_HYPHEN; $number += length $SOFT_HYPHEN; } } else { $log->warn( sprintf $LOG_NOT_HYPHEN, $self->_lang ); } return $word; } sub __configure_lang { my ( $self, $element ) = @_; my $lang = $element->attr_get_i($LANG); $lang ||= $element->attr_get_i(qq{xml:$LANG}); my %hyphen_opts = ( leftmin => $self->min_pre, rightmin => $self->min_post, ); defined $self->style && ( $hyphen_opts{style} = $self->style ); defined $lang || ( $lang = $self->default_lang ); if ( !defined $self->_lang || $lang ne $self->_lang ) { $self->_lang($lang); $log->debug( sprintf $LOG_LANGUAGE_SET, $lang ); if ( !exists $self->_hyphenators->{$lang} ) { $self->_add_tex_hyphen_to_cache(); } } return; } sub __add_tex_hyphen_to_cache { my ($self) = @_; my $thp = TeX::Hyphen::Pattern->new(); $thp->label( $self->_lang ); my $cache = $self->_hyphenators; if ( my $file = $thp->filename ) { $log->debug( sprintf $LOG_PATTERN_FILE, $file ); ${$cache}{ $self->_lang } = TeX::Hyphen->new( file => $file, leftmin => $self->min_pre, rightmin => $self->min_post, ); $self->_hyphenators($cache); } return; } sub __hyphenable_by_class { my ( $self, $element ) = @_; my $included_level = $ONE_LEVEL_UP; my $excluded_level = $ONE_LEVEL_UP; $self->default_included && $excluded_level--; $self->default_included || $included_level--; $included_level = $self->_get_nearest_ancestor_level_by_classname( $element, $self->classes_included, $included_level ); $excluded_level = $self->_get_nearest_ancestor_level_by_classname( $element, $self->classes_excluded, $excluded_level ); return !( $excluded_level > $included_level ); } sub __hyphenable { my ( $self, $element ) = @_; return !( $element->is_inside($NOBR) || $element->is_inside($PRE) || $element->look_up( $NOWRAP, $ANYTHING ) || $element->look_up( $STYLE, $STYLE_NOWRAP ) || !$self->_hyphenable_by_class($element) ); } sub __get_nearest_ancestor_level_by_classname { my ( $self, $element, $ar_classnames, $level ) = @_; my $classnames = Set::Scalar->new( @{$ar_classnames} ); $log->debug( sprintf $LOG_LOOKING_UP, $classnames->size ); # Only looking up the tree if there is something to look for: if ( $classnames && ( my $container = $element->look_up( $CLASS => $ANYTHING, sub { $_[0] && $classnames->has( $_[0]->attr($CLASS) ); } ) ) ) { $level = $container->depth; } return $level; } sub __reset_tree { my ($self) = @_; $self->_tree && $self->_tree( $self->_tree->delete ); my $tree = HTML::TreeBuilder->new(); $tree->warn(1); $tree->store_pis(1); $self->_tree($tree); return; } 1; __END__ =encoding utf8 =for stopwords Ipenburg Readonly =head1 NAME HTML::Hyphenate - class for inserting soft hyphens into HTML. =head1 VERSION This is version 0.05. =head1 SYNOPSIS use HTML::Hyphenate; $hyphenator = new HTML::Hyphenate(); $html_with_soft_hyphens = $hyphenator->hyphenated($html); $hyphenator->html($html); $hyphenator->style($style); # czech or german $hyphenator->min_length(10); $hyphenator->min_pre(2); $hyphenator->min_post(2); $hyphenator->output_xml(1); $hyphenator->default_lang('en-us'); $hyphenator->default_included(1); $hyphenator->classes_included(['shy']); $hyphenator->classes_excluded(['noshy']); =head1 DESCRIPTION Most HTML rendering engines used in web browsers don't figure out by themselves how to hyphenate words when needed, but we can tell them how they might do it by inserting soft hyphens into the words. =head1 SUBROUTINES/METHODS =over 4 =item HTML::Hyphenate-Enew() Constructs a new HTML::Hyphenate object. =item $hyphenator-Ehyphenated() Returns the HTML including the soft hyphens. =item $hyphenator->html(); Gets or sets the HTML to hyphenate. =item $hyphenator->style(); Gets or sets the style to use for pattern usages in L. Can be C or C. =item $hyphenator->min_length(); Gets or sets the minimum word length required for having soft hyphens inserted. Defaults to 10 characters. =item $hyphenator->min_pre(2); Gets or sets the minimum amount of characters in a word preserved before the first soft hyphen. Defaults to 2 characters. =item $hyphenator->min_post(2); Gets or sets the minimum amount of characters in a word preserved after the last soft hyphen. Defaults to 2 characters. =item $hyphenator->output_xml(1); Have L output HTML in HTML or XML mode. =item $hyphenator->default_lang('en-us'); Gets or sets the default pattern to use when no language can be derived from the HTML. =item $hyphenator->default_included(); Gets or sets if soft hyphens should be included in the whole tree by default. This can be used to insert soft hyphens only in parts of the HTML having specific class names. =item $hyphenator->classes_included(); Gets or sets a reference to an array of class names that will have soft hyphens inserted. =item $hyphenator->classes_excluded(); Gets or sets a reference to an array of class names that will not have soft hyphens inserted. =back =head1 DEPENDENCIES L L L L L L L L L L L L =head1 INCOMPATIBILITIES =over 4 This module has the same limits as TeX::Hyphen, TeX::Hyphen::Pattern and HTML::TreeBuilder. =back =head1 DIAGNOSTICS This module uses Log::Log4perl for logging. =over 4 =item * It warns when a language encountered in the HTML is not supported by TeX::Hyphen::Pattern =back =head1 BUGS AND LIMITATIONS =over 4 =item * Empty subclass test fails, this is probably a Class::Meta::Express issue. The empty subclass can't be empty, it needs at least: use Class::Meta::Express; class { ctor 'new'; }; =item * Perfect hyphenation can be more complicated than just inserting a hyphen somewhere in a word, and sometimes requires semantics to get it right. For example C should be hyphenated as C and not C and C can be hyphenated as C or C, depending on it's meaning. While HTML could provide a bit more context (mainly the language being used) than plain text to handle these issues, the initial purpose of this module is to make it possible for HTML rendering engines that support soft hyphens to be able to break long words over multiple lines to avoid unwanted overflow. =item * The hyphenation doesn't get better than TeX::Hyphenate and it hyphenation patterns provide. =item * The round trip from HTML source via HTML::Tree to HTML source might introduce changes to the source, for example accented characters might be transformed to HTML encoded entity equivalent. =back =head1 CONFIGURATION AND ENVIRONMENT The output is generated by L and can be either HTML or XML. =head1 AUTHOR Roland van Ipenburg C<< >> =head1 LICENSE AND COPYRIGHT Copyright (C) 2010 by Roland van Ipenburg This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.10.0 or, at your option, any later version of Perl 5 you may have available. =head1 DISCLAIMER OF WARRANTY BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. =cut