package Daizu::HTML; use warnings; use strict; use base 'Exporter'; our @EXPORT_OK = qw( dom_body_to_html4 dom_node_to_html4 dom_body_to_text dom_filtered_for_feeds absolutify_links html_escape_text html_escape_attr ); use XML::LibXML; use HTML::Tagset; use URI; use Encode qw( encode ); use Carp qw( croak ); use Carp::Assert qw( assert DEBUG ); use Daizu::Util qw( trim ); =head1 NAME Daizu::HTML - functions for handling HTML and XHTML content =head1 FUNCTIONS The following functions are available for export from this module. None of them are exported by default. =over =item dom_body_to_html4($doc, [$start_node], [$end_node]) Given an L object for an XHTML document fragment, whose root element should be C, returns a string representation of the content in S format. C<$start_node> and C<$end_node> are both independently optional. If either is present then only part of the document will be presented in the HTML output. Both must be either C or a node from the root (C) element of the document. C<$start_node> should be the first node to be shown in the output, or C to start from the beginning. C<$end_node> should be the node I the last node to be output, or C to end at the end of the document. =cut sub dom_body_to_html4 { my ($doc, $start_node, $end_node) = @_; my $html = ''; my $right_part = !defined $start_node; for my $child ($doc->documentElement->childNodes) { $right_part = 1 if defined $start_node && $child->isSameNode($start_node); $right_part = 0 if defined $end_node && $child->isSameNode($end_node); $html .= dom_node_to_html4($child) if $right_part; } return $html; } =item dom_node_to_html4($node) Used by the L function above to process individual nodes. The argument should be an L object of some kind. Returns a string containing S code, which for example will have text properly escaped. =cut sub dom_node_to_html4 { my ($node) = @_; my $type = $node->nodeType; return encode('UTF-8', html_escape_text($node->data), Encode::FB_CROAK) if $type == XML::LibXML::XML_TEXT_NODE || $type == XML::LibXML::XML_CDATA_SECTION_NODE; if ($type == XML::LibXML::XML_ELEMENT_NODE) { my $ns = $node->namespaceURI; return '' if defined $ns && $ns eq $Daizu::HTML_EXTENSION_NS; my $elem_name = lc $node->localname; my $html = "<$elem_name"; for my $attr ($node->attributes) { next unless $attr->nodeType == XML::LibXML::XML_ATTRIBUTE_NODE; my $attr_name = lc $attr->localname; $html .= " $attr_name"; my $boolattr = $HTML::Tagset::boolean_attr{$elem_name}; $html .= '="' . encode('UTF-8', html_escape_attr($attr->value), Encode::FB_CROAK) . '"' unless $boolattr && ((!ref $boolattr && $boolattr eq $attr_name) || (ref $boolattr && $boolattr->{$attr_name})); } $html .= '>'; if (!$HTML::Tagset::emptyElement{$elem_name}) { for my $child ($node->childNodes) { $html .= dom_node_to_html4($child); } $html .= ""; } elsif ($node->hasChildNodes) { warn "element '$elem_name' at line " . $node->line_number . " shouldn't have content"; } return $html; } return '' if $type == XML::LibXML::XML_COMMENT_NODE; return '' if $type == XML::LibXML::XML_XINCLUDE_START || $type == XML::LibXML::XML_XINCLUDE_END; die "node type $type in XML::LibXML DOM not expected"; # These are the node types I don't currently bother with: # XML::LibXML::XML_ATTRIBUTE_NODE = 2 # XML::LibXML::XML_ENTITY_REF_NODE = 5 # XML::LibXML::XML_ENTITY_NODE = 6 # XML::LibXML::XML_PI_NODE = 7 # XML::LibXML::XML_DOCUMENT_NODE = 9 # XML::LibXML::XML_DOCUMENT_TYPE_NODE = 10 # XML::LibXML::XML_DOCUMENT_FRAG_NODE = 11 # XML::LibXML::XML_NOTATION_NODE = 12 # XML::LibXML::XML_HTML_DOCUMENT_NODE = 13 # XML::LibXML::XML_DTD_NODE = 14 # XML::LibXML::XML_ELEMENT_DECL = 15 # XML::LibXML::XML_ATTRIBUTE_DECL = 16 # XML::LibXML::XML_ENTITY_DECL = 17 # XML::LibXML::XML_NAMESPACE_DECL = 18 # XML::LibXML::XML_DOCB_DOCUMENT_NODE = 21 } =item dom_body_to_text($doc) Given an XHTML body (as an L object in the usually format) return a plain text version of the content, with some markup translatted into text formatting in a limited way to make it reasonably readable. =cut sub dom_body_to_text { my ($doc) = @_; my $text = ''; my $accum = ''; # This 'object' is used to track the progress of the formatting and # accumulate the output text. my $fmt = { # State: txt => '', linelen => 0, indent => 0, indent_stack => [], list_type => 'ul', list_pos => 1, list_stack => [], block_started => 0, word_gap => 0, text_level => undef, # undef=normal, otherwise 'sup' or 'sub' # Configuration: max_linelen => 72, min_breakable_line => 10, block_indent => ' ', ul_indent => ' * ', ol_indent => ' %d. ', }; _dom_node_children_to_text($doc->documentElement, $fmt); return _fmt_finish($fmt); } our %SUPERSCRIPT_CHARS = ( 0x0028 => 0x207D, # SUPERSCRIPT LEFT PARENTHESIS 0x0029 => 0x207E, # SUPERSCRIPT RIGHT PARENTHESIS 0x002B => 0x207A, # SUPERSCRIPT PLUS SIGN 0x002D => 0x207B, # close enough for superscript HYPHEN-MINUS 0x0030 => 0x2070, # SUPERSCRIPT ZERO 0x0031 => 0x00B9, # SUPERSCRIPT ONE 0x0032 => 0x00B2, # SUPERSCRIPT TWO 0x0033 => 0x00B3, # SUPERSCRIPT THREE 0x0034 => 0x2074, # SUPERSCRIPT FOUR 0x0035 => 0x2075, # SUPERSCRIPT FIVE 0x0036 => 0x2076, # SUPERSCRIPT SIX 0x0037 => 0x2077, # SUPERSCRIPT SEVEN 0x0038 => 0x2078, # SUPERSCRIPT EIGHT 0x0039 => 0x2079, # SUPERSCRIPT NINE 0x003D => 0x207C, # SUPERSCRIPT EQUALS SIGN 0x0069 => 0x2071, # SUPERSCRIPT LATIN SMALL LETTER I 0x006E => 0x207F, # SUPERSCRIPT LATIN SMALL LETTER N 0x2212 => 0x207B, # SUPERSCRIPT MINUS ); our %SUBSCRIPT_CHARS = ( 0x0028 => 0x208D, # SUBSCRIPT LEFT PARENTHESIS 0x0029 => 0x208E, # SUBSCRIPT RIGHT PARENTHESIS 0x002B => 0x208A, # SUBSCRIPT PLUS SIGN 0x002D => 0x208B, # close enough for subscript HYPHEN-MINUS 0x0030 => 0x2080, # SUBSCRIPT ZERO 0x0031 => 0x2081, # SUBSCRIPT ONE 0x0032 => 0x2082, # SUBSCRIPT TWO 0x0033 => 0x2083, # SUBSCRIPT THREE 0x0034 => 0x2084, # SUBSCRIPT FOUR 0x0035 => 0x2085, # SUBSCRIPT FIVE 0x0036 => 0x2086, # SUBSCRIPT SIX 0x0037 => 0x2087, # SUBSCRIPT SEVEN 0x0038 => 0x2088, # SUBSCRIPT EIGHT 0x0039 => 0x2089, # SUBSCRIPT NINE 0x003D => 0x208C, # SUBSCRIPT EQUALS SIGN 0x0069 => 0x1D62, # LATIN SUBSCRIPT SMALL LETTER I 0x0072 => 0x1D63, # LATIN SUBSCRIPT SMALL LETTER R 0x0075 => 0x1D64, # LATIN SUBSCRIPT SMALL LETTER U 0x0076 => 0x1D65, # LATIN SUBSCRIPT SMALL LETTER V 0x03B2 => 0x1D66, # GREEK SUBSCRIPT SMALL LETTER BETA 0x03B3 => 0x1D67, # GREEK SUBSCRIPT SMALL LETTER GAMMA 0x03C1 => 0x1D68, # GREEK SUBSCRIPT SMALL LETTER RHO 0x03C6 => 0x1D69, # GREEK SUBSCRIPT SMALL LETTER PHI 0x03C7 => 0x1D6A, # GREEK SUBSCRIPT SMALL LETTER CHI 0x2212 => 0x208B, # SUBSCRIPT MINUS ); sub _fmt_add_text { my ($fmt, $text) = @_; return if $text eq ''; # Split into words, but keep track of where whitespace appeared. # The ugly character class are because \s matches \xA0 ( ), # which shouldn't be collapsed like normal spaces. $text =~ s/[ \t\x0A\x0D]+/ /g; $fmt->{word_gap} = 1 if $text =~ s/\A //; my $word_gap_at_end = $text =~ s/ \z//; if (defined $fmt->{text_level}) { my $new = $text; my $lookup = $fmt->{text_level} eq 'sup' ? \%SUPERSCRIPT_CHARS : \%SUBSCRIPT_CHARS; $new =~ s{([^ ])}{ exists $lookup->{ord $1} ? chr($lookup->{ord $1}) : '@' }ge; $text = $new unless $new =~ /@/; } my $not_first; for my $word (split ' ', $text) { $fmt->{word_gap} = 1 if $not_first; $not_first = 1; $fmt->{word_gap} = 0 if $fmt->{linelen} == $fmt->{indent}; _fmt_new_line($fmt) if $fmt->{linelen} >= $fmt->{min_breakable_line} && $fmt->{linelen} + 1 + length($word) > $fmt->{max_linelen}; $word = " $word" if $fmt->{word_gap}; $fmt->{txt} .= $word; $fmt->{linelen} += length $word; $fmt->{block_started} = 1; } $fmt->{word_gap} = $word_gap_at_end; } sub _fmt_new_line { my ($fmt) = @_; $fmt->{txt} .= "\n" . (' ' x $fmt->{indent}); $fmt->{linelen} = $fmt->{indent}; $fmt->{word_gap} = 0; } sub _fmt_new_block { my ($fmt, $extra_indent) = @_; $fmt->{txt} .= "\n" # end last line if $fmt->{linelen} > $fmt->{indent}; if ($fmt->{block_started}) { $fmt->{txt} .= "\n" if $fmt->{txt} ne ''; # gap between blocks $fmt->{txt} .= ' ' x $fmt->{indent}; $fmt->{linelen} = $fmt->{indent}; } push @{$fmt->{indent_stack}}, $fmt->{indent}; if (defined $extra_indent) { $fmt->{txt} .= $extra_indent; $fmt->{linelen} += length $extra_indent; $fmt->{indent} += length $extra_indent; } $fmt->{block_started} = 0; $fmt->{word_gap} = 0; } sub _fmt_end_block { my ($fmt) = @_; assert(@{$fmt->{indent_stack}}) if DEBUG; $fmt->{indent} = pop @{$fmt->{indent_stack}}; $fmt->{word_gap} = 0; } sub _fmt_finish { my ($fmt) = @_; if ($fmt->{linelen} > $fmt->{indent} && $fmt->{txt} ne '') { $fmt->{txt} .= "\n"; $fmt->{linelen} = 0; $fmt->{word_gap} = 0; } return $fmt->{txt}; } sub _dom_node_children_to_text { my ($node, $fmt) = @_; for my $child ($node->childNodes) { _dom_node_to_text($child, $fmt); } } sub _dom_node_to_text { my ($node, $fmt) = @_; my $type = $node->nodeType; if ($type == XML_TEXT_NODE) { _fmt_add_text($fmt, $node->textContent); } elsif ($type == XML_ELEMENT_NODE) { my $name = $node->nodeName; # TODO - definition lists # TODO - a marker for the presence of an object/embed/applet if ($name =~ /^(?:p|div|td|th|h\d)$/) { _fmt_new_block($fmt); _dom_node_children_to_text($node, $fmt); _fmt_end_block($fmt); } elsif ($name eq 'blockquote' || $name eq 'table') { _fmt_new_block($fmt, $fmt->{block_indent}); _dom_node_children_to_text($node, $fmt); _fmt_end_block($fmt); } elsif ($name eq 'li') { my $indent = $fmt->{list_type} eq 'ul' ? $fmt->{ul_indent} : sprintf $fmt->{ol_indent}, $fmt->{list_pos}; ++$fmt->{list_pos}; _fmt_new_block($fmt, $indent); _dom_node_children_to_text($node, $fmt); _fmt_end_block($fmt); } elsif ($name eq 'ul' || $name eq 'ol') { push @{$fmt->{list_type_stack}}, [ $fmt->{list_type}, $fmt->{list_pos} ]; $fmt->{list_type} = $name; $fmt->{list_pos} = 1; _dom_node_children_to_text($node, $fmt); ($fmt->{list_type}, $fmt->{list_pos}) = @{pop @{$fmt->{list_type_stack}}}; } elsif ($name eq 'pre') { _fmt_new_block($fmt, $fmt->{block_indent}); my $indent = ' ' x $fmt->{indent}; my $code = trim($node->textContent); $code =~ s/(?:\x0D\x0A|\x0A|\x0D)/\n$indent/g; $fmt->{txt} .= $code; $code =~ s/^.*\n//s; if ($code =~ /\S/) { $fmt->{linelen} = $fmt->{indent} + length $code; $fmt->{block_started} = 1; } _fmt_end_block($fmt); } elsif ($name eq 'img') { my $alt = trim($node->getAttribute('alt')); $alt = '' unless defined $alt; _fmt_add_text($fmt, $alt); } elsif ($name eq 'br') { _fmt_new_line($fmt); } elsif ($name eq 'q') { _fmt_add_text($fmt, chr 8220); _dom_node_children_to_text($node, $fmt); _fmt_add_text($fmt, chr 8221); } elsif ($name eq 'sup' || $name eq 'sub') { my $old_text_level = $fmt->{text_level}; $fmt->{text_level} = $name; _dom_node_children_to_text($node, $fmt); $fmt->{text_level} = $old_text_level; } else { # Unknown element. Ignore the markup and just process the text. _dom_node_children_to_text($node, $fmt); } } } =item dom_filtered_for_feeds($doc) Return a new version of the article content in C<$doc>, with bits of markup which aren't relevant or might be unwelcome in feed content, such as C