package XHTML::Util; use strict; use warnings; no warnings "uninitialized"; our $VERSION = "0.04"; use Encode; use Carp; # By verbosity? use Scalar::Util "blessed"; use HTML::Tagset 3.02 (); use HTML::Entities; use XML::LibXML; use HTML::Selector::XPath (); use HTML::TokeParser::Simple; # LWP::Simple, external styles use CSS::Tiny; my $isKnown = \%HTML::Tagset::isKnown; my $emptyElement = \%HTML::Tagset::emptyElement; #my $canTighten = \%HTML::Tagset::canTighten; #my $isHeadElement = \%HTML::Tagset::isHeadElement; my $isBodyElement = \%HTML::Tagset::isBodyElement; my $isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; #my $isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; #my $isList = \%HTML::Tagset::isList; #my $isTableElement = \%HTML::Tagset::isTableElement; my $isFormElement = \%HTML::Tagset::isFormElement; #my $p_closure_barriers = \@HTML::Tagset::p_closure_barriers; # Accommodate HTML::TokeParser's idea of a "tag." for my $t ( keys %{$emptyElement} ) { $isKnown->{"$t/"} = 1 } my $isBlockLevel = { map {; $_ => 1 } grep { ! ( $isPhraseMarkup->{$_} || $isFormElement->{$_} ) } keys %{$isBodyElement} }; sub new { my $class = shift; my $self = bless {}, $class; $self; } sub strip_tags { my $self = shift; my $content = shift; my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); carp "No selector was given to strip_tags" and return $content unless $xpath; my $root = blessed($content) =~ /\AXML::LibXML::/ ? $content : $self->_fragment_to_body_node($content); my $doc = $root->getOwnerDocument; for my $node ( $root->findnodes($xpath) ) { my $fragment = $doc->createDocumentFragment; for my $n ( $node->childNodes ) { $fragment->appendChild($n); } $node->replaceNode($fragment); } my $out = ""; $out .= $_->serialize(1) for $root->childNodes; _trim($out); } sub _trim { s/\A\s+|\s+\z//g for @_; wantarray ? @_ : $_[0]; } sub remove { # Synonymous for remove_nodes, all gone. my $self = shift; # my $content = shift; my $content = $self->_sanitize_fragment(shift) or return; my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); carp "No selector was given to strip_tags" and return $content unless $xpath; my $root = blessed($content) =~ /\AXML::LibXML::/ ? $content : $self->_fragment_to_body_node($content); $_->parentNode->removeChild($_) for $root->findnodes($xpath); my $out = ""; $out .= $_->serialize(1) for $root->childNodes; _trim($out); } # No... ? requires object->call shuffling to work : sub enpara_tag { +shift->{enpara_tag} = shift || "p"; } sub enpara { my $self = shift; my $content = $self->_sanitize_fragment(shift) or return; my $selector = shift; my $root = blessed($content) eq 'XML::LibXML::Element' ? $content : $self->_fragment_to_body_node($content); $root->normalize; my $doc = $root->getOwnerDocument; if ( my $xpath = HTML::Selector::XPath::selector_to_xpath($selector) ) { NODE: for my $designated_enpara ( $root->findnodes($xpath) ) { next unless $designated_enpara->nodeType == 1; if ( $designated_enpara->nodeName eq 'pre' ) # I don't think so, honky. { # Expand or leave it alone? or ->validate it...? carp "It makes no sense to enpara within a
; skipping"; next NODE; } next unless $isBlockLevel->{$designated_enpara->nodeName}; _enpara_this_nodes_content($designated_enpara, $doc); } } _enpara_this_nodes_content($root, $doc); my $out = ""; $out .= $_->serialize(1) for $root->childNodes; _trim($out); } sub _enpara_this_nodes_content { my ( $parent, $doc ) = @_; my $lastChild = $parent->lastChild; my @naked_block; for my $node ( $parent->childNodes ) { if ( $isBlockLevel->{$node->nodeName} or $node->nodeName eq "a" # special case block level, so IGNORE and grep { $_->nodeName eq "img" } $node->childNodes ) { next unless @naked_block; # nothing to enblock my $p = $doc->createElement("p"); $p->setAttribute("enpara","enpara"); $p->appendChild($_) for @naked_block; $parent->insertBefore( $p, $node ) if $p->textContent =~ /\S/; @naked_block = (); } elsif ( $node->nodeType == 3 and $node->nodeValue =~ /(?:[^\S\n]*\n){2,}/ ) { my $text = $node->nodeValue; my @text_part = map { $doc->createTextNode($_) } split /([^\S\n]*\n){2,}/, $text; my @new_node; for ( my $x = 0; $x < @text_part; $x++ ) { if ( $text_part[$x]->nodeValue =~ /\S/ ) { push @naked_block, $text_part[$x]; } else # it's a blank newline node so _STOP_ { next unless @naked_block; my $p = $doc->createElement("p"); $p->setAttribute("enpara","enpara"); $p->appendChild($_) for @naked_block; @naked_block = (); push @new_node, $p; } } if ( @new_node ) { $parent->insertAfter($new_node[0], $node); for ( my $x = 1; $x < @new_node; $x++ ) { $parent->insertAfter($new_node[$x], $new_node[$x-1]); } } $node->unbindNode; } else { push @naked_block, $node; # if $node->nodeValue =~ /\S/; } if ( $node->isSameNode( $lastChild ) and @naked_block ) { my $p = $doc->createElement("p"); $p->setAttribute("enpara","enpara"); $p->appendChild($_) for ( @naked_block ); $parent->appendChild($p) if $p->textContent =~ /\S/; } } my $newline = $doc->createTextNode("\n"); my $br = $doc->createElement("br"); for my $p ( $parent->findnodes('//p[@enpara="enpara"]') ) { $p->removeAttribute("enpara"); $parent->insertBefore( $newline->cloneNode, $p ); $parent->insertAfter( $newline->cloneNode, $p ); my $frag = $doc->createDocumentFragment(); my @kids = $p->childNodes(); for ( my $i = 0; $i < @kids; $i++ ) { my $kid = $kids[$i]; next unless $kid->nodeName eq "#text"; my $text = $kid->nodeValue; $text =~ s/\A\r?\n// if $i == 0; $text =~ s/\r?\n\z// if $i == $#kids; my @lines = map { $doc->createTextNode($_) } split /(\r?\n)/, $text; for ( my $i = 0; $i < @lines; $i++ ) { $frag->appendChild($lines[$i]); unless ( $i == $#lines or $lines[$i]->nodeValue =~ /\A\r?\n\z/ ) { $frag->appendChild($br->cloneNode); } } $kid->replaceNode($frag); } } } sub traverse { # traverse("/*") -> callback my ( $self, $selector, $callback ) = @_; croak "not implemented"; } sub translate_tags { croak "not implemented"; } sub remove_style { # (* or [list]) # just calls remove with args croak "not implemented"; } sub inline_stylesheets { # (names/paths) / external sheets allowed. croak "not implemented"; my $self = shift; my $thing = shift; # :before and :after stuff is still missing # ??