package XHTML::Util; use strict; use warnings; no warnings "uninitialized"; our $VERSION = "0.04"; use Encode; use Carp; # By verbosity? use Scalar::Util "blessed"; use HTML::Tagset 3.02 (); use HTML::Entities; use XML::LibXML; use HTML::Selector::XPath (); use HTML::TokeParser::Simple; # LWP::Simple, external styles use CSS::Tiny; my $isKnown = \%HTML::Tagset::isKnown; my $emptyElement = \%HTML::Tagset::emptyElement; #my $canTighten = \%HTML::Tagset::canTighten; #my $isHeadElement = \%HTML::Tagset::isHeadElement; my $isBodyElement = \%HTML::Tagset::isBodyElement; my $isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; #my $isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; #my $isList = \%HTML::Tagset::isList; #my $isTableElement = \%HTML::Tagset::isTableElement; my $isFormElement = \%HTML::Tagset::isFormElement; #my $p_closure_barriers = \@HTML::Tagset::p_closure_barriers; # Accommodate HTML::TokeParser's idea of a "tag." for my $t ( keys %{$emptyElement} ) { $isKnown->{"$t/"} = 1 } my $isBlockLevel = { map {; $_ => 1 } grep { ! ( $isPhraseMarkup->{$_} || $isFormElement->{$_} ) } keys %{$isBodyElement} }; sub new { my $class = shift; my $self = bless {}, $class; $self; } sub strip_tags { my $self = shift; my $content = shift; my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); carp "No selector was given to strip_tags" and return $content unless $xpath; my $root = blessed($content) =~ /\AXML::LibXML::/ ? $content : $self->_fragment_to_body_node($content); my $doc = $root->getOwnerDocument; for my $node ( $root->findnodes($xpath) ) { my $fragment = $doc->createDocumentFragment; for my $n ( $node->childNodes ) { $fragment->appendChild($n); } $node->replaceNode($fragment); } my $out = ""; $out .= $_->serialize(1) for $root->childNodes; _trim($out); } sub _trim { s/\A\s+|\s+\z//g for @_; wantarray ? @_ : $_[0]; } sub remove { # Synonymous for remove_nodes, all gone. my $self = shift; # my $content = shift; my $content = $self->_sanitize_fragment(shift) or return; my $xpath = HTML::Selector::XPath::selector_to_xpath(shift); carp "No selector was given to strip_tags" and return $content unless $xpath; my $root = blessed($content) =~ /\AXML::LibXML::/ ? $content : $self->_fragment_to_body_node($content); $_->parentNode->removeChild($_) for $root->findnodes($xpath); my $out = ""; $out .= $_->serialize(1) for $root->childNodes; _trim($out); } # No... ? requires object->call shuffling to work : sub enpara_tag { +shift->{enpara_tag} = shift || "p"; } sub enpara { my $self = shift; my $content = $self->_sanitize_fragment(shift) or return; my $selector = shift; my $root = blessed($content) eq 'XML::LibXML::Element' ? $content : $self->_fragment_to_body_node($content); $root->normalize; my $doc = $root->getOwnerDocument; if ( my $xpath = HTML::Selector::XPath::selector_to_xpath($selector) ) { NODE: for my $designated_enpara ( $root->findnodes($xpath) ) { next unless $designated_enpara->nodeType == 1; if ( $designated_enpara->nodeName eq 'pre' ) # I don't think so, honky. { # Expand or leave it alone? or ->validate it...? carp "It makes no sense to enpara within a
; skipping";
                next NODE;
            }
            next unless $isBlockLevel->{$designated_enpara->nodeName};
            _enpara_this_nodes_content($designated_enpara, $doc);
        }
    }
    _enpara_this_nodes_content($root, $doc);
    my $out = "";
    $out .= $_->serialize(1) for $root->childNodes;
    _trim($out);
}

sub _enpara_this_nodes_content {
    my ( $parent, $doc ) = @_;
    my $lastChild = $parent->lastChild;
    my @naked_block;
    for my $node ( $parent->childNodes )
    {
        if ( $isBlockLevel->{$node->nodeName}
             or
             $node->nodeName eq "a" # special case block level, so IGNORE
             and
             grep { $_->nodeName eq "img" } $node->childNodes
             )
        {
            next unless @naked_block; # nothing to enblock
            my $p = $doc->createElement("p");
            $p->setAttribute("enpara","enpara");
            $p->appendChild($_) for @naked_block;
            $parent->insertBefore( $p, $node )
                if $p->textContent =~ /\S/;
            @naked_block = ();
        }
        elsif ( $node->nodeType == 3
                and
                $node->nodeValue =~ /(?:[^\S\n]*\n){2,}/
                )
        {
            my $text = $node->nodeValue;
            my @text_part = map { $doc->createTextNode($_) }
                split /([^\S\n]*\n){2,}/, $text;

            my @new_node;
            for ( my $x = 0; $x < @text_part; $x++ )
            {
                if ( $text_part[$x]->nodeValue =~ /\S/ )
                {
                    push @naked_block, $text_part[$x];
                }
                else # it's a blank newline node so _STOP_
                {
                    next unless @naked_block;
                    my $p = $doc->createElement("p");
                    $p->setAttribute("enpara","enpara");
                    $p->appendChild($_) for @naked_block;
                    @naked_block = ();
                    push @new_node, $p;
                }
            }
            if ( @new_node )
            {
                $parent->insertAfter($new_node[0], $node);
                for ( my $x = 1; $x < @new_node; $x++ )
                {
                    $parent->insertAfter($new_node[$x], $new_node[$x-1]);
                }
            }
            $node->unbindNode;
        }
        else
        {
            push @naked_block, $node; # if $node->nodeValue =~ /\S/;
        }

        if ( $node->isSameNode( $lastChild )
             and @naked_block )
        {
            my $p = $doc->createElement("p");
            $p->setAttribute("enpara","enpara");
            $p->appendChild($_) for ( @naked_block );
            $parent->appendChild($p) if $p->textContent =~ /\S/;
        }
    }

    my $newline = $doc->createTextNode("\n");
    my $br = $doc->createElement("br");

    for my $p ( $parent->findnodes('//p[@enpara="enpara"]') )
    {
        $p->removeAttribute("enpara");
        $parent->insertBefore( $newline->cloneNode, $p );
        $parent->insertAfter( $newline->cloneNode, $p );

        my $frag = $doc->createDocumentFragment();

        my @kids = $p->childNodes();
        for ( my $i = 0; $i < @kids; $i++ )
        {
            my $kid = $kids[$i];
            next unless $kid->nodeName eq "#text";
            my $text = $kid->nodeValue;
            $text =~ s/\A\r?\n// if $i == 0;
            $text =~ s/\r?\n\z// if $i == $#kids;

            my @lines = map { $doc->createTextNode($_) }
                split /(\r?\n)/, $text;

            for ( my $i = 0; $i < @lines; $i++ )
            {
                $frag->appendChild($lines[$i]);
                unless ( $i == $#lines
                         or
                         $lines[$i]->nodeValue =~ /\A\r?\n\z/ )
                {
                    $frag->appendChild($br->cloneNode);
                }
            }
            $kid->replaceNode($frag);
        }
    }
}

sub traverse { # traverse("/*") -> callback
    my ( $self, $selector, $callback ) = @_;
    croak "not implemented";
}

sub translate_tags {
    croak "not implemented";
}

sub remove_style { # (* or [list])
    # just calls remove with args
    croak "not implemented";
}

sub inline_stylesheets { # (names/paths) / external sheets allowed.
    croak "not implemented";
    my $self = shift;
    my $thing = shift;
# :before and :after stuff is still missing
# ??