package HTML::TreeBuilder::XPath;
use strict;
use warnings;
use vars qw($VERSION);
$VERSION = '0.09';
my %ENT= ( '&' => '&', '<' => '<', '>' => '>', );
package HTML::TreeBuilder::XPath;
use base( 'HTML::TreeBuilder');
package HTML::TreeBuilder::XPath::Node;
sub isElementNode { 0 }
sub isAttributeNode { 0 }
sub isNamespaceNode { 0 }
sub isTextNode { 0 }
sub isProcessingInstructionNode { 0 }
sub isPINode { 0 }
sub isCommentNode { 0 }
sub getChildNodes { return wantarray ? () : []; }
sub getFirstChild { return undef; }
sub getLastChild { return undef; }
sub getElementById
{ my ($self, $id) = @_;
return scalar $self->look_down( id => $id);
}
sub to_number { return XML::XPathEngine::Number->new( shift->getValue); }
sub cmp
{ my( $a, $b)=@_;
# comparison with the root (in $b, or processed in HTML::TreeBuilder::XPath::Root)
if( $b->isa( 'HTML::TreeBuilder::XPath::Root') ) { return -1; }
# easy cases
return 0 if( $a == $b);
return 1 if( $a->is_inside($b)); # a starts after b
return -1 if( $b->is_inside($a)); # a starts before b
# lineage does not include the element itself
my @a_pile= ($a, $a->lineage);
my @b_pile= ($b, $b->lineage);
# the 2 elements are not in the same twig
unless( $a_pile[-1] == $b_pile[-1])
{ warn "2 nodes not in the same pile: ", ref( $a), " - ", ref( $b), "\n";
print "a: ", $a->string_value, "\nb: ", $b->string_value, "\n";
return undef;
}
# find the first non common ancestors (they are siblings)
my $a_anc= pop @a_pile;
my $b_anc= pop @b_pile;
while( $a_anc == $b_anc)
{ $a_anc= pop @a_pile;
$b_anc= pop @b_pile;
}
if( defined( $a_anc->{_rank}) && defined( $b_anc->{_rank}))
{ return $a_anc->{_rank} <=> $b_anc->{_rank}; }
else
{
# from there move left and right and figure out the order
my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
while()
{ $a_prev= $a_prev->getPreviousSibling || return -1;
return 1 if( $a_prev == $b_anc);
$a_next= $a_next->getNextSibling || return 1;
return -1 if( $a_next == $b_anc);
$b_prev= $b_prev->getPreviousSibling || return 1;
return -1 if( $b_prev == $a_next);
$b_next= $b_next->getNextSibling || return -1;
return 1 if( $b_next == $a_prev);
}
}
}
# need to modify directly the HTML::Element package, because HTML::TreeBuilder won't let me
# change the class of the nodes it generates
package HTML::Element;
use Scalar::Util qw(weaken);
use vars qw(@ISA);
push @ISA, 'HTML::TreeBuilder::XPath::Node';
use XML::XPathEngine;
{ my $xp;
sub xp
{ $xp ||=XML::XPathEngine->new();
return $xp;
}
}
sub findnodes { my( $elt, $path)= @_; return xp->findnodes( $path, $elt); }
sub findnodes_as_string { my( $elt, $path)= @_; return xp->findnodes_as_string( $path, $elt); }
sub findvalue { my( $elt, $path)= @_; return xp->findvalue( $path, $elt); }
sub exists { my( $elt, $path)= @_; return xp->exists( $path, $elt); }
sub find_xpath { my( $elt, $path)= @_; return xp->find( $path, $elt); }
sub matches { my( $elt, $path)= @_; return xp->matches( $elt, $path, $elt); }
sub set_namespace { my $elt= shift; xp->new->set_namespace( @_); }
sub getRootNode
{ my $elt= shift;
# The parent of root is a HTML::TreeBuilder::XPath::Root
# that helps getting the tree to mimic a DOM tree
return $elt->root->getParentNode; # I like this one!
}
sub getParentNode
{ my $elt= shift;
return $elt->{_parent} || bless { _root => $elt }, 'HTML::TreeBuilder::XPath::Root';
}
sub getName { return shift->tag; }
sub getNextSibling { my( $elt)= @_;
my $parent= $elt->{_parent} || return undef;
return $parent->_child_as_object( scalar $elt->right, ($elt->{_rank} || 0) + 1);
}
sub getPreviousSibling { my( $elt)= @_;
my $parent= $elt->{_parent} || return undef;
return undef unless $elt->{_rank};
return $parent->_child_as_object( scalar $elt->left, $elt->{_rank} - 1);
}
sub isElementNode { return ref $_[0] && ($_[0]->{_tag}!~ m{^~}) ? 1 : 0; }
sub isCommentNode { return ref $_[0] && ($_[0]->{_tag} eq '~comment') ? 1 : 0; }
sub isProcessingInstructionNode { return ref $_[0] && ($_[0]->{_tag} eq '~pi') ? 1 : 0; }
sub isTextNode { return ref $_[0] ? 0 : 1; }
sub getValue
{ my $elt= shift;
if( $elt->isCommentNode) { return $elt->{_text}; }
return $elt->as_text;
}
sub getChildNodes
{ my $parent= shift;
my $rank=0;
my @children= map { $parent->_child_as_object( $_, $rank++) } $parent->content_list;
return wantarray ? @children : \@children;
}
sub getFirstChild
{ my $parent= shift;
my @content= $parent->content_list;
if( @content)
{ return $parent->_child_as_object( $content[0], 0); }
else
{ return undef; }
}
sub getLastChild
{ my $parent= shift;
my @content= $parent->content_list;
if( @content)
{ return $parent->_child_as_object( $content[-1], $#content); }
else
{ return undef; }
}
sub getAttributes
{ my $elt= shift;
my %atts= $elt->all_external_attr;
my $rank=0;
my @atts= map { bless( { _name => $_, _value => $atts{$_},
_elt => $elt, _rank => $rank++,
},
'HTML::TreeBuilder::XPath::Attribute'
)
} sort keys %atts;
return wantarray ? @atts : \@atts;
}
sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
sub string_value
{ my $elt= shift;
if( $elt->isCommentNode) { return $elt->{_text}; }
return $elt->as_text;
};
# called on a parent, with a child as second argument and its rank as third
# returns the child if it is already an element, or
# a new HTML::TreeBuilder::XPath::Text element if it is a plain string
sub _child_as_object
{ my( $elt, $elt_or_text, $rank)= @_;
return undef unless( defined $elt_or_text);
if( ! ref $elt_or_text)
{ # $elt_or_text is a string, turn it into a TextNode object
$elt_or_text= bless { _content => $elt_or_text, _parent => $elt, },
'HTML::TreeBuilder::XPath::TextNode'
;
}
if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); }
$elt_or_text->{_rank}= $rank; # used for sorting;
return $elt_or_text;
}
package HTML::TreeBuilder::XPath::TextNode;
use base 'HTML::TreeBuilder::XPath::Node';
sub getParentNode { return shift->{_parent}; }
sub getValue { return shift->{_content}; }
sub isTextNode { return 1; }
sub getAttributes { return wantarray ? () : []; }
# extracted from _HTML::Element as_XML
sub as_XML
{ my( $node, $entities)= @_;
my $content= $node->{_content};
if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
{ $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
else
{ HTML::Element::_xml_escape($content); }
return $content;
}
sub getPreviousSibling
{ my $self= shift;
my $rank= $self->{_rank};
#unless( defined $self->{_rank})
# { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
my $parent= $self->{_parent};
return $rank ? $parent->_child_as_object( $parent->{_content}->[$rank-1], $rank-1) : undef;
}
sub getNextSibling
{ my $self= shift;
my $rank= $self->{_rank};
#unless( defined $self->{_rank})
# { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
my $parent= $self->{_parent};
my $next_sibling= $parent->{_content}->[$rank+1];
return defined( $next_sibling) ? $parent->_child_as_object( $next_sibling, $rank+1) : undef;
}
sub getRootNode
{ return shift->{_parent}->getRootNode; }
sub string_value { return shift->{_content}; }
# added to provide element-like methods to text nodes, for use by cmp
sub lineage
{ my( $node)= @_;
my $parent= $node->{_parent};
return( $parent, $parent->lineage);
}
sub is_inside
{ my( $text, $node)= @_;
return $text->{_parent}->is_inside( $node);
}
sub xml_escape
{ my( $text)= @_;
$text=~ s{([&<>])}{$ENT{$1}}g;
return $text;
}
1;
package HTML::TreeBuilder::XPath::Attribute;
use base 'HTML::TreeBuilder::XPath::Node';
sub getParentNode { return $_[0]->{_elt}; }
sub getValue { return $_[0]->{_value}; }
sub getName { return $_[0]->{_name} ; }
sub getLocalName { (my $name= $_[0]->{_name}) =~ s{^.*:}{}; $name; }
sub string_value { return $_[0]->{_value}; }
sub to_number { return XML::XPathEngine::Number->new( $_[0]->{_value}); }
sub isAttributeNode { 1 }
sub toString { return qq{$_[0]->{_name}="$_[0]->{_value}"}; }
# awfully inefficient, but hopefully this is called only for weird (read test-case) queries
sub getPreviousSibling
{ my $self= shift;
my $rank= $self->{_rank};
return undef unless $rank;
my %atts= $self->{_elt}->all_external_attr;
my $previous_att_name= (sort keys %atts)[$rank-1];
return bless( { _name => $previous_att_name,
_value => $atts{$previous_att_name},
_elt => $self->{_elt}, _rank => $rank-1,
}, 'HTML::TreeBuilder::XPath::Attribute'
);
}
sub getNextSibling
{ my $self= shift;
my $rank= $self->{_rank};
my %atts= $self->{_elt}->all_external_attr;
my $next_att_name= (sort keys %atts)[$rank+1] || return undef;
return bless( { _name => $next_att_name, _value => $atts{$next_att_name},
_elt => $self->{_elt}, _rank => $rank+1,
}, 'HTML::TreeBuilder::XPath::Attribute'
);
}
# added to provide element-like methods to attributes, for use by cmp
sub lineage
{ my( $att)= @_;
my $elt= $att->{_elt};
return( $elt, $elt->lineage);
}
sub is_inside
{ my( $att, $node)= @_;
return ($att->{_elt} == $node) || $att->{_elt}->is_inside( $node);
}
1;
package HTML::TreeBuilder::XPath::Root;
use base 'HTML::TreeBuilder::XPath::Node';
sub getParentNode { return (); }
sub getChildNodes { my @content= ( $_[0]->{_root}); return wantarray ? @content : \@content; }
sub getAttributes { return [] }
sub isDocumentNode { return 1 }
# added to provide element-like methods to root, for use by cmp
sub lineage { return ($_[0]); }
sub is_inside { return 0; }
sub cmp { return $_[1]->isa( ' HTML::TreeBuilder::XPath::Root') ? 0 : 1; }
1;
__END__
=head1 NAME
HTML::TreeBuilder::XPath - add XPath support to HTML::TreeBuilder
=head1 SYNOPSIS
use HTML::TreeBuilder::XPath;
my $tree= HTML::TreeBuilder::XPath->new;
$tree->parse_file( "mypage.html");
my $nb=$tree->findvalue( '/html/body//p[@class="section_title"]/span[@class="nb"]');
my $id=$tree->findvalue( '/html/body//p[@class="section_title"]/@id');
my $p= $html->findnodes( '//p[@id="toto"]')->[0];
my $link_texts= $p->findvalue( './a'); # the texts of all a elements in $p
=head1 DESCRIPTION
This module adds typical XPath methods to HTML::TreeBuilder, to make it
easy to query a document.
=head1 METHODS
Extra methods added both to the tree object and to each element:
=head2 findnodes ($path)
Returns a list of nodes found by C<$path>.
In scalar context returns an C object.
=head2 findnodes_as_string ($path)
Returns the text values of the nodes
=head2 findvalue ($path)
Returns either a C, a C
or a C object. If the path returns a NodeSet,
$nodeset->xpath_to_literal is called automatically for you (and thus a
C is returned). Note that
for each of the objects stringification is overloaded, so you can just
print the value found, or manipulate it in the ways you would a normal
perl value (e.g. using regular expressions).
=head2 exists ($path)
Returns true if the given path exists.
=head2 matches($path)
Returns true if the element matches the path.
=head2 find ($path)
The find function takes an XPath expression (a string) and returns either a
Tree::XPathEngine::NodeSet object containing the nodes it found (or empty if
no nodes matched the path), or one of XML::XPathEngine::Literal (a string),
XML::XPathEngine::Number, or XML::XPathEngine::Boolean. It should always
return something - and you can use ->isa() to find out what it returned. If
you need to check how many nodes it found you should check $nodeset->size.
See L.
=head1 SEE ALSO
L
L
=head1 AUTHOR
Michel Rodriguez, Emirod@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Michel Rodriguez
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut