package HTML::Query;
our $VERSION = '0.08';
use Badger::Class
version => $VERSION,
debug => 0,
base => 'Badger::Base',
utils => 'blessed',
import => 'class CLASS',
vars => 'AUTOLOAD',
constants => 'ARRAY',
constant => {
ELEMENT => 'HTML::Element',
BUILDER => 'HTML::TreeBuilder',
},
exports => {
any => 'Query',
hooks => {
query => \&_export_query_to_element,
},
},
messages => {
no_elements => 'No elements specified to query',
no_query => 'No query specified',
no_source => 'No argument specified for source: %s',
bad_element => 'Invalid element specified: %s',
bad_source => 'Invalid source specified: %s',
bad_query => 'Invalid query specified: %s',
bad_spec => 'Invalid specification "%s" in query: %s',
is_empty => 'The query does not contain any elements',
};
our $SOURCES = {
text => sub {
class(BUILDER)->load;
BUILDER->new_from_content(shift);
},
file => sub {
class(BUILDER)->load;
BUILDER->new_from_file(shift);
},
tree => sub {
$_[0]
},
query => sub {
ref $_[0] eq ARRAY
? @{ $_[0] }
: $_[0];
},
};
sub Query (@) {
CLASS->new(@_);
}
sub new {
my $class = shift;
my ($element, @elements, $type, $code, $select);
# expand a single list ref into items
unshift @_, @{ shift @_ }
if @_ == 1 && ref $_[0] eq ARRAY;
$class = ref $class || $class;
my $self = {
error => undef,
suppress_errors => undef,
match_self => undef,
elements => \@elements,
specificity => {}
};
# each element should be an HTML::Element object, although we might
# want to subclass this module to recognise a different kind of object,
# so we get the element class from the ELEMENT constant method which a
# subclass can re-define.
my $element_class = $class->ELEMENT;
while (@_) {
$element = shift;
$class->debug("argument: ".$element) if DEBUG;
if (! ref $element) {
# a non-reference item is a source type (text, file, tree)
# followed by the source, or if it's the last argument following
# one ore more element options or named argument pairs then it's
# a selection query
if (@_) {
$type = $element;
$code = $SOURCES->{ $type }
|| return $class->error_msg( bad_source => $type );
$element = shift;
$class->debug("source $type: $element") if DEBUG;
unshift(@_, $code->($element));
next;
}
elsif (@elements) {
$select = $element;
last;
}
}
elsif (blessed $element) {
# otherwise it should be an HTML::Element object or another
# HTML::Query object
if ($element->isa($element_class)) {
push(@elements, $element);
next;
}
elsif ($element->isa($class)) {
push(@elements, @{$element->get_elements});
next;
}
}
return $class->error_msg( bad_element => $element );
}
bless $self, $class;
return defined $select ? $self->query($select) : $self;
}
sub query {
my ($self, $query) = @_;
my @result;
my $ops = 0;
my $pos = 0;
$self->{error} = undef;
return $self->error_msg('no_query')
unless defined $query && length $query;
# multiple specs can be comma separated, e.g. "table tr td, li a, div.foo"
COMMA: while (1) {
# each comma-separated traversal spec is applied downward from
# the source elements in the $self->{elements} query
my @elements = @{$self->get_elements};
my $comops = 0;
my $specificity = 0;
my $startpos = pos($query) || 0;
my $hack_sequence = 0; # look for '* html'
warn "Starting new COMMA" if DEBUG;
# for each whitespace delimited descendant spec we grok the correct
# parameters for look_down() and apply them to each source element
# e.g. "table tr td"
SEQUENCE: while (1) {
my @args;
$pos = pos($query) || 0;
my $relationship = '';
my $leading_whitespace;
warn "Starting new SEQUENCE" if DEBUG;
# ignore any leading whitespace
if ($query =~ / \G (\s+) /cgsx) {
$leading_whitespace = defined($1) ? 1 : 0;
warn "removing leading whitespace\n" if DEBUG;
}
# grandchild selector is whitespace sensitive, requires leading whitespace
if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) {
# can't have a relationship modifier as the first part of the query
$relationship = $1;
warn "relationship = $relationship\n" if DEBUG;
}
# get other relationship modifiers
if ($query =~ / \G (>|\+) \s* /cgx) {
# can't have a relationship modifier as the first part of the query
$relationship = $1;
warn "relationship = $relationship\n" if DEBUG;
if (!$comops) {
return $self->_report_error( $self->message( bad_spec => $relationship, $query ) );
}
}
# optional leading word is a tag name
if ($query =~ / \G ([\w\*]+) /cgx) {
my $tag = $1;
if ($tag =~ m/\*/) {
if (($leading_whitespace || $comops == 0) && ($tag eq '*')) {
warn "universal tag\n" if DEBUG;
push(@args, _tag => qr/\w+/);
if ($comops == 0) { #we need to catch the case where we see '* html'
$hack_sequence++;
}
}
else {
return $self->_report_error( $self->message( bad_spec => $tag, $query ) );
}
}
else {
warn "html tag\n" if DEBUG;
$specificity += 1; # standard tags are worth 1 point
push( @args, _tag => $tag );
if ($comops == 1 && $tag eq 'html') {
$hack_sequence++;
}
}
}
# loop to collect a description about this specific part of the rule
while (1) {
my $work = scalar @args;
# that can be followed by (or the query can start with) a #id
if ($query =~ / \G \# ([\w\-]+) /cgx) {
$specificity += 100;
push( @args, id => $1 );
}
# and/or a .class
if ($query =~ / \G \. ([\w\-]+) /cgx) {
$specificity += 10;
push( @args, class => qr/ (^|\s+) $1 ($|\s+) /x );
}
# and/or none or more [ ] attribute specs
if ($query =~ / \G \[ (.*?) \] /cgx) {
my $attribute = $1;
$specificity += 10;
#if we have an operator
if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) {
my ($name,$attribute_op,$value) = ($1,$2,$3);
unless (defined($name) && length($name)) {
return $self->_report_error( $self->message( bad_spec => $name, $query ) );
}
warn "operator $attribute_op" if DEBUG;
if (defined $value) {
for ($value) {
s/^['"]//;
s/['"]$//;
}
if ($attribute_op eq '=') {
push( @args, $name => $value);
}
elsif ($attribute_op eq '|=') {
push(@args, $name => qr/\b${value}-?/)
}
elsif ($attribute_op eq '~=') {
push(@args, $name => qr/\b${value}\b/)
}
else {
return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
}
}
else {
return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
}
}
else {
unless (defined($attribute) && length($attribute)) {
return $self->_report_error( $self->message( bad_spec => $attribute, $query ) );
}
# add a regex to match anything (or nothing)
push( @args, $attribute => qr/.*/ );
}
}
# and/or one or more pseudo-classes
if ($query =~ / \G : ([\w\-]+) /cgx) {
my $pseudoclass = $1;
$specificity += 10;
if ($pseudoclass eq 'first-child') {
push( @args, sub { ! grep { ref $_ } $_[0]->left() } );
} elsif ($pseudoclass eq 'last-child') {
push( @args, sub { ! grep { ref $_ } $_[0]->right() } );
} else {
warn "Pseudoclass :$pseudoclass not supported";
next;
}
}
# keep going until this particular expression is fully processed
last unless scalar(@args) > $work;
}
# we must have something in @args by now or we didn't find any
# valid query specification this time around
last SEQUENCE unless @args;
$self->debug(
'Parsed ', substr($query, $pos, pos($query) - $pos),
' into args [', join(', ', @args), ']'
) if DEBUG;
# we want to skip certain hack sequences like '* html'
if ($hack_sequence == 2) {
@elements = []; # clear out our stored elements to match behaviour of modern browsers
}
# we're just looking for any descendent
elsif( !$relationship ) {
if ($self->{match_self}) {
# if we are re-querying, be sure to match ourselves not just descendents
@elements = map { $_->look_down(@args) } @elements;
} else {
# look_down() will match self in addition to descendents,
# so we explicitly disallow matches on self as we iterate
# thru the list. The other cases below already exclude self.
# https://rt.cpan.org/Public/Bug/Display.html?id=58918
my @accumulator;
foreach my $e (@elements) {
if ($e->root() == $e) {
push(@accumulator, $e->look_down(@args));
}
else {
push(@accumulator, grep { $_ != $e } $e->look_down(@args));
}
}
@elements = @accumulator;
}
}
# immediate child selector
elsif( $relationship eq '>' ) {
@elements = map {
$_->look_down(
@args,
sub {
my $tag = shift;
my $root = $_;
return $tag->depth == $root->depth + 1;
}
)
} @elements;
}
# immediate sibling selector
elsif( $relationship eq '+' ) {
@elements = map {
$_->parent->look_down(
@args,
sub {
my $tag = shift;
my $root = $_;
my @prev_sibling = $tag->left;
# get prev next non-text sibling
foreach my $sibling (reverse @prev_sibling) {
next unless ref $sibling;
return $sibling == $root;
}
}
)
} @elements;
}
# grandchild selector
elsif( $relationship eq '*' ) {
@elements = map {
$_->look_down(
@args,
sub {
my $tag = shift;
my $root = $_;
return $tag->depth > $root->depth + 1;
}
)
} @elements;
}
# so we can check we've done something
$comops++;
# dedup the results we've gotten
@elements = $self->_dedup(\@elements);
map { warn $_->as_HTML } @elements if DEBUG;
}
if ($comops) {
$self->debug(
'Added', scalar(@elements), ' elements to results'
) if DEBUG;
my $selector = substr ($query,$startpos, $pos - $startpos);
$self->_add_specificity($selector,$specificity);
#add in the recent pass
push(@result,@elements);
# dedup the results across the result sets, necessary for comma based selectors
@result = $self->_dedup(\@result);
# sort the result set...
@result = sort _by_address @result;
# update op counter for complete query to include ops performed
# in this fragment
$ops += $comops;
}
else {
# looks like we got an empty comma section, e.g. : ",x, ,y,"
# so we'll ignore it
}
last COMMA unless $query =~ / \G \s*,\s* /cgsx;
}
# check for any trailing text in the query that we couldn't parse
if ($query =~ / \G (.+?) \s* $ /cgsx) {
return $self->_report_error( $self->message( bad_spec => $1, $query ) );
}
# check that we performed at least one query operation
unless ($ops) {
return $self->_report_error( $self->message( bad_query => $query ) );
}
return wantarray ? @result : $self->_new_match_self(@result);
}
# return elements stored from last query
sub get_elements {
my $self = shift;
return wantarray ? @{$self->{elements}} : $self->{elements};
}
###########################################################################################################
# from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity
###########################################################################################################
# A selector's specificity is calculated as follows:
#
# * count the number of ID attributes in the selector (= a)
# * count the number of other attributes and pseudo-classes in the selector (= b)
# * count the number of element names in the selector (= c)
# * ignore pseudo-elements.
#
# Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity.
#
# Example(s):
#
# Some examples:
#
# * {} /* a=0 b=0 c=0 -> specificity = 0 */
# LI {} /* a=0 b=0 c=1 -> specificity = 1 */
# UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */
# UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */
# H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */
# UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */
# LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */
# #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */
###########################################################################################################
=pod
=item specificity()
Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade
A selector's specificity is calculated as follows:
* count the number of ID attributes in the selector (= a)
* count the number of other attributes and pseudo-classes in the selector (= b)
* count the number of element names in the selector (= c)
* ignore pseudo-elements.
The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted
as an attribute selector (a=0, b=0, c=1, d=0), even if the id attribute is defined as an "ID" in the source document's DTD.
See the following spec for additional details:
L
=back
=cut
sub get_specificity {
my ($self,$selector) = @_;
unless (exists $self->{specificity}->{$selector}) {
# if the invoking tree happened to be large this could get expensive real fast
# instead load up an empty instance and query that.
local $self->{elements} = [];
$self->query($selector);
}
return $self->{specificity}->{$selector};
}
sub suppress_errors {
my ($self, $setting) = @_;
if (defined($setting)) {
$self->{suppress_errors} = $setting;
}
return $self->{suppress_errors};
}
sub get_error {
my ($self) = @_;
return $self->{error};
}
sub list {
# return list of items or return unblessed list ref of items
return wantarray ? @{ $_[0] } : [ @{ $_[0] } ];
}
sub size {
my $self = shift;
return scalar @{$self->get_elements};
}
sub first {
my $self = shift;
return @{$self->get_elements} ? $self->get_elements->[0] : $self->error_msg('is_empty');
}
sub last {
my $self = shift;
return @{$self->get_elements} ? $self->get_elements->[-1] : $self->error_msg('is_empty');
}
####################################################################
#
# Everything below here is a private method subject to change
#
####################################################################
sub _add_specificity {
my ($self, $selector, $specificity) = @_;
$self->{specificity}->{$selector} = $specificity;
return();
}
sub _report_error {
my ($self, $message) = @_;
if ($self->suppress_errors()) {
if (defined($message)) {
$self->{error} = $message;
}
return undef;
}
else {
$self->error($message); # this will DIE
}
}
# this Just Works[tm] because first arg is HTML::Element object
sub _export_query_to_element {
class(ELEMENT)->load->method(
query => \&Query,
);
}
# remove duplicate elements in the case where elements are nested between multiple matching elements
sub _dedup {
my ($self,$elements) = @_;
my %seen = ();
my @unique = ();
foreach my $item (@{$elements}) {
if (!exists($seen{$item})) {
push(@unique, $item);
}
$seen{$item}++;
}
return @unique;
}
# utility method to assist in sorting of query return sets
sub _by_address
{
my $self = shift;
my @a = split /\./, $a->address();
my @b = split /\./, $b->address();
my $max = (scalar @a > scalar @b) ? scalar @a : scalar @b;
for (my $index=0; $index<$max; $index++) {
if (!defined($a[$index]) && !defined($b[$index])) {
return 0;
}
elsif (!defined($a[$index])) {
return -1;
}
elsif(!defined($b[$index])) {
return 1;
}
if ($a[$index] == $b[$index]) {
next; #move to the next
}
else {
return $a[$index] <=> $b[$index];
}
}
}
# instantiate an instance with match_self turned on, for use with
# follow-up queries, so they match the top-most elements.
sub _new_match_self {
my $self = shift;
my $result = $self->new(@_);
$result->{match_self} = 1;
return $result;
}
sub AUTOLOAD {
my $self = shift;
my ($method) = ($AUTOLOAD =~ /([^:]+)$/ );
return if $method eq 'DESTROY';
# we allow Perl to catch any unknown methods that the user might
# try to call against the HTML::Element objects in the query
my @results =
map { $_->$method(@_) }
@{$self->get_elements};
return wantarray ? @results : \@results;
}
1;
=head1 NAME
HTML::Query - jQuery-like selection queries for HTML::Element
=head1 SYNOPSIS
Creating an C object using the L constructor
subroutine:
use HTML::Query 'Query';
# using named parameters
$q = Query( text => $text ); # HTML text
$q = Query( file => $file ); # HTML file
$q = Query( tree => $tree ); # HTML::Element object
$q = Query( query => $query ); # HTML::Query object
$q = Query(
text => $text1, # or any combination
text => $text2, # of the above
file => $file1,
file => $file2,
tree => $tree,
query => $query,
);
# passing elements as positional arguments
$q = Query( $tree ); # HTML::Element object(s)
$q = Query( $tree1, $tree2, $tree3, ... );
# or from one or more existing queries
$q = Query( $query1 ); # HTML::Query object(s)
$q = Query( $query1, $query2, $query3, ... );
# or a mixture
$q = Query( $tree1, $query1, $tree2, $query2 );
# the final argument (in all cases) can be a selector
my $spec = 'ul.menu li a'; #