package HTML::Query; our $VERSION = '0.08'; use Badger::Class version => $VERSION, debug => 0, base => 'Badger::Base', utils => 'blessed', import => 'class CLASS', vars => 'AUTOLOAD', constants => 'ARRAY', constant => { ELEMENT => 'HTML::Element', BUILDER => 'HTML::TreeBuilder', }, exports => { any => 'Query', hooks => { query => \&_export_query_to_element, }, }, messages => { no_elements => 'No elements specified to query', no_query => 'No query specified', no_source => 'No argument specified for source: %s', bad_element => 'Invalid element specified: %s', bad_source => 'Invalid source specified: %s', bad_query => 'Invalid query specified: %s', bad_spec => 'Invalid specification "%s" in query: %s', is_empty => 'The query does not contain any elements', }; our $SOURCES = { text => sub { class(BUILDER)->load; BUILDER->new_from_content(shift); }, file => sub { class(BUILDER)->load; BUILDER->new_from_file(shift); }, tree => sub { $_[0] }, query => sub { ref $_[0] eq ARRAY ? @{ $_[0] } : $_[0]; }, }; sub Query (@) { CLASS->new(@_); } sub new { my $class = shift; my ($element, @elements, $type, $code, $select); # expand a single list ref into items unshift @_, @{ shift @_ } if @_ == 1 && ref $_[0] eq ARRAY; $class = ref $class || $class; my $self = { error => undef, suppress_errors => undef, match_self => undef, elements => \@elements, specificity => {} }; # each element should be an HTML::Element object, although we might # want to subclass this module to recognise a different kind of object, # so we get the element class from the ELEMENT constant method which a # subclass can re-define. my $element_class = $class->ELEMENT; while (@_) { $element = shift; $class->debug("argument: ".$element) if DEBUG; if (! ref $element) { # a non-reference item is a source type (text, file, tree) # followed by the source, or if it's the last argument following # one ore more element options or named argument pairs then it's # a selection query if (@_) { $type = $element; $code = $SOURCES->{ $type } || return $class->error_msg( bad_source => $type ); $element = shift; $class->debug("source $type: $element") if DEBUG; unshift(@_, $code->($element)); next; } elsif (@elements) { $select = $element; last; } } elsif (blessed $element) { # otherwise it should be an HTML::Element object or another # HTML::Query object if ($element->isa($element_class)) { push(@elements, $element); next; } elsif ($element->isa($class)) { push(@elements, @{$element->get_elements}); next; } } return $class->error_msg( bad_element => $element ); } bless $self, $class; return defined $select ? $self->query($select) : $self; } sub query { my ($self, $query) = @_; my @result; my $ops = 0; my $pos = 0; $self->{error} = undef; return $self->error_msg('no_query') unless defined $query && length $query; # multiple specs can be comma separated, e.g. "table tr td, li a, div.foo" COMMA: while (1) { # each comma-separated traversal spec is applied downward from # the source elements in the $self->{elements} query my @elements = @{$self->get_elements}; my $comops = 0; my $specificity = 0; my $startpos = pos($query) || 0; my $hack_sequence = 0; # look for '* html' warn "Starting new COMMA" if DEBUG; # for each whitespace delimited descendant spec we grok the correct # parameters for look_down() and apply them to each source element # e.g. "table tr td" SEQUENCE: while (1) { my @args; $pos = pos($query) || 0; my $relationship = ''; my $leading_whitespace; warn "Starting new SEQUENCE" if DEBUG; # ignore any leading whitespace if ($query =~ / \G (\s+) /cgsx) { $leading_whitespace = defined($1) ? 1 : 0; warn "removing leading whitespace\n" if DEBUG; } # grandchild selector is whitespace sensitive, requires leading whitespace if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) { # can't have a relationship modifier as the first part of the query $relationship = $1; warn "relationship = $relationship\n" if DEBUG; } # get other relationship modifiers if ($query =~ / \G (>|\+) \s* /cgx) { # can't have a relationship modifier as the first part of the query $relationship = $1; warn "relationship = $relationship\n" if DEBUG; if (!$comops) { return $self->_report_error( $self->message( bad_spec => $relationship, $query ) ); } } # optional leading word is a tag name if ($query =~ / \G ([\w\*]+) /cgx) { my $tag = $1; if ($tag =~ m/\*/) { if (($leading_whitespace || $comops == 0) && ($tag eq '*')) { warn "universal tag\n" if DEBUG; push(@args, _tag => qr/\w+/); if ($comops == 0) { #we need to catch the case where we see '* html' $hack_sequence++; } } else { return $self->_report_error( $self->message( bad_spec => $tag, $query ) ); } } else { warn "html tag\n" if DEBUG; $specificity += 1; # standard tags are worth 1 point push( @args, _tag => $tag ); if ($comops == 1 && $tag eq 'html') { $hack_sequence++; } } } # loop to collect a description about this specific part of the rule while (1) { my $work = scalar @args; # that can be followed by (or the query can start with) a #id if ($query =~ / \G \# ([\w\-]+) /cgx) { $specificity += 100; push( @args, id => $1 ); } # and/or a .class if ($query =~ / \G \. ([\w\-]+) /cgx) { $specificity += 10; push( @args, class => qr/ (^|\s+) $1 ($|\s+) /x ); } # and/or none or more [ ] attribute specs if ($query =~ / \G \[ (.*?) \] /cgx) { my $attribute = $1; $specificity += 10; #if we have an operator if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) { my ($name,$attribute_op,$value) = ($1,$2,$3); unless (defined($name) && length($name)) { return $self->_report_error( $self->message( bad_spec => $name, $query ) ); } warn "operator $attribute_op" if DEBUG; if (defined $value) { for ($value) { s/^['"]//; s/['"]$//; } if ($attribute_op eq '=') { push( @args, $name => $value); } elsif ($attribute_op eq '|=') { push(@args, $name => qr/\b${value}-?/) } elsif ($attribute_op eq '~=') { push(@args, $name => qr/\b${value}\b/) } else { return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) ); } } else { return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) ); } } else { unless (defined($attribute) && length($attribute)) { return $self->_report_error( $self->message( bad_spec => $attribute, $query ) ); } # add a regex to match anything (or nothing) push( @args, $attribute => qr/.*/ ); } } # and/or one or more pseudo-classes if ($query =~ / \G : ([\w\-]+) /cgx) { my $pseudoclass = $1; $specificity += 10; if ($pseudoclass eq 'first-child') { push( @args, sub { ! grep { ref $_ } $_[0]->left() } ); } elsif ($pseudoclass eq 'last-child') { push( @args, sub { ! grep { ref $_ } $_[0]->right() } ); } else { warn "Pseudoclass :$pseudoclass not supported"; next; } } # keep going until this particular expression is fully processed last unless scalar(@args) > $work; } # we must have something in @args by now or we didn't find any # valid query specification this time around last SEQUENCE unless @args; $self->debug( 'Parsed ', substr($query, $pos, pos($query) - $pos), ' into args [', join(', ', @args), ']' ) if DEBUG; # we want to skip certain hack sequences like '* html' if ($hack_sequence == 2) { @elements = []; # clear out our stored elements to match behaviour of modern browsers } # we're just looking for any descendent elsif( !$relationship ) { if ($self->{match_self}) { # if we are re-querying, be sure to match ourselves not just descendents @elements = map { $_->look_down(@args) } @elements; } else { # look_down() will match self in addition to descendents, # so we explicitly disallow matches on self as we iterate # thru the list. The other cases below already exclude self. # https://rt.cpan.org/Public/Bug/Display.html?id=58918 my @accumulator; foreach my $e (@elements) { if ($e->root() == $e) { push(@accumulator, $e->look_down(@args)); } else { push(@accumulator, grep { $_ != $e } $e->look_down(@args)); } } @elements = @accumulator; } } # immediate child selector elsif( $relationship eq '>' ) { @elements = map { $_->look_down( @args, sub { my $tag = shift; my $root = $_; return $tag->depth == $root->depth + 1; } ) } @elements; } # immediate sibling selector elsif( $relationship eq '+' ) { @elements = map { $_->parent->look_down( @args, sub { my $tag = shift; my $root = $_; my @prev_sibling = $tag->left; # get prev next non-text sibling foreach my $sibling (reverse @prev_sibling) { next unless ref $sibling; return $sibling == $root; } } ) } @elements; } # grandchild selector elsif( $relationship eq '*' ) { @elements = map { $_->look_down( @args, sub { my $tag = shift; my $root = $_; return $tag->depth > $root->depth + 1; } ) } @elements; } # so we can check we've done something $comops++; # dedup the results we've gotten @elements = $self->_dedup(\@elements); map { warn $_->as_HTML } @elements if DEBUG; } if ($comops) { $self->debug( 'Added', scalar(@elements), ' elements to results' ) if DEBUG; my $selector = substr ($query,$startpos, $pos - $startpos); $self->_add_specificity($selector,$specificity); #add in the recent pass push(@result,@elements); # dedup the results across the result sets, necessary for comma based selectors @result = $self->_dedup(\@result); # sort the result set... @result = sort _by_address @result; # update op counter for complete query to include ops performed # in this fragment $ops += $comops; } else { # looks like we got an empty comma section, e.g. : ",x, ,y," # so we'll ignore it } last COMMA unless $query =~ / \G \s*,\s* /cgsx; } # check for any trailing text in the query that we couldn't parse if ($query =~ / \G (.+?) \s* $ /cgsx) { return $self->_report_error( $self->message( bad_spec => $1, $query ) ); } # check that we performed at least one query operation unless ($ops) { return $self->_report_error( $self->message( bad_query => $query ) ); } return wantarray ? @result : $self->_new_match_self(@result); } # return elements stored from last query sub get_elements { my $self = shift; return wantarray ? @{$self->{elements}} : $self->{elements}; } ########################################################################################################### # from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity ########################################################################################################### # A selector's specificity is calculated as follows: # # * count the number of ID attributes in the selector (= a) # * count the number of other attributes and pseudo-classes in the selector (= b) # * count the number of element names in the selector (= c) # * ignore pseudo-elements. # # Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity. # # Example(s): # # Some examples: # # * {} /* a=0 b=0 c=0 -> specificity = 0 */ # LI {} /* a=0 b=0 c=1 -> specificity = 1 */ # UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */ # UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */ # H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */ # UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */ # LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */ # #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */ ########################################################################################################### =pod =item specificity() Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade A selector's specificity is calculated as follows: * count the number of ID attributes in the selector (= a) * count the number of other attributes and pseudo-classes in the selector (= b) * count the number of element names in the selector (= c) * ignore pseudo-elements. The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted as an attribute selector (a=0, b=0, c=1, d=0), even if the id attribute is defined as an "ID" in the source document's DTD. See the following spec for additional details: L =back =cut sub get_specificity { my ($self,$selector) = @_; unless (exists $self->{specificity}->{$selector}) { # if the invoking tree happened to be large this could get expensive real fast # instead load up an empty instance and query that. local $self->{elements} = []; $self->query($selector); } return $self->{specificity}->{$selector}; } sub suppress_errors { my ($self, $setting) = @_; if (defined($setting)) { $self->{suppress_errors} = $setting; } return $self->{suppress_errors}; } sub get_error { my ($self) = @_; return $self->{error}; } sub list { # return list of items or return unblessed list ref of items return wantarray ? @{ $_[0] } : [ @{ $_[0] } ]; } sub size { my $self = shift; return scalar @{$self->get_elements}; } sub first { my $self = shift; return @{$self->get_elements} ? $self->get_elements->[0] : $self->error_msg('is_empty'); } sub last { my $self = shift; return @{$self->get_elements} ? $self->get_elements->[-1] : $self->error_msg('is_empty'); } #################################################################### # # Everything below here is a private method subject to change # #################################################################### sub _add_specificity { my ($self, $selector, $specificity) = @_; $self->{specificity}->{$selector} = $specificity; return(); } sub _report_error { my ($self, $message) = @_; if ($self->suppress_errors()) { if (defined($message)) { $self->{error} = $message; } return undef; } else { $self->error($message); # this will DIE } } # this Just Works[tm] because first arg is HTML::Element object sub _export_query_to_element { class(ELEMENT)->load->method( query => \&Query, ); } # remove duplicate elements in the case where elements are nested between multiple matching elements sub _dedup { my ($self,$elements) = @_; my %seen = (); my @unique = (); foreach my $item (@{$elements}) { if (!exists($seen{$item})) { push(@unique, $item); } $seen{$item}++; } return @unique; } # utility method to assist in sorting of query return sets sub _by_address { my $self = shift; my @a = split /\./, $a->address(); my @b = split /\./, $b->address(); my $max = (scalar @a > scalar @b) ? scalar @a : scalar @b; for (my $index=0; $index<$max; $index++) { if (!defined($a[$index]) && !defined($b[$index])) { return 0; } elsif (!defined($a[$index])) { return -1; } elsif(!defined($b[$index])) { return 1; } if ($a[$index] == $b[$index]) { next; #move to the next } else { return $a[$index] <=> $b[$index]; } } } # instantiate an instance with match_self turned on, for use with # follow-up queries, so they match the top-most elements. sub _new_match_self { my $self = shift; my $result = $self->new(@_); $result->{match_self} = 1; return $result; } sub AUTOLOAD { my $self = shift; my ($method) = ($AUTOLOAD =~ /([^:]+)$/ ); return if $method eq 'DESTROY'; # we allow Perl to catch any unknown methods that the user might # try to call against the HTML::Element objects in the query my @results = map { $_->$method(@_) } @{$self->get_elements}; return wantarray ? @results : \@results; } 1; =head1 NAME HTML::Query - jQuery-like selection queries for HTML::Element =head1 SYNOPSIS Creating an C object using the L constructor subroutine: use HTML::Query 'Query'; # using named parameters $q = Query( text => $text ); # HTML text $q = Query( file => $file ); # HTML file $q = Query( tree => $tree ); # HTML::Element object $q = Query( query => $query ); # HTML::Query object $q = Query( text => $text1, # or any combination text => $text2, # of the above file => $file1, file => $file2, tree => $tree, query => $query, ); # passing elements as positional arguments $q = Query( $tree ); # HTML::Element object(s) $q = Query( $tree1, $tree2, $tree3, ... ); # or from one or more existing queries $q = Query( $query1 ); # HTML::Query object(s) $q = Query( $query1, $query2, $query3, ... ); # or a mixture $q = Query( $tree1, $query1, $tree2, $query2 ); # the final argument (in all cases) can be a selector my $spec = 'ul.menu li a'; #