package HTML::TreeBuilder::LibXML; use strict; use warnings; our $VERSION = '0.23'; use Carp (); use base 'HTML::TreeBuilder::LibXML::Node'; use XML::LibXML; use 5.008001; sub new { my $class = shift; bless {}, $class; } sub new_from_content { my $class = shift; my $self = $class->new; for my $content (@_) { $self->parse($content); } $self->eof; return $self; } sub new_from_file { my $class = shift; my $self = $class->new; $self->parse_file(@_); return $self; } my $PARSER; sub _parser { unless ($PARSER) { $PARSER = XML::LibXML->new(); $PARSER->recover(1); $PARSER->recover_silently(1); $PARSER->keep_blanks(0); $PARSER->expand_entities(1); $PARSER->no_network(1); } $PARSER; } sub parse { my ($self, $html) = @_; $self->{_content} .= $html; } sub parse_content { my $self = shift; $self->parse($_[0]); $self->eof; } sub parse_file { my $self = shift; open (my $fh, '<', $_[0]) or die "Can't open $_[0]: $!\n"; my $content = do { local $/; <$fh> }; $self->parse_content($content); } sub eof { my ($self, ) = @_; $self->{_content} = ' ' if defined $self->{_content} && $self->{_content} eq ''; # HACK $self->{_implicit_html} = 1 unless $self->{_content} =~ / was inserted $self->{_implicit_doctype} = 1 unless $self->{_content} =~ /_parser->parse_html_string($self->{_content}); $self->{node} = $self->_documentElement($doc); } sub _documentElement { my($self, $doc) = @_; return $doc->documentElement || do { my $elem = $doc->createElement("html"); $elem->appendChild($doc->createElement("body")); $elem; }; } sub elementify { bless shift, 'HTML::TreeBuilder::LibXML::Node'; } sub guts { my ($self, $destructive) = @_; my @out = $self->{_implicit_html} ? map { $_->nonBlankChildNodes } $self->{node}->findnodes('/html/head | /html/body') : $self->{node}; if ($destructive && @out > 0) { my $doc = XML::LibXML->createDocument; if (!$self->{_implicit_doctype} && (my $dtd = $out[0]->ownerDocument->internalSubset)) { $doc->createInternalSubset( $dtd->getName, $dtd->publicId, $dtd->systemId ); } $doc->setDocumentElement($out[0]); # 1st child $out[0]->addSibling($_) foreach @out[1..$#out]; } return map { HTML::TreeBuilder::LibXML::Node->new($_) } @out if wantarray; # one simple normal case. return unless @out; my $doc = XML::LibXML->createDocument; if (!$self->{_implicit_doctype} && (my $dtd = $out[0]->ownerDocument->internalSubset)) { $doc->createInternalSubset( $dtd->getName, $dtd->publicId, $dtd->systemId ); } if (@out == 1) { $doc->adoptNode($out[0]); $doc->setDocumentElement($out[0]); return HTML::TreeBuilder::LibXML::Node->new($out[0]); } my $div = $doc->createElement('div'); # TODO put the _implicit flag somewhere, to be compatible with HTML::TreeBuilders $doc->setDocumentElement($div); $div->appendChild($_) for @out; return HTML::TreeBuilder::LibXML::Node->new($div); } sub disembowel { my ($self) = @_; $self->guts(1); } sub replace_original { require HTML::TreeBuilder::XPath; my $orig = HTML::TreeBuilder::XPath->can('new'); no warnings 'redefine'; *HTML::TreeBuilder::XPath::new = sub { HTML::TreeBuilder::LibXML->new(); }; if (defined wantarray) { return HTML::TreeBuilder::LibXML::Destructor->new( sub { *HTML::TreeBuilder::XPath::new = $orig } ); } return; } # The HTML::TreeBuilder has this method and it is needed to us for web-scraper module sub store_comments { } sub ignore_unknown { } package # hide from cpan HTML::TreeBuilder::LibXML::Destructor; sub new { my ( $class, $callback ) = @_; bless { cb => $callback }, $class; } sub DESTROY { my $self = shift; $self->{cb}->(); } 1; __END__ =encoding utf-8 =head1 NAME HTML::TreeBuilder::LibXML - HTML::TreeBuilder and XPath compatible interface with libxml =head1 SYNOPSIS use HTML::TreeBuilder::LibXML; my $tree = HTML::TreeBuilder::LibXML->new; $tree->parse($html); $tree->eof; # $tree and $node compatible to HTML::Element my @nodes = $tree->findvalue($xpath); for my $node (@nodes) { print $node->tag; my %attr = $node->all_external_attr; } HTML::TreeBuilder::LibXML->replace_original(); # replace HTML::TreeBuilder::XPath->new =head1 DESCRIPTION HTML::TreeBuilder::XPath is libxml based compatible interface to HTML::TreeBuilder, which could be slow for a large document. HTML::TreeBuilder::LibXML is drop-in-replacement for HTML::TreeBuilder::XPath. This module doesn't implement all of HTML::TreeBuilder and HTML::Element APIs, but enough methods are defined so modules like Web::Scraper work. =head1 BENCHMARK This is a benchmark result by tools/benchmark.pl Web::Scraper: 0.26 HTML::TreeBuilder::XPath: 0.09 HTML::TreeBuilder::LibXML: 0.01_01 Rate no_libxml use_libxml no_libxml 5.45/s -- -94% use_libxml 94.3/s 1632% -- =head1 AUTHOR Tokuhiro Matsuno Etokuhirom@gmail.comE Tatsuhiko Miyagawa Emiyagawa@cpan.orgE Masahiro Chiba =head1 THANKS TO woremacx++ http://d.hatena.ne.jp/woremacx/20080202/1201927162 id:dailyflower =head1 SEE ALSO L, L =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut