#
# $Id: libXML.pm,v 0.2 2009/02/21 11:47:58 dankogai Exp dankogai $
#
package HTML::Tidy::libXML;
use warnings;
use strict;
use Encode;
use XML::LibXML;
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;

# new() - class constructor.
#
# Creates an XML::LibXML parser with validation switched off and silent
# recovery switched on, and keeps it in the object under the key 'lx'.
sub new {
    my $class = shift;
    my $lx    = XML::LibXML->new;
    $lx->validation(0);
    $lx->recover_silently(1);
    return bless { lx => $lx }, $class;
}

# html2dom( $html, $encoding ) - parse an HTML string into a DOM.
#
#   $html     - HTML source; it is run through Encode::decode, so it is
#               expected to be octets in $encoding
#   $encoding - encoding name handed to decode(); 'iso-8859-1' when false
#
# Returns the result of the parser's parse_html_string() on the decoded text.
sub html2dom {
    my ( $self, $html, $encoding ) = @_;
    $encoding ||= 'iso-8859-1';
    $html =~ s/\r\n?/\n/msg;               # normalize CRLF (and bare CR) to LF
    $html = decode( $encoding, $html );    # leave the utf8 flag
    return $self->{lx}->parse_html_string($html);
}

# dom2xml( $dom, $level ) - serialize a parsed HTML DOM as XHTML text.
#
#   $dom   - DOM as returned by html2dom()
#   $level - optional; _tidy_dom() is applied only when this is greater
#            than 0
#
# Side effects on $dom: the root <html> element gets the XHTML xmlns
# attribute, and every <meta> carrying a non-empty http-equiv attribute has
# its content attribute set to 'text/html; charset=utf-8'.
#
# Returns the serialized root element followed by a newline.
sub dom2xml {
    my ( $self, $dom, $level ) = @_;
    my $root = $dom->findnodes('/html')->shift;
    $root->setAttribute( xmlns => 'http://www.w3.org/1999/xhtml' );

    # NOTE(review): this XPath matches every http-equiv <meta>, not only
    # Content-Type ones -- confirm that overwriting all of them is intended.
    for my $meta ( $dom->findnodes('//meta[@http-equiv!=""]') ) {
        $meta->setAttribute( content => 'text/html; charset=utf-8' );
    }

    # html2xml() forwards $level even when its caller omitted it, so guard
    # against undef to avoid an "uninitialized value" warning.
    _tidy_dom($dom) if defined $level && $level > 0;

    my $xhtml = $root->toString( 0, 'utf-8' );    # utf8 flag off

    # NOTE(review): the here-doc reached this review as "return < $xhtml EOT",
    # i.e. with its operator mangled.  Any literal text that originally stood
    # between "<<EOT;" and $xhtml (presumably a document prolog -- TODO
    # confirm against the upstream distribution) is not recoverable from
    # here, so only the surviving content is emitted.
    return <<EOT;
$xhtml
EOT
}

# html2xml( $html, $encoding, $level ) - convenience wrapper.
#
# Feeds $html and $encoding to html2dom() and the resulting DOM, together
# with $level, to dom2xml(); returns dom2xml()'s string.
sub html2xml {
    my ( $self, $html, $encoding, $level ) = @_;
    my $dom = $self->html2dom( $html, $encoding );
    return $self->dom2xml( $dom, $level );
}

# _tidy_dom( $dom ) - private helper; cleans up $dom in place.
#
# Called as a plain function (not a method) from dom2xml().
sub _tidy_dom {
    my $dom = shift;

    # Remove empty attributes.  The XPath selects every element that has at
    # least one attribute equal to the empty string; each attribute of such
    # an element is then checked individually.
    for my $node ( $dom->findnodes('//*[attribute::*=""]') ) {
        for my $attr ( $node->attributes ) {
            my $value = $attr->getValue;

            # Test the length, not the truth, of the value: a plain boolean
            # test would also discard attributes whose value is "0".
            next if defined $value && length $value;
            $node->removeAttribute( $attr->getName );
        }
    }

    # NOTE(review): the code that followed here did not survive extraction.
    # The residue is kept verbatim below.  The construct that declared
    # $script is missing and the text stops after a second "# handle", so
    # the remaining steps must be restored from the upstream distribution
    # rather than from this text.
    #
    #   # handle $script->appendChild( $dom->createTextNode("") ); } } # handle

    return;
}