package WWW::Webrobot::Html2XHtml;
use strict;
use warnings;
# Author: Stefan Trcek
# Copyright(c) 2004 ABAS Software AG
=head1 NAME
WWW::Webrobot::Html2XHtml - convert HTML to XML
=head1 SYNOPSIS
use WWW::Webrobot::Html2XHtml;
my $converter = WWW::Webrobot::Html2XHtml -> new();
$converter->to_xhtml($dirty_html, $encoding);
=head1 DESCRIPTION
=head1 METHODS
=over
=cut
use HTML::TreeBuilder;
use HTML::Entities;
use WWW::Webrobot::MyEncode qw/has_Encode octet_to_internal_utf8/;
my $XML_HEADER = qq(\n);
my %e2c =
map {$_ => pack("U", ord $HTML::Entities::entity2char{$_})}
grep {my $value = ord($HTML::Entities::entity2char{$_}); 128 <= $value && $value < 256}
keys %HTML::Entities::entity2char;
=item new
Constructor
=cut
sub new {
my $class = shift;
my $self = bless({}, ref($class) || $class);
return $self;
}
sub html_decode_entities_utf8 {
my ($value) = @_;
foreach ($value) {
s/(&\#(\d+);?)/ 128<=$2 && $2<256 ? pack("U", $2) : $1 /eg;
s/(&\#[xX]([0-9a-fA-F]+);?)/ my $c = hex($2); 128<=$c && $c<256 ? pack("U", $c) : $1 /eg;
s/(&(\w+);?)/ $e2c{$2} || $1 /eg;
}
return $value;
}
=item to_xhtml($dirty_html, $encoding)
Convert C<$dirty_html> to XML.
C<$dirty_html> is a sequence of octets and is assumend to be
coded in C<$encoding>.
=cut
sub to_xhtml {
my ($self, $dirty_html, $encoding) = @_;
#return "NO VALID ENCODING='$encoding'\n" if ! $encoding;
my $parser = new HTML::TreeBuilder();
$parser->no_space_compacting(1);
$parser->ignore_ignorable_whitespace(0);
# Encode $dirty_html to Perls internal encoding UTF-8.
$dirty_html = octet_to_internal_utf8($encoding, $dirty_html);
# Decode HTML entities, because HTML::TreeBuilder doesn't handle it right.
# Can't use HTML::Entities::decode_entities because it uses 'chr($x)'
# instead of 'pack("U",$x)'
$dirty_html = html_decode_entities_utf8($dirty_html);
# Parse $dirty_html and encode all remaining bytes as html entities.
# That works because all non-ASCII UTF-8 character bytes are 1xxxxxxx
my $tree = $parser->parse($dirty_html);
my $xml = $XML_HEADER . $tree->as_XML();
# $xml has all byte encoded as x;
$tree = $tree -> delete;
if (! has_Encode()) {
# Decode UTF-8 characters and control characters, $xml is ASCII
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;
}
elsif (Encode::is_utf8($xml)) { # SunOS 5.7 / perl 5.8.5
# Decode UTF-8 characters and control characters, $xml is UTF-8
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("U", $2) /eg;
}
else { # Linux perl 5.8.0/5.8.5, Win32 perl 5.8.0
# Decode UTF-8 characters and control characters, $xml is ASCII
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;
# Now we have an UTF-8 string and must Perl believe so too.
Encode::_utf8_on($xml);
}
return $xml;
}
=back
=cut
1;