The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use strict;
use warnings;

use Getopt::Long;
use HTML::HTML5::Parser;

# Output format selector. Precedence: --output/-o on the command line, then
# the HTML_OUTPUT environment variable, then the 'debug:json' default.
# "||" is deliberate: an empty HTML_OUTPUT also falls back to the default.
my $output = $ENV{HTML_OUTPUT} || 'debug:json';
my $help;

# GetOptions warns on STDERR and returns false when the command line is
# malformed (unknown option, missing value for --output). Stop in that case
# rather than carrying on with a half-understood command line.
GetOptions(
	'output|o=s'   => \$output,
	'help|usage|h' => \$help,
	)
	or die "Try '$0 --help' for usage.\n";

if ($help)
{
	# Print and exit are separate statements so that the script always
	# exits after a help request, even if writing to STDOUT fails (the
	# old "print ... and exit" form fell through to reading STDIN then).
	# The exit status of 1 is kept as it was.
	print <<HELP;
Usage: cat document.html | $0 --output=FORMAT

  Reads HTML from STDIN and prints it out in a format useful for debugging.

Formats:

  clarkml        - ClarkML
  debug          - Debugging tree
  debug:json     - Debugging tree as JSON (default)
  errors         - List of errors
  html           - HTML
  parser         - Parser internals
  parser:json    - Parser internals as JSON
  xml            - XHTML
  
HELP
	exit(1);
}

# load($pkg)
#
# Loads the optional module $pkg at run time and calls its import method,
# i.e. the run-time equivalent of "use $pkg;". Anything the module exports
# by default ends up in the package this sub is compiled in.
#
#   $pkg - package name such as 'Data::Dumper'.
#
# Returns nothing. Dies with "This output format requires $pkg.\n" when the
# name is not a plain package name or the module cannot be loaded.
sub load
{
	my ($pkg) = @_;

	# No string eval: the name is checked to be a bare package name and
	# then turned into a path for require, so text smuggled in after the
	# module name can never be executed as code.
	my $loaded = defined $pkg
		&& $pkg =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/
		&& eval {
			(my $file = "$pkg.pm") =~ s{::}{/}g;
			require $file;
			$pkg->import;
			1;
		};

	die "This output format requires $pkg.\n" unless $loaded;
	return;
}

# Main program: slurp the HTML input, parse it, and emit the representation
# selected by $output. Format names are matched case-insensitively as
# substrings, in the order of the branches below; anything unrecognised
# falls through to the plain toString serialisation.
my $p = HTML::HTML5::Parser->new;

# Read all input (files named in @ARGV, else STDIN). With $/ undefined each
# input arrives as one chunk instead of line by line; the joined string is
# the same either way.
my $h = do { local $/; join '', <> };

# Set only by the formats that produce a data structure (debug, parser);
# it is dumped as JSON or via Data::Dumper at the end.
my $hash;

if ($output =~ /debug/i)
{
	load('XML::LibXML::Debugging');
	$hash = $p->parse_string($h)->toDebuggingHash;
}
elsif ($output =~ /clark/i)
{
	load('XML::LibXML::Debugging');
	print $p->parse_string($h)->toClarkML;
}
elsif ($output =~ /html/i)
{
	load('HTML::HTML5::Writer');
	print HTML::HTML5::Writer->new->document($p->parse_string($h));
}
elsif ($output =~ /parser/i)
{
	# The document itself is discarded; the parser object is what gets
	# dumped.
	# NOTE(review): $hash is a blessed object here, and JSON's to_json
	# normally refuses blessed references unless allow_blessed or
	# convert_blessed is enabled -- confirm that "parser:json" works.
	$p->parse_string($h);
	$hash = $p;
}
elsif ($output =~ /err/i)
{
	$p->parse_string($h);
	print "$_\n" foreach $p->errors;
}
else
{
	print $p->parse_string($h)->toString;
}

# The ":json" suffix is matched case-insensitively, like every other format
# test above, so that e.g. "DEBUG:JSON" does not silently fall back to the
# Data::Dumper output.
if (defined $hash and $output =~ /json/i)
{
	load('JSON');
	print to_json($hash, {pretty=>1,canonical=>1});
}
elsif (defined $hash)
{
	load('Data::Dumper');
	print Dumper($hash);
}