package Text::TOC::InputHandler::HTML;
{
$Text::TOC::InputHandler::HTML::VERSION = '0.09';
}
use strict;
use warnings;
use namespace::autoclean;
use HTML::DOM;
use HTML::Entities qw( encode_entities );
use Text::TOC::Node::HTML;
use Text::TOC::Types qw( Int );
use Moose;
use MooseX::StrictConstructor;
with 'Text::TOC::Role::InputHandler';
has _counter => (
traits => ['Counter'],
is => 'ro',
isa => Int,
default => 0,
handles => { _inc_counter => 'inc' },
);
__PACKAGE__->meta()->make_immutable();
sub _process_file {
my $self = shift;
my $file = shift;
my $content = shift;
my $dom = HTML::DOM->new();
$dom->write($content);
$self->_walk_nodes( $dom->body() || $dom, $file );
return $dom;
}
sub _walk_nodes {
my $self = shift;
my $parent = shift;
my $file = shift;
for my $node ( grep { $_->isa('HTML::DOM::Element') }
$parent->childNodes() ) {
if ( $self->_filter()->node_is_interesting($node) ) {
$self->_save_node( $node, $file );
}
else {
$self->_walk_nodes( $node, $file );
}
}
return;
}
sub _save_node {
my $self = shift;
my $node = shift;
my $file = shift;
my $wrapped = Text::TOC::Node::HTML->new(
type => lc $node->tagName(),
contents => $node,
anchor_name => $self->_anchor_name($node),
source_file => $file,
);
$self->_add_node($wrapped);
return;
}
sub _anchor_name {
my $self = shift;
my $domlet = shift;
my $text_contents = $domlet->as_text();
$text_contents =~ s/\s+/_/g;
# These are the only characters allowed in a name according to the HTML
# spec.
$text_contents =~ s/[^A-Za-z0-9-_:.]//g;
my $name = encode_entities($text_contents) . q{-} . $self->_counter();
# Anchors must begin with a letter.
$name = 'A-' . $name unless $name =~ /^[A-Za-z]/;
$self->_inc_counter();
return $name;
}
1;
# ABSTRACT: Implements an input handler for HTML documents
__END__
=pod
=head1 NAME
Text::TOC::InputHandler::HTML - Implements an input handler for HTML documents
=head1 VERSION
version 0.09
=head1 DESCRIPTION
This class processes an HTML document and finds nodes which should be included
in the table of contents.
It has no end-user facing parts at the moment.
=head1 ROLES
This class does the L role.
=head1 AUTHOR
Dave Rolsky
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2012 by Dave Rolsky.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut