package HTML::Strip::Whitespace;
package HTML::Strip::Whitespace::State;
use strict;
use warnings;
sub new
{
my $class = shift;
my $self = {};
bless $self, $class;
$self->initialize(@_);
return $self;
}
sub to_array
{
my $v = shift;
return (ref($v) eq "ARRAY" ? (@$v) : $v);
}
sub initialize
{
my $self = shift;
my %args = (@_);
$self->{'prev'} = undef;
$self->{'next'} = undef;
$self->{'this'} = undef;
$self->{'parser'} =
HTML::TokeParser::Simple->new(
to_array($args{'parser_args'})
);
$self->{'strip_newlines'} = $args{'strip_newlines'} || 0;
$self->{'out_fh'} = $args{'out_fh'};
# Get the first element to initialize the parser
# Otherwise the first call to next_state would return undef;
$self->next_state();
return 0;
}
sub next_state
{
my $self = shift;
($self->{'prev'}, $self->{'this'}, $self->{'next'}) =
($self->{'this'}, $self->{'next'}, $self->{'parser'}->get_token());
if (!defined($self->{'this'}))
{
return undef;
}
return 1;
}
sub prev
{
my $self = shift;
return $self->{'prev'};
}
sub next
{
my $self = shift;
return $self->{'next'};
}
sub this
{
my $self = shift;
return $self->{'this'};
}
sub text_strip
{
my $self = shift;
# my $p = $self->prev();
# my $n = $self->next();
my $text = $self->this()->as_is();
$text =~ s{([\s\n]+)}{($1 =~ /\n/) ? "\n" : " "}eg;
return $text;
}
my %preserving_start_tags =
(
'pre' => 1,
);
sub is_preserving_start_tag
{
my $self = shift;
my $t = $self->this();
if ($t->is_start_tag() &&
exists($preserving_start_tags{$t->get_tag()})
)
{
return $t->get_tag();
}
return undef;
}
sub handle_text
{
my $state = shift;
if ($state->this->is_text())
{
$state->out($state->text_strip());
return 0;
}
else
{
return 1;
}
}
sub out
{
my $self = shift;
my $what = shift;
my $out_fh = $self->{'out_fh'};
if (ref($out_fh) eq "CODE")
{
&{$out_fh}($what);
}
elsif (ref($out_fh) eq "SCALAR")
{
$$out_fh .= $what;
}
elsif (ref($out_fh) eq "GLOB")
{
print {*{$out_fh}} $what;
}
return 0;
}
sub out_this
{
my $state = shift;
$state->out($state->this()->as_is());
}
sub process
{
my $state = shift;
my $tag_type;
while ($state->next_state())
{
if (! $state->handle_text())
{
# Text was handled
}
# If it's a preserving start tag, preserve all the text inside it.
# This is for example, a
tag in which the spaces matter.
elsif ($tag_type = $state->is_preserving_start_tag())
{
my $do_once = 1;
while ($do_once || $state->next_state())
{
$do_once = 0;
$state->out_this();
last if ($state->this()->is_end_tag($tag_type))
}
}
else
{
$state->out_this();
}
}
# Return 0 on success.
return 0;
}
package HTML::Strip::Whitespace;
use 5.004;
use strict;
use warnings;
use HTML::TokeParser::Simple;
require Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
@ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use HTML::Strip::Whitespace ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
%EXPORT_TAGS = ( 'all' => [ qw(
html_strip_whitespace
) ] );
@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} });
@EXPORT = qw(
);
$VERSION = '0.1.6';
# Preloaded methods go here.
sub html_strip_whitespace
{
my %args = (@_);
my $source = $args{'source'} or
die "source argument not specified.";
my $strip_newlines = $args{'strip_newlines'} || 0;
my $out_fh = $args{'out'} or
die "out argument not specified.";
my $state =
HTML::Strip::Whitespace::State->new(
'parser_args' => $source,
'strip_newlines' => $strip_newlines,
'out_fh' => $out_fh,
);
return $state->process();
}
# Autoload methods go after =cut, and are processed by the autosplit program.
1;
__END__
=head1 NAME
HTML::Strip::Whitespace - Perl extension for stripping whitespace out of
HTML.
=head1 SYNOPSIS
use HTML::Strip::Whitespace qw(html_strip_whitespace);
my $html = <<"EOF";
Hello there!
EOF
my $buffer = "";
html_strip_whitespace(
'source' => \$html,
'out' => \$buffer
);
=head1 DESCRIPTION
This module tries to strip as much whitespace from an HTML as it can
without eliminating valid whitespace (like the one inside
).
To use it call the function C,
with named parameters. C is the HTML::TokeParser source for the
HTML. C can be a reference to a buffer which will be filled with the
stripped HTML, or alternatively a reference to a sub-routine or a file handle
that will output it.
=head1 FUNCTIONS
=head2 html_strip_whitespace(source => $src, out => $out, strip_newlinews => $strip)
C is the HTML::TokeParser source for the
HTML. C can be a reference to a buffer which will be filled with the
stripped HTML, or alternatively a reference to a sub-routine or a file handle
that will output it.
=head1 SEE ALSO
HTML Tidy with its Perl binding, which probably does a better and faster job
of rendering this page.
=head1 AUTHOR
Shlomi Fish, Eshlomif@iglu.org.ilE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2004 by Shlomi Fish
This library is free software; you can redistribute it and/or modify it
under the terms of the MIT X11 license.
=cut