package Apache::Clean; use 5.008; use Apache2::Filter (); # $f use Apache2::RequestRec (); # $r use Apache2::RequestUtil (); # $r->dir_config() use Apache2::Log (); # $log->info() use APR::Table (); # dir_config->get() and headers_out->get() use Apache2::Const -compile => qw(OK DECLINED); use HTML::Clean (); use strict; our $VERSION = '2.00_7'; sub handler { my $f = shift; my $r = $f->r; my $log = $r->server->log; # we only process HTML documents unless ($r->content_type =~ m!text/html!i) { $log->info('skipping request to ', $r->uri, ' (not an HTML document)'); return Apache2::Const::DECLINED; } my $context; unless ($f->ctx) { # these are things we only want to do once no matter how # many times our filter is invoked per request # parse the configuration options my $level = $r->dir_config->get('CleanLevel') || 1; my %options = map { $_ => 1 } $r->dir_config->get('CleanOption'); # store the configuration $context = { level => $level, options => \%options, extra => undef }; # output filters that alter content are responsible for removing # the Content-Length header, but we only need to do this once. $r->headers_out->unset('Content-Length'); } # retrieve the filter context, which was set up on the first invocation $context ||= $f->ctx; # now, filter the content while ($f->read(my $buffer, 1024)) { # prepend any tags leftover from the last buffer or invocation $buffer = $context->{extra} . $buffer if $context->{extra}; # if our buffer ends in a split tag ('{extra}) = $buffer =~ m/(<[^>]*)$/) { $buffer = substr($buffer, 0, - length($context->{extra})); } my $h = HTML::Clean->new(\$buffer); $h->level($context->{level}); $h->strip($context->{options}); $f->print(${$h->data}); } if ($f->seen_eos) { # we've seen the end of the data stream # print any leftover data $f->print($context->{extra}) if $context->{extra}; } else { # there's more data to come # store the filter context, including any leftover data # in the 'extra' key $f->ctx($context); } return Apache2::Const::OK; } 1; __END__ =head1 NAME Apache::Clean - interface into HTML::Clean for mod_perl 2.0 =head1 SYNOPSIS httpd.conf: PerlModule Apache::Clean Alias /clean /usr/local/apache2/htdocs PerlOutputFilterHandler Apache::Clean PerlSetVar CleanOption shortertags PerlAddVar CleanOption whitespace =head1 DESCRIPTION Apache::Clean uses HTML::Clean to tidy up large, messy HTML, saving bandwidth. Only documents with a content type of "text/html" are affected - all others are passed through unaltered. For more information, see http://www.perl.com/pub/a/2003/04/17/filters.html =head1 OPTIONS Apache::Clean supports few options - all of which are based on options from HTML::Clean. Apache::Clean will only tidy up whitespace (via $h->strip) and will not perform other options of HTML::Clean (such as browser compatibility). See the HTML::Clean manpage for details. =over 4 =item CleanLevel sets the clean level, which is passed to the level() method in HTML::Clean. PerlSetVar CleanLevel 9 CleanLevel defaults to 1. =item CleanOption specifies the set of options which are passed to the options() method in HTML::Clean - see the HTML::Clean manpage for a complete list of options. PerlSetVar CleanOption shortertags PerlAddVar CleanOption whitespace CleanOption has no default. =back =head1 NOTES This is alpha software, and as such has not been tested on multiple platforms or environments. =head1 FEATURES/BUGS probably lots - this is the preliminary port to mod_perl 2.0 =head1 SEE ALSO perl(1), mod_perl(3), Apache(3), HTML::Clean(3) =head1 AUTHOR Geoffrey Young =head1 COPYRIGHT Copyright (c) 2005, Geoffrey Young All rights reserved. This module is free software. It may be used, redistributed and/or modified under the same terms as Perl itself. =head1 HISTORY This code is derived from the Cookbook::Clean and Cookbook::TestMe modules available as part of "The mod_perl Developer's Cookbook". For more information, visit http://www.modperlcookbook.org/ =cut