package Apache::Clean; #--------------------------------------------------------------------- # usage: PerlHandler Apache::Clean # # see the Apache::Clean manpage or POD at the end of this file # # $Id: Clean.pm,v 1.12 2002/07/02 18:51:52 geoff Exp $ #--------------------------------------------------------------------- use 5.004; use mod_perl 1.21; use Apache::Constants qw( OK DECLINED ); use Apache; use Apache::File; use Apache::Log; use HTML::Clean; use strict; $Apache::Clean::VERSION = '0.05'; # set debug level # 0 - messages at info or debug log levels # 1 - verbose output at info or debug log levels $Apache::Clean::DEBUG = 0; # Get the package modification time... (my $package = __PACKAGE__) =~ s!::!/!g; my $package_mtime = (stat $INC{"$package.pm"})[9]; # ...and when httpd.conf was last modified my $conf_mtime = (stat Apache->server_root_relative('conf/httpd.conf'))[9]; # When the server is restarted we need to # make sure we recognize config file changes and propigate # them to the client to clear the client cache if necessary. Apache->server->register_cleanup(sub { $conf_mtime = (stat Apache->server_root_relative('conf/httpd.conf'))[9]; }); sub handler { #--------------------------------------------------------------------- # initialize request object and variables #--------------------------------------------------------------------- my $r = shift; my $filter = lc $r->dir_config('Filter') eq 'on'; # Register ourselves with Apache::Filter so # later filters can see our output. $r = $r->filter_register if $filter; my $log = $r->server->log; my ($fh, $cache) = (); #--------------------------------------------------------------------- # do some preliminary stuff... #--------------------------------------------------------------------- $log->info("Using Apache::Clean"); # we need separate content-type checks for filtered # unfiltered cases. in the unfiltered case we can decline sooner... unless ($r->content_type =~ m!text/html!i || $filter) { $log->info("\trequest is not for an html document ", "(unfiltered request) - skipping...") if $Apache::Clean::DEBUG; $log->info("Exiting Apache::Clean"); return DECLINED; } #--------------------------------------------------------------------- # get the filehandle #--------------------------------------------------------------------- if ($filter) { $log->info("\tgetting request input from Apache::Filter") if $Apache::Clean::DEBUG; # Get any output from previous filters in the chain. ($fh, my $status) = $r->filter_input; unless ($status == OK) { $log->warn("\tApache::Filter returned $status"); $log->info("Exiting Apache::Clean"); return $status; } } else { $log->info("\tgetting request input from Apache::File") if $Apache::Clean::DEBUG; # We are not part of a filter chain, so just process as normal. $fh = Apache::File->new($r->filename); unless ($fh) { $log->warn("\tcannot open request! $!"); $log->info("Exiting Apache::Clean"); return DECLINED; } # since we're essentially sending a static file # we can set cache headers properly based on the # file itself - although we're modifying the # content the meaning of the content doesn't # change unless it: # changes on disk # this package is modified # our httpd.conf options have changed # however, in the interests of back compatibility, make # proper cache behavior an option $cache = lc $r->dir_config('CleanCache') || 'on'; if ($cache eq 'on') { # set what we can from here, more later... $r->update_mtime($package_mtime); $r->update_mtime((stat $r->finfo)[9]); $r->update_mtime($conf_mtime); $r->set_last_modified; $r->set_etag; } } # special decline case for Apache::Filter # here, we need to send the content onward even though # we don't process it. this is to make sure that the # next filter (either a PerlHandler or the browser) gets # the content if ($r->content_type !~ m!text/html!i && $filter) { $log->info("\trequest is not for an html document ", "(Apache::Filter) - skipping...") if $Apache::Clean::DEBUG; $r->send_http_header($r->content_type); print while <$fh>; $log->info("Exiting Apache::Clean"); # we can't ever return DECLINED when using Apache::Filter return OK; } #--------------------------------------------------------------------- # clean up the html #--------------------------------------------------------------------- # Slurp the file. my $dirty = do { local $/; <$fh> }; # Create the new HTML::Clean object. my $h = HTML::Clean->new(\$dirty); # Set the level of suds. $h->level($r->dir_config('CleanLevel') || 1); my %options = map { $_ => 1 } $r->dir_config->get('CleanOption'); # clean the HTML $h->strip(\%options); #--------------------------------------------------------------------- # print the clean results #--------------------------------------------------------------------- if ($cache eq 'on') { # we needed to clean the data first before we # could find the length $r->set_content_length(length ${$h->data}); # only send the file if it meets cache criteria if ((my $status = $r->meets_conditions) == OK) { $r->send_http_header($r->content_type); } else { return $status; } } else { # else we just send a header $r->send_http_header($r->content_type); } print ${$h->data}; #--------------------------------------------------------------------- # wrap up... #--------------------------------------------------------------------- $log->info("Exiting Apache::Clean"); return OK; } 1; __END__ =head1 NAME Apache::Clean - mod_perl interface into HTML::Clean =head1 SYNOPSIS httpd.conf: PerlModule Apache::Clean SetHandler perl-script PerlHandler Apache::Clean PerlSetVar CleanLevel 3 PerlSetVar CleanOption shortertags PerlAddVar CleanOption whitespace PerlSetVar CleanCache On Apache::Clean is Filter aware, meaning that it can be used within Apache::Filter framework without modification. Just include the directives PerlModule Apache::Filter PerlSetVar Filter On and modify the PerlHandler directive accordingly... =head1 DESCRIPTION Apache::Clean uses HTML::Clean to tidy up large, messy HTML, saving bandwidth. It is particularly useful with Apache::Compress for ultimate savings. Only documents with a content type of "text/html" are affected - all others are passed through unaltered. =head1 OPTIONS Apache::Clean supports few options, most of which are based on options from HTML::Clean. Apache::Clean will only tidy up whitespace (via $h->strip) and will not perform other options of HTML::Clean (such as browser compatibility). See the HTML::Clean manpage for details. =over 4 =item CleanLevel sets the clean level, which is passed to the level() method in HTML::Clean. PerlSetVar CleanLevel 9 CleanLevel defaults to 3. =item CleanOption specifies the set of options which are passed to the options() method in HTML::Clean. PerlAddVar CleanOption shortertags PerlSetVar CleanOption whitespace CleanOption has do default. =item CleanCache sets the behavior of Apache::Clean in regards to proper cache header behavior. this option is only meaningful when Apache::Clean is _not_ part of an Apache::Filter chain. mainly, CleanCache On enables Apache::Clean to set the Last-Modified, Content-Length, and Etag headers, as well as allowing it do decide whether a 304 response is allowed. See recipe 6.6 in the mod_perl Developer's Cookbook for a more detailed discussion on handling conditional and cache-based headers - the code is practically identical to what you will find there. The basic idea here is that although Apache::Clean is dynamically manipulating the content of the requested resource, the meaning of the document has not changed just because was changed to . If you disagree with this assessment you can set CleanCache to Off. CleanCache defaults to On. =back =head1 NOTES Verbose debugging is enabled by setting $Apache::Clean::DEBUG=1 or greater. To turn off all debug information, set your apache LogLevel directive above info level. This is alpha software, and as such has not been tested on multiple platforms or environments. It requires PERL_LOG_API=1, PERL_FILE_API=1, and maybe other hooks to function properly. =head1 FEATURES/BUGS No known bugs or features at this time... =head1 SEE ALSO perl(1), mod_perl(3), Apache(3), HTML::Clean(3), Apache::Compress(3), Apache::Filter(3) =head1 AUTHORS Geoffrey Young Paul Lindner Randy Kobes =head1 COPYRIGHT Copyright (c) 2002, Geoffrey Young, Paul Lindner, Randy Kobes. All rights reserved. This module is free software. It may be used, redistributed and/or modified under the same terms as Perl itself. =head1 HISTORY This code is derived from the Cookbook::Clean and Cookbook::TestMe modules available as part of "The mod_perl Developer's Cookbook". For more information, visit http://www.modperlcookbook.org/ =cut