package HTTP::Proxy::BodyFilter::save; use strict; use HTTP::Proxy; use HTTP::Proxy::BodyFilter; use vars qw( @ISA ); @ISA = qw( HTTP::Proxy::BodyFilter ); use Fcntl; use File::Spec; use File::Path; use Carp; sub init { my $self = shift; # options my %args = ( template => File::Spec->catfile( '%h', '%P' ), no_host => 0, no_dirs => 0, cut_dirs => 0, prefix => '', filename => undef, multiple => 1, keep_old => 0, # no_clobber in wget parlance timestamp => 0, status => [ 200 ], @_ ); # keep_old and timestamp can't be selected together croak "Can't timestamp and keep older files at the same time" if $args{keep_old} && $args{timestamp}; croak "status must be an array reference" unless ref($args{status}) eq 'ARRAY'; croak "status must contain only HTTP codes" if grep { !/^[12345]\d\d$/ } @{ $args{status} }; croak "filename must be a code reference" if defined $args{filename} && !UNIVERSAL::isa( $args{filename}, 'CODE' ); $self->{"_hpbf_save_filename_code"} = $args{filename}; $self->{"_hpbf_save_$_"} = $args{$_} for qw( template no_host no_dirs cut_dirs prefix multiple keep_old timestamp status ); } sub begin { my ( $self, $message ) = @_; # internal data initialisation delete @{$self}{qw( _hpbf_save_filename _hpbf_save_fh )}; my $uri = $message->isa( 'HTTP::Request' ) ? $message->uri : $message->request->uri; # save only the accepted status codes if( $message->isa( 'HTTP::Response' ) ) { my $code = $message->code; return unless grep { $code eq $_ } @{ $self->{_hpbf_save_status} }; } my $file = ''; if( defined $self->{_hpbf_save_filename_code} ) { # use the user-provided callback $file = $self->{_hpbf_save_filename_code}->($message); unless ( defined $file and $file ne '' ) { $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", "Filter will not save $uri" ); return; } } else { # set the template variables from the URI my @segs = $uri->path_segments; # starts with an empty string shift @segs; splice(@segs, 0, $self->{_hpbf_save_cut_dirs} >= @segs ? @segs - 1 : $self->{_hpbf_save_cut_dirs} ); my %vars = ( '%' => '%', h => $self->{_hpbf_save_no_host} ? '' : $uri->host, f => $segs[-1] || 'index.html', # same default as wget p => $self->{_hpbf_save_no_dirs} ? $segs[-1] || 'index.html' : File::Spec->catfile(@segs), q => $uri->query, ); pop @segs; $vars{d} = $self->{_hpbf_save_no_dirs} ? '' : @segs ? File::Spec->catfile(@segs) : ''; $vars{P} = $vars{p} . ( $vars{q} ? "?$vars{q}" : '' ); # create the filename $file = File::Spec->catfile( $self->{_hpbf_save_prefix} || (), $self->{_hpbf_save_template} ); $file =~ s/%(.)/$vars{$1}/g; } $file = File::Spec->rel2abs( $file ); # create the directory my $dir = File::Spec->catpath( (File::Spec->splitpath($file))[ 0, 1 ], '' ); if( ! -e $dir ) { eval { mkpath( $dir ) }; if ($@) { $self->proxy->log( HTTP::Proxy::ERROR, "HTBF::save", "Unable to create directory $dir" ); return; } $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", "Created directory $dir" ); } # keep old file? if ( -e $file ) { if ( $self->{_hpbf_save_timestamp} ) { # FIXME timestamp } elsif ( $self->{_hpbf_save_keep_old} ) { $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", "Skip saving $uri" ); delete $self->{_hpbf_save_fh}; # it's a closed filehandle return; } } # open and lock the file my ( $ext, $n, $i ) = ( "", 0 ); my $flags = O_WRONLY | O_EXCL | O_CREAT; while( ! sysopen( $self->{_hpbf_save_fh}, "$file$ext", $flags ) ) { $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "Too many errors opening $file$ext" ), return if $i++ - $n == 10; # should be ok now if( $self->{_hpbf_save_multiple} ) { $ext = "." . ++$n while -e $file.$ext; next; } else { $flags = O_WRONLY | O_CREAT; } } # we have an open filehandle $self->{_hpbf_save_filename} = $file.$ext; binmode( $self->{_hpbf_save_fh} ); # for Win32 and friends $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", "Saving $uri to $file$ext" ); } sub filter { my ( $self, $dataref ) = @_; return unless exists $self->{_hpbf_save_fh}; # save the data to the file my $res = $self->{_hpbf_save_fh}->syswrite( $$dataref ); $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "syswrite() error: $!") if ! defined $res; # FIXME error handling } sub end { my ($self) = @_; # close file if( $self->{_hpbf_save_fh} ) { $self->{_hpbf_save_fh}->close; # FIXME error handling delete $self->{_hpbf_save_fh}; } } sub will_modify { 0 } 1; __END__ =head1 NAME HTTP::Proxy::BodyFilter::save - A filter that saves transfered data to a file =head1 SYNOPSIS use HTTP::Proxy; use HTTP::Proxy::BodyFilter::save; my $proxy = HTTP::Proxy->new; # save RFC files as we browse them $proxy->push_filter( path => qr!/rfc\d+.txt!, mime => 'text/plain', response => HTTP::Proxy::BodyFilter::save->new( template => '%f', prefix => 'rfc', keep_old => 1, ) ); $proxy->start; =head1 DESCRIPTION The HTTP::Proxy::BodyFilter::save filter can save HTTP messages (responses or request) bodies to files. The name of the file is determined by a template and the URI of the request. Simply insert this filter in a filter stack, and it will save the data as it flows through the proxy. Depending on where the filter is located in the stack, the saved data can be more or less modified. This filter I create directories if it needs to! I Remember that the default C parameter for C is C and that you may need to change it for other MIME types. =head2 Constructor The constructor accepts quite a few options. Most of them control the construction of the filename that will be used to save the response body. There are two options to compute this filename: =over 4 =item * use a template =item * use your own filename creation routine =back The template option uses the following options: =over 4 =item B