package WWW::Proxy4FreeCom; use warnings; use strict; our $VERSION = '0.001'; use Carp; use URI; use LWP::UserAgent; use HTML::TokeParser::Simple; use base 'Class::Data::Accessor'; __PACKAGE__->mk_classaccessors qw( list filtered_list error ua debug ); sub new { my $self = bless {}, shift; croak "Must have even number of arguments to new()" if @_ & 1; my %args = @_; $args{ +lc } = delete $args{ $_ } for keys %args; $args{timeout} ||= 30; $args{ua} ||= LWP::UserAgent->new( timeout => $args{timeout}, agent => 'Mozilla/5.0 (X11; U; Linux x86_64; en-US;' . ' rv:1.8.1.12) Gecko/20080207 Ubuntu/7.10 (gutsy)' . ' Firefox/2.0.0.12', ); $self->ua( $args{ua} ); $self->debug( $args{debug} ); return $self; } sub get_list { my $self = shift; my $custom_pages = shift; $self->$_(undef) for qw(list error); my @pages_list = defined $custom_pages ? ( ref $custom_pages ? @$custom_pages : $custom_pages ) : ( 1 ); return $self->_set_error('Page number can only be 1..5') if grep { $_ < 1 or $_ > 5 } @pages_list; my $ua = $self->ua; my @proxies; for ( @pages_list ) { my $response = $ua->get('http://proxy4free.com/page' . $_ . '.html'); if ( $response->is_success ) { push @proxies, $self->_parse_proxy_list( $response->content ); } else { $self->debug and carp "Page $_: " . $response->status_line; } } return $self->list( \@proxies ); } sub filter { my $self = shift; $self->$_(undef) for qw(error filtered_list); croak "Must have even number of arguments to filter()" if @_ & 1; my %args = @_; $args{ +lc } = delete $args{ $_ } for keys %args; my %valid_filters; @valid_filters{ qw(ip port type country last_test) } = (1) x 5; grep { not exists $valid_filters{$_} } keys %args and return $self->_set_error( 'Invalid filter specified, valid ones are: '. join q|, |, keys %valid_filters ); my $list_ref = $self->list or return $self->_set_error( 'Proxy list seems to be undefined, did you call get_list() first?' ); my @filtered; foreach my $proxy_ref ( @$list_ref ) { my $is_good = 0; for ( keys %args ) { $proxy_ref->{$_} eq $args{$_} and $is_good++; } $is_good == keys %args and push @filtered, { %$proxy_ref }; } return $self->filtered_list( \@filtered ); } sub _parse_proxy_list { my ( $self, $content ) = @_; my $parser = HTML::TokeParser::Simple->new( \$content ); my %data_names; @data_names{ 1..5 } = qw(ip port type country last_test); my %nav; @nav{ qw(get_info get_data data_level) } = (0) x 3; my @data; my %proxy; while ( my $t = $parser->get_token ) { if ( $t->is_start_tag('tr') and defined $t->get_attr('class') and defined $t->get_attr('height') and $t->get_attr('class') eq 'text' and $t->get_attr('height') eq '10' ) { @nav{ qw(get_info level) } = (1, 1); } elsif ( $nav{get_info} == 1 and $t->is_start_tag('td') ) { @nav{ qw(get_data level) } = (1, 2); $nav{data_level}++; } elsif ( $nav{get_data} and $t->is_end_tag('td') ) { @nav{ qw(get_data level) } = (0, 3); } elsif ( $nav{get_data} and $t->is_text ) { next unless exists $data_names{$nav{data_level}}; $proxy{ $data_names{$nav{data_level}} } = $t->as_is; } elsif ( $nav{get_info} == 1 and $t->is_end_tag('tr') ) { @nav{qw(get_info data_level level)} = (0, 0, 4); my %done_proxy = %proxy; %proxy = (); for ( values %data_names ) { $done_proxy{ $_ } = 'N/A' unless exists $done_proxy{ $_ }; } push @data, \%done_proxy; } } return @data; } sub _set_error { my ( $self, $error_or_response, $type ) = @_; if ( defined $type and $type eq 'net' ) { $self->error( 'Network error: ' . $error_or_response->status_line ); } else { $self->error( $error_or_response ); } return; } 1; __END__ =head1 NAME WWW::Proxy4FreeCom - fetch proxy list from http://proxy4free.com/ =head1 SYNOPSIS use strict; use warnings; use WWW::Proxy4FreeCom; my $prox = WWW::Proxy4FreeCom->new; $prox->get_list or die $prox->error; my $filtered_ref = $prox->filter( country => 'China', type => 'anonymous' ) or die $prox->error; printf "http://%s:%d (last tested on %s)\n", @$_{ qw(ip port last_test) } for @%filtered_ref; =head1 DESCRIPTION The module provides means to fetch proxy list from L website with means to filter by certain fields. =head1 CONSTRUCTOR =head2 new my $prox = WWW::Proxy4FreeCom->new; my $prox = WWW::Proxy4FreeCom->new( timeout => 10, debug => 1, ); my $prox = WWW::Proxy4FreeCom->new( ua => LWP::UserAgent->new( timeout => 10, agent => 'ProxUA', ), ); Constructs and returns a brand new yummy juicy WWW::Proxy4FreeCom object. Takes a few I arguments. Possible arguments are as follows: =head3 timeout ->new( timeout => 10 ); B. Specifies the C argument of L's constructor, which is used for retrieving data. B C<30> seconds. =head3 ua ->new( ua => LWP::UserAgent->new( agent => 'Foos!' ) ); B. If the C argument is not enough for your needs of mutilating the L object used for retrieving proxy list, feel free to specify the C argument which takes an L object as a value. B the C argument to the constructor will not do anything if you specify the C argument as well. B plain boring default L object with C argument set to whatever C's C argument is set to as well as C argument is set to mimic Firefox. =head3 debug ->new( debug => 1 ); When C is called any unsuccessfull page retrievals will be silently ignored. Setting C argument to a true value will C any network errors if they occur. =head1 METHODS =head2 get_list my $list_ref = $prox->get_list # just from the "proxy list 1" or die $prox->error; my $list_ref = $prox->get_list( 2 ) # just from the "proxy list 2" or die $prox->error; $prox->get_list( [3,5] ) # lists 3 and 5 only or die $prox->error; Instructs the objects to fetch a fresh list of proxies from L. If an error occured returns C or an empty list depending on the context. On success returns an arrayref of hashrefs each representing a proxy entry. Takes one optional argument which can be either a number between 1 and 5 (inclusive) or an arrayref with several of these numbers. The numbers represent the numbers of "proxy list"s on L. B only the list from the "proxy list 1" will be fetched. Each hashref in the returned arrayref is in a following format (if any field is missing on the site it will be reported as a string C): { 'country' => 'Indonesia', 'ip' => '202.173.23.141', 'last_test' => '2008-03-14', 'type' => 'transparent', 'port' => '8080' } =head2 filter my $filtered_ref = $prox->filter( country => 'China', port => 80 ) or die $prox->error; Must be called after a successfull call to C. Returns an arrayref of hashrefs each of which will be representing the proxies the exact same way C returns them except proxies will be filtered by a "ruleset". Takes a "ruleset" for filtering which is a set of key/value arguments. The valid names of arguments are the keys of the "proxy" hashefs, namely: C, C, C, C, and C. Will return either C or an empty list (depending on the context) if called with an invalid filter or last C was unsuccessfull and C method will tell you exactly what was wrong. =head2 error my $filtered_ref = $prox->filter( country => 'China', port => 80 ) or die $prox->error; $prox->get_list # just from the "proxy list 1" or die $prox->error; If either C or C methods fail they will return either C or an empty list depending on the context and the reason for the error will be available via C method. Takes no arguments, return a human parsable error message explaining the failure. =head2 list my $last_list_ref = $prox->list; Must be called after a successfull call to C. Takes no arguments, returns the same arrayref of hashref as last call to C returned. =head2 filtered_list my $last_filtered_ref = $prox->filtered_list; Must be called after a successfull call to C. Takes no arguments, returns the same arrayref of hashref as last call to C returned. =head2 ua my $old_LWP_UA_obj = $prox->ua; $prox->ua( LWP::UserAgent->new( timeout => 10, agent => 'foos' ); Returns a currently used L object used for retrieving data. Takes one optional argument which must be an L object, and the object you specify will be used in any subsequent calls to C. =head2 debug my $old_debug => $prox->debug; $prox->debug(1); Returns a currently set debug value, when called with an optional argument (which can be either a true or false value) will set debug to that value. See C argument to constructor for more information. =head1 AUTHOR 'Zoffix, C<< <'zoffix at cpan.org'> >> (L, L) =head1 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc WWW::Proxy4FreeCom You can also look for information at: =over 4 =item * RT: CPAN's request tracker L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * Search CPAN L =back =head1 COPYRIGHT & LICENSE Copyright 2008 'Zoffix, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut