package WWW::FreeProxyListsCom; use warnings; use strict; our $VERSION = '0.002'; use Carp; use URI; use WWW::Mechanize; use HTML::TokeParser::Simple; use HTML::Entities; use Devel::TakeHashArgs; use base 'Class::Data::Accessor'; __PACKAGE__->mk_classaccessors qw( error mech debug list filtered_list ); sub new { my $self = bless {}, shift; get_args_as_hash(\@_, \my %args, { timeout => 30 } ) or croak $@; $args{mech} ||= WWW::Mechanize->new( timeout => $args{timeout}, agent => 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1.12)' .' Gecko/20080207 Ubuntu/7.10 (gutsy) Firefox/2.0.0.12', ); $self->mech( $args{mech} ); $self->debug( $args{debug} ); return $self; } sub get_list { my $self = shift; $self->$_(undef) for qw(error list); get_args_as_hash(\@_, \my %args, { type => 'elite', max_pages => 1, } ) or croak $@; my %page_for = ( non_anonymous => 'non-anonymous', map { $_ => $_ } qw( elite anonymous https standard us socks ), ); exists $page_for{ $args{type} } or croak 'Invalid `type` argument was passed to fetch(). ' . 'Must be one of' . join q|, |, keys %page_for; my $mech = $self->mech; my $page_type = $page_for{ $args{type} }; my $uri = URI->new( "http://www.freeproxylists.com/$page_type.html" ); $mech->get($uri)->is_success or return $self->_set_error($mech,'net'); $page_type eq 'anonymous' and $page_type = 'anon'; $page_type eq 'non-anonymous' and $page_type = 'nonanon'; # little tweaking to get the URI to the file normally loaded with AJAX my @links = map { "http://www.freeproxylists.com/load_${page_type}_" . ($_->url =~ m|([^/]+$)|)[0] } $mech->find_all_links(text_regex => qr/^detailed list #\d+/i); $args{max_pages} and @links = splice @links, 0, $args{max_pages}; $self->debug and print "Going to fetch data from: \n" . join "\n", @links,''; my @proxies; for ( @links ) { unless ( $mech->get($_)->is_success ) { $self->debug and carp 'Network error: ' . $mech->res->status_line; next; } my $list_ref = $self->_parse_list( $mech->res->content ) or next; push @proxies, @$list_ref; } return $self->list( \@proxies ); } sub filter { my $self = shift; $self->$_(undef) for qw(error filtered_list); get_args_as_hash( \@_, \my %args) or croak $@; my %valid_filters; @valid_filters{ qw(ip port is_https country last_test latency) } = (1) x 5; grep { not exists $valid_filters{$_} } keys %args and return $self->_set_error( 'Invalid filter specified, valid ones are: '. join q|, |, keys %valid_filters ); my $list_ref = $self->list or return $self->_set_error( 'Proxy list seems to be undefined, did you call get_list() first?' ); my @filtered; foreach my $proxy_ref ( @$list_ref ) { my $is_good = 0; for ( keys %args ) { if ( ref $args{$_} eq 'Regexp' ) { $proxy_ref->{$_} =~ /$args{$_}/ and $is_good++; } else { $proxy_ref->{$_} eq $args{$_} and $is_good++; } } $is_good == keys %args and push @filtered, { %$proxy_ref }; } return $self->filtered_list( \@filtered ); } sub _parse_list { my ( $self, $content ) = @_; # EVIL EVIL EVIL!! WEEE \o/ ( $content ) = $content =~ m|(.+?)|s; decode_entities $content; my $parser = HTML::TokeParser::Simple->new( \$content ); my %cells; @cells{ 1..6 } = qw(ip port is_https latency last_test country); my %nav; @nav{ qw(get_data level data_cell) } = (0) x 3; my @data; my %current; while ( my $t = $parser->get_token ) { if ( $t->is_start_tag('tr') ) { @nav{ qw(get_data level) } = (1, 1); } elsif ( $nav{get_data} == 1 and $t->is_start_tag('td') ) { $nav{level} = 2; $nav{data_cell}++; } elsif ( $nav{data_cell} and $t->is_text ) { $current{ $cells{ $nav{data_cell} } } = $t->as_is; } elsif ( $t->is_end_tag('tr') ) { @nav{ qw(level get_data data_cell) } = ( 3, 0, 0 ); next unless keys %current; $current{ $_ } = 'N/A' for grep { !defined $current{$_} or !length $current{$_} } values %cells; push @data, { %current }; %current = (); } } return \@data; } sub _set_error { my ( $self, $mech_or_error, $type ) = @_; if ( defined $type and $type eq 'net' ) { $self->error('Network error: ' . $mech_or_error->res->status_line); } else { $self->error( $mech_or_error ); } return; } 1; __END__ =head1 NAME WWW::FreeProxyListsCom - get proxy lists from http://www.freeproxylists.com =head1 SYNOPSIS use strict; use warnings; use WWW::FreeProxyListsCom; my $prox = WWW::FreeProxyListsCom->new; my $ref = $prox->get_list( type => 'non_anonymous' ) or die $prox->error; print "Got a list of " . @$ref . " proxies\nFiltering...\n"; $ref = $prox->filter( port => qr/(80){1,2}/ ); print "Filtered list contains: " . @$ref . " proxies\n" . join "\n", map( "$_->{ip}:$_->{port}", @$ref), ''; =head1 DESCRIPTION The module provides interface to fetch proxy server lists from L =head1 CONSTRUCTOR =head2 C my $prox = WWW::FreeProxyListCom->new; my $prox2 = WWW::FreeProxyListCom->new( timeout => 20, # or 'mech' mech => WWW::Mechanize->new( agent => 'foos', timeout => 20 ), debug => 1, ); Bakes up and returns a fresh WWW::FreeProxyListCom object. Takes a few arguments, all of which are I. Possible arguments are as follows: =head3 C my $prox = WWW::FreeProxyListCom->new( timeout => 10 ); Takes a scalar as a value which is the value that will be passed to the L object to indicate connection timeout in seconds. B C<30> seconds =head3 C my $prox = WWW::FreeProxyListCom->new( mech => WWW::Mechanize->new( agent => '007', timeout => 10 ), ); If a simple timeout is not enough for your needs feel free to specify the C argument which takes a L object as a value. B plain L object with C argument set to whatever WWW::FreeProxyListCom's C argument is set to as well as C argument is set to mimic FireFox. =head3 C my $prox = WWW::FreeProxyListCom->new( debug => 1 ); When set to a true value will make the object print out some debugging info. B C<0> =head1 METHODS =head2 C my $list_ref = $prox->get_list or die $prox->error; my $list_ref2 = $prox->get_list( type => 'standard', max_pages => 5, ) or die $prox->error; Instructs the object ot fetch a list of proxies from L website. On failure returns either C or an empty list depending on the context and the reason for failure will be available via C method. B if request for a each of the "list" (see C argument below) fails the C will NOT error out, if you are getting empty proxy lists try setting C option on in the constructor and it will carp() any failures on the "list" gets. On success returns an arrayref of hashrefs, see C section below for details. Takes several arguments all of which are I. To understand them better you should visit L first. The possible arguments are as follows: =head3 C ->get_list( type => 'standard' ); B. Specifies the list of proxies to fetch. B C. Possible arguments are as follows (valid C values are on the left, corresponding "list" site's menu link names are on the right): elite => http elite proxies anonymous => http anonymous lists non_anonymous => http non-anonymous https => https (SSL enabled) standard => http standard ports us => us proxies only socks => socks (version 4/5) =head3 C ->get_list( max_pages => 4 ); B. Specifies how many "lists" to fetch. In other words, if you go to list section titled "http elite proxies" you'll see several lists in the table; the C specifies how many of those lists to fetch. If C is larger than the number of available lists only the number of available lists will be fetched. A special value of C<0> indicates that the object should fetch all available lists for a specified C. B C<1> (which is more than enough). =head3 RETURN VALUE $VAR1 = [ { 'country' => 'China', 'last_test' => '3/15 4:23:14 pm', 'ip' => '121.15.200.147', 'latency' => '5115', 'port' => '80', 'is_https' => 'true' }, ] On success C method returns a (possibly empty) arrayref of "proxy" hashrefs. The hashrefs represent each proxy listed on the proxy list on the site. Each will contain the following keys (if the value for a specific key was not found on the site it will be set to C): =over 10 =item ip The IP address of the proxy =item port The port of the proxy =item country The country of the proxy =item last_test When was the proxy last tested to be alive, this is the "Date checked, UTC" column on the site. =item latency Corresponds to the "Latency" column on the site =item is_https Corresponds to "HTTPS" column on the site. =back =head2 C my $filtered_list_ref = $prox->filter( port => 80, ip => qr/^120/, country => 'Russia', is_https => 'true', last_test => qr|^3/15|, # march 15's latency => qr/\d{1,2}/, ); Must be called after a successfull call to C will croak otherwise. Takes one or more key/value pairs of arguments which specify filtering rule. The keys are the same as the keys of "proxy" hashref in the return value of the C method. Values can be either simple scalars or regexes (C). If value is a regex the corresponding value in the "proxy" hashref will matched against the regex, otherwise the C will be done. Returns an arrayref of "proxy" hashrefs in the exact same format as C returns except filtered. In other words calling C<< $prox->filter( port => 80, latency => qr/\d{1,2}/ ) >> will return only proxies with port number C<80> and for which latency is a two digit value. On failure returns either C or an empty list depending on the context and the reason for the error will be available via C method. Although, C should not fail if you pass proper filter arguments and call it after successfull C. =head2 C my $list_ref = $prox->get_list or die $prox->error; When either C or C methods fail they will return either C or an empty list depending on the context and the reason for the failure will be available via C method. Takes no arguments, returns a human parsable message explaining why C or C failed. =head2 C my $last_list_ref = $prox->list; Must be called after a successfull call to C. Takes no arugments, returns the same arrayref of hashrefs last call to C returned. =head2 C my $last_filtered_list_ref = $prox->filtered_list; Must be called after a successfull call to C. Takes no arugments, returns the same arrayref of hashrefs last call to C returned. =head2 C my $old_mech = $prox->mech; $prox->mech( WWW::Mechanize->new( agent => 'blah' ) ); Returns a L object used for fetching proxy lists. When called with an optional argument (which must be a L object) will use it in any subsequent C calls. =head2 C my $old_debug = $prox->debug; $prox->debug( 1 ); Returns a currently set debug flag (see C argument to constructor). When called with an argument will set the debug flag to the value specified. =head1 AUTHOR Zoffix Znet, C<< >> (L, L) =head1 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc WWW::FreeProxyListsCom You can also look for information at: =over 4 =item * RT: CPAN's request tracker L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * Search CPAN L =back =head1 COPYRIGHT & LICENSE Copyright 2008 Zoffix Znet, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut