package WWW::AUR::Iterator; use warnings 'FATAL' => 'all'; use strict; use WWW::AUR::Package qw(); use WWW::AUR::URI qw( pkg_uri ); use WWW::AUR qw( _path_params _category_name _useragent ); sub new { my $class = shift; my $self = bless {}, $class; $self->init( @_ ); } sub init { my $self = shift; my $startidx; $startidx = shift if @_ % 2 == 1; %$self = ( %$self, _path_params( @_ )); $self->reset(); $self->set_pos( $startidx ) if $startidx; return $self; } sub set_pos { my ($self, $startidx) = @_; Carp::croak 'Argument to set_pos() must an integer' unless $startidx =~ /\A\d+\z/; $self->{'curridx'} = $startidx; $self->{'finished'} = 0; $self->{'packages'} = []; return; } sub reset { my ($self) = @_; $self->{'curridx'} = 0; $self->{'finished'} = 0; $self->{'packages'} = []; $self->{'useragent'} = _useragent(); return; } #---HELPER FUNCTION--- sub _pkglist_uri { my ($startidx) = @_; return pkg_uri( q{SB} => q{n}, q{O} => $startidx, q{SO} => q{a}, q{PP} => 100 ); } #---PRIVATE METHOD--- sub _scrape_pkglist { my ($self) = @_; my $uri = _pkglist_uri( $self->{'curridx'} ); my $resp = $self->{'useragent'}->get( $uri ); Carp::croak 'Failed to GET package list webpage: ' . $resp->status_line unless $resp->is_success; my @pkginfos; my @rows = _splitrows( $resp->content ); shift @rows; # remove the header column for my $rowhtml ( @rows ) { my @cols = _splitcols( $rowhtml ); # cat, name, version, votes, desc, maintainer push @pkginfos, @cols; } return \@pkginfos; } sub _splitrows { my ($html) = @_; my @rows = $html =~ m{ ]*> ( .*? ) }gxs; return @rows; } sub _splitcols { my ($rowhtml) = @_; my @cols = $rowhtml =~ m{ ]*> ( .*? ) }gxs; for ( @cols ) { s/<[^>]+>//g; # delete tags s/\A\s+//; s/\s+\z//; # trim whitespace } return @cols; } sub next { my ($self) = @_; # There are no more packages to iterate over... return undef if $self->{'finished'}; my @pkginfo = splice @{ $self->{'packages'} }, 0, 6; if ( @pkginfo ) { my $pkg; my @k = qw/cat name version votes desc/; for my $i (0 .. $#k) { $pkg->{$k[$i]} = $pkginfo[$i]; } my $maint = $pkginfo[5]; $pkg->{'maint'} = ($maint eq 'orphan' ? undef : $maint); return $pkg; } # Load a new batch of packages if our internal list is empty... my $newpkgs = $self->_scrape_pkglist; $self->{'curridx'} += 100; $self->{'packages'} = $newpkgs; $self->{'finished'} = 1 if scalar @$newpkgs == 0; # Recurse, just avoids code copy/pasting... return $self->next(); } sub next_obj { my ($self) = @_; my $next = $self->next; return ( $next ? WWW::AUR::Package->new( $next->{'name'}, %$self ) : undef ); } 1; __END__ =head1 NAME WWW::AUR::Iterator - An iterator for looping through all AUR packages. =head1 SYNOPSIS my $aurobj = WWW:AUR->new(); my $iter = $aurobj->iter(); # or without WWW::AUR: my $iter = WWW::AUR::Iterator->new(); while ( my $pkg = $iter->next_obj ) { print $pkg->name, "\n"; } $iter->reset; while ( my $p = $iter->next ) { print "$_:$p->{$_}\n" for qw{ id name version cat desc maint }; print "---\n"; } # Retrieve information on the 12,345th package, alphabetically. $iter->set_pos(12_345); my $pkginfo = $iter->next; =head1 DESCRIPTION A B object can be used to iterate through I packages currently listed on the AUR webiste. =head1 CONSTRUCTOR $OBJ = WWW::AUR::Iterator->new( %PATH_PARAMS ); =over 4 =item C<%PATH_PARAMS> The parameters are the same as the L constructor. These are propogated to any L objects that are created. =item C<$OBJ> A L object. =back =head1 METHODS =head2 reset $OBJ->reset; The iterator is reset to the beginning of all packages available in the AUR. This starts the iteration over just like creating a new I object. =head2 next \%PKGINFO | undef = $OBJ->next(); This package scrapes the L webpage as if it kept clicking the Next button and recording each package. =over 4 =item C<\%PKGINFO> A hash reference containing all the easily available information about that particular package. The follow table lists each key and its corresponding value. |------------+------------------------------------------------| | NAME | VALUE | |------------+------------------------------------------------| | name | The name (pkgname) of the package. | | votes | The number of votes for the package. | | desc | The description (pkgdesc) of the package. | | cat | The AUR category name assigned to the package. | | maint | The name of the maintainer of the package. | |------------+------------------------------------------------| =item C If we have iterated through all packages, then C is returned. =back =head2 next_obj $PKGOBJ | undef = $OBJ->next_obj(); This package is like the L method above but creates a new object as a convenience. Keep in mind an HTTP request to AUR must be made when creating a new WWW::AUR::Package object. Use the L method if you can, it is faster. =over 4 =item C<$PKGOBJ> A L object representing the next package in the AUR. =item C If we have iterated through all packages, then C is returned. =back =head2 set_pos undef = $OBJ->set_pos( $POS ); Set the iterator position to the given index in the entire list of packages from packages.php. =over 4 =item C<$POS> This is not the package ID but simply the list offset on the package webpage. =back =head1 SEE ALSO L =head1 AUTHOR Justin Davis, C<< >> =head1 BUGS Please email me any bugs you find. I will try to fix them as quick as I can. =head1 SUPPORT Send me an email if you have any questions or need help. =head1 LICENSE AND COPYRIGHT Copyright 2012 Justin Davis. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See http://dev.perl.org/licenses/ for more information.