package WWW::Sucksub::Attila; =head1 NAME WWW::Sucksub::Attila - automated access to attila french subtitles database =head1 VERSION Version 0.06 =cut our $VERSION = '0.06'; =head1 SYNOPSIS WWW::SuckSub::Attila is a web robot based on the WWW::Mechanize Module. it parses distant web database specialised on french subtitles and build a dbm file to store result ( film title - http link for subtitle file ). The dbm file is used like a dictionnary you can update and use to do quick search. use WWW::Sucksub::Attila; my $test=WWW::Sucksub::Attila>new( motif => $mot, debug =>1, logout => '/where/debug/file/is/written.txt', dbfile=>'/where/dbm/file/is.db', html=>'/where/html/report/will/be/written.html' ); $test->update(); #parse all site and collect subtitles http link $test->search(); #search on local dbm file and produce html report =head1 CONSTRUCTOR AND STARTUP =head2 Attila Constructor The new() constructor, is associated to default values : you can modify these one as shown in the synopsis example. Default value are these : my $foo = WWW::Sucksub::Divxstation->new( dbfile => "$ENV{HOME}"."/attila.db"; html => "$ENV{HOME}"."/attila_repport.html"; motif=> undef, tempfile=> "$ENV{HOME}"."/.attila_tmp.html"; debug=> 0, logout => \*STDOUT useragent=> "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007" ); The environnement variable $ENV{HOME} must exist unless you redefine the constructor value which need it. =head3 new() constructor attributes and associated methods All listed attributes can be modified by corresponding methods : - set the attributes value when calling equivalent method whith args. - get the attribute value when calling equivalent method whithout args. $foo->WWW::Sucksub::Attila->new() $foo->useragent() # get the useragent attribute value $foo->useragent('tructruc') # set the useragent attribute value to 'tructruc' =head4 motif() you should here give a real value to this function : if $foo->motif is undef, the package execution will be aborted $foo->motif('xxx') allows to precise that you're searching a word that contains 'xxx' $foo->motif() return the current value of the string you search. =head4 debug() WWW-Sucksub-Divxstation can produce a lot of interresting informations The default value is "0" : that means that any debug informations will be written on the output ( see the logout() method too.) $foo->debug(0) # stop the product of debbugging informations $foo->debug(1) # debug info will be written to the log file ( see logout() method) . =head4 logout() A log file can be defined to keep a trace of website parsing You have to set $obj->debug(1) to get more detailled informations. $foo->logout(); #get the current logout() value $foo->logout('/home/xxx/log.txt') #set logout() value. Note that default value is STDOUT the logout() value can only be set in the new constructor. =head4 dbfile() define dbm file for store and retrieving extracted informations you must provide au full path to the db file to store results dbfile('/where/your/db/is.db') The file will should be readable/writable. =head4 html() Define simple html output where to write search report. you must provide au full path to the html file if you want to get an html output. html('/where/the html/repport/is/written.html') If $foo->html() is defined. you can get the value of this attribute like this : my $html_page = $foo->html Default value is automatically defined on the new() call. html => "$ENV{HOME}"."/attila_report.html"; html file will be used for reporting with search() methods =head4 useragent() arg should be a valid useragent. There's no reason to change this default value. $foo->useragent() return the value of the current useragent $foo->useragent('xxxxxxxx') set the useragent() value to ''xxxxxxxx'. =head1 FUNCTIONS these functions use the precedent attributes value. =head2 search() this function takes no arguments. it allows to launch a local dbm search. $foo-> search() the dbm file is read to give you every couple (title,link) which corresponds to the motif() pattern you defined before. =head2 update() this function takes no arguments. it allows to initiate the distant search on the web site http://davidbillemont5.free.fr/ ( attila website) the local dbm file is automatically written. Results are accumulated to the dbm file you define on new() call . Note that the update can take a while. =head1 AUTHOR Timothée Foucart, C<< >> =head1 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SEE ALSO =over 4 =item * L =item * L =item * L =item * L =item * L =back =head1 COPYRIGHT & LICENSE Copyright 2005 Timothée Foucart, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut require Exporter; use vars qw(@ISA @EXPORT $VERSION); @ISA = qw(Exporter); @EXPORT=qw( debug dbfile get_all_result html logout motif search update useragent ); #use warnings; use utf8; use warnings; use strict; use Carp; use WWW::Mechanize; # use Alias qw(attr); use vars qw( $site $nbres $base $debug $useragent $motif %sstsav $logout $fh $tempfile $dbfile $html ); sub new{ my $attila=shift; my $classe= ref($attila) || $attila; my $self={ }; bless($self,$classe); $self=$self->_init(@_); logout($self->{logout}); return $self; }; sub _init{ my $self= attr shift; # # -- init default values # $self->{base} ="http://davidbillemont5.free.fr/"; $self->{site} = "http://davidbillemont5.free.fr/Sous-Titres%200.htm"; $self->{tempfile} = "$ENV{HOME}"."/.attila_tmp.html"; $self->{useragent} ="Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007"; $self->{motif} = undef; $self->{debug} = 0; $self->{logout} = \*STDOUT; $self->{nbres} = 0; $self->{sstsav} = {}; $self->{dbfile} = "$ENV{HOME}"."/attila.db"; $self->{html} = "$ENV{HOME}"."/attila_repport.html"; # # -- replace "forced" values # if (@_) { my %param=@_; while (my($x,$y) =each(%param)){$self->{$x}=$y;}; } return $self; }; sub useragent { my $self =attr shift; if (@_) {$useragent=shift;} return $useragent; } sub dbfile { my $self =attr shift; if (@_) {$dbfile=shift;} return $dbfile; } sub html { my $self =attr shift; if (@_) {$html=shift;} return $html; } sub debug { my $self =attr shift; if (@_) {$debug=shift;} return $debug; } sub sstsav { my $self =attr shift; if (@_) {%sstsav=shift;} return %sstsav; } sub get_all_result { my $self =attr shift; %sstsav=$self->sstsav(); return %sstsav; } sub motif { my $self =attr shift; if (@_) {$motif=shift;} return $motif; } sub logout { #no update after first init if (@_){$logout=shift; } if ($logout) { open(FH , ">>", $logout) or croak "$logout : $!\n"; $fh=(\*FH);} else { $fh=(\*STDOUT);}; return $logout; } sub update{ my $self =attr shift; my $mech = WWW::Mechanize->new(agent=>$useragent, stack_depth => 1, ); print $fh "------------------------------------------------------------------------------------------------\n" if ($debug); print $fh "[DEBUG] begin updating local database from $site at : ".localtime()."\n" if ($debug); print $fh "------------------------------------------------------------------------------------------------\n" if ($debug); my @update_base; my $ipage=0; my $attila_page; $mech->get($site) or warn "[WARNING] http get problem on : $site !! \n"; my $links=$mech->find_all_links(); for ( my $ind=0; $ind <= $#{$links} ; $ind++) { if ($links->[$ind]->url_abs()=~m/Sous-Titres/m) { $ipage++; print $fh "[DEBUG][SUBTITLE PAGE : $ipage ]\t".$links->[$ind]->url_abs()."\n" if $debug; push @update_base,$links->[$ind]->url_abs(); }; }; foreach $attila_page (@update_base) { if (-e ($tempfile)) {unlink $tempfile or croak "can not suppress $tempfile : $!\n";}; print $fh "[DEBUG] parsing : ".$attila_page ."\n"; $mech->get($attila_page); open (TAMPON,'>', $tempfile) or croak "can not open $tempfile:$!\n"; print TAMPON $mech->response->as_string; close TAMPON; my %x=parse_attila($tempfile); while (my ($k, $v) = each %x) { $sstsav{$k}=$v;}; save_dbm(%sstsav); }; print $fh "[DEBUG] update finished\n"; return; }; # # --- recherche du motif dans la db sub search{ my $self =attr shift; $motif=$self->motif(); if ($html) { open (HTMLFILE,">>",$html) or warn "can not access $html : $! \n"; print HTMLFILE "
html generated by suckSub perl module
\n"; print HTMLFILE "searching : ".$motif." on ".$site."
\n"; print HTMLFILE " ".localtime()."

\n"; }; my %hashread; return unless $motif; print $fh " file db is : ". $dbfile."\n"; unless (-e ($dbfile)) {croak "[DEBUG SEARCH] db file ".$dbfile." not found \n maybe you should use update() method to build it ! \n";}; use DB_File; tie(%hashread,'DB_File',$dbfile) or croak "can not access : $dbfile : $!\n"; while (my ($k,$v)=each(%hashread)) { if ($v =~ m/$motif/i) { print $fh "[FOUND Libelle ] $v \n[FOUND LINK]".$k ."\n"; if ($html) { print HTMLFILE "".$v."
\n"; $nbres++ }; }; }; untie(%hashread); if ($html) { print HTMLFILE "
".$nbres." result(s) found
\n"; print HTMLFILE " html finished at ".localtime()."
\n"; close HTMLFILE; }; return; }; #--------------------------------------------------------------------------- #-- save updated hash into dbm file #-- internal use only #--------------------------------------------------------------------------- sub save_dbm{ my $self =attr shift; my %hashtosave; use DB_File; tie (%hashtosave,'DB_File',$dbfile ) or croak "can not use $dbfile : $!\n"; while (my ($k, $v) = each %sstsav) { $hashtosave{$k}=$v;}; untie(%hashtosave); return; }; #--------------------------------------------------------------------------- #--- parse one .htm page and extract label + info + link into memo hash #-- internal use only #--------------------------------------------------------------------------- sub parse_attila{ use HTML::Parser; use vars qw( %hsav $top_label1 $label $endor ); my $file=$_[0]; $label=""; $top_label1=0; #flag begin label or text to get $endor=0;# flag end of row => re-init counters for states analyse my $p = HTML::Parser->new(); # $p->handler( start => \&start_attila, "tagname,attr" ); $p->handler( text => \&text_attila, "text" ); $p->unbroken_text( 1 ); $p->marked_sections( 0 ); $p->ignore_elements(qw(script style)); # $p->parse_file($file); $p->eof; # # # sub start_attila { my ( $tag, $args ) = @_; #--- searching 'td' tag -> verify width of each column if ( $tag eq 'td' ) { return unless $args->{width}; # french label and orig title in the array if ( ($args->{width} eq '39%') && ($top_label1==0) ) { $top_label1++;}; if ( ($args->{width} eq '39%') && ($top_label1==1) ) { $top_label1++;}; # possible width variation if ( ($args->{width} eq '38%') && ($top_label1==0) ) { $top_label1++;}; if ( ($args->{width} eq '38%') && ($top_label1==1) ) { $top_label1++;}; # number of cd width = 10-11% if ( ($args->{width} eq '10%') && ($top_label1>0)) { $top_label1++;$endor=1}; if ( ($args->{width} eq '11%') && ($top_label1>0)) { $top_label1++;$endor=1}; } #---searching sub links in html page if (( $tag eq 'a' ) && ($args->{href})) { if ($args->{href} =~ m/Subs\// ) { $hsav{$base.$args->{href}}=$label; #DEBUG#print "[DEBUG PARSER]". $args->{href} ." ===>".$label."\n"; $label="";$top_label1=0; }; }; }; sub text_attila { my $text= shift; $text =~ tr/ //s; # nbsp html $text =~ tr/ /_/s; # $text =~ s/-/_/gi; $text =~ s/\n//gi; # $text =~ tr/_/_/s; # if ($top_label1>0) { return if ($text eq "_"); # texte parasite $label=$label."[".$text."]"; $top_label1++; if ($endor==1){$top_label1=0;$endor=0}; #DEBUG#print "[DEBUG PARSER LABEL] ". $label ."\n"; }; return $label }; return %hsav; } # 1; # End of WWW::Sucksub::Attila