package Foorum::TheSchwartz::Worker::Scraper;

# TheSchwartz worker that scrapes the MailMan archives listed in
# conf/scraper.yml and stores every not-yet-seen message as a comment on a
# forum topic (one topic per message title).

use strict;
use warnings;
our $VERSION = '1.001000';
use base qw( TheSchwartz::Moosified::Worker );
use Foorum::SUtils qw/schema/;
use Foorum::Logger qw/error_log/;
use Foorum::XUtils qw/cache/;
use Foorum::Scraper::MailMan;
use Foorum::Utils qw/encodeHTML/;
use POSIX qw(strftime);
use File::Spec;
use Encode qw/from_to/;
use YAML::XS qw/LoadFile/;
use Cwd qw/abs_path/;

# The configuration file is located relative to this module's directory
# and is loaded once, at compile time.
my ( undef, $path ) = File::Spec->splitpath(__FILE__);
$path = abs_path($path);
my $scraper_config = LoadFile(
    File::Spec->catfile(
        $path, '..', '..', '..', '..', 'conf', 'scraper.yml'
    )
);

# Indexed by 1-based month number; slot 0 is a placeholder.
my @FullName_months = (
    '',        'January',  'February', 'March',
    'April',   'May',      'June',     'July',
    'August',  'September', 'October', 'November',
    'December'
);

# Regex fragments for the "reply" prefixes stripped from titles.
my @Re_s = ( 'Re\:', '答复\:' );

# work( $class, $job )
#
# Job entry point. For every mailman entry in the configuration that has a
# forum_id and whose forum exists:
#   * scrape "<url><year>-<MonthName>/thread.html" for the month that
#     contained "24 hours ago" (GMT),
#   * group the scraped messages by title,
#   * insert every message whose msg_id is greater than the stored
#     last-scraped msg_id as a Comment on the matching (or new) topic,
#   * store the new highest msg_id, and refresh forum/user counters when
#     something was inserted.
# The accumulated log text is written through error_log() and the job is
# marked completed.
sub work {
    my ( $class, $job ) = @_;

    # if not set, just return
    unless ($scraper_config) {
        return $job->completed();
    }

    my $schema   = schema();
    my $cache    = cache();
    my $log_text = '';

    my @gmtimes        = gmtime( time() - 86400 );    # check one day before
    my $year           = $gmtimes[5] + 1900;
    my $month          = $gmtimes[4] + 1;
    my $fullname_month = $FullName_months[$month];
    my $postfix        = "$year-$fullname_month/thread.html";

    my $scraper = Foorum::Scraper::MailMan->new();

    # A configuration without a mailman list simply means "nothing to do".
    my @mailmans = @{ $scraper_config->{scraper}->{mailman} || [] };

    foreach my $mailman (@mailmans) {
        $log_text .= "Working on $mailman->{name}\n";
        next unless ( $mailman->{forum_id} );

        my $forum_id = $mailman->{forum_id};
        my $user_id  = $mailman->{user_id};
        my $name     = $mailman->{name};

        my $last_msg_id = get_last_scraped_msg_id( $schema, $forum_id,
            "scraper-mailman-$name" );
        next if ( $last_msg_id == -1 );    # forum does not exist

        my $scraper_url = $mailman->{url} . $postfix;

        # the scraper returns an array ref of message hashes
        my $ret = $scraper->scraper($scraper_url);

        # group by title
        my %title_related;
        foreach my $msg ( @{ $ret || [] } ) {
            push @{ $title_related{ $msg->{title} } }, $msg;
        }

        my $is_changed   = 0;    # flag to update forum or not
        my $last_post_id = 0;    # set forum's last_post_id

        # High-water mark of what has been inserted in this run. It is kept
        # separate from $last_msg_id on purpose: threads are visited in
        # hash order, so filtering against a value that grows while we
        # insert would silently drop unseen messages of later threads.
        my $newest_msg_id = $last_msg_id;

        # start to skip/insert
        foreach my $raw_title ( keys %title_related ) {

            # Fetch the group with the untrimmed key; the trimmed title is
            # not necessarily a key of %title_related.
            my @contents = sort { $a->{msg_id} <=> $b->{msg_id} }
                @{ $title_related{$raw_title} };

            ( my $title = $raw_title ) =~ s/(^\s+|\s+$)//isg;
            next unless ( length($title) );

            $log_text .= "\n[title] $title : ";

            my @populate_contents;
            foreach my $content (@contents) {
                my $msg_id = $content->{msg_id};
                if ( $msg_id <= $last_msg_id ) {
                    $log_text .= "Skip $msg_id, ";
                }
                else {
                    $log_text .= "Insert $msg_id, ";
                    push @populate_contents, $content;
                }
            }
            next unless ( scalar @populate_contents );

            # get topic_id or create one; a new topic starts with
            # (number of messages - 1) replies
            my ( $topic_id, $reply_to )
                = get_topic_or_create( $schema, $forum_id, $title, $user_id,
                scalar(@populate_contents) - 1 );
            $last_post_id = $topic_id;

            foreach my $content (@populate_contents) {
                my $text = qq~

$content->{who} posted on $content->{when}:

$content->{text}
~;
                my $comment = $schema->resultset('Comment')->create(
                    {   object_type => 'topic',
                        object_id   => $topic_id,
                        author_id   => $user_id,
                        title       => $title,
                        text        => $text,
                        formatter   => 'html',
                        post_on     => time(),
                        post_ip     => '127.0.0.1',
                        reply_to    => $reply_to,
                        forum_id    => $forum_id,
                        upload_id   => 0,
                    }
                );
                $is_changed = 1;

                # if $reply_to == 0 means new topic
                # then we use the first comment's comment_id as reply_to
                $reply_to = $comment->comment_id if ( $reply_to == 0 );

                # remember the highest inserted msg_id so that the next run
                # does not insert it again
                $newest_msg_id = $content->{msg_id}
                    if ( $content->{msg_id} > $newest_msg_id );
            }

            # clear cache
            my $cache_key = "comment|object_type=topic|object_id=$topic_id";
            $cache->remove($cache_key);
        }

        # update last_msg_id
        update_last_scraped_msg_id( $schema, "scraper-mailman-$name",
            $newest_msg_id );

        # update threads|replies count for forum and user
        if ( $is_changed and $last_post_id ) {
            update_forum( $schema, $cache, $forum_id, $last_post_id );
            my $user
                = $schema->resultset('User')->get( { user_id => $user_id } );
            $schema->resultset('User')->update_threads_and_replies($user);
        }
    }

    error_log( $schema, 'info', $log_text );

    return $job->completed();
}

# get_last_scraped_msg_id( $schema, $forum_id, $name )
#
# Returns -1 when no Forum row with $forum_id exists. Otherwise returns the
# value of the 'log' Variables row named by the first 24 characters of
# $name, or 0 when there is no such row yet.
sub get_last_scraped_msg_id {
    my ( $schema, $forum_id, $name ) = @_;

    my $count
        = $schema->resultset('Forum')->count( { forum_id => $forum_id } );
    return -1 unless ($count);    # forum does not exist

    # same 24-character truncation as update_last_scraped_msg_id
    $name = substr( $name, 0, 24 );
    my $rs = $schema->resultset('Variables')
        ->search( { type => 'log', name => $name } )->first;
    return $rs ? $rs->value : 0;
}

# update_last_scraped_msg_id( $schema, $name, $value )
#
# Replaces the 'log' Variables row named by the first 24 characters of
# $name with one holding $value (delete followed by create).
sub update_last_scraped_msg_id {
    my ( $schema, $name, $value ) = @_;

    $name = substr( $name, 0, 24 );
    $schema->resultset('Variables')->search(
        {   type => 'log',
            name => $name,
        }
    )->delete;
    $schema->resultset('Variables')->create(
        {   type  => 'log',
            name  => $name,
            value => $value
        }
    );
}

# get_topic_or_create( $schema, $forum_id, $title, $user_id, $replies_no )
#
# Strips one leading reply prefix (see @Re_s) from $title and looks for a
# topic in $forum_id whose title matches it.
# Returns ( $topic_id, $reply_to ):
#   * existing topic that has at least one comment: its topic_id and the
#     comment_id of its earliest comment (ordered by post_on);
#   * otherwise a new Topic is created with $replies_no as total_replies
#     and ( new topic_id, 0 ) is returned.
sub get_topic_or_create {
    my ( $schema, $forum_id, $title, $user_id, $replies_no ) = @_;

    # trim 'Re:\s+'
    foreach my $tre (@Re_s) {
        $title =~ s/^$tre\s+//isg;
    }

    # NOTE(review): the lookup uses the raw title with LIKE (so '%' and '_'
    # act as wildcards), while new topics are stored with encodeHTML($title)
    # -- confirm that both sides are meant to agree.
    my $topic = $schema->resultset('Topic')->search(
        {   title    => { 'LIKE', $title },
            forum_id => $forum_id,
        },
        { columns => ['topic_id'], }
    )->first;

    if ($topic) {
        my $rs = $schema->resultset('Comment')->search(
            {   object_type => 'topic',
                object_id   => $topic->topic_id,
            },
            {   order_by => 'post_on',
                rows     => 1,
                page     => 1,
                columns  => ['comment_id'],
            }
        )->first;
        if ($rs) {
            my $reply_to = $rs->comment_id;
            return ( $topic->topic_id, $reply_to );
        }
    }

    # or else, create one
    my $topic_title = encodeHTML($title);
    my $new_topic   = $schema->resultset('Topic')->create(
        {   forum_id         => $forum_id,
            title            => $topic_title,
            author_id        => $user_id,
            last_updator_id  => $user_id,
            last_update_date => time(),
            hit              => 0,
            total_replies    => $replies_no
        }
    );
    return ( $new_topic->topic_id, 0 );
}

# update_forum( $schema, $cache, $forum_id, $last_post_id )
#
# Sets last_post_id on the Forum row(s) with $forum_id and removes the
# forum's cache entry. Does nothing when the forum does not exist.
sub update_forum {
    my ( $schema, $cache, $forum_id, $last_post_id ) = @_;

    my $forum
        = $schema->resultset('Forum')->count( { forum_id => $forum_id } );
    return unless ($forum);

    # update forum
    $schema->resultset('Forum')->search( { forum_id => $forum_id, } )
        ->update( { last_post_id => $last_post_id || 0, } );

    $cache->remove("forum|forum_id=$forum_id");
}

1;