#!/usr/bin/perl package Mail::Summary::Tools::Downloader::NNTP; use Moose; use Net::NNTP; use Mail::Message; use List::MoreUtils qw/any/; use Mail::Summary::Tools::ThreadFilter::Util; use DateTime; use DateTime::Infinite; use DateTime::Format::Mail; use DateTime::Format::DateManip; has cache => ( isa => "Object", is => "rw", required => 1, ); has server => ( isa => "Str", is => "ro", required => 1, ); has overviews => ( isa => "HashRef", is => "ro", default => sub { return {} }, ); has connection => ( isa => "Net::NNTP", is => "ro", lazy => 1, default => sub { $_[0]->connect }, ); has fetch_recursively => ( isa => "Bool", is => "rw", default => 1, ); has _downloaded => ( isa => "HashRef", is => "ro", default => sub { return {} }, ); sub connect { my $self = shift; Net::NNTP->new( $self->server, Debug => 1 ) || die "couldn't connect to " . $self->server; } sub overviews_for_group { my ( $self, %params ) = @_; my ( $group, $to, $from ) = @params{qw/group to from/}; my $cache = $self->cache; my $server = $self->server; # effiency hack unless ( $self->overviews->{$group} ) { if ( $cache->isa("Mail::Summary::Tools::YAMLCache") ) { my $all_overviews_key = join(":", "overviews", $server, $group); return $self->overviews->{$group} = $self->cache->get( $all_overviews_key ) || do { my %overviews; $self->cache->set( $all_overviews_key, \%overviews ); \%overviews; } } else { my %overviews; $self->overviews->{$group} = \%overviews; foreach my $article ( $from .. $to ) { next if $overviews{$article}; my $cache_key = join(":", "overviews", $server, $group, $article); if ( my $article_overviews = $cache->get( $cache_key ) ) { $overviews{$article} = $article_overviews; } } return \%overviews; } } } sub split_ranges { my ( $self, @ranges ) = @_; # split the ranges up into smaller chunks... If xover gets a too big number it barfs sometimes. return map { my $start = $_->[0]; my $end = $_->[1]; my $count = $end - $start; my $magic = 1000; my $div = int( $count / $magic ); ( $div ? ( (map { [ $start + ( ($_-1) * $magic ), ($start + ( $_ * $magic ))-1 ] } 1 .. $div), [ $start + ( $div * $magic ), $end ] ) : ( $_ ) ) } @ranges; } sub determine_missing_header_ranges { my ( $self, %params ) = @_; my ( $overviews, $to, $from ) = @params{qw/overviews to from/}; if ( ( my @got = sort { $a <=> $b } keys %$overviews ) > 10 ) { my @ranges; warn "previous articles exist"; my $prev = shift @got; push @ranges, [ $from, $prev-1 ] unless $from == $prev; foreach my $article ( @got ) { if ( ($article - 1) != $prev ) { warn "adding range: $prev+1 .. $article-1"; push @ranges, [ $prev+1, $article-1 ]; } $prev = $article; } push @ranges, [ $prev+1, $to ] unless $to == $prev; return @ranges; } else { warn "getting everything"; return [ $from, $to ]; } } sub fetch_overviews_in_ranges { my ( $self, %params ) = @_; my ( $overviews, $ranges, $group ) = @params{qw/overviews ranges group/}; my @important_headers = qw/Date References Message-ID/; my $connection = $self->connection; my $cache = $self->cache; my $server = $self->server; my @overview_headers = map { my $header = $_; $header =~ s/:$//; $header } @{ $connection->overview_fmt }; my %header_indices; @header_indices{@overview_headers} = 0 .. $#overview_headers; my @keep_headers; @keep_headers[map { $header_indices{$_} } @important_headers] = ( (1) x scalar(@important_headers) ); foreach my $range ( @$ranges ) { my $raw_overviews = $connection->xover($range); foreach my $overview ( values %$raw_overviews ) { $overview = { map { $overview_headers[$_] => $overview->[$_] } grep { $keep_headers[$_] } 0 .. $#overview_headers }; } @{ $overviews }{ keys %$raw_overviews } = values %$raw_overviews; unless ( $cache->isa("Mail::Summary::Tools::YAMLCache") ) { # this is not necessary for the yaml cache, because of our hack. # the hash is shared that way and just gets updated in place foreach my $article ( keys %$overviews ) { my $cache_key = join(":", "overviews", $server, $group, $article); $cache->set( $cache_key, $overviews->{$article} ); } } } } sub fetch_overviews_for_group { my ( $self, %params ) = @_; my $overviews = $self->overviews_for_group(%params); my @ranges = $self->split_ranges( $self->determine_missing_header_ranges( %params, overviews => $overviews, ), ); $self->fetch_overviews_in_ranges( ranges => \@ranges, overviews => $overviews, ); delete @{ $overviews }{ 1 .. $params{from}-1 }; # FIXME cache this in non YAML cache too return $overviews; } sub set_group { my ( $self, $group ) = @_; my ( $to, $from ) = $self->connection->group($group) or die "$group doesn't exist"; my $overviews = $self->fetch_overviews_for_group( group => $group, from => $from, to => $to, ); return ( overviews => $overviews, from => $from, to => $to, ); } sub download { my ( $self, %params ) = @_; my ( $from_date, $to_date ) = delete @params{qw/from to/}; my $range = Date::Range::Forgiving->new( $from_date, $to_date ); # ACKCKKK Fixme %params = ( $self->set_group($params{group}), %params ); my $overviews = $params{overviews}; $self->for_articles_in_date_range( sub { my $article = shift; $self->get_article( %params, article => $article, overview => $overviews->{$article}, ); }, %params, date_range => $range, ); } sub for_articles_in_date_range { my ( $self, $body, %params ) = @_; my ( $overviews, $from, $to, $range ) = @params{qw/overviews from to date_range/}; foreach my $article ( $from .. $to ) { next unless my $overview = $overviews->{$article}; my $date_header = $overview->{Date}; my $date; my @errors; $date = eval { DateTime::Format::Mail->new->loose->parse_datetime( $date_header ) }; push @errors, $@ if $@; $date ||= eval { DateTime::Format::DateManip->parse_datetime( $date_header ) }; push @errors, $@ if $@; warn "Error parsing date '$date_header': @errors" unless defined $date; $date ||= DateTime->now; $body->($article) if $range->includes( $date ); } } sub get_article { my ( $self, %params ) = @_; my ( $overview, $article ) = delete @params{qw/overview article/}; if ( $self->fetch_recursively ) { foreach my $message_id ( $overview->{'References'} =~ / ( < \S+ \@ \S+ > ) /gx ) { warn "additional thread root: $message_id"; $self->get_message_if_needed( %params, message_id => $message_id, ); } } $self->get_message_if_needed( %params, article => $article, message_id => $overview->{'Message-ID'}, ); } sub get_message_if_needed { my ( $self, %params ) = @_; my ( $message_id, $mbox, $extra ) = @params{qw/message_id mailbox extra_mailboxes/}; return if $self->_downloaded->{$message_id}++; return if any { $_->find($message_id) } $mbox, @$extra; $self->fetch_message_id( %params ); } sub fetch_message_id { my ( $self, %params ) = @_; my ( $article, $message_id, $mbox ) = @params{qw/article message_id mailbox/}; if ( my $article = $self->connection->article( $article || $message_id ) ) { my $message = Mail::Message->read( $article ); $mbox->addMessage( $message ); } else { warn "couldn't fetch article: " . ($article || $message_id); } } __PACKAGE__; __END__ =pod =head1 NAME Mail::Summary::Tools::Downloader::NNTP - Get NNTP articles and their thread roots. =head1 SYNOPSIS use Mail::Summary::Tools::Downloader::NNTP; my $downloader = Mail::Summary::Tools::Downloader::NNTP->new( server => "nntp.perl.org", cache => $article_cache, fetch_recursively => 0, ); my $mgr = Mail::Box::Manager->new; my $mbox = $mgr->open( "foo" ); $downloader->download( group => "perl.perl6.language", from => 10000, to => 11000, mailbox => $mbox, extra_mailboxes => \@extra, ); =head1 DESCRIPTION This utility makes downloading mailing list archives from an nntp server into a mailbox trivial. Messages whose message ID is already in any of the mailboxes are not downloaded. Additionally, message IDs listed in the C header will also be fetched if C is on (the default). Since L can thread messages from multiple mailboxes this one can download the next batch of articles with C enabled, and using a log-rotation like mechanism delete older mailboxes without fear of breaking the threads, at the cost of some redundant downloads. =cut