# Mail::Freshmeat.pm -- # Perl module for parsing daily newsletters from http://freshmeat.net/ # (derived from the Mail::Internet class) # # Copyright (c) 1999 Adam Spiers . All rights # reserved. This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # $Id: Freshmeat.pm,v 1.16 2000/11/02 15:13:12 adam Exp $ # package Mail::Freshmeat; use strict; BEGIN { require 5.005; } require AutoLoader; use Mail::Internet; use Carp; use vars qw($VERSION @ISA); @ISA = qw(Mail::Internet AutoLoader); $VERSION = '0.94'; =head1 NAME Mail::Freshmeat - class for parsing e-mail newsletters from freshmeat.net =head1 SYNOPSIS $newsletter = new Mail::Freshmeat( \*STDIN ); $newsletter->parse; foreach my $entry (@{$newsletter->entries}) { print "Name: $entry->{name}"; print "Version: $entry->{version}; ... # Get an entry line as if it was from the first list # in the newsletter print $newsletter->short_entry($entry), "\n"; } =head1 DESCRIPTION A subclass of B. This package provides parsing of the daily e-mail newsletters which are sent out from F to any individual who requests them. =head1 METHODS =over 4 =item * B $newsletter->parse; This method must be called before any accessors can be used. =cut sub parse { my $self = shift; my $clean_parse = 1; my $body = join '', @{ $self->body }; my $blank_line = qr/ ^ \s * $ \n /x; my $blank_lines = qr/ (?: $blank_line )* /x; my $divider = qr/ ^ \s* [-\s]{10,} \s* $ /x; $body =~ m! (^This\ is\ the\ official\ freshmeat\ newsletter\ for\ (.+?)\. (?s: .+) total,\ (\d+)\ articles\ have\ been\ posted\ and\ are\ included\ in\ this\ email\.) \s* $ \n $blank_lines (?: ( ^ \s* \[\ advertising\ \] \s* $ \n $blank_lines ) ((?s:.+?)) $blank_lines )? ( ^ \s* \[\ article\ list\ \] \s* $ \n $blank_lines ) ((?: ^o\ .* $ \n)+) $blank_lines ( ^ \s* \[\ article\ details\ \] \s* $ \n $blank_lines ) ((?s:.+?)) $blank_lines $divider $blank_lines (^ that's\ it\ for\ today (?s: .*) ) $ !mx or _fatal_bug("Couldn't parse newsletter structure."); $self->{fm_summary} = $1; $self->{fm_date} = $2; $self->{fm_total} = $3; $self->{fm_ad_header} = $4; $self->{fm_ad} = $5; $self->{fm_list_header} = $6; $self->{fm_list} = $7; $self->{fm_details_header} = $8; $self->{fm_details} = $9; $self->{fm_footer} = $10; my @entries = (); my $count = 1; foreach my $entry (split / $blank_line $divider $blank_line /mx, $self->{fm_details}) { if ($entry !~ / ^ \s* name: \s (.*) $ \n ^ \s* posted\ on: \s (.*) $ \n (?: ^ \s* license: \s (.*) $ \n )? ^ \s* category: \s (.*) $ \n (?: $blank_line (?: ^ \s* homepage: \s (.*) $ \n )? (?: ^ \s* download: \s (.*) $ \n )? (?: ^ \s* changelog: \s (.*) $ \n )? )? $blank_line ^ (body|description) : \s* $ \n (?s: (.+?) ) (?: $blank_line ^ changes: \s* $ \n (?s: (.+?) ) )? (?: $blank_line ^ urgency: \s* $ \n (?s: (.+?) ) )? (?: $blank_line )? \|> \s (.+?) \s* $ \n /mx) { my $entry_start = $entry; if ($entry_start =~ /^\s*(subject: .*)$/m) { $entry_start = $1; } else { $entry_start =~ s/\n/\\n/g; $entry_start = substr($entry_start, 0, 40); } warn "Couldn't parse entry beginning '$entry_start'; ignoring.\n"; $clean_parse = 0; next; } # REMINDER: If you change the following keys, you must change # the entry_keys method and its documentation. my $new_entry = { name_and_version => $1, posted_on => $2, license => $3, category => $4, homepage => $5, download => $6, changelog => $7, body_type => $8, body => $9, changes => $10, urgency => $11, url => $12, }; @$new_entry{qw/name version/} = $self->parse_entry_version($new_entry); $new_entry->{body} =~ s/\r$//mg; foreach my $key (keys %$new_entry) { delete $new_entry->{$key} unless $new_entry->{$key}; } $new_entry->{position} = $count; push @entries, $new_entry; $count++; } # Bit of sanity checking never hurt anyone my $total_entries = @entries; if ($total_entries != $self->{fm_total}) { warn <{fm_total}) and actual number found ($total_entries). Weird! Will ignore number mentioned in summary from now on ... EOF $clean_parse = 0; } $self->{fm_entries} = \@entries; return ($self->{fm_parsed} = $clean_parse ? 'ok' : 'unclean'); } sub parse_entry_version { my ($self, $entry) = @_; # Start of first word of version must match this my $version_first_word_start = qr/ ( [.\d] | pre | alpha | beta | patch | r | rel | release | build | v(?:er)? [^a-z] ) /ix; # Start of further words of version must match this my $version_other_words_start = qr/ ( [.\d(] | pre | alpha | beta | r | rel | release | build | patch ) /ix; # Rest of each word of version must match this my $version_rest_of_word = qr/ ( [.\w()\/-] | pre | alpha | beta | patch | \d{1,6}(?!\d) # not more than six digits # in a row (how silly am I?) )* /ix; my ($name, $version) = ($entry->{name_and_version}, ''); if ($entry->{category} ne 'Community' and $entry->{name_and_version} =~ /^ (.+?) # save name in $1 \s+ ( # save version in $2 $version_first_word_start $version_rest_of_word (?: \s+ $version_other_words_start $version_rest_of_word )* ) # end saving $2 $/ix) { $name = $1; $version = $2; } return ($name, $version); } =back =cut 1; ######################## End of preloaded code ######################## __END__ =head1 ACCESSORS =over 4 =item * B my @entry_keys = $newsletter->entry_keys; Returns the keys which each entry may have set, in the order in which they are encountered in the newsletter: - position - name_and_version - name - version - posted_on - license - category - homepage - download - changelog - body_type - body - changes - urgency - url =cut sub entry_keys { return qw/ position name_and_version name version posted_on license category homepage download changelog body_type body changes urgency url /; } ## my $do_parse_first_err = "You must call the parse() method on the object first"; ## =item * B $summary = $self->summary; Returns the paragraph starting 'This is the official freshmeat newsletter ...'. =cut sub summary { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return $self->{fm_summary}; } ## =item * B $date = $self->date; Returns the date on which this newsletter was released. =cut sub date { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return $self->{fm_date}; } ## =item * B $total = $self->total; Returns the total number of entries in the newsletter. =cut sub total { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return scalar(@{$self->{fm_entries}}); } ## =item * B Returns a reference to an array of hashes, each containing fully parsed information about an entry of the newsletter. The entries are in the original newsletter order. The keys of each hash will be a subset of the list returned by the entry_keys method. =cut sub entries { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return $self->{fm_entries}; } ## =item * B $ad = $self->advertisement; Returns the '[ advertisement ]' section of the newsletter, which has one entry per line. =cut sub advertisement { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return $self->{fm_ad}; } ## =item * B $list = $self->list; Returns the '[ article list ]' section of the newsletter, which has one entry per line. =cut sub list { my $self = shift; croak $do_parse_first_err unless $self->{fm_parsed}; return $self->{fm_list}; } ## =item * B