package Text::Original; use 5.006; use strict; use warnings; use Memoize; =head1 NAME Text::Original - Find original, non-quoted text in a message =head1 SYNOPSIS use Text::Original; my $sentence = first_sentence($email->body); =head1 FUNCTIONS =cut our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( first_lines first_paragraph first_sentence) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = '1.4'; =head2 first_lines first_lines($text, 20); Returns the a number of lines after the first non blank, none quoted line of the body of the email. It will guess at attribution lines and skip them as well. It will return super cited lines. This is the super-citers' fault, not ours. It won't catch all types of attribution lines; It can optionally be passed a number of lines to get. =cut sub first_lines { my $text = shift; my $num = shift || 1; return _significant_signal($text, lines => $num); } =head2 first_paragraph Returns the first original paragraph of the message =cut sub first_paragraph { return _significant_signal(shift, para => 1); } =head2 first_sentence Returns the first original sentence of the message =cut sub first_sentence { my $text = first_paragraph(shift); $text =~ s/([.?!]) .*/$1/s; return $text; } # Kudos to Damian Conway for this bit. my $quotechar = qq{[!#%=|:]}; my $quotechunk = qq{(?:$quotechar(?![a-z])|[a-z]*>+)}; my $quoter = qq{(?:(?i)(?:$quotechunk(?:[ \\t]*$quotechunk)*))}; sub _significant_signal { my $text = shift; my %opts = @_; my $return = ""; my $lines = 0; # get all the lines from the main part of the body my @lines = split /$/m, $text; # right, find the start of the original content or quoted # content (i.e. skip past the attributation) my $not_started = 1; while (@lines && $not_started) { # next line local $_ = shift @lines; #print "}}$_"; # blank lines, euurgh next if /^\s*$/; # quotes (we don't count quoted From's) next if /^\s*>(?!From)/; # Other kinds of quoter: next if /^\s*$quoter/; # skip obvious attribution next if /^\s*On (Mon|Tue|Wed|Thu|Fri|Sat|Sun)/i; next if /\d{4}-?\w{2,3}-?\d{2}.*\d+:\d+:\d+/i; # Looks like a date next if /^\w+(\s\w+)?:$/; # lathos' minimalist attributions. :) next if /^\s*.+=? wrote:/i; # skip signed messages next if /^\s*-----/; next if /^Hash:/; # annoying hi messages (this won't work with i18n) next if /^\s*(?:hello|hi|hey|greetings|salut |good (?:morning|afternoon|day|evening)) (?:\W.{0,14})?\s*$/ixs; # snips next if m~\s* # whitespace [<.=-_*+({\[]*? # opening bracket (?:snip|cut|delete|deleted) # snip? [^>}\]]*? # some words? [>.=-_*+)}\]]*? # closing bracket \s*$ # end of the line ~xi; # [.. foo ..] or ...foo.. or so on next if m~\s*\[?\.\..*?\.\.]?\s*$~; # ... or [...] next if m~\s*\[?\.\.\.]?\s*$~; # if we got this far then we've probably got past the # attibutation lines unshift @lines, $_; # undo the shift undef $not_started; # and say we've started. } # okay, let's _try_ to build up some content then foreach (@lines) { # are we at the end of a paragraph? last if (defined $opts{'para'} # paragraph mode? && $opts{'para'}==1 && $lines>0 # got some lines aready? && /^\s*$/); # and now we've found a gap? # blank lines, euurgh next if /^\s*$/; # quotes (we don't count quoted From's) next if /^\s*>(?!From)/; # if we got this far then the line was a useful one $lines++; # sort of munged Froms s/^>From/From/; s/^\n+//; $return .= "\n" if $lines>1; $return .= $_; last if (defined $opts{'lines'} && $opts{'lines'}==$lines); } return $return; } memoize('_significant_signal'); 1; =head1 EXPORTS All of the above. =head1 AUTHOR Simon Wistow and the Mariachi project. See http://siesta.unixbeard.net/ Packaged by Simon Cozens Currently maintained by Simon Wistow =head1 COPYRIGHT Copyright 2004 The Siesta Project This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.