package Mail::Mbox::MessageParser::Perl; no strict; @ISA = qw( Exporter Mail::Mbox::MessageParser ); use strict; use Carp; use Mail::Mbox::MessageParser; use Mail::Mbox::MessageParser::Config; use vars qw( $VERSION $DEBUG ); $VERSION = sprintf "%d.%02d%02d", q/1.60.4/ =~ /(\d+)/g; *ENTRY_STILL_VALID = \&Mail::Mbox::MessageParser::MetaInfo::ENTRY_STILL_VALID; sub ENTRY_STILL_VALID; *DEBUG = \$Mail::Mbox::MessageParser::DEBUG; *dprint = \&Mail::Mbox::MessageParser::dprint; sub dprint; #------------------------------------------------------------------------------- sub new { my ($proto, $self) = @_; carp "Need file_handle option" unless defined $self->{'file_handle'}; bless ($self, __PACKAGE__); $self->_init(); return $self; } #------------------------------------------------------------------------------- sub _init { my $self = shift; $self->{'CURRENT_LINE_NUMBER'} = 1; $self->{'CURRENT_OFFSET'} = 0; $self->{'READ_BUFFER'} = ''; $self->{'START_OF_EMAIL'} = 0; $self->{'END_OF_EMAIL'} = 0; $self->{'READ_CHUNK_SIZE'} = $Mail::Mbox::MessageParser::Config{'read_chunk_size'}; $self->SUPER::_init(); } #------------------------------------------------------------------------------- sub reset { my $self = shift; $self->{'CURRENT_LINE_NUMBER'} = ($self->{'prologue'} =~ tr/\n//) + 1; $self->{'CURRENT_OFFSET'} = length($self->{'prologue'}); $self->{'READ_BUFFER'} = ''; $self->{'START_OF_EMAIL'} = 0; $self->{'END_OF_EMAIL'} = 0; $self->SUPER::reset(); } #------------------------------------------------------------------------------- sub end_of_file { my $self = shift; # Reset eof in case the file was appended to. Hopefully this works all the # time. See perldoc -f seek for details. seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'}; return eof $self->{'file_handle'} && $self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'}); } #------------------------------------------------------------------------------- sub _read_prologue { my $self = shift; dprint "Reading mailbox prologue using Perl"; $self->_read_until_match( qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/,0); my $start_of_email = pos($self->{'READ_BUFFER'}); $self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email); # Set up for read_next_email $self->{'CURRENT_LINE_NUMBER'} += ($self->{'prologue'} =~ tr/\n//); $self->{'CURRENT_OFFSET'} = $start_of_email; $self->{'END_OF_EMAIL'} = $start_of_email; } #------------------------------------------------------------------------------- sub read_next_email { my $self = shift; return undef if $self->end_of_file(); $self->{'email_line_number'} = $self->{'CURRENT_LINE_NUMBER'}; $self->{'email_offset'} = $self->{'CURRENT_OFFSET'}; $self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'}; # Slurp in an entire multipart email (but continue looking for the next # header so that we can get any following newlines as well) unless ($self->_read_header()) { return $self->_extract_email_and_finalize(); } unless ($self->_read_email_parts()) { # Could issue a warning here, but I'm not sure how to do this cleanly for # a work-only module like this. Maybe something like CGI's cgi_error()? dprint "Inconsistent multi-part message. Could not find ending for " . "boundary \"" . $self->_multipart_boundary() . "\""; # Try to read the content length and use that my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'}, $self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'}); my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD( \$email_header, 'Content-Length:', $self->{'endline'}); if (defined $content_length) { $content_length =~ s/Content-Length: *(\d+).*/$1/i; pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length; } # Otherwise use the start of the body else { pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'}; } # Reset the search and look for the start of the # next email. $self->_read_rest_of_email(); return $self->_extract_email_and_finalize(); } $self->_read_rest_of_email(); return $self->_extract_email_and_finalize(); } #------------------------------------------------------------------------------- sub _read_rest_of_email { my $self = shift; # Look for the start of the next email while (1) { while ($self->{'READ_BUFFER'} =~ m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg) { $self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1); my $endline = $self->{'endline'}; # Keep looking if the header we found is part of a "Begin Included # Message". my $end_of_string = ''; my $backup_amount = 100; do { $backup_amount *= 2; $end_of_string = substr($self->{'READ_BUFFER'}, $self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount); } while (index($end_of_string, "$endline$endline") == -1 && $backup_amount < $self->{'END_OF_EMAIL'}); next if $end_of_string =~ /$endline-----(?: Begin Included Message |Original Message)-----$endline[^\r\n]*(?:$endline)*$/i; next unless $end_of_string =~ /$endline$endline$/; # Found the next email! return; } # Didn't find next email in current buffer. Most likely we need to read some # more of the mailbox. Shift the current email to the front of the buffer # unless we've already done so. my $shift_amount = $self->{'START_OF_EMAIL'}; $self->{'READ_BUFFER'} = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'}); $self->{'START_OF_EMAIL'} -= $shift_amount; $self->{'START_OF_BODY'} -= $shift_amount; pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'}); # Start looking at the end of the buffer, but back up some in case the # edge of the newly read buffer contains the start of a new header. I # believe the RFC says header lines can be at most 90 characters long. my $backup_amount = 90; $backup_amount = length($self->{'READ_BUFFER'}) - 1 if length($self->{'READ_BUFFER'}) < $backup_amount; unless ($self->_read_until_match( qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/,$backup_amount)) { $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'}); return; } redo; } } #------------------------------------------------------------------------------- sub _multipart_boundary { my $self = shift; my $endline = $self->{'endline'}; if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'}, $self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~ /^(content-type: *multipart[^\n\r]*$endline( [^\n\r]*$endline)*)/im) { my $content_type_header = $1; $content_type_header =~ s/$endline//g; if ($content_type_header =~ /boundary *= *"([^"]*)"/i || $content_type_header =~ /boundary *= *([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i) { return $1 } } return undef; } #------------------------------------------------------------------------------- sub _read_email_parts { my $self = shift; my $boundary = $self->_multipart_boundary(); return 1 unless defined $boundary; # RFC 1521 says the boundary can be no longer than 70 characters. Back up a # little more than that. my $endline = $self->{'endline'}; $self->_read_until_match(qr/^--\Q$boundary\E--$endline/,76) or return 0; return 1; } #------------------------------------------------------------------------------- sub _extract_email_and_finalize { my $self = shift; $self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'}; my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'}, $self->{'email_length'}); $self->{'CURRENT_LINE_NUMBER'} += ($email =~ tr/\n//); $self->{'CURRENT_OFFSET'} += $self->{'email_length'}; $self->{'email_number'}++; $self->SUPER::read_next_email(); return \$email; } #------------------------------------------------------------------------------- sub _read_header { my $self = shift; $self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/,0) or return 0; $self->{'START_OF_BODY'} = pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}"); return 1; } #------------------------------------------------------------------------------- # The search position is at the start of the pattern when this function # returns 1. sub _read_until_match { my $self = shift; my $pattern = shift; my $backup = shift; # Start looking at the end of the buffer, but back up some in case the edge # of the newly read buffer contains part of the pattern. if (!defined pos($self->{'READ_BUFFER'}) || pos($self->{'READ_BUFFER'}) - $backup <= 0) { pos($self->{'READ_BUFFER'}) = 0; } else { pos($self->{'READ_BUFFER'}) -= $backup; } while (1) { if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg) { pos($self->{'READ_BUFFER'}) -= length($1); return 1; } pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'}); unless ($self->_read_chunk()) { $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'}); return 0; } if (pos($self->{'READ_BUFFER'}) - $backup <= 0) { pos($self->{'READ_BUFFER'}) = 0; } else { pos($self->{'READ_BUFFER'}) -= $backup; } } } #------------------------------------------------------------------------------- # Maintains pos($self->{'READ_BUFFER'}) sub _read_chunk { my $self = shift; my $search_position = pos($self->{'READ_BUFFER'}); # Can't use sysread because it doesn't work with ungetc if ($self->{'READ_CHUNK_SIZE'} == 0) { local $/ = undef; return 0 if eof $self->{'file_handle'}; # < $self->{'file_handle'} > doesn't work, so we use readline $self->{'READ_BUFFER'} = readline($self->{'file_handle'}); pos($self->{'READ_BUFFER'}) = $search_position; return 1; } else { my $total_amount_read = 0; my $amount_read = 0; while ($total_amount_read < $self->{'READ_CHUNK_SIZE'}) { $amount_read = read($self->{'file_handle'}, $self->{'READ_BUFFER'}, $self->{'READ_CHUNK_SIZE'} - $total_amount_read, length($self->{'READ_BUFFER'})); pos($self->{'READ_BUFFER'}) = $search_position; if ($amount_read == 0) { return 1 unless $total_amount_read == 0; return 0; } $total_amount_read += $amount_read; } return 1; } } #------------------------------------------------------------------------------- 1; __END__ # -------------------------------------------------------------------------- =head1 NAME Mail::Mbox::MessageParser::Perl - A Perl-based mbox folder reader =head1 SYNOPSIS #!/usr/bin/perl use Mail::Mbox::MessageParser; my $filename = 'mail/saved-mail'; my $filehandle = new FileHandle($filename); my $folder_reader = new Mail::Mbox::MessageParser( { 'file_name' => $filename, 'file_handle' => $filehandle, } ); die $folder_reader unless ref $folder_reader; # Any newlines or such before the start of the first email my $prologue = $folder_reader->prologue; print $prologue; # This is the main loop. It's executed once for each email while(!$folder_reader->end_of_file()); { my $email = $folder_reader->read_next_email(); print $email; } =head1 DESCRIPTION This module implements a Perl-based mbox folder reader. Users must not instantiate this class directly--use Mail::Mbox::MessageParser instead. The base MessageParser module will automatically manage the use of faster implementations if they can be used. =head2 METHODS AND FUNCTIONS The following methods and functions are specific to the Mail::Mbox::MessageParser::Perl package. For additional inherited ones, see the Mail::Mbox::MessageParser documentation. =over 4 =item $ref = new( { 'file_name' => , 'file_handle' => }); - The full filename of the mailbox - An opened file handle for the mailbox The constructor for the class takes two parameters. The optional I parameter is the filename of the mailbox. The required I argument is the opened file handle to the mailbox. Returns a reference to a Mail::Mbox::MessageParser object, or a string describing the error. =back =head1 BUGS No known bugs. Contact david@coppit.org for bug reports and suggestions. =head1 AUTHOR David Coppit . =head1 LICENSE This software is distributed under the terms of the GPL. See the file "LICENSE" for more information. =head1 HISTORY This code was originally part of the grepmail distribution. See http://grepmail.sf.net/ for previous versions of grepmail which included early versions of this code. =head1 SEE ALSO Mail::Mbox::MessageParser =cut