package Email::MIME::RFC2047::Decoder; BEGIN { $Email::MIME::RFC2047::Decoder::VERSION = '0.91'; } use strict; use Encode (); use MIME::Base64 (); my $rfc_specials = '()<>\[\]:;\@\\,."'; my $rfc_specials_no_quote = '()<>\[\]:;\@\\,.'; # Regex for encoded words. # This also checks the validity of base64 encoded data because MIME::Base64 # silently ignores invalid characters. # Captures ($encoding, $content_b, $content_q) my $encoded_word_text_re = qr/ (?: ^ | (?<= \s ) ) = \? ( [\w-]+ ) \? (?: [Bb] \? ( (?: [A-Za-z0-9+\/]{2} (?: == | [A-Za-z0-9+\/] [A-Za-z0-9+\/=] ) )+ ) | [Qq] \? ( [^?\x00-\x20\x7f-\x{ffff}]+ ) ) \? = (?= \z | \s ) /x; # Same as $encoded_word_text_re but excluding RFC 822 special chars # Also matches after and before special chars my $encoded_word_phrase_re = qr/ (?: ^ | (?<= [\s$rfc_specials_no_quote] ) ) = \? ( [\w-]+ ) \? (?: [Bb] \? ( (?: [A-Za-z0-9+\/]{2} (?: == | [A-Za-z0-9+\/] [A-Za-z0-9+\/=] ) )+ ) | [Qq] \? ( [^?\x00-\x20$rfc_specials\x7f-\x{ffff}]+ ) ) \? = (?= \z | [\s$rfc_specials_no_quote] ) /x; my $quoted_string_re = qr/ " ( (?: [^"\\] | \\ . )* ) " /sx; sub new { my $package = shift; my $self = {}; return bless($self, $package); } sub decode_text { my $self = shift; return $self->_decode('text', @_); } sub decode_phrase { my $self = shift; return $self->_decode('phrase', @_); } sub _decode { my ($self, $mode, $encoded) = @_; my $encoded_ref = ref($encoded) ? $encoded : \$encoded; my $result = ''; my $enc_flag; # use shortest match on any characters we don't want to decode my $regex = $mode eq 'phrase' ? qr/([^$rfc_specials]*?)($encoded_word_phrase_re|$quoted_string_re)/ : qr/(.*?)($encoded_word_text_re)/s; while($$encoded_ref =~ /\G$regex/cg) { my ($text, $match, $encoding, $b_content, $q_content, $qs_content) = ($1, $2, $3, $4, $5, $6, $7); if(defined($encoding)) { # encoded words shouldn't be longer than 75 chars but # let's allow up to 255 chars if(length($match) > 255) { $result .= $text; $result .= $match; $enc_flag = undef; next; } my $content; if(defined($b_content)) { # MIME B $content = MIME::Base64::decode_base64($b_content); } else { # MIME Q $content = $q_content; $content =~ tr/_/ /; $content =~ s/=([0-9A-Fa-f]{2})/chr(hex($1))/eg; } my $chunk; eval { $chunk = Encode::decode( $encoding, $content, Encode::FB_CROAK ); }; if($@) { warn($@); # display raw encoded word in case of errors $result .= $text; $result .= $match; $enc_flag = undef; next; } # ignore whitespace between encoded words $result .= $text if !$enc_flag || $text =~ /\S/; $result .= $chunk; $enc_flag = 1; } else { # quoted string $result .= $text; # make sure there is whitespace before the quoted string $result .= ' '; # unquote $qs_content =~ s/\\(.)/$1/gs; $result .= $qs_content; # make sure there is whitespace after the quoted string $result .= ' '; $enc_flag = undef; } } $regex = $mode eq 'phrase' ? qr/[^$rfc_specials]+/ : qr/.+/s; $result .= $& if $$encoded_ref =~ /\G$regex/cg; # normalize whitespace $result =~ s/^\s+//; $result =~ s/\s+\z//; $result =~ s/\s+/ /g; # remove potentially dangerous ASCII control chars $result =~ s/[\x00-\x1f\x7f]//g; return $result; } 1; __END__ =head1 NAME Email::MIME::RFC2047::Decoder - Decoding of non-ASCII MIME headers =head1 SYNOPSIS use Email::MIME::RFC2047::Decoder; my $decoder = Email::MIME::RFC2047::Decoder->new(); my $string = $decoder->decode_text($encoded_text); my $string = $decoder->decode_phrase($encoded_phrase); =head1 DESCRIPTION This module decodes parts of MIME email message headers containing non-ASCII text according to RFC 2047. =head1 CONSTRUCTOR =head2 new my $decoder = Email::MIME::RFC2047::Decoder->new(); Creates a new decoder object. =head1 METHODS =head2 decode_text my $string = $decoder->decode_text($encoded_text); Decodes any MIME header field for which the field body is defined as '*text' (as defined by RFC 822), for example, any Subject or Comments header field. $encoded_text can also be a reference to a scalar. In this case the scalar is processed starting from the current search position. See L. The resulting string is trimmed and any whitespace is collapsed. =head2 decode_phrase my $string = $decoder->decode_phrase($encoded_phrase); Decodes any 'phrase' token (as defined by RFC 822) in a MIME header field, for example, one that precedes an address in a From, To, or Cc header. This method works like I but additionally unquotes any 'quoted-strings'. It also stops at any special character as defined by RFC 822. If $encoded_phrase is a reference to a scalar the current search position is set accordingly. This is helpful when parsing RFC 822 address headers. =head1 AUTHOR Nick Wellnhofer =head1 COPYRIGHT AND LICENSE Copyright (C) Nick Wellnhofer, 2009 This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.0 or, at your option, any later version of Perl 5 you may have available. =cut