package HTML::Entities::ImodePictogram;
use strict;
use vars qw($VERSION);
$VERSION = 0.06;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(encode_pictogram decode_pictogram remove_pictogram);
@EXPORT_OK = qw(find_pictogram);
%EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
my $one_byte = '[\x00-\x7F\xA1-\xDF]';
my $two_bytes = '[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]';
use vars qw($Sjis_re $Pictogram_re $ExtPictorgram_re);
$Sjis_re = qr<$one_byte|$two_bytes>;
$Pictogram_re = '\xF8[\x9F-\xFC]|\xF9[\x40-\x7E\x80-\xB0]';
$ExtPictorgram_re = '\xF9[\xB1-\xFC]';
sub find_pictogram (\$&) {
my($r_text, $callback) = @_;
my $num_found = 0;
$$r_text =~ s{(($Pictogram_re)|($ExtPictorgram_re)|$Sjis_re)}{
my $orig_match = $1;
if (defined $2 || defined $3) {
$num_found++;
my $number = unpack 'n', $orig_match;
$callback->($orig_match, $number, _num2cp($number));
}
else {
$orig_match;
}
}eg;
return $num_found;
}
sub encode_pictogram {
my($text, %opt) = @_;
find_pictogram($text, sub {
my($char, $number, $cp) = @_;
if ($opt{unicode} || $cp >= 59148) {
return sprintf '%x;', $cp;
} else {
return '' . $number . ';';
}
});
return $text;
}
sub decode_pictogram {
my $html = shift;
$html =~ s{(\&\#(\d{5});)|(\&\#x([0-9a-fA-F]{4});)}{
if (defined $1) {
my $cp = _num2cp($2);
defined $cp ? pack('n', $2) : $1;
} elsif (defined $3) {
my $num = _cp2num(hex($4));
defined $num ? pack('n', $num) : $3;
}
}eg;
return $html;
}
sub remove_pictogram {
my $text = shift;
find_pictogram($text, sub {
return '';
});
return $text;
}
sub _num2cp {
my $num = shift;
if ($num >= 63647 && $num <= 63740) {
return $num - 4705;
} elsif (($num >= 63808 && $num <= 63817) ||
($num >= 63824 && $num <= 63838) ||
($num >= 63858 && $num <= 63870)) {
return $num - 4772;
} elsif ($num >= 63872 && $num <= 63996) {
return $num - 4773;
} else {
return;
}
}
sub _cp2num {
my $cp = shift;
if ($cp >= 58942 && $cp <= 59035) {
return $cp + 4705;
} elsif (($cp >= 59036 && $cp <= 59045) ||
($cp >= 59052 && $cp <= 59066) ||
($cp >= 59086 && $cp <= 59098)) {
return $cp + 4772;
} elsif (($cp >= 59099 && $cp <= 59146) ||
($cp >= 59148 && $cp <= 59223)) {
return $cp + 4773;
} else {
return;
}
}
1;
__END__
=head1 NAME
HTML::Entities::ImodePictogram - encode / decode i-mode pictogram
=head1 SYNOPSIS
use HTML::Entities::ImodePictogram;
$html = encode_pictogram($rawtext);
$rawtext = decode_pictogram($html);
$cleantext = remove_pictogram($rawtext);
use HTML::Entities::ImodePictogram qw(find_pictogram);
$num_found = find_pictogram($rawtext, \&callback);
=head1 DESCRIPTION
HTML::Entities::ImodePictogram handles HTML entities for i-mode
pictogram (emoji), which are assigned in Shift_JIS private area.
See http://www.nttdocomo.co.jp/i/tag/emoji/index.html for details
about i-mode pictogram.
=head1 FUNCTIONS
In all functions in this module, input/output strings are asssumed as
encoded in Shift_JIS. See L for conversion between Shift_JIS
and other encodings like EUC-JP or UTF-8.
This module exports following functions by default.
=over 4
=item encode_pictogram
$html = encode_pictogram($rawtext);
$html = encode_pictogram($rawtext, unicode => 1);
Encodes pictogram characters in raw-text into HTML entities. If
$rawtext contains extended pictograms, they are encoded in Unicode
format. If you add C option explicitly, all pictogram
characters are encoded in Unicode format (C<>). Otherwise,
encoding is done in decimal format (C<NNNNN;>).
=item decode_pictogram
$rawtext = decode_pictogram($html);
Decodes HTML entities (both for C<> and C<NNNNN;>) for
pictogram into raw-text in Shift_JIS.
=item remove_pictogram
$cleantext = remove_pictogram($rawtext);
Removes pictogram characters in raw-text.
=back
This module also exports following functions on demand.
=over 4
=item find_pictogram
$num_found = find_pictorgram($rawtext, \&callback);
Finds pictogram characters in raw-text and executes callback when
found. It returns the total numbers of charcters found in text.
The callback is given three arguments. The first is a found pictogram
character itself, and the second is a decimal number which represents
Shift_JIS codepoint of the character. The third is a Unicode
codepoint. Whatever the callback returns will replace the original
text.
Here is a stub implementation of encode_pictogram(), which will be the
good example for the usage of find_pictogram(). Note that this example
version doesn't support extended pictograms.
sub encode_pictogram {
my $text = shift;
find_pictogram($text, sub {
my($char, $number, $cp) = @_;
return '' . $number . ';';
});
return $text;
}
=back
=head1 CAVEAT
=over 4
=item *
This module works so slow, because regex used here matches C
characters in the text. This is due to the difficulty of extracting
character boundaries of Shift_JIS encoding.
=item *
Extended pictogram support of this module is not complete. If you
handle pictogram characters in Unicode, try Encode module with perl
5.8.0, or Unicode::Japanese.
=back
=head1 AUTHOR
Tatsuhiko Miyagawa
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=head1 SEE ALSO
L, L,
http://www.nttdocomo.co.jp/p_s/imode/tag/emoji/
=cut