package HTML::Entities::Latin2;
use 5.006;
use strict;
use warnings;
use vars qw(*encode_entities);
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(encode encode_entities);
our $VERSION = '0.04';
my %ascii_entities = (
'"' => ['"', '"', '"', "\x{0022}", 'QUOTATION MARK'],
'&' => ['&', '&', '&', "\x{0026}", 'AMPERSAND'],
'\'' => [''', ''', ''', "\x{0027}", 'APOSTROPHE'],
'<' => ['<', '<', '<', "\x{003C}", 'LESS-THAN SIGN'],
'>' => ['>', '>', '>', "\x{003E}", 'GREATER-THAN SIGN'],
);
my @char_map = (
# chr(160) to chr(255)
[' ', ' ', ' ', "\x{00A0}", 'NO-BREAK SPACE'],
['Ą', 'Ą', 'Ą', "\x{0104}", 'LATIN CAPITAL LETTER A WITH OGONEK'],
['˘', '˘', '˘', "\x{02D8}", 'BREVE'],
['Ł', 'Ł', 'Ł', "\x{0141}", 'LATIN CAPITAL LETTER L WITH STROKE'],
['¤', '¤', '¤', "\x{00A4}", 'CURRENCY SIGN'],
['Ľ', 'Ľ', 'Ľ', "\x{013D}", 'LATIN CAPITAL LETTER L WITH CARON'],
['Ś', 'Ś', 'Ś', "\x{015A}", 'LATIN CAPITAL LETTER S WITH ACUTE'],
['§', '§', '§', "\x{00A7}", 'SECTION SIGN'],
['¨', '¨', '¨', "\x{00A8}", 'DIAERESIS'],
['Š', 'Š', 'Š', "\x{0160}", 'LATIN CAPITAL LETTER S WITH CARON'],
['Ş', 'Ş', 'Ş', "\x{015E}", 'LATIN CAPITAL LETTER S WITH CEDILLA'],
['Ť', 'Ť', 'Ť', "\x{0164}", 'LATIN CAPITAL LETTER T WITH CARON'],
['Ź', 'Ź', 'Ź', "\x{0179}", 'LATIN CAPITAL LETTER Z WITH ACUTE'],
['', '', '', "\x{00AD}", 'SOFT HYPHEN'],
['Ž', 'Ž', 'Ž', "\x{017D}", 'LATIN CAPITAL LETTER Z WITH CARON'],
['Ż', 'Ż', 'Ż', "\x{017B}", 'LATIN CAPITAL LETTER Z WITH DOT ABOVE'],
['˚', '°', '°', "\x{00B0}", 'DEGREE SIGN'],
['ą', 'ą', 'ą', "\x{0105}", 'LATIN SMALL LETTER A WITH OGONEK'],
['˛', '˛', '˛', "\x{02DB}", 'OGONEK'],
['ł', 'ł', 'ł', "\x{0142}", 'LATIN SMALL LETTER L WITH STROKE'],
['ˊ', '´', '´', "\x{00B4}", 'ACUTE ACCENT'],
['ľ', 'ľ', 'ľ', "\x{013E}", 'LATIN SMALL LETTER L WITH CARON'],
['ś', 'ś', 'ś', "\x{015B}", 'LATIN SMALL LETTER S WITH ACUTE'],
['ˇ', 'ˇ', '&caron', "\x{02C7}", 'CARON'],
['¸', '¸', '¸', "\x{00B8}", 'CEDILLA'],
['š', 'š', 'š', "\x{0161}", 'LATIN SMALL LETTER S WITH CARON'],
['ş', 'ş', 'ş', "\x{015F}", 'LATIN SMALL LETTER S WITH CEDILLA'],
['ť', 'ť', 'ť', "\x{0165}", 'LATIN SMALL LETTER T WITH CARON'],
['ź', 'ź', 'ź', "\x{017A}", 'LATIN SMALL LETTER Z WITH ACUTE'],
['˝', '˝', '˝', "\x{02DD}", 'DOUBLE ACUTE ACCENT'],
['ž', 'ž', 'ž', "\x{017E}", 'LATIN SMALL LETTER Z WITH CARON'],
['ż', 'ż', 'ż', "\x{017C}", 'LATIN SMALL LETTER Z WITH DOT ABOVE'],
['Ŕ', 'Ŕ', 'Ŕ', "\x{0154}", 'LATIN CAPITAL LETTER R WITH ACUTE'],
['Á', 'Á', 'Á', "\x{00C1}", 'LATIN CAPITAL LETTER A WITH ACUTE'],
['Â', 'Â', 'Â', "\x{00C2}", 'LATIN CAPITAL LETTER A WITH CIRCUMFLEX'],
['Ă', 'Ă', 'Ă', "\x{0102}", 'LATIN CAPITAL LETTER A WITH BREVE'],
['Ä', 'Ä', 'Ä', "\x{00C4}", 'LATIN CAPITAL LETTER A WITH UMLAUT'],
['Ĺ', 'Ĺ', 'Ĺ', "\x{0139}", 'LATIN CAPITAL LETTER L WITH ACUTE'],
['Ć', 'Ć', 'Ć', "\x{0106}", 'LATIN CAPITAL LETTER C WITH ACUTE'],
['Ç', 'Ç', 'Ç', "\x{00C7}", 'LATIN CAPITAL LETTER C WITH CEDILLA'],
['Č', 'Č', 'Č', "\x{010C}", 'LATIN CAPITAL LETTER C WITH CARON'],
['É', 'É', 'É', "\x{00C9}", 'LATIN CAPITAL LETTER E WITH ACUTE'],
['Ę', 'Ę', 'Ę', "\x{0118}", 'LATIN CAPITAL LETTER E WITH OGONEK'],
['Ë', 'Ë', 'Ë', "\x{00CB}", 'LATIN CAPITAL LETTER E WITH UMLAUT'],
['Ě', 'Ě', 'Ě', "\x{011A}", 'LATIN CAPITAL LETTER E WITH CARON'],
['Í', 'Í', 'Í', "\x{00CD}", 'LATIN CAPITAL LETTER I WITH ACUTE'],
['Î', 'Î', 'Î', "\x{00CE}", 'LATIN CAPITAL LETTER I WITH CIRCUMFLEX'],
['Ď', 'Ď', 'Ď', "\x{010E}", 'LATIN CAPITAL LETTER D WITH CARON'],
['Đ', 'Đ', 'Đ', "\x{0110}", 'LATIN CAPITAL LETTER D WITH STROKE'],
['Ń', 'Ń', 'Ń', "\x{0143}", 'LATIN CAPITAL LETTER N WITH ACUTE'],
['Ň', 'Ň', 'Ň', "\x{0147}", 'LATIN CAPITAL LETTER N WITH CARON'],
['Ó', 'Ó', 'Ó', "\x{00D3}", 'LATIN CAPITAL LETTER O WITH ACUTE'],
['Ô', 'Ô', 'Ô', "\x{00D4}", 'LATIN CAPITAL LETTER O WITH CIRCUMFLEX'],
['Ő', 'ő', 'Ő', "\x{0151}", 'LATIN CAPITAL LETTER O WITH DOUBLE ACUTE'],
['Ö', 'Ö', 'Ö', "\x{00D6}", 'LATIN CAPITAL LETTER O WITH UMLAUT'],
['×', '×', '×', "\x{00D7}", 'MULTIPLICATION SIGN'],
['Ř', 'Ř', 'Ř', "\x{0158}", 'LATIN CAPITAL LETTER R WITH CARON'],
['Ů', 'Ů', 'Ů', "\x{016E}", 'LATIN CAPITAL LETTER U WITH RING ABOVE'],
['Ú', 'Ú', 'Ú', "\x{00DA}", 'LATIN CAPITAL LETTER U WITH ACUTE'],
['Ű', 'Ű', 'Ű', "\x{0170}", 'LATIN CAPITAL LETTER U WITH DOUBLE ACUTE'],
['Ü', 'Ü', 'Ü', "\x{00DC}", 'LATIN CAPITAL LETTER U WITH UMLAUT'],
['Ý', 'Ý', 'Ý', "\x{00DD}", 'LATIN CAPITAL LETTER Y WITH ACUTE'],
['Ţ', 'Ţ', 'Ţ', "\x{0162}", 'LATIN CAPITAL LETTER T WITH CEDILLA'],
['ß', 'ß', 'ß', "\x{00DF}", 'LATIN SMALL LETTER SHARP S'],
['ŕ', 'ŕ', 'ŕ', "\x{0155}", 'LATIN SMALL LETTER R WITH ACUTE'],
['á', 'á', 'á', "\x{00E1}", 'LATIN SMALL LETTER A WITH ACUTE'],
['â', 'â', 'â', "\x{00E2}", 'LATIN SMALL LETTER A WITH CIRCUMFLEX'],
['ă', 'ă', 'ă', "\x{0103}", 'LATIN SMALL LETTER A WITH BREVE'],
['ä', 'ä', 'ä', "\x{00E4}", 'LATIN SMALL LETTER A WITH UMLAUT'],
['ĺ', 'ĺ', 'ĺ', "\x{013A}", 'LATIN SMALL LETTER L WITH ACUTE'],
['ć', 'ć', 'ć', "\x{0107}", 'LATIN SMALL LETTER C WITH ACUTE'],
['ç', 'ç', 'ç', "\x{00E7}", 'LATIN SMALL LETTER C WITH CEDILLA'],
['č', 'č', 'č', "\x{010D}", 'LATIN SMALL LETTER C WITH CARON'],
['é', 'é', 'é', "\x{00E9}", 'LATIN SMALL LETTER E WITH ACUTE'],
['ę', 'ę', 'ę', "\x{0119}", 'LATIN SMALL LETTER E WITH OGONEK'],
['ë', 'ë', 'ë', "\x{00EB}", 'LATIN SMALL LETTER E WITH UMLAUT'],
['ě', 'ě', 'ě', "\x{011B}", 'LATIN SMALL LETTER E WITH CARON'],
['í', 'í', 'í', "\x{00ED}", 'LATIN SMALL LETTER I WITH ACUTE'],
['î', 'î', 'î', "\x{00EE}", 'LATIN SMALL LETTER I WITH CIRCUMFLEX'],
['ď', 'ď', 'ď', "\x{010F}", 'LATIN SMALL LETTER D WITH CARON'],
['đ', 'đ', 'đ', "\x{0111}", 'LATIN SMALL LETTER D WITH STROKE'],
['ń', 'ń', 'ń', "\x{0144}", 'LATIN SMALL LETTER N WITH ACUTE'],
['ň', 'ň', 'ň', "\x{0148}", 'LATIN SMALL LETTER N WITH CARON'],
['ó', 'ó', 'ó', "\x{00F3}", 'LATIN SMALL LETTER O WITH ACUTE'],
['ô', 'ô', 'ô', "\x{00F4}", 'LATIN SMALL LETTER O WITH CIRCUMFLEX'],
['ő', 'ő', 'ő', "\x{0151}", 'LATIN SMALL LETTER O WITH DOUBLE ACUTE'],
['ö', 'ö', 'ö', "\x{00F6}", 'LATIN SMALL LETTER O WITH UMLAUT'],
['÷', '÷', '÷', "\x{00F7}", 'DIVISION SIGN'],
['ř', 'ř', 'ř', "\x{0159}", 'LATIN SMALL LETTER R WITH CARON'],
['ů', 'ů', 'ů', "\x{016F}", 'LATIN SMALL LETTER U WITH RING ABOVE'],
['ú', 'ú', 'ú', "\x{00FA}", 'LATIN SMALL LETTER U WITH ACUTE'],
['ű', 'ű', 'ű', "\x{0171}", 'LATIN SMALL LETTER U WITH DOUBLE ACUTE'],
['ü', 'ü', 'ü', "\x{00FC}", 'LATIN SMALL LETTER U WITH UMLAUT'],
['ý', 'ý', 'ý', "\x{00FD}", 'LATIN SMALL LETTER Y WITH ACUTE'],
['ţ', 'ţ', 'ţ', "\x{0163}", 'LATIN SMALL LETTER T WITH CEDILLA'],
['·', '˙', '˙', "\x{02D9}", 'DOT ABOVE'],
);
sub encode {
my($source_str, $scheme_name, $unsafe) = @_;
my $scheme = {
decimal=>0, number=>0, numeric=>0, 'hex'=>1, name=>2, named=>2, utf8=>3, description=>4
}->{lc($scheme_name)};
$scheme = 0 unless defined $scheme; # defaults to decimal/numeric entities
my %unsafe = ();
if ($unsafe) {
foreach (split //, $unsafe) {
if (defined $ascii_entities{$_}) { $unsafe{ord $_} = $ascii_entities{$_}; }
}
}
my $encoded = '';
foreach my $char_val (unpack('C*', $source_str)) {
if ($char_val < 127) { # ASCII character
if (defined $unsafe{$char_val}) {
$encoded .= $unsafe{$char_val}->[$scheme];
}
else { $encoded .= chr $char_val; }
}
elsif ($char_val >= 160) {
$encoded .= $char_map[$char_val - 160]->[$scheme];
}
else {
warn 'character not in Latin-2 map, character code: '.$char_val;
}
}
return $encoded;
}
*encode_entities = \&encode;
1;
__END__
=head1 NAME
HTML::Entities::Latin2 - Encode ISO-8859-2 characters into HTML entities.
=head1 SYNOPSIS
use HTML::Entities::Latin2;
$lat2_string = "\"k\xF6zponti\" sz\xE1m\xEDt\xF3g\xE9p";
print HTML::Entities::Latin2::encode($lat2_string);
# "központi" számítógép
print HTML::Entities::Latin2::encode($lat2_string, 'name', '<"');
# "központi" <b>számítógép</b>
print HTML::Entities::Latin2::encode($lat2_string, 'hex');
# "központi" számítógép
=head1 DESCRIPTION
Translate high-bit Latin2 characters into HTML entities based on the ISO-8859-2
character map, with option of using named, decimal or hex entities. Using
this process will allow Eastern European encoded text to be used in ASCII
HTML pages.
=head2 FUNCTIONS
encode($latin2_string, $encoding_scheme, $unsafe_chars);
=head1 SEE ALSO
HTML::Entities
http://czyborra.com/charsets/iso8859.html#ISO-8859-2
http://www.w3schools.com/html/html_entitiesref.asp
http://www.microsoft.com/globaldev/reference/iso/28592.htm
=head1 AUTHOR
Michael J. Mathews, michael@perlinpractice.com
=head1 COPYRIGHT AND LICENSE
Copyright 2005 Michael J. Mathews. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=head1 CAVEATS
This module has only been tested on Unix, Perl 5.6.1, and 5.8.1.
=cut