package HTML::Entities::Latin2; use 5.006; use strict; use warnings; use vars qw(*encode_entities); require Exporter; our @ISA = qw(Exporter); our @EXPORT_OK = qw(encode encode_entities); our $VERSION = '0.04'; my %ascii_entities = ( '"' => ['"', '"', '"', "\x{0022}", 'QUOTATION MARK'], '&' => ['&', '&', '&', "\x{0026}", 'AMPERSAND'], '\'' => [''', ''', ''', "\x{0027}", 'APOSTROPHE'], '<' => ['<', '<', '<', "\x{003C}", 'LESS-THAN SIGN'], '>' => ['>', '>', '>', "\x{003E}", 'GREATER-THAN SIGN'], ); my @char_map = ( # chr(160) to chr(255) [' ', ' ', ' ', "\x{00A0}", 'NO-BREAK SPACE'], ['Ą', 'Ą', 'Ą', "\x{0104}", 'LATIN CAPITAL LETTER A WITH OGONEK'], ['˘', '˘', '˘', "\x{02D8}", 'BREVE'], ['Ł', 'Ł', 'Ł', "\x{0141}", 'LATIN CAPITAL LETTER L WITH STROKE'], ['¤', '¤', '¤', "\x{00A4}", 'CURRENCY SIGN'], ['Ľ', 'Ľ', 'Ľ', "\x{013D}", 'LATIN CAPITAL LETTER L WITH CARON'], ['Ś', 'Ś', 'Ś', "\x{015A}", 'LATIN CAPITAL LETTER S WITH ACUTE'], ['§', '§', '§', "\x{00A7}", 'SECTION SIGN'], ['¨', '¨', '¨', "\x{00A8}", 'DIAERESIS'], ['Š', 'Š', 'Š', "\x{0160}", 'LATIN CAPITAL LETTER S WITH CARON'], ['Ş', 'Ş', 'Ş', "\x{015E}", 'LATIN CAPITAL LETTER S WITH CEDILLA'], ['Ť', 'Ť', 'Ť', "\x{0164}", 'LATIN CAPITAL LETTER T WITH CARON'], ['Ź', 'Ź', 'Ź', "\x{0179}", 'LATIN CAPITAL LETTER Z WITH ACUTE'], ['–', '­', '­', "\x{00AD}", 'SOFT HYPHEN'], ['Ž', 'Ž', 'Ž', "\x{017D}", 'LATIN CAPITAL LETTER Z WITH CARON'], ['Ż', 'Ż', 'Ż', "\x{017B}", 'LATIN CAPITAL LETTER Z WITH DOT ABOVE'], ['˚', '°', '°', "\x{00B0}", 'DEGREE SIGN'], ['ą', 'ą', 'ą', "\x{0105}", 'LATIN SMALL LETTER A WITH OGONEK'], ['˛', '˛', '˛', "\x{02DB}", 'OGONEK'], ['ł', 'ł', 'ł', "\x{0142}", 'LATIN SMALL LETTER L WITH STROKE'], ['ˊ', '´', '´', "\x{00B4}", 'ACUTE ACCENT'], ['ľ', 'ľ', 'ľ', "\x{013E}", 'LATIN SMALL LETTER L WITH CARON'], ['ś', 'ś', 'ś', "\x{015B}", 'LATIN SMALL LETTER S WITH ACUTE'], ['ˇ', 'ˇ', '&caron', "\x{02C7}", 'CARON'], ['¸', '¸', '¸', "\x{00B8}", 'CEDILLA'], ['š', 'š', 'š', "\x{0161}", 'LATIN SMALL LETTER S WITH CARON'], ['ş', 'ş', 'ş', "\x{015F}", 'LATIN SMALL LETTER S WITH CEDILLA'], ['ť', 'ť', 'ť', "\x{0165}", 'LATIN SMALL LETTER T WITH CARON'], ['ź', 'ź', 'ź', "\x{017A}", 'LATIN SMALL LETTER Z WITH ACUTE'], ['˝', '˝', '˝', "\x{02DD}", 'DOUBLE ACUTE ACCENT'], ['ž', 'ž', 'ž', "\x{017E}", 'LATIN SMALL LETTER Z WITH CARON'], ['ż', 'ż', 'ż', "\x{017C}", 'LATIN SMALL LETTER Z WITH DOT ABOVE'], ['Ŕ', 'Ŕ', 'Ŕ', "\x{0154}", 'LATIN CAPITAL LETTER R WITH ACUTE'], ['Á', 'Á', 'Á', "\x{00C1}", 'LATIN CAPITAL LETTER A WITH ACUTE'], ['Â', 'Â', 'Â', "\x{00C2}", 'LATIN CAPITAL LETTER A WITH CIRCUMFLEX'], ['Ă', 'Ă', 'Ă', "\x{0102}", 'LATIN CAPITAL LETTER A WITH BREVE'], ['Ä', 'Ä', 'Ä', "\x{00C4}", 'LATIN CAPITAL LETTER A WITH UMLAUT'], ['Ĺ', 'Ĺ', 'Ĺ', "\x{0139}", 'LATIN CAPITAL LETTER L WITH ACUTE'], ['Ć', 'Ć', 'Ć', "\x{0106}", 'LATIN CAPITAL LETTER C WITH ACUTE'], ['Ç', 'Ç', 'Ç', "\x{00C7}", 'LATIN CAPITAL LETTER C WITH CEDILLA'], ['Č', 'Č', 'Č', "\x{010C}", 'LATIN CAPITAL LETTER C WITH CARON'], ['É', 'É', 'É', "\x{00C9}", 'LATIN CAPITAL LETTER E WITH ACUTE'], ['Ę', 'Ę', 'Ę', "\x{0118}", 'LATIN CAPITAL LETTER E WITH OGONEK'], ['Ë', 'Ë', 'Ë', "\x{00CB}", 'LATIN CAPITAL LETTER E WITH UMLAUT'], ['Ě', 'Ě', 'Ě', "\x{011A}", 'LATIN CAPITAL LETTER E WITH CARON'], ['Í', 'Í', 'Í', "\x{00CD}", 'LATIN CAPITAL LETTER I WITH ACUTE'], ['Î', 'Î', 'Î', "\x{00CE}", 'LATIN CAPITAL LETTER I WITH CIRCUMFLEX'], ['Ď', 'Ď', 'Ď', "\x{010E}", 'LATIN CAPITAL LETTER D WITH CARON'], ['Đ', 'Đ', 'Đ', "\x{0110}", 'LATIN CAPITAL LETTER D WITH STROKE'], ['Ń', 'Ń', 'Ń', "\x{0143}", 'LATIN CAPITAL LETTER N WITH ACUTE'], ['Ň', 'Ň', 'Ň', "\x{0147}", 'LATIN CAPITAL LETTER N WITH CARON'], ['Ó', 'Ó', 'Ó', "\x{00D3}", 'LATIN CAPITAL LETTER O WITH ACUTE'], ['Ô', 'Ô', 'Ô', "\x{00D4}", 'LATIN CAPITAL LETTER O WITH CIRCUMFLEX'], ['Ő', 'ő', 'Ő', "\x{0151}", 'LATIN CAPITAL LETTER O WITH DOUBLE ACUTE'], ['Ö', 'Ö', 'Ö', "\x{00D6}", 'LATIN CAPITAL LETTER O WITH UMLAUT'], ['×', '×', '×', "\x{00D7}", 'MULTIPLICATION SIGN'], ['Ř', 'Ř', 'Ř', "\x{0158}", 'LATIN CAPITAL LETTER R WITH CARON'], ['Ů', 'Ů', 'Ů', "\x{016E}", 'LATIN CAPITAL LETTER U WITH RING ABOVE'], ['Ú', 'Ú', 'Ú', "\x{00DA}", 'LATIN CAPITAL LETTER U WITH ACUTE'], ['Ű', 'Ű', 'Ű', "\x{0170}", 'LATIN CAPITAL LETTER U WITH DOUBLE ACUTE'], ['Ü', 'Ü', 'Ü', "\x{00DC}", 'LATIN CAPITAL LETTER U WITH UMLAUT'], ['Ý', 'Ý', 'Ý', "\x{00DD}", 'LATIN CAPITAL LETTER Y WITH ACUTE'], ['Ţ', 'Ţ', 'Ţ', "\x{0162}", 'LATIN CAPITAL LETTER T WITH CEDILLA'], ['ß', 'ß', 'ß', "\x{00DF}", 'LATIN SMALL LETTER SHARP S'], ['ŕ', 'ŕ', 'ŕ', "\x{0155}", 'LATIN SMALL LETTER R WITH ACUTE'], ['á', 'á', 'á', "\x{00E1}", 'LATIN SMALL LETTER A WITH ACUTE'], ['â', 'â', 'â', "\x{00E2}", 'LATIN SMALL LETTER A WITH CIRCUMFLEX'], ['ă', 'ă', 'ă', "\x{0103}", 'LATIN SMALL LETTER A WITH BREVE'], ['ä', 'ä', 'ä', "\x{00E4}", 'LATIN SMALL LETTER A WITH UMLAUT'], ['ĺ', 'ĺ', 'ĺ', "\x{013A}", 'LATIN SMALL LETTER L WITH ACUTE'], ['ć', 'ć', 'ć', "\x{0107}", 'LATIN SMALL LETTER C WITH ACUTE'], ['ç', 'ç', 'ç', "\x{00E7}", 'LATIN SMALL LETTER C WITH CEDILLA'], ['č', 'č', 'č', "\x{010D}", 'LATIN SMALL LETTER C WITH CARON'], ['é', 'é', 'é', "\x{00E9}", 'LATIN SMALL LETTER E WITH ACUTE'], ['ę', 'ę', 'ę', "\x{0119}", 'LATIN SMALL LETTER E WITH OGONEK'], ['ë', 'ë', 'ë', "\x{00EB}", 'LATIN SMALL LETTER E WITH UMLAUT'], ['ě', 'ě', 'ě', "\x{011B}", 'LATIN SMALL LETTER E WITH CARON'], ['í', 'í', 'í', "\x{00ED}", 'LATIN SMALL LETTER I WITH ACUTE'], ['î', 'î', 'î', "\x{00EE}", 'LATIN SMALL LETTER I WITH CIRCUMFLEX'], ['ď', 'ď', 'ď', "\x{010F}", 'LATIN SMALL LETTER D WITH CARON'], ['đ', 'đ', 'đ', "\x{0111}", 'LATIN SMALL LETTER D WITH STROKE'], ['ń', 'ń', 'ń', "\x{0144}", 'LATIN SMALL LETTER N WITH ACUTE'], ['ň', 'ň', 'ň', "\x{0148}", 'LATIN SMALL LETTER N WITH CARON'], ['ó', 'ó', 'ó', "\x{00F3}", 'LATIN SMALL LETTER O WITH ACUTE'], ['ô', 'ô', 'ô', "\x{00F4}", 'LATIN SMALL LETTER O WITH CIRCUMFLEX'], ['ő', 'ő', 'ő', "\x{0151}", 'LATIN SMALL LETTER O WITH DOUBLE ACUTE'], ['ö', 'ö', 'ö', "\x{00F6}", 'LATIN SMALL LETTER O WITH UMLAUT'], ['÷', '÷', '÷', "\x{00F7}", 'DIVISION SIGN'], ['ř', 'ř', 'ř', "\x{0159}", 'LATIN SMALL LETTER R WITH CARON'], ['ů', 'ů', 'ů', "\x{016F}", 'LATIN SMALL LETTER U WITH RING ABOVE'], ['ú', 'ú', 'ú', "\x{00FA}", 'LATIN SMALL LETTER U WITH ACUTE'], ['ű', 'ű', 'ű', "\x{0171}", 'LATIN SMALL LETTER U WITH DOUBLE ACUTE'], ['ü', 'ü', 'ü', "\x{00FC}", 'LATIN SMALL LETTER U WITH UMLAUT'], ['ý', 'ý', 'ý', "\x{00FD}", 'LATIN SMALL LETTER Y WITH ACUTE'], ['ţ', 'ţ', 'ţ', "\x{0163}", 'LATIN SMALL LETTER T WITH CEDILLA'], ['·', '˙', '˙', "\x{02D9}", 'DOT ABOVE'], ); sub encode { my($source_str, $scheme_name, $unsafe) = @_; my $scheme = { decimal=>0, number=>0, numeric=>0, 'hex'=>1, name=>2, named=>2, utf8=>3, description=>4 }->{lc($scheme_name)}; $scheme = 0 unless defined $scheme; # defaults to decimal/numeric entities my %unsafe = (); if ($unsafe) { foreach (split //, $unsafe) { if (defined $ascii_entities{$_}) { $unsafe{ord $_} = $ascii_entities{$_}; } } } my $encoded = ''; foreach my $char_val (unpack('C*', $source_str)) { if ($char_val < 127) { # ASCII character if (defined $unsafe{$char_val}) { $encoded .= $unsafe{$char_val}->[$scheme]; } else { $encoded .= chr $char_val; } } elsif ($char_val >= 160) { $encoded .= $char_map[$char_val - 160]->[$scheme]; } else { warn 'character not in Latin-2 map, character code: '.$char_val; } } return $encoded; } *encode_entities = \&encode; 1; __END__ =head1 NAME HTML::Entities::Latin2 - Encode ISO-8859-2 characters into HTML entities. =head1 SYNOPSIS use HTML::Entities::Latin2; $lat2_string = "\"k\xF6zponti\" sz\xE1m\xEDt\xF3g\xE9p"; print HTML::Entities::Latin2::encode($lat2_string); # "központi" számítógép print HTML::Entities::Latin2::encode($lat2_string, 'name', '<"'); # "központi" <b>számítógép</b> print HTML::Entities::Latin2::encode($lat2_string, 'hex'); # "központi" számítógép =head1 DESCRIPTION Translate high-bit Latin2 characters into HTML entities based on the ISO-8859-2 character map, with option of using named, decimal or hex entities. Using this process will allow Eastern European encoded text to be used in ASCII HTML pages. =head2 FUNCTIONS encode($latin2_string, $encoding_scheme, $unsafe_chars); =head1 SEE ALSO HTML::Entities http://czyborra.com/charsets/iso8859.html#ISO-8859-2 http://www.w3schools.com/html/html_entitiesref.asp http://www.microsoft.com/globaldev/reference/iso/28592.htm =head1 AUTHOR Michael J. Mathews, michael@perlinpractice.com =head1 COPYRIGHT AND LICENSE Copyright 2005 Michael J. Mathews. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 CAVEATS This module has only been tested on Unix, Perl 5.6.1, and 5.8.1. =cut