#line 1
package HTML::Entities;
#line 129
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
use vars qw(%entity2char %char2entity);
require 5.004;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(encode_entities decode_entities _decode_entities);
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
$VERSION = "3.60";
sub Version { $VERSION; }
require HTML::Parser; # for fast XS implemented decode_entities
%entity2char = (
# Some normal chars that have special meaning in SGML context
amp => '&', # ampersand
'gt' => '>', # greater than
'lt' => '<', # less than
quot => '"', # double quote
apos => "'", # single quote
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
AElig => chr(198), # capital AE diphthong (ligature)
Aacute => chr(193), # capital A, acute accent
Acirc => chr(194), # capital A, circumflex accent
Agrave => chr(192), # capital A, grave accent
Aring => chr(197), # capital A, ring
Atilde => chr(195), # capital A, tilde
Auml => chr(196), # capital A, dieresis or umlaut mark
Ccedil => chr(199), # capital C, cedilla
ETH => chr(208), # capital Eth, Icelandic
Eacute => chr(201), # capital E, acute accent
Ecirc => chr(202), # capital E, circumflex accent
Egrave => chr(200), # capital E, grave accent
Euml => chr(203), # capital E, dieresis or umlaut mark
Iacute => chr(205), # capital I, acute accent
Icirc => chr(206), # capital I, circumflex accent
Igrave => chr(204), # capital I, grave accent
Iuml => chr(207), # capital I, dieresis or umlaut mark
Ntilde => chr(209), # capital N, tilde
Oacute => chr(211), # capital O, acute accent
Ocirc => chr(212), # capital O, circumflex accent
Ograve => chr(210), # capital O, grave accent
Oslash => chr(216), # capital O, slash
Otilde => chr(213), # capital O, tilde
Ouml => chr(214), # capital O, dieresis or umlaut mark
THORN => chr(222), # capital THORN, Icelandic
Uacute => chr(218), # capital U, acute accent
Ucirc => chr(219), # capital U, circumflex accent
Ugrave => chr(217), # capital U, grave accent
Uuml => chr(220), # capital U, dieresis or umlaut mark
Yacute => chr(221), # capital Y, acute accent
aacute => chr(225), # small a, acute accent
acirc => chr(226), # small a, circumflex accent
aelig => chr(230), # small ae diphthong (ligature)
agrave => chr(224), # small a, grave accent
aring => chr(229), # small a, ring
atilde => chr(227), # small a, tilde
auml => chr(228), # small a, dieresis or umlaut mark
ccedil => chr(231), # small c, cedilla
eacute => chr(233), # small e, acute accent
ecirc => chr(234), # small e, circumflex accent
egrave => chr(232), # small e, grave accent
eth => chr(240), # small eth, Icelandic
euml => chr(235), # small e, dieresis or umlaut mark
iacute => chr(237), # small i, acute accent
icirc => chr(238), # small i, circumflex accent
igrave => chr(236), # small i, grave accent
iuml => chr(239), # small i, dieresis or umlaut mark
ntilde => chr(241), # small n, tilde
oacute => chr(243), # small o, acute accent
ocirc => chr(244), # small o, circumflex accent
ograve => chr(242), # small o, grave accent
oslash => chr(248), # small o, slash
otilde => chr(245), # small o, tilde
ouml => chr(246), # small o, dieresis or umlaut mark
szlig => chr(223), # small sharp s, German (sz ligature)
thorn => chr(254), # small thorn, Icelandic
uacute => chr(250), # small u, acute accent
ucirc => chr(251), # small u, circumflex accent
ugrave => chr(249), # small u, grave accent
uuml => chr(252), # small u, dieresis or umlaut mark
yacute => chr(253), # small y, acute accent
yuml => chr(255), # small y, dieresis or umlaut mark
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
copy => chr(169), # copyright sign
reg => chr(174), # registered sign
nbsp => chr(160), # non breaking space
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
iexcl => chr(161),
cent => chr(162),
pound => chr(163),
curren => chr(164),
yen => chr(165),
brvbar => chr(166),
sect => chr(167),
uml => chr(168),
ordf => chr(170),
laquo => chr(171),
'not' => chr(172), # not is a keyword in perl
shy => chr(173),
macr => chr(175),
deg => chr(176),
plusmn => chr(177),
sup1 => chr(185),
sup2 => chr(178),
sup3 => chr(179),
acute => chr(180),
micro => chr(181),
para => chr(182),
middot => chr(183),
cedil => chr(184),
ordm => chr(186),
raquo => chr(187),
frac14 => chr(188),
frac12 => chr(189),
frac34 => chr(190),
iquest => chr(191),
'times' => chr(215), # times is a keyword in perl
divide => chr(247),
( $] > 5.007 ? (
'OElig;' => chr(338),
'oelig;' => chr(339),
'Scaron;' => chr(352),
'scaron;' => chr(353),
'Yuml;' => chr(376),
'fnof;' => chr(402),
'circ;' => chr(710),
'tilde;' => chr(732),
'Alpha;' => chr(913),
'Beta;' => chr(914),
'Gamma;' => chr(915),
'Delta;' => chr(916),
'Epsilon;' => chr(917),
'Zeta;' => chr(918),
'Eta;' => chr(919),
'Theta;' => chr(920),
'Iota;' => chr(921),
'Kappa;' => chr(922),
'Lambda;' => chr(923),
'Mu;' => chr(924),
'Nu;' => chr(925),
'Xi;' => chr(926),
'Omicron;' => chr(927),
'Pi;' => chr(928),
'Rho;' => chr(929),
'Sigma;' => chr(931),
'Tau;' => chr(932),
'Upsilon;' => chr(933),
'Phi;' => chr(934),
'Chi;' => chr(935),
'Psi;' => chr(936),
'Omega;' => chr(937),
'alpha;' => chr(945),
'beta;' => chr(946),
'gamma;' => chr(947),
'delta;' => chr(948),
'epsilon;' => chr(949),
'zeta;' => chr(950),
'eta;' => chr(951),
'theta;' => chr(952),
'iota;' => chr(953),
'kappa;' => chr(954),
'lambda;' => chr(955),
'mu;' => chr(956),
'nu;' => chr(957),
'xi;' => chr(958),
'omicron;' => chr(959),
'pi;' => chr(960),
'rho;' => chr(961),
'sigmaf;' => chr(962),
'sigma;' => chr(963),
'tau;' => chr(964),
'upsilon;' => chr(965),
'phi;' => chr(966),
'chi;' => chr(967),
'psi;' => chr(968),
'omega;' => chr(969),
'thetasym;' => chr(977),
'upsih;' => chr(978),
'piv;' => chr(982),
'ensp;' => chr(8194),
'emsp;' => chr(8195),
'thinsp;' => chr(8201),
'zwnj;' => chr(8204),
'zwj;' => chr(8205),
'lrm;' => chr(8206),
'rlm;' => chr(8207),
'ndash;' => chr(8211),
'mdash;' => chr(8212),
'lsquo;' => chr(8216),
'rsquo;' => chr(8217),
'sbquo;' => chr(8218),
'ldquo;' => chr(8220),
'rdquo;' => chr(8221),
'bdquo;' => chr(8222),
'dagger;' => chr(8224),
'Dagger;' => chr(8225),
'bull;' => chr(8226),
'hellip;' => chr(8230),
'permil;' => chr(8240),
'prime;' => chr(8242),
'Prime;' => chr(8243),
'lsaquo;' => chr(8249),
'rsaquo;' => chr(8250),
'oline;' => chr(8254),
'frasl;' => chr(8260),
'euro;' => chr(8364),
'image;' => chr(8465),
'weierp;' => chr(8472),
'real;' => chr(8476),
'trade;' => chr(8482),
'alefsym;' => chr(8501),
'larr;' => chr(8592),
'uarr;' => chr(8593),
'rarr;' => chr(8594),
'darr;' => chr(8595),
'harr;' => chr(8596),
'crarr;' => chr(8629),
'lArr;' => chr(8656),
'uArr;' => chr(8657),
'rArr;' => chr(8658),
'dArr;' => chr(8659),
'hArr;' => chr(8660),
'forall;' => chr(8704),
'part;' => chr(8706),
'exist;' => chr(8707),
'empty;' => chr(8709),
'nabla;' => chr(8711),
'isin;' => chr(8712),
'notin;' => chr(8713),
'ni;' => chr(8715),
'prod;' => chr(8719),
'sum;' => chr(8721),
'minus;' => chr(8722),
'lowast;' => chr(8727),
'radic;' => chr(8730),
'prop;' => chr(8733),
'infin;' => chr(8734),
'ang;' => chr(8736),
'and;' => chr(8743),
'or;' => chr(8744),
'cap;' => chr(8745),
'cup;' => chr(8746),
'int;' => chr(8747),
'there4;' => chr(8756),
'sim;' => chr(8764),
'cong;' => chr(8773),
'asymp;' => chr(8776),
'ne;' => chr(8800),
'equiv;' => chr(8801),
'le;' => chr(8804),
'ge;' => chr(8805),
'sub;' => chr(8834),
'sup;' => chr(8835),
'nsub;' => chr(8836),
'sube;' => chr(8838),
'supe;' => chr(8839),
'oplus;' => chr(8853),
'otimes;' => chr(8855),
'perp;' => chr(8869),
'sdot;' => chr(8901),
'lceil;' => chr(8968),
'rceil;' => chr(8969),
'lfloor;' => chr(8970),
'rfloor;' => chr(8971),
'lang;' => chr(9001),
'rang;' => chr(9002),
'loz;' => chr(9674),
'spades;' => chr(9824),
'clubs;' => chr(9827),
'hearts;' => chr(9829),
'diams;' => chr(9830),
) : ())
);
# Make the opposite mapping
while (my($entity, $char) = each(%entity2char)) {
$entity =~ s/;\z//;
$char2entity{$char} = "&$entity;";
}
delete $char2entity{"'"}; # only one-way decoding
# Fill in missing entities
for (0 .. 255) {
next if exists $char2entity{chr($_)};
$char2entity{chr($_)} = "$_;";
}
my %subst; # compiled encoding regexps
sub decode_entities_old
{
my $array;
if (defined wantarray) {
$array = [@_]; # copy
} else {
$array = \@_; # modify in-place
}
my $c;
for (@$array) {
s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
s/(&(\w+);?)/$entity2char{$2} || $1/eg;
}
wantarray ? @$array : $array->[0];
}
sub encode_entities
{
return undef unless defined $_[0];
my $ref;
if (defined wantarray) {
my $x = $_[0];
$ref = \$x; # copy
} else {
$ref = \$_[0]; # modify in-place
}
if (defined $_[1] and length $_[1]) {
unless (exists $subst{$_[1]}) {
# Because we can't compile regex we fake it with a cached sub
my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
$subst{$_[1]} = eval $code;
die( $@ . " while trying to turn range: \"$_[1]\"\n "
. "into code: $code\n "
) if $@;
}
&{$subst{$_[1]}}($$ref);
} else {
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
}
$$ref;
}
sub encode_entities_numeric {
local %char2entity;
return &encode_entities; # a goto &encode_entities wouldn't work
}
sub num_entity {
sprintf "%X;", ord($_[0]);
}
# Set up aliases
*encode = \&encode_entities;
*encode_numeric = \&encode_entities_numeric;
*encode_numerically = \&encode_entities_numeric;
*decode = \&decode_entities;
1;