use strict;
use warnings;
use Test::More qw(no_plan);
use MARC::Charset::Constants ':all';
use MARC::Charset 'utf8_to_marc8';
## MAKE SURE ALL THE CHARACTER SETS ARE THERE
##
## Each row below is [ Unicode input, expected MARC-8 string, test name ].
## The first two rows expect a bare byte; every other row expects the
## character's byte(s) wrapped between a leading and a trailing escape
## sequence built from the MARC::Charset::Constants names.
for my $case (
    [ chr(0x0041), chr(0x41), 'ASCII' ],
    [ chr(0x0131), chr(0xB8), 'Ansel' ],
    [
        chr(0x0628),
        ESCAPE . SINGLE_G0_A . BASIC_ARABIC . chr(0x48)
            . ESCAPE . ASCII_DEFAULT,
        'Basic Arabic',
    ],
    [
        chr(0x068D),
        ESCAPE . SINGLE_G1_A . EXTENDED_ARABIC . chr(0xB9)
            . ESCAPE . SINGLE_G1_A . EXTENDED_LATIN,
        'Extended Arabic',
    ],
    [
        chr(0x0440),
        ESCAPE . SINGLE_G0_A . BASIC_CYRILLIC . chr(0x52)
            . ESCAPE . ASCII_DEFAULT,
        'Basic Cyrillic',
    ],
    [
        chr(0x0408),
        ESCAPE . SINGLE_G1_A . EXTENDED_CYRILLIC . chr(0xE8)
            . ESCAPE . SINGLE_G1_A . EXTENDED_LATIN,
        'Extended Cyrillic',
    ],
    [
        chr(0x0398),
        ESCAPE . SINGLE_G0_A . BASIC_GREEK . chr(0x4B)
            . ESCAPE . ASCII_DEFAULT,
        'Greek',
    ],

    ## note: Greek Symbols has no row here, because when mapping from utf8
    ## to marc8 we always use the Greek character set instead
    [
        chr(0x05E0),
        ESCAPE . SINGLE_G0_A . BASIC_HEBREW . chr(0x70)
            . ESCAPE . ASCII_DEFAULT,
        'Hebrew',
    ],
    [
        chr(0x2083),
        ESCAPE . SUBSCRIPTS . chr(0x33) . ESCAPE . ASCII_DEFAULT,
        'Subscripts',
    ],
    [
        chr(0x2074),
        ESCAPE . SUPERSCRIPTS . chr(0x34) . ESCAPE . ASCII_DEFAULT,
        'Superscripts',
    ],
    [
        chr(0x71AC),
        ESCAPE . MULTI_G0_A . CJK . chr(0x21) . chr(0x49) . chr(0x7C)
            . ESCAPE . ASCII_DEFAULT,
        'East Asian',
    ],
) {
    my ( $utf8, $expected, $name ) = @{$case};
    is( utf8_to_marc8($utf8), $expected, $name );
}
## COMBINING CHARACTERS
##
## Each row is [ Unicode input, expected MARC-8 string, test name ].
## In the input every combining mark follows its base letter; in the
## expected output the corresponding bytes come before that letter.
for my $case (
    [
        'c' . chr(0x0327) . 'edilla',
        chr(0xF0) . 'cedilla',
        'string with interior combining character',
    ],
    [
        'abc' . chr(0x0327) . chr(0x0300) . chr(0x0301) . 'def',
        'ab' . chr(0xF0) . chr(0xE1) . chr(0xE2) . 'cdef',
        'string with multiple interior combining characters',
    ],
) {
    my ( $utf8, $expected, $name ) = @{$case};
    is( utf8_to_marc8($utf8), $expected, $name );
}
## ESCAPING TO OTHER CHARACTER SETS
##
## Each row is [ Unicode input, expected MARC-8 string, test name ].
## The rows cover a single non-ASCII character, two adjacent characters
## from different character sets, and two Greek letters separated by a
## space.
for my $case (
    [
        chr(0x043A),
        ESCAPE . SINGLE_G0_A . BASIC_CYRILLIC . chr(0x4B)
            . ESCAPE . ASCII_DEFAULT,
        'CYRILLIC SMALL LETTER KA',
    ],
    [
        chr(0x05D0) . chr(0x043B),
        ESCAPE . SINGLE_G0_A . BASIC_HEBREW . chr(0x60)
            . ESCAPE . SINGLE_G0_A . BASIC_CYRILLIC . chr(0x4C)
            . ESCAPE . ASCII_DEFAULT,
        'string with multiple character sets',
    ],
    [
        chr(0x0396) . ' ' . chr(0x0398),
        ESCAPE . SINGLE_G0_A . BASIC_GREEK    ## set G0 to Greek
            . chr(0x49)                       ## ZETA
            . ' '                             ## SPACE
            . chr(0x4B)                       ## THETA
            . ESCAPE . ASCII_DEFAULT,         ## Back to ASCII
        'greek utf8 with an internal space',
    ],
) {
    my ( $utf8, $expected, $name ) = @{$case};
    is( utf8_to_marc8($utf8), $expected, $name );
}