/*
 * XS implementation of ShiftJIS::X0213::MapUTF.
 *
 * The C preamble defines statement macros shared by the two XSUBs below,
 * a helper that appends the result of a Perl callback to the output, and
 * per-alias dispatch tables.  The XS section defines two aliased XSUBs:
 * one converting Shift_JIS-encoded bytes to Unicode/UTF-*, the other
 * converting Unicode/UTF-* to Shift_JIS-encoded bytes.
 */
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

#include "sxmuni.h"
#include "fmsj0213.h"
#include "tosj0213.h"

#define PkgName "ShiftJIS::X0213::MapUTF"

/* Byte classes: single byte, leading byte and trailing byte of a pair. */
#define Is_SJIS_SNG(i)   ((0x00<=(i) && (i)<=0x7F) || (0xA1<=(i) && (i)<=0xDF))
#define Is_SJIS_LED(i)   ((0x81<=(i) && (i)<=0x9F) || (0xE0<=(i) && (i)<=0xFC))
#define Is_SJIS_TRL(i)   ((0x40<=(i) && (i)<=0x7E) || (0x80<=(i) && (i)<=0xFC))

/*
 * Sets cvref and src from the XSUB arguments.
 * If the first argument is a reference it must be a CODE reference
 * (otherwise croaks, naming func_name); it becomes cvref and the string
 * is taken from the second argument.  Otherwise cvref is NULL and the
 * string is the first argument.  A missing string argument is treated
 * as undef, so ST(0) is never read when the XSUB gets no arguments.
 * Needs in scope: items, cvref, src.
 */
#define STMT_ASSIGN_CVREF_AND_SRC(func_name) \
    cvref = NULL; \
    if (0 < items && SvROK(ST(0))) { \
        if (SvTYPE(SvRV(ST(0))) == SVt_PVCV) \
            cvref = SvRV(ST(0)); \
        else \
            croak("RV other than CODEREF " \
                "cannot be used in %s", func_name); \
    } \
    src = cvref \
        ? ((1 < items) ? ST(1) : &PL_sv_undef) \
        : ((0 < items) ? ST(0) : &PL_sv_undef);

/*
 * Sets s/e to the start/end of src's string buffer and creates dst, a
 * mortal string SV with room for srclen * maxlen + 1 bytes.
 * Croaks if that size would overflow STRLEN.  dst starts out as a
 * properly terminated empty string, so it is valid even when nothing is
 * appended to it afterwards.
 * Needs in scope: src, srclen, s, e, dstlen, dst.
 */
#define STMT_ASSIGN_LENDST(maxlen) \
    s = (U8*)SvPV(src,srclen); \
    e = s + srclen; \
    if (srclen > (((STRLEN)-1) - 1) / (maxlen)) \
        croak("string is too long in " PkgName); \
    dstlen = srclen * (maxlen) + 1; \
    dst = sv_2mortal(newSV(dstlen)); \
    (void)SvPOK_only(dst); \
    *SvPVX(dst) = '\0'; \
    SvCUR_set(dst, 0);

/*
 * Sets mblen to the byte length of the character at p:
 * 2 for a leading byte followed (before e) by a trailing byte,
 * 1 for a single byte, 0 for anything else (malformed).
 */
#define STMT_GET_MBLEN \
    mblen = Is_SJIS_LED(*(p)) && 2 <= (e - p) \
        ? (Is_SJIS_TRL((p)[1])) ? 2 : 0 \
        : Is_SJIS_SNG(*(p)) ? 1 : 0;

/*
 * Looks up the character at p in fmsjis0213_tbl and stores the result
 * in uv.  When use2004 is false, a value for which isADDED2004() holds
 * is replaced by 0 (treated as unmapped).
 */
#define STMT_GET_UV_FROM_MB \
    lb = fmsjis0213_tbl[*p]; \
    uv = lb.tbl ? lb.tbl[p[1]] : lb.sbc; \
    if (!use2004 && isADDED2004(uv)) \
        uv = 0;

/*
 * Computes j, the table value for code point uv (0 if there is none).
 * If isbase(uv) holds and input remains, the next code point is decoded
 * into uv2 and getcomposite(uv, uv2) is tried first; on success the
 * second code point is consumed (p is advanced by retlen).
 * When use2004 is false and isADDED2004(uv) holds, j is forced to 0.
 * Otherwise, if no composite was found, uv is looked up in the
 * three-level table tosjis0213_tbl.
 */
#define STMT_FETCH_FROM_UV_AND_UV2 \
    j = 0; \
    if (p < e && isbase(uv)) { \
        uv2 = id_utf \
            ? ord_uv(p, e - p, &retlen) \
            : utf8n_to_uvuni(p, (e - p), &retlen, 0); \
        if (retlen) \
            j = (U16)getcomposite(uv, uv2); \
        if (j) \
            p += retlen; \
    } \
    if (!use2004 && isADDED2004(uv)) \
        j = 0; \
    else if (!j) { \
        tbl_plain = Is_VALID_UTF(uv) \
            ? tosjis0213_tbl[uv >> 16] \
            : NULL; \
        tbl_row = tbl_plain \
            ? tbl_plain[(uv >> 8) & 0xff] \
            : NULL; \
        j = tbl_row ? tbl_row[uv & 0xff] : 0; \
    }

/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Calls the Perl code cv in scalar context and appends its return value
 * to dst.
 *
 * dst    - output string being built
 * cv     - the CODE to call
 * sv     - argument for the callback; it is mortalized here, so the
 *          caller hands over a fresh SV it does not free itself
 * isbyte - when true, undef is pushed as an extra first argument
 *          before sv
 *
 * Croaks if the callback died (the call is made with G_EVAL and $@ is
 * checked afterwards) or if it did not return exactly one value.
 */
static void
sv_cat_retcvref (SV *dst, SV *cv, SV *sv, bool isbyte)
{
    dSP;
    int count;
    ENTER;
    SAVETMPS;
    PUSHMARK(SP);
    if (isbyte)
        XPUSHs(&PL_sv_undef);
    XPUSHs(sv_2mortal(sv));
    PUTBACK;
    count = call_sv(cv, (G_EVAL|G_SCALAR));
    SPAGAIN;
    if (SvTRUE(ERRSV) || count != 1) {
        croak("died in XS, " PkgName "\n");
    }
    sv_catsv(dst,POPs);
    PUTBACK;
    FREETMPS;
    LEAVE;
}

/* Number of target / source encodings handled by each XSUB. */
#define NUM_toUTF   (6)
#define NUM_fromUTF (8)

/* Function names indexed by alias number (ix); used in error messages. */
static const char *funcname_to[2 * NUM_toUTF] = {
    "sjis2004_to_unicode",
    "sjis2004_to_utf8",
    "sjis2004_to_utf16le",
    "sjis2004_to_utf16be",
    "sjis2004_to_utf32le",
    "sjis2004_to_utf32be",
    "sjis0213_to_unicode",
    "sjis0213_to_utf8",
    "sjis0213_to_utf16le",
    "sjis0213_to_utf16be",
    "sjis0213_to_utf32le",
    "sjis0213_to_utf32be",
};

static const char *funcname_fm[2 * NUM_fromUTF] = {
    "unicode_to_sjis2004",
    "utf8_to_sjis2004",
    "utf16le_to_sjis2004",
    "utf16be_to_sjis2004",
    "utf32le_to_sjis2004",
    "utf32be_to_sjis2004",
    "utf16_to_sjis2004",
    "utf32_to_sjis2004",
    "unicode_to_sjis0213",
    "utf8_to_sjis0213",
    "utf16le_to_sjis0213",
    "utf16be_to_sjis0213",
    "utf32le_to_sjis0213",
    "utf32be_to_sjis0213",
    "utf16_to_sjis0213",
    "utf32_to_sjis0213",
};

/* Output-size multipliers, indexed by encoding id (ix modulo NUM_*). */
static STRLEN maxlen_to[NUM_toUTF] = {
    MaxLenToUni,
    MaxLenToU8,
    MaxLenToU16,
    MaxLenToU16,
    MaxLenToU32,
    MaxLenToU32,
};

static STRLEN maxlen_fm[NUM_fromUTF] = {
    MaxLenFmUni,
    MaxLenFmU8,
    MaxLenFmU16,
    MaxLenFmU16,
    MaxLenFmU32,
    MaxLenFmU32,
    MaxLenFmU16,
    MaxLenFmU32,
};

/*
 * Encoders indexed by encoding id.  Entry 0 is NULL because id 0 uses
 * uvuni_to_utf8 directly.
 */
static U8* (*app_uv_in[NUM_toUTF])(U8*, UV) = {
    NULL,
    app_in_utf8,
    app_in_utf16le,
    app_in_utf16be,
    app_in_utf32le,
    app_in_utf32be,
};

/*
 * Decoders indexed by encoding id.  Entry 0 is NULL because id 0 uses
 * utf8n_to_uvuni directly.  Ids 6 and 7 default to big-endian; the XSUB
 * switches to little-endian when it finds a little-endian BOM.
 */
static UV (*ord_uv_in[NUM_fromUTF])(U8 *, STRLEN, STRLEN *) = {
    NULL,
    ord_in_utf8,
    ord_in_utf16le,
    ord_in_utf16be,
    ord_in_utf32le,
    ord_in_utf32be,
    ord_in_utf16be, /* w/o BOM*/
    ord_in_utf32be, /* w/o BOM*/
};

MODULE = ShiftJIS::X0213::MapUTF	PACKAGE = ShiftJIS::X0213::MapUTF

PROTOTYPES: DISABLE

# Converts a byte string via fmsjis0213_tbl to the encoding selected by
# the alias.  Arguments: an optional CODE reference followed by the string.
# Aliases 0-5 (sjis2004_*) accept every table entry; aliases 6-11
# (sjis0213_*) treat entries flagged by isADDED2004() as unmapped.
# With a CODE reference, it is called as (undef, byte value) for a
# malformed byte and as (character bytes) for an unmapped character, and
# its return value is appended to the result.  Without one, malformed
# bytes and unmapped characters are skipped.
void
sjis2004_to_unicode (...)
  ALIAS:
    sjis2004_to_utf8    =  1
    sjis2004_to_utf16le =  2
    sjis2004_to_utf16be =  3
    sjis2004_to_utf32le =  4
    sjis2004_to_utf32be =  5
    sjis0213_to_unicode =  6
    sjis0213_to_utf8    =  7
    sjis0213_to_utf16le =  8
    sjis0213_to_utf16be =  9
    sjis0213_to_utf32le = 10
    sjis0213_to_utf32be = 11
  PREINIT:
    SV *src, *dst, *cvref;
    STRLEN srclen, dstlen, mblen, ulen;
    U8 *s, *e, *p, *d, uni[UTF8_MAXLEN + 1];
    UV uv, u_temp;
    struct leading lb;
    U8* (*app_uv)(U8*, UV);
    int id_utf, use2004;
  PPCODE:
    use2004 = ix < NUM_toUTF;
    id_utf  = ix % NUM_toUTF;
    STMT_ASSIGN_CVREF_AND_SRC(funcname_to[ix])
    /* The input is a byte string: work on a downgraded copy if needed. */
    if (SvUTF8(src)) {
        src = sv_mortalcopy(src);
        sv_utf8_downgrade(src, 0);
    }
    STMT_ASSIGN_LENDST(maxlen_to[id_utf])
    /* Only the "unicode" target (id 0) yields a UTF8-flagged string. */
    if (id_utf == 0)
        SvUTF8_on(dst);
    app_uv = app_uv_in[id_utf];
    if (cvref) {
        /* Callback mode: append piecewise, since the callback may
           return a string of any length. */
        for (p = s; p < e; p += mblen) {
            STMT_GET_MBLEN
            if (!mblen) {
                sv_cat_retcvref(dst, cvref, newSVuv((UV)*p), TRUE);
                p++;
                continue;
            }
            STMT_GET_UV_FROM_MB
            /* uv == 0 is a real mapping only for the NUL byte. */
            if (uv || !*p) {
                if (Is_VALID_UTF(uv)) {
                    ulen = id_utf
                        ? app_uv(uni, uv) - uni
                        : uvuni_to_utf8(uni, uv) - uni;
                    sv_catpvn(dst, (char*)uni, ulen);
                }
                else {
                    /* uv holds two code points: upper part (uv >> 16)
                       first, then the low 16 bits. */
                    u_temp = (uv >> 16);
                    ulen = id_utf
                        ? app_uv(uni, u_temp) - uni
                        : uvuni_to_utf8(uni, u_temp) - uni;
                    sv_catpvn(dst, (char*)uni, ulen);
                    u_temp = (uv & 0xFFFF);
                    ulen = id_utf
                        ? app_uv(uni, u_temp) - uni
                        : uvuni_to_utf8(uni, u_temp) - uni;
                    sv_catpvn(dst, (char*)uni, ulen);
                }
            }
            else
                sv_cat_retcvref(dst, cvref,
                    newSVpvn((char*)p, mblen), FALSE);
        }
    }
    else {
        /* Direct mode: write straight into the preallocated buffer. */
        d = (U8*)SvPVX(dst);
        for (p = s; p < e; p += mblen) {
            STMT_GET_MBLEN
            if (!mblen) {
                p++;
                continue;
            }
            STMT_GET_UV_FROM_MB
            if (uv || !*p) {
                if (Is_VALID_UTF(uv)) {
                    d = id_utf ? app_uv(d, uv) : uvuni_to_utf8(d, uv);
                }
                else {
                    u_temp = (uv >> 16);
                    d = id_utf
                        ? app_uv(d, u_temp)
                        : uvuni_to_utf8(d, u_temp);
                    u_temp = (uv & 0xFFFF);
                    d = id_utf
                        ? app_uv(d, u_temp)
                        : uvuni_to_utf8(d, u_temp);
                }
            }
        }
        *d = '\0';
        SvCUR_set(dst, d - (U8*)SvPVX(dst));
    }
    XPUSHs(dst);

# Converts a string in the encoding selected by the alias to bytes via
# tosjis0213_tbl.  Arguments: an optional CODE reference followed by the
# string.  Aliases 0-7 (*_to_sjis2004) accept every code point in the
# table; aliases 8-15 (*_to_sjis0213) treat code points flagged by
# isADDED2004() as unmapped.  For the utf16 and utf32 aliases (id 6 and 7)
# a leading BOM is removed and selects the byte order; big-endian is used
# when there is none.
# With a CODE reference, it is called as (undef, byte value) for an
# undecodable byte and as (code point) for an unmapped character, and its
# return value is appended to the result.  Without one, such input is
# skipped.
void
unicode_to_sjis2004 (...)
  ALIAS:
    utf8_to_sjis2004    =  1
    utf16le_to_sjis2004 =  2
    utf16be_to_sjis2004 =  3
    utf32le_to_sjis2004 =  4
    utf32be_to_sjis2004 =  5
    utf16_to_sjis2004   =  6
    utf32_to_sjis2004   =  7
    unicode_to_sjis0213 =  8
    utf8_to_sjis0213    =  9
    utf16le_to_sjis0213 = 10
    utf16be_to_sjis0213 = 11
    utf32le_to_sjis0213 = 12
    utf32be_to_sjis0213 = 13
    utf16_to_sjis0213   = 14
    utf32_to_sjis0213   = 15
  PREINIT:
    SV *src, *dst, *cvref;
    STRLEN srclen, dstlen, retlen;
    U8 *s, *e, *p, *d, mbc[3];
    U16 j, *tbl_row, **tbl_plain;
    UV uv, uv2;
    UV (*ord_uv)(U8 *, STRLEN, STRLEN *);
    int id_utf, use2004;
  PPCODE:
    use2004 = ix < NUM_fromUTF;
    id_utf  = ix % NUM_fromUTF;
    STMT_ASSIGN_CVREF_AND_SRC(funcname_fm[ix])
    /* Id 0 reads Perl's internal UTF-8 form; every other id reads raw
       bytes.  Convert a copy of the argument when its flag differs. */
    if (id_utf == 0 && !SvUTF8(src)) {
        src = sv_mortalcopy(src);
        sv_utf8_upgrade(src);
    }
    else if (id_utf && SvUTF8(src)) {
        src = sv_mortalcopy(src);
        sv_utf8_downgrade(src, FALSE);
    }
    STMT_ASSIGN_LENDST(maxlen_fm[id_utf])
    ord_uv = ord_uv_in[id_utf];
    if (id_utf == 6 && 2 <= e - s) { /* UTF-16 */
        if (memEQ("\xFF\xFE",s,2)) {
            s += 2;
            ord_uv = ord_in_utf16le;
        }
        else if (memEQ("\xFE\xFF",s,2)) {
            s += 2;
        }
    }
    else if (id_utf == 7 && 4 <= e - s) { /* UTF-32 */
        if (memEQ("\xFF\xFE\x00\x00",s,4)) {
            s += 4;
            ord_uv = ord_in_utf32le;
        }
        else if (memEQ("\x00\x00\xFE\xFF",s,4)) {
            s += 4;
        }
    }
    if (cvref) {
        /* Callback mode: append piecewise, since the callback may
           return a string of any length. */
        for (p = s; p < e;) {
            uv = id_utf
                ? ord_uv(p, e - p, &retlen)
                : utf8n_to_uvuni(p, (e - p), &retlen, 0);
            if (retlen)
                p += retlen;
            else {
                sv_cat_retcvref(dst, cvref, newSVuv((UV)*p), TRUE);
                p++;
                continue;
            }
            STMT_FETCH_FROM_UV_AND_UV2
            /* j == 0 is a real mapping only for code point 0. */
            if (j || !uv) {
                if (j >= 256) {
                    mbc[0] = (U8)(j >> 8);
                    mbc[1] = (U8)(j & 0xff);
                    sv_catpvn(dst, (char*)mbc, 2);
                }
                else {
                    mbc[0] = (U8)(j & 0xff);
                    sv_catpvn(dst, (char*)mbc, 1);
                }
            }
            else
                sv_cat_retcvref(dst, cvref, newSVuv(uv), FALSE);
        }
    }
    else {
        /* Direct mode: write straight into the preallocated buffer. */
        d = (U8*)SvPVX(dst);
        for (p = s; p < e;) {
            uv = id_utf
                ? ord_uv(p, e - p, &retlen)
                : utf8n_to_uvuni(p, (e - p), &retlen, 0);
            if (retlen)
                p += retlen;
            else {
                p++;
                continue;
            }
            STMT_FETCH_FROM_UV_AND_UV2
            if (j || !uv) {
                if (j >= 256)
                    *d++ = (U8)(j >> 8);
                *d++ = (U8)(j & 0xff);
            }
        }
        *d = '\0';
        SvCUR_set(dst, d - (U8*)SvPVX(dst));
    }
    XPUSHs(dst);