/*
 * Conversions between Shift_JIS byte strings that carry J-SKY emoji escape
 * sequences and UTF-8. Four entry points are defined below:
 *   xs_sjis_jsky1_utf8, xs_sjis_jsky2_utf8, xs_utf8_sjis_jsky1, xs_utf8_sjis_jsky2.
 *
 * NOTE(review): this copy of the file appears to have lost several spans of
 * text (for instance the header name after the second #include, and the text
 * that should follow the opening of every while loop and of several for
 * loops), so a number of statements below are incomplete. The comments
 * describe only what is still visible; restore the file from version control
 * before relying on them.
 */
/* $Id: sjis_jsky.c,v 1.4 2002/10/31 11:08:50 hio Exp $ */ #include "Japanese.h" #include #define ECHO_EJ2U(arg) /*fprintf arg */ #define ON_EJ2U(cmd) /*cmd */ #define ECHO_U2EJ(arg) /*fprintf arg */ #define ON_U2EJ(cmd) /*cmd */ #ifndef __cplusplus #undef bool #undef true #undef false typedef enum bool { false, true, } bool; #endif /* --------------------------------------------------------------------------- * jsky 1 ==> utf8 * xs_sjis_jsky1_utf8(sv_str): returns the SV obtained from the result buffer via SV_Buf_getSv, or a copy of undef when sv_str is &PL_sv_undef. * Visible handling: a sequence whose second byte is a dollar sign and whose third byte is E, F or G is treated as a J-SKY emoji escape and expanded through g_ej2u1_table; bytes 0xa1-0xdf and valid two-byte sequences are looked up in g_s2u_table; bytes reaching the last branch are replaced by a question mark. * NOTE(review): the loop header and part of the escape handling are missing from this copy, so this summary is partial. * ------------------------------------------------------------------------- */ EXTERN_C SV* xs_sjis_jsky1_utf8(SV* sv_str) { const unsigned char* src; int len; SV_Buf result; const unsigned char* src_end; if( sv_str==&PL_sv_undef ) { /* undef in, copy of undef out */ return newSVsv(&PL_sv_undef); } src = (unsigned char*)SvPV(sv_str,PL_na); len = sv_len(sv_str); ECHO_EJ2U((stderr,"Unicode::Japanese::(xs)sjis_jsky_utf8\n",len)); ON_EJ2U( bin_dump("in ",src,len) ); /* second argument is len*3/2+4, presumably an initial capacity - confirm against SV_Buf_init */ SV_Buf_init(&result,len*3/2+4); src_end = src+len; while( src=src_end || src[1]!='$' ) { /* not an emoji: copy this byte through as is */ SV_Buf_append_ch(&result,*src); ++src; continue; } /*fprint(stderr,"detect j-sky emoji-start escape\n"); */ /* E_JSKY_1: only E, F and G are accepted as the third byte */ if( src[2]!='E' && src[2]!='F' && src[2]!='G' ) { /*fprintf(stderr,"first char is invalid"); */ SV_Buf_append_ch(&result,*src); ++src; continue; } begin = src; src += 3; /* E_JSKY_2 */ while( src+1\n",begin[2],*ptr,j1+*ptr); */ /*fprintf(stderr," => %04x\n",g_ej2u1_table[j1+*ptr]); */ const unsigned char* str = (unsigned char*)&g_ej2u1_table[j1+*ptr]; /*fprintf(stderr," len: %d\n",str[3]?4:strlen((char*)str)); */ SV_Buf_append_str(&result,str,str[3]?4:strlen((char*)str)); } /*fprintf(stderr,"j-sky string done.\n"); */ continue; }else if( 0xa1<=src[0] && src[0]<=0xdf ) { /* 半角カナ */ /*fprintf(stderr,"kana": %02x\n",src[0]); */ ptr = (unsigned char*)&g_s2u_table[src[0]]; ++src; }else if( ((0x81<=src[0] && src[0]<=0x9f) || (0xe0<=src[0] && src[0]<=0xfc) ) && (0x40<=src[1] && src[1]<=0xfc && src[1]!=0x7f) ) { /* two-byte character; NOTE(review): the two bytes are read through an unsigned short* cast at an arbitrary byte offset - confirm this is acceptable on the target platforms */ register const unsigned short sjis = ntohs(*(unsigned short*)src); 
/*fprintf(stderr,"sjis: %04x\n",sjis); */ ptr = (unsigned char*)&g_s2u_table[sjis]; src += 2; }else { /* unknown byte: emit a question mark and skip it */ /*fprintf(stderr,"unknown: %02x\n",src[0]); */ SV_Buf_append_ch(&result,'?'); ++src; continue; } /*fprintf(stderr,"utf8-char : %02x %02x %02x %02x\n",ptr[0],ptr[1],ptr[2],ptr[3]); */ /* append the table entry; its byte count (1 to 4) is taken from which of ptr[1], ptr[2], ptr[3] is non-zero */ if( ptr[3] ) { /*fprintf(stderr,"utf8-len: [%d]\n",4); */ SV_Buf_append_ch4(&result,*(int*)ptr); }else if( ptr[2] ) { /*fprintf(stderr,"utf8-len: [%d]\n",3); */ SV_Buf_append_ch3(&result,*(int*)ptr); }else if( ptr[1] ) { /*fprintf(stderr,"utf8-len: [%d]\n",2); */ SV_Buf_append_ch2(&result,*(short*)ptr); }else { /*fprintf(stderr,"utf8-len: [%d]\n",1); */ SV_Buf_append_ch(&result,*ptr); } } ON_EJ2U( bin_dump("out",result.getBegin(),result.getLength()) ); SV_Buf_setLength(&result); return SV_Buf_getSv(&result); } /* --------------------------------------------------------------------------- * jsky 2 ==> utf8 * xs_sjis_jsky2_utf8(sv_str): same visible structure as xs_sjis_jsky1_utf8, except that the third byte of an escape may also be O, P or Q, which selects g_ej2u2_table instead of g_ej2u1_table. Returns the SV obtained from the result buffer, or a copy of undef when sv_str is &PL_sv_undef. * NOTE(review): the loop header and part of the escape handling are missing from this copy, so this summary is partial. * ------------------------------------------------------------------------- */ EXTERN_C SV* xs_sjis_jsky2_utf8(SV* sv_str) { const unsigned char* src; int len; SV_Buf result; const unsigned char* src_end; if( sv_str==&PL_sv_undef ) { /* undef in, copy of undef out */ return newSVsv(&PL_sv_undef); } src = (unsigned char*)SvPV(sv_str,PL_na); len = sv_len(sv_str); ECHO_EJ2U((stderr,"Unicode::Japanese::(xs)sjis_jsky_utf8\n",len)); ON_EJ2U( bin_dump("in ",src,len) ); SV_Buf_init(&result,len*3/2+4); src_end = src+len; while( src=src_end || src[1]!='$' ) { /* not an emoji: copy this byte through as is */ SV_Buf_append_ch(&result,*src); ++src; continue; } /* warn("detect j-sky emoji-start escape\n"); */ /* E_JSKY_1: E, F, G select g_ej2u1_table and O, P, Q select g_ej2u2_table; j1 is the letter offset within its group shifted left by 8 */ if( src[2]=='E' || src[2]=='F' || src[2]=='G' ) { j1 = (src[2]-'E')<<8; table = g_ej2u1_table; }else if( src[2]=='O' || src[2]=='P' || src[2]=='Q' ) { j1 = (src[2]-'O')<<8; table = g_ej2u2_table; }else { j1 = 0; table = NULL; /* warn("first char is invalid"); */ SV_Buf_append_ch(&result,*src); ++src; continue; } begin = src; src += 3; /* E_JSKY_2 */ while( src+1\n",begin[2],*ptr,j1+*ptr); */ /*fprintf(stderr," => 
%04x\n",g_ej2u2_table[j1+*ptr]); */ const unsigned char* str = (unsigned char*)&table[j1+*ptr]; /*fprintf(stderr," len: %d\n",str[3]?4:strlen((char*)str)); */ SV_Buf_append_str(&result,str,str[3]?4:strlen((char*)str)); } /*fprintf(stderr,"j-sky string done.\n"); */ continue; }else if( 0xa1<=src[0] && src[0]<=0xdf ) { /* 半角カナ */ /*fprintf(stderr,"kana": %02x\n",src[0]); */ ptr = (unsigned char*)&g_s2u_table[src[0]]; ++src; }else if( ((0x81<=src[0] && src[0]<=0x9f) || (0xe0<=src[0] && src[0]<=0xfc) ) && (0x40<=src[1] && src[1]<=0xfc && src[1]!=0x7f) ) { /* two-byte character; NOTE(review): the two bytes are read through an unsigned short* cast at an arbitrary byte offset - confirm this is acceptable on the target platforms */ register const unsigned short sjis = ntohs(*(unsigned short*)src); /*fprintf(stderr,"sjis: %04x\n",sjis); */ ptr = (unsigned char*)&g_s2u_table[sjis]; src += 2; }else { /* unknown byte: emit a question mark and skip it */ /*fprintf(stderr,"unknown: %02x\n",src[0]); */ SV_Buf_append_ch(&result,'?'); ++src; continue; } /*fprintf(stderr,"utf8-char : %02x %02x %02x %02x\n",ptr[0],ptr[1],ptr[2],ptr[3]); */ /* append the table entry; its byte count (1 to 4) is taken from which of ptr[1], ptr[2], ptr[3] is non-zero */ if( ptr[3] ) { /*fprintf(stderr,"utf8-len: [%d]\n",4); */ SV_Buf_append_ch4(&result,*(int*)ptr); }else if( ptr[2] ) { /*fprintf(stderr,"utf8-len: [%d]\n",3); */ SV_Buf_append_ch3(&result,*(int*)ptr); }else if( ptr[1] ) { /*fprintf(stderr,"utf8-len: [%d]\n",2); */ SV_Buf_append_ch2(&result,*(short*)ptr); }else { /*fprintf(stderr,"utf8-len: [%d]\n",1); */ SV_Buf_append_ch(&result,*ptr); } } ON_EJ2U( bin_dump("out",result.getBegin(),result.getLength()) ); SV_Buf_setLength(&result); return SV_Buf_getSv(&result); } /* --------------------------------------------------------------------------- * utf8 ==> jsky 1 * xs_utf8_sjis_jsky1(sv_str): two visible passes. Pass 1 fills result: in the emoji branch the code point indexes 5-byte entries of g_eu2j1_table starting from 0x0ff000; code points with bits above 0xFFFF become a question mark; the rest go through g_u2s_table, falling back to the code point itself when it is at most 0x7F and to a question mark otherwise. Pass 2 copies result into pack while packing J-SKY emoji escapes and returns the SV of pack. Returns a copy of undef when sv_str is &PL_sv_undef. * NOTE(review): the UTF-8 decoding and part of the packing loop are missing from this copy, so this summary is partial. * ------------------------------------------------------------------------- */ EXTERN_C SV* xs_utf8_sjis_jsky1(SV* sv_str) { unsigned char* src; int len; SV_Buf result; const unsigned char* src_end; if( sv_str==&PL_sv_undef ) { /* undef in, copy of undef out */ return newSVsv(&PL_sv_undef); } src = (unsigned char*)SvPV(sv_str,PL_na); len = sv_len(sv_str); ECHO_U2EJ((stderr,"Unicode::Japanese::(xs)utf8_sjis\n")); ON_U2EJ( bin_dump("in ",src,len) ); /* second argument is len+4, presumably an initial capacity - confirm against SV_Buf_init */ SV_Buf_init(&result,len+4); src_end 
= src+len; while( src=src_end ) { ECHO_U2EJ((stderr," no enough buffer, here is %d, need %d\n",src_end-src,utf8_len)); SV_Buf_append_ch(&result,'?'); ++src; continue; } /* check that the second and later bytes are in the valid range */ succ = true; for( i=1; i=4); if( ucs<0x0ff000 ) { /* a region we do not know about: emit a question mark */ SV_Buf_append_ch(&result,'?'); src += utf8_len; continue; } /* emoji check (j-sky): each table entry is 5 bytes, indexed by the offset of ucs from 0x0ff000 */ sjis = &g_eu2j1_table[(ucs - 0x0ff000)*5]; /*fprintf(stderr," emoji: %02x %02x %02x %02x %02x\n", */ /* sjis[0],sjis[1],sjis[2],sjis[3],sjis[4]); */ if( sjis[4]!=0 ) { /* emit as a 5-byte sequence */ SV_Buf_append_ch5(&result,sjis); }else if( sjis[3]!=0 ) { /* emit as a 4-byte sequence (asserted unreachable) */ assert("not reach here" && 0); SV_Buf_append_ch4(&result,*(const int*)(sjis)); }else if( sjis[2]!=0 ) { /* emit as a 3-byte sequence (asserted unreachable) */ assert("not reach here" && 0); SV_Buf_append_ch3(&result,*(const int*)(sjis)); }else if( sjis[1]!=0 ) { /* emit as a 2-byte sequence */ SV_Buf_append_ch2(&result,*(const unsigned short*)(sjis)); }else if( sjis[0]!=0 ) { /* emit as a single byte */ SV_Buf_append_ch(&result,*sjis); }else { /* no mapping */ SV_Buf_append_ch(&result,'?'); } src += utf8_len; continue; } if( ucs & ~0xFFFF ) { /* outside the ucs2 range (ucs4 range) */ SV_Buf_append_ch(&result,'?'); src += utf8_len; continue; } /* ucs => sjis */ ECHO_U2EJ((stderr,"ucs2 [%04x]\n",ucs)); /*const unsigned short sjis = g_u2s_table[ucs]; */ ECHO_U2EJ((stderr,"sjis [%04x]\n",ntohs(sjis) )); if( g_u2s_table[ucs] || !ucs ) { /* when a mapping exists, or when ucs is zero; two bytes are emitted if the high byte of the entry is set, otherwise one */ if( g_u2s_table[ucs] & 0xff00 ) { SV_Buf_append_ch2(&result,g_u2s_table[ucs]); }else { SV_Buf_append_ch(&result,(unsigned char)g_u2s_table[ucs]); } }else if( ucs<=0x7F ) { SV_Buf_append_ch(&result,(unsigned char)ucs); }else { SV_Buf_append_ch(&result,'?'); } src += utf8_len; /*bin_dump("now",dst_begin,dst-dst_begin); */ } /* for */ ON_U2EJ( bin_dump("out",result.getBegin(),result.getLength()) ); SV_Buf_setLength(&result); /* the pass-1 SV is handed to sv_2mortal; only the packed copy built below is returned */ sv_2mortal(SV_Buf_getSv(&result)); { /* packing J-SKY emoji escapes: tmpl holds ESC, dollar, two zero placeholders and 0x0f; tmpl[3] is overwritten from src[3] below before each memcmp, and a matching sequence contributes only its fourth byte to pack */ SV_Buf pack; unsigned char* ptr; unsigned char tmpl[5] = { '\x1b','$',0,0,'\x0f',}; 
SV_Buf_init(&pack,SV_Buf_getLength(&result)); src = SV_Buf_getBegin(&result); src_end = src + SV_Buf_getLength(&result); ptr = src; for( ; src+5*2-1=5; src+= 5 ) { tmpl[3] = src[3]; if( memcmp(src,tmpl,5)!=0 ) break; /*fprintf(stderr," packing...[%02x]\n",src[3]); */ SV_Buf_append_ch(&pack,src[3]); } /*fprintf(stderr," pack done.\n"); */ SV_Buf_append_ch(&pack,'\x0f'); ptr = src; } /*fprintf(stderr," pack complete.\n"); */ /*fprintf(stderr," append len %0d\n",src_end-ptr); */ /* append the bytes from ptr up to src_end */ if( ptr!=src_end ) { SV_Buf_append_str(&pack,ptr,src_end-ptr); } ON_U2EJ( bin_dump("out",pack.getBegin(),pack.getLength()) ); SV_Buf_setLength(&pack); return SV_Buf_getSv(&pack); } } /* --------------------------------------------------------------------------- * utf8 ==> jsky 2 * xs_utf8_sjis_jsky2(sv_str): same visible structure as xs_utf8_sjis_jsky1, except that the emoji branch reads g_eu2j2_table instead of g_eu2j1_table. Returns the SV of the packed buffer, or a copy of undef when sv_str is &PL_sv_undef. * NOTE(review): the UTF-8 decoding and part of the packing loop are missing from this copy, so this summary is partial. * ------------------------------------------------------------------------- */ EXTERN_C SV* xs_utf8_sjis_jsky2(SV* sv_str) { unsigned char* src; int len; SV_Buf result; const unsigned char* src_end; if( sv_str==&PL_sv_undef ) { /* undef in, copy of undef out */ return newSVsv(&PL_sv_undef); } src = (unsigned char*)SvPV(sv_str,PL_na); len = sv_len(sv_str); ECHO_U2EJ((stderr,"Unicode::Japanese::(xs)utf8_sjis\n")); ON_U2EJ( bin_dump("in ",src,len) ); SV_Buf_init(&result,len+4); src_end = src+len; while( src=src_end ) { ECHO_U2EJ((stderr," no enough buffer, here is %d, need %d\n",src_end-src,utf8_len)); SV_Buf_append_ch(&result,'?'); ++src; continue; } /* check that the second and later bytes are in the valid range */ succ = true; for( i=1; i=4); if( ucs<0x0ff000 ) { /* a region we do not know about: emit a question mark */ SV_Buf_append_ch(&result,'?'); src += utf8_len; continue; } /* emoji check (j-sky): each table entry is 5 bytes, indexed by the offset of ucs from 0x0ff000 */ sjis = &g_eu2j2_table[(ucs - 0x0ff000)*5]; /*fprintf(stderr," emoji: %02x %02x %02x %02x %02x\n", */ /* sjis[0],sjis[1],sjis[2],sjis[3],sjis[4]); */ if( sjis[4]!=0 ) { /* emit as a 5-byte sequence */ SV_Buf_append_ch5(&result,sjis); }else if( sjis[3]!=0 ) { /* emit as a 4-byte sequence (asserted unreachable) */ assert("not reach here" && 0); SV_Buf_append_ch4(&result,*(const int*)(sjis)); }else if( sjis[2]!=0 ) { /* emit as a 3-byte sequence (asserted unreachable) 
*/ assert("not reach here" && 0); SV_Buf_append_ch3(&result,*(const int*)(sjis)); }else if( sjis[1]!=0 ) { /* emit as a 2-byte sequence */ SV_Buf_append_ch2(&result,*(const unsigned short*)(sjis)); }else if( sjis[0]!=0 ) { /* emit as a single byte */ SV_Buf_append_ch(&result,*sjis); }else { /* no mapping */ SV_Buf_append_ch(&result,'?'); } src += utf8_len; continue; } if( ucs & ~0xFFFF ) { /* outside the ucs2 range (ucs4 range) */ SV_Buf_append_ch(&result,'?'); src += utf8_len; continue; } /* ucs => sjis */ ECHO_U2EJ((stderr,"ucs2 [%04x]\n",ucs)); /*const unsigned short sjis = g_u2s_table[ucs]; */ ECHO_U2EJ((stderr,"sjis [%04x]\n",ntohs(sjis) )); if( g_u2s_table[ucs] || !ucs ) { /* when a mapping exists, or when ucs is zero; two bytes are emitted if the high byte of the entry is set, otherwise one */ if( g_u2s_table[ucs] & 0xff00 ) { SV_Buf_append_ch2(&result,g_u2s_table[ucs]); }else { SV_Buf_append_ch(&result,(unsigned char)g_u2s_table[ucs]); } }else if( ucs<=0x7F ) { SV_Buf_append_ch(&result,(unsigned char)ucs); }else { SV_Buf_append_ch(&result,'?'); } src += utf8_len; /*bin_dump("now",dst_begin,dst-dst_begin); */ } /* for */ ON_U2EJ( bin_dump("out",result.getBegin(),result.getLength()) ); SV_Buf_setLength(&result); /* the pass-1 SV is handed to sv_2mortal; only the packed copy built below is returned */ sv_2mortal(SV_Buf_getSv(&result)); { /* packing J-SKY emoji escapes; NOTE(review): tmpl is used below but no declaration of it is visible in this block - it may sit in the text missing from this copy, confirm */ SV_Buf pack; unsigned char* ptr; SV_Buf_init(&pack,SV_Buf_getLength(&result)); src = SV_Buf_getBegin(&result); src_end = src + SV_Buf_getLength(&result); ptr = src; for( ; src+5*2-1=5; src+= 5 ) { tmpl[3] = src[3]; if( memcmp(src,tmpl,5)!=0 ) break; /*fprintf(stderr," packing...[%02x]\n",src[3]); */ SV_Buf_append_ch(&pack,src[3]); } /*fprintf(stderr," pack done.\n"); */ SV_Buf_append_ch(&pack,'\x0f'); ptr = src; } /*fprintf(stderr," pack complete.\n"); */ /*fprintf(stderr," append len %0d\n",src_end-ptr); */ /* append the bytes from ptr up to src_end */ if( ptr!=src_end ) { SV_Buf_append_str(&pack,ptr,src_end-ptr); } ON_U2EJ( bin_dump("out",pack.getBegin(),pack.getLength()) ); SV_Buf_setLength(&pack); return SV_Buf_getSv(&pack); } }