=head1 NAME Lingua::ZH::Romanize::DictZH - Dictionary converter =head1 SYNOPSIS perl -MLingua::ZH::Romanize::DictZH -e 'Lingua::ZH::Romanize::DictZH->update();' =head1 DESCRIPTION This module creates dictionary cache files for L module. Source dictionary files: (included in this package) Lingua/ZH/Romanize/big5/CTLauBig5.tit Lingua/ZH/Romanize/big5/PY.tit Lingua/ZH/Romanize/gb/CTLau.tit Lingua/ZH/Romanize/gb/PY.tit Cached dictionary files: Lingua/ZH/Romanize/Cantonese.store Lingua/ZH/Romanize/Pinyin.store DictZH is called only from Makefile.PL in the Pinyin package. =head1 REQUIRED MODULES L module is required to create cached dictionary files. Both of L and L modules are required on Perl 5.005 and 5.6.x to install the Pinyin package. L module is used on Perl 5.8.x. =head1 SEE ALSO L =cut package Lingua::ZH::Romanize::DictZH; use strict; use vars qw( $VERSION ); $VERSION = "0.23"; my $PERL581 = 1 if ( $] >= 5.008001 ); my $DICT_FILES = { 'Cantonese' => [qw( cxterm/dict/big5/CTLauBig5.tit cxterm/dict/gb/CTLau.tit )], 'Pinyin' => [qw( cxterm/dict/big5/PY.tit cxterm/dict/gb/PY.tit )], }; sub target { ( keys %$DICT_FILES ); } sub update { my $package = shift; my $base = shift; print "Updater: ", __PACKAGE__, " (", $VERSION, ")\n"; unless ( defined $base ) { $base = $INC{ join( '/', split( '::', (__PACKAGE__) . '.pm' ) ) }; $base =~ s#/[^/]*$##; } my @target = $package->target(); my $update = 0; foreach my $mode ( @target ) { my $storename = $mode.'.store'; my $storepath = $base . '/' . $storename; if ( -r $storepath ) { warn "Already-Exist: $storepath\n"; } else { print "Path: ", $storepath, "\n"; $update++; } } warn "Loading-Module: Storable.pm\n"; &require_storable(); # required if ($PERL581) { warn "Loading-Module: Encode.pm\n"; &require_encode(); # Perl 5.8.x } else { warn "Loading-Module: Unicode::Map\n"; &require_unicode_map(); # Perl 5.005/5.6.x warn "Loading-Module: Unicode::String\n"; &require_unicode_string(); # } foreach my $mode ( @target ) { my $storename = $mode.'.store'; my $hash = {}; my $titlist = $DICT_FILES->{$mode}; foreach my $titpath ( @$titlist ) { warn "Loading-Dictionary: $titpath\n"; $hash = &read_tit_dict( $titpath, $hash ); } foreach my $key ( keys %$hash ) { next unless ref $hash->{$key}; my $list = $hash->{$key}; my $uniq = { map { $_ => 1 } @$list }; foreach my $chk ( @$list ) { next unless ( $chk =~ s/[0-9]+$// ); delete $uniq->{$chk} if exists $uniq->{$chk}; } $hash->{$key} = join( '/', sort keys %$uniq ); } my $storepath = $base . '/' . $storename; warn "Writing-Storable: $storepath\n"; Storable::store( $hash, $storepath ) or die "$! - $storename\n"; } print "Done.\n"; undef; } sub read_tit_dict { my $titname = shift or return; my $hash = shift || {}; my $cmap = {qw( GB GB2312 BIG5 BIG5 KS EUC-KR JIS EUC-JP )}; my $unistr = Unicode::String->new() unless $PERL581; # find ENCODE: and wait until BEGINDICTIONARY open( TIT, $titname ) or die "$! - $titname\n"; my $code; while () { next if /^#/; $code = $cmap->{ uc($1) } if (/^ENCODE:\s*(\S+)/); last if /^BEGINDICTIONARY/; } warn "Dictionary-Encoding: $code\n" if $code; my $unimap = Unicode::Map->new($code) unless $PERL581; while ( my $line = ) { next if ( $line =~ /^#/ ); chomp $line; my ( $roman, $kanji ) = split( /\s+/, $line, 2 ); $roman =~ s/^\\[0-7]{3}//s; # $roman =~ s/\d+$//s; # convert encoding from GB/BIG5 to UTF-8 if ($code) { if ($PERL581) { Encode::from_to( $kanji, $code, 'UTF-8' ); # GB/BIG5 to UTF-8 } else { my $utf16 = $unimap->to_unicode($kanji); # GB/BIG5 to UCS2 $unistr->ucs2($utf16); $kanji = $unistr->utf8(); # UCS2 to UTF-8 } } # split every UTF-8 wide characters while ( $kanji =~ /([\300-\377][\200-\277]+)/g ) { my $char = $1; if ( !exists $hash->{$char} ) { $hash->{$char} = $roman; } elsif ( !ref $hash->{$char} ) { $hash->{$char} = [ $hash->{$char}, $roman ]; } else { push( @{ $hash->{$char} }, $roman ); } } } close(TIT); $hash; } sub require_storable { return if defined $Storable::VERSION; local $@; eval { require Storable; }; die "Storable module is required.\n" if $@; } sub require_encode { return if defined $Encode::VERSION; local $@; eval { require Encode; }; die "Encode module is required.\n" if $@; } sub require_unicode_string { return if defined $Unicode::String::VERSION; local $@; eval { require Unicode::String; }; die "Unicode::String module is required.\n" if $@; } sub require_unicode_map { return if defined $Unicode::Map::VERSION; local $@; eval { require Unicode::Map; }; die "Unicode::Map module is required.\n" if $@; } package Lingua::ZH::Romanize::DictZH::Pinyin; use strict; use base qw( Lingua::ZH::Romanize::DictZH ); sub target { 'Pinyin' } package Lingua::ZH::Romanize::DictZH::Cantonese; use strict; use base qw( Lingua::ZH::Romanize::DictZH ); sub target { 'Cantonese' } 1;