umap - metacpan.org

#!/usr/bin/perl -w

=head1 NAME

umap - map between different character sets

=head1 SYNOPSIS

 umap [options] <before>:<after>

=head1 DESCRIPTION

The I<umap> script acts as a filter between different encodings and
character sets.

The following options are recognized:

=over 4

=item --list [charset]

Without an argument, list all recognized character sets.  With a
character set specified, list the mapping between that set and Unicode.

=item --strict

Do the strict mapping between the character sets.  The default is to
leave unmapped characters untranslated.  With I<--strict> we will remove
unmapped characters or use the default specified with I<--def8> or
I<--def16>.

=item --def8=<charcode>

Set the default 8-bit code for unmapped chars.

=item --def16=<charcode>

Set the default 16-bit code for unmapped chars.

=item --verbose

Generate more verbose output.

=item --version

Print the version number of this program and quit.

=item --help

Print the usage message.

=back

=head1 SEE ALSO

L<Unicode::String>,
L<Unicode::Map8>,
recode(1)

=head1 COPYRIGHT

Copyright 1998 Gisle Aas.

This is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut


use strict;
use Getopt::Long  qw(GetOptions);

my $VERSION = "1.05";

# Command-line state.  These stay file-scoped lexicals because the code and
# the subroutines further down read them directly.
my ($list, $strict, $verbose);   # --list, --[no]strict, --verbose
my ($def8, $def16);              # --def8, --def16 replacement codes
my ($before, $after);            # source and destination charset names

# Option table: --version and --help run a handler straight away, every
# other option is stored in the variable its entry points at.
my @option_spec = (
    'version' => \&print_version,
    'help'    => \&usage,
    'list:s'  => \$list,
    'verbose' => \$verbose,
    'strict!' => \$strict,
    'def8=i'  => \$def8,
    'def16=i' => \$def16,
);
GetOptions(@option_spec) or usage();

# --list takes an optional value: a bare "--list" arrives as the empty
# string (list every charset), "--list NAME" carries the charset to dump.
# Either way the program is done afterwards.
if (defined $list) {
    if ($list eq "") {
        list_charsets();
    }
    else {
        list_charset($list);
    }
    exit;
}

# Work out $before and $after from whatever is left in @ARGV.  A single
# remaining argument is split at the first colon that follows a character
# other than a backslash; otherwise the two names are separate arguments.
# Missing names fall back to the UMAP_BEFORE / UMAP_AFTER environment
# variables and then to "latin1" / "utf8".
$before = shift(@ARGV) || $ENV{UMAP_BEFORE} || "latin1";
if (@ARGV == 0 && $before =~ s/([^\\]):/$1\0/) {
    ($before, $after) = split('\0', $before, 2);
}
$after ||= shift(@ARGV) || $ENV{UMAP_AFTER} || "utf8";

# "\:" only served to protect a literal colon; turn it back into ":".
$before =~ s/\\:/:/g;
$after  =~ s/\\:/:/g;

# Anything still left on the command line is a usage error.
if (@ARGV) {
    usage();
}

if ($verbose) {
    print STDERR "$before --> $after\n";
}


#------------------------------------------------------------------
package MySpace;  # use a new namespace

# Conversion stage: build an input converter and an output writer from the
# charset names chosen above, then pump STDIN through them.  The subs that
# are installed by name below (one named after the input charset, plus
# out()) are created in this package.  The file-scoped lexicals $before,
# $after, $strict, $def8 and $def16 set during option parsing are still
# visible here.

# Import the listed Unicode::String functions into this package, so that an
# input charset with one of these names resolves to the imported function.
use Unicode::String 2.00 qw(ucs4 ucs2 utf16 utf7 utf8);

# $bsub is the input converter: it is called with a chunk of raw input and
# its return value is handed to out().  Taking the reference by name is
# permitted under "use strict"; whether a sub of that name really exists is
# checked next.
my $bsub = \&{$before};

# No sub of that name yet (i.e. not one of the functions imported above):
# treat $before as a Unicode::Map8 charset and install a sub under that name
# which converts through the map's tou() method.
unless (defined(&$bsub)) {
    require Unicode::Map8;
    my $map = Unicode::Map8->new($before);
    die "Don't know about charset '$before'\n" unless $map;
    # --strict given: keep the map as loaded; otherwise switch it to
    # non-strict mode.
    $map->nostrict unless $strict;
    # --def16 supplies the 16-bit code used for unmapped input characters.
    $map->default_to16($def16) if defined($def16);
    # The glob is addressed through the string in $before.
    no strict 'refs';
    *{$before} = sub {	$map->tou($_[0]); };
}

# Install out(), the output writer.  It receives whatever the input
# converter returned and prints it in the form selected by $after.
if ($after =~ /^(ucs[24]|utf16|utf[78])$/) {
    # The output name doubles as the method called on the converted value.
    *out = sub { print $_[0]->$after(); };
} elsif ($after eq "hex") {
    # Print the result of the value's hex() method, starting a new output
    # line after every U+000a in it.
    *out = sub {
	my $hex = $_[0]->hex;
	$hex =~ s/U\+000a\s*/U+000a\n/g;
	print $hex;
    };
} elsif ($after eq "uname") {
    # One line per element returned by unpack(): the code in hex followed
    # by its Unicode::CharName name, or nothing when uname() returns a
    # false value.
    require Unicode::CharName;
    *out = sub {
	for ($_[0]->unpack) {
	    printf "U+%04X   %s\n", $_, Unicode::CharName::uname($_) || "";
	}
    };
} else {
    # Any other name is taken to be a Unicode::Map8 charset.
    require Unicode::Map8;
    my $map = Unicode::Map8->new($after);
    die "Don't know about charset '$after'\n" unless $map;
    $map->nostrict unless $strict;
    # --def8 supplies the 8-bit code used for unmapped characters.
    $map->default_to8($def8) if defined($def8);
    # ${$_[0]} dereferences the converted value as a scalar reference and
    # passes the underlying string to to8().
    # NOTE(review): presumably that string is Unicode::String's internal
    # 16-bit buffer -- confirm against the Unicode::String implementation.
    *out = sub { print $map->to8(${$_[0]}); };
}

# Feed STDIN through the converter.  Input is read line by line when STDIN
# is a terminal or when the input is utf7/utf8; otherwise it is read in
# fixed-size chunks.
if (-t STDIN || $before =~ /^utf[78]$/) {
    # must read a line at the time (should not break encoded chars)
    my $line;
    while (defined($line = <STDIN>)) {
	out(&$bsub($line));
    }
} else {
    my $n;
    my $buf;
    # must read buffers which are multiples of 4 bytes (ucs4)
    # 512 satisfies that.  The loop stops when read() returns 0 (end of
    # input) or undef (read error); an error is not reported.
    while ( $n = read(STDIN, $buf, 512)) {
	#print "$n bytes read\n";
	out(&$bsub($buf));
    }
}


#------------------------------------------------------------------
package main;

# list_charset($charset)
#
# Print the mapping between the 8-bit character set $charset and Unicode
# on STDOUT:
#
#   - a comment header naming the charset and the number of allocated
#     16-bit blocks,
#   - one "0xNN 0xNNNN   # NAME" line per mapped 8-bit code (unmapped codes
#     are listed as comments, but only with --verbose),
#   - an optional trailing section with 16-bit to 8-bit mappings that are
#     not simply the reverse of an entry in the first table.
#
# Dies if Unicode::Map8 does not know $charset.  Any arguments after the
# first are ignored.
sub list_charset
{
    require Unicode::Map8;
    require Unicode::CharName;

    my($charset) = @_;
    my $m = Unicode::Map8->new($charset);
    die "Don't know about charset $charset\n" unless $m;

    my $nochar = Unicode::Map8::NOCHAR();

    # uname() may have no name for a code point; fall back to the empty
    # string (as the "uname" output mode does) so that sprintf is never
    # handed an undefined value.
    my $name_of = sub { Unicode::CharName::uname($_[0]) || "" };

    # Pass 1: 8-bit code -> 16-bit code.  %map16 remembers the reverse of
    # every entry so that pass 2 can skip plain round trips.
    my @res8;
    my %map16;
    for my $i (0 .. 255) {
        my $u = $m->to_char16($i);
        if ($u == $nochar) {
            push(@res8, sprintf("# 0x%02X unmapped\n", $i)) if $verbose;
        } else {
            push(@res8, sprintf("0x%02X 0x%04X   # %s\n", $i, $u,
                                $name_of->($u)));
            $map16{$u} = $i;
        }
    }

    # Pass 2: walk every non-empty 256-code block of the 16-bit space and
    # collect the 16-bit -> 8-bit mappings pass 1 did not already produce.
    my @res16;
    my @blocks;
    for my $block (0 .. 255) {
        next if $m->_empty_block($block);
        push(@blocks, $block);
        for my $i (0 .. 255) {
            my $u = $block*256 + $i;
            my $c = $m->to_char8($u);
            next if $c == $nochar;
            next if exists $map16{$u} && $map16{$u} == $c;
            push(@res16, sprintf("0x%02X 0x%04X   # %s\n", $c, $u,
                                 $name_of->($u)));
        }
    }

    print "# Mapping for '$charset'\n";
    print "#\n";
    printf "# %d allocated blocks", scalar(@blocks);
    # Spell out the block list unless it is just the single block 0.  The
    # emptiness check keeps $blocks[0] from being read when no block is
    # allocated at all.  Blocks are printed as index + 1.
    if (@blocks > 1 || (@blocks && $blocks[0] != 0)) {
        print " (", join(", ", map  "#".($_+1), @blocks), ")";
    }
    print "\n";
    print "#\n";
    print @res8;

    if (@res16) {
        print "\n# Extra 16-bit to 8-bit mappings\n";
        print @res16;
    }
}


# list_charsets()
#
# Print every known character set on STDOUT, one per line in sorted order,
# each followed by its aliases (space separated, sorted).
#
# The list is made of the four built-in encodings below, every file in
# $Unicode::Map8::MAPS_DIR that Unicode::Map8->new() accepts (with a
# trailing ".bin" or ".txt" removed from the name), and the aliases from
# %Unicode::Map8::ALIASES.  An alias whose target charset is not in the
# list produces a warning on STDERR.
sub list_charsets
{
    require Unicode::Map8;
    my %set = (
               ucs4 => {},
               ucs2 => {utf16 => 1},
               utf7 => {},
               utf8 => {},
              );

    # Keep the directory handle private to this call and close it when
    # done instead of leaving a global handle open.
    local *DIR;
    if (opendir(DIR, $Unicode::Map8::MAPS_DIR)) {
        my $f;
        while (defined($f = readdir(DIR))) {
            next unless -f "$Unicode::Map8::MAPS_DIR/$f";
            $f =~ s/\.(?:bin|txt)$//;
            # "foo.bin" and "foo.txt" yield the same name; load it only
            # once and never overwrite an entry that is already present.
            next if exists $set{$f};
            $set{$f} = {} if Unicode::Map8->new($f);
        }
        closedir(DIR);
    } elsif ($verbose) {
        # Listing stays best-effort; the reason is reported only with
        # --verbose.
        warn "Can't read $Unicode::Map8::MAPS_DIR: $!\n";
    }

    # Second mention of the hash, so that "perl -w" does not report the
    # name as used only once; keys() also resets the each() iterator.
    my $avoid_warning = keys %Unicode::Map8::ALIASES;
    while ( my($alias, $charset) = each %Unicode::Map8::ALIASES) {
        if (exists $set{$charset}) {
            $set{$charset}{$alias} = 1;
        } else {
            warn "$charset does not seem to exist (aliased as $alias)\n";
        }
    }

    for (sort keys %set) {
        print "$_";
        if (%{$set{$_}}) {
            print " ", join(" ", sort keys %{$set{$_}});
        }
        print "\n";
    }
}


# print_version()
#
# Handler for --version: print the version of this script and of the
# installed Unicode::Map8 together with the copyright notice on STDOUT,
# then exit with status 0.
sub print_version
{
    require Unicode::Map8;

    # A second mention of the package variable, so that "perl -w" does not
    # flag the name as used only once.
    my $map8_version_seen = $Unicode::Map8::VERSION;

    my $banner = <<"EOT";
This is umap version $VERSION (Unicode-Map8-$Unicode::Map8::VERSION)

Copyright 1998, Gisle Aas.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT

    print $banner;
    exit 0;
}


# usage()
#
# Die with the usage message.  Used as the --help handler and whenever the
# command line cannot be parsed.  The program name shown is $0 with any
# leading directory part removed.  The message ends in a newline, so die()
# does not append a file/line suffix.  Never returns.
sub usage
{
    (my $progname = $0) =~ s,.*/,,;
    die "Usage:\t$progname [options] <before>:<after>
The options are:
  --list [charset]    list character sets
  --strict            use the strict mapping
  --def8 <code>       default 8-bit code for unmapped chars
  --def16 <code>      default 16-bit code for unmapped chars
  --verbose           generate more verbose output
  --version           print version number and quit
  --help              print this usage message
";
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)