#! /opt/perl5/bin/perl -w # $Id: trainlid 1.3 Sun, 18 Nov 2001 23:26:28 +0100 mxp $ # Copyright © 2000 Michael Piotrowski. All Rights Reserved. =head1 NAME trainlid - build transition matrix for Lingua::Ident module =head1 SYNOPSIS B I < I > I =head1 DESCRIPTION B builds a trigram transition matrix for use with the B module. It reads a training text from standard input and outputs a transition matrix with the specified I as identifier to standard output. It is recommended that I be a POSIX locale name constructed from an ISO 639 2-letter language code, possibly extended by an ISO 3166 2-letter country code and a character set identifier. Example: B. =head1 AUTHOR B was developed by Michael Piotrowski . =head1 SEE ALSO Lingua::Ident(3) =cut ############################################################################### require 5.004; while(defined($c = getc)) { $c =~ s/[\x00-\x1F\x21-\x40\x7B-\x7F]/ /og; $c = lc($c); push @chars, $c; $alphabet{$c} = ""; $chars = join "", @chars; $chars =~ s/ +/ /og; @chars = split //, $chars; $nc++; if(@chars == 3) { $trigram = join("", @chars); $matrix{$trigram}++; $bigrams{substr($trigram, 0, 2)}++; shift @chars; } } $size_of_alphabet = keys(%alphabet); print STDERR "alphabet: ", sort(keys(%alphabet)), "\n"; undef %alphabet; while(($trigram, $count) = each(%matrix)) { $prob = ($count + 1)/ ($bigrams{substr($trigram, 0, 2)} + $size_of_alphabet); $matrix{$trigram} = $prob; } print "_LANG:", $ARGV[0], "\n"; print "_NULL:", 1 / $size_of_alphabet, "\n"; print "#ALPH:$size_of_alphabet\n"; while(($key, $val) = each(%matrix)) { print "$key:$val\n"; } while (($key, $val) = each %bigrams) { print "$key;", $val + $size_of_alphabet, "\n"; } print STDERR "size of alphabet: $size_of_alphabet\n"; print STDERR "chars total: $nc\n";