# Documentation and Copyright exist after __END__ package Lingua::Spelling::Alternative; require 5.001; use strict; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); use Exporter; $VERSION = '0.01'; @ISA = ('Exporter'); #@EXPORT = qw(); @EXPORT_OK = qw( &alternatives ); my $debug=0; # # make new instance of language, get args # sub new { my $class = shift; my $self = {}; bless($self, $class); $self->{ARGS} = {@_}; $debug = $self->{ARGS}->{DEBUG}; @{$self->{affix_regexp}} = (); @{$self->{affix_add}} = (); @{$self->{affix_sub}} = (); $self ? return $self : return undef; } # # load affix file in internal structures # sub load_affix { my $self = shift; my $filename = shift; my $suffixes=0; my ($regexp,$add,$sub); print STDERR "reading affix file $filename\n" if ($debug); open (A,$filename) || die "Can't open affix file $filename: $!"; while() { chomp; next if (/^#|^[\s\t\n\r]*$/); if (/^suffixes/i) { $suffixes++; next; } next if (! $suffixes); if (/^flag[\s\t]+\*{0,1}(.):/i) { undef $regexp; undef $add; undef $sub; next; } if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) { $regexp = $1; $add = $2; $sub = $3 if ($3 ne "-"); } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) { $regexp = $1; $sub = $2; } sub nuke_s { my $tmp = $_[0]; return if (!$tmp); # $tmp=~s/^\s+//g; # $tmp=~s/\s+$//g; $tmp=~s/\s+//g; return $tmp; } push @{$self->{affix_regexp}},nuke_s($regexp); push @{$self->{affix_add}},nuke_s($add); push @{$self->{affix_sub}},nuke_s($sub); } return 1; } # # function for reading raw findaffix output # sub load_findaffix { my $self = shift; my $filename = shift; print STDERR "reading findaffix output $filename\n" if ($debug); open (A,$filename) || die "Can't open findaffix output $filename: $!"; while() { chomp; my @line=split(m;/;,$_,4); if ($#line > 2) { push @{$self->{affix_regexp}},'.'; push @{$self->{affix_sub}},$line[0]; push @{$self->{affix_add}},$line[1]; } } return 1; } # # function which returns original word and all alternatives # sub alternatives { my $self = shift; my @out; foreach my $word (@_) { push @out,$word; # save original word next if (length($word) < 3); # cludge: preskoci kratke for(my $i=0; $i<=$#{$self->{affix_regexp}}; $i++) { my $regexp = $self->{affix_regexp}[$i]; my $add = $self->{affix_add}[$i]; my $sub = $self->{affix_sub}[$i]; print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug); next if length($word) < length($sub); my $tmp_word = $word; if ($sub) { next if ($word !~ m/$sub$/i); if ($add) { $tmp_word =~ s/$sub$/$add/i; } else { $tmp_word =~ s/$sub$//i; } } else { $tmp_word = $word.$add; } print STDERR "\t ?:$tmp_word\n" if ($debug); if ($tmp_word =~ m/$regexp/ix) { # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; push @out,lc($tmp_word); } } } return @out; } # # function which return minimal word of all alternatives # sub minimal { my $self = shift; my @out; foreach my $word (@_) { my @alt = $self->alternatives($word); my $minimal = shift @alt; foreach (@alt) { $minimal=$_ if (length($_) < length($minimal)); } push @out,$minimal; } return @out; } ############################################################################### 1; __END__ =head1 NAME Alternative.pm - alternative spelling of a given word in a given language =head1 SYNOPSIS use Lingua::Spelling:Alternative; my $en = new Alternative; $en->load_affix('/usr/lib/ispell/english.aff') or die $!; print $en->alternatives("cars"); =head1 DESCRIPTION This module is designed to return all valid forms of a given word (for example when you want to see all possible forms of some word entered in search engine) =head1 PUBLIC METHODS =over 4 =item new The new() constructor (without parameters) create container for new language. Only parametar it supports is DEBUG which turns on (some) debugging output. =item load_affix Function load_affix() loads ispell's affix file for later usage. =item load_findaffix This function loads output of findaffix program from ispell package. This is better idea (if you are creating affix file for particular language yourself or you can get your hands on one) because affix file from ispel is limited to 26 entries (because each entry is denoted by single character). =item alternatives Function alternatives return all alternative spellings of particular word(s). It will also return spelling which are not correct if there is rule like that in affix file. =item minimal This function returns minimal of all alternatives of a given word(s). It's a poor man's version of normalize (because we don't know gramatic of particular language, just some spelling rules). =head1 PRIVATE METHODS Documented as being not documented. =head1 EXAMPLES Please see the test.pl program in distribution which exercises some aspects of Alternative.pm. =head1 BUGS There are no known bugs. =head1 CONTACT AND COPYRIGHT Copyright 2002 Dobrica Pavlinusic (dpavlin@rot13.org). All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut