# Documentation and Copyright exist after __END__
package Lingua::Spelling::Alternative;
require 5.001;
use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
use Exporter;
$VERSION = '0.01';
@ISA = ('Exporter');
#@EXPORT = qw();
@EXPORT_OK = qw(
&alternatives
);
my $debug=0;
#
# make new instance of language, get args
#
sub new {
my $class = shift;
my $self = {};
bless($self, $class);
$self->{ARGS} = {@_};
$debug = $self->{ARGS}->{DEBUG};
@{$self->{affix_regexp}} = ();
@{$self->{affix_add}} = ();
@{$self->{affix_sub}} = ();
$self ? return $self : return undef;
}
#
# load affix file in internal structures
#
sub load_affix {
my $self = shift;
my $filename = shift;
my $suffixes=0;
my ($regexp,$add,$sub);
print STDERR "reading affix file $filename\n" if ($debug);
open (A,$filename) || die "Can't open affix file $filename: $!";
while() {
chomp;
next if (/^#|^[\s\t\n\r]*$/);
if (/^suffixes/i) {
$suffixes++;
next;
}
next if (! $suffixes);
if (/^flag[\s\t]+\*{0,1}(.):/i) {
undef $regexp;
undef $add;
undef $sub;
next;
}
if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
$regexp = $1;
$add = $2;
$sub = $3 if ($3 ne "-");
} elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
$regexp = $1;
$sub = $2;
}
sub nuke_s {
my $tmp = $_[0];
return if (!$tmp);
# $tmp=~s/^\s+//g;
# $tmp=~s/\s+$//g;
$tmp=~s/\s+//g;
return $tmp;
}
push @{$self->{affix_regexp}},nuke_s($regexp);
push @{$self->{affix_add}},nuke_s($add);
push @{$self->{affix_sub}},nuke_s($sub);
}
return 1;
}
#
# function for reading raw findaffix output
#
sub load_findaffix {
my $self = shift;
my $filename = shift;
print STDERR "reading findaffix output $filename\n" if ($debug);
open (A,$filename) || die "Can't open findaffix output $filename: $!";
while() {
chomp;
my @line=split(m;/;,$_,4);
if ($#line > 2) {
push @{$self->{affix_regexp}},'.';
push @{$self->{affix_sub}},$line[0];
push @{$self->{affix_add}},$line[1];
}
}
return 1;
}
#
# function which returns original word and all alternatives
#
sub alternatives {
my $self = shift;
my @out;
foreach my $word (@_) {
push @out,$word; # save original word
next if (length($word) < 3); # cludge: preskoci kratke
for(my $i=0; $i<=$#{$self->{affix_regexp}}; $i++) {
my $regexp = $self->{affix_regexp}[$i];
my $add = $self->{affix_add}[$i];
my $sub = $self->{affix_sub}[$i];
print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug);
next if length($word) < length($sub);
my $tmp_word = $word;
if ($sub) {
next if ($word !~ m/$sub$/i);
if ($add) {
$tmp_word =~ s/$sub$/$add/i;
} else {
$tmp_word =~ s/$sub$//i;
}
} else {
$tmp_word = $word.$add;
}
print STDERR "\t ?:$tmp_word\n" if ($debug);
if ($tmp_word =~ m/$regexp/ix) {
# print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
push @out,lc($tmp_word);
}
}
}
return @out;
}
#
# function which return minimal word of all alternatives
#
sub minimal {
my $self = shift;
my @out;
foreach my $word (@_) {
my @alt = $self->alternatives($word);
my $minimal = shift @alt;
foreach (@alt) {
$minimal=$_ if (length($_) < length($minimal));
}
push @out,$minimal;
}
return @out;
}
###############################################################################
1;
__END__
=head1 NAME
Alternative.pm - alternative spelling of a given word in a given language
=head1 SYNOPSIS
use Lingua::Spelling:Alternative;
my $en = new Alternative;
$en->load_affix('/usr/lib/ispell/english.aff') or die $!;
print $en->alternatives("cars");
=head1 DESCRIPTION
This module is designed to return all valid forms of a given word
(for example when you want to see all possible forms of some word
entered in search engine)
=head1 PUBLIC METHODS
=over 4
=item new
The new() constructor (without parameters) create container for new language.
Only parametar it supports is DEBUG which turns on (some) debugging output.
=item load_affix
Function load_affix() loads ispell's affix file for later usage.
=item load_findaffix
This function loads output of findaffix program from ispell package.
This is better idea (if you are creating affix file for particular language
yourself or you can get your hands on one) because affix file from ispel
is limited to 26 entries (because each entry is denoted by single character).
=item alternatives
Function alternatives return all alternative spellings of particular
word(s). It will also return spelling which are not correct if there is
rule like that in affix file.
=item minimal
This function returns minimal of all alternatives of a given word(s). It's
a poor man's version of normalize (because we don't know gramatic of
particular language, just some spelling rules).
=head1 PRIVATE METHODS
Documented as being not documented.
=head1 EXAMPLES
Please see the test.pl program in distribution which exercises some
aspects of Alternative.pm.
=head1 BUGS
There are no known bugs.
=head1 CONTACT AND COPYRIGHT
Copyright 2002 Dobrica Pavlinusic (dpavlin@rot13.org). All
rights reserved. This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.
=cut