# -*- Mode: Cperl -*- # $Basename: Filter.pm $ # $Revision: 1.9 $ # ITIID : $ITI$ $Header $__Header$ # Author : Ulrich Pfeifer # Created On : Thu Aug 15 18:09:51 1996 # Last Modified By: Ulrich Pfeifer # Last Modified On: Sun Nov 22 18:44:46 1998 # Language : CPerl # Update Count : 105 # Status : Unknown, Use with caution! # # Copyright (c) 1996-1997, Ulrich Pfeifer # package WAIT::Filter; require WAIT; use strict; use Carp; use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD); use subs qw(grundform); require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw( Stem Soundex Phonix Metaphone isolc disolc isouc disouc isotr disotr stop grundform utf8iso ); # (most implemented in WAIT.xs) $VERSION = substr q$Revision: 1.9 $, 10; sub split { map split(' ', $_), @_; } $SPLIT = q[ sub splitXXX { grep length($_)>=XXX, map split(' ', $_), @_; } ]; sub AUTOLOAD { my $func = $AUTOLOAD; $func =~ s/.*:://; if ($func =~ /split(\d+)/) { my $num = $1; my $split = $SPLIT; $split =~ s/XXX/$num/g; eval $split; if ($@ eq '') { goto &$AUTOLOAD; } } elsif ($func eq 'grundform') { eval {require Text::German;}; croak "You must have Text::German to use 'grundform'" if $@ ne ''; *grundform = Text::German->can('reduce'); goto &grundform; } elsif ($func eq 'date') { eval {require Time::ParseDate;}; croak "You must have Time::ParseDate to use 'date'" if $@ ne ''; *date = Time::ParseDate->can('parsedate'); goto \&date; } elsif ($func eq 'decode_entities') { eval {require HTML::Entities;}; croak "You must have HTML::Entities to use 'date'" if $@ ne ''; *decode_entities = HTML::Entities->can('decode_entities'); goto &decode_entities; } elsif ($func =~ /^d?utf8iso$/) { require WAIT::Filter::utf8iso; croak "Your perl version must at least be 5.00556 to use '$func'" if $] < 5.00556; no strict 'refs'; *$func = \&{"WAIT::Filter::utf8iso::$func"}; goto &utf8iso; } Carp::confess "Class WAIT::Filter::$func not found"; } while () { chomp; last if /__END__/; next if /^\s*#/; # there's a comment $STOP{$_}++; } sub stop { if (exists $STOP{$_[0]}) { '' } else { $_[0]; } } sub gdate { my $date = shift; $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:; date($date); } 1; __DATA__ a about above according across actually adj after afterwards again against all almost alone along already also although always among amongst an and another any anyhow anyone anything anywhere are aren't around as at b be became because become becomes becoming been before beforehand begin beginning behind being below beside besides between beyond billion both but by c can can't cannot caption co co. could couldn't d did didn't do does doesn't don't down during e eg eight eighty either else elsewhere end ending enough etc even ever every everyone everything everywhere except f few fifty first five vfor former formerly forty found four from further g h had has hasn't have haven't he he'd he'll he's hence her here here's hereafter hereby herein hereupon hers herself him himself his how however hundred i i'd i'll i'm i've ie if in inc. indeed instead into is isn't it it's its itself j k l last later latter latterly least less let let's like likely ltd m made make makes many maybe me meantime meanwhile might million miss more moreover most mostly mr mrs much must my myself n namely neither never nevertheless next nine ninety no nobody none nonetheless noone nor not nothing now nowhere o of off often on once one one's only onto or other others otherwise our ours ourselves out over overall own p per perhaps q r rather recent recently s same seem seemed seeming seems seven seventy several she she'd she'll she's should shouldn't since six sixty so some somehow someone something sometime sometimes somewhere still stop such t taking ten than that that'll that's that've the their them themselves then thence there there'd there'll there're there's there've thereafter thereby therefore therein thereupon these they they'd they'll they're they've thirty this those though thousand three through throughout thru thus to together too toward towards trillion twenty two u under unless unlike unlikely until up upon us used using v very via w was wasn't we we'd we'll we're we've well were weren't what what'll what's what've whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who'd who'll who's whoever whole whom whomever whose why will with within without won't would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z # occuring in more than 100 files acc accent accents and are bell can character corrections crt daisy dash date defined definitions description devices diablo dummy factors following font for from fudge give have header holds log logo low lpr mark name nroff out output pitch put rcsfile reference resolution revision see set simple smi some string synopsis system that the this translation troff typewriter ucb unbreakable use used user vroff wheel will with you __END__ # Below is the stub of documentation for your module. You better edit it! =head1 NAME WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions =head1 SYNOPSIS use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc isotr disotr stop grundform utf8iso); $stem = Stem($word); $scode = Soundex($word); $pcode = Phonix($word); $lword = isolc($word); disolc($word); $uword = isouc($word); disouc($word); $trword = isotr($word); disotr($word); $word = stop($word); $word = grundform($word); @words = WAIT::Filter::split($word); @words = WAIT::Filter::split2($word); @words = WAIT::Filter::split3($word); @words = WAIT::Filter::split4($word); # arbitrary numbers allowed =head1 DESCRIPTION This tiny modules gives access to the basic reduction functions build in B. =over 5 =item B(I) reduces I using the well know Porter algorithm. AU: Porter, M.F. TI: An Algorithm for Suffix Stripping JT: Program VO: 14 PP: 130-137 PY: 1980 PM: JUL =item B(I) computes the 4 byte B code for I. AU: Gadd, T.N. TI: 'Fisching for Werds'. Phonetic Retrieval of written text in Information Retrieval Systems JT: Program VO: 22 NO: 3 PP: 222-237 PY: 1988 =item B(I) computes the 8 byte B code for I. AU: Gadd, T.N. TI: PHONIX: The Algorithm JT: Program VO: 24 NO: 4 PP: 363-366 PY: 1990 PM: OCT =head1 ISO charcater case functions There are some additional function which transpose some/most ISOlatin1 characters to upper and lower case. To allow for maximum speed there are also I versions which change the argument instead of allocating a copy which is returned. For convenience, the destructive version also B the argument. So all of the following is valid and C<$word> will contain the lowercased string. $word = isolc($word); $word = disolc($word); disolc($word); Here are the hardcoded characters which are recognized: abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß =item C<$new = >BC<($word)> =item BC<($word)> transposes to lower case. =item C<$new = >BC<($word)> =item BC<($word)> transposes to upper case. =item C<$new = >BC<($word)> =item BC<($word)> Remove non-letters according to the above table. =item C<$new = >BC<($word)> Returns an empty string if $word is a stopword. =item C<$new = >BC<($word)> Calls Text::German::reduce =item C<$new = >BC<($word)> Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is internally based on the Latin1 character set, so if you process anything in a different encoding, you should convert to Latin1 as the first filter. =item split, split2, split3, ... The splitN funtions all take a scalar as input and return a list of words. Split acts just like the perl split(' '). Split2 eliminates all words from the list that are shorter than 2 characters (bytes), split3 eliminates those shorter than 3 characters (bytes) and so on. =head1 AUTHOR Ulrich Pfeifer EFE =head1 SEE ALSO perl(1). =cut