#-*-perl-*- package Uplug::PreProcess::Tokenizer; =head1 NAME Uplug::PreProcess::Tokenizer =head1 SYNOPSIS my $tokenizer = new Uplug::PreProcess::Tokenizer( lang => 'en' ); my @tokens = tokenizer->tokenize( 'Mr. Smith says: "What is a text anyway?"' ); my $text = detokenize( '" Big improvement ! " says Mr. Smith .'); =head1 IMPLEMENTS =head2 C Tokenize a given text. Returns a list of tokens. =head2 C De-tokenize a space-separated text or a list of tokens. Returns plain text. =head2 C Load language specific abbreviations and other non-breaking prefixes. =head1 DESCRIPTION This module heavily relies on the implementation of the tokenizer and detokenizer used in the Moses toolkit for SMT. All credits go to the original authors (Josh Schroeder and Philipp Koehn). =cut use strict; use Uplug::Config; # defaults: language = English our $DEFAULT_LANG = 'en'; our %NONBREAKING_PREFIX; our $NONBREAKING_PREFIX_DIR = &shared_lang() . '/nonbreaking_prefixes'; our $DEFAULT_NONBREAKING_PREFIX = $NONBREAKING_PREFIX_DIR . '/nonbreaking_prefix.en'; sub new { my $class = shift; my %self = @_; if ( $self{lang} ) { load_prefixes( $self{lang}, \%NONBREAKING_PREFIX ); } return bless \%self, $class; } sub tokenize { my $self = shift; my ($text) = @_; chomp($text); $text = " $text "; # seperate out all "other" special characters $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; #` #multi-dots stay together $text =~ s/\.([\.]+)/ DOTMULTI$1/g; while ( $text =~ /DOTMULTI\./ ) { $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; $text =~ s/DOTMULTI\./DOTDOTMULTI/g; } # seperate out "," except if within numbers (5,300) $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; # separate , pre and post number $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; # turn `into ' $text =~ s/\`/\'/g; #` #turn '' into " $text =~ s/\'\'/ \" /g; if ( $$self{lang} eq "en" ) { #split contractions right $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; #special case for "1990's" $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; } elsif ( ( $$self{lang} eq "fr" ) or ( $$self{lang} eq "it" ) ) { #split contractions left $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; } else { $text =~ s/\'/ \' /g; } #word token method my @words = split( /\s/, $text ); $text = ""; for ( my $i = 0; $i < ( scalar(@words) ); $i++ ) { my $word = $words[$i]; if ( $word =~ /^(\S+)\.$/ ) { my $pre = $1; if (( $pre =~ /\./ && $pre =~ /\p{IsAlpha}/ ) || ( $NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre} == 1 ) || ( $i < scalar(@words) - 1 && ( $words[ $i + 1 ] =~ /^[\p{IsLower}]/ ) ) ) { #no change } elsif ( ( $NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre} == 2 ) && ( $i < scalar(@words) - 1 && ( $words[ $i + 1 ] =~ /^[0-9]+/ ) ) ) { #no change } else { $word = $pre . " ."; } } $text .= $word . " "; } # clean up extraneous spaces $text =~ s/ +/ /g; $text =~ s/^ //g; $text =~ s/ $//g; #restore multi-dots while ( $text =~ /DOTDOTMULTI/ ) { $text =~ s/DOTDOTMULTI/DOTMULTI./g; } $text =~ s/DOTMULTI/./g; #ensure final line break $text .= "\n" unless $text =~ /\n$/; return split( /\s+/, $text ); } sub load_prefixes { my ( $language, $PREFIX_REF ) = @_; my $prefixfile = $NONBREAKING_PREFIX_DIR . '/nonbreaking_prefix.' . $language; #default back to English if we don't have a language-specific prefix file if ( !( -e $prefixfile ) ) { $prefixfile = $DEFAULT_NONBREAKING_PREFIX; unless ( -e $prefixfile ) { return 0; } } if ( -e "$prefixfile" ) { open( PREFIX, "<:encoding(utf8)", "$prefixfile" ); while () { my $item = $_; chomp($item); if ( ($item) && ( substr( $item, 0, 1 ) ne "#" ) ) { if ( $item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/ ) { $PREFIX_REF->{$1} = 2; } else { $PREFIX_REF->{$item} = 1; } } } close(PREFIX); } } # this is the Moses detokenizer # written by Josh Schroeder, based on code by Philipp Koehn sub detokenize { my $self = shift; my $token = shift; my $text = ref($token) eq 'ARRAY' ? join( ' ', @$token ) : $token; my $language = $self->{lang} || 'en'; # chomp($text); $text =~ s/\n/ /gs; $text =~ s/\s\s+/ /gs; $text = " $text "; my $word; my $i; my @words = split( / /, $text ); $text = ""; my %quoteCount = ( "\'" => 0, "\"" => 0 ); my $prependSpace = " "; for ( $i = 0; $i < ( scalar(@words) ); $i++ ) { if ( $words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/ ) { # perform right shift on currency and other random punctuation items $text = $text . $prependSpace . $words[$i]; $prependSpace = ""; } elsif ( $words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/ ) { # perform left shift on punctuation items $text = $text . $words[$i]; $prependSpace = " "; } elsif (( $language eq "en" ) && ( $i > 0 ) && ( $words[$i] =~ /^[\'][\p{IsAlpha}]/ ) && ( $words[ $i - 1 ] =~ /[\p{IsAlnum}]$/ ) ) { # left-shift the contraction for English $text = $text . $words[$i]; $prependSpace = " "; } elsif (( $language eq "fr" ) && ( $i < ( scalar(@words) - 2 ) ) && ( $words[$i] =~ /[\p{IsAlpha}][\']$/ ) && ( $words[ $i + 1 ] =~ /^[\p{IsAlpha}]/ ) ) { # right-shift the contraction for French $text = $text . $prependSpace . $words[$i]; $prependSpace = ""; } elsif ( $words[$i] =~ /^[\'\"]+$/ ) { # combine punctuation smartly if ( ( $quoteCount{ $words[$i] } % 2 ) eq 0 ) { if ( ( $language eq "en" ) && ( $words[$i] eq "'" ) && ( $i > 0 ) && ( $words[ $i - 1 ] =~ /[s]$/ ) ) { # single quote for posesssives ending in s... "The Jones' house" # left shift $text = $text . $words[$i]; $prependSpace = " "; } else { # right shift $text = $text . $prependSpace . $words[$i]; $prependSpace = ""; $quoteCount{ $words[$i] } = $quoteCount{ $words[$i] } + 1; } } else { # left shift $text = $text . $words[$i]; $prependSpace = " "; $quoteCount{ $words[$i] } = $quoteCount{ $words[$i] } + 1; } } else { $text = $text . $prependSpace . $words[$i]; $prependSpace = " "; } } # clean up spaces at head and tail of each line as well as any double-spacing $text =~ s/ +/ /g; $text =~ s/\n /\n/g; $text =~ s/ \n/\n/g; $text =~ s/^ //g; $text =~ s/ $//g; # add trailing break #$text .= "\n" unless $text =~ /\n$/; return $text; } 1;