package Lingua::EN::Hyphenate;
use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw( hyphenate syllables def_syl def_hyph );
$VERSION = '0.01';
sub debug { print @_ if $::debug }
my @diphthong = qw { ao ia io ii iu oe uo ue };
my @diphthong1 = map { substr($_,0,1)."(?=".substr($_,1,1).")" } @diphthong;
my $diphthong = "(" . join('|', @diphthong1) . ")(.)";
my $vowels = '(?:[aeiou]+y?|y)';
my $precons = '( str
|sch
|sph
|squ
|thr
|b[r]
|d[rw]
|f[lr]
|g[nr]
|k[n]
|p[nr]
|r[h]
|s[lmnw]
|t[w]
|qu
)';
my $ppcons1 = '( b[l]
|c[hlr]
|g[hl]
|m[n]
|p[l]
|t[h](?!r)
|s[chpt](?!r)
|s[k]
|tr
)';
my $ppcons2 = '((?=[a-z])[^aeiouy])';
my $postcons = '( ght
|nst
|rst
|tch
|rth
|bb
|c[ckt]
|d[dlz]
|f[ft]
|g[gt]
|l[bcdfgklmnptv]
|m[mp]
|n[cdgknstx]
|pp
|r[bcdfgklmnprtv]
|ss
|t[tz]
|vv
|wn
|x[tx]
)';
my @paircons = qw { ph tl n't };
my $paircons = "(" . join('|', @paircons) . ")";
my @dblcons = qw { c~tr n~th n~c[th] n~s[th] ns~d l~pr s~tl
n~c n~s c~t r~t };
my @dblcons1 = map { /(.+)~(.+)/; "$1(?=$2)" } @dblcons;
my @dblcons2 = map { /(.+)~(.+)/; "$2" } @dblcons;
my $dblcons = "(" . join('|', @dblcons1) . ")(" . join('|', @dblcons2) . ")";
my @repcons = map { "$_(?=$_)" } qw { b c g h j k m n p q r t v w x z };
my $repcons = "(" . join('|', @repcons) . ")";
my $pprecons = "($ppcons1|$precons|$ppcons2)";
my $ppostcons = "($ppcons1|$postcons|$ppcons2)";
sub abstract
{
no strict;
sub C_ { debug "C_($_[0])\n"; return { type => 'C_', val => $_[0] } }
sub _C { debug "_C($_[0])\n"; return { type => '_C', val => $_[0] } }
sub _S { debug "_S($_[0])\n"; return { type => '_S', val => $_[0] } }
sub _C_ { debug "_C_($_[0])\n"; return { type => '_C_', val => $_[0] } }
sub V { debug "V($_[0])\n"; return { type => 'V', val => $_[0] } }
sub E { debug "E($_[0])\n"; return { type => 'E', val => $_[0] } }
local $_ = shift;
local @head = (); sub app { push @head, @_ if defined $_[0]; '' }
local @tail = (); sub prep { unshift @tail, @_ if defined $_[0]; '' }
#debug "\A${pprecons}${diphthong}${postcons}\Z\n";
s/\A${pprecons}${diphthong}${ppostcons}\Z/app C_($1),V("$5$6"),_C($7)/eix;
s/\Ay/app C_("y")/ei
or s/\Aex/app V("e"),_C("x")/ei
or s/\Ai([nmg])/app V("i"),_C($1)/ei
or s/\A([eu])([nm])/app V($1),_C($2)/ei
or s/\Airr/app V("i"),_C("r"),C_("r")/ei
or s/\Aill/app V("i"),_C("l"),C_("l")/ei
or s/\Acon/app C_("c"), V("o"), _C("n")/ei
or s/\Aant([ie])/app V("a"),_C("n"),C_("t"),V($1),_C('')/ei
or s/\A(w[hr])/app C_("$1")/ei
or s/\Amay/app C_("m"), V("a"), _C("y")/ei
;
s/([bd])le\Z/prep C_($1), V(''), _C("le")/ei
or s/sm\Z/prep C_("s"), V(''), _C("m")/ei
or s/${repcons}\1e\Z/do{prep _C("$1$1e")}/eix
or s/(?=..e)${dblcons}e\Z/do{prep _C("$1$2e")}/eix
or s/(${vowels})${ppcons2}es\Z/do{prep _C("$2es");$1}/eix
or s/(${vowels})(ples?)\Z/do{prep C_($2);$1}/eix
or s/([td])ed\Z/prep C_($1),V("e"), _C("d")/eix
or s/([^aeiou])\1ed\Z/prep _C("$1$1ed")/eix
or s/${pprecons}ed\Z/prep _C("$1ed")/eix
or s/${ppostcons}ed\Z/prep _C("$1ed")/eix
or s/([aeou])ic(s?)\Z/prep V($1), V("i"),_C("c$2")/ei
or s/([sct])ion(s?)\Z/prep _C_($1),V("io"),_C("n$2")/ei
or s/([cts])ia([nl]s?)\Z/prep _C_($1),V("ia"),_C($2)/ei
or s/([ts])ia(s?)\Z/prep _C_($1),V("ia$2")/ei
or s/t(i?ou)s\Z/prep _C_("t"),V($1),_C("s")/ei
or s/cious\Z/prep _C_("c"),V("iou"),_C("s")/ei
or s/${ppostcons}(e?s)\Z/prep _C("$1$5")/eix
;
1 while s/${dblcons}\Z/do{prep _C("$1$2")}/eix;
while (/[a-z]/i)
{
debug "=====[$_]=====\n";
s/\A(s'|'s)\Z/app _S($1)/eix and next;
s/\A${dblcons}/app _C($1),C_($2)/eix and next;
s/\A${dblcons}/app _C($1),C_($2)/eix and next;
s/\A${repcons}/app _C($1)/eix and next;
s/\A${paircons}/app _C($1)/eix and next;
s/\A${ppcons1}e(?![aeiouy])/app _C_($1),E("e")/eix
and next;
s/\A${precons}e(?![aeiouy])/app C_($1),E("e")/eix
and next;
s/\A${postcons}e(?![aeiouy])/app _C($1),E("e")/eix
and next;
s/\A${ppcons2}e(?![aeiouy])/app _C_($1),E("e")/eix
and next;
s/\A${postcons}?([sct])ion/app C_(($1||'').$2),V("io"),_C("n")/eix
and next;
s/\A${postcons}?tial/app C_(($1||'')."t"),V("ia"),_C("l")/eix
and next;
s/\A${postcons}?([ct])ia([nl])/app C_(($1||'').$2),V("ia"),_C($3)/eix
and next;
s/\A${postcons}?t(i?ou)s/app C_(($1||'')."t"),V($1),_C("s")/eix
and next;
s/\Aience/app V("i"),V("e"),_C("nc"),E('e')/eix
and next;
s/\Acious/app C_(($1||'')."c"),V("iou"),_C("s")/eix
and next;
s/\A$diphthong/app V($1),V($2)/ei and next;
s/\A$ppcons1/app _C_($1)/eix and next;
s/\A$precons/app C_($1)/eix and next;
s/\A$postcons/app _C($1)/eix and next;
s/\A$ppcons2/app _C_($1)/eix and next;
s/\A($vowels)/app V($1)/ei and next;
}
return (@head, @tail);
}
sub partition
{
no strict;
local @list = @_;
local @syls = ();
sub is_S { @list > 1 && $list[$#list]->{val} =~ /'?s'?/ }
sub isR { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C'
&& $list[$i]->{val} eq 'r' }
sub isC { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C' }
sub is_C { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'_C' }
sub isC_ { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C_' }
sub isV { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~/V|E/ }
sub isVnE { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type} eq 'V'
&& $list[$i]->{val} !~ /\Ae/
}
sub isE { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type} eq 'E' }
sub syl { my $syl = "";
for (1..$_[0]) { $syl = pop(@list)->{val}.$syl }
unshift @syls, $syl;
1}
is_S(0) && do { my $val = pop @list; $list[$#list]->{val} .= $val->{val} };
while (@list)
{
print "\t[@syls]\n" if $::debug;
isE(-2) && isR(-1) && isVnE(0) && syl(1) && next;
isC(-1) && is_C(0) && syl(1) && next;
isC_(-3) && isV(-2) && isC(-1) && isE(0) && syl(4) && next;
isC_(-2) && isV(-1) && is_C(0) && syl(3) && next;
isV(-2) && isC(-1) && isE(0) && syl(3) && next;
isC_(-1) && isV(0) && syl(2) && next;
isV(-1) && is_C(0) && syl(2) && next;
isC(0) && syl(1) && next;
isV(0) && syl(1) && next;
}
return @syls;
}
my %user_def_syl = ();
my %user_def_hyph = ();
sub def_syl($)
{
my $word = $_[0];
$word =~ tr/~//d;
$user_def_syl{$word} = [split /\~/, $_[0]];
}
sub def_hyph($)
{
my $word = $_[0];
$word =~ tr/~//d;
$user_def_hyph{$word} = [split /\~/, $_[0]];
}
sub syllables($) # ($word)
{
return ($_[0]) unless $_[0] =~ /[A-Za-z]/;
my $word = $_[0];
$word =~ s/\A([^a-zA-Z]+)//;
my $leader = $1||'';
$word =~ s/([^a-zA-Z]+)\Z//;
my $trailer = $1||'';
my @syls = @{$user_def_syl{$word}||[]};
unless (@syls)
{
my @part = split /((?:\s|'(?![ts]\b)|'[^A-Za-z]|[^A-Za-z'])+)/, $word;
for (my $p = 0; $p < @part; $p++)
{
if ($p & 1) { $syls[$#syls] .= $part[$p] }
else { push @syls, partition(abstract($part[$p])) }
}
}
$syls[0] = $leader . $syls[0];
$syls[$#syls] .= $trailer;
return @syls if wantarray;
return join '~', @syls;
}
sub hyphenate($$;$) # ($word, $width; $hyphen)
{
my $word = shift;
my @syls = @{$user_def_hyph{$word}||[]};
@syls = syllables($word) unless @syls;
my ($width, $hyphen) = (@_,'-');
my $hlen = length $hyphen;
my $first = '';
while (@syls)
{
if ($#syls) { last if length($first) + length($syls[0]) + $hlen > $width }
else { last if length($first) + length($syls[0]) > $width }
$first .= shift @syls;
}
$first .= $hyphen if $first && @syls && $first !~ /$hyphen\Z/;
return ("$first",join '',@syls);
}
1;
__END__
=head1 NAME
Lingua::En::Hyphenate - Perl extension for syllable-based hyphenation
=head1 SYNOPSIS
use Lingua::En::Hyphenate;
=head1 DESCRIPTION
=head1 AUTHOR
=cut