The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::EN::Hyphenate;

use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

require Exporter;

@ISA = qw(Exporter);

@EXPORT_OK = qw( hyphenate syllables def_syl def_hyph );

$VERSION = '0.01';

sub debug {  print @_ if $::debug }

my @diphthong = qw { ao ia io ii iu oe uo ue };
my @diphthong1 = map { substr($_,0,1)."(?=".substr($_,1,1).")" } @diphthong;
my $diphthong = "(" . join('|', @diphthong1) . ")(.)";

my $vowels = '(?:[aeiou]+y?|y)';

my $precons = '( str
		 |sch
		 |sph
		 |squ
		 |thr
	         |b[r]
	         |d[rw]
	         |f[lr]
	         |g[nr]
	         |k[n]
	         |p[nr]
	         |r[h]
	         |s[lmnw]
	         |t[w]
		 |qu
		 )';

my $ppcons1 = '(  b[l]
	         |c[hlr]
	         |g[hl]
	         |m[n]
	         |p[l]
	         |t[h](?!r)
	         |s[chpt](?!r)
	         |s[k]
	         |tr
		 )';

my $ppcons2 = '((?=[a-z])[^aeiouy])';

my $postcons = '( ght
		 |nst
		 |rst
		 |tch
		 |rth
		 |bb
	         |c[ckt]
	         |d[dlz]
		 |f[ft]
	         |g[gt]
	         |l[bcdfgklmnptv]
	         |m[mp]
	         |n[cdgknstx]
		 |pp
	         |r[bcdfgklmnprtv]
		 |ss
		 |t[tz]
		 |vv
		 |wn
	         |x[tx]
		 )';

my @paircons = qw { ph tl n't };
my $paircons = "(" . join('|', @paircons) . ")";

my @dblcons = qw { c~tr n~th n~c[th] n~s[th] ns~d l~pr s~tl
		   n~c n~s c~t r~t };
my @dblcons1 = map { /(.+)~(.+)/; "$1(?=$2)" } @dblcons;
my @dblcons2 = map { /(.+)~(.+)/; "$2" } @dblcons;
my $dblcons = "(" . join('|', @dblcons1) . ")(" . join('|', @dblcons2) . ")";

my @repcons = map { "$_(?=$_)" } qw { b c g h j k m n p q r t v w x z };
my $repcons = "(" . join('|', @repcons) . ")";

my $pprecons = "($ppcons1|$precons|$ppcons2)";
my $ppostcons = "($ppcons1|$postcons|$ppcons2)";

sub abstract
{
	no strict;
	sub C_  { debug "C_($_[0])\n"; return { type => 'C_',  val => $_[0] } }
	sub _C  { debug "_C($_[0])\n"; return { type => '_C',  val => $_[0] } }
	sub _S  { debug "_S($_[0])\n"; return { type => '_S',  val => $_[0] } }
	sub _C_ { debug "_C_($_[0])\n"; return { type => '_C_', val => $_[0] } }
	sub V   { debug "V($_[0])\n"; return { type => 'V',   val => $_[0] } }
	sub E   { debug "E($_[0])\n"; return { type => 'E',   val => $_[0] } }

	local $_ = shift;
	local @head = (); sub app  { push @head, @_ if defined $_[0]; '' }
	local @tail = (); sub prep { unshift @tail, @_ if defined $_[0]; '' }

	#debug "\A${pprecons}${diphthong}${postcons}\Z\n";

	s/\A${pprecons}${diphthong}${ppostcons}\Z/app C_($1),V("$5$6"),_C($7)/eix;

	s/\Ay/app C_("y")/ei
		or s/\Aex/app V("e"),_C("x")/ei
		or s/\Ai([nmg])/app V("i"),_C($1)/ei
		or s/\A([eu])([nm])/app V($1),_C($2)/ei
		or s/\Airr/app V("i"),_C("r"),C_("r")/ei
		or s/\Aill/app V("i"),_C("l"),C_("l")/ei
		or s/\Acon/app C_("c"), V("o"), _C("n")/ei
		or s/\Aant([ie])/app V("a"),_C("n"),C_("t"),V($1),_C('')/ei
		or s/\A(w[hr])/app C_("$1")/ei
		or s/\Amay/app C_("m"), V("a"), _C("y")/ei
		;

	s/([bd])le\Z/prep C_($1), V(''), _C("le")/ei
		or s/sm\Z/prep C_("s"), V(''), _C("m")/ei
		or s/${repcons}\1e\Z/do{prep _C("$1$1e")}/eix
		or s/(?=..e)${dblcons}e\Z/do{prep _C("$1$2e")}/eix
		or s/(${vowels})${ppcons2}es\Z/do{prep _C("$2es");$1}/eix
		or s/(${vowels})(ples?)\Z/do{prep C_($2);$1}/eix
		or s/([td])ed\Z/prep C_($1),V("e"), _C("d")/eix
		or s/([^aeiou])\1ed\Z/prep _C("$1$1ed")/eix
		or s/${pprecons}ed\Z/prep _C("$1ed")/eix
		or s/${ppostcons}ed\Z/prep _C("$1ed")/eix
		or s/([aeou])ic(s?)\Z/prep V($1), V("i"),_C("c$2")/ei
		or s/([sct])ion(s?)\Z/prep _C_($1),V("io"),_C("n$2")/ei
		or s/([cts])ia([nl]s?)\Z/prep _C_($1),V("ia"),_C($2)/ei
		or s/([ts])ia(s?)\Z/prep _C_($1),V("ia$2")/ei
		or s/t(i?ou)s\Z/prep _C_("t"),V($1),_C("s")/ei
		or s/cious\Z/prep _C_("c"),V("iou"),_C("s")/ei
		or s/${ppostcons}(e?s)\Z/prep _C("$1$5")/eix
		;

	1 while s/${dblcons}\Z/do{prep _C("$1$2")}/eix;

	while (/[a-z]/i)
	{
		debug "=====[$_]=====\n";
		s/\A(s'|'s)\Z/app _S($1)/eix	 		and next;
		s/\A${dblcons}/app _C($1),C_($2)/eix		and next;
		s/\A${dblcons}/app _C($1),C_($2)/eix		and next;
		s/\A${repcons}/app _C($1)/eix			and next;
		s/\A${paircons}/app _C($1)/eix			and next;
		s/\A${ppcons1}e(?![aeiouy])/app _C_($1),E("e")/eix
								and next;
		s/\A${precons}e(?![aeiouy])/app C_($1),E("e")/eix
								and next;
		s/\A${postcons}e(?![aeiouy])/app _C($1),E("e")/eix
								and next;
		s/\A${ppcons2}e(?![aeiouy])/app _C_($1),E("e")/eix
								and next;
		s/\A${postcons}?([sct])ion/app C_(($1||'').$2),V("io"),_C("n")/eix
								and next;
		s/\A${postcons}?tial/app C_(($1||'')."t"),V("ia"),_C("l")/eix
								and next;
		s/\A${postcons}?([ct])ia([nl])/app C_(($1||'').$2),V("ia"),_C($3)/eix
								and next;
		s/\A${postcons}?t(i?ou)s/app C_(($1||'')."t"),V($1),_C("s")/eix
								and next;
		s/\Aience/app V("i"),V("e"),_C("nc"),E('e')/eix
								and next;
		s/\Acious/app C_(($1||'')."c"),V("iou"),_C("s")/eix
								and next;
		s/\A$diphthong/app V($1),V($2)/ei		and next;
		s/\A$ppcons1/app _C_($1)/eix			and next;
		s/\A$precons/app C_($1)/eix			and next;
		s/\A$postcons/app _C($1)/eix			and next;
		s/\A$ppcons2/app _C_($1)/eix			and next;
		s/\A($vowels)/app V($1)/ei			and next;
	}
	return (@head, @tail);
}

sub partition
{
	no strict;
	local @list = @_;
	local @syls = ();

	sub is_S  { @list > 1 && $list[$#list]->{val} =~ /'?s'?/  }
	sub isR   { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C'
						  && $list[$i]->{val} eq 'r'  }
	sub isC   { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C' }
	sub is_C  { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'_C' }
	sub isC_  { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~'C_' }
	sub isV   { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type}=~/V|E/ }
	sub isVnE { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type} eq 'V'
						  && $list[$i]->{val} !~ /\Ae/
						  }
	sub isE   { my $i = $#list+$_[0]; $i >= 0 && $list[$i]->{type} eq 'E' }

	sub syl { my $syl = "";
		  for (1..$_[0]) { $syl = pop(@list)->{val}.$syl }
		  unshift @syls, $syl;
		  1}

	is_S(0) && do { my $val = pop @list; $list[$#list]->{val} .= $val->{val} };

	while (@list)
	{
		print "\t[@syls]\n" if $::debug;
		isE(-2) && isR(-1) && isVnE(0) 		   && syl(1) && next;
		isC(-1) && is_C(0)			   && syl(1) && next;
		isC_(-3) && isV(-2) && isC(-1) && isE(0)   && syl(4) && next;
		isC_(-2) && isV(-1) && is_C(0)		   && syl(3) && next;
		isV(-2) && isC(-1) && isE(0) 		   && syl(3) && next;
		isC_(-1) && isV(0)			   && syl(2) && next;
		isV(-1) && is_C(0)			   && syl(2) && next;
		isC(0)					   && syl(1) && next;
		isV(0)					   && syl(1) && next;
	}
	return @syls;
}

my %user_def_syl = ();
my %user_def_hyph = ();

sub def_syl($)
{
	my $word = $_[0];
	$word =~ tr/~//d;
	$user_def_syl{$word} = [split /\~/, $_[0]];
}

sub def_hyph($)
{
	my $word = $_[0];
	$word =~ tr/~//d;
	$user_def_hyph{$word} = [split /\~/, $_[0]];
}

sub syllables($)  # ($word)
{
	return ($_[0]) unless $_[0] =~ /[A-Za-z]/;
	my $word = $_[0];
	$word =~ s/\A([^a-zA-Z]+)//;
	my $leader = $1||'';
	$word =~ s/([^a-zA-Z]+)\Z//;
	my $trailer = $1||'';
	my @syls = @{$user_def_syl{$word}||[]};
	unless (@syls)
	{
		my @part = split /((?:\s|'(?![ts]\b)|'[^A-Za-z]|[^A-Za-z'])+)/, $word;
		for (my $p = 0; $p < @part; $p++)
		{
			if ($p & 1) { $syls[$#syls] .= $part[$p] }
			else        { push @syls, partition(abstract($part[$p])) }
		}
	}
	$syls[0] = $leader . $syls[0];
	$syls[$#syls] .= $trailer;
	return @syls if wantarray;
	return join '~', @syls;
}


sub hyphenate($$;$)  # ($word, $width; $hyphen)
{
	my $word = shift;
	my @syls = @{$user_def_hyph{$word}||[]};
	@syls = syllables($word) unless @syls;
	my ($width, $hyphen) = (@_,'-');
	my $hlen = length $hyphen;
	my $first = '';
	while (@syls)
	{
		if ($#syls) { last if length($first) + length($syls[0]) + $hlen > $width }
		else { last if length($first) + length($syls[0]) > $width }
		$first .= shift @syls;
	}
	$first .= $hyphen if $first && @syls && $first !~ /$hyphen\Z/;
	return ("$first",join '',@syls);
}

1;
__END__

=head1 NAME

Lingua::En::Hyphenate - Perl extension for syllable-based hyphenation

=head1 SYNOPSIS

  use Lingua::En::Hyphenate;

=head1 DESCRIPTION

=head1 AUTHOR

=cut