The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#! /usr/bin/perl -w
use strict;
use 5.010;
use autodie ':all';
use # Hide from release warnings
Data::Dump 'dump';

# Structure of the data we're decoding...

open my $target_fh, '>', 'lib/Lingua/EN/Inflexion/Nouns.pm';
select $target_fh;

my $COMMENT_LINE_PAT = qr{ \A \s* \# }xms;
my $COMMENT_PAT      = qr{ \# .* }xms;
my $BLANK_LINE_PAT   = qr{ \A \s* $ }xms;
my $WS               = qr{ [\s]* }xms;
my $DATA_PAT         = qr{
    \A
      (?: $WS < ([^>]+) > )?    # ...optional category tag
      $WS ([*-]?) $WS             # ...leading whitespace and optional generic marker
      (.*?)                     # ...singular word
      $WS =>                    # ...singular/plural separator
      $WS ([*-]?) $WS             # ...leading whitespace and optional generic marker
      (.*?)                     # ...plural of word
      (?:                       # ...optionally:
        $WS \|                  #    ...modern/classical separator
        $WS ([*-]?) $WS           #    ...leading whitespace and optional generic marker
        (.*?)                   #    ...classical plural of word
      )?
    $WS                         # ...trailing whitespace
    $COMMENT_PAT?               # Optional trailing comment
    \Z                          # ...trailing whitespace
}xms;

# The data structures we're building...
my %modern_plural_of;
my %classical_plural_of;
my %singular_of;
my %singular_word;
my %plural_word;
my @tests_singular;
my @tests_plural;
my @tests_classical_plural;
my @tests_is_singular;
my @tests_is_plural;
my %tested;
my %is_ambiguous_plural;

# Access noun data...
open my $noun_data_fh, '<', 'nouns.lei';

# Collect and categorize data...
INFLECTION:
while (my $line = readline $noun_data_fh) {

    # Skip noise...
    next if $line =~ $COMMENT_LINE_PAT
         || $line =~ $BLANK_LINE_PAT;

    # Extract data...
    my ($tag, $is_generic, $sing, $is_pure_generic, $pl1, undef, $pl2)
        = $line =~ $DATA_PAT
            or die "Unknown input: $line";

    # Fill in the blanks...
    $pl1 = $pl2 if $pl1 !~ /\S/;
    $tag //= q{};

    my ($sing_pat, $pl1_pat, $pl2_pat) = ($sing, $pl1, $pl2);

    # Build pattern tables...
    if ($is_generic) {
        # Convert generic marker to regex syntax ('*' -> /.*/, '-' -> /.+/)...
        $is_generic = $is_generic eq '*' ? '.*' : '.+';

        # Remove generic markers and optional restriction...
        my $restriction_sing = $sing =~ s{ ( \[.*?\] ) }{}xms ? $1 : q{};
        my $restriction_pl1  =  $pl1 =~ s{ ( \[.*?\] ) }{}xms ? $1 : q{};
        my $restriction_pl2  =  $pl2 && $pl2 =~ s{ ( \[.*?\] ) }{}xms ? $1 : q{};

        # Add the modern plural pattern data to the list of generics...
        push @{$modern_plural_of{pattern}},
             { from=>"($is_generic$restriction_sing)$sing", to => '${1}'.$pl1,  tag => $tag };

        push @{$singular_of{pattern}},
             { from=>"($is_generic$restriction_pl1)$pl1",  to => '${1}'.$sing, tag => $tag };

        # Add the classical plural pattern data (if any) to the list of generics...
        if ($pl2) {
            push @{$classical_plural_of{pattern}},
                 { from=>"($is_generic$restriction_sing)$sing", to => '${1}'.$pl2,  tag => $tag };

            push @{$singular_of{pattern}},
                 { from=>"($is_generic$restriction_pl2)$pl2",  to => '${1}'.$sing, tag => $tag };
        }
        else {
            push @{$classical_plural_of{pattern}},
                 { from=>"($is_generic$restriction_sing)$sing", to => '${1}'.$pl1,  tag => $tag };
        }
    }

    # Handle recursively inflected forms (usually prepositional suffixes)...
    elsif ($sing =~ m{ \(SING\) | \(PREP\) }xms) {{

        push @{$modern_plural_of{pattern}},
             { build_recursive({from=>$sing, to => $pl1, from_type=>'singular', to_type=>'modern_plural'}),  tag => $tag, };
        push @{$singular_of{pattern}},
             { build_recursive({from=>$pl1,  to => $sing, from_type=>'modern_plural', to_type=>'singular'}),      tag => $tag, };

        $pl2 //= $pl1;
        push @{$classical_plural_of{pattern}},
            { build_recursive({from=>$sing, to => $pl2, from_type=>'singular', to_type=>'classical_plural'}), tag => $tag, };

        push @{$singular_of{pattern}},
                { build_recursive({from=>$pl2,  to => $sing, from_type=>'classical_plural', to_type=>'singular'}),        tag => $tag, };
        
        redo if grep { defined && s/-/ /g } $sing, $pl1, $pl2;
        $is_pure_generic = 1;
    }}

    if (!$is_pure_generic) {{
        # If no explicit classical form, classical form is same as modern form...
        $pl2 ||= $pl1;

        # Build literal tables...
        $modern_plural_of{literal}{$sing}    //= $pl1;
        $classical_plural_of{literal}{$sing} //= $pl2;
        $singular_of{literal}{$pl1}          //= $sing;
        $singular_of{literal}{$pl2}          //= $sing;

        $singular_word{$sing} //= 1;
        $plural_word{$pl1}    //= 1;
        $plural_word{$pl2}    //= 1;

        # Allow hyphenated terms without the hyphens...
        if ($sing =~ s{-}{ }gxms) {
             $pl1 =~ s{-}{ }gxms;
             $pl2 =~ s{-}{ }gxms;
             redo;
        }
    }}
}

# Add conversions for possessives...
unshift @{$modern_plural_of{pattern}},
        { from=>q{(.*?)'s?}, to => q{${continue if !is_singular($1); my
        $plural = modern_plural_of($1); \($plural =~ /s$/ ? $plural.q{'} : $plural.q{'s})}},  tag => q{}, };

unshift @{$classical_plural_of{pattern}},
        { from=>q{(.*?)'s?}, to => q{${continue if !is_singular($1); my
        $plural = classical_plural_of($1); \($plural =~ /s$/ ? $plural.q{'} : $plural.q{'s})}},  tag => q{}, };

unshift @{$singular_of{pattern}},
        { from=>q{(.*?)'s?}, to => q{${continue if !is_plural($1); my $sing = convert_to_singular($1); \($sing =~ /s$/ ? $sing.q{'} : $sing.q{'s})}},  tag => q{}, };


# Emit standard header...
say '##########################################';
say '## NOTE: This module was autogenerated. ##';
say '## Contains no user-servicable parts!!! ##';
say '##########################################';
say '';
say 'package Lingua::EN::Inflexion::Nouns;';
say 'use 5.010; use strict; use warnings; use re q{eval};';
say '';

my ($sec,$min,$hour,$day,$mon,$year) = localtime;
say 'our $VERSION = ', sprintf("%04d%02d%02d.%02d%02d%02d", $year+1900, $mon+1, $day, $hour, $min, $sec), ';';
say '';

# Emit tables...
say 'my $modern_plural_of = ', dump($modern_plural_of{literal}) . ';';
say '';
say 'my $classical_plural_of = ', dump($classical_plural_of{literal}) . ';';
say '';
say 'my $singular_of = ', dump($singular_of{literal}) . ';';
say '';
say 'my $is_plural = {}; @{$is_plural}{values %{$modern_plural_of} } = ();';
say '                    @{$is_plural}{values %{$classical_plural_of} } = ();';
say '                    @{$is_plural}{keys %{$singular_of} } = ();';
say '';
say 'my $is_singular = {}; @{$is_singular}{values %{$singular_of} } = ();';
say '                      @{$is_singular}{keys %{$modern_plural_of} } = ();';
say '                      @{$is_singular}{keys %{$classical_plural_of} } = ();';
say '';

# Emit preposition pattern...
say 'my $PREP_PAT = qr{ about | above | across | after | among | around | at | athwart | before | behind | below | beneath | beside | besides | between | betwixt | beyond | but | by | during | except | for | from | in | into | near | of | off | on | onto | out | over | since | till | to | under | until | unto | upon | with }xms;';
say '';

# Emit pattern-based inflection subroutines...
emit_converter( 'modern_plural'   , $modern_plural_of{pattern}    );
say '';
emit_converter( 'classical_plural', $classical_plural_of{pattern} );
say '';
emit_converter( 'singular'        , $singular_of{pattern}         );
say '';

# Emit pattern-based recognition subroutines...
my $singulars_ref = [ @{$singular_of{pattern}} ];
my $plurals_ref   = [ @{$modern_plural_of{pattern}}, @{$classical_plural_of{pattern}} ];
emit_recognizer( 'plural',    'singular', $plurals_ref, $singulars_ref );
say '';
emit_recognizer( 'singular',  'plural',   $singulars_ref, $plurals_ref );
say '';

# Emit stupid trailing package terminator...
say '1;';


# Generate test files...
{
    # 1. is_singular()

    open my $test_file_fh, '>', 't/noun_is_singular.t';
    select $test_file_fh;

    say 'use Test::More;';
    say 'use Lingua::EN::Inflexion;';
    say '';

    for my $singular (keys %singular_word) {
        say "ok noun(q{$singular})->is_singular  => q{is_singular: '$singular'};";
    }

    say '';
    say 'done_testing();';
}
{
    # 2. is_plural()

    open my $test_file_fh, '>', 't/noun_is_plural.t';
    select $test_file_fh;

    say 'use Test::More;';
    say 'use Lingua::EN::Inflexion;';
    say '';
    
    for my $plural (keys %plural_word) {
        say "ok noun(q{$plural})->is_plural  => q{is_plural: '$plural'};";
    }

    say '';
    say 'done_testing();';
}
{
    # 3. plural()

    open my $test_file_fh, '>', 't/noun_plural.t';
    select $test_file_fh;

    say 'use Test::More;';
    say 'use Lingua::EN::Inflexion;';
    say '';

    for my $sing (keys %{$modern_plural_of{literal}}) {
        my $plural = $modern_plural_of{literal}{$sing};

        say "is noun(q{$sing})->plural,   q{$plural}  => q{plural: '$sing' --> '$plural'};";
        say "is noun(q{$plural})->plural, q{$plural}  => q{plural: '$plural' --> '$plural'};"
            if !$singular_word{$plural};
    }

    say '';
    say 'done_testing();';
}
{
    # 4. classical_plural()

    open my $test_file_fh, '>', 't/noun_classical_plural.t';
    select $test_file_fh;

    say 'use Test::More;';
    say 'use Lingua::EN::Inflexion;';
    say '';

    for my $sing (keys %{$classical_plural_of{literal}}) {
        my $plural = $classical_plural_of{literal}{$sing};

        say "is noun(q{$sing})->classical->plural,   q{$plural}  => q{classical plural: '$sing' --> '$plural'};";
        say "is noun(q{$plural})->classical->plural, q{$plural}  => q{classical plural: '$plural' --> '$plural'};"
            if !$singular_word{$plural};
    }

    say '';
    say 'done_testing();';
}
{
    # 5. singular()

    open my $test_file_fh, '>', 't/noun_singular.t';
    select $test_file_fh;

    say 'use Test::More;';
    say 'use Lingua::EN::Inflexion;';
    say '';

    for my $plural (keys %plural_word) {
        my $sing = $singular_of{literal}{$plural} // next;

        say "is noun(q{$plural})->singular, q{$sing}  => q{singular: '$plural' --> '$sing'};";
        say "is noun(q{$sing})->singular,   q{$sing}  => q{singular: '$sing' --> '$sing'};"
            if !$plural_word{$sing};
    }

    say '';
    say 'done_testing();';
}

#====[ Utility subs ]=====================================

sub emit_converter {
    my ($name, $replacement_suffixes) = @_;

    my $type             = $name =~ /plural/ ? 'plural'                  : 'singular';
    my $plus_extra_check = $name =~ /plural/ ? ' && !is_singular($word)' : '';

    # Generate subroutine structure...
    say "sub convert_to_${name} {";
    say '    my ($word) = @_;';
    say "    return \$${name}_of->{\$word}    if exists \$${name}_of->{\$word};";
    say "    return \$${name}_of->{lc \$word} if exists \$${name}_of->{lc \$word};";
    say "    return \$word                    if is_$type(\$word)$plus_extra_check;";
    say '    given ($word) {';

    # Generate each alternative replacement pattern...
    my %generated;
    for my $replacement_ref ( @{$replacement_suffixes} ) {
        my $recognizer = '        when (m{\A' . $replacement_ref->{from} . '$}i) { return "' . $replacement_ref->{to} .'"; }';
        next if $generated{$recognizer}++;
        say $recognizer;
    }

    # Subroutine defaults to failure...
    say '        default { return $_; }';
    say '    }';
    say '}';
}

sub emit_recognizer {
    my ($name, $compl_name, $negatives_ref, $positives_ref) = @_;

    # Generate subroutine structure...
    say "sub is_${name} {";
    say '    my ($word) = @_;';
    say "    return 1 if exists \$is_${name}\->{\$word};";
    say "    return 1 if exists \$is_${name}\->{lc \$word};";
    say "    return 0 if exists \$is_${compl_name}\->{\$word};";
    say "    return 0 if exists \$is_${compl_name}\->{lc \$word};";
    say '    given ($word) {';

    # Generate each alternative success pattern...
    my %generated;
    for my $replacement_ref ( @{$positives_ref} ) {
        # Ignore patterns that are marked non-indicative...
        next if $replacement_ref->{tag} eq 'nonindicative';

        # Otherwise, use them...
        my $recognizer = '        when (m{\A' . $replacement_ref->{from}
                       . '$}i) { '.($replacement_ref->{conditional}//q{}).'return 1 }';
        next if $generated{$recognizer}++;
        say $recognizer;

    }

    # Subroutine defaults to failure...
    if ($name eq 'singular') {
        say '        default { return !is_plural($word); }';
    }
    else {
        say '        default { return $word =~ m{ s \Z }xms; }';
    }
    say '    }';
    say '}';
}

sub build_recursive {
    my ($from, $to, $from_type, $to_type) = @{shift()}{'from', 'to', 'from_type', 'to_type'};

    # Replace plurality placeholders with patterns (remembering plurality constraints)...
    my @replacements;
    my $conditional = q{};
    $from =~ s{
          (?<star>  \*        )
        | (?<sing>  \(SING\)  )
        | (?<plur>  \(PL\)    )
        | (?<prep>  \(PREP\)  )
    }{
        my $cap_var = '$' . (@replacements + 1);
        if ($+{star}) {
            push @replacements, $cap_var;
            '(.*?)';
        }
        elsif ($+{sing}) {
            push @replacements, "\${is_singular($cap_var) ? \\convert_to_${to_type}($cap_var) : \\$cap_var}";
            $conditional = "continue if !is_singular($cap_var);";
            q{(.*?)};
        }
        elsif ($+{plur}) {
            push @replacements, "\${is_plural($cap_var) ? \\convert_to_${to_type}($cap_var) : \\$cap_var}";
            $conditional = "continue if !is_plural($cap_var);";
            q{(.*?)};
        }
        elsif ($+{prep}) {
            push @replacements, $cap_var;
            '($PREP_PAT)';
        }
    }gexms;

    # Replace plurality placeholders with converters (inserting remembered constraints)...
    $to =~ s{  \*  | \(SING\)  |  \(PL\)  |  \(PREP\)  }
            { shift @replacements; }gexms;

    return (from=>$from, to=>$to, conditional => $conditional);
}