The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#-*-perl-*-
#
# USAGE: sta2moses sta-align-file.xml
#
# simple script to get the aligned sentences from Stockholm Treealigner files
#

use strict;
use FindBin;
use lib $FindBin::Bin.'/../lib';
use Lingua::Align::Corpus::Parallel::STA;
use File::Basename;

# Command-line argument: path to the Stockholm TreeAligner alignment file.
my $algfile = shift(@ARGV);
defined $algfile
    or die "USAGE: sta2moses sta-align-file.xml\n";

# Corpus reader driven by the alignment file; it is iterated below with
# next_alignment() to obtain one aligned source/target tree pair at a time.
my $corpus = Lingua::Align::Corpus::Parallel::STA->new(-alignfile => $algfile);

# Output files are named after the alignment file with its directory part
# and '.xml' removed, so they are created in the current working directory.
my $base = basename($algfile);
$base =~ s/\.xml//;

my $srcfile = $base.'.src';
my $trgfile = $base.'.trg';

# Three-argument open with lexical handles, so that characters in the file
# name can never be interpreted as part of the open mode.  The check uses
# low-precedence 'or': with '||' the die would bind to the (always true)
# file name string and a failed open would go unnoticed.
open(my $src_fh, '>:encoding(utf8)', $srcfile)
    or die "cannot open $srcfile: $!\n";
open(my $trg_fh, '>:encoding(utf8)', $trgfile)
    or die "cannot open $trgfile: $!\n";

# Containers filled in by next_alignment() on every iteration.
my %srctree = ();
my %trgtree = ();
my $links;    # not used here, but passed by reference to next_alignment()

# Write one line per aligned pair: the lowercased leaves of the source tree
# go to the .src file and those of the target tree to the .trg file, so that
# line N of both files belongs to the same alignment unit.
# NOTE(review): get_all_leafs() presumably returns the terminal tokens in
# sentence order -- confirm against Lingua::Align::Corpus.
while ($corpus->next_alignment(\%srctree, \%trgtree, \$links)) {
    my $srcline = lc(join(' ', $corpus->{SRC}->get_all_leafs(\%srctree)));
    my $trgline = lc(join(' ', $corpus->{TRG}->get_all_leafs(\%trgtree)));
    print {$src_fh} $srcline, "\n";
    print {$trg_fh} $trgline, "\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close has to be checked for output files.
close($src_fh) or die "cannot close $srcfile: $!\n";
close($trg_fh) or die "cannot close $trgfile: $!\n";



__END__

=head1 NAME

sta2moses - convert from Stockholm Tree Aligner format to Moses/GIZA++ (plain text)

=head1 SYNOPSIS

    sta2moses alignments.xml

=head1 DESCRIPTION

This script reads through a parallel treebank using the tree alignment file (alignments.xml) and produces sentence-aligned, lowercased plain text files (to be used with Moses/GIZA++). The corpus will be stored in F<alignments.src> and F<alignments.trg> in the current working directory.


=head1 SEE ALSO

L<Lingua::Align::Corpus>
 

=head1 AUTHOR

Joerg Tiedemann

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.


=cut