#!/usr/bin/perl
#-*-perl-*-
#
# USAGE: sta2moses sta-align-file.xml
#
# simple script to get the aligned sentences from Stockholm Treealigner files
#
use strict;
use FindBin;
use lib $FindBin::Bin.'/../lib';
use Lingua::Align::Corpus::Parallel::STA;
use File::Basename;
# Path of the Stockholm TreeAligner alignment file (first command-line argument).
my $algfile = shift(@ARGV);
defined $algfile
    or die "USAGE: sta2moses sta-align-file.xml\n";

my $corpus = Lingua::Align::Corpus::Parallel::STA->new(-alignfile => $algfile);

# Output files are named after the alignment file with its directory part and
# a trailing '.xml' removed, so they are created in the current directory.
# basename() only strips the suffix at the very end of the name.
my $base    = basename($algfile, '.xml');
my $srcfile = $base . '.src';
my $trgfile = $base . '.trg';

# Three-argument open with low-precedence 'or', so that die() applies to the
# open() call itself rather than to the file name argument.
open(my $src_fh, '>:encoding(utf8)', $srcfile)
    or die "cannot open $srcfile: $!\n";
open(my $trg_fh, '>:encoding(utf8)', $trgfile)
    or die "cannot open $trgfile: $!\n";

my %srctree = ();
my %trgtree = ();
my $links;

# For every aligned tree pair write the lowercased, space-separated leaves of
# the source and target tree, one sentence per line, to the two output files.
while ($corpus->next_alignment(\%srctree, \%trgtree, \$links)) {
    print {$src_fh} lc(join(' ', $corpus->{SRC}->get_all_leafs(\%srctree))), "\n";
    print {$trg_fh} lc(join(' ', $corpus->{TRG}->get_all_leafs(\%trgtree))), "\n";
}

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($src_fh) or die "cannot close $srcfile: $!\n";
close($trg_fh) or die "cannot close $trgfile: $!\n";
__END__
=head1 NAME
sta2moses - convert from Stockholm Tree Aligner format to Moses/GIZA++ (plain text)
=head1 SYNOPSIS
sta2moses alignments.xml
=head1 DESCRIPTION
This script reads through a parallel treebank using the tree alignment file (F<alignments.xml>) and produces sentence-aligned plain text files (to be used with Moses/GIZA++). The text is lowercased, and the corpus is stored in F<alignments.src> and F<alignments.trg> in the current directory.
=head1 SEE ALSO
L<Lingua::Align::Corpus>
=head1 AUTHOR
Joerg Tiedemann
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2009 by Joerg Tiedemann
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.
=cut