# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Regenerates source/snowball_stoplists.c inside LUCY_SNOWSTOP_DIR from a
# Subversion checkout of the Snowball project.
#
# Usage: perl update_snowstop.pl SNOWBALL_SVN_CO LUCY_SNOWSTOP_DIR
#
#   SNOWBALL_SVN_CO   - path to an existing Snowball svn working copy; it is
#                       updated in place to a fixed revision.
#   LUCY_SNOWSTOP_DIR - directory that contains the 'source' subdirectory
#                       into which the generated C file is written.

use strict;
use warnings;

use File::Spec::Functions qw( catfile catdir );
use Encode qw( encode );
use Text::Wrap qw( wrap );

# Don't use tabs. Wrap at 78 columns.
$Text::Wrap::unexpand = 0;
$Text::Wrap::columns  = 78;

if ( @ARGV != 2 ) {
    die "Usage: perl update_snowstop.pl SNOWBALL_SVN_CO LUCY_SNOWSTOP_DIR";
}
my ( $snow_co_dir, $dest_dir ) = @ARGV;

# Update to a particular rev of the Snowball repository.
die("Not a directory: '$snow_co_dir'") unless -d $snow_co_dir;
my $retval = system( "svn", "update", "-r", "541", $snow_co_dir );

# system() returns the raw wait status, not the exit code.  Checking only
# the exit-code byte ($retval >> 8) would miss svn being killed by a signal,
# so decode all three failure modes before trusting the checkout.
if ( $retval == -1 ) {
    die "svn update failed: could not run svn: $!";
}
elsif ( $retval & 127 ) {
    die sprintf( "svn update failed: svn died with signal %d",
        $retval & 127 );
}
elsif ( $retval >> 8 ) {
    die sprintf( "svn update failed: svn exited with status %d",
        $retval >> 8 );
}

# Open destination C file and print start of file.
my $outpath = catfile( $dest_dir, 'source', 'snowball_stoplists.c' );
open( my $out_fh, '>', $outpath ) or die "Can't open '$outpath': $!";

# The heredoc below is single-quoted, so it is emitted verbatim with no
# interpolation.
print $out_fh <<'END_STUFF';
/* Auto-generated file -- DO NOT EDIT!
 *
 * The words in this file are taken from stoplists provided by the Snowball
 * project.
*/ #include "Lucy/Analysis/SnowballStopFilter.h" END_STUFF my %languages = ( da => "danish", de => "german", en => "english", es => "spanish", fi => "finnish", fr => "french", hu => "hungarian", it => "italian", nl => "dutch", no => "norwegian", pt => "portuguese", ru => "russian", sv => "swedish", ); for my $iso ( sort keys %languages ) { my $language = $languages{$iso}; # Grab stoplists from Snowball source files. my $stop_path = "$snow_co_dir/website/algorithms/$language/stop.txt"; my $source_enc = $iso eq 'ru' ? 'koi8-r' : 'iso-8859-1'; open( my $stopfile_fh, "<:encoding($source_enc)", $stop_path ) or die "Couldn't open file '$stop_path': $!"; my @words; while ( defined( my $line = <$stopfile_fh> ) ) { $line =~ s/\|.*//g; next unless length($line); push @words, split( /\s+/, $line ); } # Encode as UTF-8, change all non-ASCII bytes to octal escapes, and format # as C string literals. my @escaped = map { '"' . encode( 'UTF-8', $_ ) . '"' } @words; s/([\x80-\xFF])/octal_escape($1)/ge for @escaped; # Wrap text and print to outfile. my $joined = join( ', ', @escaped, 'NULL' ); my $wrapped = wrap( ' ', ' ', $joined ); print $out_fh <