/* -*- Mode: C; c-file-style: "stroustrup" -*- */ /* NATools - Package with parallel corpora tools * Copyright (C) 1998-2001 Djoerd Hiemstra * Copyright (C) 2002-2012 Alberto Simões * * This package is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include #include #include "standard.h" #include #include "matrix.h" /** * @file * @brief Allocated the sparse matrix with words co-occurrences */ /* #define SAVE_DOTS 1 */ static nat_boolean_t load_exc_words(nat_uint32_t nr, char *buffer, char* file) { FILE *fd; nat_uint32_t id; fd = fopen(file, "rb"); if (!fd) return FALSE; do { if (fread(&id, sizeof(nat_uint32_t), 1, fd)) { if (id >= nr) return FALSE; buffer[id] = 1; } } while(!feof(fd)); return TRUE; } static Matrix* InitialEstimate(nat_boolean_t quiet, nat_uint32_t Nrow, nat_uint32_t Ncolumn, Corpus *corpus1, Corpus *corpus2, char *excWrds1, char *excWrds2) { #ifdef SAVE_DOTS FILE *dots_fd; #endif Matrix *matrix; unsigned long cSentence, nSentences; unsigned long r, c, l; int jjdoneR, jjdoneC; CorpusCell *s1, *s2, *sen2; if (!quiet) fprintf(stderr, "\nAllocating the sparse matrix (%d x %d): ", Nrow, Ncolumn); /* Alloc matrix */ matrix = AllocMatrix(Nrow, Ncolumn); if (!matrix) report_error("InitialEstimate: AllocMatrix failed"); /* prepare variables for percent counting */ nSentences = corpus_sentences_nr(corpus1); cSentence = 0; s1 = corpus_first_sentence(corpus1); s2 = sen2 = corpus_first_sentence(corpus2); #ifdef SAVE_DOTS dots_fd = fopen("__dots__", "w"); if (!dots_fd) report_error("cannot open __dots__ file"); #endif while (s1 != NULL && s2 != NULL) { /* print percentage information */ if (!quiet) fprintf(stderr, "\b\b\b\b\b%4.1f%%", (float) (cSentence++) * 99.9f / (float) nSentences); l = max(corpus_sentence_length(s1), corpus_sentence_length(s2)); if (l <= MAXLEN) { jjdoneR = 0; for(r = 1; r <= l && !jjdoneR ; r++) { if (!(excWrds1 && s1->word && excWrds1[s1->word])) { jjdoneC = 0; for(c = 1; c <= l && !jjdoneC ; c++) { if (excWrds2 && s2->word && excWrds2[s2->word]) { ++s2; } else { if (s1->word && s2->word) { if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, s1->word, s2->word)) report_error("InitialEstimate: IncValue failed"); #ifdef SAVE_DOTS fprintf(dots_fd, "%d %d\n", s1->word, s2->word); #endif ++s2; } else { if (s1->word == 0) { if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, NULLWORD, s2->word)) report_error("InitialEstimate: IncValue failed"); #ifdef SAVE_DOTS fprintf(dots_fd, "0 %d\n", s2->word); #endif ++s2; jjdoneR=1; } else { if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, s1->word, NULLWORD)) report_error("InitialEstimate: IncValue failed"); #ifdef SAVE_DOTS fprintf(dots_fd, "%d 0\n", s1->word); #endif jjdoneC=1; } } } } } if (s1->word) s1++; s2 = sen2; } } s1 = corpus_next_sentence(corpus1); s2 = sen2 = corpus_next_sentence(corpus2); } #ifdef SAVE_DOTS fclose(dots_fd); #endif if (s1 != NULL || s2 != NULL) report_error("InitialEstimate: failed to evaluate all sentences"); if (!quiet) fprintf(stderr, "\b\b\b\b\b\b done \n"); return matrix; } void show_help () { printf("Usage:\n" " nat-initmat [-q] corpusFile1 corpusFile2 matFile\n" " nat-initmat [-q] corpusFile1 corpusFile2 excludeWrds1 excludeWrds2 matFile\n"); printf("Supported options:\n" " -h shows this help message and exits\n" " -V shows "PACKAGE" version and exits\n" " -q activates quiet mode\n" "Check nat-initmat manpage for details.\n"); } /** * @brief The main function * * @todo Document this */ int main(int argc, char **argv) { char *excWrds1 = NULL; char *excWrds2 = NULL; char *matFile; Corpus *corpus1, *corpus2; Matrix *matrix; nat_uint32_t total1, total2; nat_boolean_t quiet = FALSE; // extern char *optarg; extern int optind; int c; while ((c = getopt(argc, argv, "hqV")) != EOF) { switch (c) { case 'h': show_help(); return 0; case 'V': printf(PACKAGE " version " VERSION "\n"); return 0; case 'q': quiet = TRUE; break; default: show_help(); return 1; } } if (argc != optind + 3 && argc != optind + 5) { printf("nat-initmat: wrong number of arguments\n"); show_help(); return 1; } corpus1 = corpus_new(); corpus2 = corpus_new(); corpus_load(corpus1, argv[optind + 0]); corpus_load(corpus2, argv[optind + 1]); if (corpus_sentences_nr(corpus1) != corpus_sentences_nr(corpus2)) report_error("initmat.c: lengths do not match"); /* total1 and total2 are number of words (??) */ total1 = corpus_diff_words_nr(corpus1); total2 = corpus_diff_words_nr(corpus2); if (argc == 6) { excWrds1 = g_new0(char, total1 + 1); if (!load_exc_words(total1, excWrds1, argv[optind + 2])) report_error("initmat.c: error loading excludeWrds1"); excWrds2 = g_new0(char, total2 + 1); if (!load_exc_words(total2, excWrds2, argv[optind + 3])) report_error("initmat.c: error loading excludeWrds2"); matFile = argv[optind + 4]; } else { matFile = argv[optind + 2]; } matrix = InitialEstimate(quiet, total1, total2, corpus1, corpus2, excWrds1, excWrds2); if (argc == optind + 5) { g_free(excWrds1); g_free(excWrds2); } if (SaveMatrix(matrix, matFile)) report_error("SaveMatrix"); /* fprintf(stderr, "Matrix total after initial estimate:%9.2f\n", MatrixTotal(matrix, Matrix1)); */ if (!quiet) { fprintf(stderr, "Memory used:%10.1f kb\n\n", (float) BytesInUse(matrix) / 1024.0f); } corpus_free(corpus1); corpus_free(corpus2); FreeMatrix(matrix); return 0; }