/* -*- Mode: C; c-file-style: "stroustrup" -*- */ /** * @file * @brief routines to manage the higher-level aspects of spell-checking * * Copyright 1983, by Pace Willisson * Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA * Copyright 1994-2006 Ulisses Pinto & José João Almeida & Alberto Simões * Projecto Natura, Universidade do Minho */ #include #include #include "jsconfig.h" #include "jspell.h" #include "proto.h" #include "msgs.h" #include "version.h" #include "good.h" void checkfile(void); int casecmp(char *a, char *b, int canonical); void makepossibilities(ichar_t *word); int compoundgood(ichar_t *word); int ins_root_cap(ichar_t *word, ichar_t *pattern, int prestrip, int preadd, int sufstrip, int sufadd, struct dent *firstdent, struct flagent *pfxent, struct flagent *sufent); void askmode(void); /*---------------------------------------------------------------------------*/ void printhelp(register FILE *helpout) /* File to write help to */ { fprintf(helpout, CORR_C_HELP_1); fprintf(helpout, CORR_C_HELP_2); fprintf(helpout, CORR_C_HELP_3); fprintf(helpout, CORR_C_HELP_4); fprintf(helpout, CORR_C_HELP_5); fprintf(helpout, CORR_C_HELP_6); fprintf(helpout, CORR_C_HELP_7); fprintf(helpout, CORR_C_HELP_8); fprintf(helpout, CORR_C_HELP_9); fprintf(helpout, CORR_C_HELP_COMMANDS); fprintf(helpout, CORR_C_HELP_R_CMD); fprintf(helpout, CORR_C_HELP_E_CMD); fprintf(helpout, CORR_C_HELP_BLANK); fprintf(helpout, CORR_C_HELP_A_CMD); fprintf(helpout, CORR_C_HELP_I_CMD); fprintf(helpout, CORR_C_HELP_U_CMD); fprintf(helpout, CORR_C_HELP_0_CMD); fprintf(helpout, CORR_C_HELP_L_CMD); fprintf(helpout, CORR_C_HELP_X_CMD); fprintf(helpout, CORR_C_HELP_Q_CMD); fprintf(helpout, CORR_C_HELP_BANG); fprintf(helpout, CORR_C_HELP_REDRAW); fprintf(helpout, CORR_C_HELP_SUSPEND); fprintf(helpout, CORR_C_HELP_HELP); } /*---------------------------------------------------------------------------*/ /* checkfile */ /*---------------------------------------------------------------------------*/ static void init_zero_contextbufs() { int bufno; for (bufno = 0; bufno < contextsize; bufno++) contextbufs[bufno][0] = '\0'; } /*---------------------------------------------------------------------------*/ static void move_contextbufs_down() { int bufno; for (bufno = contextsize; --bufno > 0; ) strcpy(contextbufs[bufno], contextbufs[bufno - 1]); } /*---------------------------------------------------------------------------*/ void checkfile(void) { int bufsize, ch; init_zero_contextbufs(); for ( ; ; ) { move_contextbufs_down(); if (quit) { /* quit can't be set in l mode */ while (fgets(contextbufs[0], sizeof contextbufs[0], infile) != NULL) fputs(contextbufs[0], outfile); break; } /* * Only read in enough characters to fill half this buffer so that any * corrections we make are not likely to cause an overflow. */ if (fgets(contextbufs[0], (sizeof contextbufs[0]) / 2, infile) == NULL) break; /* * If we didn't read to end-of-line, we may have ended the * buffer in the middle of a word. So keep reading until we * see some sort of character that can't possibly be part of a * word. (or until the buffer is full, which fortunately isn't * all that likely). */ bufsize = strlen(contextbufs[0]); if (bufsize == (sizeof contextbufs[0]) / 2 - 1) { ch = (unsigned char) contextbufs[0][bufsize - 1]; while (bufsize < sizeof contextbufs[0] - 1 && (iswordch((ichar_t) ch) || isboundarych((ichar_t) ch) || isstringstart(ch))) { ch = getc(infile); if (ch == EOF) break; contextbufs[0][bufsize++] = (char) ch; contextbufs[0][bufsize] = '\0'; } } checkline(outfile); } } /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ static int posscmp(char *a, char *b) { return casecmp(a, b, 0); } /*---------------------------------------------------------------------------*/ int casecmp(char *a, char *b, int canonical) /* int canonical - NZ for canonical string chars */ { register ichar_t *ap, *bp; ichar_t inta[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; ichar_t intb[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; strtoichar(inta, a, sizeof inta, canonical); strtoichar(intb, b, sizeof intb, canonical); for (ap = inta, bp = intb; *ap != 0; ap++, bp++) { if (*ap != *bp) { if (*bp == '\0') return hashheader.sortorder[*ap]; else if (mylower(*ap)) { if (mylower(*bp) || mytoupper(*ap) != *bp) return (int) hashheader.sortorder[*ap] - (int) hashheader.sortorder[*bp]; } else { if (myupper(*bp) || mytolower(*ap) != *bp) return (int) hashheader.sortorder[*ap] - (int) hashheader.sortorder[*bp]; } } } if (*bp != '\0') return -(int) hashheader.sortorder[*bp]; for (ap = inta, bp = intb; *ap; ap++, bp++) { if (*ap != *bp) { return (int) hashheader.sortorder[*ap] - (int) hashheader.sortorder[*bp]; } } return 0; } /*---------------------------------------------------------------------------*/ /* makepossibilities */ /*---------------------------------------------------------------------------*/ static int insert(register ichar_t *word) { register int i; register char *realword; realword = ichartosstr(word, 0); for (i = 0; i < pcount; i++) { if (strcmp(possibilities[i], realword) == 0) return 0; } strcpy(possibilities[pcount++], realword); i = strlen(realword); if (i > maxposslen) maxposslen = i; if (pcount >= MAXPOSSIBLE) return -1; else return 0; } /*---------------------------------------------------------------------------*/ /* Insert one or more correctly capitalized versions of word */ static int ins_cap(ichar_t *word, ichar_t *pattern) { int prestrip, preadd, sufstrip, sufadd, hitno; if (*word == 0) return 0; for (hitno = numhits; --hitno >= 0; ) { if (hits[hitno].prefix) { prestrip = hits[hitno].prefix->stripl; preadd = hits[hitno].prefix->affl; } else prestrip = preadd = 0; if (hits[hitno].suffix) { sufstrip = hits[hitno].suffix->stripl; sufadd = hits[hitno].suffix->affl; } else sufadd = sufstrip = 0; if (ins_root_cap(word, pattern, prestrip, preadd, sufstrip, sufadd, hits[hitno].dictent, hits[hitno].prefix, hits[hitno].suffix) < 0) return -1; } return 0; } /*---------------------------------------------------------------------------*/ #ifndef NO_CAPITALIZATION_SUPPORT static void wrongcapital(register ichar_t *word) { ichar_t newword[MAXWLEN]; /* When the third parameter to "good" is nonzero, it ignores case. If the word matches this way, "ins_cap" will recapitalize it correctly. */ if (bgood(word, 0, 1, 1)) { icharcpy(newword, word); upcase(newword); ins_cap(newword, word); } } #endif /*---------------------------------------------------------------------------*/ static void wrongletter(register ichar_t *word) { register int i, j, n; ichar_t savechar; ichar_t newword[MAXWLEN]; n = icharlen(word); icharcpy(newword, word); #ifndef NO_CAPITALIZATION_SUPPORT upcase(newword); #endif for (i = 0; i < n; i++) { savechar = newword[i]; for (j = 0; j < Trynum; ++j) { if (Try[j] == savechar) continue; newword[i] = Try[j]; if (bgood(newword, 0, 1, 1)) { if (ins_cap(newword, word) < 0) return; } } newword[i] = savechar; } } /*---------------------------------------------------------------------------*/ static void extraletter(register ichar_t *word) { ichar_t newword[MAXWLEN]; register ichar_t *p, *r; if (icharlen(word) < 2) return; icharcpy(newword, word + 1); for (p = word, r = newword; *p != 0; ) { if (bgood(newword, 0, 1, 1)) { if (ins_cap(newword, word) < 0) return; } *r++ = *p++; } } /*---------------------------------------------------------------------------*/ static void missingletter(ichar_t *word) { ichar_t newword[MAXWLEN + 1]; register ichar_t *p, *r; register int i; icharcpy(newword + 1, word); for (p = word, r = newword; *p != 0; ) { /* for each char. in the word */ for (i = 0; i < Trynum; i++) { /* try all possible chars */ *r = Try[i]; if (bgood(newword, 0, 1, 1)) { /* the new word exists in dictionary */ if (ins_cap(newword, word) < 0) return; } } *r++ = *p++; } for (i = 0; i < Trynum; i++) { *r = Try[i]; if (bgood(newword, 0, 1, 1)) { if (ins_cap(newword, word) < 0) return; } } } /*---------------------------------------------------------------------------*/ static void missingspace(ichar_t *word) { ichar_t newword[MAXWLEN + 1]; register ichar_t *p, savech; /* ** We don't do words of length less than 3; this keeps us from ** splitting all two-letter words into two single letters. ** Also, we just duplicate the existing capitalizations, rather ** than try to reconstruct both, which would require a smarter ** version of ins_cap. */ if (word[0] == 0 || word[1] == 0 || word[2] == 0) return; icharcpy(newword, word); for (p = newword + 1; *p != 0; p++) { savech = *p; *p = 0; if (bgood(newword, 0, 1, 1)) { /* left word correct */ *p = savech; if (bgood(p, 0, 1, 1)) { /* right word correct */ *p = ' '; icharcpy(p + 1, word + (p - newword)); if (insert(newword) < 0) return; *p = '-'; if (insert(newword) < 0) return; icharcpy(p, word + (p - newword)); } } *p = savech; } } /*---------------------------------------------------------------------------*/ static void transposedletter(register ichar_t *word) { ichar_t newword[MAXWLEN]; register ichar_t *p, temp; icharcpy(newword, word); for (p = newword; p[1] != 0; p++) { temp = *p; *p = p[1]; p[1] = temp; if (bgood(newword, 0, 1, 1)) { if (ins_cap(newword, word) < 0) return; } temp = *p; *p = p[1]; p[1] = temp; } } /*---------------------------------------------------------------------------*/ static void tryveryhard(ichar_t *word) { bgood(word, 1, 0, 1); /* the second parameter is 1 to ignoreflagbits */ } /*---------------------------------------------------------------------------*/ void makepossibilities(register ichar_t *word) { register int i; for (i = 0; i < MAXPOSSIBLE; i++) possibilities[i][0] = 0; pcount = 0; maxposslen = 0; easypossibilities = 0; /* number of possiblities using 4 usual errors */ my_poss_count = 0; #ifndef NO_CAPITALIZATION_SUPPORT wrongcapital(word); #endif /* * according to Pollock and Zamora, CACM April 1984 (V. 27, No. 4), * page 363, the correct order for this is: * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION * thus, it was exactly backwards in the old version. -- PWP */ if (!yflag) { /* not supressing typing errors */ if (pcount < MAXPOSSIBLE) missingletter(word); /* omission */ if (pcount < MAXPOSSIBLE) transposedletter(word); /* transposition */ if (pcount < MAXPOSSIBLE) extraletter(word); /* insertion */ if (pcount < MAXPOSSIBLE) wrongletter(word); /* substitution */ if (missingspaceflag && pcount < MAXPOSSIBLE && !aflag) missingspace(word); /* two words */ } easypossibilities = pcount; if (tryhardflag) tryveryhard(word); if ((sortit || (pcount > easypossibilities)) && pcount) { if (easypossibilities > 0 && sortit) qsort((char *) possibilities, (unsigned) easypossibilities, sizeof(possibilities[0]), (int (*) (const void *, const void *)) posscmp); if (pcount > easypossibilities) qsort((char *) &possibilities[easypossibilities][0], (unsigned) (pcount - easypossibilities), sizeof (possibilities[0]), (int (*) (const void *, const void *)) posscmp); } } /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ int compoundgood(ichar_t *word) { ichar_t newword[MAXWLEN]; register ichar_t *p, savech; long secondcap; /* Capitalization of 2nd half */ /* ** If missingspaceflag is set, compound words are never ok. */ if (missingspaceflag) return 0; /* ** Test for a possible compound word (for languages like German that ** form lots of compounds). ** ** This is similar to missingspace, except we quit on the first hit, ** and we won't allow either member of the compound to be a single ** letter. ** ** We don't do words of length less than 2 * compoundmin, since ** both halves must at least compoundmin letters. */ if (icharlen(word) < 2 * hashheader.compoundmin) return 0; icharcpy(newword, word); p = newword + hashheader.compoundmin; for ( ; p[hashheader.compoundmin - 1] != 0; p++) { savech = *p; *p = 0; if (bgood(newword, 0, 0, 0)) { *p = savech; if (bgood(p, 0, 1, 0)) { /* Accept any case variant in 2nd */ secondcap = whatcap(p); switch (whatcap(newword)) { case ANYCASE: case CAPITALIZED: case FOLLOWCASE: /* Followcase can have l.c. suffix */ return secondcap == ANYCASE; case ALLCAPS: return secondcap == ALLCAPS; } } } else *p = savech; } return 0; } /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ /* ARGSUSED */ int ins_root_cap(ichar_t *word, ichar_t *pattern, int prestrip, int preadd, int sufstrip, int sufadd, struct dent *firstdent, struct flagent *pfxent, struct flagent *sufent) { #ifndef NO_CAPITALIZATION_SUPPORT register struct dent * dent; #endif /* NO_CAPITALIZATION_SUPPORT */ int firstisupper; ichar_t newword[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; #ifndef NO_CAPITALIZATION_SUPPORT register ichar_t *p; int len, i, limit; #endif /* NO_CAPITALIZATION_SUPPORT */ icharcpy(newword, word); firstisupper = myupper(pattern[0]); #ifdef NO_CAPITALIZATION_SUPPORT /* ** Apply the old, simple-minded capitalization rules. */ if (firstisupper) { if (myupper(pattern[1])) upcase(newword); else { lowcase(newword); newword[0] = mytoupper(newword[0]); } } else lowcase(newword); return insert(newword); #else /* NO_CAPITALIZATION_SUPPORT */ #define flagsareok(dent) \ ((pfxent == NULL || TSTMASKBIT(dent->mask, pfxent->flagbit)) \ && (sufent == NULL || TSTMASKBIT(dent->mask, sufent->flagbit))) dent = firstdent; if ((dent->flagfield & (CAPTYPEMASK | MOREVARIANTS)) == ALLCAPS) { upcase(newword); /* Uppercase required */ return insert(newword); } for (p = pattern; *p; p++) { if (mylower(*p)) break; } if (*p == 0) { upcase(newword); /* Pattern was all caps */ return insert(newword); } for (p = pattern + 1; *p; p++) { if (myupper(*p)) break; } if (*p == 0) { /* ** The pattern was all-lower or capitalized. If that's ** legal, insert only that version. */ if (firstisupper) { if (captype(dent->flagfield) == CAPITALIZED || captype(dent->flagfield) == ANYCASE) { lowcase(newword); newword[0] = mytoupper(newword[0]); return insert(newword); } } else { if (captype(dent->flagfield) == ANYCASE) { lowcase(newword); return insert(newword); } } while (dent->flagfield & MOREVARIANTS) { dent = dent->next; if (captype(dent->flagfield) == FOLLOWCASE || !flagsareok(dent)) continue; if (firstisupper) { if (captype(dent->flagfield) == CAPITALIZED) { lowcase(newword); newword[0] = mytoupper(newword[0]); return insert(newword); } } else { if (captype (dent->flagfield) == ANYCASE) { lowcase(newword); return insert(newword); } } } } /* ** Either the sample had complex capitalization, or the simple ** capitalizations (all-lower or capitalized) are illegal. ** Insert all legal capitalizations, including those that are ** all-lower or capitalized. If the prototype is capitalized, ** capitalized all-lower samples. Watch out for affixes. */ dent = firstdent; p = strtosichar(dent->word, 1); len = icharlen(p); if (dent->flagfield & MOREVARIANTS) dent = dent->next; /* Skip place-holder entry */ for ( ; ; ) { if (flagsareok(dent)) { if (captype(dent->flagfield) != FOLLOWCASE) { lowcase(newword); if (firstisupper || captype(dent->flagfield) == CAPITALIZED) newword[0] = mytoupper(newword[0]); if (insert(newword) < 0) return -1; } else { /* Followcase is the tough one. */ p = strtosichar(dent->word, 1); bcopy((char *) (p + prestrip), (char *) (newword + preadd), (len - prestrip - sufstrip) * sizeof(ichar_t)); if (myupper(p[prestrip])) { for (i = 0; i < preadd; i++) newword[i] = mytoupper(newword[i]); } else { for (i = 0; i < preadd; i++) newword[i] = mytolower(newword[i]); } limit = len + preadd + sufadd - prestrip - sufstrip; i = len + preadd - prestrip - sufstrip; p += len - sufstrip - 1; if (myupper(*p)) { for (p = newword + i; i < limit; i++, p++) *p = mytoupper(*p); } else { for (p = newword + i; i < limit; i++, p++) *p = mytolower(*p); } if (insert(newword) < 0) return -1; } } if ((dent->flagfield & MOREVARIANTS) == 0) break; /* End of the line */ dent = dent->next; } return 0; #endif /* NO_CAPITALIZATION_SUPPORT */ } static void save_pers_dic() { /* this is also part of the library */ treeoutput(); math_mode = 0; LaTeX_Mode = 'P'; } /*---------------------------------------------------------------------------*/ /* * */ static void init_modes(char *strg) { int i; for (i = 0; i < strlen(strg); i++) { switch(strg[i]) { case 'g': gflag = 1; break; case 'G': /* default */ gflag = 0; break; case 'P': tryhardflag = 0; break; case 'm': tryhardflag = 1; break; case 'y': yflag = 1; break; case 'Y': /* default */ yflag = 0; break; case 'z': showflags = 1; break; case 'Z': /* default */ showflags = 0; break; } } } /*---------------------------------------------------------------------------*/ void askmode() { register char *cp1, *cp2; ichar_t *itok; /* Ichar version of current word */ if (fflag) { if (freopen(askfilename, "w", stdout) == NULL) { fprintf(stderr, CANT_CREATE, askfilename); exit(1); } } printf("%s\n", Version_ID[0]); while (fflush(stdout), xgets(contextbufs[0], sizeof contextbufs[0], stdin) != NULL) { /* ** *line is like `i', ** @line is like `a', ** &line is like 'u' ** $... init options do Ulisses ** $"... jj ** `#' is like `Q' (writes personal dictionary) ** `+' sets tflag, ** `-' clears tflag ** `!' sets terse mode, ** `%' clears terse ** `~' followed by a filename sets parameters according to file name ** `^' causes rest of line to be checked after stripping 1st char */ if (contextbufs[0][0] == '*' || contextbufs[0][0] == '@') treeinsert(ichartosstr(strtosichar(contextbufs[0] + 1, 0), 1), ICHARTOSSTR_SIZE, contextbufs[0][0] == '*'); else if (contextbufs[0][0] == '&') { itok = strtosichar(contextbufs[0] + 1, 0); lowcase(itok); treeinsert(ichartosstr(itok, 1), ICHARTOSSTR_SIZE, 1); } else if (contextbufs[0][0] == '#' && contextbufs[0][1] == '#') save_pers_dic(); else if (contextbufs[0][0] == '#') /* JJoao 2002 */ printf("%s\n\n",contextbufs[0]); else if (contextbufs[0][0] == '!') terse = 1; else if (contextbufs[0][0] == '%') terse = 0; else if (contextbufs[0][0] == '+' || contextbufs[0][0] == '-') { math_mode = 0; LaTeX_Mode = 'P'; tflag = (contextbufs[0][0] == '+'); prefstringchar = findfiletype(tflag ? "tex" : "nroff", 1, (int *) NULL); if (prefstringchar < 0) prefstringchar = 0; defdupchar = prefstringchar; } else if (contextbufs[0][0] == '~') { defdupchar = findfiletype(&contextbufs[0][1], 1, &tflag); if (defdupchar < 0) defdupchar = 0; } else if (contextbufs[0][0] == '$' && contextbufs[0][1] == '"') { jjflags(contextbufs[0] + 2); } else if (contextbufs[0][0] == '$') { init_modes(contextbufs[0] + 1); } else { if (contextbufs[0][0] == '^') { /* Strip off leading uparrow */ for (cp1 = contextbufs[0], cp2 = contextbufs[0] + 1; (*cp1++ = *cp2++) != '\0'; ) ; } checkline(stdout); } } } /** * @brief Copy/ignore "cnt" number of characters pointed to by *cc. * */ void copyout(register char **cc, register int cnt) { while (--cnt >= 0) { if (**cc == '\0') break; if (!aflag && !lflag) putc(**cc, outfile); (*cc)++; } }