123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770 |
- /*
- * CDE - Common Desktop Environment
- *
- * Copyright (c) 1993-2012, The Open Group. All rights reserved.
- *
- * These libraries and programs are free software; you can
- * redistribute them and/or modify them under the terms of the GNU
- * Lesser General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * These libraries and programs are distributed in the hope that
- * they will be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU Lesser General Public License for more
- * details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with these libraries and programs; if not, write
- * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
- * Floor, Boston, MA 02110-1301 USA
- */
- /*
- * COMPONENT_NAME: austext
- *
- * FUNCTIONS: euro_lstrupr
- * free_wordtree
- * is_concordable
- * language_name
- * load_include_list
- * load_language
- * load_paice_suffixes
- * load_stop_list
- * load_wordtree
- * null_lstrupr
- * null_stemmer
- * paice_stemmer
- * search_wordtree
- * teskey_parser
- * unload_language
- *
- * ORIGINS: 27
- *
- *
- * (C) COPYRIGHT International Business Machines Corp. 1995,1996
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- */
- /******************** LANG.C ********************
- * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
- * July 1995.
- * Includes load_language(), unload_language(), and functions and data for
- * parsing and stemming European languages in DtSearch/AusText.
- * Incorporates p/o socrates.c, p/o proctext.c, parser.c
- * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
- * Related to similar semantic modules repackaged into semantic.c.
- * Paice suffix removal algorithm from C. Paice, 1990,
- * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
- *
- * $Log$
- * Revision 2.13 1996/03/25 18:55:26 miker
- * Changed FILENAME_MAX to _POSIX_PATH_MAX.
- *
- * Revision 2.12 1996/03/25 17:00:19 miker
- * Cleanup compiler warning.
- *
- * Revision 2.11 1996/03/13 22:58:13 miker
- * Changed char to UCHAR several places.
- *
- * Revision 2.10 1996/03/05 16:49:58 miker
- * Move COMMENT_CHARS to SearchP.h.
- *
- * Revision 2.9 1996/03/05 16:31:20 miker
- * Added test of PA_MSGS for yacc-based boolean queries.
- * Made comment chars in linguistic files independent of locale.
- * Changed several char ptrs to unsigned char so parser will
- * work when compiled under default signed char compilers.
- * Simplified several statements with LHS *var++ for same reason.
- *
- * Revision 2.8 1996/02/05 16:16:05 miker
- * Restore prolog.
- *
- * Revision 2.7 1996/02/05 16:10:54 miker
- * load_paice_suffixes: discard .sfx lines beginning with all numeric
- * first token for compatibility with older file formats.
- *
- * Revision 2.6 1996/02/01 19:11:43 miker
- * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
- * Moved charmaps to new module langmap.c. Removed hard coded
- * paice stemmer values--now dynamic from .sfx file.
- *
- * Revision 2.5 1995/10/26 14:55:28 miker
- * Added prolog.
- *
- * Revision 2.4 1995/10/19 20:54:36 miker
- * Increased msg buf sizes to accommodate larger database file names.
- *
- * Revision 2.3 1995/10/06 14:39:45 miker
- * Bug fix: coredump loading multiple databases
- * on Solaris.
- *
- * Revision 2.2 1995/10/03 21:39:10 miker
- * Changed teskey_parser, paice_stemmer, and null_stemmer
- * to return number of words parsed/stemmed, not just boolean.
- *
- * Revision 2.1 1995/09/22 21:00:19 miker
- * Freeze DtSearch 0.1, AusText 2.1.8
- *
- * Revision 1.3 1995/09/19 22:08:28 miker
- * Added support for loading and parsing Japanese language DtSrLaJPN.
- *
- * Revision 1.2 1995/09/05 21:34:52 miker
- * Fixed bug: search engine wouldn't parse words of exactly
- * 3 or 15 chars.
- *
- * Revision 1.1 1995/08/31 21:03:44 miker
- * Initial revision
- */
- #include "SearchP.h"
- #include <limits.h>
- #include <stdlib.h>
- #include <string.h>
- #include <errno.h>
- #include <wchar.h>
- #include <sys/stat.h>
- #define X_INCLUDE_STRING_H
- #define XOS_USE_NO_LOCKING
- #include <X11/Xos_r.h>
- #define PROGNAME "LANG"
- #define EXT_SUFFIX ".sfx" /* standard paice suffix file format */
- #define OUTBUFSZ 6140
- #define SFX_DELIMS " \t\n"
- #define MS_misc 1
- #define MS_lang 15
- #define IS_VOWEL(c) ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
- /************************************************/
- /* */
- /* PRULE */
- /* */
- /************************************************/
- /* List of Paice suffix removal rules from .sfx files */
- typedef struct prule_t {
- struct prule_t *link; /* Ptr to next list node */
- UCHAR *suffix; /* Applicable suffix string, backwards */
- UCHAR suflen; /* Length of suffix */
- char must_be_intact; /* Optional '*'. Rule only applies
- * to intact words */
- UCHAR remove_count; /* Number of suffix chars to remove */
- UCHAR aplen; /* Length of apndstr */
- UCHAR *apndstr; /* Optional append string */
- char is_last_rule; /* '$' terminate or '>' continue algorithm */
- } PRULE;
- char *ensure_end_slash (char *pathstr);
- void unload_jpn_language (DBLK *dblk);
- /************************************************/
- /* */
- /* GLOBALS */
- /* */
- /************************************************/
- int debugging_loadlang = FALSE;
- int debugging_loadword = FALSE;
- int debugging_search_wordtree = FALSE;
- int debugging_teskey = FALSE;
- int debugging_paice = FALSE;
- static int *paice_charmap;
- static char paicebuf [DtSrMAXWIDTH_HWORD + 2];
- static int paicelen;
- static int paicewcsl;
- static int word_is_intact;
- /* Language strings correspond to DtSrLa.. constants. */
- static char *lang_fnames[] = {
- "eng", /* 0 */
- "eng", /* 1 ('eng2' same files as 'eng') */
- "esp", /* 2 */
- "fra", /* 3 */
- "ita", /* 4 */
- "deu", /* 5 */
- "jpn", /* 6 */
- "jpn", /* 7 ('jpn2' same files as 'jpn' */
- NULL
- };
- /************************************************/
- /* */
- /* language_name */
- /* */
- /************************************************/
- /* Returns language name string given language number */
- static char *language_name (DtSrINT16 langno)
- {
- static char *language_names[] = {
- "English-ASCII", /* 0 = DtSrLaENG */
- "English-Latin1", /* 1 = DtSrLaENG2 */
- "Spanish", /* 2 = DtSrLaESP */
- "French", /* 3 = DtSrLaFRA */
- "Italian", /* 4 = DtSrLaITA */
- "German", /* 5 = DtSrLaDEU */
- "Japanese-comp", /* 6 = DtSrLaJPN */
- "Japanese-.knj" /* 7 = DtSrLaJPN2 */
- };
- if (langno < 0)
- return "INVALID!";
- else if (langno > DtSrLaLAST)
- return "(Custom Language)";
- else
- return language_names [langno];
- } /* language_name() */
- /************************************************/
- /* */
- /* search_wordtree */
- /* */
- /************************************************/
- /* Sept 1991.
- * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
- * Searches a word list in a binary WORDTREE.
- * Passed wordstring is presumed to be a clean,
- * uppercase word token string terminated by \0.
- * Variables are static for speeeeed.
- * Returns TRUE if successful search, else FALSE.
- * See also search_wordtree_jpn() in jpn.c
- */
- static int search_wordtree (WORDTREE *wordtree, char *wordstring)
- {
- static int direction;
- static WORDTREE *node;
- if (debugging_search_wordtree)
- fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
- wordstring);
- /* MAIN SEARCH LOOP: binary tree search */
- for (node = wordtree; node != NULL; ) {
- if ((direction = strcmp (wordstring, node->word)) == 0) {
- if (debugging_search_wordtree)
- fprintf (aa_stderr, " HIT!\n");
- return TRUE;
- }
- /* Descend left or right depending on word */
- if (debugging_search_wordtree)
- fprintf (aa_stderr, " %c '%s'\n",
- (direction < 0) ? 'L' : 'R', (char *) node->word);
- if (direction < 0)
- node = node->llink;
- else
- node = node->rlink;
- }
- if (debugging_search_wordtree)
- fprintf (aa_stderr, " MISS.\n");
- return FALSE;
- } /* search_wordtree() */
- static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
- {
- int len = -1;
- if (p < s) goto done;
- if (*p >= 0 && *p <= 0x7F) {
- len = 1;
- *pwc = *p;
- goto done;
- }
- if (p == s) goto done;
- len = mbtowc (pwc, p - 1, MB_CUR_MAX);
- done:
- if (len < 0 || *pwc > 0xFF) *pwc = 0x100;
- return len;
- }
- static char *euro_wctomb (int c, char *outp, int len)
- {
- wchar_t wc = c & 0xFF;
- if (len > 1) wctomb (outp, wc);
- else *outp = wc;
- return outp + len;
- }
- static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
- wchar_t *pwc)
- {
- int len = 1;
- *pwc = *outp = cofunction (cofunction_arg);
- if (*pwc >= 0 && *pwc <= 0x7F) goto done;
- *(outp + len++) = cofunction (NULL);
- if (mbtowc (pwc, outp, MB_CUR_MAX) >= 0) goto done;
- *pwc = 0x100;
- for (;;) {
- if (len >= MB_CUR_MAX) break;
- *(outp + len++) = cofunction (NULL);
- if (mblen (outp, MB_CUR_MAX) >= 0) break;
- }
- done:
- if (*pwc > 0xFF) *pwc = 0x100;
- return len;
- }
- /************************************************/
- /* */
- /* teskey_parser */
- /* */
- /************************************************/
- /* 1989.
- * Teskey_parser() is derived from the former Socrates() in socrates.c.
- * Returns next teskey-parsed word token from a character stream.
- * Called from (1) dtsrindex, where readchar_ftext() cofunction
- * reads the .fzk file document 'stream', or (2) search engine
- * query parsers, where readchar_string() cofunction 'reads'
- * from the query string.
- * (The word hiliting parser does not directly call teskey_parser; it has
- * its own simplified equivalent to the parsing algorithms herein.)
- *
- * First call passes args in PARG structure. This resets end of
- * text block (ETX) flag, resets 'offset' counter to zero, etc.
- * Subsequent calls should pass NULL, and parser returns
- * next token in block, until reader cofunction reads ETX,
- * ie special ETX char ('\0'). Subsequent calls to parser
- * return NULL meaning "no tokens left in current stream".
- * Reader cofunctions tolerate repeated calls after
- * the first ETX, still returning '\0'.
- *
- * This parser presumes all incoming text is unformatted.
- * Since parser accesses streams a char at a time it does
- * not require periodic line feeds or anything else.
- *
- * Parser also returns offset information: number of bytes
- * since beginning of text block.
- *
- * Variables are static for speeeeeeed.
- *
- * OUTPUT FORMAT: NULL or a static C string containing a single
- * parsed word token. Word buffer reused at next call.
- * Each word is translated as follows:
- * All alphas TO UPPERCASE.
- * Teskey algorithm used to find word boundaries.
- * Always keeps include-list words.
- * Throws away stoplist words, very short words, and very long words.
- * All intervening nonconcordables discarded.
- *
- * There is a slight mod to the published Teskey algorithm.
- * Words can begin with optionally concordable chars
- * but not end with them. For example if '-' is optionally
- * concordable, '-foo-' will be parsed into '-foo'.
- */
- char *teskey_parser (PARG *parg)
- {
- static READCFP cofunction;
- static void *cofunction_arg;
- static DBLK *dblk = NULL;
- static char *outbuf = NULL;
- static size_t outbufsz = 0;
- static char *endmaxword; /* end largest possible output word */
- static char *outp; /* next loc in outbuf */
- static int *charmap;
- static int minwordsz, maxwordsz;
- static int wordlen;
- static enum {BETW_WORDS, IN_WORD, TOO_LONG}
- tpstate;
- static long *offsetp, readcount, candidate_offset;
- static int is_hiliting;
- static int add_msgs;
- static int len, opt_len;
- static wchar_t wc;
- /* If first call for current text block... */
- if (parg) {
- dblk = parg->dblk;
- minwordsz = dblk->dbrec.or_minwordsz;
- maxwordsz = dblk->dbrec.or_maxwordsz;
- charmap = dblk->charmap;
- offsetp = parg->offsetp;
- is_hiliting = (parg->flags & PA_HILITING);
- add_msgs = (parg->flags & PA_MSGS);
- if (charmap == NULL) {
- fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 4,
- "%s dblk not initialized.\n"),
- PROGNAME"801");
- DtSearchExit (55);
- }
- if (parg->string) {
- cofunction_arg = parg->string;
- cofunction = (READCFP) readchar_string;
- }
- else if (parg->ftext) {
- cofunction_arg = parg;
- cofunction = (READCFP) readchar_ftext;
- }
- else {
- fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 5,
- "%s Program Error: parg contains neither file nor string.\n"),
- PROGNAME"327");
- DtSearchExit (27);
- }
- if (outbufsz <= maxwordsz) {
- if (outbuf)
- free (outbuf);
- outbufsz = maxwordsz + 8;
- outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
- }
- endmaxword = outbuf + maxwordsz;
- if (debugging_teskey)
- fprintf (aa_stderr,
- "teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
- maxwordsz, (unsigned long) outbufsz);
- readcount = 0L;
- }
- /* CANDIDATE WORD LOOP: Read text chars into outbuf.
- * Exit loop when outbuf contains one candidate token or at ETX.
- */
- READ_ANOTHER_WORD:
- outp = outbuf;
- tpstate = BETW_WORDS;
- for (;;) {
- len = euro_readchar (cofunction, cofunction_arg, outp, &wc);
- if (!wc) break;
- readcount += len;
- cofunction_arg = NULL;
- /*------------- BETW_WORDS State ------------
- * Reader is between word tokens.
- */
- if (tpstate == BETW_WORDS) {
- /*
- * Discard nonconcordable chars between words.
- */
- if ((charmap[wc] & NON_CONCORD) != 0)
- continue;
- /*
- * Fully concordable char is definite start of new word.
- * Convert to uppercase and go get next char.
- */
- if ((charmap[wc] & CONCORDABLE) != 0) {
- outp = euro_wctomb (charmap[wc], outp, len);
- candidate_offset = readcount;
- tpstate = IN_WORD;
- continue;
- }
- /*
- * Must be optionally concordable. It can only
- * start a new word if next char is concordable.
- * If so, convert a fully concordable char
- * to uppercase and go get next char.
- * Otherwise discard just like non_concord.
- */
- outp += len;
- opt_len = euro_readchar (cofunction, NULL, outp, &wc);
- if (wc) readcount += opt_len;
- if ((charmap[wc] & CONCORDABLE) != 0) {
- outp = euro_wctomb (charmap[wc], outp, opt_len);
- candidate_offset = readcount - opt_len;
- tpstate = IN_WORD;
- continue;
- }
- else {
- outp -= len;
- continue;
- }
- } /* endif BETW_WORDS */
- /*------------- IN_WORD State ------------
- * Reader is in middle of a word.
- * Convert all concordables to uppercase and append.
- * Terminate word at first non_concord.
- * Non_concords treatment depends on next char.
- */
- else if (tpstate == IN_WORD) {
- if ((charmap[wc] & CONCORDABLE) != 0) {
- if (outp < endmaxword) {
- outp = euro_wctomb (charmap[wc], outp, len);
- }
- else {
- tpstate = TOO_LONG;
- if (debugging_teskey)
- fprintf (aa_stderr,
- "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
- candidate_offset-1, outbuf);
- if (add_msgs) {
- char msgbuf [DtSrMAXWIDTH_HWORD + 100];
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 8,
- "%s '%.*s...' is larger\n"
- "than the maximum word size of database '%s'.") ,
- PROGNAME"449", maxwordsz,
- parg->string, dblk->label);
- DtSearchAddMessage (msgbuf);
- return NULL;
- }
- outbuf[0] = 0;
- outp = outbuf;
- }
- continue;
- }
- if ((charmap[wc] & NON_CONCORD) != 0) {
- *outp = '\0';
- break;
- }
- /* Must be opt_concord... */
- outp += len;
- opt_len = euro_readchar (cofunction, NULL, outp, &wc);
- if (wc) readcount += opt_len;
- if ((charmap[wc] & CONCORDABLE) != 0) {
- if (outp < endmaxword) {
- outp = euro_wctomb (charmap[wc], outp, opt_len);
- }
- else {
- tpstate = TOO_LONG;
- if (debugging_teskey)
- fprintf (aa_stderr,
- "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
- candidate_offset-1, outbuf);
- outbuf[0] = 0;
- outp = outbuf;
- }
- continue;
- }
- else { /* next char NOT concordable...*/
- outp -= len;
- *outp = '\0';
- break;
- }
- } /* endif IN_WORD */
- /*------------- TOO_LONG State ------------
- * Reader is in middle of a word that exceeds max word size.
- * Discard all concordables and opt_concords until we
- * can get between words again with a clear non_concord.
- */
- else if (tpstate == TOO_LONG) {
- if ((charmap[wc] & NON_CONCORD) != 0) {
- outp = outbuf;
- tpstate = BETW_WORDS;
- }
- continue;
- }
- /*------------- UNKNOWN State ------------*/
- else {
- fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 10,
- "%s Program Error: Unknown parser state.\n"),
- PROGNAME"306");
- DtSearchExit (26);
- }
- } /* end read loop for next CANDIDATE WORD */
- /*---------- TEST FOR ETX -------------*/
- if (outbuf[0] == 0) {
- if (debugging_teskey)
- fprintf (aa_stderr, "teskey: etx\n");
- if (add_msgs) {
- char msgbuf [200];
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 12,
- "%s '%.120s' is not a valid word in database '%s'.") ,
- PROGNAME"506", parg->string, dblk->label);
- DtSearchAddMessage (msgbuf);
- }
- return NULL;
- }
- wordlen = strlen (outbuf);
- candidate_offset--; /* token offset is one less than number of reads */
- if (debugging_teskey)
- fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
- candidate_offset, outbuf);
- if (is_hiliting) {
- if (debugging_teskey)
- fprintf (aa_stderr, ", (hiliting, skip tree searches)");
- goto GOOD_WORD;
- }
- /*--------- INCLUDE LIST ----------
- * Search before testing for stoplist or minimum word length.
- */
- if (dblk->inclist != NULL) {
- if (search_wordtree (dblk->inclist, outbuf)) {
- if (debugging_teskey)
- fprintf (aa_stderr, ", (INCLUDE LIST)");
- goto GOOD_WORD;
- }
- }
- /*--------- TOO SHORT -----------*/
- if (wordlen < minwordsz) {
- if (debugging_teskey)
- fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
- if (add_msgs) {
- char msgbuf [200];
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 17,
- "%s '%s' is less than the\n"
- "minimum word size of database '%s'.") ,
- PROGNAME"543", parg->string, dblk->label);
- DtSearchAddMessage (msgbuf);
- return NULL;
- }
- goto READ_ANOTHER_WORD;
- }
- /*----------- STOP LIST -------------*/
- if (dblk->stoplist != NULL) {
- if (search_wordtree (dblk->stoplist, outbuf)) {
- if (debugging_teskey)
- fprintf (aa_stderr, ", (STOP LIST)\n");
- if (add_msgs) {
- char msgbuf [200];
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 19,
- "%s The word '%s' is not indexed in database '%s'.") ,
- PROGNAME"558", parg->string, dblk->label);
- DtSearchAddMessage (msgbuf);
- return NULL;
- }
- goto READ_ANOTHER_WORD;
- }
- }
- GOOD_WORD:
- /* Word is correctly parsed and passes all dblk filters. */
- if (debugging_teskey)
- fprintf (aa_stderr, ", ...good word\n");
- if (offsetp)
- *offsetp = candidate_offset;
- return outbuf;
- } /* teskey_parser() */
- /************************************************/
- /* */
- /* is_concordable */
- /* */
- /************************************************/
- /* Verifies passed word token is teskey-concordable
- * in code page of passed charmap. Used in validating
- * word files. Returns TRUE if all chars concordable
- * or optionally concordable, else returns FALSE.
- */
- int is_concordable (char *word, int *charmap)
- {
- char *cptr;
- wchar_t wc;
- for (cptr = word; *cptr != 0; cptr++) {
- euro_mbtowc (&wc, cptr, word);
- if ((charmap[wc] & NON_CONCORD) != 0)
- break;
- }
- return (*cptr == 0);
- } /* is_concordable() */
- /************************************************/
- /* */
- /* load_wordtree */
- /* */
- /************************************************/
- /* Called by load_stop_list(), load_include_list(), etc,
- * to read an appropriate word list file into binary tree structures.
- *
- * INPUT FILE FORMAT: One word per line, all chars teskey concordable.
- * Preferred order is frequency of occurrence in the corpus
- * to make searches efficient. Otherwise the words should at least
- * be in random order or an order that will approximate a binary search.
- * If first char is any of COMMENT_CHARS, line is ignored as comments.
- * Ascii spaces, tabs, or newline delimits the first word token--
- * anything else on the line is ignored as comments.
- * Optionally characters in word token will be checked for teskey
- * concordability.
- *
- * RETURNS 0 if file successfully loaded, returns 1 if file missing,
- * returns 2 and messages in global msglist if file has fatal errors.
- */
- int load_wordtree (
- WORDTREE **treetop,
- DBLK *dblk,
- char *fname,
- int do_teskey_test)
- {
- int i;
- int errcount;
- int is_duplicate;
- long linecount = 0;
- char *token;
- char readbuf [256];
- char sprintbuf [_POSIX_PATH_MAX + 1024];
- FILE *fileid;
- WORDTREE *new;
- WORDTREE **this_link;
- _Xstrtokparams strtok_buf;
- if (debugging_loadlang)
- fprintf (aa_stderr, PROGNAME"1071 "
- "load_wordtree: db=%s fname='%s'\n",
- NULLORSTR(dblk->name), NULLORSTR(fname));
- if ((fileid = fopen (fname, "rt")) == NULL) {
- /* Not being able to find the file is not an error.
- * We indicate that with the return code.
- * But any other error (like permissions) is fatal.
- */
- if (errno == ENOENT) {
- if (debugging_loadlang)
- fputs (" ...file not found.\n", aa_stderr);
- return 1;
- }
- else {
- sprintf (sprintbuf,
- CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
- PROGNAME"362", fname, strerror(errno));
- DtSearchAddMessage (sprintbuf);
- return 2;
- }
- }
- /*--------- Main Read Loop ----------*/
- errcount = 0;
- while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
- linecount++;
- /*
- * Ignore comment lines beginning with punctuation char.
- * Ignore empty lines (strtok returns NULL, no tokens).
- * Otherwise first or only word on line is the desired word.
- */
- if (strchr (COMMENT_CHARS, readbuf[0]))
- continue;
- if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
- continue;
- dblk->lstrupr (token, dblk);
- if (debugging_loadword)
- fprintf (aa_stderr, " WORD: '%s' ", token);
- /* If requested confirm all chars are teskey-concordable. */
- if (do_teskey_test)
- if (!is_concordable (token, dblk->charmap)) {
- sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 400,
- "%s: %s, line %ld: Invalid chars in word '%s'."),
- PROGNAME"400", fname, linecount, token);
- DtSearchAddMessage (sprintbuf);
- errcount++;
- continue;
- }
- /* Unless we've already detected some errors,
- * allocate a new node and load its data fields.
- */
- if (errcount)
- continue;
- i = strlen (token);
- new = austext_malloc (sizeof(WORDTREE) + i + 4,
- PROGNAME"104", NULL);
- new->llink = NULL;
- new->rlink = NULL;
- new->len = i;
- new->word = (void *) (new + 1);
- strcpy (new->word, token);
- /* Descend binary tree and insert in correct alphabetical place */
- is_duplicate = FALSE;
- for (this_link = treetop; *this_link != NULL; ) {
- i = strcmp (new->word, (*this_link)->word);
- /* test for duplicate word */
- if (i == 0) {
- sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 423,
- "%s Word '%s' in '%s' is a duplicate."),
- PROGNAME"423", token, fname);
- DtSearchAddMessage (sprintbuf);
- /* duplicates aren't fatal, just ignore the word */
- is_duplicate = TRUE;
- break; /* no point in continuing descent */
- }
- /* Descend tree to find correct insertion point */
- if (debugging_loadword)
- fputc(((i < 0)? 'L' : 'R'), aa_stderr);
- this_link = (WORDTREE **) ((i < 0) ?
- &(*this_link)->llink : &(*this_link)->rlink);
- } /* end forloop to find tree insertion point */
- /* Don't link anything if error found while descending tree */
- if (is_duplicate) {
- if (debugging_loadword)
- fputs (" duplicate!\n", aa_stderr);
- free (new);
- continue;
- }
- /* Insert new node at current location in tree */
- *this_link = new;
- if (debugging_loadword)
- fputs(" .\n", aa_stderr);
- } /* end of read loop */
- fclose (fileid);
- if (errcount) {
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1186 load word file '%s' failed.\n", fname);
- return 2;
- }
- else {
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1193 load word file '%s' successful.\n", fname);
- return 0;
- }
- } /* load_wordtree() */
- /************************************************/
- /* */
- /* free_wordtree */
- /* */
- /************************************************/
- /* Formerly free_bintree() in msgutil.c.
- * Frees storage for all nodes in a WORDTREE and
- * sets its top-of-list pointer to NULL.
- * Works only for node structures where all memory
- * was allocated in a single call to malloc().
- * Uses link inversion traversal (eg, Data Structure Techniques,
- * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
- * at preorder visit, and node is freed at postorder visit.
- */
- static void free_wordtree (WORDTREE ** wordtree_head)
- {
- WORDTREE *next;
- WORDTREE *prev = NULL;
- WORDTREE *pres = *wordtree_head;
- if (*wordtree_head == NULL)
- return;
- DESCEND_LEFT:
- pres->word = (void *) 0; /* preorder visit: TAG = 0 */
- next = pres->llink;
- if (next != NULL) {
- pres->llink = prev;
- prev = pres;
- pres = next;
- goto DESCEND_LEFT;
- }
- DESCEND_RIGHT:
- next = pres->rlink;
- if (next != NULL) {
- pres->word = (void *) 1; /* TAG = 1 */
- pres->rlink = prev;
- prev = pres;
- pres = next;
- goto DESCEND_LEFT;
- }
- POSTORDER_VISIT:
- free (pres);
- if (prev == NULL) { /* end of algorithm? */
- *wordtree_head = NULL;
- return;
- }
- if (prev->word == (void *) 0) { /* go up left leg */
- next = prev->llink;
- pres = prev;
- prev = next;
- goto DESCEND_RIGHT;
- }
- else { /* go up right leg */
- next = prev->rlink;
- prev->word = (void *) 0; /* restore TAG = 0 */
- pres = prev;
- prev = next;
- goto POSTORDER_VISIT;
- }
- } /* free_wordtree() */
- /************************************************/
- /* */
- /* load_include_list */
- /* */
- /************************************************/
- /* Builds include list by reading include file
- * into a binary tree structure.
- * Unlike stoplists, include-lists are optional.
- * Also unlike stoplists, there are no language default include-lists.
- * 'dblist' may be NULL.
- * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
- */
- static int load_include_list (DBLK *dblk, DBLK *dblist)
- {
- int i;
- int filename_was_null = (dblk->fname_inc == NULL);
- DBLK *db;
- char sprintbuf [512];
- dblk->inclist = NULL; /* just to be sure */
-
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
- NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
- language_name(dblk->dbrec.or_language));
- /* If file name not provided, generate one based on
- * dblk's path, database name, and default extension.
- */
- if (filename_was_null) {
- if (dblk->name[0] == 0) {
- dblk->fname_inc = "";
- dblk->inclist = NULL;
- if (debugging_loadlang)
- fprintf (aa_stderr, PROGNAME"1339 "
- "No inclist because neither fname nor dbname provided.\n");
- return TRUE;
- }
- if (dblk->path == NULL)
- dblk->path = strdup("");
- dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
- PROGNAME"1187", NULL);
- strcpy (dblk->fname_inc, dblk->path);
- ensure_end_slash (dblk->fname_inc);
- strcat (dblk->fname_inc, dblk->name);
- strcat (dblk->fname_inc, EXT_INCLIST);
- }
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1350 Include list file name = '%s'.\n",
- dblk->fname_inc);
- /* Don't reload the same file if it's already
- * been loaded into a previous dblk in a list.
- * Code works just fine if dblist == NULL.
- */
- for (db = dblist; db != NULL; db = db->link) {
- if (db == dblk || db->fname_inc == NULL)
- continue;
- if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
- dblk->inclist = db->inclist;
- dblk->lang_flags |= LF_DUP_INC;
- if (debugging_loadlang)
- fprintf (aa_stderr, PROGNAME"1363 "
- "Using previously loaded inclist, db='%s'.\n",
- dblk->name);
- return TRUE;
- }
- }
- /* Include list is optional so missing file is
- * not an error unless caller named a specific file.
- */
- i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
- switch (i) {
- case 0:
- return TRUE;
- case 1:
- if (filename_was_null) {
- dblk->fname_inc = "";
- dblk->inclist = NULL;
- return TRUE;
- }
- else {
- sprintf (sprintbuf,
- CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
- PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
- DtSearchAddMessage (sprintbuf);
- return FALSE;
- }
-
- default:
- return FALSE;
- }
- } /* load_include_list() */
- /************************************************/
- /* */
- /* load_stop_list */
- /* */
- /************************************************/
- /* Builds stoplist by reading stoplist file into a
- * binary tree structure. File name can be
- * (1) passed in dblk.fname_stp,
- * (2) generated from dblk path, name, and '.stp',
- * (3) default for dblk path, language, and '.stp'.
- * 'dblist' may be NULL.
- * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
- */
- static int load_stop_list (DBLK *dblk, DBLK *dblist)
- {
- int i;
- DBLK *db;
- char sprintbuf [_POSIX_PATH_MAX + 512];
- struct stat statbuf;
- dblk->stoplist = NULL; /* just to be sure */
-
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
- NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
- language_name(dblk->dbrec.or_language));
- /* If file name not provided, generate one based on
- * dblk's path, database name, and default extension.
- * And if that doesn't work, generate one based on
- * dblk's path, language, and default extension.
- */
- if (dblk->fname_stp == NULL) {
- if (dblk->path == NULL)
- dblk->path = strdup("");
- dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
- PROGNAME"919", NULL);
- strcpy (dblk->fname_stp, dblk->path);
- ensure_end_slash (dblk->fname_stp);
- strcat (dblk->fname_stp, dblk->name);
- strcat (dblk->fname_stp, EXT_STOPLIST);
- errno = 0;
- stat (dblk->fname_stp, &statbuf);
- if (errno == ENOENT) {
- strcpy (dblk->fname_stp, dblk->path);
- ensure_end_slash (dblk->fname_stp);
- strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
- strcat (dblk->fname_stp, EXT_STOPLIST);
- }
- }
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1448 Stoplist file name = '%s'.\n",
- dblk->fname_stp);
- /* Don't reload the same file if it's already
- * been loaded into a previous dblk in a list.
- * Code works just fine if dblist == NULL.
- */
- for (db = dblist; db != NULL; db = db->link) {
- if (db == dblk || db->fname_stp == NULL)
- continue;
- if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
- dblk->stoplist = db->stoplist;
- dblk->lang_flags |= LF_DUP_STP;
- if (debugging_loadlang)
- fprintf (aa_stderr, PROGNAME"1460 "
- "Using previously loaded stoplist, db='%s'.\n",
- dblk->name);
- return TRUE;
- }
- }
- /* Stop lists are mandatory--a missing stoplist is fatal. */
- i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
- if (i == 1) {
- sprintf (sprintbuf,
- CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
- PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
- DtSearchAddMessage (sprintbuf);
- }
- return (i == 0);
- } /* load_stop_list() */
- /************************************************/
- /* */
- /* free_paice_rules */
- /* */
- /************************************************/
- /* Frees all allocated storage for a set of paice rules, typically
- * loaded at dblk.stem_extra. Called by REINIT routines and
- * by load_paice_suffixes() when cleaning up after an error.
- */
- static void free_paice_rules (PRULE ***rules_table_ptr)
- {
- int i;
- PRULE *p, **linkp;
- PRULE **rules_table;
- if (*rules_table_ptr == NULL)
- return;
- rules_table = *rules_table_ptr;
- for (i=0; i<256; i++) {
- if (rules_table[i] == NULL)
- continue;
- p = rules_table[i];
- while (p) {
- linkp = &p->link;
- free (p->suffix);
- if (p->apndstr)
- free (p->apndstr);
- free (p);
- p = *linkp;
- }
- }
- free (rules_table);
- *rules_table_ptr = NULL;
- return;
- } /* free_paice_rules() */
- /************************************************/
- /* */
- /* load_paice_suffixes */
- /* */
- /************************************************/
- /* Loads European language paice stemmer suffix rules
- * into dblk.stem_extra as an array of ptrs to linked lists.
- * Like stop lists, sfx files can be
- * (1) passed in dblk.fname_sfx,
- * (2) generated from dblk path, dbname, and '.sfx',
- * (3) generated from dblk path, language, and '.sfx'.
- * Internal tables will be reused if file previously loaded.
- * Only uses single byte character sets (ascii, iso-latin-1).
- * Uses strtok(). dblk->charmap must already be loaded.
- * Will continue to parse entire file even if errors are found.
- * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
- */
- static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
- {
- FILE *fp;
- DBLK *db;
- PRULE *prule, **prule_link;
- PRULE **rules_table;
- struct stat statbuf;
- UCHAR *cptr, *token;
- char readbuf [_POSIX_PATH_MAX + 1024];
- char msgbuf [_POSIX_PATH_MAX + 1024];
- UCHAR *suffix, *apndstr;
- int must_be_intact, is_last_rule;
- UCHAR remove_count;
- int lineno, errcount;
- int len;
- wchar_t wc;
- _Xstrtokparams strtok_buf;
- dblk->stem_extra = NULL; /* just to be sure */
- rules_table = NULL;
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
- NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
- language_name(dblk->dbrec.or_language));
- /* If file name not provided, generate one based on
- * dblk's path, database name, and default extension.
- * And if that doesn't work, generate one based on
- * dblk's path, language, and default extension.
- */
- if (dblk->fname_sfx == NULL) {
- if (dblk->path == NULL)
- dblk->path = strdup("");
- dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
- PROGNAME"1113", NULL);
- strcpy (dblk->fname_sfx, dblk->path);
- ensure_end_slash (dblk->fname_sfx);
- strcat (dblk->fname_sfx, dblk->name);
- strcat (dblk->fname_sfx, EXT_SUFFIX);
- errno = 0;
- stat (dblk->fname_sfx, &statbuf);
- if (errno == ENOENT) {
- strcpy (dblk->fname_sfx, dblk->path);
- ensure_end_slash (dblk->fname_sfx);
- strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
- strcat (dblk->fname_sfx, EXT_SUFFIX);
- }
- }
- if (debugging_loadlang)
- fprintf (aa_stderr,
- PROGNAME"1740 Paice suffix file name = '%s'.\n",
- dblk->fname_sfx);
- /* Don't reload the same file if it's already
- * been loaded into a previous dblk in a list,
- * but flag it so it won't be freed at unload_language/REINIT.
- * Code works just fine if dblist == NULL.
- */
- for (db = dblist; db != NULL; db = db->link) {
- if (db == dblk || db->fname_sfx == NULL)
- continue;
- if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
- dblk->stem_extra = db->stem_extra;
- dblk->lang_flags |= LF_DUP_SFX;
- if (debugging_loadlang)
- fprintf (aa_stderr, PROGNAME"1145 "
- "Using previously loaded suffixes, db='%s'.\n",
- dblk->name);
- return TRUE;
- }
- }
- fp = fopen (dblk->fname_sfx, "rt");
- if (fp == NULL) {
- sprintf (msgbuf,
- CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
- PROGNAME"181", dblk->fname_sfx, strerror(errno));
- DtSearchAddMessage (msgbuf);
- dblk->fname_sfx = NULL;
- return FALSE;
- }
- /* Rules table will eventually be loaded at dblk.stem_extra.
- * It consists of 256 PRULE ptrs,
- * one for each possible single byte char.
- * Each ptr is the head of a rules list for that char.
- */
- rules_table = austext_malloc (256 * sizeof(PRULE*),
- PROGNAME"199", &ausapi_msglist);
- memset (rules_table, 0, 256 * sizeof(PRULE*));
- lineno = 0;
- errcount = 0;
- /*------- Main Read Loop -------*/
- while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
- lineno++;
- /* Ignore comment lines */
- if (strchr (COMMENT_CHARS, readbuf[0]))
- continue;
- /* TOKEN #1: suffix string, backwards, all uppercase.
- * If missing, ignore 'empty' line.
- * If the first token is all numeric, ignore line
- * (for compatibility with older versions of file).
- */
- if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
- continue;
- for (cptr = suffix; cptr; cptr++) {
- euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
- if ((dblk->charmap[wc] & NUMERAL) == 0)
- break;
- }
- if (*cptr == '\0')
- continue;
- /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
- if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
- BAD_RULE:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 51,
- "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
- PROGNAME"898", dblk->fname_sfx, lineno, suffix);
- DtSearchAddMessage (msgbuf);
- errcount++;
- continue;
- }
- must_be_intact = FALSE;
- if (token[0] == '*') {
- must_be_intact = TRUE;
- /* Read next token... */
- if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
- goto BAD_RULE;
- }
- /* TOKEN #3: remove-count */
- remove_count = (UCHAR) atoi ((char *) token);
- /* OPTIONAL TOKEN #4: if next token is NOT a continue
- * symbol ('>' or '$'), then it's an append string.
- */
- apndstr = NULL;
- if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
- goto BAD_RULE;
- if (token[0] != '$' && token[0] != '>') {
- apndstr = token;
- /* Read next token... */
- if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
- goto BAD_RULE;
- }
- /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
- is_last_rule = (token[0] == '$');
- if (debugging_loadword) {
- fprintf (aa_stderr,
- " SFX: intact?=%d stop?=%d remv=%d '%s'",
- (int) must_be_intact,
- (int) is_last_rule,
- (int) remove_count,
- suffix);
- if (apndstr)
- fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
- else
- fputc ('\n', aa_stderr);
- }
- /* Good suffix. If we haven't had any errors yet,
- * add it to rules list for the first char of the suffix.
- */
- if (errcount)
- continue;
- prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
- memset (prule, 0, sizeof(PRULE));
- prule->suffix = (UCHAR *) strdup ((char*)suffix);
- prule->suflen = strlen ((char*)suffix);
- prule->must_be_intact = must_be_intact;
- prule->remove_count = remove_count;
- prule->is_last_rule = is_last_rule;
- if (apndstr) {
- len = mbstowcs (NULL, (char *)apndstr, 0);
- if (len != -1) {
- prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
- prule->aplen = len;
- }
- }
- prule_link = &rules_table[suffix[0]];
- while (*prule_link)
- prule_link = &(*prule_link)->link;
- *prule_link = prule;
- } /* end Main Read Loop */
- fclose (fp);
- if (errcount) {
- free_paice_rules (&rules_table);
- return FALSE;
- }
- dblk->stem_extra = rules_table;
- /* Update last table entry */
- if (debugging_loadlang) {
- fprintf (aa_stderr,
- PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
- dblk->fname_sfx);
- fflush (aa_stderr);
- }
- return TRUE;
- } /* load_paice_suffixes() */
- /************************************************/
- /* */
- /* is_matching_rule */
- /* */
- /************************************************/
- /* Subroutine of paice_stemmer().
- * Returns TRUE if passed rule can be applied to stem in paicebuf.
- * Else returns FALSE.
- */
- static int is_matching_rule (PRULE *rule)
- {
- static char *ptr;
- static wchar_t wc;
- static int i, j;
- if (debugging_paice)
- fprintf (aa_stderr, " test rule '%s':\t", rule->suffix);
- /* Skip rule if we've made at least one previous change
- * but the current rule requires an intact word.
- */
- if (rule->must_be_intact && !word_is_intact) {
- if (debugging_paice)
- fputs ("word not intact...\n", aa_stderr);
- return FALSE;
- }
- /* Do a backward strcmp on the suffix.
- * Skip rule if it doesn't match current paicebuf's ending chars.
- */
- j = rule->suflen;
- ptr = paicebuf + paicelen - 1;
- for (i = 0; i < j; i++) {
- if (*((rule->suffix) + i) != *ptr) {
- if (debugging_paice)
- fputs ("no match...\n", aa_stderr);
- return FALSE;
- }
- ptr--;
- }
- if (debugging_paice)
- fputs ("match", aa_stderr);
- /* Set i = paicebuf length after removing and appending suffixes.
- * Used to algorithmically test remaining stem length
- * after tentative application of rule.
- */
- i = paicewcsl - (rule->remove_count - rule->aplen);
- if (i <= 1) {
- if (debugging_paice)
- fputs (", but stem too short...\n", aa_stderr);
- return FALSE;
- }
- if (i == 2) {
- euro_mbtowc (&wc, paicebuf, paicebuf);
- if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf);
- if (IS_VOWEL (wc)) {
- if (debugging_paice)
- fputs (", and short vowel stem valid.\n", aa_stderr);
- return TRUE;
- }
- else {
- if (debugging_paice)
- fputs (", but consonant stem too short...\n", aa_stderr);
- return FALSE;
- }
- }
- /* Remaining stem is at least 3 chars.
- * If it contains a vowel anywhere, it's valid.
- * (A 'Y' after the first char counts as a vowel).
- * Otherwise it's not.
- */
- for (j=0; j<i; j++) {
- euro_mbtowc (&wc, &paicebuf[j], paicebuf);
- if (IS_VOWEL (wc)) {
- GOOD_STEM:
- if (debugging_paice)
- fputs (", and remaining stem valid.\n", aa_stderr);
- return TRUE;
- }
- if (j > 0 && wc == L'Y')
- goto GOOD_STEM;
- }
- if (debugging_paice)
- fputs (", but remaining stem all consonants.\n", aa_stderr);
- return FALSE;
- } /* is_matching_rule() */
- /************************************************/
- /* */
- /* paice_stemmer */
- /* */
- /************************************************/
- /* Given a word token (ALREADY UPPERCASE) in a single byte
- * language such as the output of teskey_parser,
- * generates 'stem' by repeated suffix removal.
- * Returns stem token in a static buffer valid
- * until next call to paice_stemmer or null_stemmer.
- * Returned stem might be the original unmodified word.
- * Returned stem might also be empty string.
- * Returned stem is *never* NULL, even if wordin == NULL.
- * Input buffer will not be modified; does not use strtok.
- * All variables are static for speeeeeeed.
- */
- static char *paice_stemmer (char *wordin, DBLK *dblk)
- {
- wchar_t finalwc;
- int len;
- PRULE *rule, **rules_table;
- if (wordin == NULL)
- return "";
- if (*wordin == 0)
- return "";
- if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
- fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 31,
- "%s Stemmer suffixes file never loaded.\n"),
- PROGNAME"310");
- DtSearchExit (2);
- }
- /* The max length of a stem is bufsz - 2:
- * one for the terminating \0 and one for the
- * prefix ^O that identifies a stem. (But this
- * stemmer doesn't actually insert the ^O now.)
- */
- strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);
- if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
- mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
- paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
- else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;
- paice_charmap = dblk->charmap;
- word_is_intact = TRUE;
- for (;;) { /*-------- Main Stemming Loop ---------*/
- paicelen = strlen (paicebuf);
- paicewcsl = mbstowcs (NULL, paicebuf, 0);
- len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);
- if (debugging_paice) {
- fwprintf (aa_stderr,
- L"paice: '%s', rules list '%lc' for database '%s'\n",
- paicebuf, finalwc, dblk->name);
- fflush (aa_stderr);
- }
- /* Look for a matching rule */
- if ((rule = rules_table [finalwc]) == NULL) {
- if (debugging_paice)
- fputs (" list is null, stop.\n", aa_stderr);
- break;
- }
- while (rule) {
- if (is_matching_rule (rule))
- break;
- rule = rule->link;
- }
- if (rule == NULL) {
- if (debugging_paice)
- fwprintf (aa_stderr,
- L" rules list '%lc' is exhausted, stop.\n", finalwc);
- break;
- }
- /* Apply rule that matched */
- if (debugging_paice)
- fputs (" apply rule: ", aa_stderr);
- if (rule->remove_count == 0) {
- if (debugging_paice)
- fputs ("remove_count = 0, stop.\n", aa_stderr);
- break;
- }
- paicebuf [paicelen - len * rule->remove_count] = 0;
- if (rule->aplen)
- strcat (paicebuf, (char*)rule->apndstr);
- paicelen = strlen (paicebuf);
- paicewcsl = mbstowcs (NULL, paicebuf, 0);
- word_is_intact = FALSE; /* we've removed at least 1 suffix */
- if (debugging_paice)
- fprintf (aa_stderr, "--> '%s'", paicebuf);
- /* Terminate algorithm if rule says so.
- * Otherwise continue removing suffixes
- * from this partially stemmed word.
- */
- if (rule->is_last_rule) {
- if (debugging_paice)
- fputs (", stop flag is set, stop.\n", aa_stderr);
- break;
- }
- if (debugging_paice)
- fputc ('\n', aa_stderr);
- } /* end Main Stemming Loop */
- if (debugging_paice) {
- fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
- fflush (aa_stderr);
- }
- return paicebuf;
- } /* paice_stemmer() */
- /************************************************/
- /* */
- /* null_stemmer */
- /* */
- /************************************************/
- /* Stemmer that just copies and returns passed word.
- * In effect, the passed word IS its own stem.
- * Output buffer valid until next call to null_stemmer
- * or paice_stemmer.
- */
- char *null_stemmer (char *word, DBLK *dblk)
- {
- if (word == NULL)
- return "";
- if (*word == '\0')
- return "";
- strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
- paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
- return paicebuf;
- } /* null_stemmer() */
- /************************************************/
- /* */
- /* euro_lstrupr */
- /* */
- /************************************************/
- /* Converts passed string to uppercase in place.
- * Classic strupr() function using teskey charmaps.
- */
- static char *euro_lstrupr (char *string, DBLK *dblk)
- {
- static int *charmap, len;
- static char *s;
- static wchar_t wc;
- charmap = dblk->charmap;
- for (s = string; *s; s++) {
- len = euro_mbtowc (&wc, s, string);
- *s = charmap[wc] & 0xFF;
- if (len > 1) wctomb (s - 1, *s);
- }
- return string;
- }
- /************************************************/
- /* */
- /* null_lstrupr */
- /* */
- /************************************************/
- /* Just returns passed string. Used where uppercase
- * conversions are not required for a language.
- */
- char *null_lstrupr (char *s, DBLK *d)
- { return s; }
- /************************************************/
- /* */
- /* load_language */
- /* */
- /************************************************/
- /* Loads a dblk with a specific language's
- * structures and function pointers.
- * Does not reload structures previously loaded in
- * other dblks on dblist if derived from identical files.
- * But always loads structures if passed dblist is NULL.
- * Presumes dblk already partially initialized with mandatory fields:
- * name, path, language.
- * May also be preinitialized with optional fields:
- * minwordsz, maxwordsz.
- * Returns TRUE if all successful.
- * Otherwise returns FALSE with err msgs on ausapi_msglist.
- */
- int load_language (DBLK *dblk, DBLK *dblist)
- {
- int oops = FALSE;
- int language = dblk->dbrec.or_language;
- if (debugging_loadlang)
- fprintf (aa_stderr,
- "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
- (int)dblk->dbrec.or_language,
- language_name (dblk->dbrec.or_language),
- NULLORSTR(dblk->name));
- /*
- * Note: Load list functions must be called
- * AFTER charmap and lstrupr are loaded.
- */
- switch (language) {
- case DtSrLaENG:
- case DtSrLaENG2:
- case DtSrLaESP:
- case DtSrLaFRA:
- case DtSrLaITA:
- case DtSrLaDEU:
- dblk->charmap = (language == DtSrLaENG)?
- ascii_charmap : latin_charmap;
- dblk->parser = teskey_parser;
- dblk->stemmer = paice_stemmer;
- dblk->lstrupr = euro_lstrupr;
- if (dblk->dbrec.or_maxwordsz == 0)
- dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
- MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
- if (dblk->dbrec.or_minwordsz == 0)
- dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
- oops = FALSE;
- if (!load_stop_list (dblk, dblist))
- oops = TRUE;
- if (!load_include_list (dblk, dblist))
- oops = TRUE;
- if (!load_paice_suffixes (dblk, dblist))
- oops = TRUE;
- if (oops)
- return FALSE;
- break;
- case DtSrLaJPN:
- case DtSrLaJPN2:
- return load_jpn_language (dblk, dblist);
- default:
- /* Try loading a custom 'user' language.
- * If he failed to provide a loader function,
- * the dummy custom loader will tell him so.
- * If he provided one but it can't load this language,
- * it should return it's own error msgs.
- */
- return load_custom_language (dblk, dblist);
- } /* end switch (language) */
- return TRUE;
- } /* load_language() */
- /************************************************/
- /* */
- /* unload_language */
- /* */
- /************************************************/
- /* Frees storage for structures allocated by load_language().
- * Called when engine REINITs due to change in site config file
- * or databases.
- * Duplicate wordtrees are not unloaded because they
- * will have already been unloaded in a previous dblk.
- */
- void unload_language (DBLK *dblk)
- {
- switch (dblk->dbrec.or_language) {
- case DtSrLaENG:
- case DtSrLaENG2:
- case DtSrLaESP:
- case DtSrLaFRA:
- case DtSrLaITA:
- case DtSrLaDEU:
- dblk->charmap = NULL;
- if ((dblk->lang_flags & LF_DUP_STP) == 0)
- free_wordtree (&dblk->stoplist);
- else {
- dblk->stoplist = NULL;
- dblk->lang_flags &= ~LF_DUP_STP;
- }
- if ((dblk->lang_flags & LF_DUP_INC) == 0)
- free_wordtree (&dblk->inclist);
- else {
- dblk->inclist = NULL;
- dblk->lang_flags &= ~LF_DUP_INC;
- }
- if ((dblk->lang_flags & LF_DUP_SFX) == 0)
- free_paice_rules ((PRULE***)&dblk->stem_extra);
- else {
- dblk->stem_extra = NULL;
- dblk->lang_flags &= ~LF_DUP_SFX;
- }
- break;
- case DtSrLaJPN:
- case DtSrLaJPN2:
- unload_jpn_language (dblk);
- break;
- default:
- unload_custom_language (dblk);
- break;
- }
- return;
- } /* unload_language() */
- /******************** LANG.C ********************/
|