lang.c 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * COMPONENT_NAME: austext
  25. *
  26. * FUNCTIONS: euro_lstrupr
  27. * free_wordtree
  28. * is_concordable
  29. * language_name
  30. * load_include_list
  31. * load_language
  32. * load_paice_suffixes
  33. * load_stop_list
  34. * load_wordtree
  35. * null_lstrupr
  36. * null_stemmer
  37. * paice_stemmer
  38. * search_wordtree
  39. * teskey_parser
  40. * unload_language
  41. *
  42. * ORIGINS: 27
  43. *
  44. *
  45. * (C) COPYRIGHT International Business Machines Corp. 1995,1996
  46. * All Rights Reserved
  47. * Licensed Materials - Property of IBM
  48. * US Government Users Restricted Rights - Use, duplication or
  49. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  50. */
  51. /******************** LANG.C ********************
  52. * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
  53. * July 1995.
  54. * Includes load_language(), unload_language(), and functions and data for
  55. * parsing and stemming European languages in DtSearch/AusText.
  56. * Incorporates p/o socrates.c, p/o proctext.c, parser.c
  57. * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
  58. * Related to similar semantic modules repackaged into semantic.c.
  59. * Paice suffix removal algorithm from C. Paice, 1990,
  60. * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
  61. *
  62. * $Log$
  63. * Revision 2.13 1996/03/25 18:55:26 miker
  64. * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  65. *
  66. * Revision 2.12 1996/03/25 17:00:19 miker
  67. * Cleanup compiler warning.
  68. *
  69. * Revision 2.11 1996/03/13 22:58:13 miker
  70. * Changed char to UCHAR several places.
  71. *
  72. * Revision 2.10 1996/03/05 16:49:58 miker
  73. * Move COMMENT_CHARS to SearchP.h.
  74. *
  75. * Revision 2.9 1996/03/05 16:31:20 miker
  76. * Added test of PA_MSGS for yacc-based boolean queries.
  77. * Made comment chars in linguistic files independent of locale.
  78. * Changed several char ptrs to unsigned char so parser will
  79. * work when compiled under default signed char compilers.
  80. * Simplified several statements with LHS *var++ for same reason.
  81. *
  82. * Revision 2.8 1996/02/05 16:16:05 miker
  83. * Restore prolog.
  84. *
  85. * Revision 2.7 1996/02/05 16:10:54 miker
  86. * load_paice_suffixes: discard .sfx lines beginning with all numeric
  87. * first token for compatibility with older file formats.
  88. *
  89. * Revision 2.6 1996/02/01 19:11:43 miker
  90. * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
  91. * Moved charmaps to new module langmap.c. Removed hard coded
  92. * paice stemmer values--now dynamic from .sfx file.
  93. *
  94. * Revision 2.5 1995/10/26 14:55:28 miker
  95. * Added prolog.
  96. *
  97. * Revision 2.4 1995/10/19 20:54:36 miker
  98. * Increased msg buf sizes to accommodate larger database file names.
  99. *
  100. * Revision 2.3 1995/10/06 14:39:45 miker
  101. * Bug fix: coredump loading multiple databases
  102. * on Solaris.
  103. *
  104. * Revision 2.2 1995/10/03 21:39:10 miker
  105. * Changed teskey_parser, paice_stemmer, and null_stemmer
  106. * to return number of words parsed/stemmed, not just boolean.
  107. *
  108. * Revision 2.1 1995/09/22 21:00:19 miker
  109. * Freeze DtSearch 0.1, AusText 2.1.8
  110. *
  111. * Revision 1.3 1995/09/19 22:08:28 miker
  112. * Added support for loading and parsing Japanese language DtSrLaJPN.
  113. *
  114. * Revision 1.2 1995/09/05 21:34:52 miker
  115. * Fixed bug: search engine wouldn't parse words of exactly
  116. * 3 or 15 chars.
  117. *
  118. * Revision 1.1 1995/08/31 21:03:44 miker
  119. * Initial revision
  120. */
#include "SearchP.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <wchar.h>
#include <sys/stat.h>
#define X_INCLUDE_STRING_H
#define XOS_USE_NO_LOCKING
#include <X11/Xos_r.h>
  131. #define PROGNAME "LANG"
  132. #define EXT_SUFFIX ".sfx" /* standard paice suffix file format */
  133. #define OUTBUFSZ 6140
  134. #define SFX_DELIMS " \t\n"
  135. #define MS_misc 1
  136. #define MS_lang 15
  137. #define IS_VOWEL(c) ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
  138. /************************************************/
  139. /* */
  140. /* PRULE */
  141. /* */
  142. /************************************************/
  143. /* List of Paice suffix removal rules from .sfx files */
  144. typedef struct prule_t {
  145. struct prule_t *link; /* Ptr to next list node */
  146. UCHAR *suffix; /* Applicable suffix string, backwards */
  147. UCHAR suflen; /* Length of suffix */
  148. char must_be_intact; /* Optional '*'. Rule only applies
  149. * to intact words */
  150. UCHAR remove_count; /* Number of suffix chars to remove */
  151. UCHAR aplen; /* Length of apndstr */
  152. UCHAR *apndstr; /* Optional append string */
  153. char is_last_rule; /* '$' terminate or '>' continue algorithm */
  154. } PRULE;
  155. char *ensure_end_slash (char *pathstr);
  156. void unload_jpn_language (DBLK *dblk);
  157. /************************************************/
  158. /* */
  159. /* GLOBALS */
  160. /* */
  161. /************************************************/
  162. int debugging_loadlang = FALSE;
  163. int debugging_loadword = FALSE;
  164. int debugging_search_wordtree = FALSE;
  165. int debugging_teskey = FALSE;
  166. int debugging_paice = FALSE;
  167. static int *paice_charmap;
  168. static char paicebuf [DtSrMAXWIDTH_HWORD + 2];
  169. static int paicelen;
  170. static int paicewcsl;
  171. static int word_is_intact;
  172. /* Language strings correspond to DtSrLa.. constants. */
  173. static char *lang_fnames[] = {
  174. "eng", /* 0 */
  175. "eng", /* 1 ('eng2' same files as 'eng') */
  176. "esp", /* 2 */
  177. "fra", /* 3 */
  178. "ita", /* 4 */
  179. "deu", /* 5 */
  180. "jpn", /* 6 */
  181. "jpn", /* 7 ('jpn2' same files as 'jpn' */
  182. NULL
  183. };
  184. /************************************************/
  185. /* */
  186. /* language_name */
  187. /* */
  188. /************************************************/
  189. /* Returns language name string given language number */
  190. static char *language_name (DtSrINT16 langno)
  191. {
  192. static char *language_names[] = {
  193. "English-ASCII", /* 0 = DtSrLaENG */
  194. "English-Latin1", /* 1 = DtSrLaENG2 */
  195. "Spanish", /* 2 = DtSrLaESP */
  196. "French", /* 3 = DtSrLaFRA */
  197. "Italian", /* 4 = DtSrLaITA */
  198. "German", /* 5 = DtSrLaDEU */
  199. "Japanese-comp", /* 6 = DtSrLaJPN */
  200. "Japanese-.knj" /* 7 = DtSrLaJPN2 */
  201. };
  202. if (langno < 0)
  203. return "INVALID!";
  204. else if (langno > DtSrLaLAST)
  205. return "(Custom Language)";
  206. else
  207. return language_names [langno];
  208. } /* language_name() */
  209. /************************************************/
  210. /* */
  211. /* search_wordtree */
  212. /* */
  213. /************************************************/
  214. /* Sept 1991.
  215. * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
  216. * Searches a word list in a binary WORDTREE.
  217. * Passed wordstring is presumed to be a clean,
  218. * uppercase word token string terminated by \0.
  219. * Variables are static for speeeeed.
  220. * Returns TRUE if successful search, else FALSE.
  221. * See also search_wordtree_jpn() in jpn.c
  222. */
  223. static int search_wordtree (WORDTREE *wordtree, char *wordstring)
  224. {
  225. static int direction;
  226. static WORDTREE *node;
  227. if (debugging_search_wordtree)
  228. fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
  229. wordstring);
  230. /* MAIN SEARCH LOOP: binary tree search */
  231. for (node = wordtree; node != NULL; ) {
  232. if ((direction = strcmp (wordstring, node->word)) == 0) {
  233. if (debugging_search_wordtree)
  234. fprintf (aa_stderr, " HIT!\n");
  235. return TRUE;
  236. }
  237. /* Descend left or right depending on word */
  238. if (debugging_search_wordtree)
  239. fprintf (aa_stderr, " %c '%s'\n",
  240. (direction < 0) ? 'L' : 'R', (char *) node->word);
  241. if (direction < 0)
  242. node = node->llink;
  243. else
  244. node = node->rlink;
  245. }
  246. if (debugging_search_wordtree)
  247. fprintf (aa_stderr, " MISS.\n");
  248. return FALSE;
  249. } /* search_wordtree() */
  250. static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
  251. {
  252. int len = -1;
  253. if (p < s) goto done;
  254. if (*p >= 0 && *p <= 0x7F) {
  255. len = 1;
  256. *pwc = *p;
  257. goto done;
  258. }
  259. if (p == s) goto done;
  260. len = mbtowc (pwc, p - 1, MB_CUR_MAX);
  261. done:
  262. if (len < 0 || *pwc > 0xFF) *pwc = 0x100;
  263. return len;
  264. }
  265. static char *euro_wctomb (int c, char *outp, int len)
  266. {
  267. wchar_t wc = c & 0xFF;
  268. if (len > 1) wctomb (outp, wc);
  269. else *outp = wc;
  270. return outp + len;
  271. }
  272. static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
  273. wchar_t *pwc)
  274. {
  275. int len = 1;
  276. *pwc = *outp = cofunction (cofunction_arg);
  277. if (*pwc >= 0 && *pwc <= 0x7F) goto done;
  278. *(outp + len++) = cofunction (NULL);
  279. if (mbtowc (pwc, outp, MB_CUR_MAX) >= 0) goto done;
  280. *pwc = 0x100;
  281. for (;;) {
  282. if (len >= MB_CUR_MAX) break;
  283. *(outp + len++) = cofunction (NULL);
  284. if (mblen (outp, MB_CUR_MAX) >= 0) break;
  285. }
  286. done:
  287. if (*pwc > 0xFF) *pwc = 0x100;
  288. return len;
  289. }
  290. /************************************************/
  291. /* */
  292. /* teskey_parser */
  293. /* */
  294. /************************************************/
  295. /* 1989.
  296. * Teskey_parser() is derived from the former Socrates() in socrates.c.
  297. * Returns next teskey-parsed word token from a character stream.
  298. * Called from (1) dtsrindex, where readchar_ftext() cofunction
  299. * reads the .fzk file document 'stream', or (2) search engine
  300. * query parsers, where readchar_string() cofunction 'reads'
  301. * from the query string.
  302. * (The word hiliting parser does not directly call teskey_parser; it has
  303. * its own simplified equivalent to the parsing algorithms herein.)
  304. *
  305. * First call passes args in PARG structure. This resets end of
  306. * text block (ETX) flag, resets 'offset' counter to zero, etc.
  307. * Subsequent calls should pass NULL, and parser returns
  308. * next token in block, until reader cofunction reads ETX,
  309. * ie special ETX char ('\0'). Subsequent calls to parser
  310. * return NULL meaning "no tokens left in current stream".
  311. * Reader cofunctions tolerate repeated calls after
  312. * the first ETX, still returning '\0'.
  313. *
  314. * This parser presumes all incoming text is unformatted.
  315. * Since parser accesses streams a char at a time it does
  316. * not require periodic line feeds or anything else.
  317. *
  318. * Parser also returns offset information: number of bytes
  319. * since beginning of text block.
  320. *
  321. * Variables are static for speeeeeeed.
  322. *
  323. * OUTPUT FORMAT: NULL or a static C string containing a single
  324. * parsed word token. Word buffer reused at next call.
  325. * Each word is translated as follows:
  326. * All alphas TO UPPERCASE.
  327. * Teskey algorithm used to find word boundaries.
  328. * Always keeps include-list words.
  329. * Throws away stoplist words, very short words, and very long words.
  330. * All intervening nonconcordables discarded.
  331. *
  332. * There is a slight mod to the published Teskey algorithm.
  333. * Words can begin with optionally concordable chars
  334. * but not end with them. For example if '-' is optionally
  335. * concordable, '-foo-' will be parsed into '-foo'.
  336. */
  337. char *teskey_parser (PARG *parg)
  338. {
  339. static READCFP cofunction;
  340. static void *cofunction_arg;
  341. static DBLK *dblk = NULL;
  342. static char *outbuf = NULL;
  343. static size_t outbufsz = 0;
  344. static char *endmaxword; /* end largest possible output word */
  345. static char *outp; /* next loc in outbuf */
  346. static int *charmap;
  347. static int minwordsz, maxwordsz;
  348. static int wordlen;
  349. static enum {BETW_WORDS, IN_WORD, TOO_LONG}
  350. tpstate;
  351. static long *offsetp, readcount, candidate_offset;
  352. static int is_hiliting;
  353. static int add_msgs;
  354. static int len, opt_len;
  355. static wchar_t wc;
  356. /* If first call for current text block... */
  357. if (parg) {
  358. dblk = parg->dblk;
  359. minwordsz = dblk->dbrec.or_minwordsz;
  360. maxwordsz = dblk->dbrec.or_maxwordsz;
  361. charmap = dblk->charmap;
  362. offsetp = parg->offsetp;
  363. is_hiliting = (parg->flags & PA_HILITING);
  364. add_msgs = (parg->flags & PA_MSGS);
  365. if (charmap == NULL) {
  366. fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 4,
  367. "%s dblk not initialized.\n"),
  368. PROGNAME"801");
  369. DtSearchExit (55);
  370. }
  371. if (parg->string) {
  372. cofunction_arg = parg->string;
  373. cofunction = (READCFP) readchar_string;
  374. }
  375. else if (parg->ftext) {
  376. cofunction_arg = parg;
  377. cofunction = (READCFP) readchar_ftext;
  378. }
  379. else {
  380. fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 5,
  381. "%s Program Error: parg contains neither file nor string.\n"),
  382. PROGNAME"327");
  383. DtSearchExit (27);
  384. }
  385. if (outbufsz <= maxwordsz) {
  386. if (outbuf)
  387. free (outbuf);
  388. outbufsz = maxwordsz + 8;
  389. outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
  390. }
  391. endmaxword = outbuf + maxwordsz;
  392. if (debugging_teskey)
  393. fprintf (aa_stderr,
  394. "teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
  395. maxwordsz, (unsigned long) outbufsz);
  396. readcount = 0L;
  397. }
  398. /* CANDIDATE WORD LOOP: Read text chars into outbuf.
  399. * Exit loop when outbuf contains one candidate token or at ETX.
  400. */
  401. READ_ANOTHER_WORD:
  402. outp = outbuf;
  403. tpstate = BETW_WORDS;
  404. for (;;) {
  405. len = euro_readchar (cofunction, cofunction_arg, outp, &wc);
  406. if (!wc) break;
  407. readcount += len;
  408. cofunction_arg = NULL;
  409. /*------------- BETW_WORDS State ------------
  410. * Reader is between word tokens.
  411. */
  412. if (tpstate == BETW_WORDS) {
  413. /*
  414. * Discard nonconcordable chars between words.
  415. */
  416. if ((charmap[wc] & NON_CONCORD) != 0)
  417. continue;
  418. /*
  419. * Fully concordable char is definite start of new word.
  420. * Convert to uppercase and go get next char.
  421. */
  422. if ((charmap[wc] & CONCORDABLE) != 0) {
  423. outp = euro_wctomb (charmap[wc], outp, len);
  424. candidate_offset = readcount;
  425. tpstate = IN_WORD;
  426. continue;
  427. }
  428. /*
  429. * Must be optionally concordable. It can only
  430. * start a new word if next char is concordable.
  431. * If so, convert a fully concordable char
  432. * to uppercase and go get next char.
  433. * Otherwise discard just like non_concord.
  434. */
  435. outp += len;
  436. opt_len = euro_readchar (cofunction, NULL, outp, &wc);
  437. if (wc) readcount += opt_len;
  438. if ((charmap[wc] & CONCORDABLE) != 0) {
  439. outp = euro_wctomb (charmap[wc], outp, opt_len);
  440. candidate_offset = readcount - opt_len;
  441. tpstate = IN_WORD;
  442. continue;
  443. }
  444. else {
  445. outp -= len;
  446. continue;
  447. }
  448. } /* endif BETW_WORDS */
  449. /*------------- IN_WORD State ------------
  450. * Reader is in middle of a word.
  451. * Convert all concordables to uppercase and append.
  452. * Terminate word at first non_concord.
  453. * Non_concords treatment depends on next char.
  454. */
  455. else if (tpstate == IN_WORD) {
  456. if ((charmap[wc] & CONCORDABLE) != 0) {
  457. if (outp < endmaxword) {
  458. outp = euro_wctomb (charmap[wc], outp, len);
  459. }
  460. else {
  461. tpstate = TOO_LONG;
  462. if (debugging_teskey)
  463. fprintf (aa_stderr,
  464. "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
  465. candidate_offset-1, outbuf);
  466. if (add_msgs) {
  467. char msgbuf [DtSrMAXWIDTH_HWORD + 100];
  468. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 8,
  469. "%s '%.*s...' is larger\n"
  470. "than the maximum word size of database '%s'.") ,
  471. PROGNAME"449", maxwordsz,
  472. parg->string, dblk->label);
  473. DtSearchAddMessage (msgbuf);
  474. return NULL;
  475. }
  476. outbuf[0] = 0;
  477. outp = outbuf;
  478. }
  479. continue;
  480. }
  481. if ((charmap[wc] & NON_CONCORD) != 0) {
  482. *outp = '\0';
  483. break;
  484. }
  485. /* Must be opt_concord... */
  486. outp += len;
  487. opt_len = euro_readchar (cofunction, NULL, outp, &wc);
  488. if (wc) readcount += opt_len;
  489. if ((charmap[wc] & CONCORDABLE) != 0) {
  490. if (outp < endmaxword) {
  491. outp = euro_wctomb (charmap[wc], outp, opt_len);
  492. }
  493. else {
  494. tpstate = TOO_LONG;
  495. if (debugging_teskey)
  496. fprintf (aa_stderr,
  497. "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
  498. candidate_offset-1, outbuf);
  499. outbuf[0] = 0;
  500. outp = outbuf;
  501. }
  502. continue;
  503. }
  504. else { /* next char NOT concordable...*/
  505. outp -= len;
  506. *outp = '\0';
  507. break;
  508. }
  509. } /* endif IN_WORD */
  510. /*------------- TOO_LONG State ------------
  511. * Reader is in middle of a word that exceeds max word size.
  512. * Discard all concordables and opt_concords until we
  513. * can get between words again with a clear non_concord.
  514. */
  515. else if (tpstate == TOO_LONG) {
  516. if ((charmap[wc] & NON_CONCORD) != 0) {
  517. outp = outbuf;
  518. tpstate = BETW_WORDS;
  519. }
  520. continue;
  521. }
  522. /*------------- UNKNOWN State ------------*/
  523. else {
  524. fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 10,
  525. "%s Program Error: Unknown parser state.\n"),
  526. PROGNAME"306");
  527. DtSearchExit (26);
  528. }
  529. } /* end read loop for next CANDIDATE WORD */
  530. /*---------- TEST FOR ETX -------------*/
  531. if (outbuf[0] == 0) {
  532. if (debugging_teskey)
  533. fprintf (aa_stderr, "teskey: etx\n");
  534. if (add_msgs) {
  535. char msgbuf [200];
  536. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 12,
  537. "%s '%.120s' is not a valid word in database '%s'.") ,
  538. PROGNAME"506", parg->string, dblk->label);
  539. DtSearchAddMessage (msgbuf);
  540. }
  541. return NULL;
  542. }
  543. wordlen = strlen (outbuf);
  544. candidate_offset--; /* token offset is one less than number of reads */
  545. if (debugging_teskey)
  546. fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
  547. candidate_offset, outbuf);
  548. if (is_hiliting) {
  549. if (debugging_teskey)
  550. fprintf (aa_stderr, ", (hiliting, skip tree searches)");
  551. goto GOOD_WORD;
  552. }
  553. /*--------- INCLUDE LIST ----------
  554. * Search before testing for stoplist or minimum word length.
  555. */
  556. if (dblk->inclist != NULL) {
  557. if (search_wordtree (dblk->inclist, outbuf)) {
  558. if (debugging_teskey)
  559. fprintf (aa_stderr, ", (INCLUDE LIST)");
  560. goto GOOD_WORD;
  561. }
  562. }
  563. /*--------- TOO SHORT -----------*/
  564. if (wordlen < minwordsz) {
  565. if (debugging_teskey)
  566. fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
  567. if (add_msgs) {
  568. char msgbuf [200];
  569. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 17,
  570. "%s '%s' is less than the\n"
  571. "minimum word size of database '%s'.") ,
  572. PROGNAME"543", parg->string, dblk->label);
  573. DtSearchAddMessage (msgbuf);
  574. return NULL;
  575. }
  576. goto READ_ANOTHER_WORD;
  577. }
  578. /*----------- STOP LIST -------------*/
  579. if (dblk->stoplist != NULL) {
  580. if (search_wordtree (dblk->stoplist, outbuf)) {
  581. if (debugging_teskey)
  582. fprintf (aa_stderr, ", (STOP LIST)\n");
  583. if (add_msgs) {
  584. char msgbuf [200];
  585. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 19,
  586. "%s The word '%s' is not indexed in database '%s'.") ,
  587. PROGNAME"558", parg->string, dblk->label);
  588. DtSearchAddMessage (msgbuf);
  589. return NULL;
  590. }
  591. goto READ_ANOTHER_WORD;
  592. }
  593. }
  594. GOOD_WORD:
  595. /* Word is correctly parsed and passes all dblk filters. */
  596. if (debugging_teskey)
  597. fprintf (aa_stderr, ", ...good word\n");
  598. if (offsetp)
  599. *offsetp = candidate_offset;
  600. return outbuf;
  601. } /* teskey_parser() */
  602. /************************************************/
  603. /* */
  604. /* is_concordable */
  605. /* */
  606. /************************************************/
  607. /* Verifies passed word token is teskey-concordable
  608. * in code page of passed charmap. Used in validating
  609. * word files. Returns TRUE if all chars concordable
  610. * or optionally concordable, else returns FALSE.
  611. */
  612. int is_concordable (char *word, int *charmap)
  613. {
  614. char *cptr;
  615. wchar_t wc;
  616. for (cptr = word; *cptr != 0; cptr++) {
  617. euro_mbtowc (&wc, cptr, word);
  618. if ((charmap[wc] & NON_CONCORD) != 0)
  619. break;
  620. }
  621. return (*cptr == 0);
  622. } /* is_concordable() */
  623. /************************************************/
  624. /* */
  625. /* load_wordtree */
  626. /* */
  627. /************************************************/
  628. /* Called by load_stop_list(), load_include_list(), etc,
  629. * to read an appropriate word list file into binary tree structures.
  630. *
  631. * INPUT FILE FORMAT: One word per line, all chars teskey concordable.
  632. * Preferred order is frequency of occurrence in the corpus
  633. * to make searches efficient. Otherwise the words should at least
  634. * be in random order or an order that will approximate a binary search.
  635. * If first char is any of COMMENT_CHARS, line is ignored as comments.
  636. * Ascii spaces, tabs, or newline delimits the first word token--
  637. * anything else on the line is ignored as comments.
  638. * Optionally characters in word token will be checked for teskey
  639. * concordability.
  640. *
  641. * RETURNS 0 if file successfully loaded, returns 1 if file missing,
  642. * returns 2 and messages in global msglist if file has fatal errors.
  643. */
  644. int load_wordtree (
  645. WORDTREE **treetop,
  646. DBLK *dblk,
  647. char *fname,
  648. int do_teskey_test)
  649. {
  650. int i;
  651. int errcount;
  652. int is_duplicate;
  653. long linecount = 0;
  654. char *token;
  655. char readbuf [256];
  656. char sprintbuf [_POSIX_PATH_MAX + 1024];
  657. FILE *fileid;
  658. WORDTREE *new;
  659. WORDTREE **this_link;
  660. _Xstrtokparams strtok_buf;
  661. if (debugging_loadlang)
  662. fprintf (aa_stderr, PROGNAME"1071 "
  663. "load_wordtree: db=%s fname='%s'\n",
  664. NULLORSTR(dblk->name), NULLORSTR(fname));
  665. if ((fileid = fopen (fname, "rt")) == NULL) {
  666. /* Not being able to find the file is not an error.
  667. * We indicate that with the return code.
  668. * But any other error (like permissions) is fatal.
  669. */
  670. if (errno == ENOENT) {
  671. if (debugging_loadlang)
  672. fputs (" ...file not found.\n", aa_stderr);
  673. return 1;
  674. }
  675. else {
  676. sprintf (sprintbuf,
  677. CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
  678. PROGNAME"362", fname, strerror(errno));
  679. DtSearchAddMessage (sprintbuf);
  680. return 2;
  681. }
  682. }
  683. /*--------- Main Read Loop ----------*/
  684. errcount = 0;
  685. while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
  686. linecount++;
  687. /*
  688. * Ignore comment lines beginning with punctuation char.
  689. * Ignore empty lines (strtok returns NULL, no tokens).
  690. * Otherwise first or only word on line is the desired word.
  691. */
  692. if (strchr (COMMENT_CHARS, readbuf[0]))
  693. continue;
  694. if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
  695. continue;
  696. dblk->lstrupr (token, dblk);
  697. if (debugging_loadword)
  698. fprintf (aa_stderr, " WORD: '%s' ", token);
  699. /* If requested confirm all chars are teskey-concordable. */
  700. if (do_teskey_test)
  701. if (!is_concordable (token, dblk->charmap)) {
  702. sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 400,
  703. "%s: %s, line %ld: Invalid chars in word '%s'."),
  704. PROGNAME"400", fname, linecount, token);
  705. DtSearchAddMessage (sprintbuf);
  706. errcount++;
  707. continue;
  708. }
  709. /* Unless we've already detected some errors,
  710. * allocate a new node and load its data fields.
  711. */
  712. if (errcount)
  713. continue;
  714. i = strlen (token);
  715. new = austext_malloc (sizeof(WORDTREE) + i + 4,
  716. PROGNAME"104", NULL);
  717. new->llink = NULL;
  718. new->rlink = NULL;
  719. new->len = i;
  720. new->word = (void *) (new + 1);
  721. strcpy (new->word, token);
  722. /* Descend binary tree and insert in correct alphabetical place */
  723. is_duplicate = FALSE;
  724. for (this_link = treetop; *this_link != NULL; ) {
  725. i = strcmp (new->word, (*this_link)->word);
  726. /* test for duplicate word */
  727. if (i == 0) {
  728. sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 423,
  729. "%s Word '%s' in '%s' is a duplicate."),
  730. PROGNAME"423", token, fname);
  731. DtSearchAddMessage (sprintbuf);
  732. /* duplicates aren't fatal, just ignore the word */
  733. is_duplicate = TRUE;
  734. break; /* no point in continuing descent */
  735. }
  736. /* Descend tree to find correct insertion point */
  737. if (debugging_loadword)
  738. fputc(((i < 0)? 'L' : 'R'), aa_stderr);
  739. this_link = (WORDTREE **) ((i < 0) ?
  740. &(*this_link)->llink : &(*this_link)->rlink);
  741. } /* end forloop to find tree insertion point */
  742. /* Don't link anything if error found while descending tree */
  743. if (is_duplicate) {
  744. if (debugging_loadword)
  745. fputs (" duplicate!\n", aa_stderr);
  746. free (new);
  747. continue;
  748. }
  749. /* Insert new node at current location in tree */
  750. *this_link = new;
  751. if (debugging_loadword)
  752. fputs(" .\n", aa_stderr);
  753. } /* end of read loop */
  754. fclose (fileid);
  755. if (errcount) {
  756. if (debugging_loadlang)
  757. fprintf (aa_stderr,
  758. PROGNAME"1186 load word file '%s' failed.\n", fname);
  759. return 2;
  760. }
  761. else {
  762. if (debugging_loadlang)
  763. fprintf (aa_stderr,
  764. PROGNAME"1193 load word file '%s' successful.\n", fname);
  765. return 0;
  766. }
  767. } /* load_wordtree() */
  768. /************************************************/
  769. /* */
  770. /* free_wordtree */
  771. /* */
  772. /************************************************/
  773. /* Formerly free_bintree() in msgutil.c.
  774. * Frees storage for all nodes in a WORDTREE and
  775. * sets its top-of-list pointer to NULL.
  776. * Works only for node structures where all memory
  777. * was allocated in a single call to malloc().
  778. * Uses link inversion traversal (eg, Data Structure Techniques,
  779. * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
  780. * at preorder visit, and node is freed at postorder visit.
  781. */
  782. static void free_wordtree (WORDTREE ** wordtree_head)
  783. {
  784. WORDTREE *next;
  785. WORDTREE *prev = NULL;
  786. WORDTREE *pres = *wordtree_head;
  787. if (*wordtree_head == NULL)
  788. return;
  789. DESCEND_LEFT:
  790. pres->word = (void *) 0; /* preorder visit: TAG = 0 */
  791. next = pres->llink;
  792. if (next != NULL) {
  793. pres->llink = prev;
  794. prev = pres;
  795. pres = next;
  796. goto DESCEND_LEFT;
  797. }
  798. DESCEND_RIGHT:
  799. next = pres->rlink;
  800. if (next != NULL) {
  801. pres->word = (void *) 1; /* TAG = 1 */
  802. pres->rlink = prev;
  803. prev = pres;
  804. pres = next;
  805. goto DESCEND_LEFT;
  806. }
  807. POSTORDER_VISIT:
  808. free (pres);
  809. if (prev == NULL) { /* end of algorithm? */
  810. *wordtree_head = NULL;
  811. return;
  812. }
  813. if (prev->word == (void *) 0) { /* go up left leg */
  814. next = prev->llink;
  815. pres = prev;
  816. prev = next;
  817. goto DESCEND_RIGHT;
  818. }
  819. else { /* go up right leg */
  820. next = prev->rlink;
  821. prev->word = (void *) 0; /* restore TAG = 0 */
  822. pres = prev;
  823. prev = next;
  824. goto POSTORDER_VISIT;
  825. }
  826. } /* free_wordtree() */
  827. /************************************************/
  828. /* */
  829. /* load_include_list */
  830. /* */
  831. /************************************************/
  832. /* Builds include list by reading include file
  833. * into a binary tree structure.
  834. * Unlike stoplists, include-lists are optional.
  835. * Also unlike stoplists, there are no language default include-lists.
  836. * 'dblist' may be NULL.
  837. * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
  838. */
  839. static int load_include_list (DBLK *dblk, DBLK *dblist)
  840. {
  841. int i;
  842. int filename_was_null = (dblk->fname_inc == NULL);
  843. DBLK *db;
  844. char sprintbuf [512];
  845. dblk->inclist = NULL; /* just to be sure */
  846. if (debugging_loadlang)
  847. fprintf (aa_stderr,
  848. PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
  849. NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
  850. language_name(dblk->dbrec.or_language));
  851. /* If file name not provided, generate one based on
  852. * dblk's path, database name, and default extension.
  853. */
  854. if (filename_was_null) {
  855. if (dblk->name[0] == 0) {
  856. dblk->fname_inc = "";
  857. dblk->inclist = NULL;
  858. if (debugging_loadlang)
  859. fprintf (aa_stderr, PROGNAME"1339 "
  860. "No inclist because neither fname nor dbname provided.\n");
  861. return TRUE;
  862. }
  863. if (dblk->path == NULL)
  864. dblk->path = strdup("");
  865. dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
  866. PROGNAME"1187", NULL);
  867. strcpy (dblk->fname_inc, dblk->path);
  868. ensure_end_slash (dblk->fname_inc);
  869. strcat (dblk->fname_inc, dblk->name);
  870. strcat (dblk->fname_inc, EXT_INCLIST);
  871. }
  872. if (debugging_loadlang)
  873. fprintf (aa_stderr,
  874. PROGNAME"1350 Include list file name = '%s'.\n",
  875. dblk->fname_inc);
  876. /* Don't reload the same file if it's already
  877. * been loaded into a previous dblk in a list.
  878. * Code works just fine if dblist == NULL.
  879. */
  880. for (db = dblist; db != NULL; db = db->link) {
  881. if (db == dblk || db->fname_inc == NULL)
  882. continue;
  883. if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
  884. dblk->inclist = db->inclist;
  885. dblk->lang_flags |= LF_DUP_INC;
  886. if (debugging_loadlang)
  887. fprintf (aa_stderr, PROGNAME"1363 "
  888. "Using previously loaded inclist, db='%s'.\n",
  889. dblk->name);
  890. return TRUE;
  891. }
  892. }
  893. /* Include list is optional so missing file is
  894. * not an error unless caller named a specific file.
  895. */
  896. i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
  897. switch (i) {
  898. case 0:
  899. return TRUE;
  900. case 1:
  901. if (filename_was_null) {
  902. dblk->fname_inc = "";
  903. dblk->inclist = NULL;
  904. return TRUE;
  905. }
  906. else {
  907. sprintf (sprintbuf,
  908. CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
  909. PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
  910. DtSearchAddMessage (sprintbuf);
  911. return FALSE;
  912. }
  913. default:
  914. return FALSE;
  915. }
  916. } /* load_include_list() */
  917. /************************************************/
  918. /* */
  919. /* load_stop_list */
  920. /* */
  921. /************************************************/
  922. /* Builds stoplist by reading stoplist file into a
  923. * binary tree structure. File name can be
  924. * (1) passed in dblk.fname_stp,
  925. * (2) generated from dblk path, name, and '.stp',
  926. * (3) default for dblk path, language, and '.stp'.
  927. * 'dblist' may be NULL.
  928. * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
  929. */
  930. static int load_stop_list (DBLK *dblk, DBLK *dblist)
  931. {
  932. int i;
  933. DBLK *db;
  934. char sprintbuf [_POSIX_PATH_MAX + 512];
  935. struct stat statbuf;
  936. dblk->stoplist = NULL; /* just to be sure */
  937. if (debugging_loadlang)
  938. fprintf (aa_stderr,
  939. PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
  940. NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
  941. language_name(dblk->dbrec.or_language));
  942. /* If file name not provided, generate one based on
  943. * dblk's path, database name, and default extension.
  944. * And if that doesn't work, generate one based on
  945. * dblk's path, language, and default extension.
  946. */
  947. if (dblk->fname_stp == NULL) {
  948. if (dblk->path == NULL)
  949. dblk->path = strdup("");
  950. dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
  951. PROGNAME"919", NULL);
  952. strcpy (dblk->fname_stp, dblk->path);
  953. ensure_end_slash (dblk->fname_stp);
  954. strcat (dblk->fname_stp, dblk->name);
  955. strcat (dblk->fname_stp, EXT_STOPLIST);
  956. errno = 0;
  957. stat (dblk->fname_stp, &statbuf);
  958. if (errno == ENOENT) {
  959. strcpy (dblk->fname_stp, dblk->path);
  960. ensure_end_slash (dblk->fname_stp);
  961. strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
  962. strcat (dblk->fname_stp, EXT_STOPLIST);
  963. }
  964. }
  965. if (debugging_loadlang)
  966. fprintf (aa_stderr,
  967. PROGNAME"1448 Stoplist file name = '%s'.\n",
  968. dblk->fname_stp);
  969. /* Don't reload the same file if it's already
  970. * been loaded into a previous dblk in a list.
  971. * Code works just fine if dblist == NULL.
  972. */
  973. for (db = dblist; db != NULL; db = db->link) {
  974. if (db == dblk || db->fname_stp == NULL)
  975. continue;
  976. if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
  977. dblk->stoplist = db->stoplist;
  978. dblk->lang_flags |= LF_DUP_STP;
  979. if (debugging_loadlang)
  980. fprintf (aa_stderr, PROGNAME"1460 "
  981. "Using previously loaded stoplist, db='%s'.\n",
  982. dblk->name);
  983. return TRUE;
  984. }
  985. }
  986. /* Stop lists are mandatory--a missing stoplist is fatal. */
  987. i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
  988. if (i == 1) {
  989. sprintf (sprintbuf,
  990. CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
  991. PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
  992. DtSearchAddMessage (sprintbuf);
  993. }
  994. return (i == 0);
  995. } /* load_stop_list() */
  996. /************************************************/
  997. /* */
  998. /* free_paice_rules */
  999. /* */
  1000. /************************************************/
  1001. /* Frees all allocated storage for a set of paice rules, typically
  1002. * loaded at dblk.stem_extra. Called by REINIT routines and
  1003. * by load_paice_suffixes() when cleaning up after an error.
  1004. */
  1005. static void free_paice_rules (PRULE ***rules_table_ptr)
  1006. {
  1007. int i;
  1008. PRULE *p, **linkp;
  1009. PRULE **rules_table;
  1010. if (*rules_table_ptr == NULL)
  1011. return;
  1012. rules_table = *rules_table_ptr;
  1013. for (i=0; i<256; i++) {
  1014. if (rules_table[i] == NULL)
  1015. continue;
  1016. p = rules_table[i];
  1017. while (p) {
  1018. linkp = &p->link;
  1019. free (p->suffix);
  1020. if (p->apndstr)
  1021. free (p->apndstr);
  1022. free (p);
  1023. p = *linkp;
  1024. }
  1025. }
  1026. free (rules_table);
  1027. *rules_table_ptr = NULL;
  1028. return;
  1029. } /* free_paice_rules() */
  1030. /************************************************/
  1031. /* */
  1032. /* load_paice_suffixes */
  1033. /* */
  1034. /************************************************/
  1035. /* Loads European language paice stemmer suffix rules
  1036. * into dblk.stem_extra as an array of ptrs to linked lists.
  1037. * Like stop lists, sfx files can be
  1038. * (1) passed in dblk.fname_sfx,
  1039. * (2) generated from dblk path, dbname, and '.sfx',
  1040. * (3) generated from dblk path, language, and '.sfx'.
  1041. * Internal tables will be reused if file previously loaded.
  1042. * Only uses single byte character sets (ascii, iso-latin-1).
  1043. * Uses strtok(). dblk->charmap must already be loaded.
  1044. * Will continue to parse entire file even if errors are found.
  1045. * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
  1046. */
  1047. static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
  1048. {
  1049. FILE *fp;
  1050. DBLK *db;
  1051. PRULE *prule, **prule_link;
  1052. PRULE **rules_table;
  1053. struct stat statbuf;
  1054. UCHAR *cptr, *token;
  1055. char readbuf [_POSIX_PATH_MAX + 1024];
  1056. char msgbuf [_POSIX_PATH_MAX + 1024];
  1057. UCHAR *suffix, *apndstr;
  1058. int must_be_intact, is_last_rule;
  1059. UCHAR remove_count;
  1060. int lineno, errcount;
  1061. int len;
  1062. wchar_t wc;
  1063. _Xstrtokparams strtok_buf;
  1064. dblk->stem_extra = NULL; /* just to be sure */
  1065. rules_table = NULL;
  1066. if (debugging_loadlang)
  1067. fprintf (aa_stderr,
  1068. PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
  1069. NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
  1070. language_name(dblk->dbrec.or_language));
  1071. /* If file name not provided, generate one based on
  1072. * dblk's path, database name, and default extension.
  1073. * And if that doesn't work, generate one based on
  1074. * dblk's path, language, and default extension.
  1075. */
  1076. if (dblk->fname_sfx == NULL) {
  1077. if (dblk->path == NULL)
  1078. dblk->path = strdup("");
  1079. dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
  1080. PROGNAME"1113", NULL);
  1081. strcpy (dblk->fname_sfx, dblk->path);
  1082. ensure_end_slash (dblk->fname_sfx);
  1083. strcat (dblk->fname_sfx, dblk->name);
  1084. strcat (dblk->fname_sfx, EXT_SUFFIX);
  1085. errno = 0;
  1086. stat (dblk->fname_sfx, &statbuf);
  1087. if (errno == ENOENT) {
  1088. strcpy (dblk->fname_sfx, dblk->path);
  1089. ensure_end_slash (dblk->fname_sfx);
  1090. strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
  1091. strcat (dblk->fname_sfx, EXT_SUFFIX);
  1092. }
  1093. }
  1094. if (debugging_loadlang)
  1095. fprintf (aa_stderr,
  1096. PROGNAME"1740 Paice suffix file name = '%s'.\n",
  1097. dblk->fname_sfx);
  1098. /* Don't reload the same file if it's already
  1099. * been loaded into a previous dblk in a list,
  1100. * but flag it so it won't be freed at unload_language/REINIT.
  1101. * Code works just fine if dblist == NULL.
  1102. */
  1103. for (db = dblist; db != NULL; db = db->link) {
  1104. if (db == dblk || db->fname_sfx == NULL)
  1105. continue;
  1106. if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
  1107. dblk->stem_extra = db->stem_extra;
  1108. dblk->lang_flags |= LF_DUP_SFX;
  1109. if (debugging_loadlang)
  1110. fprintf (aa_stderr, PROGNAME"1145 "
  1111. "Using previously loaded suffixes, db='%s'.\n",
  1112. dblk->name);
  1113. return TRUE;
  1114. }
  1115. }
  1116. fp = fopen (dblk->fname_sfx, "rt");
  1117. if (fp == NULL) {
  1118. sprintf (msgbuf,
  1119. CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
  1120. PROGNAME"181", dblk->fname_sfx, strerror(errno));
  1121. DtSearchAddMessage (msgbuf);
  1122. dblk->fname_sfx = NULL;
  1123. return FALSE;
  1124. }
  1125. /* Rules table will eventually be loaded at dblk.stem_extra.
  1126. * It consists of 256 PRULE ptrs,
  1127. * one for each possible single byte char.
  1128. * Each ptr is the head of a rules list for that char.
  1129. */
  1130. rules_table = austext_malloc (256 * sizeof(PRULE*),
  1131. PROGNAME"199", &ausapi_msglist);
  1132. memset (rules_table, 0, 256 * sizeof(PRULE*));
  1133. lineno = 0;
  1134. errcount = 0;
  1135. /*------- Main Read Loop -------*/
  1136. while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
  1137. lineno++;
  1138. /* Ignore comment lines */
  1139. if (strchr (COMMENT_CHARS, readbuf[0]))
  1140. continue;
  1141. /* TOKEN #1: suffix string, backwards, all uppercase.
  1142. * If missing, ignore 'empty' line.
  1143. * If the first token is all numeric, ignore line
  1144. * (for compatibility with older versions of file).
  1145. */
  1146. if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
  1147. continue;
  1148. for (cptr = suffix; cptr; cptr++) {
  1149. euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
  1150. if ((dblk->charmap[wc] & NUMERAL) == 0)
  1151. break;
  1152. }
  1153. if (*cptr == '\0')
  1154. continue;
  1155. /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
  1156. if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
  1157. BAD_RULE:
  1158. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 51,
  1159. "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
  1160. PROGNAME"898", dblk->fname_sfx, lineno, suffix);
  1161. DtSearchAddMessage (msgbuf);
  1162. errcount++;
  1163. continue;
  1164. }
  1165. must_be_intact = FALSE;
  1166. if (token[0] == '*') {
  1167. must_be_intact = TRUE;
  1168. /* Read next token... */
  1169. if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
  1170. goto BAD_RULE;
  1171. }
  1172. /* TOKEN #3: remove-count */
  1173. remove_count = (UCHAR) atoi ((char *) token);
  1174. /* OPTIONAL TOKEN #4: if next token is NOT a continue
  1175. * symbol ('>' or '$'), then it's an append string.
  1176. */
  1177. apndstr = NULL;
  1178. if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
  1179. goto BAD_RULE;
  1180. if (token[0] != '$' && token[0] != '>') {
  1181. apndstr = token;
  1182. /* Read next token... */
  1183. if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
  1184. goto BAD_RULE;
  1185. }
  1186. /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
  1187. is_last_rule = (token[0] == '$');
  1188. if (debugging_loadword) {
  1189. fprintf (aa_stderr,
  1190. " SFX: intact?=%d stop?=%d remv=%d '%s'",
  1191. (int) must_be_intact,
  1192. (int) is_last_rule,
  1193. (int) remove_count,
  1194. suffix);
  1195. if (apndstr)
  1196. fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
  1197. else
  1198. fputc ('\n', aa_stderr);
  1199. }
  1200. /* Good suffix. If we haven't had any errors yet,
  1201. * add it to rules list for the first char of the suffix.
  1202. */
  1203. if (errcount)
  1204. continue;
  1205. prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
  1206. memset (prule, 0, sizeof(PRULE));
  1207. prule->suffix = (UCHAR *) strdup ((char*)suffix);
  1208. prule->suflen = strlen ((char*)suffix);
  1209. prule->must_be_intact = must_be_intact;
  1210. prule->remove_count = remove_count;
  1211. prule->is_last_rule = is_last_rule;
  1212. if (apndstr) {
  1213. len = mbstowcs (NULL, (char *)apndstr, 0);
  1214. if (len != -1) {
  1215. prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
  1216. prule->aplen = len;
  1217. }
  1218. }
  1219. prule_link = &rules_table[suffix[0]];
  1220. while (*prule_link)
  1221. prule_link = &(*prule_link)->link;
  1222. *prule_link = prule;
  1223. } /* end Main Read Loop */
  1224. fclose (fp);
  1225. if (errcount) {
  1226. free_paice_rules (&rules_table);
  1227. return FALSE;
  1228. }
  1229. dblk->stem_extra = rules_table;
  1230. /* Update last table entry */
  1231. if (debugging_loadlang) {
  1232. fprintf (aa_stderr,
  1233. PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
  1234. dblk->fname_sfx);
  1235. fflush (aa_stderr);
  1236. }
  1237. return TRUE;
  1238. } /* load_paice_suffixes() */
  1239. /************************************************/
  1240. /* */
  1241. /* is_matching_rule */
  1242. /* */
  1243. /************************************************/
  1244. /* Subroutine of paice_stemmer().
  1245. * Returns TRUE if passed rule can be applied to stem in paicebuf.
  1246. * Else returns FALSE.
  1247. */
  1248. static int is_matching_rule (PRULE *rule)
  1249. {
  1250. static char *ptr;
  1251. static wchar_t wc;
  1252. static int i, j;
  1253. if (debugging_paice)
  1254. fprintf (aa_stderr, " test rule '%s':\t", rule->suffix);
  1255. /* Skip rule if we've made at least one previous change
  1256. * but the current rule requires an intact word.
  1257. */
  1258. if (rule->must_be_intact && !word_is_intact) {
  1259. if (debugging_paice)
  1260. fputs ("word not intact...\n", aa_stderr);
  1261. return FALSE;
  1262. }
  1263. /* Do a backward strcmp on the suffix.
  1264. * Skip rule if it doesn't match current paicebuf's ending chars.
  1265. */
  1266. j = rule->suflen;
  1267. ptr = paicebuf + paicelen - 1;
  1268. for (i = 0; i < j; i++) {
  1269. if (*((rule->suffix) + i) != *ptr) {
  1270. if (debugging_paice)
  1271. fputs ("no match...\n", aa_stderr);
  1272. return FALSE;
  1273. }
  1274. ptr--;
  1275. }
  1276. if (debugging_paice)
  1277. fputs ("match", aa_stderr);
  1278. /* Set i = paicebuf length after removing and appending suffixes.
  1279. * Used to algorithmically test remaining stem length
  1280. * after tentative application of rule.
  1281. */
  1282. i = paicewcsl - (rule->remove_count - rule->aplen);
  1283. if (i <= 1) {
  1284. if (debugging_paice)
  1285. fputs (", but stem too short...\n", aa_stderr);
  1286. return FALSE;
  1287. }
  1288. if (i == 2) {
  1289. euro_mbtowc (&wc, paicebuf, paicebuf);
  1290. if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf);
  1291. if (IS_VOWEL (wc)) {
  1292. if (debugging_paice)
  1293. fputs (", and short vowel stem valid.\n", aa_stderr);
  1294. return TRUE;
  1295. }
  1296. else {
  1297. if (debugging_paice)
  1298. fputs (", but consonant stem too short...\n", aa_stderr);
  1299. return FALSE;
  1300. }
  1301. }
  1302. /* Remaining stem is at least 3 chars.
  1303. * If it contains a vowel anywhere, it's valid.
  1304. * (A 'Y' after the first char counts as a vowel).
  1305. * Otherwise it's not.
  1306. */
  1307. for (j=0; j<i; j++) {
  1308. euro_mbtowc (&wc, &paicebuf[j], paicebuf);
  1309. if (IS_VOWEL (wc)) {
  1310. GOOD_STEM:
  1311. if (debugging_paice)
  1312. fputs (", and remaining stem valid.\n", aa_stderr);
  1313. return TRUE;
  1314. }
  1315. if (j > 0 && wc == L'Y')
  1316. goto GOOD_STEM;
  1317. }
  1318. if (debugging_paice)
  1319. fputs (", but remaining stem all consonants.\n", aa_stderr);
  1320. return FALSE;
  1321. } /* is_matching_rule() */
  1322. /************************************************/
  1323. /* */
  1324. /* paice_stemmer */
  1325. /* */
  1326. /************************************************/
  1327. /* Given a word token (ALREADY UPPERCASE) in a single byte
  1328. * language such as the output of teskey_parser,
  1329. * generates 'stem' by repeated suffix removal.
  1330. * Returns stem token in a static buffer valid
  1331. * until next call to paice_stemmer or null_stemmer.
  1332. * Returned stem might be the original unmodified word.
  1333. * Returned stem might also be empty string.
  1334. * Returned stem is *never* NULL, even if wordin == NULL.
  1335. * Input buffer will not be modified; does not use strtok.
  1336. * All variables are static for speeeeeeed.
  1337. */
  1338. static char *paice_stemmer (char *wordin, DBLK *dblk)
  1339. {
  1340. wchar_t finalwc;
  1341. int len;
  1342. PRULE *rule, **rules_table;
  1343. if (wordin == NULL)
  1344. return "";
  1345. if (*wordin == 0)
  1346. return "";
  1347. if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
  1348. fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 31,
  1349. "%s Stemmer suffixes file never loaded.\n"),
  1350. PROGNAME"310");
  1351. DtSearchExit (2);
  1352. }
  1353. /* The max length of a stem is bufsz - 2:
  1354. * one for the terminating \0 and one for the
  1355. * prefix ^O that identifies a stem. (But this
  1356. * stemmer doesn't actually insert the ^O now.)
  1357. */
  1358. strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);
  1359. if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
  1360. mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
  1361. paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
  1362. else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;
  1363. paice_charmap = dblk->charmap;
  1364. word_is_intact = TRUE;
  1365. for (;;) { /*-------- Main Stemming Loop ---------*/
  1366. paicelen = strlen (paicebuf);
  1367. paicewcsl = mbstowcs (NULL, paicebuf, 0);
  1368. len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);
  1369. if (debugging_paice) {
  1370. fwprintf (aa_stderr,
  1371. L"paice: '%s', rules list '%lc' for database '%s'\n",
  1372. paicebuf, finalwc, dblk->name);
  1373. fflush (aa_stderr);
  1374. }
  1375. /* Look for a matching rule */
  1376. if ((rule = rules_table [finalwc]) == NULL) {
  1377. if (debugging_paice)
  1378. fputs (" list is null, stop.\n", aa_stderr);
  1379. break;
  1380. }
  1381. while (rule) {
  1382. if (is_matching_rule (rule))
  1383. break;
  1384. rule = rule->link;
  1385. }
  1386. if (rule == NULL) {
  1387. if (debugging_paice)
  1388. fwprintf (aa_stderr,
  1389. L" rules list '%lc' is exhausted, stop.\n", finalwc);
  1390. break;
  1391. }
  1392. /* Apply rule that matched */
  1393. if (debugging_paice)
  1394. fputs (" apply rule: ", aa_stderr);
  1395. if (rule->remove_count == 0) {
  1396. if (debugging_paice)
  1397. fputs ("remove_count = 0, stop.\n", aa_stderr);
  1398. break;
  1399. }
  1400. paicebuf [paicelen - len * rule->remove_count] = 0;
  1401. if (rule->aplen)
  1402. strcat (paicebuf, (char*)rule->apndstr);
  1403. paicelen = strlen (paicebuf);
  1404. paicewcsl = mbstowcs (NULL, paicebuf, 0);
  1405. word_is_intact = FALSE; /* we've removed at least 1 suffix */
  1406. if (debugging_paice)
  1407. fprintf (aa_stderr, "--> '%s'", paicebuf);
  1408. /* Terminate algorithm if rule says so.
  1409. * Otherwise continue removing suffixes
  1410. * from this partially stemmed word.
  1411. */
  1412. if (rule->is_last_rule) {
  1413. if (debugging_paice)
  1414. fputs (", stop flag is set, stop.\n", aa_stderr);
  1415. break;
  1416. }
  1417. if (debugging_paice)
  1418. fputc ('\n', aa_stderr);
  1419. } /* end Main Stemming Loop */
  1420. if (debugging_paice) {
  1421. fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
  1422. fflush (aa_stderr);
  1423. }
  1424. return paicebuf;
  1425. } /* paice_stemmer() */
  1426. /************************************************/
  1427. /* */
  1428. /* null_stemmer */
  1429. /* */
  1430. /************************************************/
  1431. /* Stemmer that just copies and returns passed word.
  1432. * In effect, the passed word IS its own stem.
  1433. * Output buffer valid until next call to null_stemmer
  1434. * or paice_stemmer.
  1435. */
  1436. char *null_stemmer (char *word, DBLK *dblk)
  1437. {
  1438. if (word == NULL)
  1439. return "";
  1440. if (*word == '\0')
  1441. return "";
  1442. strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
  1443. paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
  1444. return paicebuf;
  1445. } /* null_stemmer() */
  1446. /************************************************/
  1447. /* */
  1448. /* euro_lstrupr */
  1449. /* */
  1450. /************************************************/
  1451. /* Converts passed string to uppercase in place.
  1452. * Classic strupr() function using teskey charmaps.
  1453. */
  1454. static char *euro_lstrupr (char *string, DBLK *dblk)
  1455. {
  1456. static int *charmap, len;
  1457. static char *s;
  1458. static wchar_t wc;
  1459. charmap = dblk->charmap;
  1460. for (s = string; *s; s++) {
  1461. len = euro_mbtowc (&wc, s, string);
  1462. *s = charmap[wc] & 0xFF;
  1463. if (len > 1) wctomb (s - 1, *s);
  1464. }
  1465. return string;
  1466. }
  1467. /************************************************/
  1468. /* */
  1469. /* null_lstrupr */
  1470. /* */
  1471. /************************************************/
  1472. /* Just returns passed string. Used where uppercase
  1473. * conversions are not required for a language.
  1474. */
  1475. char *null_lstrupr (char *s, DBLK *d)
  1476. { return s; }
  1477. /************************************************/
  1478. /* */
  1479. /* load_language */
  1480. /* */
  1481. /************************************************/
  1482. /* Loads a dblk with a specific language's
  1483. * structures and function pointers.
  1484. * Does not reload structures previously loaded in
  1485. * other dblks on dblist if derived from identical files.
  1486. * But always loads structures if passed dblist is NULL.
  1487. * Presumes dblk already partially initialized with mandatory fields:
  1488. * name, path, language.
  1489. * May also be preinitialized with optional fields:
  1490. * minwordsz, maxwordsz.
  1491. * Returns TRUE if all successful.
  1492. * Otherwise returns FALSE with err msgs on ausapi_msglist.
  1493. */
  1494. int load_language (DBLK *dblk, DBLK *dblist)
  1495. {
  1496. int oops = FALSE;
  1497. int language = dblk->dbrec.or_language;
  1498. if (debugging_loadlang)
  1499. fprintf (aa_stderr,
  1500. "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
  1501. (int)dblk->dbrec.or_language,
  1502. language_name (dblk->dbrec.or_language),
  1503. NULLORSTR(dblk->name));
  1504. /*
  1505. * Note: Load list functions must be called
  1506. * AFTER charmap and lstrupr are loaded.
  1507. */
  1508. switch (language) {
  1509. case DtSrLaENG:
  1510. case DtSrLaENG2:
  1511. case DtSrLaESP:
  1512. case DtSrLaFRA:
  1513. case DtSrLaITA:
  1514. case DtSrLaDEU:
  1515. dblk->charmap = (language == DtSrLaENG)?
  1516. ascii_charmap : latin_charmap;
  1517. dblk->parser = teskey_parser;
  1518. dblk->stemmer = paice_stemmer;
  1519. dblk->lstrupr = euro_lstrupr;
  1520. if (dblk->dbrec.or_maxwordsz == 0)
  1521. dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
  1522. MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
  1523. if (dblk->dbrec.or_minwordsz == 0)
  1524. dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
  1525. oops = FALSE;
  1526. if (!load_stop_list (dblk, dblist))
  1527. oops = TRUE;
  1528. if (!load_include_list (dblk, dblist))
  1529. oops = TRUE;
  1530. if (!load_paice_suffixes (dblk, dblist))
  1531. oops = TRUE;
  1532. if (oops)
  1533. return FALSE;
  1534. break;
  1535. case DtSrLaJPN:
  1536. case DtSrLaJPN2:
  1537. return load_jpn_language (dblk, dblist);
  1538. default:
  1539. /* Try loading a custom 'user' language.
  1540. * If he failed to provide a loader function,
  1541. * the dummy custom loader will tell him so.
  1542. * If he provided one but it can't load this language,
  1543. * it should return it's own error msgs.
  1544. */
  1545. return load_custom_language (dblk, dblist);
  1546. } /* end switch (language) */
  1547. return TRUE;
  1548. } /* load_language() */
  1549. /************************************************/
  1550. /* */
  1551. /* unload_language */
  1552. /* */
  1553. /************************************************/
  1554. /* Frees storage for structures allocated by load_language().
  1555. * Called when engine REINITs due to change in site config file
  1556. * or databases.
  1557. * Duplicate wordtrees are not unloaded because they
  1558. * will have already been unloaded in a previous dblk.
  1559. */
  1560. void unload_language (DBLK *dblk)
  1561. {
  1562. switch (dblk->dbrec.or_language) {
  1563. case DtSrLaENG:
  1564. case DtSrLaENG2:
  1565. case DtSrLaESP:
  1566. case DtSrLaFRA:
  1567. case DtSrLaITA:
  1568. case DtSrLaDEU:
  1569. dblk->charmap = NULL;
  1570. if ((dblk->lang_flags & LF_DUP_STP) == 0)
  1571. free_wordtree (&dblk->stoplist);
  1572. else {
  1573. dblk->stoplist = NULL;
  1574. dblk->lang_flags &= ~LF_DUP_STP;
  1575. }
  1576. if ((dblk->lang_flags & LF_DUP_INC) == 0)
  1577. free_wordtree (&dblk->inclist);
  1578. else {
  1579. dblk->inclist = NULL;
  1580. dblk->lang_flags &= ~LF_DUP_INC;
  1581. }
  1582. if ((dblk->lang_flags & LF_DUP_SFX) == 0)
  1583. free_paice_rules ((PRULE***)&dblk->stem_extra);
  1584. else {
  1585. dblk->stem_extra = NULL;
  1586. dblk->lang_flags &= ~LF_DUP_SFX;
  1587. }
  1588. break;
  1589. case DtSrLaJPN:
  1590. case DtSrLaJPN2:
  1591. unload_jpn_language (dblk);
  1592. break;
  1593. default:
  1594. unload_custom_language (dblk);
  1595. break;
  1596. }
  1597. return;
  1598. } /* unload_language() */
  1599. /******************** LANG.C ********************/