boolpars.c 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /* $XConsortium: boolpars.c /main/5 1996/11/25 18:49:27 drk $
  24. *
  25. * (c) Copyright 1996 Digital Equipment Corporation.
  26. * (c) Copyright 1996 Hewlett-Packard Company.
  27. * (c) Copyright 1996 International Business Machines Corp.
  28. * (c) Copyright 1996 Sun Microsystems, Inc.
  29. * (c) Copyright 1996 Novell, Inc.
  30. * (c) Copyright 1996 FUJITSU LIMITED.
  31. * (c) Copyright 1996 Hitachi.
  32. */
  33. /*
  34. * COMPONENT_NAME: austext
  35. *
  36. * FUNCTIONS: add_syntax_errmsg
  37. * boolean_parse
  38. * boolyac_AND
  39. * boolyac_COLLOC
  40. * boolyac_NOT
  41. * boolyac_OR
  42. * copy_final_truthtab
  43. * copy_token
  44. * creatett
  45. * freett
  46. * get_stem_truthtab
  47. * main
  48. * process_user_args
  49. * yyerror
  50. * yylex
  51. *
  52. * ORIGINS: 27
  53. *
  54. *
  55. * (C) COPYRIGHT International Business Machines Corp. 1996
  56. * All Rights Reserved
  57. * Licensed Materials - Property of IBM
  58. * US Government Users Restricted Rights - Use, duplication or
  59. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  60. */
  61. /********************* BOOLPARS.C ********************
  62. * $Id: boolpars.c /main/5 1996/11/25 18:49:27 drk $
  63. * February 1996.
  64. * AusText/DtSearch yacc-based boolean query parser.
  65. * Converts boolean query into stems array and truth table
  66. * for subsequent search. Boolyac.y is the yacc source.
  67. * After processing by yacc, it becomes boolyac.c and boolyac.h.
  68. * This module contains all the related C source code: yylex,
  69. * yacc action functions, and the main AusText driver function, boolean_parse.
  70. * Additional information (format of TRUTHTAB) in header file boolpars.h.
  71. *
  72. * $Log$
  73. * Revision 1.4 1996/03/22 23:12:50 miker
  74. * Added string.h header and correctly cast strcspn() calls.
  75. *
  76. * Revision 1.3 1996/03/20 19:14:30 miker
  77. * Enable collocation expressions in stem (type 'S') searches.
  78. *
  79. * Revision 1.2 1996/03/13 22:35:59 miker
  80. * Changed char to UCHAR several places; similar typecasts.
  81. *
  82. * Revision 1.1 1996/03/05 15:52:06 miker
  83. * Initial revision
  84. */
  85. #include "SearchE.h"
  86. #include <stdlib.h>
  87. #include <string.h>
  88. #include "boolpars.h"
  89. #include "boolyac.h"
  90. #if (DtSrMAX_STEMCOUNT != 8)
  91. #error DtSrMAX_STEMCOUNT is not defined to be 8.
  92. #endif
  93. #define PROGNAME "BOOLPARS" /* string-pasted with a numeric code to tag messages and austext_malloc calls */
  94. #define WORD_ENDERS " \t\n\f()|@~&" /* strcspn() reject set: characters that end a word or '@n' token */
  95. #define MAX_YYERRORS 4 /* NOTE(review): not referenced in the visible part of this file */
  96. #define MS_boolpars 28 /* set number passed to CATGETS for every message in this module */
  97. /****************************************/
  98. /* */
  99. /* GLOBALS */
  100. /* */
  101. /****************************************/
  102. int qry_has_no_NOTs = FALSE; /* set by boolean_parse(): TRUE if the query contains no '~' */
  103. int qry_is_all_ANDs = FALSE; /* set by boolean_parse(): TRUE if the query contains none of '|', '~', '@' */
  104. TRUTHTAB final_truthtab = { 0 }; /* result table, filled in by copy_final_truthtab() */
  105. int parser_invalid_wordcount = 0; /* number of words the language parser rejected in yylex() */
  106. static int debugging_boolpars = FALSE; /* nonzero enables trace output to aa_stderr; set from usrblk.debug & USRDBG_BOOL */
  107. static unsigned char
  108. *final_permutes = NULL; /* permutes storage for final_truthtab; allocated once, on first copy_final_truthtab() call */
  109. static int last_token_was_boolop = TRUE; /* yylex() state: when FALSE, a following word, '(' or '~' first yields an implied '&' */
  110. static char *msgbuf = NULL; /* scratch buffer for formatting messages; allocated in boolean_parse() */
  111. static UCHAR *next_lex_char = NULL; /* yylex() read position within usrblk.query */
  112. static int paren_count = 0; /* '(' tokens delivered by yylex() that have not been closed yet */
  113. static TRUTHTAB *ttlist = NULL; /* head of the linked list of truth tables made by creatett() and not yet freed */
  114. static int yyerror_count = 0; /* reset to 0 by boolean_parse() */
  115. static size_t yyleng; /* same as in lex API */
  116. static char *yytext; /* same as in lex API */
  117. /****************************************/
  118. /* */
  119. /* add_syntax_errmsg */
  120. /* */
  121. /****************************************/
  122. /* Action function called for yacc rules used to trap syntax errors.
  123. * Adds error message identified by msgno to user's msglist.
  124. */
  125. void add_syntax_errmsg (int msgno)
  126. {
  127. switch (msgno) {
  128. case 1:
  129. /* Message #2 is called in two places */
  130. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 2,
  131. "%s Query field is empty."),
  132. PROGNAME"086");
  133. DtSearchAddMessage (msgbuf);
  134. break;
  135. case 2:
  136. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 5,
  137. "%s Boolean operators must be positioned\n"
  138. "between words or expressions. Two sequential words\n"
  139. "without an operator are interpreted as being separated\n"
  140. "by the AND operator (&)."),
  141. PROGNAME"091");
  142. DtSearchAddMessage (msgbuf);
  143. break;
  144. case 3:
  145. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 6,
  146. "%s Expression in parentheses is missing."),
  147. PROGNAME"093");
  148. DtSearchAddMessage (msgbuf);
  149. break;
  150. case 4:
  151. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 7,
  152. "%s NOT operator (~) must be positioned to\n"
  153. "the left of the word or expression it qualifies."),
  154. PROGNAME"098");
  155. DtSearchAddMessage (msgbuf);
  156. break;
  157. case 5:
  158. /* Message #3 is called in two places */
  159. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 3,
  160. "%s COLLOCATION operator (@) may\n"
  161. "only be positioned between two words."),
  162. PROGNAME"111");
  163. DtSearchAddMessage (msgbuf);
  164. break;
  165. case 6:
  166. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 4,
  167. "%s One or more words in your\n"
  168. "query are not stored in database '%s'.") ,
  169. PROGNAME"089", usrblk.dblk->label);
  170. DtSearchAddMessage (msgbuf);
  171. break;
  172. default:
  173. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 8,
  174. "%s Invalid boolean query. Syntax Error #%d.") ,
  175. PROGNAME"100", msgno);
  176. DtSearchAddMessage (msgbuf);
  177. break;
  178. }
  179. return;
  180. } /* add_syntax_errmsg() */
  181. /****************************************/
  182. /* */
  183. /* creatett */
  184. /* */
  185. /****************************************/
  186. /* Constructor for new truth table.
  187. * Allocates it, inits it, and links it into ttlist.
  188. */
  189. static TRUTHTAB *creatett (int stemno, int pmsz, unsigned char *permutes)
  190. {
  191. TRUTHTAB *newtt = austext_malloc (sizeof(TRUTHTAB) + pmsz + 4,
  192. PROGNAME"140", NULL);
  193. memset (newtt, 0, sizeof(TRUTHTAB));
  194. newtt->stemno = stemno;
  195. newtt->pmsz = pmsz;
  196. newtt->permutes = (unsigned char *) (newtt + 1);
  197. memcpy (newtt->permutes, permutes, pmsz);
  198. newtt->next = ttlist;
  199. ttlist = newtt;
  200. return newtt;
  201. } /* creatett() */
  202. /****************************************/
  203. /* */
  204. /* freett */
  205. /* */
  206. /****************************************/
  207. /* Destructor of passed truth table.
  208. * Unlinks it from ttlist and frees it.
  209. */
  210. static void freett (TRUTHTAB *argtt)
  211. {
  212. TRUTHTAB *tt;
  213. TRUTHTAB **lastlink = &ttlist;
  214. for (tt = ttlist; tt; tt = tt->next) {
  215. if (tt == argtt) {
  216. *lastlink = tt->next;
  217. free (tt);
  218. break;
  219. }
  220. lastlink = &tt->next;
  221. }
  222. return;
  223. } /* freett() */
  224. /****************************************/
  225. /* */
  226. /* copy_final_truthtab */
  227. /* */
  228. /****************************************/
  229. /* Copys passed truth table into global final_truthtab.
  230. * Returns final_truthtab.
  231. */
  232. TRUTHTAB *copy_final_truthtab (TRUTHTAB *tt)
  233. {
  234. memset (&final_truthtab, 0, sizeof(TRUTHTAB));
  235. if (!final_permutes)
  236. final_permutes = austext_malloc (300, PROGNAME"788", NULL);
  237. final_truthtab.pmsz = tt->pmsz;
  238. final_truthtab.permutes = final_permutes;
  239. memcpy (final_permutes, tt->permutes, final_truthtab.pmsz);
  240. return &final_truthtab;
  241. } /* copy_final_truthtab() */
  242. /****************************************/
  243. /* */
  244. /* get_stem_truthtab */
  245. /* */
  246. /****************************************/
  247. /* Subroutine of yylex. Also used in yacc action functions.
  248. * Creates and returns truth table for passed stem.
  249. * If stem is new, adds it to saveusr.stems array, and adds
  250. * the original query word string to usrblk.stems for msgs.
  251. * Returns NULL and posts err msg if array is full
  252. * or has other error.
  253. */
  254. static TRUTHTAB *get_stem_truthtab (char *newstem, char *origword)
  255. {
  256. int i, stemno;
  257. unsigned char bitmask;
  258. unsigned char *pmp;
  259. unsigned char new_permutes [128];
  260. TRUTHTAB *newtt;
  261. /* Check if stem is already in array */
  262. for (stemno = 0; stemno < saveusr.stemcount; stemno++)
  263. if (strcmp (newstem, saveusr.stems[stemno]) == 0)
  264. break;
  265. /* Add new stem to array */
  266. if (stemno == saveusr.stemcount) {
  267. if (++saveusr.stemcount > DtSrMAX_STEMCOUNT) {
  268. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 9,
  269. "%s Too many terms in boolean query."),
  270. PROGNAME"1513");
  271. DtSearchAddMessage (msgbuf);
  272. saveusr.stemcount--;
  273. return NULL;
  274. }
  275. strncpy (saveusr.stems[stemno], newstem, DtSrMAXWIDTH_HWORD);
  276. saveusr.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
  277. if (origword) {
  278. strncpy (usrblk.stems[stemno], origword, DtSrMAXWIDTH_HWORD);
  279. usrblk.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
  280. }
  281. }
  282. /* Stemno now indicates correct term in saveusr.stems.
  283. * Truth table for a single term has 128 8-bit permutes,
  284. * the 1/2 of all 256 possible permutations that have
  285. * that term's bit switched on.
  286. */
  287. bitmask = 1 << stemno; /* mask with only newstem's bit on */
  288. pmp = new_permutes;
  289. for (i=0; i<256; i++)
  290. if ((i & bitmask) != 0) {
  291. *pmp = i;
  292. pmp++;
  293. }
  294. newtt = creatett (stemno, 128, new_permutes);
  295. if (debugging_boolpars) {
  296. fprintf (aa_stderr, " WORD: stem[%d]='%c%s' expr=%p pmsz=%d\n",
  297. stemno,
  298. (saveusr.stems[stemno][0] == STEM_CH) ?
  299. '~' : saveusr.stems[stemno][0],
  300. &saveusr.stems[stemno][1],
  301. (void *) newtt, newtt->pmsz);
  302. fflush (aa_stderr);
  303. }
  304. return newtt;
  305. } /* get_stem_truthtab() */
  306. /****************************************/
  307. /* */
  308. /* boolyac_AND */
  309. /* */
  310. /****************************************/
  311. /* Action function for AND expression rule.
  312. * Returns set INTERSECTION of passed truth tables,
  313. * ie only the permutes they have in common.
  314. * Any truth table, input or output, can be the empty or
  315. * the universal set. For example: "(A & B) & ~A" is empty.
  316. */
  317. TRUTHTAB *boolyac_AND (TRUTHTAB *tt1, TRUTHTAB *tt2) {
  318. TRUTHTAB *newtt;
  319. unsigned char new_permutes [256];
  320. int pm1, pm2, newpm;
  321. pm1 = pm2 = newpm = 0;
  322. while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) {
  323. if (tt1->permutes[pm1] < tt2->permutes[pm2])
  324. pm1++;
  325. else if (tt1->permutes[pm1] > tt2->permutes[pm2])
  326. pm2++;
  327. else {
  328. new_permutes [newpm++] = tt1->permutes [pm1];
  329. pm1++;
  330. pm2++;
  331. }
  332. }
  333. /* Free old truthtabs, create new one. */
  334. freett (tt1);
  335. freett (tt2);
  336. newtt = creatett (-1, newpm, new_permutes);
  337. if (debugging_boolpars) {
  338. fprintf (aa_stderr, " AND: exprs=%p,%p-->expr=%p pmsz=%d\n",
  339. (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz);
  340. fflush (aa_stderr);
  341. }
  342. return newtt;
  343. } /* boolyac_AND() */
  344. /****************************************/
  345. /* */
  346. /* boolyac_OR */
  347. /* */
  348. /****************************************/
  349. /* Action function for OR expression rule.
  350. * Returns set UNION of passed truth tables.
  351. * Any truth table, input or output, can be the empty or
  352. * the universal set. For example: "A | ~A" is universal.
  353. */
  354. TRUTHTAB *boolyac_OR (TRUTHTAB *tt1, TRUTHTAB *tt2) {
  355. TRUTHTAB *newtt;
  356. unsigned char new_permutes [256];
  357. unsigned char *permutes1 = tt1->permutes;
  358. unsigned char *permutes2 = tt2->permutes;
  359. int pm1, pm2, newpm;
  360. pm1 = pm2 = newpm = 0;
  361. /* While neither permutes array is exhausted... */
  362. while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) {
  363. if (permutes1[pm1] < permutes2[pm2])
  364. new_permutes [newpm++] = permutes1[pm1++];
  365. else if (permutes2[pm2] < permutes1[pm1])
  366. new_permutes [newpm++] = permutes2[pm2++];
  367. else {
  368. new_permutes [newpm++] = permutes1[pm1++];
  369. pm2++;
  370. }
  371. }
  372. /* After one or both permutes arrays are exhausted... */
  373. while (pm1 < tt1->pmsz)
  374. new_permutes [newpm++] = permutes1[pm1++];
  375. while (pm2 < tt2->pmsz)
  376. new_permutes [newpm++] = permutes2[pm2++];
  377. /* Free old truthtabs, create new one. */
  378. freett (tt1);
  379. freett (tt2);
  380. newtt = creatett (-1, newpm, new_permutes);
  381. if (debugging_boolpars) {
  382. fprintf (aa_stderr, " OR: exprs=%p,%p-->expr=%p pmsz=%d\n",
  383. (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz);
  384. fflush (aa_stderr);
  385. }
  386. return newtt;
  387. } /* boolyac_OR() */
  388. /****************************************/
  389. /* */
  390. /* boolyac_NOT */
  391. /* */
  392. /****************************************/
  393. /* Action function for NOT expression rule.
  394. * Returns set COMPLEMENT of passed truth table,
  395. * ie the universal set minus the passed set,
  396. * ie all possible permutes except those passed.
  397. * Either the old or the new truth table can be
  398. * the empty or the universal set.
  399. */
  400. TRUTHTAB *boolyac_NOT (TRUTHTAB *oldtt) {
  401. TRUTHTAB *newtt;
  402. unsigned char new_permutes [256];
  403. int oldpm, newpm;
  404. int candidate;
  405. oldpm = newpm = 0;
  406. for (candidate = 0; candidate < 256; candidate++) {
  407. if (oldpm >= oldtt->pmsz || candidate < oldtt->permutes [oldpm]) {
  408. new_permutes [newpm++] = candidate;
  409. }
  410. /*
  411. * oldtt not done && candidate == oldtt.
  412. * (candidate > oldtt not possible).
  413. */
  414. else {
  415. oldpm++;
  416. }
  417. }
  418. freett (oldtt);
  419. newtt = creatett (-1, newpm, new_permutes);
  420. if (debugging_boolpars) {
  421. fprintf (aa_stderr, " NOT: expr=%p-->expr=%p pmsz=%d\n",
  422. (void *) oldtt, (void *) newtt, newtt->pmsz);
  423. fflush (aa_stderr);
  424. }
  425. return newtt;
  426. } /* boolyac_NOT() */
  427. /****************************************/
  428. /* */
  429. /* boolyac_COLLOC */
  430. /* */
  431. /****************************************/
  432. /* Action function for COLLOCATION expression rule.
  433. * The record set satisfying a collocation expression is
  434. * generated dynamically. At the parse level it is equivalent
  435. * to a separate 'word' with its own (undetermined) record set.
  436. * So it's given its own slot in saveusr.stems. The word
  437. * in saveusr.stems is formatted "@ssttv[v...]" where ss and tt are
  438. * ascii numbers that index the original collocated words
  439. * in saveusr.stems, and v... is the collocation value integer.
  440. * For example, "@03005" represents the collocation of stem
  441. * number 3 and stem number 0, with collocation value 5.
  442. *
  443. * Returns NULL and errmsg on msglist if any problems.
  444. */
  445. TRUTHTAB *boolyac_COLLOC (
  446. TRUTHTAB *word1tt,
  447. int colloc_val,
  448. TRUTHTAB *word2tt)
  449. {
  450. TRUTHTAB *newtt;
  451. char wordbuf [DtSrMAXWIDTH_HWORD];
  452. if (word1tt->stemno < 0 || word2tt->stemno < 0) {
  453. /* Message #3 is called in two places */
  454. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 3,
  455. "%s COLLOCATION operator (@) may\n"
  456. "only be positioned between two words."),
  457. PROGNAME"371");
  458. DtSearchAddMessage (msgbuf);
  459. return NULL;
  460. }
  461. if (word1tt->stemno == word2tt->stemno) {
  462. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 12,
  463. "%s Collocation operator is not\n"
  464. "permitted between identical words."),
  465. PROGNAME"377");
  466. DtSearchAddMessage (msgbuf);
  467. return NULL;
  468. }
  469. sprintf (wordbuf, COLLOC_STEM_FORMAT,
  470. word1tt->stemno, word2tt->stemno, colloc_val);
  471. if ((newtt = get_stem_truthtab (wordbuf, wordbuf)) == NULL)
  472. return NULL;
  473. freett (word1tt);
  474. freett (word2tt);
  475. if (debugging_boolpars) {
  476. fprintf (aa_stderr, " COLLOC: exprs=%p,%p-->expr=%p pmsz=%d\n",
  477. (void *) word1tt, (void *) word2tt, (void *) newtt, newtt->pmsz);
  478. fflush (aa_stderr);
  479. }
  480. return newtt;
  481. } /* boolyac_COLLOC() */
  482. /****************************************/
  483. /* */
  484. /* yyerror */
  485. /* */
  486. /****************************************/
  487. /* Replaces standard yacc error routine. */
  488. void yyerror (char *msg) {
  489. if (strcmp (msg, "syntax error") == 0) {
  490. if (DtSearchHasMessages())
  491. return;
  492. else if (parser_invalid_wordcount > 0)
  493. add_syntax_errmsg(6);
  494. else {
  495. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 1,
  496. "%s Your search string is an invalid\n"
  497. "boolean query. Please reformulate and try again."),
  498. PROGNAME"001");
  499. DtSearchAddMessage (msgbuf);
  500. }
  501. }
  502. else
  503. DtSearchAddMessage (msg);
  504. return;
  505. } /* yyerror() */
  506. /****************************************/
  507. /* */
  508. /* copy_token */
  509. /* */
  510. /****************************************/
  511. /* Subroutine of yylex(). Copies passed substring
  512. * Into a zero-terminated buffer of its own.
  513. * Static buffer good until next call.
  514. */
  515. static char *copy_token (UCHAR *tokenp, size_t toklen)
  516. {
  517. static char *buf = NULL;
  518. static size_t bufsz = 0;
  519. if (toklen > bufsz) {
  520. if (buf)
  521. free (buf);
  522. bufsz = toklen + (toklen >> 1); /* 1.5 times size needed */
  523. buf = austext_malloc (bufsz + 4, PROGNAME"182", NULL);
  524. }
  525. strncpy (buf, (char *) tokenp, toklen);
  526. buf [toklen] = 0;
  527. return buf;
  528. } /* copy_token() */
  529. /****************************************/
  530. /* */
  531. /* yylex */
  532. /* */
  533. /****************************************/
  534. /* Delivers tokens to yyparse() from usrblk.query */
  535. int yylex (void)
  536. {
  537. int retn_token;
  538. PARG parg;
  539. char *stembufp;
  540. char mystembuf [DtSrMAXWIDTH_HWORD + 4];
  541. GET_ANOTHER_TOKEN:
  542. /* Skip white space */
  543. while (ascii_charmap[*next_lex_char] & WHITESPACE)
  544. next_lex_char++;
  545. /* Terminating zero indicates end of query and end of parse.
  546. * Automatically close unbalanced parentheses.
  547. */
  548. if (*next_lex_char == 0) {
  549. if (paren_count > 0) {
  550. paren_count--;
  551. retn_token = ')';
  552. yytext = ")";
  553. yyleng = 1;
  554. goto DELIVER_TOKEN;
  555. }
  556. retn_token = 0;
  557. yytext = "";
  558. yyleng = 0;
  559. goto DELIVER_TOKEN;
  560. }
  561. switch (*next_lex_char) {
  562. case '|': /* OR operator */
  563. last_token_was_boolop = TRUE;
  564. retn_token = '|';
  565. yytext = "|";
  566. yyleng = 1;
  567. next_lex_char++;
  568. break;
  569. case '~': /* NOT operator */
  570. if (!last_token_was_boolop) {
  571. /* Generate implied AND between words
  572. * and parenthesized expressions.
  573. * A NOT is not itself boolean; it must
  574. * precede the next word or expression.
  575. */
  576. last_token_was_boolop = TRUE;
  577. retn_token = '&';
  578. yytext = "&";
  579. yyleng = 1;
  580. break;
  581. }
  582. last_token_was_boolop = TRUE;
  583. retn_token = '~';
  584. yytext = "~";
  585. yyleng = 1;
  586. next_lex_char++;
  587. break;
  588. case '&': /* AND operator */
  589. if (last_token_was_boolop && qry_is_all_ANDs) {
  590. /* Ignore multiple AND operators.
  591. * These might occur if we silently
  592. * discarded some invalid words.
  593. */
  594. next_lex_char++;
  595. goto GET_ANOTHER_TOKEN;
  596. }
  597. last_token_was_boolop = TRUE;
  598. retn_token = '&';
  599. yytext = "&";
  600. yyleng = 1;
  601. next_lex_char++;
  602. break;
  603. case '(': /* OPEN parentheses */
  604. if (!last_token_was_boolop) {
  605. /* Generate implied AND between words
  606. * and parenthesized expressions.
  607. */
  608. last_token_was_boolop = TRUE;
  609. retn_token = '&';
  610. yytext = "&";
  611. yyleng = 1;
  612. break;
  613. }
  614. paren_count++;
  615. retn_token = '(';
  616. yytext = "(";
  617. yyleng = 1;
  618. next_lex_char++;
  619. break;
  620. case ')': /* CLOSE parentheses */
  621. /* Just discard excessive right parentheses */
  622. if (--paren_count < 0) {
  623. paren_count = 0;
  624. next_lex_char++;
  625. goto GET_ANOTHER_TOKEN;
  626. }
  627. last_token_was_boolop = FALSE;
  628. retn_token = ')';
  629. yytext = ")";
  630. yyleng = 1;
  631. next_lex_char++;
  632. break;
  633. case '@': /* COLLOCATION operator */
  634. /* Collocation token:
  635. * Token is defined as the collocation char followed
  636. * by one or more numeric digits: "@#[#...]".
  637. * Syntactically it's a kind of an AND operator.
  638. * Semantically it's a pseudo word token
  639. * (it will occupy a slot in the stems array).
  640. * The yylval is the integer value following
  641. * the collocation character.
  642. */
  643. yyleng = strcspn ((char *) next_lex_char + 1, WORD_ENDERS) + 1;
  644. yytext = copy_token (next_lex_char, yyleng);
  645. next_lex_char += yyleng;
  646. if ((usrblk.dblk->dbrec.or_dbaccess & ORA_BLOB) == 0) {
  647. retn_token = ERROR_TOKEN;
  648. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 10,
  649. "%s Collocation searches not available for database '%s'."),
  650. PROGNAME"2567", usrblk.dblk->label);
  651. DtSearchAddMessage (msgbuf);
  652. break;
  653. }
  654. yylval.int_val = atoi (yytext + 1);
  655. if (yylval.int_val <= 0) {
  656. retn_token = ERROR_TOKEN;
  657. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 11,
  658. "%s Collocation operator '%.*s' is invalid.\n"
  659. "Correct format is '@n' where n is greater than zero.") ,
  660. PROGNAME"294", DtSrMAXWIDTH_HWORD, yytext);
  661. DtSearchAddMessage (msgbuf);
  662. break;
  663. }
  664. last_token_was_boolop = TRUE;
  665. retn_token = COLLOC_TOKEN;
  666. break;
  667. default:
  668. /* Presumed word token:
  669. * Token is all text chars until next whitespace,
  670. * next lex token, or end of string.
  671. * Linguistically parse it and optionally stem it.
  672. * The token value is the truth table for one
  673. * word: all permutes with only that word's
  674. * bits turned on. If the word is already
  675. * in the stems array, then the permutes
  676. * position is the word's index in the array.
  677. * If the word is not in the array, it's added.
  678. * If the array is full, then an error is reported.
  679. */
  680. if (!last_token_was_boolop) {
  681. /* Generate implied AND between words
  682. * and parenthesized expressions.
  683. */
  684. last_token_was_boolop = TRUE;
  685. retn_token = '&';
  686. yytext = "&";
  687. yyleng = 1;
  688. break;
  689. }
  690. yyleng = strcspn ((char *) next_lex_char, WORD_ENDERS);
  691. yytext = copy_token (next_lex_char, yyleng);
  692. next_lex_char += yyleng;
  693. /*
  694. * Linguistically parse the token.
  695. * Failure can occur because word is too short
  696. * or too long, it's on the stoplist, etc.
  697. * Setting PA_MSGS causes parser to explain
  698. * invalid words with a msg.
  699. */
  700. memset (&parg, 0, sizeof(PARG));
  701. parg.dblk = usrblk.dblk;
  702. parg.string = yytext;
  703. /*****if (!qry_is_all_ANDs)********/
  704. parg.flags = PA_MSGS;
  705. stembufp = usrblk.dblk->parser (&parg);
  706. if (debugging_boolpars) {
  707. fprintf (aa_stderr, " lang: '%s' -> '%s'\n",
  708. yytext, (stembufp)? stembufp : "<null>");
  709. fflush (aa_stderr);
  710. }
  711. /*
  712. * If token is not a linguistically valid word,
  713. * one of two things can happen. If the query
  714. * is all_ANDs (most common type) we silently
  715. * ignore the token.
  716. * Otherwise report error and quit now.
  717. */
  718. if (stembufp == NULL) {
  719. parser_invalid_wordcount++;
  720. if (qry_is_all_ANDs)
  721. goto GET_ANOTHER_TOKEN;
  722. retn_token = ERROR_TOKEN;
  723. if (!DtSearchHasMessages()) {
  724. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 13,
  725. "%s Word '%.*s' is invalid.") ,
  726. PROGNAME"315", DtSrMAXWIDTH_HWORD, yytext);
  727. DtSearchAddMessage (msgbuf);
  728. }
  729. break;
  730. }
  731. if (strlen(stembufp) != strlen(yytext)) {
  732. retn_token = ERROR_TOKEN;
  733. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 14,
  734. "%s String '%.*s' is not a single word.") ,
  735. PROGNAME"634", DtSrMAXWIDTH_HWORD, yytext);
  736. DtSearchAddMessage (msgbuf);
  737. break;
  738. }
  739. /*
  740. * If stemming, we must prefix term with
  741. * special stem char in the stems array.
  742. */
  743. if (usrblk.request == OE_SRCH_STEMS) {
  744. stembufp = usrblk.dblk->stemmer (stembufp, usrblk.dblk);
  745. if (debugging_boolpars) {
  746. fprintf (aa_stderr, " stemer: -> '%s'\n", stembufp);
  747. fflush (aa_stderr);
  748. }
  749. mystembuf[0] = STEM_CH;
  750. strncpy (mystembuf + 1, stembufp, DtSrMAXWIDTH_HWORD);
  751. mystembuf [DtSrMAXWIDTH_HWORD - 1] = 0;
  752. stembufp = mystembuf;
  753. }
  754. /* Load stem into stems arrays and return it's truth table. */
  755. if ((yylval.truthtab = get_stem_truthtab (stembufp, yytext))) {
  756. retn_token = WORD_TOKEN;
  757. last_token_was_boolop = FALSE;
  758. }
  759. else
  760. retn_token = ERROR_TOKEN;
  761. break;
  762. } /* switch on *next_lex_char */
  763. DELIVER_TOKEN:
  764. if (debugging_boolpars) {
  765. fprintf (aa_stderr,
  766. " yylex: op?=%d parct=%d tok#=%d lval=%p%sYYTEXT='%s'\n",
  767. last_token_was_boolop, paren_count,
  768. retn_token, (void *) yylval.truthtab,
  769. (retn_token == COLLOC_TOKEN)? "\t\t" : "\t",
  770. yytext);
  771. fflush (aa_stderr);
  772. }
  773. return retn_token;
  774. } /* yylex() */
  775. /****************************************/
  776. /* */
  777. /* boolean_parse */
  778. /* */
  779. /****************************************/
  780. /* Called from Opera_Engine for boolean searches.
  781. * Driver for yyparse().
  782. * Expects usrblk.request == OE_SRCH_STEMS or OE_SRCH_WORDS.
  783. * If parse is completely successful (query is valid), outputs
  784. * saveusr.stemcount,
  785. * saveusr.stems (stemmed if necessary with STEM_CH as first char,
  786. * and phony colloc words with '@' as first char),
  787. * usrblk.stems (original unstemmed query terms for err msgs),
  788. * final_truthtab,
  789. * qry_has_no_NOTs,
  790. * qry_is_all_ANDs,
  791. * and returns TRUE. Truthtab allocation good until next call.
  792. * If parse fails, returns FALSE and err msg(s) on msglist.
  793. */
  794. int boolean_parse (void)
  795. {
  796. int i;
  797. char *cptr;
  798. TRUTHTAB *tt, *ttnext;
  799. debugging_boolpars = (usrblk.debug & USRDBG_BOOL);
  800. if (!msgbuf)
  801. msgbuf = austext_malloc (300 + DtSrMAXWIDTH_HWORD,
  802. PROGNAME"255", NULL);
  803. /* Test for empty query */
  804. if (usrblk.query == NULL) {
  805. EMPTY_QUERY:
  806. /* Message #2 is called in two places */
  807. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 2,
  808. "%s Query is empty."), PROGNAME"289");
  809. DtSearchAddMessage (msgbuf);
  810. return FALSE;
  811. }
  812. for (cptr = usrblk.query; *cptr; cptr++) {
  813. if ((ascii_charmap[*cptr] & WHITESPACE) == 0)
  814. break;
  815. }
  816. if (*cptr == 0)
  817. goto EMPTY_QUERY;
  818. /* Init globals for yylex and yyparse */
  819. next_lex_char = (UCHAR *) usrblk.query;
  820. paren_count = 0;
  821. yyerror_count = 0;
  822. last_token_was_boolop = TRUE;
  823. saveusr.stemcount = 0;
  824. parser_invalid_wordcount = 0;
  825. /* Query "is all ANDS" if it has no ORs, NOTs, or COLLOCs.
  826. * Missing or linguistically invalid words will be silently
  827. * discarded for all_ANDs queries.
  828. * Query "has no NOTs" if it has no NOTs.
  829. * Results from queries without NOTs can be statistically sorted.
  830. */
  831. qry_has_no_NOTs = !strchr (usrblk.query, '~');
  832. qry_is_all_ANDs = !strpbrk (usrblk.query, "|~@");
  833. if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
  834. fprintf (aa_stderr,
  835. "start boolean_parse: stem?=%d allANDs?=%d noNOTs?=%d\n"
  836. " query: '%s'\n",
  837. (usrblk.request == OE_SRCH_STEMS),
  838. qry_is_all_ANDs, qry_has_no_NOTs, usrblk.query);
  839. fflush (aa_stderr);
  840. }
  841. if (yyparse() != 0)
  842. return FALSE;
  843. /* Free entire remaining ttlist. Only you
  844. * can prevent forest fires and memory leaks.
  845. */
  846. tt = ttlist;
  847. while (tt) {
  848. ttnext = tt->next;
  849. free (tt);
  850. tt = ttnext;
  851. }
  852. ttlist = NULL;
  853. if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
  854. print_stems (saveusr.stemcount, saveusr.stems,
  855. PROGNAME"815 end boolean_parse, syntax ok,");
  856. fprintf (aa_stderr, " permutes=%d:", final_truthtab.pmsz);
  857. for (i=0; i<16; i++) {
  858. if (i >= final_truthtab.pmsz)
  859. break;
  860. fprintf (aa_stderr, " %02x", final_truthtab.permutes [i]);
  861. }
  862. fputc ('\n', aa_stderr);
  863. fflush (aa_stderr);
  864. }
  865. if (final_truthtab.pmsz <= 0) {
  866. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 15,
  867. "%s Your query cannot logically return\n"
  868. "any records. Please reformulate and try again."),
  869. PROGNAME"334");
  870. DtSearchAddMessage (msgbuf);
  871. return FALSE;
  872. }
  873. if (final_truthtab.pmsz >= 256) {
  874. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 16,
  875. "%s Your query will return entire database\n"
  876. "'%s'. Please reformulate and try again.") ,
  877. PROGNAME"341", usrblk.dblk->label);
  878. DtSearchAddMessage (msgbuf);
  879. return FALSE;
  880. }
  881. return TRUE;
  882. } /* boolean_parse() */
#ifdef TESTBOOL /*-----------------------------------------------*/
/* Everything from here to the matching #endif is compiled only when
 * TESTBOOL is defined.  It is an interactive test driver for
 * boolean_parse().
 */

/* Globals that boolean_parse() reads and writes, defined here for
 * the test build.  main() zeroes usrblk and dblk again and points
 * usrblk.dblk at dblk before the first parse.
 */
USRBLK usrblk = { 0 };
DBLK dblk;
SAVEUSR saveusr = { 0 };

/* Debug switches defined outside this section.  The -d command
 * line option sets them: 't' teskey, 'p' paice, 'j' jpn.
 */
extern int debugging_teskey;
extern int debugging_paice;
extern int debugging_jpn;
  890. /****************************************/
  891. /* */
  892. /* process_user_args */
  893. /* */
  894. /****************************************/
  895. /* Subroutine of main(). Validates and loads global
  896. * variables with values from command line arguments.
  897. */
  898. static void process_user_args (int argc, char *argv[])
  899. {
  900. int i;
  901. char *argptr;
  902. char *cptr;
  903. char *src, *targ;
  904. int oops = FALSE;
  905. /* Each pass grabs new parm of "-xxx" format */
  906. argc--, argv++;
  907. while (argc > 0) {
  908. argptr = argv[0];
  909. if (*argptr != '-')
  910. break;
  911. switch (argptr[1]) {
  912. case 'm':
  913. if (argptr[2] == 'x')
  914. dblk.dbrec.or_maxwordsz = atoi (argptr + 3);
  915. else if (argptr[2] == 'n')
  916. dblk.dbrec.or_minwordsz = atoi (argptr + 3);
  917. else
  918. goto BAD_ARG;
  919. break;
  920. case 'l':
  921. dblk.dbrec.or_language = atoi (argptr + 2);
  922. break;
  923. case 'd':
  924. for (cptr = argptr+2; *cptr != 0; cptr++) {
  925. switch (*cptr) {
  926. case 't': debugging_teskey = TRUE; break;
  927. case 'p': debugging_paice = TRUE; break;
  928. case 'j': debugging_jpn = TRUE; break;
  929. default:
  930. oops = TRUE;
  931. fprintf (aa_stderr,
  932. "%s Invalid debug option %c.\a\n",
  933. PROGNAME"049", *cptr);
  934. break;
  935. }
  936. }
  937. break;
  938. BAD_ARG:
  939. default:
  940. oops = TRUE;
  941. fprintf (aa_stderr,
  942. "%s Invalid command line argument '%s'.\a\n",
  943. PROGNAME"059", argptr);
  944. break;
  945. } /* end switch */
  946. argc--, argv++;
  947. } /* main loop on each arg */
  948. if (oops) {
  949. fprintf (aa_stderr,
  950. "\nUSAGE: %s [options]\n"
  951. " -mx# maximum word size.\n"
  952. " -mn# minimum word size.\n"
  953. " -dtpj Debug: Teskey, Paice, Japanese.\n"
  954. " -l# language number. Default 0.\n",
  955. aa_argv0);
  956. exit(2);
  957. }
  958. return;
  959. } /* process_user_args() */
  960. /****************************************/
  961. /* */
  962. /* main */
  963. /* */
  964. /****************************************/
  965. int main (int argc, char *argv[])
  966. {
  967. int i;
  968. int valid_boolpars;
  969. char *cptr;
  970. char linebuf [1024];
  971. /* Init global variables */
  972. aa_argv0 = argv[0];
  973. memset (&usrblk, 0, sizeof(USRBLK));
  974. usrblk.dblk = &dblk;
  975. usrblk.debug |= USRDBG_BOOL; /* set debugging_boolpars */
  976. memset (&dblk, 0, sizeof(DBLK));
  977. strcpy (dblk.name, "testbool");
  978. dblk.label = dblk.name;
  979. dblk.dbrec.or_dbaccess |= ORA_BLOB; /* enable collocations */
  980. /* Read command line args */
  981. process_user_args (argc, argv);
  982. if (!load_language (&dblk, NULL)) {
  983. fprintf (aa_stderr,
  984. PROGNAME"140 load_language() failed. Msgs:\n%s\n",
  985. DtSearchGetMessages());
  986. return 2;
  987. }
  988. fprintf (aa_stderr, " lang=%d minwdsz=%d maxwdsz=%d.\n",
  989. dblk.dbrec.or_language,
  990. dblk.dbrec.or_minwordsz,
  991. dblk.dbrec.or_maxwordsz);
  992. /* Main loop. Each line is a boolean query. */
  993. printf ("Enter an AusText boolean query. 'q' or '.' to quit.\n"
  994. "If first char is '$', words will be stemmed:\n> ");
  995. fflush (stdout);
  996. while (fgets (linebuf, sizeof(linebuf), stdin) != NULL) {
  997. linebuf [sizeof(linebuf) - 1] = 0;
  998. if (strcmp (linebuf, ".\n") == 0)
  999. break;
  1000. if (strcmp (linebuf, "q\n") == 0)
  1001. break;
  1002. if (linebuf[0] == '\n')
  1003. break;
  1004. linebuf [strlen(linebuf) - 1] = 0; /* overlay \n */
  1005. if (linebuf[0] == '$') {
  1006. usrblk.query = linebuf + 1;
  1007. usrblk.request = OE_SRCH_STEMS;
  1008. }
  1009. else {
  1010. usrblk.query = linebuf;
  1011. usrblk.request = OE_SRCH_WORDS;
  1012. }
  1013. if (!boolean_parse())
  1014. puts (PROGNAME"707 boolean_parse() returned FALSE (OE_BAD_QUERY).");
  1015. if (DtSearchHasMessages()) {
  1016. printf ("mmmmm Messages returned to user mmmmmmmmmmmmmmmmmm\n"
  1017. "%s\nmmmmm End of messages to user mmmmmmmmmmmmmmmmmmmm\n",
  1018. DtSearchGetMessages());
  1019. DtSearchFreeMessages();
  1020. }
  1021. printf ("--------------------------------\n> ");
  1022. fflush (stdout);
  1023. } /* main read loop for each query line */
  1024. return 0;
  1025. } /* main() */
  1026. #endif /* TESTBOOL */
  1027. /********************* BOOLPARS.C ********************/