123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125 |
- /*
- * CDE - Common Desktop Environment
- *
- * Copyright (c) 1993-2012, The Open Group. All rights reserved.
- *
- * These libraries and programs are free software; you can
- * redistribute them and/or modify them under the terms of the GNU
- * Lesser General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * These libraries and programs are distributed in the hope that
- * they will be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU Lesser General Public License for more
- * details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with these libraries and programs; if not, write
- * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
- * Floor, Boston, MA 02110-1301 USA
- */
- /* $XConsortium: boolpars.c /main/5 1996/11/25 18:49:27 drk $
- *
- * (c) Copyright 1996 Digital Equipment Corporation.
- * (c) Copyright 1996 Hewlett-Packard Company.
- * (c) Copyright 1996 International Business Machines Corp.
- * (c) Copyright 1996 Sun Microsystems, Inc.
- * (c) Copyright 1996 Novell, Inc.
- * (c) Copyright 1996 FUJITSU LIMITED.
- * (c) Copyright 1996 Hitachi.
- */
- /*
- * COMPONENT_NAME: austext
- *
- * FUNCTIONS: add_syntax_errmsg
- * boolean_parse
- * boolyac_AND
- * boolyac_COLLOC
- * boolyac_NOT
- * boolyac_OR
- * copy_final_truthtab
- * copy_token
- * creatett
- * freett
- * get_stem_truthtab
- * main
- * process_user_args
- * yyerror
- * yylex
- *
- * ORIGINS: 27
- *
- *
- * (C) COPYRIGHT International Business Machines Corp. 1996
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- */
- /********************* BOOLPARS.C ********************
- * $Id: boolpars.c /main/5 1996/11/25 18:49:27 drk $
- * February 1996.
- * AusText/DtSearch yacc-based boolean query parser.
- * Converts boolean query into stems array and truth table
- * for subsequent search. Boolyac.y is the yacc source.
- * After processing by yacc, it becomes boolyac.c and boolyac.h.
- * This module contains all the related C source code: yylex,
- * yacc action functions, and the main AusText driver function, boolean_parse.
- * Additional information (format of TRUTHTAB) in header file boolpars.h.
- *
- * $Log$
- * Revision 1.4 1996/03/22 23:12:50 miker
- * Added string.h header and correctly cast strcspn() calls.
- *
- * Revision 1.3 1996/03/20 19:14:30 miker
- * Enable collocation expressions in stem (type 'S') searches.
- *
- * Revision 1.2 1996/03/13 22:35:59 miker
- * Changed char to UCHAR several places; similar typecasts.
- *
- * Revision 1.1 1996/03/05 15:52:06 miker
- * Initial revision
- */
- #include "SearchE.h"
- #include <stdlib.h>
- #include <string.h>
- #include "boolpars.h"
- #include "boolyac.h"
- #if (DtSrMAX_STEMCOUNT != 8)
- #error DtSrMAX_STEMCOUNT is not defined to be 8.
- #endif
- #define PROGNAME "BOOLPARS"
- #define WORD_ENDERS " \t\n\f()|@~&"
- #define MAX_YYERRORS 4
- #define MS_boolpars 28
- /****************************************/
- /* */
- /* GLOBALS */
- /* */
- /****************************************/
- int qry_has_no_NOTs = FALSE;
- int qry_is_all_ANDs = FALSE;
- TRUTHTAB final_truthtab = { 0 };
- int parser_invalid_wordcount = 0;
- static int debugging_boolpars = FALSE;
- static unsigned char
- *final_permutes = NULL;
- static int last_token_was_boolop = TRUE;
- static char *msgbuf = NULL;
- static UCHAR *next_lex_char = NULL;
- static int paren_count = 0;
- static TRUTHTAB *ttlist = NULL;
- static int yyerror_count = 0;
- static size_t yyleng; /* same as in lex API */
- static char *yytext; /* same as in lex API */
- /****************************************/
- /* */
- /* add_syntax_errmsg */
- /* */
- /****************************************/
- /* Action function called for yacc rules used to trap syntax errors.
- * Adds error message identified by msgno to user's msglist.
- */
- void add_syntax_errmsg (int msgno)
- {
- switch (msgno) {
- case 1:
- /* Message #2 is called in two places */
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 2,
- "%s Query field is empty."),
- PROGNAME"086");
- DtSearchAddMessage (msgbuf);
- break;
- case 2:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 5,
- "%s Boolean operators must be positioned\n"
- "between words or expressions. Two sequential words\n"
- "without an operator are interpreted as being separated\n"
- "by the AND operator (&)."),
- PROGNAME"091");
- DtSearchAddMessage (msgbuf);
- break;
- case 3:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 6,
- "%s Expression in parentheses is missing."),
- PROGNAME"093");
- DtSearchAddMessage (msgbuf);
- break;
- case 4:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 7,
- "%s NOT operator (~) must be positioned to\n"
- "the left of the word or expression it qualifies."),
- PROGNAME"098");
- DtSearchAddMessage (msgbuf);
- break;
- case 5:
- /* Message #3 is called in two places */
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 3,
- "%s COLLOCATION operator (@) may\n"
- "only be positioned between two words."),
- PROGNAME"111");
- DtSearchAddMessage (msgbuf);
- break;
- case 6:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 4,
- "%s One or more words in your\n"
- "query are not stored in database '%s'.") ,
- PROGNAME"089", usrblk.dblk->label);
- DtSearchAddMessage (msgbuf);
- break;
- default:
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 8,
- "%s Invalid boolean query. Syntax Error #%d.") ,
- PROGNAME"100", msgno);
- DtSearchAddMessage (msgbuf);
- break;
- }
- return;
- } /* add_syntax_errmsg() */
- /****************************************/
- /* */
- /* creatett */
- /* */
- /****************************************/
- /* Constructor for new truth table.
- * Allocates it, inits it, and links it into ttlist.
- */
- static TRUTHTAB *creatett (int stemno, int pmsz, unsigned char *permutes)
- {
- TRUTHTAB *newtt = austext_malloc (sizeof(TRUTHTAB) + pmsz + 4,
- PROGNAME"140", NULL);
- memset (newtt, 0, sizeof(TRUTHTAB));
- newtt->stemno = stemno;
- newtt->pmsz = pmsz;
- newtt->permutes = (unsigned char *) (newtt + 1);
- memcpy (newtt->permutes, permutes, pmsz);
- newtt->next = ttlist;
- ttlist = newtt;
- return newtt;
- } /* creatett() */
- /****************************************/
- /* */
- /* freett */
- /* */
- /****************************************/
- /* Destructor of passed truth table.
- * Unlinks it from ttlist and frees it.
- */
- static void freett (TRUTHTAB *argtt)
- {
- TRUTHTAB *tt;
- TRUTHTAB **lastlink = &ttlist;
- for (tt = ttlist; tt; tt = tt->next) {
- if (tt == argtt) {
- *lastlink = tt->next;
- free (tt);
- break;
- }
- lastlink = &tt->next;
- }
- return;
- } /* freett() */
-
- /****************************************/
- /* */
- /* copy_final_truthtab */
- /* */
- /****************************************/
- /* Copys passed truth table into global final_truthtab.
- * Returns final_truthtab.
- */
- TRUTHTAB *copy_final_truthtab (TRUTHTAB *tt)
- {
- memset (&final_truthtab, 0, sizeof(TRUTHTAB));
- if (!final_permutes)
- final_permutes = austext_malloc (300, PROGNAME"788", NULL);
- final_truthtab.pmsz = tt->pmsz;
- final_truthtab.permutes = final_permutes;
- memcpy (final_permutes, tt->permutes, final_truthtab.pmsz);
- return &final_truthtab;
- } /* copy_final_truthtab() */
-
- /****************************************/
- /* */
- /* get_stem_truthtab */
- /* */
- /****************************************/
- /* Subroutine of yylex. Also used in yacc action functions.
- * Creates and returns truth table for passed stem.
- * If stem is new, adds it to saveusr.stems array, and adds
- * the original query word string to usrblk.stems for msgs.
- * Returns NULL and posts err msg if array is full
- * or has other error.
- */
- static TRUTHTAB *get_stem_truthtab (char *newstem, char *origword)
- {
- int i, stemno;
- unsigned char bitmask;
- unsigned char *pmp;
- unsigned char new_permutes [128];
- TRUTHTAB *newtt;
- /* Check if stem is already in array */
- for (stemno = 0; stemno < saveusr.stemcount; stemno++)
- if (strcmp (newstem, saveusr.stems[stemno]) == 0)
- break;
- /* Add new stem to array */
- if (stemno == saveusr.stemcount) {
- if (++saveusr.stemcount > DtSrMAX_STEMCOUNT) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 9,
- "%s Too many terms in boolean query."),
- PROGNAME"1513");
- DtSearchAddMessage (msgbuf);
- saveusr.stemcount--;
- return NULL;
- }
- strncpy (saveusr.stems[stemno], newstem, DtSrMAXWIDTH_HWORD);
- saveusr.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
- if (origword) {
- strncpy (usrblk.stems[stemno], origword, DtSrMAXWIDTH_HWORD);
- usrblk.stems [stemno] [DtSrMAXWIDTH_HWORD - 1] = 0;
- }
- }
- /* Stemno now indicates correct term in saveusr.stems.
- * Truth table for a single term has 128 8-bit permutes,
- * the 1/2 of all 256 possible permutations that have
- * that term's bit switched on.
- */
- bitmask = 1 << stemno; /* mask with only newstem's bit on */
- pmp = new_permutes;
- for (i=0; i<256; i++)
- if ((i & bitmask) != 0) {
- *pmp = i;
- pmp++;
- }
- newtt = creatett (stemno, 128, new_permutes);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " WORD: stem[%d]='%c%s' expr=%p pmsz=%d\n",
- stemno,
- (saveusr.stems[stemno][0] == STEM_CH) ?
- '~' : saveusr.stems[stemno][0],
- &saveusr.stems[stemno][1],
- (void *) newtt, newtt->pmsz);
- fflush (aa_stderr);
- }
- return newtt;
- } /* get_stem_truthtab() */
- /****************************************/
- /* */
- /* boolyac_AND */
- /* */
- /****************************************/
- /* Action function for AND expression rule.
- * Returns set INTERSECTION of passed truth tables,
- * ie only the permutes they have in common.
- * Any truth table, input or output, can be the empty or
- * the universal set. For example: "(A & B) & ~A" is empty.
- */
- TRUTHTAB *boolyac_AND (TRUTHTAB *tt1, TRUTHTAB *tt2) {
- TRUTHTAB *newtt;
- unsigned char new_permutes [256];
- int pm1, pm2, newpm;
- pm1 = pm2 = newpm = 0;
- while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) {
- if (tt1->permutes[pm1] < tt2->permutes[pm2])
- pm1++;
- else if (tt1->permutes[pm1] > tt2->permutes[pm2])
- pm2++;
- else {
- new_permutes [newpm++] = tt1->permutes [pm1];
- pm1++;
- pm2++;
- }
- }
- /* Free old truthtabs, create new one. */
- freett (tt1);
- freett (tt2);
- newtt = creatett (-1, newpm, new_permutes);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " AND: exprs=%p,%p-->expr=%p pmsz=%d\n",
- (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz);
- fflush (aa_stderr);
- }
- return newtt;
- } /* boolyac_AND() */
- /****************************************/
- /* */
- /* boolyac_OR */
- /* */
- /****************************************/
- /* Action function for OR expression rule.
- * Returns set UNION of passed truth tables.
- * Any truth table, input or output, can be the empty or
- * the universal set. For example: "A | ~A" is universal.
- */
- TRUTHTAB *boolyac_OR (TRUTHTAB *tt1, TRUTHTAB *tt2) {
- TRUTHTAB *newtt;
- unsigned char new_permutes [256];
- unsigned char *permutes1 = tt1->permutes;
- unsigned char *permutes2 = tt2->permutes;
- int pm1, pm2, newpm;
- pm1 = pm2 = newpm = 0;
- /* While neither permutes array is exhausted... */
- while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) {
- if (permutes1[pm1] < permutes2[pm2])
- new_permutes [newpm++] = permutes1[pm1++];
- else if (permutes2[pm2] < permutes1[pm1])
- new_permutes [newpm++] = permutes2[pm2++];
- else {
- new_permutes [newpm++] = permutes1[pm1++];
- pm2++;
- }
- }
- /* After one or both permutes arrays are exhausted... */
- while (pm1 < tt1->pmsz)
- new_permutes [newpm++] = permutes1[pm1++];
- while (pm2 < tt2->pmsz)
- new_permutes [newpm++] = permutes2[pm2++];
- /* Free old truthtabs, create new one. */
- freett (tt1);
- freett (tt2);
- newtt = creatett (-1, newpm, new_permutes);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " OR: exprs=%p,%p-->expr=%p pmsz=%d\n",
- (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz);
- fflush (aa_stderr);
- }
- return newtt;
- } /* boolyac_OR() */
- /****************************************/
- /* */
- /* boolyac_NOT */
- /* */
- /****************************************/
- /* Action function for NOT expression rule.
- * Returns set COMPLEMENT of passed truth table,
- * ie the universal set minus the passed set,
- * ie all possible permutes except those passed.
- * Either the old or the new truth table can be
- * the empty or the universal set.
- */
- TRUTHTAB *boolyac_NOT (TRUTHTAB *oldtt) {
- TRUTHTAB *newtt;
- unsigned char new_permutes [256];
- int oldpm, newpm;
- int candidate;
- oldpm = newpm = 0;
- for (candidate = 0; candidate < 256; candidate++) {
- if (oldpm >= oldtt->pmsz || candidate < oldtt->permutes [oldpm]) {
- new_permutes [newpm++] = candidate;
- }
- /*
- * oldtt not done && candidate == oldtt.
- * (candidate > oldtt not possible).
- */
- else {
- oldpm++;
- }
- }
- freett (oldtt);
- newtt = creatett (-1, newpm, new_permutes);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " NOT: expr=%p-->expr=%p pmsz=%d\n",
- (void *) oldtt, (void *) newtt, newtt->pmsz);
- fflush (aa_stderr);
- }
- return newtt;
- } /* boolyac_NOT() */
- /****************************************/
- /* */
- /* boolyac_COLLOC */
- /* */
- /****************************************/
- /* Action function for COLLOCATION expression rule.
- * The record set satisfying a collocation expression is
- * generated dynamically. At the parse level it is equivalent
- * to a separate 'word' with its own (undetermined) record set.
- * So it's given its own slot in saveusr.stems. The word
- * in saveusr.stems is formatted "@ssttv[v...]" where ss and tt are
- * ascii numbers that index the original collocated words
- * in saveusr.stems, and v... is the collocation value integer.
- * For example, "@03005" represents the collocation of stem
- * number 3 and stem number 0, with collocation value 5.
- *
- * Returns NULL and errmsg on msglist if any problems.
- */
- TRUTHTAB *boolyac_COLLOC (
- TRUTHTAB *word1tt,
- int colloc_val,
- TRUTHTAB *word2tt)
- {
- TRUTHTAB *newtt;
- char wordbuf [DtSrMAXWIDTH_HWORD];
- if (word1tt->stemno < 0 || word2tt->stemno < 0) {
- /* Message #3 is called in two places */
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 3,
- "%s COLLOCATION operator (@) may\n"
- "only be positioned between two words."),
- PROGNAME"371");
- DtSearchAddMessage (msgbuf);
- return NULL;
- }
- if (word1tt->stemno == word2tt->stemno) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 12,
- "%s Collocation operator is not\n"
- "permitted between identical words."),
- PROGNAME"377");
- DtSearchAddMessage (msgbuf);
- return NULL;
- }
- sprintf (wordbuf, COLLOC_STEM_FORMAT,
- word1tt->stemno, word2tt->stemno, colloc_val);
- if ((newtt = get_stem_truthtab (wordbuf, wordbuf)) == NULL)
- return NULL;
- freett (word1tt);
- freett (word2tt);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " COLLOC: exprs=%p,%p-->expr=%p pmsz=%d\n",
- (void *) word1tt, (void *) word2tt, (void *) newtt, newtt->pmsz);
- fflush (aa_stderr);
- }
- return newtt;
- } /* boolyac_COLLOC() */
- /****************************************/
- /* */
- /* yyerror */
- /* */
- /****************************************/
- /* Replaces standard yacc error routine. */
- void yyerror (char *msg) {
- if (strcmp (msg, "syntax error") == 0) {
- if (DtSearchHasMessages())
- return;
- else if (parser_invalid_wordcount > 0)
- add_syntax_errmsg(6);
- else {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 1,
- "%s Your search string is an invalid\n"
- "boolean query. Please reformulate and try again."),
- PROGNAME"001");
- DtSearchAddMessage (msgbuf);
- }
- }
- else
- DtSearchAddMessage (msg);
- return;
- } /* yyerror() */
- /****************************************/
- /* */
- /* copy_token */
- /* */
- /****************************************/
- /* Subroutine of yylex(). Copies passed substring
- * Into a zero-terminated buffer of its own.
- * Static buffer good until next call.
- */
- static char *copy_token (UCHAR *tokenp, size_t toklen)
- {
- static char *buf = NULL;
- static size_t bufsz = 0;
- if (toklen > bufsz) {
- if (buf)
- free (buf);
- bufsz = toklen + (toklen >> 1); /* 1.5 times size needed */
- buf = austext_malloc (bufsz + 4, PROGNAME"182", NULL);
- }
- strncpy (buf, (char *) tokenp, toklen);
- buf [toklen] = 0;
- return buf;
- } /* copy_token() */
- /****************************************/
- /* */
- /* yylex */
- /* */
- /****************************************/
- /* Delivers tokens to yyparse() from usrblk.query */
- int yylex (void)
- {
- int retn_token;
- PARG parg;
- char *stembufp;
- char mystembuf [DtSrMAXWIDTH_HWORD + 4];
- GET_ANOTHER_TOKEN:
- /* Skip white space */
- while (ascii_charmap[*next_lex_char] & WHITESPACE)
- next_lex_char++;
- /* Terminating zero indicates end of query and end of parse.
- * Automatically close unbalanced parentheses.
- */
- if (*next_lex_char == 0) {
- if (paren_count > 0) {
- paren_count--;
- retn_token = ')';
- yytext = ")";
- yyleng = 1;
- goto DELIVER_TOKEN;
- }
- retn_token = 0;
- yytext = "";
- yyleng = 0;
- goto DELIVER_TOKEN;
- }
- switch (*next_lex_char) {
- case '|': /* OR operator */
- last_token_was_boolop = TRUE;
- retn_token = '|';
- yytext = "|";
- yyleng = 1;
- next_lex_char++;
- break;
- case '~': /* NOT operator */
- if (!last_token_was_boolop) {
- /* Generate implied AND between words
- * and parenthesized expressions.
- * A NOT is not itself boolean; it must
- * precede the next word or expression.
- */
- last_token_was_boolop = TRUE;
- retn_token = '&';
- yytext = "&";
- yyleng = 1;
- break;
- }
- last_token_was_boolop = TRUE;
- retn_token = '~';
- yytext = "~";
- yyleng = 1;
- next_lex_char++;
- break;
- case '&': /* AND operator */
- if (last_token_was_boolop && qry_is_all_ANDs) {
- /* Ignore multiple AND operators.
- * These might occur if we silently
- * discarded some invalid words.
- */
- next_lex_char++;
- goto GET_ANOTHER_TOKEN;
- }
- last_token_was_boolop = TRUE;
- retn_token = '&';
- yytext = "&";
- yyleng = 1;
- next_lex_char++;
- break;
- case '(': /* OPEN parentheses */
- if (!last_token_was_boolop) {
- /* Generate implied AND between words
- * and parenthesized expressions.
- */
- last_token_was_boolop = TRUE;
- retn_token = '&';
- yytext = "&";
- yyleng = 1;
- break;
- }
- paren_count++;
- retn_token = '(';
- yytext = "(";
- yyleng = 1;
- next_lex_char++;
- break;
- case ')': /* CLOSE parentheses */
- /* Just discard excessive right parentheses */
- if (--paren_count < 0) {
- paren_count = 0;
- next_lex_char++;
- goto GET_ANOTHER_TOKEN;
- }
- last_token_was_boolop = FALSE;
- retn_token = ')';
- yytext = ")";
- yyleng = 1;
- next_lex_char++;
- break;
- case '@': /* COLLOCATION operator */
- /* Collocation token:
- * Token is defined as the collocation char followed
- * by one or more numeric digits: "@#[#...]".
- * Syntactically it's a kind of an AND operator.
- * Semantically it's a pseudo word token
- * (it will occupy a slot in the stems array).
- * The yylval is the integer value following
- * the collocation character.
- */
- yyleng = strcspn ((char *) next_lex_char + 1, WORD_ENDERS) + 1;
- yytext = copy_token (next_lex_char, yyleng);
- next_lex_char += yyleng;
- if ((usrblk.dblk->dbrec.or_dbaccess & ORA_BLOB) == 0) {
- retn_token = ERROR_TOKEN;
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 10,
- "%s Collocation searches not available for database '%s'."),
- PROGNAME"2567", usrblk.dblk->label);
- DtSearchAddMessage (msgbuf);
- break;
- }
- yylval.int_val = atoi (yytext + 1);
- if (yylval.int_val <= 0) {
- retn_token = ERROR_TOKEN;
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 11,
- "%s Collocation operator '%.*s' is invalid.\n"
- "Correct format is '@n' where n is greater than zero.") ,
- PROGNAME"294", DtSrMAXWIDTH_HWORD, yytext);
- DtSearchAddMessage (msgbuf);
- break;
- }
- last_token_was_boolop = TRUE;
- retn_token = COLLOC_TOKEN;
- break;
- default:
- /* Presumed word token:
- * Token is all text chars until next whitespace,
- * next lex token, or end of string.
- * Linguistically parse it and optionally stem it.
- * The token value is the truth table for one
- * word: all permutes with only that word's
- * bits turned on. If the word is already
- * in the stems array, then the permutes
- * position is the word's index in the array.
- * If the word is not in the array, it's added.
- * If the array is full, then an error is reported.
- */
- if (!last_token_was_boolop) {
- /* Generate implied AND between words
- * and parenthesized expressions.
- */
- last_token_was_boolop = TRUE;
- retn_token = '&';
- yytext = "&";
- yyleng = 1;
- break;
- }
- yyleng = strcspn ((char *) next_lex_char, WORD_ENDERS);
- yytext = copy_token (next_lex_char, yyleng);
- next_lex_char += yyleng;
- /*
- * Linguistically parse the token.
- * Failure can occur because word is too short
- * or too long, it's on the stoplist, etc.
- * Setting PA_MSGS causes parser to explain
- * invalid words with a msg.
- */
- memset (&parg, 0, sizeof(PARG));
- parg.dblk = usrblk.dblk;
- parg.string = yytext;
- /*****if (!qry_is_all_ANDs)********/
- parg.flags = PA_MSGS;
- stembufp = usrblk.dblk->parser (&parg);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " lang: '%s' -> '%s'\n",
- yytext, (stembufp)? stembufp : "<null>");
- fflush (aa_stderr);
- }
- /*
- * If token is not a linguistically valid word,
- * one of two things can happen. If the query
- * is all_ANDs (most common type) we silently
- * ignore the token.
- * Otherwise report error and quit now.
- */
- if (stembufp == NULL) {
- parser_invalid_wordcount++;
- if (qry_is_all_ANDs)
- goto GET_ANOTHER_TOKEN;
- retn_token = ERROR_TOKEN;
- if (!DtSearchHasMessages()) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 13,
- "%s Word '%.*s' is invalid.") ,
- PROGNAME"315", DtSrMAXWIDTH_HWORD, yytext);
- DtSearchAddMessage (msgbuf);
- }
- break;
- }
- if (strlen(stembufp) != strlen(yytext)) {
- retn_token = ERROR_TOKEN;
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 14,
- "%s String '%.*s' is not a single word.") ,
- PROGNAME"634", DtSrMAXWIDTH_HWORD, yytext);
- DtSearchAddMessage (msgbuf);
- break;
- }
- /*
- * If stemming, we must prefix term with
- * special stem char in the stems array.
- */
- if (usrblk.request == OE_SRCH_STEMS) {
- stembufp = usrblk.dblk->stemmer (stembufp, usrblk.dblk);
- if (debugging_boolpars) {
- fprintf (aa_stderr, " stemer: -> '%s'\n", stembufp);
- fflush (aa_stderr);
- }
- mystembuf[0] = STEM_CH;
- strncpy (mystembuf + 1, stembufp, DtSrMAXWIDTH_HWORD);
- mystembuf [DtSrMAXWIDTH_HWORD - 1] = 0;
- stembufp = mystembuf;
- }
- /* Load stem into stems arrays and return it's truth table. */
- if ((yylval.truthtab = get_stem_truthtab (stembufp, yytext))) {
- retn_token = WORD_TOKEN;
- last_token_was_boolop = FALSE;
- }
- else
- retn_token = ERROR_TOKEN;
- break;
- } /* switch on *next_lex_char */
- DELIVER_TOKEN:
- if (debugging_boolpars) {
- fprintf (aa_stderr,
- " yylex: op?=%d parct=%d tok#=%d lval=%p%sYYTEXT='%s'\n",
- last_token_was_boolop, paren_count,
- retn_token, (void *) yylval.truthtab,
- (retn_token == COLLOC_TOKEN)? "\t\t" : "\t",
- yytext);
- fflush (aa_stderr);
- }
- return retn_token;
- } /* yylex() */
- /****************************************/
- /* */
- /* boolean_parse */
- /* */
- /****************************************/
- /* Called from Opera_Engine for boolean searches.
- * Driver for yyparse().
- * Expects usrblk.request == OE_SRCH_STEMS or OE_SRCH_WORDS.
- * If parse is completely successful (query is valid), outputs
- * saveusr.stemcount,
- * saveusr.stems (stemmed if necessary with STEM_CH as first char,
- * and phony colloc words with '@' as first char),
- * usrblk.stems (original unstemmed query terms for err msgs),
- * final_truthtab,
- * qry_has_no_NOTs,
- * qry_is_all_ANDs,
- * and returns TRUE. Truthtab allocation good until next call.
- * If parse fails, returns FALSE and err msg(s) on msglist.
- */
- int boolean_parse (void)
- {
- int i;
- char *cptr;
- TRUTHTAB *tt, *ttnext;
- debugging_boolpars = (usrblk.debug & USRDBG_BOOL);
- if (!msgbuf)
- msgbuf = austext_malloc (300 + DtSrMAXWIDTH_HWORD,
- PROGNAME"255", NULL);
- /* Test for empty query */
- if (usrblk.query == NULL) {
- EMPTY_QUERY:
- /* Message #2 is called in two places */
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 2,
- "%s Query is empty."), PROGNAME"289");
- DtSearchAddMessage (msgbuf);
- return FALSE;
- }
- for (cptr = usrblk.query; *cptr; cptr++) {
- if ((ascii_charmap[*cptr] & WHITESPACE) == 0)
- break;
- }
- if (*cptr == 0)
- goto EMPTY_QUERY;
- /* Init globals for yylex and yyparse */
- next_lex_char = (UCHAR *) usrblk.query;
- paren_count = 0;
- yyerror_count = 0;
- last_token_was_boolop = TRUE;
- saveusr.stemcount = 0;
- parser_invalid_wordcount = 0;
- /* Query "is all ANDS" if it has no ORs, NOTs, or COLLOCs.
- * Missing or linguistically invalid words will be silently
- * discarded for all_ANDs queries.
- * Query "has no NOTs" if it has no NOTs.
- * Results from queries without NOTs can be statistically sorted.
- */
- qry_has_no_NOTs = !strchr (usrblk.query, '~');
- qry_is_all_ANDs = !strpbrk (usrblk.query, "|~@");
- if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
- fprintf (aa_stderr,
- "start boolean_parse: stem?=%d allANDs?=%d noNOTs?=%d\n"
- " query: '%s'\n",
- (usrblk.request == OE_SRCH_STEMS),
- qry_is_all_ANDs, qry_has_no_NOTs, usrblk.query);
- fflush (aa_stderr);
- }
- if (yyparse() != 0)
- return FALSE;
- /* Free entire remaining ttlist. Only you
- * can prevent forest fires and memory leaks.
- */
- tt = ttlist;
- while (tt) {
- ttnext = tt->next;
- free (tt);
- tt = ttnext;
- }
- ttlist = NULL;
- if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
- print_stems (saveusr.stemcount, saveusr.stems,
- PROGNAME"815 end boolean_parse, syntax ok,");
- fprintf (aa_stderr, " permutes=%d:", final_truthtab.pmsz);
- for (i=0; i<16; i++) {
- if (i >= final_truthtab.pmsz)
- break;
- fprintf (aa_stderr, " %02x", final_truthtab.permutes [i]);
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- if (final_truthtab.pmsz <= 0) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 15,
- "%s Your query cannot logically return\n"
- "any records. Please reformulate and try again."),
- PROGNAME"334");
- DtSearchAddMessage (msgbuf);
- return FALSE;
- }
- if (final_truthtab.pmsz >= 256) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolpars, 16,
- "%s Your query will return entire database\n"
- "'%s'. Please reformulate and try again.") ,
- PROGNAME"341", usrblk.dblk->label);
- DtSearchAddMessage (msgbuf);
- return FALSE;
- }
- return TRUE;
- } /* boolean_parse() */
- #ifdef TESTBOOL /*-----------------------------------------------*/
- USRBLK usrblk = { 0 };
- DBLK dblk;
- SAVEUSR saveusr = { 0 };
- extern int debugging_teskey;
- extern int debugging_paice;
- extern int debugging_jpn;
- /****************************************/
- /* */
- /* process_user_args */
- /* */
- /****************************************/
- /* Subroutine of main(). Validates and loads global
- * variables with values from command line arguments.
- */
- static void process_user_args (int argc, char *argv[])
- {
- int i;
- char *argptr;
- char *cptr;
- char *src, *targ;
- int oops = FALSE;
- /* Each pass grabs new parm of "-xxx" format */
- argc--, argv++;
- while (argc > 0) {
- argptr = argv[0];
- if (*argptr != '-')
- break;
- switch (argptr[1]) {
- case 'm':
- if (argptr[2] == 'x')
- dblk.dbrec.or_maxwordsz = atoi (argptr + 3);
- else if (argptr[2] == 'n')
- dblk.dbrec.or_minwordsz = atoi (argptr + 3);
- else
- goto BAD_ARG;
- break;
- case 'l':
- dblk.dbrec.or_language = atoi (argptr + 2);
- break;
- case 'd':
- for (cptr = argptr+2; *cptr != 0; cptr++) {
- switch (*cptr) {
- case 't': debugging_teskey = TRUE; break;
- case 'p': debugging_paice = TRUE; break;
- case 'j': debugging_jpn = TRUE; break;
- default:
- oops = TRUE;
- fprintf (aa_stderr,
- "%s Invalid debug option %c.\a\n",
- PROGNAME"049", *cptr);
- break;
- }
- }
- break;
- BAD_ARG:
- default:
- oops = TRUE;
- fprintf (aa_stderr,
- "%s Invalid command line argument '%s'.\a\n",
- PROGNAME"059", argptr);
- break;
- } /* end switch */
- argc--, argv++;
- } /* main loop on each arg */
- if (oops) {
- fprintf (aa_stderr,
- "\nUSAGE: %s [options]\n"
- " -mx# maximum word size.\n"
- " -mn# minimum word size.\n"
- " -dtpj Debug: Teskey, Paice, Japanese.\n"
- " -l# language number. Default 0.\n",
- aa_argv0);
- exit(2);
- }
- return;
- } /* process_user_args() */
- /****************************************/
- /* */
- /* main */
- /* */
- /****************************************/
- int main (int argc, char *argv[])
- {
- int i;
- int valid_boolpars;
- char *cptr;
- char linebuf [1024];
- /* Init global variables */
- aa_argv0 = argv[0];
- memset (&usrblk, 0, sizeof(USRBLK));
- usrblk.dblk = &dblk;
- usrblk.debug |= USRDBG_BOOL; /* set debugging_boolpars */
- memset (&dblk, 0, sizeof(DBLK));
- strcpy (dblk.name, "testbool");
- dblk.label = dblk.name;
- dblk.dbrec.or_dbaccess |= ORA_BLOB; /* enable collocations */
- /* Read command line args */
- process_user_args (argc, argv);
- if (!load_language (&dblk, NULL)) {
- fprintf (aa_stderr,
- PROGNAME"140 load_language() failed. Msgs:\n%s\n",
- DtSearchGetMessages());
- return 2;
- }
- fprintf (aa_stderr, " lang=%d minwdsz=%d maxwdsz=%d.\n",
- dblk.dbrec.or_language,
- dblk.dbrec.or_minwordsz,
- dblk.dbrec.or_maxwordsz);
- /* Main loop. Each line is a boolean query. */
- printf ("Enter an AusText boolean query. 'q' or '.' to quit.\n"
- "If first char is '$', words will be stemmed:\n> ");
- fflush (stdout);
- while (fgets (linebuf, sizeof(linebuf), stdin) != NULL) {
- linebuf [sizeof(linebuf) - 1] = 0;
- if (strcmp (linebuf, ".\n") == 0)
- break;
- if (strcmp (linebuf, "q\n") == 0)
- break;
- if (linebuf[0] == '\n')
- break;
- linebuf [strlen(linebuf) - 1] = 0; /* overlay \n */
- if (linebuf[0] == '$') {
- usrblk.query = linebuf + 1;
- usrblk.request = OE_SRCH_STEMS;
- }
- else {
- usrblk.query = linebuf;
- usrblk.request = OE_SRCH_WORDS;
- }
- if (!boolean_parse())
- puts (PROGNAME"707 boolean_parse() returned FALSE (OE_BAD_QUERY).");
- if (DtSearchHasMessages()) {
- printf ("mmmmm Messages returned to user mmmmmmmmmmmmmmmmmm\n"
- "%s\nmmmmm End of messages to user mmmmmmmmmmmmmmmmmmmm\n",
- DtSearchGetMessages());
- DtSearchFreeMessages();
- }
- printf ("--------------------------------\n> ");
- fflush (stdout);
- } /* main read loop for each query line */
- return 0;
- } /* main() */
- #endif /* TESTBOOL */
- /********************* BOOLPARS.C ********************/
|