12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595 |
- /*
- * CDE - Common Desktop Environment
- *
- * Copyright (c) 1993-2012, The Open Group. All rights reserved.
- *
- * These libraries and programs are free software; you can
- * redistribute them and/or modify them under the terms of the GNU
- * Lesser General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * These libraries and programs are distributed in the hope that
- * they will be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU Lesser General Public License for more
- * details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with these libraries and programs; if not, write
- * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
- * Floor, Boston, MA 02110-1301 USA
- */
- /* $XConsortium: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
- *
- * (c) Copyright 1996 Digital Equipment Corporation.
- * (c) Copyright 1996 Hewlett-Packard Company.
- * (c) Copyright 1996 International Business Machines Corp.
- * (c) Copyright 1996 Sun Microsystems, Inc.
- * (c) Copyright 1996 Novell, Inc.
- * (c) Copyright 1996 FUJITSU LIMITED.
- * (c) Copyright 1996 Hitachi.
- */
- /*
- * COMPONENT_NAME: austext
- *
- * FUNCTIONS: boolean_search
- * calc_result_bitvec_WK
- * calculate_idfs
- * dbread_filter_WK
- * get_proximity
- * got_USR_STOPSRCH
- * load_DtSrResults_WK
- * load_or_wordrecs
- * read_d99
- * read_recno
- * read_stem_bitvec_WK
- * stuff_DtSrResult
- * weights_filter_WK
- *
- * ORIGINS: 27
- *
- *
- * (C) COPYRIGHT International Business Machines Corp. 1996
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- */
- /********************* BOOLSRCH.C **********************
- * $Id: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
- * February 1996.
- * The vista code from the original vewords.c.
- * Given a final truth table and stems array from the user's boolean
- * query (output of boolean_search()), find all database records
- * containing the truth table's set operations and return
- * their database addresses in a resultlist.
- * See boolpars.h for format and limitations of TRUTHTAB.
- *
- *-------------- D99DBA TO DBA CONVERSION ----------------
- * 'd99dbas' are not real vista dbas! They were modified
- * as follows to permit shorter bit vectors,
- * and to minimize bit shifts at search time.
- * vista_dba <- (OR_D00 << 24) | vista_slot
- * vista_slot <- ((d99recno - 1) * or_recslots) + 2
- * d99dba <- (d99recno << 8) | weight_byte
- * d99recno <- ((vista_slot - 2) / or_recslots) + 1
- * The d99 and bitvec recno of the first rec is 1.
- * The slotno (vista dba) of the first rec is 2
- * (dbrec occupies first slot and vista slots begin at 1).
- *
- * $Log$
- * Revision 1.5 1996/03/20 19:21:49 miker
- * Completed collocations code. Restored get_colloc_bitvec() from colloc.c.
- *
- * Revision 1.4 1996/03/18 22:06:24 miker
- * Bug fix. Zero permute NOT queries always returned no hits.
- *
- * Revision 1.3 1996/03/13 23:05:24 miker
- * Change long double constant to regular float for better portability.
- *
- * Revision 1.2 1996/03/13 22:36:37 miker
- * Changed char to UCHAR several places; similar typecasts.
- * Moved collocations processing to colloc.c.
- *
- * Revision 1.1 1996/03/05 15:52:06 miker
- * Initial revision
- */
- /***#define _ALL_SOURCE****/ /* to pickup typedefs for shm vnodes */
- #include "SearchE.h"
- #include <string.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include "vista.h"
- #include "boolpars.h"
- #define PROGNAME "BOOLSRCH"
- #define INIT_ITERATIONS 50
- #define MS_boolsrch 16
- /*
- * DBAS_PER_BLOCK is the max number of dbas to be read
- * from d99 file. Note DBAS_PER_BLOCK * sizeof(DB_ADDR) = 512 bytes,
- * the standard blksize of one hard disk block.
- */
- #define DBAS_PER_BLOCK 128
- #define RESET_BIT(bv, by, bm) bv[by] &= (UCHAR) ~bm
- #if (DtSrMAX_STEMCOUNT != 8)
- #error DtSrMAX_STEMCOUNT does not equal 8.
- #endif
- /****************************************/
- /* */
- /* PROXWT */
- /* */
- /****************************************/
- typedef struct {
- float wt;
- long byteno;
- int bitmask;
- int proximity;
- } PROXWT;
- /****************************************/
- /* */
- /* GLOBALS */
- /* */
- /****************************************/
- int debugging_boolsrch = FALSE;
- static int all_key_types = TRUE;
- static UCHAR *bitvec_allocp = NULL;
- static size_t bitvec_allocsz = 0;
- static long bitveclen; /* 1/8 of tot_addr_count */
- static UCHAR *bitvecs [DtSrMAX_STEMCOUNT];
- static int check_dates = FALSE;
- static int do_stat_sort = FALSE;
- static double idf [DtSrMAX_STEMCOUNT];
- static char *msgbuf = NULL;
- static int need_zero_permute = FALSE;
- static struct or_objrec objrec;
- static DB_ADDR objrecdba;
- static int or_abstrsz = 0;
- static int or_fzkeysz = 0;
- static short or_language = DtSrLaENG;
- static long or_maxdba; /* largest dba in database */
- static long or_reccount; /* tot num db obj (real_num_rec) */
- static long or_recslots; /* D00 slots per obj (slot_d00) */
- static struct or_hwordrec
- *or_wordrecs = NULL;
- static PROXWT *proxwts = NULL;
- static int proxwtct;
- static UCHAR *result_bitvec;
- static long result_count = 0;
- static DtSrResult *resultlist = NULL;
- static int save_stemno = 0;
- static long tot_addr_count; /* may be > reccount bcs deletes */
- static int vistano;
- static float *wtvec = NULL;
- extern void find_keyword (char *cur_word, int vista_num);
- extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num);
- /************************************************/
- /* */
- /* got_USR_STOPSRCH */
- /* */
- /************************************************/
- /* Called at beginning of every workproc.
- * Returns TRUE if user pushed STOP SEARCH button,
- * else FALSE.
- */
- static int got_USR_STOPSRCH (void)
- {
- if ((usrblk.flags & USR_STOPSRCH) == 0)
- return FALSE;
- if (OE_flags & OE_AUDIT)
- oe_write_audit_rec (-1L);
- usrblk.retncode = OE_USER_STOP;
- return TRUE;
- }
- /****************************************/
- /* */
- /* read_recno */
- /* */
- /****************************************/
- /* Utility function.
- * Reads a database record given a d99 record number.
- * Returns TRUE and loads globals objrec and objrecdba
- * on success, else returns FALSE.
- */
- static int read_recno (long recno)
- {
- /* Convert recno to a real dba */
- objrecdba = (recno - 1) * or_recslots + 2;
- if (objrecdba >= or_maxdba)
- return FALSE;
- objrecdba |= (OR_D00 << 24);
- /* Read the object record.
- * Skip records with database read errors.
- * Use d_crset instead of CRSET and d_recread
- * instead of RECREAD to trap vista errors
- * without aborting.
- */
- d_crset (&objrecdba, vistano);
- if (db_status != S_OKAY) {
- BAD_DBA:
- if (debugging_boolsrch) {
- fprintf (aa_stderr,
- PROGNAME"434 Invalid dba %ld. "
- "recno=%ld bitvec[%ld]=%02x db_status=%d.\n",
- objrecdba, recno, recno>>3, 1<<(recno%8), db_status);
- fflush (aa_stderr);
- }
- return FALSE;
- }
- d_recread (&objrec, vistano);
- if (db_status != S_OKAY)
- goto BAD_DBA;
- swab_objrec (&objrec, NTOH);
- return TRUE;
- } /* read_recno() */
- /************************************************/
- /* */
- /* calculate_idfs */
- /* */
- /************************************************/
- /* Subroutine of boolean_search() initialization.
- * Loads idf[] (inverse doc frequency) for each stem.
- * IDF = 1.0 for a word that occurs in every record.
- * For a word that occurs only once in entire database:
- * NUM OF DB RECS IDF OF SINGULAR WORD
- * 10 4.32
- * 100 7.64
- * 1,000 10.97
- * 10,000 14.29
- * 100,000 17.61
- * 1,000,000 20.93
- * 10,000,000 24.25
- */
- static void calculate_idfs (void)
- {
- int i;
- double dbl;
- for (i = 0; i < saveusr.stemcount; i++) {
- if ( or_wordrecs[i].or_hwaddrs == 0 ||
- or_wordrecs[i].or_hwordkey[0] == '@')
- idf[i] = 0.0;
- else {
- /* ln(2) = 0.693147181 */
- dbl = (double) or_reccount / (double) or_wordrecs[i].or_hwaddrs;
- idf[i] = log(dbl) / 0.693147181 + 1.0;
- if (debugging_boolsrch)
- fprintf (aa_stderr,
- PROGNAME"733 IDF[%d] numdocs=%5ld idf=%lf\n",
- i, (long) or_wordrecs[i].or_hwaddrs, idf[i]);
- }
- }
- return;
- } /* calculate_idfs() */
- /************************************************/
- /* */
- /* load_or_wordrecs */
- /* */
- /************************************************/
- /* Subroutine of boolean_search() initialization.
- * Loads or_wordrecs[] array with vista key file
- * records for each term in saveusr.stems.
- * Returns TRUE on success. Else returns FALSE with
- * appropriate usrblk.retncode and user msgs on msglist.
- */
- static int load_or_wordrecs (void)
- {
- int i, j, k;
- int stemno;
- struct or_hwordrec
- *wordrec;
- int colloc_count = 0;
- int not_found_count = 0;
- if (or_wordrecs)
- free (or_wordrecs);
- or_wordrecs = austext_malloc (
- saveusr.stemcount * sizeof (struct or_hwordrec) + 16,
- PROGNAME "782", NULL);
- for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
- wordrec = &or_wordrecs [stemno];
- /* If this is a collocation term,
- * save the two indexes and the collocation
- * value in the wordrec buffer instead of usual
- * offsets and dba counts.
- */
- if (saveusr.stems[stemno][0] == '@') {
- strcpy (wordrec->or_hwordkey, saveusr.stems[stemno]);
- sscanf (saveusr.stems[stemno], COLLOC_STEM_FORMAT, &i, &j, &k);
- wordrec->or_hwoffset = i;
- wordrec->or_hwfree = j;
- wordrec->or_hwaddrs = k;
- colloc_count++;
- continue;
- }
- if (debugging_boolsrch)
- fprintf (aa_stderr, PROGNAME"823 KEYFIND[%d] ", stemno);
- find_keyword (saveusr.stems[stemno], vistano);
- /*
- * If term is found, add it to the or_wordrecs[] array.
- * But it is an error to include a word in more records
- * than the max specified in site config file. This is
- * meaningful for databases where certain common high
- * frequency words slip by which should be on the stoplist.
- * It's possible in huge databases to run out of memory
- * assembling very long resultlists.
- */
- if (db_status == S_OKAY) {
- strncpy (wordrec->or_hwordkey, saveusr.stems[stemno],
- DtSrMAXWIDTH_HWORD);
- wordrec->or_hwordkey [DtSrMAXWIDTH_HWORD - 1] = 0;
- read_wordstr (wordrec, vistano);
- if (db_status != S_OKAY) {
- /* Probable corrupted database. The btree
- * read succeeded but the record read failed.
- */
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolsrch, 6,
- "%s Database Error. Word '%s' is\n"
- "listed in database '%s' but has no index record.") ,
- PROGNAME"295", usrblk.stems[stemno], usrblk.dblk->label);
- DtSearchAddMessage (msgbuf);
- usrblk.retncode = OE_SYSTEM_STOP;
- if (debugging_boolsrch)
- fprintf (aa_stderr,
- "db error, db_status = %d.\n", db_status);
- return FALSE;
- }
- if (debugging_boolsrch)
- fprintf (aa_stderr, "ofs=%ld addrs=%ld free=%ld\n",
- (long) wordrec->or_hwoffset,
- (long) wordrec->or_hwaddrs,
- (long) wordrec->or_hwfree);
- if (wordrec->or_hwaddrs > OE_words_hitlimit) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolsrch, 14,
- "%s '%s' has more than %ld hits.\n"
- "Please remove it from the query or raise the WHITLIM\n"
- "value in the search engine configuration file."),
- PROGNAME"1444", wordrec->or_hwordkey, OE_words_hitlimit);
- DtSearchAddMessage (msgbuf);
- /* Also log WHITLIM msg for administrator... */
- fprintf (aa_stderr, "%s\n", msgbuf);
- usrblk.retncode = OE_BAD_QUERY;
- return FALSE;
- }
- }
- /* Only other possible nonfatal vista return is S_NOTFOUND.
- * If qry_is_all_ANDs we can quit right now.
- * Otherwise switch off all bits in the word's bit vector.
- */
- else if (qry_is_all_ANDs) {
- if (debugging_boolsrch)
- fputs ("not found, qry_all_ANDs, quit.\n", aa_stderr);
- usrblk.retncode = OE_NOTAVAIL;
- return FALSE;
- }
- else {
- memset (wordrec, 0, sizeof(struct or_hwordrec));
- if (debugging_boolsrch)
- fputs ("not found, addrs-->0.\n", aa_stderr);
- not_found_count++;
- }
- } /* end loop for each term in saveusr.stems[] */
- /* It's a failure if all the user's words
- * don't exist in database.
- */
- if (not_found_count + colloc_count >= saveusr.stemcount) {
- usrblk.retncode = OE_NOTAVAIL;
- return FALSE;
- }
- return TRUE;
- } /* load_or_wordrecs() */
- /****************************************/
- /* */
- /* get_proximity */
- /* */
- /****************************************/
- /* Subroutine of stuff_DtSrResult().
- * Given d99recno, finds proxwt[] for record,
- * calculates and returns integer proximity.
- */
- static int get_proximity (long recno)
- {
- long byteno = recno >> 3;
- int bitmask = 1 << (recno % 8);
- int i;
- for (i = 0; i < proxwtct; i++)
- if (proxwts[i].byteno == byteno && proxwts[i].bitmask == bitmask)
- break;
- if (i >= proxwtct)
- return -1;
- return proxwts[i].proximity;
- } /* get_proximity() */
- /****************************************/
- /* */
- /* stuff_DtSrResult */
- /* */
- /****************************************/
- /* Subroutine of load_DtSrResults_WK().
- * Loads passed DtSrResult structure with data from global objrec.
- * Performs additional vista reads as necessary to get misc recs.
- */
- static void stuff_DtSrResult (
- DtSrResult *new,
- long recno)
- {
- int m;
- int fzkey_remaining;
- char *src, *targ, *targend;
- static struct or_miscrec
- miscrecbuf;
- new->objflags = objrec.or_objflags;
- new->objuflags = objrec.or_objuflags;
- new->objsize = objrec.or_objsize;
- new->objdate = objrec.or_objdate;
- new->objtype = objrec.or_objtype;
- new->objcost = objrec.or_objcost;
- new->dbn = OE_dbn;
- new->dba = objrecdba;
- new->language = or_language;
- strncpy (new->reckey, objrec.or_objkey, DtSrMAX_DB_KEYSIZE);
- if (do_stat_sort)
- new->proximity = get_proximity (recno);
- /* The abstract immediately follows the fuzzy key
- * in the FZKABS misc recs. It may span several recs.
- */
- new->abstractp = (char *) (new + 1);
- if (or_abstrsz > 0) {
- targ = new->abstractp;
- targend = targ + or_abstrsz - 1;
- fzkey_remaining = or_fzkeysz;
- CRSET (PROGNAME"226", &objrecdba, vistano);
- SETOR (PROGNAME"227", OR_OBJ_MISCS, saveusr.vistano);
- FINDFM (PROGNAME"228", OR_OBJ_MISCS, saveusr.vistano);
- while (db_status == S_OKAY) {
- RECREAD (PROGNAME"2209", &miscrecbuf, saveusr.vistano);
- NTOHS (miscrecbuf.or_misctype);
- if (miscrecbuf.or_misctype == ORM_FZKABS) {
- src = (char *) miscrecbuf.or_misc;
- for (m = 0; m < sizeof(miscrecbuf.or_misc); m++) {
- /* skip over the fzkey */
- if (fzkey_remaining > 0) {
- src++;
- fzkey_remaining--;
- continue;
- }
- /* copy the abstract */
- *targ = *src;
- if (*src++ == 0 || targ++ >= targend) {
- *targ = 0;
- targ = targend; /* force outer loop end */
- break;
- }
- } /* end for-loop m */
- } /* end (misctype == FZKABS) */
- if (targ >= targend)
- break;
- FINDNM (PROGNAME"545", OR_OBJ_MISCS, saveusr.vistano);
- } /* end while-loop */
- } /* endif: (or_abstrsz > 0) */
- return;
- } /* stuff_DtSrResult() */
- /****************************************/
- /* */
- /* load_DtSrResults_WK */
- /* */
- /****************************************/
- /* Builds DtSrResult list for every record
- * in result_bitvec, but not more than aa_maxhits.
- */
- static void load_DtSrResults_WK (void)
- {
- long recno;
- int bitno;
- long byteno;
- int i;
- long dittocount;
- DtSrResult *resultp;
- size_t resultsz = sizeof(DtSrResult) + or_abstrsz + 4;
- if (got_USR_STOPSRCH())
- return;
- if (resultlist) {
- DtSearchFreeResults (&resultlist);
- resultlist = NULL;
- }
- /* Make a single pass through the final result_bitvec.
- * For each nonzero bit, ie each database record
- * that satisfies the query requirements,
- * retrieve the record and push it onto the
- * DtSrResult list. If not sorting records,
- * stop when we reach the user's specified aa_maxhits count.
- */
- dittocount = 0;
- for (recno = 1; recno < tot_addr_count; recno++) {
- byteno = recno >> 3; /* divide by 8 */
- bitno = recno % 8;
- /* Skip zero bits */
- if ((result_bitvec[byteno] & (1 << bitno)) == 0)
- continue;
- if (!read_recno (recno))
- continue;
- /* Create new DtSrResult node, push it onto resultlist. */
- resultp = austext_malloc (resultsz + 4, PROGNAME"466", NULL);
- memset (resultp, 0, resultsz);
- resultp->link = resultlist;
- resultlist = resultp;
- /* Load the new DtSrResult node from the object record */
- stuff_DtSrResult (resultp, recno);
- /* Check if any more reads are necessary.
- * If not sorting, stop after aa_maxhits.
- * If sorting, there won't be more than
- * aa_maxhits recs in the bitvec anyway.
- */
- dittocount++;
- if (dittocount >= aa_maxhits)
- break;
- } /* end bitvec loop */
- /*--------- All Done. Clean up and return to caller. ---------*/
- /*@@@@@@ make separate workproc call if aa_maxhits > 100.
- @@@@@ sort may take a long time */
- if (wtvec) {
- free (wtvec);
- wtvec = NULL;
- }
- if (proxwts) {
- free (proxwts);
- proxwts = NULL;
- }
- if (dittocount <= 0) {
- usrblk.workproc = dummy_workproc;
- usrblk.retncode = OE_NOTAVAIL;
- return;
- }
- usrblk.retncode = OE_OK;
- usrblk.workproc = dummy_workproc;
- usrblk.stemcount = saveusr.stemcount;
- if (usrblk.search_type == 'W')
- memcpy (usrblk.stems, saveusr.stems,
- saveusr.stemcount * DtSrMAXWIDTH_HWORD);
- else
- /* Don't copy first char (ctrl-o) stem */
- for (i = 0; i < saveusr.stemcount; i++)
- strcpy (usrblk.stems[i], &saveusr.stems[i][1]);
- if (do_stat_sort)
- DtSearchSortResults (&resultlist, DtSrSORT_PROX);
- usrblk.dittocount = dittocount;
- if (usrblk.dittolist)
- DtSearchFreeResults (&usrblk.dittolist);
- usrblk.dittolist = resultlist;
- resultlist = NULL;
- return;
- } /* load_DtSrResults_WK() */
- /****************************************/
- /* */
- /* weights_filter_WK */
- /* */
- /****************************************/
- /* This workproc is called only if we're doing statistical sorting.
- * (1) It reduces the result_bitvec to it's final size,
- * containing only the highest aa_maxhits statistical weights
- * in wtvec.
- * (2) It replaces (possibly large) wtvec with (probably much smaller)
- * array of PROXWT structures containing the selected records'
- * weights and calculated proximities, for final ranking sort.
- *
- */
- static void weights_filter_WK (void)
- {
- int i;
- double scalefac;
- long recno;
- int smallest, biggest;
- float biggestwt;
- long byteno, smallest_byteno;
- int bitmask, smallest_bitmask;
- if (got_USR_STOPSRCH())
- return;
- /* Init weight filtering */
- if (proxwts)
- free (proxwts);
- proxwtct = (result_count < aa_maxhits)? result_count : aa_maxhits;
- proxwts = austext_malloc (proxwtct * sizeof(PROXWT) + 4,
- PROGNAME"429", NULL);
- memset (proxwts, 0, proxwtct * sizeof(PROXWT));
- smallest = 0;
- scalefac = 0.0;
- biggestwt = 0.0; /* biggest single wt of all docs */
- /* One pass thru entire result_bitvec */
- for (recno = 1; recno < tot_addr_count; recno++) {
- byteno = recno >> 3;
- bitmask = 1 << (recno % 8);
- /* Skip zero bits */
- if ((result_bitvec[byteno] & bitmask) == 0)
- continue;
- /* Make scalefac = sum of squares of all wts in bitvec.
- * It's possible that all or some of the weights are
- * zero (eg queries like "~aaa" or "~aaa | bbb").
- * In this case give them a very small positive number
- * so we don't divide by zero later on.
- */
- if (wtvec[recno] == 0.0)
- wtvec[recno] = 0.1;
- scalefac += (double) wtvec[recno] * (double) wtvec[recno];
- /*
- * The following logic first fills up the proxwts table.
- * After that if a bitvec's weight is larger than the smallest
- * proxwt, replace the smallest proxwt with the new weight
- * and switch off the previous smallest in the original bitvec.
- */
- /*
- * Just discard rec on bitvec if it's weight
- * is smaller than the current smallest.
- */
- if (wtvec [recno] <= proxwts[smallest].wt) {
- RESET_BIT (result_bitvec, byteno, bitmask);
- result_count--;
- continue;
- }
- /*
- * Else discard current smallest if
- * table full, ie it really points to something.
- */
- if (proxwts[smallest].wt > 0.0) {
- smallest_byteno = proxwts[smallest].byteno;
- smallest_bitmask = proxwts[smallest].bitmask;
- RESET_BIT (result_bitvec, smallest_byteno, smallest_bitmask);
- result_count--;
- }
- /* Add this weight to the proxwts table. */
- proxwts [smallest] .wt = wtvec [recno];
- proxwts [smallest] .byteno = byteno;
- proxwts [smallest] .bitmask = bitmask;
- /* Keep track of the table entry that has
- * the highest weight. This will eventually
- * be the first sorted hit on the hitlist.
- * It's weight/proximity will be used
- * to scale the proximities of the
- * other hits.
- */
- if (biggestwt < wtvec[recno]) {
- biggestwt = wtvec[recno];
- biggest = smallest;
- }
- /* Find the next smallest */
- smallest = 0;
- for (i = 1; i < proxwtct; i++) {
- if (proxwts[i].wt < proxwts[smallest].wt)
- smallest = i;
- }
- } /* end loop on every recno */
- free (wtvec);
- wtvec = NULL;
- /* PROXIMITY CALCULATIONS.
- * In order to translate statistical weight into an AusText
- * proximity, basically you have to invert it, then scale it.
- * The statistical weight is a similarity measure: the
- * larger it is the more similar the document to the query.
- * But AusText 'proximity' is like a 'distance' measure,
- * the smaller the number the closer the document is to the query.
- *
- * First 'normalize' each document's statistical
- * weight to be a fraction between 0 and 1. Done
- * by calculating a normalization factor,
- * the sqrt of the sum of squares of weights of all
- * docs that would have qualified for the hitlist
- * if we weren't truncating. Note cosine-based normalization
- * factor (Pythagorean) always >= largest wt so we can
- * guarantee all normalized weights are > 0.0 and <= 1.0.
- *
- * The proximity itself is calculated as the 'percent value'
- * that the doc is 'distant' from perfection (1.0 or 100%).
- * For example, if the normalized weight of the first record
- * is .931 then it's proximity will be 7 (100% - 93% = 7).
- *
- * The proximity of every other hit is scaled away
- * from the first because the normalization algorithm
- * tends to clump proximities when there are a lot of hits.
- * Specifically the proximity of every hit is a constant
- * scale factor (derived from the first proximity),
- * divided by it's weight.
- *
- * A "bulls eye" (normalized weight = 1.0, proximity == 0)
- * for the first hit is not allowed so scale factor will
- * not also be zero. Otherwise *all* hits in that particular
- * results list would be bulls eyes.
- */
- scalefac = (double) biggestwt / sqrt (scalefac);
- /* normalized weight of first hit */
- scalefac = (1.0 - scalefac) * 100.0;
- /* proximity of first hit */
- if (scalefac < 1.0)
- scalefac = 1.0;
- /* No bulls eyes */
- scalefac *= (double) biggestwt * 1.2;
- /* scale factor for other hits */
- for (i = 0; i < proxwtct; i++) {
- proxwts[i].proximity = (int) (scalefac / (double) proxwts[i].wt);
- if (proxwts[i].proximity > 9999)
- proxwts[i].proximity = 9999;
- }
- if (debugging_boolsrch) {
- fprintf (aa_stderr,
- PROGNAME"489 FINAL PROXWTS proxwtct=%d bigwt=%.2f scalefac=%.2lf\n",
- proxwtct, biggestwt, scalefac);
- for (i=0; i<10; i++) {
- if (i >= proxwtct)
- break;
- fprintf (aa_stderr,
- " byteno=%3ld bitmask=%02x wt=%.2f prox=%d\n",
- proxwts[i].byteno, proxwts[i].bitmask,
- proxwts[i].wt, proxwts[i].proximity);
- }
- fprintf (aa_stderr, PROGNAME"499 WEIGHT RESULTS resultct=%ld bv=\n",
- result_count);
- for (i=0; i<22; i++) {
- if (i >= bitveclen)
- break;
- fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = load_DtSrResults_WK;
- return;
- } /* weights_filter_WK() */
- /****************************************/
- /* */
- /* dbread_filter_WK */
- /* */
- /****************************************/
- /* Called if we must remove documents from result_bitvec
- * because of keytype or date,
- */
- static void dbread_filter_WK (void)
- {
- long recno;
- long byteno;
- int bitmask;
- long discards;
- if (got_USR_STOPSRCH())
- return;
- if (debugging_boolsrch) {
- discards = 0;
- fputs (PROGNAME"865 DBREAD discards (k=keytype d=date):\n", aa_stderr);
- fflush (aa_stderr);
- }
- /* One pass thru entire result_bitvec */
- for (recno = 1; recno < tot_addr_count; recno++) {
- byteno = recno >> 3;
- bitmask = 1 << (recno % 8);
- if ((result_bitvec[byteno] & bitmask) == 0)
- continue;
- if (!read_recno (recno))
- continue;
- /* Skip undesired record types */
- if (!all_key_types) {
- if (strchr (saveusr.ktchars, objrec.or_objkey[0]) == NULL) {
- RESET_BIT (result_bitvec, byteno, bitmask);
- result_count--;
- if (debugging_boolsrch) {
- discards++;
- fputc ('k', aa_stderr);
- fflush (aa_stderr);
- }
- continue;
- }
- }
- /* Skip record if out of date range */
- if (check_dates) {
- if (!objdate_in_range (objrec.or_objdate,
- usrblk.objdate1, usrblk.objdate2)) {
- RESET_BIT (result_bitvec, byteno, bitmask);
- result_count--;
- if (debugging_boolsrch) {
- discards++;
- fputc ('d', aa_stderr);
- fflush (aa_stderr);
- }
- continue;
- }
- }
- } /* end loop on every recno */
- if (debugging_boolsrch) {
- int i;
- if (discards)
- fputc ('\n', aa_stderr);
- fprintf (aa_stderr,
- PROGNAME"857 DBREAD RESULTS discards=%ld resultct=%ld bv=\n",
- discards, result_count);
- for (i=0; i<22; i++) {
- if (i >= bitveclen)
- break;
- fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- /* Determine next workproc.
- * (1) If no records survived the read db filter,
- * we're done, return 'no hits'.
- * (2) If we're sorting, the next workproc reduces the
- * bitvec to the aa_maxhits recs with the highest
- * statistical weights.
- * (3) Otherwise the next workproc just loads the hitlist.
- */
- if (result_count <= 0) {
- usrblk.retncode = OE_NOTAVAIL;
- usrblk.workproc = dummy_workproc;
- }
- else if (do_stat_sort) {
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = weights_filter_WK;
- }
- else {
- if (debugging_boolsrch)
- fprintf (aa_stderr,
- PROGNAME"931 No sorting by statistical weights.\n");
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = load_DtSrResults_WK;
- }
- return;
- } /* dbread_filter_WK() */
- /****************************************/
- /* */
- /* calc_result_bitvec_WK */
- /* */
- /****************************************/
- /* Second workproc after read_stem_bitvec_WK().
- * If possible, minimizes size of truth table permutes,
- * then applies them to stem bitvecs to create result_bitvec.
- */
- static void calc_result_bitvec_WK (void)
- {
- int mask;
- int cpm;
- long byteno;
- int bitno, stemno;
- UCHAR permute;
- UCHAR my_permutes [256];
- int my_pmsz;
- int i;
- if (got_USR_STOPSRCH())
- return;
- /* If there are fewer than a full complement of stems,
- * minimize size of truth table by discarding
- * permutes that refer to unused stems.
- */
- if (saveusr.stemcount < DtSrMAX_STEMCOUNT) {
- /* Set high order bits of mask to mark unused stem positions */
- mask = 0;
- for (i = 0; i < saveusr.stemcount; i++)
- mask |= 1 << i;
- mask = ~mask;
- /* 'cpm' is a candidate permute */
- my_pmsz = 0;
- for (cpm = 0; cpm < 256; cpm++) {
- /*
- * Discard candidate if it refers to an unused stem.
- */
- if (cpm & mask)
- continue;
- /*
- * Otherwise if candidate is in final_truthtab, keep it.
- */
- for (i = 0; i < final_truthtab.pmsz; i++) {
- if (final_truthtab.permutes[i] == (UCHAR) cpm) {
- my_permutes [my_pmsz] = (UCHAR) cpm;
- my_pmsz++;
- }
- }
- }
- if (debugging_boolsrch) {
- fprintf (aa_stderr,
- PROGNAME"565 Minimize truth table, pmsz=%d-->%d\n permutes=",
- final_truthtab.pmsz, my_pmsz);
- for (i=0; i<16; i++) {
- if (i >= my_pmsz)
- break;
- fprintf (aa_stderr, " %02x", (int) my_permutes [i]);
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- final_truthtab.permutes = my_permutes;
- final_truthtab.pmsz = my_pmsz;
- } /* end minimize of permutes */
- /* Calculate result bit vector.
- * Loop 1 is a single pass through the bit vectors
- * (a bit loop inside a byte loop).
- * For each nonzero bit, ie each database record
- * that has at least one of the query terms in it,
- * build a 'permute' equivalent to the boolean
- * representation of the terms in that record (Loop 2).
- * Then search the truth table permutes for a match (Loop 3).
- * If found, set the record's bit in the result_bitvec.
- */
- /* LOOP 1. For each database addr... */
- result_count = 0;
- for (byteno = 0; byteno < bitveclen; byteno++) {
- for (bitno = 0; bitno < 8; bitno++) {
- mask = 1 << bitno;
- /* LOOP 2. Build permute for each query term. */
- permute = 0;
- for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
- if (bitvecs [stemno] [byteno] & (UCHAR) mask)
- permute |= 1 << stemno;
- }
- /* LOOP 3. Search truth table for matching permute. */
- for (i = 0; i < final_truthtab.pmsz; i++) {
- if (final_truthtab.permutes[i] == permute) {
- result_bitvec [byteno] |= (UCHAR) mask;
- result_count++;
- }
- }
- }
- }
- if (debugging_boolsrch) {
- fprintf (aa_stderr, PROGNAME"621 PRELIM RESULTS resultct=%ld bv=\n",
- result_count);
- for (i=0; i<22; i++) {
- if (i >= bitveclen)
- break;
- fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- /* The next workprocs are 'filters', reducing the size
- * of result_bitvec by removing various unwanted records.
- * They're done in the following order:
- * (1) If no records survived the truth table manipulations,
- * we're done, return 'no hits'.
- * (2) If we must remove documents because of keytype or date,
- * the next workproc is the filter that reads the database.
- * (3) If we're sorting, the next workproc reduces the
- * bitvec to the aa_maxhits recs with the highest
- * statistical weights.
- * (4) Otherwise the next workproc just loads the hitlist.
- */
- if (result_count <= 0) {
- usrblk.retncode = OE_NOTAVAIL;
- usrblk.workproc = dummy_workproc;
- }
- else if (!all_key_types || check_dates) {
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = dbread_filter_WK;
- }
- else if (do_stat_sort) {
- if (debugging_boolsrch)
- fprintf (aa_stderr,
- PROGNAME"948 No db reads necessary for date or keytype.\n");
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = weights_filter_WK;
- }
- else {
- if (debugging_boolsrch)
- fprintf (aa_stderr,
- PROGNAME"625 No filtering: no sort and no db reads.\n");
- usrblk.retncode = OE_SEARCHING;
- usrblk.workproc = load_DtSrResults_WK;
- }
- return;
- } /* calc_result_bitvec_WK() */
- /****************************************/
- /* */
- /* read_d99 */
- /* */
- /****************************************/
- /* Subroutine of read_stem_bitvec_WK().
- * Repeatedly called to get each d99dba in the inverted index
- * file (d99) for a specific index term. The first call passes
- * the term's wordrec with d99 offset and size information.
- * Subsequent calls pass NULL.
- * Returns valid d99dba, or 0 at end of term's index, or -1 on error.
- * Actual reads are performed a disk block at a time,
- * with dbas stored in a static buffer for the next call.
- */
- static DB_ADDR read_d99 (struct or_hwordrec *wordrec)
- {
- static DB_ADDR readbuf [DBAS_PER_BLOCK];
- static DB_ADDR *bufptr, *endbuf;
- static FILE *fptr;
- static long bal_read, request_read;
- /* First call for new term */
- if (wordrec) {
- fptr = usrblk.dblk->iifile;
- fseek (fptr, wordrec->or_hwoffset, SEEK_SET);
- bal_read = wordrec->or_hwaddrs;
- bufptr = endbuf = 0; /* triggers block read */
- }
- /* Time to read another block */
- if (bufptr >= endbuf) {
- if (bal_read <= 0)
- return 0;
- if (bal_read > DBAS_PER_BLOCK) {
- request_read = DBAS_PER_BLOCK;
- bal_read -= DBAS_PER_BLOCK;
- endbuf = readbuf + DBAS_PER_BLOCK;
- }
- else {
- /* last block is usually short */
- request_read = bal_read;
- bal_read = 0;
- endbuf = readbuf + request_read;
- }
- if (fread (readbuf, sizeof(DB_ADDR), request_read, fptr)
- != request_read) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolsrch, 28,
- "%s Database Read Error in %s.d99.") ,
- PROGNAME"428", usrblk.dblk->name);
- DtSearchAddMessage (msgbuf);
- return -1;
- }
- bufptr = readbuf;
- }
- /******return *bufptr++;*******/
- return ntohl (*bufptr++);
- } /* read_d99() */
- /****************************************/
- /* */
- /* get_colloc_bitvec */
- /* */
- /****************************************/
- /* Subroutine of read_stem_bitvec_WK().
- * Constructs a 'collocation bitvector' for current save_stemno.
- * A collocation expression requests the return of all records
- * containing both of two terms (a kind of boolean AND) such that
- * the occurrences are within n characters of each other.
- * For example "ICE @5 CREAM" requests the return of all records
- * containing both "ICE" and "CREAM" but only if they are separated
- * by no more than 5 characters.
- *
- * Since offset information is not stored in the inverted index
- * this module initially returns the intersection of the two words'
- * bit vectors (boolean AND). Then it retrieves each record,
- * builds an offset (hilites) table for each of the two words,
- * then compares the offset differences in the tables.
- * If no occurrence pairs are within the specified separation
- * range, the record is deleted from the bitvector.
- * Returns 0 if successful, otherwise returns -1 and msgs.
- @@@@ rewrite as its own workproc--reading/hiliting can take a long time...
- */
- static int get_colloc_bitvec (void)
- {
- int stemno_A = or_wordrecs[save_stemno].or_hwoffset;
- int stemno_B = or_wordrecs[save_stemno].or_hwfree;
- long range = or_wordrecs[save_stemno].or_hwaddrs;
- UCHAR *bitvec_A = bitvecs [stemno_A];
- UCHAR *bitvec_B = bitvecs [stemno_B];
- UCHAR *bitvec_C = bitvecs [save_stemno];
- long byteno, recno;
- UCHAR bitmask;
- int parse_type;
- int got_a_colloc;
- char *stemp;
- DtSrHitword *hitwords_A, *hitwords_B;
- long hitwcount_A, hitwcount_B;
- long threshold_range;
- DB_ADDR dba;
- LLIST *bloblist;
- long a, b, offset_A, offset_B;
- /* First construct the set intersection (AND) of
- * each of the collocated terms in the colloc bitvec.
- */
- for (byteno = 0; byteno < bitveclen; byteno++)
- bitvec_C [byteno] = bitvec_A [byteno] & bitvec_B [byteno];
- if (debugging_boolsrch) {
- int i;
- fprintf (aa_stderr,
- PROGNAME"312 INTERSECT[%d] (colloc %d & %d):\n",
- save_stemno, stemno_A, stemno_B);
- for (i=0; i<bitveclen; i++) {
- fprintf (aa_stderr, " %02x", bitvec_C[i]);
- if (i > 22)
- break;
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- /* Read cleartext for each rec in intersection/colloc bitvec.
- * Get hitwords (hilite table) for each collocation term.
- * Switch off recs in bitvec where no term pairs are in
- * collocation range.
- */
- for (recno = 1; recno < tot_addr_count; recno++) {
- byteno = recno >> 3; /* divide by 8 */
- bitmask = 1 << (recno % 8);
- /* Skip zero bits */
- if ((bitvec_C[byteno] & bitmask) == 0)
- continue;
- /* Convert recno to vista database address.
- * Silently skip rec if dba doesn't exist.
- */
- dba = (recno - 1) * or_recslots + 2;
- if (dba >= or_maxdba) {
- RESET_BIT (bitvec_C, byteno, bitmask);
- continue;
- }
- dba |= (OR_D00 << 24);
- /* Silently skip records that have no document text */
- if ((bloblist = ve_getblobs (dba, vistano)) == NULL) {
- if (debugging_boolsrch) {
- fprintf (aa_stderr,
- PROGNAME"126 No blobs for recno=%ld byteno=%ld mask%02x\n",
- recno, byteno, bitmask);
- fflush (aa_stderr);
- }
- RESET_BIT (bitvec_C, byteno, bitmask);
- continue;
- }
- /* Uncompress record text into usrblk.cleartext */
- if (oe_unblob (bloblist) != OE_OK)
- return -1;
- /* Build 'hilite' table for stem A. If stem
- * can't be found in the record, silently skip it.
- * Otherwise save the table.
- */
- stemp = saveusr.stems [stemno_A];
- if (stemp[0] == STEM_CH) {
- parse_type = 'S';
- stemp++;
- }
- else
- parse_type = 'W';
- if (!hilite_cleartext (parse_type, stemp, 1)) {
- RESET_BIT (bitvec_C, byteno, bitmask);
- continue;
- }
- hitwords_A = usrblk.hitwords;
- hitwcount_A = usrblk.hitwcount;
- usrblk.hitwords = NULL;
- usrblk.hitwcount = 0;
- /* In the same way build 'hilite' table for stem B */
- stemp = saveusr.stems [stemno_B];
- if (stemp[0] == STEM_CH) {
- parse_type = 'S';
- stemp++;
- }
- else
- parse_type = 'W';
- if (!hilite_cleartext (parse_type, stemp, 1)) {
- RESET_BIT (bitvec_C, byteno, bitmask);
- free (hitwords_A);
- continue;
- }
- hitwords_B = usrblk.hitwords;
- hitwcount_B = usrblk.hitwcount;
- usrblk.hitwords = NULL;
- usrblk.hitwcount = 0;
- /* Compare the two hilite tables for range matches */
- got_a_colloc = FALSE;
- b = 0;
- for (a = 0; a < hitwcount_A; a++) {
- offset_A = hitwords_A[a].offset;
- threshold_range = offset_A + hitwords_A[a].length + range;
- for (; b < hitwcount_B; b++) {
- offset_B = hitwords_B[b].offset;
- /* Advance B to first entry past A's offset */
- if (offset_B <= offset_A )
- continue; /* ...the B loop */
- if (offset_B <= threshold_range)
- got_a_colloc = TRUE;
- break; /* ...the B loop */
- } /* end B loop */
- if (got_a_colloc || b >= hitwcount_B)
- break; /* ...the A loop */
- } /* end A loop */
- free (hitwords_A);
- free (hitwords_B);
- /* If no collocations found within range,
- * switch off rec in colloc bitvec.
- */
- if (!got_a_colloc)
- RESET_BIT (bitvec_C, byteno, bitmask);
- } /* end loop on each recno in intersection/colloc bitvec */
- return 0;
- } /* get_colloc_bitvec() */
- /****************************************/
- /* */
- /* read_stem_bitvec_WK */
- /* */
- /****************************************/
- /* First workproc after boolean_search().
- * Each iterative call loads one (save_stemno) real stem's bitvec.
- * After last stem bitvec loaded, sets up
- * call to next workproc in sequence.
- */
- static void read_stem_bitvec_WK (void)
- {
- long byteno;
- DB_ADDR d99recno;
- float weight;
- if (got_USR_STOPSRCH())
- return;
- /* Process collocation 'stems' */
- if (saveusr.stems [save_stemno] [0] == '@') {
- d99recno = get_colloc_bitvec();
- goto DONE_READING;
- }
- for ( d99recno = read_d99 (&or_wordrecs [save_stemno]);
- d99recno;
- d99recno = read_d99 (NULL)) {
- if (d99recno == -1) /* read error */
- break;
- /* Save low byte 'statistical weight' value.
- * It can only be 0 - 255.
- */
- if (do_stat_sort)
- weight = (float) (d99recno & 0x000000ff) + 1.0;
- d99recno = (d99recno >> 8) & 0x00ffffff;
- /* Set correct bit in bitvec.
- * The byte number is the recno divided by 8.
- * The bit number is the remainder after division by 8.
- */
- if ((byteno = d99recno >> 3) >= bitveclen) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolsrch, 32,
- "%s Database Error: %s '%s'\n"
- "in database '%s' has invalid d99 record number %ld.") ,
- PROGNAME"394",
- (usrblk.search_type == 'W') ?
- CATGETS(dtsearch_catd, MS_boolsrch, 33, "Word") :
- CATGETS(dtsearch_catd, MS_boolsrch, 34, "Stem of"),
- usrblk.stems [save_stemno],
- usrblk.dblk->label,
- d99recno);
- DtSearchAddMessage (msgbuf);
- d99recno = -1; /* force error return */
- goto DONE_READING;
- }
- bitvecs [save_stemno] [byteno] |= 1 << (d99recno % 8);
- /* Add to correct weight in weight vector.
- * IDF ranges between 1.0 and 20.0, and weight
- * is 1 - 256, so we're adding 1 - ~5000 to wtvec.
- */
- if (do_stat_sort)
- wtvec [d99recno] += weight * (float) idf [save_stemno];
- } /* end loop that retrieves every d99recno for curr stem */
- DONE_READING:
- if (debugging_boolsrch) {
- int i;
- if (debugging_boolsrch)
- fprintf (aa_stderr, PROGNAME"313 BITVEC[%d]:\n", save_stemno);
- for (i=0; i<bitveclen; i++) {
- fprintf (aa_stderr, " %02x", bitvecs[save_stemno][i]);
- if (i > 22)
- break;
- }
- fputc ('\n', aa_stderr);
- fflush (aa_stderr);
- }
- if (d99recno == 0) {
- /* Normal conclusion. Increment to next stem.
- * If not all stems have been read,
- * this is still the next workproc.
- * Otherwise the next workproc is the one
- * merging all bitvectors into the final
- * result bitvec using the truth table.
- */
- usrblk.retncode = OE_SEARCHING;
- if (++save_stemno < saveusr.stemcount)
- usrblk.workproc = read_stem_bitvec_WK;
- else
- usrblk.workproc = calc_result_bitvec_WK;
- }
- else
- /* d99recno must be -1 */
- usrblk.retncode = OE_SYSTEM_STOP;
- return;
- } /* read_stem_bitvec_WK() */
- /****************************************/
- /* */
- /* boolean_search */
- /* */
- /****************************************/
- /* Called from Opera_Engine after successful boolean_parse().
- * Expects valid globals: saveusr.stems, saveusr.stemcount,
- * usrblk.stems (contains original unstemmed query terms for msgs),
- * usrblk.search_type, final_truthtab, qry_has_no_NOTs,
- * and qry_is_all_ANDs.
- * Based on parts of the function ve_word_search().
- * Upon return, usrblk.retncode, msglist, etc is appropriately loaded.
- * Upon successful return usrblk.stems, usrblk.stemcount,
- * and dittolist are also loaded.
- */
- void boolean_search (void)
- {
- int i, j;
- size_t allocsz_needed;
- /* Sanity checks */
- if ( saveusr.stemcount <= 0 ||
- final_truthtab.pmsz <= 0 ||
- final_truthtab.pmsz >= 256 ) {
- fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_boolsrch, 35,
- "%s Program Error: stemct=%d pmsz=%d\n") ,
- PROGNAME"1404", saveusr.stemcount, final_truthtab.pmsz);
- DtSearchExit (14);
- }
- /*---------- Init globals ----------*/
- if (!msgbuf)
- msgbuf = austext_malloc (500, PROGNAME"393", NULL);
- debugging_boolsrch = (usrblk.debug & USRDBG_SRCHCMPL);
- need_zero_permute = (final_truthtab.permutes[0] == 0);
- do_stat_sort = ((usrblk.flags & USR_SORT_WHITL) != 0);
- check_dates = (usrblk.objdate1 || usrblk.objdate2);
- or_abstrsz = usrblk.dblk->dbrec.or_abstrsz;
- or_fzkeysz = usrblk.dblk->dbrec.or_fzkeysz;
- or_language = usrblk.dblk->dbrec.or_language;
- or_maxdba = usrblk.dblk->dbrec.or_maxdba;
- usrblk.flags &= ~USR_STOPSRCH; /* turn off stop button */
- saveusr.vistano = vistano = usrblk.dblk->vistano;
- saveusr.dittolist = NULL;
- saveusr.dittocount = 0L;
- saveusr.iterations = INIT_ITERATIONS;
- /*
- * saveusr.ktchars is a string holding
- * first char of desired record ids.
- */
- all_key_types = TRUE;
- for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++) {
- if (usrblk.dblk->keytypes[i].is_selected)
- saveusr.ktchars[j++] = usrblk.dblk->keytypes[i].ktchar;
- else
- all_key_types = FALSE;
- }
- saveusr.ktchars[j] = '\0';
- or_recslots = (long) (usrblk.dblk->dbrec.or_recslots);
- or_reccount = usrblk.dblk->dbrec.or_reccount;
- /* RECFRST is just to get the slot# (dba) of the
- * first real object record after the dbrec.
- * Currently the dbrec occupies only one slot,
- * the first (#1), so dba will usually be #2.
- */
- /********
- RECFRST(PROGNAME"2545", OR_OBJREC, saveusr.vistano);
- CRGET(PROGNAME"2546", &dba, saveusr.vistano);
- dba &= 0x00FFFFFF;
- ********/
- tot_addr_count = ((usrblk.dblk->dbrec.or_maxdba + 1) / or_recslots) + 1;
- bitveclen = (tot_addr_count >> 3) + 1;
- if (debugging_boolsrch) {
- fprintf (aa_stderr, PROGNAME"360 "
- "boolean_search: typ='%c' needzpm?=%d sort?=%d maxhits=%d\n"
- " maxdba=%ld recct=%ld recslts=%ld\n"
- " totnmadr=%ld bvln=%ld allkts?=%d ktchars='%s'\n"
- ,usrblk.search_type
- ,need_zero_permute
- ,do_stat_sort
- ,aa_maxhits
- ,(long) usrblk.dblk->dbrec.or_maxdba
- ,or_reccount
- ,or_recslots
- ,tot_addr_count
- ,bitveclen
- ,all_key_types
- ,saveusr.ktchars
- );
- fflush (aa_stderr);
- }
- /*---------- Read vista btree ----------
- * Load or_wordrecs[] array for each term in saveusr.stems.
- */
- if (!load_or_wordrecs())
- return;
- /* If statistically sorting final resultlist, calculate
- * idf (inverse document frequency) for each term using
- * the frequency data in or_wordrecs[].
- */
- if (do_stat_sort)
- calculate_idfs();
- /* Bitvector allocation. Number needed is one for each stem,
- * plus one extra to accumulate the result bitvector.
- */
- allocsz_needed = bitveclen * (saveusr.stemcount + 1);
- if (debugging_boolsrch)
- fprintf (aa_stderr, PROGNAME"430 "
- "bitvecs[] alloc needed=%lu (bvln=%ld stems=%d+1), have=%lu.\n",
- (unsigned long) allocsz_needed, bitveclen, saveusr.stemcount, (unsigned long) bitvec_allocsz);
- if (bitvec_allocsz < allocsz_needed) {
- if (bitvec_allocp)
- free (bitvec_allocp);
- bitvec_allocp = austext_malloc (allocsz_needed + 16,
- PROGNAME"508", NULL);
- if (debugging_boolsrch)
- fprintf (aa_stderr, PROGNAME"432 bitvecs[] realloc %lu-->%lu.\n",
- (unsigned long) bitvec_allocsz, (unsigned long) allocsz_needed);
- bitvec_allocsz = allocsz_needed;
- }
- /* Clear all bitvecs to zero and assign them */
- memset (bitvec_allocp, 0, allocsz_needed);
- for (i = 0; i < saveusr.stemcount; i++)
- bitvecs[i] = bitvec_allocp + (i * bitveclen);
- result_bitvec = bitvec_allocp + (i * bitveclen);
- /* If sorting statistically, allocate weight vector.
- * One float for each db record.
- */
- if (wtvec) {
- free (wtvec);
- wtvec = NULL;
- }
- if (do_stat_sort) {
- wtvec = austext_malloc ((tot_addr_count + 4) * sizeof(float) + 4,
- PROGNAME"040", NULL);
- memset (wtvec, 0, (tot_addr_count + 4) * sizeof(float));
- }
- /* The 'zero permute' is every record that has
- * NONE of the query terms in it. It can only be
- * generated if a NOT operator was included in the query.
- */
- if (need_zero_permute) {
- sprintf (msgbuf, CATGETS(dtsearch_catd, MS_boolsrch, 15,
- "%s Your query requires retrieving every\n"
- "document in the database that does not have any of\n"
- "your query words. This type of search may take an\n"
- "unusually long time."),
- PROGNAME"1536");
- DtSearchAddMessage (msgbuf);
- }
- if (debugging_boolsrch)
- fflush (aa_stderr);
- /* Searches may take a long time. To allow gui to put a
- * a 'working' dialog box and a 'cancel' button,
- * we pass execution to workprocs.
- * If user cannot cancel search no matter how
- * long it may take, we call each of the subsequent
- * workproc functions directly from here.
- * Otherwise they will themselves setup each
- * subsequent call to usrblk.workproc(), as long as
- * the previous call returns OE_SEARCHING and the user
- * hasn't pushed USR_STOPSRCH.
- */
- usrblk.workproc = read_stem_bitvec_WK;
- save_stemno = 0; /* global arg for first workproc */
- usrblk.workproc(); /* direct call to first workproc */
- if ((usrblk.flags & USR_NO_ITERATE) != 0 &&
- (usrblk.debug & USRDBG_ITERATE) == 0) {
- while (usrblk.retncode == OE_SEARCHING)
- usrblk.workproc();
- }
- return;
- } /* boolean_search() */
- /************************** BOOLSRCH.C **********************/
|