dtsrindex.c 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * COMPONENT_NAME: austext
  25. *
  26. * FUNCTIONS: descend_tree
  27. * displayable
  28. * fill_data1
  29. * load_into_bintree
  30. * main
  31. * print_exit_code
  32. * print_usage_msg
  33. * put_addrs_2_dtbs_addr_file
  34. * segregate_dicname
  35. * traverse_tree
  36. * user_args_processor
  37. * write_2_dtbs_addr_file
  38. * write_new_word_2_dtbs
  39. * write_to_file
  40. *
  41. * ORIGINS: 27
  42. *
  43. *
  44. * (C) COPYRIGHT International Business Machines Corp. 1992,1996
  45. * All Rights Reserved
  46. * Licensed Materials - Property of IBM
  47. * US Government Users Restricted Rights - Use, duplication or
  48. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  49. */
  50. /************************ DTSRINDEX.C *******************************
  51. * $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $
  52. * CDE version of borodin.c
  53. * Formerly dtsrindex.c was cborodin.c.
  54. *
  55. * INPUT FORMAT:
  56. * Text file in FZK format.
  57. * Each record contains 4 formatted 'lines' (text strings ending in \n):
  58. * 1. fzkey (not used in this program).
  59. * 2. abstract (not used in this program).
  60. * 3. unique database key for the record. Used to find the database
  61. * address of the record which is the reference for the inverted index.
  62. * 4. The record's date (not used in this program).
  63. *
  64. * The rest of the record is unformatted text (not necessarily organized
  65. * into 'lines'). It is read a character at a time and parsed into
  66. * individual words by the parser function for the database's language.
  67. * Each record ends with a delimiter string specified by command line arg.
  68. *
  69. * $Log$
  70. * Revision 2.8 1996/04/10 19:50:38 miker
  71. * Deleted dangerous and unnecessary -a option.
  72. *
  73. * Revision 2.7 1996/03/25 18:54:15 miker
  74. * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  75. *
  76. * Revision 2.6 1996/02/01 18:25:44 miker
  77. * AusText 2.1.11, DtSearch 0.3. Pass 1 changed to accommodate
  78. * new single-character reading parser/stemmers.
  79. *
  80. * Revision 2.5 1995/12/29 17:16:04 miker
  81. * Bug fix: Opened wrong msg catalog.
  82. *
  83. * Revision 2.4 1995/12/27 21:18:40 miker
  84. * Msg bug: 'percent done' was negative number.
  85. *
  86. * Revision 2.3 1995/12/01 16:15:44 miker
  87. * Deleted unnecessary log2 var, conflict with Solaris function.
  88. * Added -r command line arg.
  89. *
  90. * Revision 2.2 1995/10/26 15:26:53 miker
  91. * Added prolog.
  92. *
  93. * Revision 2.1 1995/09/22 19:29:53 miker
  94. * Freeze DtSearch 0.1, AusText 2.1.8
  95. *
  96. * Revision 1.3 1995/09/05 21:08:54 miker
  97. * Fixed bug: appeared as if 1 and 2 char 'words' were being indexed.
  98. * Added DEBUG_P switch.
  99. *
  100. * Revision 1.2 1995/09/01 22:17:02 miker
  101. * Fixed solaris segfault: too many args to printf in print_usage().
  102. *
  103. * Revision 1.1 1995/08/31 20:51:08 miker
  104. * Initial revision of dtsrindex.c, copied from cborodin.c.
  105. *
  106. * Log: cborodin.c,v
  107. * Revision 1.18 1995/05/30 18:58:54 miker
  108. * Correct bug introduced by previous fix (2.1.5c).
  109. *
  110. * Revision 1.17 1995/05/18 22:54:08 miker
  111. * 2.1.5b cborodin bug. Segfault due to overflowing bitvector
  112. * after many deletions and no mrclean.
  113. */
  114. #include <cde_config.h>
  115. #include <Dt/SearchP.h>
  116. #include <limits.h>
  117. #include <stdlib.h>
  118. #include <unistd.h>
  119. #include <string.h>
  120. #include <ctype.h>
  121. #include <time.h>
  122. #include <errno.h>
  123. #include <math.h>
  124. #include <sys/stat.h>
  125. #include <locale.h>
  126. #include "vista.h"
  /* Prototypes for routines called from this file. */
  extern void find_keyword (char *cur_word, int vista_num);
  extern void read_wordstr (struct or_hwordrec *glob_word, int vista_num);
  extern void write_wordstr (struct or_hwordrec *glob_word, int vista_num);
  extern void fill_data1 (char *ch);

  /* lib/DtSearch/vstfunct.c */
  void fillnew_wordrec (struct or_hwordrec *glob_word, int vista_num);

  /* lib/DtSearch/userint.c */
  void init_user_interrupt (void);

  /* Program name; also pasted in front of message numbers (PROGNAME"158"). */
  #define PROGNAME        "DTSRINDEX"

  /* Compiled-in defaults reported by the usage message. */
  #define BATCH_SIZE      10000L  /* default max batch size (-b) */
  #define WORDS_PER_DOT   500     /* initial value of words_per_dot */
  #define RECS_PER_DOT    20      /* default Pass 1 records-per-dot (-r) */
  #define INBUFSZ         1024    /* default input text header line size */

  /* Message catalog set numbers passed to CATGETS(). */
  #define MS_misc         1
  #define MS_cborodin     14
  141. /******************* BIT VECTORS *****************/
  142. DB_ADDR *word_addrs_ii; /* fread buf for d99 (= tot # dbas) */
  143. DtSrINT32 *dbas_word_count;
  144. char *dbas_bits_batch;
  145. DB_ADDR *record_addr_word;
  146. DtSrINT32 num_addrs_for_word;
  147. DtSrINT32 or_reccount;
  148. DtSrINT32 bit_vector_size;
  149. /*-------------------------- GLOBALS ----------------------------*/
  150. /* batch_size is also used by fileman.c when allocating unused holes,
  151. * in order not to go past the end of the 'record_addr_word' array.
  152. */
  153. extern DtSrINT32 batch_size;
  154. char buf[1024];
  155. static int cache_size = CACHE_SIZE;
  156. static int check_existing_addrs = TRUE;
  157. long count_word_ii = 0L;
  158. long dbkey_seqno = 0L;
  159. DBLK dblk;
  160. DBREC dbrec;
  161. static int debugging = 0;
  162. #define DEBUG_I 0x01 /* P1 tree insertions */
  163. #define DEBUG_P 0x10 /* P1 parser/stemmer */
  164. #define DEBUG_T 0x02 /* P2 tree dump (words) */
  165. #define DEBUG_N 0x04 /* P2 NEW words, vista */
  166. #define DEBUG_O 0x08 /* P2 OLD words, vista) */
  167. #define DEBUG_t 0x20 /* P2 tree dump (dbas) */
  168. #define DEBUG_n 0x40 /* P2 NEW d99 for new words */
  169. #define DEBUG_o 0x80 /* P2 OLD d99 updates for old words */
  170. static unsigned long
  171. default_hashsize;
  172. char dicname [10];
  173. char dicpath [_POSIX_PATH_MAX];
  174. static int dotcount = 0;
  175. char dtbs_addr_file [_POSIX_PATH_MAX];
  176. FILE *dtbs_addr_fp;
  177. long dtbs_size_records = 0L;
  178. static long duplicate_recids = 0L;
  179. struct stat fstat_input;
  180. FILE_HEADER fl_hdr;
  181. static char fname_input [_POSIX_PATH_MAX];
  182. struct or_hwordrec
  183. got_word;
  184. static FILE *instream;
  185. char *inbuf;
  186. int inbuf_overflowed = FALSE;
  187. size_t inbufsz = INBUFSZ;
  188. int is_pmr;
  189. static DtSrINT32
  190. or_maxdba = 0;
  191. static char msg_374[] = "\n%s Out of Memory!\n"
  192. " Split the incoming file into several "
  193. "smaller files and try again.\n";
  194. static char msg_776[] = "\n%s Write Failure d99 file: %s\n";
  195. char new_dtbs_file = FALSE;
  196. long num_of_diff_words = 0L;
  197. int normal_retncode = 0;
  198. static PARG parg;
  199. int parsep_char = END_RETAIN_PAGE;
  200. char rec_type;
  201. unsigned long record_count = 0UL;
  202. int record_lines;
  203. static int recs_per_dot = RECS_PER_DOT;
  204. static unsigned long
  205. seconds_left;
  206. extern int shutdown_now;
  207. static DtSrINT32
  208. or_recslots;
  209. char *sprintbuffer = NULL;
  210. char *temp = NULL;
  211. extern int debugging_teskey;
  212. time_t timestart = 0;
  213. time_t totalstart = 0;
  214. static int words_per_dot = WORDS_PER_DOT;
  215. /************************************************/
  216. /* */
  217. /* DBALIST */
  218. /* */
  219. /************************************************/
  220. typedef struct dba_str {
  221. DB_ADDR dba;
  222. DtSrINT32 w_c;
  223. struct dba_str *next_dba;
  224. } DBALIST;
  225. /************************************************/
  226. /* */
  227. /* TREENODE */
  228. /* */
  229. /************************************************/
  230. typedef struct _treen_ {
  231. char *word; /* ptr to word in stop list */
  232. struct _treen_ *llink; /* left link in binary tree */
  233. struct _treen_ *rlink; /* ptr to right link in binary tree */
  234. DBALIST *dba_list;
  235. } TREENODE;
  236. static TREENODE *root_node = NULL;
  237. static TREENODE *top_of_stack;
  238. static TREENODE *stack;
  239. static TREENODE *pres;
  240. static TREENODE *prev;
  241. static TREENODE *next;
  242. static TREENODE *avail_node;
  243. /************************************************/
  244. /* */
  245. /* displayable */
  246. /* */
  247. /************************************************/
  248. /* Returns static string same as passed string except nonprintable
  249. * and nonascii chars replaced by '^' for display.
  250. */
  251. static char *displayable (char *passed_string)
  252. {
  253. static char *buf = NULL;
  254. static size_t buflen = 0;
  255. size_t passed_len = strlen (passed_string);
  256. char *targ, *src;
  257. if (buflen < passed_len) {
  258. if (buf)
  259. free (buf);
  260. buflen = passed_len;
  261. buf = austext_malloc (buflen + 4, PROGNAME"158", NULL);
  262. }
  263. targ = buf;
  264. for (src = passed_string; *src != 0; src++) {
  265. if (*src >= 32 && *src < 127)
  266. *targ++ = *src;
  267. else
  268. *targ++ = '^';
  269. }
  270. *targ = 0;
  271. return buf;
  272. } /* displayable() */
  273. /************************************************/
  274. /* */
  275. /* print_exit_code */
  276. /* */
  277. /************************************************/
  278. /* Called from inside DtSearchExit() at (*austext_exit_last)() */
  279. static void print_exit_code (int exit_code)
  280. {
  281. if(dotcount) {
  282. putchar ('\n');
  283. dotcount = 0;
  284. }
  285. /* Put total seconds into totalstart */
  286. if (totalstart > 0)
  287. totalstart = time (NULL) - totalstart;
  288. printf (CATGETS(dtsearch_catd, MS_cborodin, 206,
  289. "%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"),
  290. aa_argv0, exit_code, totalstart / 60L, totalstart % 60L);
  291. return;
  292. } /* print_exit_code() */
  293. /****************************************/
  294. /* */
  295. /* write_to_file() */
  296. /* */
  297. /****************************************/
  298. /* This is the 'visit node' point for the tree traversal
  299. * functions of Pass 2 (traverse_tree() and descend_tree()).
  300. *
  301. * Each tree node = word or stem + linked list of dbas.
  302. * When called, each dba list member just contains the number
  303. * of times the token appears in that document. This function
  304. * chains through the list, builds a statistical 'weight'
  305. * for each doc/word pair, and stores it as a reformatted 'dba'
  306. * in array 'record_addr_word[]', in 'host' byte swap order.
  307. * The count of the current number of addrs
  308. * in the array is stored in 'num_addrs_for_word'.
  309. * Fill_data1() is then called to update or write a new
  310. * vista record and d99 data for the token.
  311. *
  312. * The weight stored for each doc-word instance is 1 byte.
  313. * It's the ratio of log of number of times given word occurs in doc,
  314. * divided by log of total count of all words in doc,
  315. * scaled to range 0 to 255.
  316. * Fundamentally it's a word count of that word in the doc,
  317. * but adjusted as follows:
  318. * 1) Large occurrences in small documents weigh more than
  319. * the same number of occurrences in large documents.
  320. * 2) Taking the log skews the ratio to be more linear,
  321. * ie take advantage of higher ranges of the 'weight'.
  322. * For example a word that occurs in 10% of the document,
  323. * will have a weight of .5 (50%).
  324. * 3) The scaling changes the ratio, a float between 0. and .9999,
  325. * to an integer between 0 and 255.
  326. */
  327. void write_to_file (TREENODE * output_node)
  328. {
  329. DBALIST *print_dba;
  330. DB_ADDR mydba;
  331. /* 'record_addr_word[]' was permanently allocated
  332. * with a size = max batch size so it can hold
  333. * all the addrs for a single word node in the tree.
  334. * In effect it will replace the dba linked list.
  335. * Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[].
  336. */
  337. if (debugging & (DEBUG_T | DEBUG_t)) { /* Print out tree node */
  338. printf (" node '%s' %c%c%c",
  339. displayable(output_node->word),
  340. (output_node->llink)? 'L' : '.',
  341. (output_node->rlink)? 'R' : '.',
  342. (debugging & DEBUG_t)? '\n' : ' ');
  343. }
  344. num_addrs_for_word = 0; /* DtSrINT32 */
  345. print_dba = output_node->dba_list;
  346. while (print_dba != NULL) {
  347. mydba = print_dba->dba;
  348. if (debugging & DEBUG_t)
  349. printf (" dba #%ld: node adr=%ld cnt=%ld",
  350. (long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c);
  351. record_addr_word [num_addrs_for_word] =
  352. mydba << 8; /* rec# in hi 3 bytes */
  353. record_addr_word [num_addrs_for_word] +=
  354. (log ((double) (print_dba->w_c) + 0.5) /
  355. log ((double) (dbas_word_count[mydba] + 1))) * 256;
  356. if (debugging & DEBUG_t)
  357. printf (" -> x%lx (%ld:%ld)\n",
  358. (long)record_addr_word [num_addrs_for_word],
  359. (long)record_addr_word [num_addrs_for_word] >> 8,
  360. (long)record_addr_word [num_addrs_for_word] & 0xffL);
  361. print_dba = print_dba->next_dba;
  362. num_addrs_for_word++;
  363. if (num_addrs_for_word >= batch_size) {
  364. printf (CATGETS(dtsearch_catd, MS_cborodin, 280,
  365. "\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"),
  366. PROGNAME"280", (long)num_addrs_for_word, (long)batch_size);
  367. DtSearchExit (91);
  368. }
  369. }
  370. if ((debugging & DEBUG_T) && !(debugging & DEBUG_t))
  371. printf (" dbacnt=%ld\n", (long)num_addrs_for_word);
  372. fill_data1 (output_node->word);
  373. return;
  374. } /* write_to_file() */
  375. /****************************************/
  376. /* */
  377. /* descend_tree() */
  378. /* */
  379. /****************************************/
  380. /* Coroutine of traverse_tree(), Pass 2 Robson tree traversal.
  381. * The write_to_file() function is the 'preorder visit' point.
  382. */
  383. void descend_tree (void)
  384. {
  385. int not_done = TRUE;
  386. while (not_done) {
  387. if ((pres->llink == NULL) && (pres->rlink == NULL)) {
  388. write_to_file (pres);
  389. avail_node = pres;
  390. return;
  391. }
  392. if (pres->llink != NULL) {
  393. next = pres->llink;
  394. pres->llink = prev;
  395. prev = pres;
  396. pres = next;
  397. }
  398. else {
  399. write_to_file (pres);
  400. next = pres->rlink;
  401. pres->rlink = prev;
  402. prev = pres;
  403. pres = next;
  404. }
  405. }
  406. return;
  407. } /* descend_tree() */
  408. /********************************/
  409. /* */
  410. /* traverse_tree */
  411. /* */
  412. /********************************/
  413. /* This is the actual Pass 2 function, a tree traversal
  414. * of Pass 1's word-dba binary tree.
  415. * The algorithm is based on the J. M. ROBSON link inversion traversal
  416. * algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78.
  417. * The write_to_file() function is the 'preorder visit' point.
  418. */
  419. void traverse_tree (void)
  420. {
  421. int not_done = TRUE;
  422. int descend = TRUE;
  423. /* Dheck for the empty tree */
  424. if (root_node == NULL) {
  425. printf (CATGETS(dtsearch_catd, MS_cborodin, 288,
  426. "%s Abort. There are no words in the input file %s.\n"),
  427. PROGNAME"288", fname_input);
  428. DtSearchExit (34);
  429. }
  430. /* Initialize the variables */
  431. pres = root_node;
  432. prev = pres;
  433. top_of_stack = NULL;
  434. stack = NULL;
  435. while (not_done) {
  436. if (descend) {
  437. descend_tree ();
  438. }
  439. if (pres == root_node) {
  440. return;
  441. }
  442. if (prev->rlink == NULL) {
  443. write_to_file (prev);
  444. next = prev->llink;
  445. prev->llink = pres;
  446. pres = prev;
  447. prev = next;
  448. descend = FALSE;
  449. }
  450. else {
  451. if (prev->llink == NULL) {
  452. next = prev->rlink;
  453. prev->rlink = pres;
  454. pres = prev;
  455. prev = next;
  456. descend = FALSE;
  457. }
  458. else {
  459. if (prev == top_of_stack) {
  460. next = stack;
  461. top_of_stack = stack->rlink;
  462. stack = stack->llink;
  463. next->llink = NULL;
  464. next->rlink = NULL;
  465. next = prev->llink;
  466. prev->llink = prev->rlink;
  467. prev->rlink = pres;
  468. pres = prev;
  469. prev = next;
  470. descend = FALSE;
  471. }
  472. else {
  473. write_to_file (prev);
  474. avail_node->llink = stack;
  475. avail_node->rlink = top_of_stack;
  476. stack = avail_node;
  477. top_of_stack = prev;
  478. next = prev->rlink;
  479. prev->rlink = pres;
  480. pres = next;
  481. descend = TRUE;
  482. }
  483. }
  484. }
  485. }
  486. } /* traverse_tree() */
  487. /********************************************************/
  488. /* */
  489. /* print_usage_msg */
  490. /* */
  491. /********************************************************/
  492. static void print_usage_msg (void)
  493. {
  494. printf (CATGETS(dtsearch_catd, MS_cborodin, 17,
  495. "\n"
  496. "USAGE: %s -d<dbname> [options] <infile>\n"
  497. " Listed default file name extensions can be overridden.\n"
  498. " -d<dbname> 1 - 8 character database name, include optional path prefix.\n"
  499. " -t<etxstr> End of text document delimiter string. Default '\\f\\n'.\n"
  500. " -r<N> Change Pass 1 records-per-dot from %d to <N>.\n"
  501. " -b<N> Change max batch size from %ld to <N>.\n"
  502. " -c<N> Change database paging cache from %ld 1K pages to <N> 1K pages.\n"
  503. " <N> >= 16 by powers of 2. Initially try only small changes.\n"
  504. " -i<N> Change (i)nput buffer size from default %d to <N>.\n"
  505. " -h<N> Change duplicate record id hash table size from %ld to <N>.\n"
  506. " -h0 means there are no duplicates, do not check for them.\n"
  507. " <infile> Input [path]file name. Default extension %s.\n"),
  508. aa_argv0,
  509. (int) RECS_PER_DOT,
  510. (long) BATCH_SIZE, (long) CACHE_SIZE,
  511. (int) INBUFSZ, default_hashsize, EXT_FZKEY);
  512. return;
  513. } /* print_usage_msg() */
  514. /********************************************************/
  515. /* */
  516. /* segregate_dicname */
  517. /* */
  518. /********************************************************/
  519. /* Separates dictionary name from pathname and loads
  520. * them into the globals 'dicname' and 'dicpath'.
  521. * Returns TRUE if dicname is valid, else returns FALSE.
  522. */
  523. static int segregate_dicname (char *string)
  524. {
  525. char mybuf[_POSIX_PATH_MAX];
  526. char *ptr;
  527. int i;
  528. strncpy (mybuf, string, sizeof (mybuf));
  529. mybuf[sizeof (mybuf) - 1] = 0;
  530. /*
  531. * Set 'ptr' to just the 8 char dictionary name by moving
  532. * it backwards until first non-alphanumeric character
  533. * (such as a ":" in the dos drive id or a slash between directories),
  534. * or to the beginning of string.
  535. */
  536. for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--)
  537. if (!isalnum (*ptr)) {
  538. ptr++;
  539. break;
  540. }
  541. if (ptr < mybuf)
  542. ptr = mybuf;
  543. /* test for valid dictionary name */
  544. i = strlen (ptr);
  545. if (i < 1 || i > 8)
  546. return FALSE;
  547. strcpy (dicname, ptr);
  548. *ptr = 0;
  549. strncpy (dicpath, mybuf, sizeof (dicpath));
  550. dicpath[sizeof (dicpath) - 1] = 0;
  551. return TRUE;
  552. } /* segregate_dicname() */
  553. /********************************************************/
  554. /* */
  555. /* USER_ARGS_PROCESSOR */
  556. /* */
  557. /********************************************************/
  558. /* handles command line arguments for 'main' */
  559. void user_args_processor (int argc, char **argv)
  560. {
  561. char *argptr;
  562. char *targ, *src;
  563. int i;
  564. if (argc <= 1) {
  565. print_usage_msg ();
  566. DtSearchExit (2);
  567. }
  568. /* Initialize some variables prior to parsing command line */
  569. dicname[0] = 0;
  570. dicpath[0] = 0;
  571. /* Each pass grabs new parm of "-xxx" format */
  572. while (--argc > 0 && (*++argv)[0] == '-') {
  573. argptr = argv[0];
  574. switch (argptr[1]) {
  575. case 't': /* ETX delimiter string */
  576. /* Replace any "\n" string with real linefeed */
  577. targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
  578. src = argptr + 2;
  579. while (*src) {
  580. if (src[0] == '\\' && src[1] == 'n') {
  581. *targ++ = '\n';
  582. src += 2;
  583. }
  584. else
  585. *targ++ = *src++;
  586. }
  587. *targ = 0;
  588. break;
  589. case 'r':
  590. if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
  591. printf (CATGETS(dtsearch_catd, MS_cborodin, 577,
  592. "%s Invalid arg '%s'. Using default -r%d.\n"),
  593. PROGNAME"577", argptr, RECS_PER_DOT);
  594. recs_per_dot = RECS_PER_DOT;
  595. }
  596. break;
  597. case 'h':
  598. duprec_hashsize = atol (argptr + 2);
  599. if (duprec_hashsize == 0UL)
  600. printf (CATGETS(dtsearch_catd, MS_cborodin, 539,
  601. "%s Duplicate record id checking disabled.\n"),
  602. PROGNAME"539");
  603. break;
  604. case 'b':
  605. batch_size = atol (argptr + 2);
  606. if (batch_size <= 0L) {
  607. printf (CATGETS(dtsearch_catd, MS_cborodin, 595,
  608. "%s Invalid batch size argument '%s'.\n"),
  609. PROGNAME"595", argptr);
  610. goto BADPARM;
  611. }
  612. break;
  613. case 'c':
  614. cache_size = atoi (argptr + 2);
  615. if (cache_size < 16) {
  616. /* minimum size is 16 */
  617. if (cache_size > 0)
  618. cache_size = 16;
  619. /* on error reset size to default */
  620. else
  621. cache_size = CACHE_SIZE;
  622. CACHE_ADJUSTED:
  623. printf (CATGETS(dtsearch_catd, MS_cborodin, 600,
  624. "%sCache size readjusted to %d.\n"),
  625. PROGNAME "600 ", cache_size);
  626. break;
  627. }
  628. /* If necessary, round up to nearest power of 2 */
  629. for (i = 4; i < 12; i++)
  630. if (1 << i >= cache_size)
  631. break;
  632. i = 1 << i;
  633. if (i != cache_size) {
  634. cache_size = i;
  635. goto CACHE_ADJUSTED;
  636. }
  637. break;
  638. case 'D': /* unadvertised debugging feature */
  639. for (i = 2; argptr[i] != 0; i++) {
  640. switch (argptr[i]) {
  641. case 'I': debugging |= DEBUG_I; break;
  642. case 'P': debugging |= DEBUG_P;
  643. /******* debugging_teskey = TRUE; ******/
  644. break;
  645. case 'N': debugging |= DEBUG_N; break;
  646. case 'n': debugging |= DEBUG_n; break;
  647. case 'O': debugging |= DEBUG_O; break;
  648. case 'o': debugging |= DEBUG_o; break;
  649. case 'T': debugging |= DEBUG_T; break;
  650. case 't': debugging |= DEBUG_t; break;
  651. default: goto BADPARM;
  652. }
  653. }
  654. break;
  655. case 'd':
  656. /* May include both dicname and dicpath */
  657. if (!segregate_dicname (argptr + 2)) {
  658. printf (CATGETS(dtsearch_catd, MS_cborodin, 550,
  659. "%s '%s' is invalid path/database name.\n"),
  660. PROGNAME"550", argptr);
  661. goto BADPARM;
  662. }
  663. break;
  664. case 'i': /* (I)nput buffer size */
  665. if ((inbufsz = atol (argptr + 2)) <= 0) {
  666. printf (CATGETS(dtsearch_catd, MS_cborodin, 558,
  667. "%s Invalid input buffer size '%s'.\n"),
  668. PROGNAME"558", argptr);
  669. goto BADPARM;
  670. }
  671. break;
  672. default:
  673. printf (CATGETS(dtsearch_catd, MS_cborodin, 567,
  674. "%s Unknown command line argument '%s'.\n"),
  675. PROGNAME"567", argptr);
  676. BADPARM:
  677. print_usage_msg ();
  678. DtSearchExit (2); /* abort */
  679. } /* endswitch */
  680. } /* endwhile for cmd line '-'processing */
  681. /* Validate input file name */
  682. if (argc-- <= 0) {
  683. printf (CATGETS(dtsearch_catd, MS_cborodin, 580,
  684. "%s Missing required input file name.\n"),
  685. PROGNAME"580");
  686. goto BADPARM;
  687. }
  688. /* Don't incr argv yet--save input file name */
  689. else
  690. append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY);
  691. /* Check for missing database name */
  692. if (dicname[0] == 0) {
  693. printf (CATGETS(dtsearch_catd, MS_cborodin, 589,
  694. "%s No database name specified (-d argument).\a\n"),
  695. PROGNAME"589");
  696. goto BADPARM;
  697. }
  698. strcpy (dblk.name, dicname);
  699. dblk.path = dicpath;
  700. return;
  701. } /* user_args_processor() */
  702. /****************************************/
  703. /* */
  704. /* put_addrs_2_dtbs_addr_file */
  705. /* */
  706. /****************************************/
  707. /* Subroutine of write_2_dtbs_addr_file() from Pass 2.
  708. * That function has used a bit vector to determine
  709. * the total change in old d99 addrs for preexisting words,
  710. * and prepared for writing an array of old dbas that
  711. * are not in the current words tree node (globally named
  712. * word_addrs_ii [num_addrs]).
  713. * The addrs that ARE in the Pass 1 node fzk file were previously
  714. * prepared in a similar array of dbas, globally named
  715. * record_addr_word [num_addrs_for_word] but passed here as
  716. * 'addrs_array' and 'nitems'.
  717. * Both arrays will be byte swapped from 'host' to
  718. * 'network' order in this function.
  719. * This function does the actual fwrite of both arrays to the d99.
  720. * If the number of new addrs can fit in the available free slots,
  721. * it rewrites to original offset, otherwise appends to end of d99.
  722. */
  723. static void put_addrs_2_dtbs_addr_file (
  724. DB_ADDR *addrs_array,
  725. DtSrINT32 nitems)
  726. {
  727. FREE_SPACE_STR *free_slot;
  728. FREE_SPACE_STR del_rec;
  729. DtSrINT32 int32;
  730. DtSrINT32 num_writes;
  731. DtSrINT32 num_addrs;
  732. if (nitems >= batch_size) {
  733. printf ( CATGETS(dtsearch_catd, MS_cborodin, 6,
  734. "put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
  735. (int)nitems, (long)batch_size);
  736. DtSearchExit (58);
  737. }
  738. num_addrs = got_word.or_hwaddrs;
  739. got_word.or_hwaddrs += nitems; /** somehow, this can exceed total
  740. **** num addrs in database by 1 (!?) ******/
  741. /* (...only if prev 'overlay/compression' didn't delete all) */
  742. /* Put both arrays in 'network' byte order */
  743. for (int32 = 0; int32 < nitems; int32++)
  744. HTONL (addrs_array[int32]);
  745. for (int32 = 0; int32 < num_addrs; int32++)
  746. HTONL (word_addrs_ii[int32]);
  747. /*
  748. * If number of new addresses greater than number of free holes,
  749. * find new free slot that is big enough to hold the data .
  750. */
  751. if (nitems > got_word.or_hwfree) {
  752. /* Discard old slot, find new one. */
  753. del_rec.hole_size = num_addrs + got_word.or_hwfree;
  754. del_rec.offset = got_word.or_hwoffset;
  755. free_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
  756. add_free_space (&del_rec, &fl_hdr);
  757. if (free_slot == NULL) {
  758. fseek (dtbs_addr_fp, 0L, SEEK_END);
  759. got_word.or_hwoffset = ftell (dtbs_addr_fp);
  760. got_word.or_hwfree = 0;
  761. }
  762. else {
  763. fseek (dtbs_addr_fp, free_slot->offset, SEEK_SET);
  764. got_word.or_hwoffset = free_slot->offset;
  765. got_word.or_hwfree = free_slot->hole_size -
  766. got_word.or_hwaddrs;
  767. }
  768. /*----- Write new database addresses to a file -----*/
  769. num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
  770. (size_t)nitems, dtbs_addr_fp);
  771. if (num_writes != nitems) {
  772. DtSearchExit (98);
  773. }
  774. /* Copy the old addresses immediately after the new ones */
  775. num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs,
  776. dtbs_addr_fp);
  777. if (num_writes != num_addrs) {
  778. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  779. PROGNAME"776", strerror(errno));
  780. DtSearchExit (76);
  781. }
  782. /* Write foxes to the free holes, if any, no byte swap */
  783. for (int32 = 0; int32 < got_word.or_hwfree; int32++)
  784. addrs_array [int32] = 0xFFFFFFFF;
  785. num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
  786. (size_t)got_word.or_hwfree, dtbs_addr_fp);
  787. if (num_writes != got_word.or_hwfree) {
  788. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  789. PROGNAME"786", strerror(errno));
  790. DtSearchExit (86);
  791. }
  792. } /* end if (nitems > got_word.or_hwfree), had to get bigger slot */
  793. /* Else can reuse existing slot.
  794. * Write the new addresses into free holes.
  795. * The remaining free holes should already have foxes. (?)
  796. */
  797. else {
  798. fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
  799. num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
  800. (size_t)nitems, dtbs_addr_fp);
  801. if (num_writes != nitems) {
  802. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  803. PROGNAME"798", strerror(errno));
  804. DtSearchExit (87);
  805. }
  806. /* Copy the old addresses immediately after the new ones */
  807. num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR),
  808. (size_t)num_addrs, dtbs_addr_fp);
  809. if (num_writes != num_addrs) {
  810. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  811. PROGNAME"889", strerror(errno));
  812. DtSearchExit (89);
  813. }
  814. got_word.or_hwfree -= nitems;
  815. }
  816. } /* put_addrs_2_dtbs_addr_file() */
  817. /****************************************/
  818. /* */
  819. /* write_2_dtbs_addr_file */
  820. /* */
  821. /****************************************/
  822. /* Subroutine of fill_data1() from Pass 2.
  823. * Updates OLD (preexisting) word's d99 file.
  824. *
  825. * The vista word rec has already been read into global 'got_word'.
  826. * record_addr_word [num_addrs_for_word] is the array of dba's
  827. * for docs from this batch that contain the current word (built by
  828. * fill_data1 from the dba_list for the word's Pass 1 binary tree node,
  829. * and still in 'host' byte swap order).
  830. * This function freads all the old addresses for that word from
  831. * the d99 file. It then deletes(!) d99 addrs that
  832. * are in the word's Pass 1 tree node. It then calls
  833. * put_addrs_2_dtbs_addr_file() to fwrite out the
  834. * dba's in the tree, which are either brand new,
  835. * or are 'updating' the deleted addrs.
  836. * Then it writes the modified old addrs.
  837. * Then rewrites vista word rec with new data.
  838. *
  839. * The bit vector dbas_bits_batch contains a 1 bit
  840. * for every dba for every doc in the fzk file.
  841. * got_word structure:
  842. * .or_hwordkey - the word. (always in a 'huge' word buffer).
  843. * .or_hwoffset - offset in a d99 inverted index file for
  844. * a given word. the first address starts
  845. * at this position.
  846. * .or_hwaddrs - total number of addresses for a given word.
  847. * .or_hwfree - number of free slots in a database
  848. * addresses file for a given word.
  849. */
  850. void write_2_dtbs_addr_file (void)
  851. {
  852. DtSrINT32 num_addrs_ii;
  853. DtSrINT32 num_reads;
  854. DtSrINT32 i_start, k, cur_ind = 0;
  855. DtSrINT32 num_delete_addrs = 0;
  856. char addrs_removed = FALSE;
  857. DtSrINT32 i;
  858. DtSrINT32 cur_byte;
  859. char bit_addrs;
  860. DB_ADDR temp1;
  861. if (debugging & DEBUG_O)
  862. printf (" old vis '%s' ofs=%ld adr=%ld fre=%ld\n",
  863. displayable(got_word.or_hwordkey),
  864. (long) got_word.or_hwoffset,
  865. (long) got_word.or_hwaddrs,
  866. (long) got_word.or_hwfree);
  867. num_addrs_ii = got_word.or_hwaddrs;
  868. if (num_addrs_ii > or_reccount) {
  869. printf (CATGETS(dtsearch_catd, MS_cborodin, 713,
  870. "\n%s Word '%s' occurs in %ld records,\n"
  871. " but there are only %ld records in database!\n"
  872. " (This may be a good candidate for the stoplist).\n"),
  873. PROGNAME"713",
  874. (long) got_word.or_hwordkey,
  875. (long) num_addrs_ii,
  876. (long) or_reccount);
  877. DtSearchExit (68);
  878. }
  879. if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0)
  880. {
  881. printf (CATGETS(dtsearch_catd, MS_cborodin, 875,
  882. "\n%s Could not fseek d99 file to offset %ld.\n"),
  883. PROGNAME"875", got_word.or_hwoffset);
  884. DtSearchExit (98);
  885. }
  886. num_reads = fread (word_addrs_ii, sizeof(DB_ADDR),
  887. (size_t)num_addrs_ii, dtbs_addr_fp);
  888. if (num_reads != num_addrs_ii) {
  889. printf (CATGETS(dtsearch_catd, MS_cborodin, 848,
  890. "\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n"
  891. " at offset %ld. Number of dba's read (return code) = %ld.\n"),
  892. PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii,
  893. (long)got_word.or_hwoffset, (long)num_reads);
  894. DtSearchExit (98);
  895. }
  896. for (i = 0; i < num_addrs_ii; i++)
  897. NTOHL (word_addrs_ii[i]);
  898. /* Now both addr arrays are in 'host' byte swap order */
  899. /* If there are only new docs,
  900. * this switch will prevent the checking for updates.
  901. */
  902. if (check_existing_addrs) {
  903. i_start = 0;
  904. /* Loop on every preexisting dba for word as read from d99 */
  905. for (i = 0; i < num_addrs_ii; i++) {
  906. if (debugging & DEBUG_o)
  907. printf (" old d99 %ld: x%lx(%ld:%ld)",
  908. (long) i,
  909. (long) word_addrs_ii[i],
  910. (long) word_addrs_ii[i] >> 8,
  911. (long) word_addrs_ii[i] & 0xffL);
  912. /* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits)
  913. * to right over stat wt byte. D99 rec#'s start at 1,
  914. * so subtract 1 to start at 0 for bit vector.
  915. */
  916. temp1 = (*(word_addrs_ii + i) >> 8) - 1; /* = rec#, base 0 */
  917. cur_byte = temp1 >> 3; /* get matching byte# in bit vector */
  918. if (cur_byte >= bit_vector_size) {
  919. printf ( CATGETS(dtsearch_catd, MS_cborodin, 9,
  920. "\n%s Corrupted d99 file for word '%s',\n"
  921. " database address %ld @ file position %ld => bitvector[%ld],"
  922. " but max bitvector allocation = %ld.\n") ,
  923. PROGNAME"727", displayable(got_word.or_hwordkey),
  924. (long)temp1, (long)i,
  925. (long)cur_byte, (long)bit_vector_size);
  926. DtSearchExit (69);
  927. }
  928. bit_addrs = 0;
  929. bit_addrs |= 1 << (temp1 % 8); /* bit mask */
  930. /*
  931. * If this dba, which is on the current word's old d99
  932. * addrs list, is also a doc in the fzk file (dbas_bits_batch),
  933. * delete it from the d99 list by writing subsequent dba's
  934. * over it. Boy this recursive nested loop has gotta be slow.
  935. * Faster algorithm? Add 'good' addrs to the end of
  936. * record_addr_word[]. No nested overlay loop, only one write!
  937. */
  938. if (bit_addrs & (*(dbas_bits_batch + cur_byte))) {
  939. addrs_removed = TRUE;
  940. num_delete_addrs++;
  941. if (i_start == 0) {
  942. cur_ind = i;
  943. i_start = i + 1;
  944. }
  945. else {
  946. if (i_start < i) {
  947. /* compress: move good addrs over
  948. * space of deleted ones */
  949. for (k = i_start; k < i; k++) {
  950. word_addrs_ii[cur_ind] = word_addrs_ii[k];
  951. cur_ind++;
  952. }
  953. }
  954. i_start = i + 1;
  955. }
  956. } /* end if where dba is on both fzk list and curr d99 */
  957. } /* end loop on every d99 addr for this word */
  958. if (addrs_removed) { /* final overlay compression */
  959. if (i_start < i) {
  960. /* compress: move good addrs over
  961. * space of deleted ones */
  962. for (k = i_start; k < i; k++) {
  963. word_addrs_ii[cur_ind] = word_addrs_ii[k];
  964. cur_ind++;
  965. }
  966. }
  967. }
  968. } /* end if (check_existing_addrs) */
  969. got_word.or_hwaddrs -= num_delete_addrs;
  970. got_word.or_hwfree += num_delete_addrs;
  971. /* The old dba array word_addrs_ii[] is now 'compressed',
  972. * it contains only addrs not in fzk file.
  973. * And the vista rec 'got_word' now matches it.
  974. * And record_addr_word[] still contains
  975. * the new/updated addrs from the fzk file.
  976. * Now Efim calls a func to write them both back out to d99 file.
  977. */
  978. put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word);
  979. write_wordstr (&got_word, 0); /* update vista WORD rec */
  980. return;
  981. } /* write_2_dtbs_addr_file() */
  982. /********************************/
  983. /* */
  984. /* write_new_word_2_dtbs */
  985. /* */
  986. /********************************/
  987. /* Subroutine of fill_data1() in Pass 2 for a NEW word.
  988. * Writes d99 data, and updates (empty) got_word vista record.
  989. * record_addr_word [num_addrs_for_word] is the array of addrs
  990. * for docs from this batch that contain the current word (built by
  991. * fill_data1 from the dba_list for the word's Pass 1 binary tree node).
  992. * It will be byte swapped from 'host' to 'network' order in this function.
  993. */
  994. void write_new_word_2_dtbs (void)
  995. {
  996. FREE_SPACE_STR *free_slot;
  997. DtSrINT32 num_writes;
  998. int ret_fseek;
  999. DtSrINT32 int32;
  1000. if (debugging & (DEBUG_n | DEBUG_N))
  1001. printf (" new word '%s', adrs=%ld,",
  1002. got_word.or_hwordkey, (long)num_addrs_for_word);
  1003. free_slot = find_free_space (num_addrs_for_word, &fl_hdr);
  1004. if (free_slot == NULL) {
  1005. /* append addrs to end of d99 file */
  1006. ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END);
  1007. got_word.or_hwoffset = ftell (dtbs_addr_fp);
  1008. got_word.or_hwfree = 0;
  1009. if (debugging & (DEBUG_n | DEBUG_N))
  1010. printf ("APPEND ofs=%ld, fre=0\n", (long int) got_word.or_hwoffset);
  1011. }
  1012. else {
  1013. ret_fseek = fseek (dtbs_addr_fp,
  1014. (long)free_slot->offset, SEEK_SET);
  1015. got_word.or_hwoffset = free_slot->offset;
  1016. got_word.or_hwfree = free_slot->hole_size -
  1017. num_addrs_for_word;
  1018. if (debugging & (DEBUG_n | DEBUG_N))
  1019. printf (" REUSE slot ofs=%ld, fre=%ld\n",
  1020. (long int) got_word.or_hwoffset, (long int) got_word.or_hwfree);
  1021. }
  1022. /***** Write new database addresses to d99 file *********/
  1023. if (debugging & DEBUG_n) {
  1024. for (int32 = 0; int32 < num_addrs_for_word; int32++) {
  1025. printf (" dba #%ld: x%lx(%ld:%ld)\n",
  1026. (long)int32,
  1027. (long)record_addr_word[int32],
  1028. (long)record_addr_word[int32] >> 8,
  1029. (long)record_addr_word[int32] & 0xffL);
  1030. }
  1031. }
  1032. /* Put addr array in 'network' byte order */
  1033. for (int32 = 0; int32 < num_addrs_for_word; int32++)
  1034. HTONL (record_addr_word[int32]);
  1035. num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
  1036. (size_t)num_addrs_for_word, dtbs_addr_fp);
  1037. if (num_writes != num_addrs_for_word)
  1038. DtSearchExit (97);
  1039. got_word.or_hwaddrs = num_addrs_for_word;
  1040. if (got_word.or_hwfree != 0) {
  1041. /* Fill unused free holes with foxes for debugging.
  1042. * Note that byte swap is unnecessary for foxes.
  1043. * Note that record_addr_word is now available for this action.
  1044. */
  1045. for (int32 = 0; int32 < got_word.or_hwfree; int32++)
  1046. *(record_addr_word + int32) = 0xFFFFFFFF;
  1047. num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
  1048. (size_t)got_word.or_hwfree, dtbs_addr_fp);
  1049. if (num_writes != got_word.or_hwfree) {
  1050. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  1051. PROGNAME"960", strerror(errno));
  1052. DtSearchExit (96);
  1053. }
  1054. }
  1055. /* Save changed word_info structure back to the vista database! */
  1056. write_wordstr (&got_word, 0);
  1057. return;
  1058. } /* write_new_word_2_dtbs() */
  1059. /************************/
  1060. /* */
  1061. /* fill_data1 */
  1062. /* */
  1063. /************************/
  1064. /* Called from write_to_file() in Pass 2.
  1065. * Write_to_file() is 'visit node' function of tree traversal.
  1066. * It has converted dbalist in each word node in tree to
  1067. * array of dbas (record_addr_word [num_addrs_for_word])
  1068. * with correct statistical weighting, still in 'host' byte swap order.
  1069. * This function seeks word key in database. If word is new,
  1070. * it calls functions to write new vista rec and d99 data.
  1071. * If word is old it calls functions to read word rec and update d99.
  1072. */
  1073. void fill_data1 (char *node_word)
  1074. {
  1075. char miker[1024];
  1076. strcpy (miker, node_word);
  1077. count_word_ii++;
  1078. if (shutdown_now) {
  1079. printf (CATGETS(dtsearch_catd, MS_cborodin, 164,
  1080. "\n%s Abort due to signal %d. Database %s\n"
  1081. " probably corrupted. Restore backup database.\n"),
  1082. PROGNAME"164", shutdown_now, dicname);
  1083. DtSearchExit (10);
  1084. }
  1085. /* print occasional progress dots and msgs */
  1086. if (!(count_word_ii % words_per_dot)) {
  1087. putchar ('.');
  1088. dotcount++;
  1089. if (!(dotcount % 10))
  1090. putchar (' ');
  1091. if (dotcount >= 50) {
  1092. dotcount = 0;
  1093. seconds_left = (unsigned long)
  1094. (((float) num_of_diff_words /
  1095. (float) count_word_ii - 1.) *
  1096. (float) (time (NULL) - timestart));
  1097. printf (CATGETS(dtsearch_catd, MS_cborodin, 849,
  1098. "\n%s: Word #%ld, %.0f%% done. Est %lum %02lus "
  1099. "to completion.\n"),
  1100. aa_argv0, count_word_ii,
  1101. (float) count_word_ii / (float) num_of_diff_words * 100.0,
  1102. /***(count_word_ii * 100L) / num_of_diff_words,***/
  1103. seconds_left / 60L, seconds_left % 60L);
  1104. }
  1105. else
  1106. fflush (stdout);
  1107. } /* endif for progress dots and msgs */
  1108. strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD);
  1109. got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0;
  1110. find_keyword (miker, 0); /* vista KEYFIND for word rec */
  1111. if (db_status == S_NOTFOUND) { /* this is a NEW word */
  1112. got_word.or_hwoffset = 0;
  1113. got_word.or_hwfree = 0;
  1114. got_word.or_hwaddrs = 0;
  1115. fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */
  1116. if (db_status != S_OKAY)
  1117. vista_abort (PROGNAME"981");
  1118. write_new_word_2_dtbs(); /* write NEW word's d99 entries
  1119. * and update vista word rec */
  1120. return;
  1121. }
  1122. /* update previously existing word */
  1123. read_wordstr (&got_word, 0); /* read OLD word rec into got_word */
  1124. if (db_status == S_OKAY)
  1125. write_2_dtbs_addr_file(); /* update OLD word's d99 entries
  1126. * and update vista word rec */
  1127. return;
  1128. } /* fill_data1() */
  1129. /************************************************/
  1130. /* */
  1131. /* load_into_bintree */
  1132. /* */
  1133. /************************************************/
  1134. /* Pass 1 function.
  1135. * Loads parsed word token or stem token into
  1136. * inverted index binary tree along with passed dba.
  1137. * Token is allowed to be empty, ie first byte is \0.
  1138. * Derived from Efim's original 'teskey_parse()'
  1139. * and bin_tree() functions.
  1140. * Variables static for speeeeeeed.
  1141. */
  1142. static void load_into_bintree (
  1143. char *parser_token,
  1144. int token_is_stem,
  1145. DB_ADDR dba)
  1146. {
  1147. static DtSrINT16 or_maxwordsz;
  1148. static char *cptr;
  1149. static int i;
  1150. static TREENODE **this_link;
  1151. static TREENODE *newnode;
  1152. static DBALIST *newdba;
  1153. static char *tokbuf = NULL;
  1154. if (*parser_token == 0) {
  1155. if (debugging & DEBUG_I)
  1156. printf (" bintr=<empty> dba=%ld\n", (long)dba);
  1157. return;
  1158. }
  1159. /* Copy token to a buffer.
  1160. * Stems have a special prefix character
  1161. * to distinguish them from words.
  1162. * Also increment total dba word count.
  1163. */
  1164. if (tokbuf == NULL) {
  1165. or_maxwordsz = dblk.dbrec.or_maxwordsz;
  1166. tokbuf = austext_malloc ((size_t) or_maxwordsz + 4,
  1167. PROGNAME"1152", NULL);
  1168. }
  1169. if (token_is_stem) {
  1170. tokbuf[0] = STEM_CH;
  1171. strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz);
  1172. dbas_word_count[dba]++;
  1173. }
  1174. else
  1175. strncpy (tokbuf, parser_token, (size_t)or_maxwordsz);
  1176. tokbuf [or_maxwordsz] = 0;
  1177. if (debugging & DEBUG_I)
  1178. printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba);
  1179. /* TREE TRAVERSAL. Search binary tree to find either
  1180. * insertion point or identical preexisting token.
  1181. */
  1182. for (this_link = &root_node; *this_link != NULL; ) {
  1183. i = strcmp (tokbuf, (*this_link)->word);
  1184. /* If identical word/stem token already exists... */
  1185. if (i == 0) {
  1186. /* If token appears more than once in current
  1187. * document (dba already exists at top of dba list),
  1188. * just increment the word count in the list.
  1189. */
  1190. if ((*this_link)->dba_list->dba == dba)
  1191. (*this_link)->dba_list->w_c++;
  1192. /* If this is first appearance of token for this doc
  1193. * (dba is not at start of token's dba list),
  1194. * insert dba at start of token's dba list.
  1195. */
  1196. else {
  1197. if ((newdba = malloc (sizeof(DBALIST))) == NULL) {
  1198. printf (CATGETS(dtsearch_catd, MS_cborodin, 374,
  1199. msg_374), PROGNAME"1150");
  1200. DtSearchExit (26);
  1201. }
  1202. newdba->dba = dba;
  1203. newdba->w_c = 1;
  1204. newdba->next_dba = (*this_link)->dba_list;
  1205. (*this_link)->dba_list = newdba;
  1206. }
  1207. if (debugging & DEBUG_I)
  1208. printf (" Old %ld=%ld\n",
  1209. (long)((*this_link)->dba_list->dba),
  1210. (long)((*this_link)->dba_list->w_c));
  1211. return; /* done with token */
  1212. } /* endif where token was found in binary tree */
  1213. /* Increment link ptr by descending to correct subtree */
  1214. if (i < 0) {
  1215. this_link = &(*this_link)->llink;
  1216. if (debugging & DEBUG_I)
  1217. putchar ('L');
  1218. }
  1219. else {
  1220. this_link = &(*this_link)->rlink;
  1221. if (debugging & DEBUG_I)
  1222. putchar ('R');
  1223. }
  1224. } /* end tree traversal */
  1225. /* Tree traversal never found a preexisting token node.
  1226. * Create a new node and insert it at the point
  1227. * indicated by link ptr.
  1228. */
  1229. newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4,
  1230. PROGNAME"1234", NULL);
  1231. newnode->llink = NULL;
  1232. newnode->rlink = NULL;
  1233. newnode->word = (char *) (newnode + 1); /* use mem at end of node */
  1234. strcpy (newnode->word, tokbuf);
  1235. newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL);
  1236. newnode->dba_list = newdba;
  1237. newdba->dba = dba;
  1238. newdba->w_c = 1;
  1239. newdba->next_dba = NULL;
  1240. *this_link = newnode;
  1241. num_of_diff_words++;
  1242. if (debugging & DEBUG_I)
  1243. printf (" New %ld=%ld\n",
  1244. (long)((*this_link)->dba_list->dba),
  1245. (long)((*this_link)->dba_list->w_c));
  1246. return;
  1247. } /* load_into_bintree() */
  1248. /**********************************************/
  1249. /* */
  1250. /* MAIN */
  1251. /* */
  1252. /**********************************************/
  1253. int
  1254. main (int argc, char **argv)
  1255. {
  1256. int i;
  1257. long word_offset; /* <-- PARG.offsetp */
  1258. long bytes_in; /* ftell() */
  1259. DtSrINT32 dba_offset;
  1260. int got_ETX;
  1261. char *cptr, *src;
  1262. char temp_buf[40];
  1263. char db_key [DtSrMAX_DB_KEYSIZE + 2];
  1264. int oops = FALSE;
  1265. DtSrINT32 cur_byte;
  1266. struct tm *tmptr;
  1267. DB_ADDR dba, temp_dba;
  1268. time_t elapsed;
  1269. size_t mallocsz;
  1270. char *parsebufp, *stembufp;
  1271. /******************* INITIALIZE ******************/
  1272. setlocale (LC_ALL, "");
  1273. dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);
  1274. aa_argv0 = strdup (argv[0]);
  1275. time (&elapsed);
  1276. tmptr = localtime (&elapsed);
  1277. strftime (buf, sizeof(buf),
  1278. CATGETS(dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
  1279. tmptr);
  1280. printf (CATGETS(dtsearch_catd, MS_cborodin, 1, "%s. Run %s.\n"),
  1281. aa_argv0, buf);
  1282. austext_exit_last = print_exit_code;
  1283. batch_size = BATCH_SIZE;
  1284. init_user_interrupt ();
  1285. default_hashsize = duprec_hashsize;
  1286. memset (&dblk, 0, sizeof(DBLK));
  1287. memset (&parg, 0, sizeof(PARG));
  1288. parg.dblk = &dblk;
  1289. parg.etxdelim = ETXDELIM; /* default, can be changed */
  1290. parg.offsetp = &word_offset;
  1291. parg.flags |= PA_INDEXING; /* do compounding, if parser can */
  1292. /* Read user specified command line arguments */
  1293. user_args_processor (argc, argv);
  1294. /* Finish init now that we know final values */
  1295. inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL);
  1296. temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL);
  1297. sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16,
  1298. PROGNAME"1286", NULL);
  1299. record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16,
  1300. PROGNAME "1133", NULL);
  1301. /* Save dicname and path in dblk. Save full name of d99 file. */
  1302. strcpy (dblk.name, dicname);
  1303. dblk.path = dicpath;
  1304. strcpy (dtbs_addr_file, dicpath);
  1305. strcat (dtbs_addr_file, dicname);
  1306. strcat (dtbs_addr_file, EXT_DTBS);
  1307. /* Open the database */
  1308. if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) {
  1309. fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
  1310. DtSearchExit (3);
  1311. }
  1312. memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC));
  1313. /* Load database's parser, stemmer, and linguistic files into dblk. */
  1314. if (!load_language (&dblk, NULL)) {
  1315. puts (DtSearchGetMessages());
  1316. printf (CATGETS(dtsearch_catd, MS_cborodin, 1097,
  1317. "%s Aborting due to errors in loading language files.\n"),
  1318. PROGNAME"1097");
  1319. DtSearchExit(3);
  1320. }
  1321. RECFRST (PROGNAME "1067", OR_OBJREC, 0);
  1322. CRGET (PROGNAME "1069", &dba, 0); /* byte swap already done in vista */
  1323. or_reccount = dbrec.or_reccount; /* DtSrINT32 */
  1324. or_recslots = dbrec.or_recslots; /* promoted to DtSrINT32 */
  1325. or_maxdba = dbrec.or_maxdba; /* DtSrINT32 lim of dbas_word_count */
  1326. bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */
  1327. dba_offset = or_recslots - (dba & 0x00FFFFFF); /* DtSrINT32 */
  1328. if (debugging)
  1329. printf (PROGNAME"1286 "
  1330. "realnumrec=%ld recslots=%ld bitvecsz=%ld"
  1331. " dbaoffset=%d maxdba=%ld\n",
  1332. (long)or_reccount, (long)or_recslots, (long)bit_vector_size,
  1333. (int)dba_offset, (long)or_maxdba);
  1334. /* Allocate memory space for the arrays.
  1335. * dbas_bits_batch = 'bit vector', one bit for every possible rec#.
  1336. * the 1 bits = only the dba's that are in this fzk batch.
  1337. * word_addrs_ii = fread buffer for d99 file.
  1338. * dbas_word_count = summing bkts for word count statistics.
  1339. */
  1340. dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48,
  1341. PROGNAME "1150", NULL);
  1342. word_addrs_ii = (DB_ADDR *) austext_malloc (
  1343. sizeof (DB_ADDR) * (or_reccount + 1) + 48,
  1344. PROGNAME "1152", NULL);
  1345. mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48;
  1346. dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz,
  1347. PROGNAME "1154", NULL);
  1348. memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48);
  1349. memset (dbas_word_count, 0, mallocsz);
  1350. root_node = NULL;
  1351. /* Open the d99 file that contains database addresses.
  1352. * If the file doesn't exist, it means the database
  1353. * for keyword search is empty - open a new file.
  1354. */
  1355. if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) {
  1356. dtbs_addr_fp = fopen (dtbs_addr_file, "w+b");
  1357. check_existing_addrs = FALSE;
  1358. new_dtbs_file = TRUE;
  1359. if (dtbs_addr_fp == NULL) {
  1360. /* msg 1068 used multiple places */
  1361. printf (CATGETS(dtsearch_catd, MS_cborodin, 1068,
  1362. "%s Can't open new inverted index file '%s': %s\n"),
  1363. PROGNAME"1068", dtbs_addr_file, strerror(errno));
  1364. DtSearchExit (13);
  1365. }
  1366. /* write New Header Information to a file */
  1367. init_header (dtbs_addr_fp, &fl_hdr);
  1368. }
  1369. else {
  1370. /* read Header Information from d99 file */
  1371. if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) {
  1372. /* msg 1068 used multiple places */
  1373. printf (CATGETS(dtsearch_catd, MS_cborodin, 1068,
  1374. "%s Can't read header data for '%s': %s\n"),
  1375. PROGNAME"1422", dtbs_addr_file, strerror(errno));
  1376. DtSearchExit (13);
  1377. }
  1378. }
  1379. /* open input .fzk file */
  1380. src = getcwd (sprintbuffer, _POSIX_PATH_MAX);
  1381. if (!src && debugging)
  1382. printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno));
  1383. if (!src)
  1384. src = getenv ("PWD");
  1385. printf (CATGETS(dtsearch_catd, MS_misc, 24,
  1386. "%s: current working directory = '%s', .fzk file = '%s'\n"),
  1387. aa_argv0,
  1388. (src) ? src : CATGETS(dtsearch_catd, MS_misc, 6, "<unknown>"),
  1389. fname_input);
  1390. if ((instream = fopen (fname_input, "rt")) == NULL) {
  1391. BAD_INPUT_FILE:
  1392. printf (CATGETS(dtsearch_catd, MS_cborodin, 1083,
  1393. "%s Can't read input file '%s': %s\n"),
  1394. PROGNAME"1083", fname_input, strerror(errno));
  1395. DtSearchExit (14);
  1396. }
  1397. if (fstat (fileno (instream), &fstat_input) == -1)
  1398. goto BAD_INPUT_FILE;
  1399. parg.ftext = instream; /* for readchar_ftext(), discard_to_ETX() */
  1400. time (&totalstart); /* for total elapsed time */
  1401. timestart = totalstart; /* for Pass 1 elapsed time */
  1402. /*------------ PASS 1: ------------
  1403. * Main Read Loop. For each text record in input file,
  1404. * parse and stem words, store them into binary tree
  1405. * inverted index in memory.
  1406. * The first few lines are database administrative values.
  1407. * They are presumed ascii and read with fgets() as
  1408. * 'lines' terminated with \n. The text of the document
  1409. * itself is presumed to be in the appropriate database
  1410. * 'language', so it is *not* presumed to be lines
  1411. * terminated with \n. The document text is read by
  1412. * the language's parser() a 'word' at a time, which
  1413. * ultimately means a byte at a time.
  1414. */
  1415. printf (CATGETS(dtsearch_catd, MS_cborodin, 1108,
  1416. "%s: Beginning Pass 1, reading records from '%s'.\n"
  1417. " Each dot = %d records.\n"),
  1418. aa_argv0, fname_input, recs_per_dot);
  1419. dotcount = 0;
  1420. while (!feof(instream)) {
  1421. /* 1. Read and discard the FZKEY line.
  1422. * 2. Read and discard the ABSTRACT line.
  1423. * 3. Read the UNIQUE KEY for the record.
  1424. * Do some record initialization steps here.
  1425. * 4. Read and discard the DATE line.
  1426. * 5. Let the parser read and parse rest of record, ie doc text...
  1427. */
  1428. /*----- READ LINE #1, fzkey -----*/
  1429. if (fgets (inbuf, inbufsz, instream) == NULL)
  1430. break;
  1431. inbuf [inbufsz] = 0; /* just to be sure */
  1432. if (shutdown_now) {
  1433. printf (CATGETS(dtsearch_catd, MS_cborodin, 164,
  1434. "\n%s: %s Abort due to signal %d. Database %s\n"
  1435. " possibly corrupted. Restore backup database.\n"),
  1436. aa_argv0, PROGNAME"1299", shutdown_now, dicname);
  1437. DtSearchExit (11);
  1438. }
  1439. /* Silently skip null records just like dtsrload */
  1440. if (strcmp (inbuf, parg.etxdelim) == 0)
  1441. continue;
  1442. record_count++;
  1443. /*----- READ LINE #2, abstract -----*/
  1444. if (fgets (inbuf, inbufsz, instream) == NULL) {
  1445. INVALID_FZK_FORMAT:
  1446. printf (CATGETS(dtsearch_catd, MS_cborodin, 1129,
  1447. "%s: %s Invalid .fzk file format.\n"),
  1448. fname_input, PROGNAME"1129");
  1449. DtSearchExit (22);
  1450. }
  1451. inbuf[inbufsz] = 0; /* just to be sure */
  1452. /*--- READ LINE #3, unique database key ---*/
  1453. if (fgets (inbuf, inbufsz, instream) == NULL)
  1454. goto INVALID_FZK_FORMAT;
  1455. inbuf[inbufsz] = 0; /* just to be sure */
  1456. if ((cptr = strtok (inbuf, " \t\n")) == NULL)
  1457. goto INVALID_FZK_FORMAT;
  1458. /* If necessary, discard long keys exactly like cravel */
  1459. if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) {
  1460. printf (CATGETS(dtsearch_catd, MS_cborodin, 659,
  1461. "\n%s: %s Discarding record, key too long:\n '%s'.\n"),
  1462. aa_argv0, PROGNAME"659", cptr);
  1463. discard_to_ETX (&parg);
  1464. continue;
  1465. }
  1466. strcpy (db_key, cptr);
  1467. /* Skip duplicate record ids in same order as dtsrload */
  1468. i = is_duprec (db_key);
  1469. if (i == 2) { /* out of memory */
  1470. printf (CATGETS(dtsearch_catd, MS_cborodin, 374, msg_374),
  1471. PROGNAME"1317");
  1472. DtSearchExit (57);
  1473. }
  1474. else if (i == 1) { /* duplicate record id */
  1475. duplicate_recids++;
  1476. if (dotcount > 0)
  1477. putchar ('\n');
  1478. printf (CATGETS(dtsearch_catd, MS_cborodin, 1402,
  1479. "%s: Discarded duplicate rec #%lu '%s'.\n"),
  1480. aa_argv0, record_count, db_key);
  1481. discard_to_ETX (&parg);
  1482. continue;
  1483. }
  1484. /****** FFFFFFFFFFFFFFFFFFFFF **********/
  1485. /* Convert database address (slot #) to 'record number',
  1486. * what dba would have been if all records took up
  1487. * only one slot and there were no dbrec at top of file.
  1488. * Record numbers on d99, like dba's, start at #1,
  1489. * but rec numbers here (in bit vector) start at #0.
  1490. */
  1491. KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0);
  1492. if (db_status != S_OKAY) {
  1493. normal_retncode = 1; /* = 'warning' */
  1494. if (dotcount > 0)
  1495. putchar ('\n');
  1496. printf (CATGETS(dtsearch_catd, MS_cborodin, 1168,
  1497. "%s: %s Discarded '%s', key not in database.\n"),
  1498. aa_argv0, PROGNAME"1168", displayable(db_key));
  1499. discard_to_ETX (&parg);
  1500. continue;
  1501. }
  1502. CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */
  1503. temp_dba &= 0x00FFFFFF; /* = slot# */
  1504. dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */
  1505. /*
  1506. * Don't change this 'dba'!--eventually it goes
  1507. * into d99 in this exact format! It will also
  1508. * be used as an index into dbas_word_count[] in
  1509. * load_into_bintree() so do a sanity check
  1510. * to make sure that it hasn't exceeded the size
  1511. * of that array. (The count increments have been
  1512. * reported as 'uninitialized memory reads'
  1513. * by a debugger). This might happen for example
  1514. * if user failed to run dtsrload before dtsrindex?
  1515. */
  1516. if (dba < 1 || dba > or_maxdba) {
  1517. printf ( CATGETS(dtsearch_catd, MS_cborodin, 21,
  1518. "\n%s '%s' record overflows word counter array.\n"
  1519. "Record number %ld > maxdba %ld, dba=%ld, "
  1520. "recslots=%ld, offs=%d.\n") ,
  1521. PROGNAME"1526", displayable(db_key),
  1522. (long)dba, (long)or_maxdba, (long)temp_dba,
  1523. (long)or_recslots, (int)dba_offset);
  1524. DtSearchExit (68);
  1525. }
  1526. temp_dba = dba - 1; /* = rec# starting at 0 */
  1527. cur_byte = temp_dba >> 3; /* bits to bytes: div by 8 */
  1528. if (cur_byte >= bit_vector_size) {
  1529. printf ( CATGETS(dtsearch_catd, MS_cborodin, 22,
  1530. "\n%s '%s' record in database (dba=%ld)\n"
  1531. " overflows bitvector allocation (%ld >= %ld).\n") ,
  1532. PROGNAME"1475", displayable(db_key), (long)dba,
  1533. (long)cur_byte, (long)bit_vector_size);
  1534. DtSearchExit (69);
  1535. }
  1536. dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8);
  1537. /* Print occasional progress dots and msgs */
  1538. if (!(record_count % recs_per_dot)) {
  1539. putchar ('.');
  1540. dotcount++;
  1541. if (!(dotcount % 10))
  1542. putchar (' ');
  1543. if (dotcount >= 50) {
  1544. dotcount = 0;
  1545. bytes_in = ftell (instream);
  1546. seconds_left = (unsigned long)
  1547. (((float) fstat_input.st_size /
  1548. (float) bytes_in - 1.) *
  1549. (float) (time (NULL) - timestart));
  1550. printf (CATGETS(dtsearch_catd, MS_cborodin, 1190,
  1551. "\n%s: Rec #%lu, %.0f%% done. "
  1552. "Est %lum %02lus to end Pass 1.\n"),
  1553. aa_argv0,
  1554. record_count,
  1555. (float) bytes_in / (float) fstat_input.st_size * 100.0,
  1556. seconds_left / 60UL,
  1557. seconds_left % 60UL);
  1558. }
  1559. fflush (stdout);
  1560. }
  1561. /*----- READ LINE #4, date -----*/
  1562. if (fgets (inbuf, inbufsz, instream) == NULL)
  1563. goto INVALID_FZK_FORMAT;
  1564. inbuf[inbufsz] = 0; /* just to be sure */
  1565. /* PARSE LOOP FOR CURRENT TEXT BLOCK.
  1566. * We must be in the middle of a record ('lines' #5 and beyond).
  1567. * From here to ETX, which is either the record delimiter string
  1568. * or the end of file, read the file a 'word' at a time
  1569. * using the parse() function for the language specified
  1570. * for the database.
  1571. * Load_into_bintree() stores each token into
  1572. * inverted index binary tree.
  1573. * Note: dba here MUST still be rec#, base 1.
  1574. * It's stored as is by load_into_bintree(),
  1575. * and will be moved as is into d99 file in Pass 2.
  1576. */
  1577. if (debugging & DEBUG_P)
  1578. printf ("\nRecord #%lu '%s'\n"
  1579. "Offset Word---- Stem----\n",
  1580. record_count, db_key);
  1581. for ( cptr = dblk.parser (&parg);
  1582. cptr;
  1583. cptr = dblk.parser (NULL)) {
  1584. if (debugging & DEBUG_P) {
  1585. printf ("%6ld %s %n", word_offset, cptr, &i);
  1586. if (!(debugging & DEBUG_I))
  1587. while (i++ < 30)
  1588. putchar (' ');
  1589. }
  1590. load_into_bintree (cptr, FALSE, dba);
  1591. cptr = dblk.stemmer (cptr, &dblk);
  1592. if (debugging & DEBUG_P) {
  1593. printf ("%s\n", cptr);
  1594. fflush (stdout);
  1595. }
  1596. load_into_bintree (cptr, TRUE, dba);
  1597. }
  1598. } /* end of PASS 1 Main read loop */
  1599. elapsed = time(NULL) - timestart;
  1600. if (dotcount > 0) {
  1601. putchar ('\n');
  1602. dotcount = 0;
  1603. }
  1604. if (duplicate_recids > 0L) {
  1605. normal_retncode = 1; /* 'warning' */
  1606. sprintf (buf, CATGETS(dtsearch_catd, MS_cborodin, 40,
  1607. "Ignored %ld duplicate records"),
  1608. duplicate_recids);
  1609. }
  1610. else
  1611. strcpy (buf, CATGETS(dtsearch_catd, MS_cborodin, 41,
  1612. "No duplicate records found"));
  1613. printf (CATGETS(dtsearch_catd, MS_cborodin, 1225,
  1614. "%s: Pass 1 completed in %lum %lus, read %lu records.\n"
  1615. " %s, parsed %lu words.\n"),
  1616. aa_argv0, elapsed / 60L, elapsed % 60L, record_count,
  1617. buf, num_of_diff_words);
  1618. if (record_count > batch_size) {
  1619. printf (CATGETS(dtsearch_catd, MS_cborodin, 33,
  1620. "\n%s Number of incoming records exceeded %d.\n"
  1621. " This will usually result in 'Out of Paging Space' "
  1622. "error in Pass 2\n"
  1623. " and corruption of database. Either split the incoming file to\n"
  1624. " reduce record count or use the -b option, and rerun.\n"),
  1625. PROGNAME"33", (int)batch_size);
  1626. DtSearchExit (33);
  1627. }
  1628. /*----------------- PASS 2: -----------------
  1629. * Traverse completed binary tree and write it to d99 file.
  1630. */
  1631. printf (CATGETS(dtsearch_catd, MS_cborodin, 1233,
  1632. "%s: Beginning Pass 2: batch index traversal and database update.\n"
  1633. " Each dot = %d words.\n"),
  1634. aa_argv0, words_per_dot);
  1635. dotcount = 0;
  1636. time (&timestart);
  1637. traverse_tree (); /* actual Pass 2 */
  1638. if (dotcount) {
  1639. putchar ('\n');
  1640. dotcount = 0;
  1641. }
  1642. /* Write header information to the d99 file */
  1643. if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) {
  1644. printf (CATGETS(dtsearch_catd, MS_cborodin, 776, msg_776),
  1645. PROGNAME"1723", strerror(errno));
  1646. DtSearchExit (13);
  1647. }
  1648. d_close ();
  1649. fclose (dtbs_addr_fp);
  1650. elapsed = time (NULL) - timestart;
  1651. printf (CATGETS(dtsearch_catd, MS_cborodin, 1246,
  1652. "%s: Pass 2 completed in %lum %lus, updated %lu words.\n"),
  1653. aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii);
  1654. if (normal_retncode == 1)
  1655. printf (CATGETS(dtsearch_catd, MS_cborodin, 2,
  1656. "%s: Warnings were detected.\n"), aa_argv0);
  1657. DtSearchExit (normal_retncode);
  1658. } /* main() */
  1659. /*************************** DTSRINDEX.C ****************************/