dtsrkdump.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /* $XConsortium: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
  24. *
  25. * (c) Copyright 1996 Digital Equipment Corporation.
  26. * (c) Copyright 1996 Hewlett-Packard Company.
  27. * (c) Copyright 1996 International Business Machines Corp.
  28. * (c) Copyright 1996 Sun Microsystems, Inc.
  29. * (c) Copyright 1996 Novell, Inc.
  30. * (c) Copyright 1996 FUJITSU LIMITED.
  31. * (c) Copyright 1996 Hitachi.
  32. */
  33. /*
  34. * COMPONENT_NAME: austext
  35. *
  36. * FUNCTIONS: count_words
  37. * main
  38. *
  39. * ORIGINS: 27
  40. *
  41. *
  42. * (C) COPYRIGHT International Business Machines Corp. 1994,1996
  43. * All Rights Reserved
  44. * Licensed Materials - Property of IBM
  45. * US Government Users Restricted Rights - Use, duplication or
  46. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  47. */
  48. /*********************** DTSRKDUMP.C *************************
  49. * $Id: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
  50. * April 1994.
  51. * Dumps a DtSearch/AusText keyfile to stdout.
  52. * Renamed from auskdump for DtSearch.
  53. *
  54. * $Log$
  55. * Revision 2.3 1996/04/10 21:19:28 miker
  56. * Program renamed from auskdump with minor cleanup.
  57. *
  58. *
  59. * *** Log: auskdump.c,v ***
  60. * Revision 2.2 1995/10/19 20:29:37 miker
  61. * Permit accessing of read-only databases.
  62. * Revision 2.1 1995/09/22 18:55:59 miker
  63. * Freeze DtSearch 0.1, AusText 2.1.8
  64. * Revision 1.11 1995/09/19 21:47:26 miker
  65. * Added explanation of '*' in report.
  66. * Revision 1.10 1995/09/06 14:18:33 miker
  67. * Fixed bug: -p value incorrectly converted to double because
  68. * atof() function prototype was not provided from stdlib.h.
  69. * Revision 1.9 1995/09/01 23:58:57 miker
  70. * Minor name changes for DtSearch.
  71. * Print err msgs when databases fail to open.
  72. * Revision 1.8 1995/05/30 18:40:12 miker
  73. * Print progress dots and some additional dbrec info.
  74. */
#include "SearchP.h"
#include <ctype.h>
#include <fcntl.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vista.h"
  82. #define PROGNAME "DTSRKDUMP"
  83. #define MIN_THRESHOLD 100L
  84. #define KEYS_PER_DOT 1000
  85. #define MS_dtsrkdump 25
  86. /*----------------- GLOBALS -------------------*/
  87. char buf[2048];
  88. static long *counters = NULL; /* allocated array */
  89. static int do_verbose = FALSE;
  90. static DB_ADDR dba;
  91. static long min_threshold = MIN_THRESHOLD;
  92. static long maxdba = 0L;
  93. static struct or_dbrec
  94. dbrec;
  95. /****************************************/
  96. /* */
  97. /* count_words */
  98. /* */
  99. /****************************************/
  100. void count_words (int index)
  101. {
  102. long vista_field = 0;
  103. UCHAR *ptr;
  104. DtSrINT32 offset, free, addrs;
  105. int tabstop;
  106. long keycount = 0;
  107. int dotcount = 0;
  108. if (index == 0)
  109. vista_field = OR_SWORDKEY;
  110. else if (index == 2)
  111. vista_field = OR_LWORDKEY;
  112. else if (index == 4)
  113. vista_field = OR_HWORDKEY;
  114. else {
  115. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 1,
  116. "%s Program Error Abort.\a\n"),
  117. PROGNAME"030");
  118. DtSearchExit (4);
  119. }
  120. KEYFRST (PROGNAME"36", vista_field, 0);
  121. while (db_status == S_OKAY) {
  122. KEYREAD (PROGNAME"48", buf);
  123. if (buf[0] == STEM_CH)
  124. (counters[index])++;
  125. else
  126. (counters[index + 1])++;
  127. if (do_verbose) {
  128. CRGET (PROGNAME"58", &dba, 0);
  129. switch (index) {
  130. case 0:
  131. CRREAD (PROGNAME"66", OR_SWOFFSET, &offset, 0);
  132. CRREAD (PROGNAME"67", OR_SWFREE, &free, 0);
  133. CRREAD (PROGNAME"68", OR_SWADDRS, &addrs, 0);
  134. break;
  135. case 2:
  136. CRREAD (PROGNAME"76", OR_LWOFFSET, &offset, 0);
  137. CRREAD (PROGNAME"77", OR_LWFREE, &free, 0);
  138. CRREAD (PROGNAME"78", OR_LWADDRS, &addrs, 0);
  139. break;
  140. case 4:
  141. CRREAD (PROGNAME"86", OR_HWOFFSET, &offset, 0);
  142. CRREAD (PROGNAME"87", OR_HWFREE, &free, 0);
  143. CRREAD (PROGNAME"88", OR_HWADDRS, &addrs, 0);
  144. break;
  145. }
  146. NTOHL (offset);
  147. NTOHL (free);
  148. NTOHL (addrs);
  149. if (addrs >= min_threshold) {
  150. printf (" \"");
  151. tabstop = 0;
  152. for (ptr = (UCHAR *) buf; *ptr != 0; ptr++) {
  153. putchar ((*ptr >= 32) ? *ptr : '~');
  154. tabstop++;
  155. }
  156. printf ("\" ");
  157. while (tabstop++ < 22)
  158. putchar (' ');
  159. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 2,
  160. "%c dba=%d:%-7ld ofs=%-9ld adr=%-6ld fre=%ld\n"),
  161. (addrs >= dbrec.or_reccount) ? '*' : ' ',
  162. dba >> 24, dba & 0xffffff, offset, addrs, free);
  163. }
  164. } /* end verbose */
  165. else { /* !verbose */
  166. if (++keycount % KEYS_PER_DOT == 0) {
  167. putchar ('.');
  168. if (++dotcount % 10 == 0)
  169. putchar (' ');
  170. if (dotcount % 50 == 0) {
  171. putchar ('\n');
  172. dotcount = 0;
  173. }
  174. fflush (stdout);
  175. }
  176. } /* end !verbose dot printing */
  177. KEYNEXT (PROGNAME"98", vista_field, 0);
  178. } /* end object key read loop */
  179. if (dotcount)
  180. putchar ('\n');
  181. return;
  182. } /* count_words() */
  183. /****************************************/
  184. /* */
  185. /* main */
  186. /* */
  187. /****************************************/
  188. int main (int argc, char *argv[])
  189. {
  190. int i;
  191. int oops;
  192. int dotcount;
  193. long keycount;
  194. long total;
  195. char *ptr;
  196. int do_objkeys = FALSE;
  197. int do_wordkeys = FALSE;
  198. char dbpath[2048];
  199. char rcs_revision [8];
  200. char dbname[12];
  201. time_t now;
  202. double percent = 0.0;
  203. int listing_most_words = FALSE;
  204. static char *word_labels[6] =
  205. {
  206. "Short Stems = %8ld\n", "Short Words = %8ld\n",
  207. "Long Stems = %8ld\n", "Long Words = %8ld\n",
  208. "Huge Stems = %8ld\n", "Huge Words = %8ld\n"
  209. };
  210. aa_argv0 = argv[0];
  211. time (&now);
  212. sscanf ("$Revision: /main/3 $", "%*s %s", rcs_revision);
  213. setlocale (LC_ALL, "");
  214. dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);
  215. strftime (buf, sizeof (buf), "%m/%d/%Y, %I:%M %p",
  216. localtime (&now));
  217. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 3,
  218. "%s %s, engine %s. %s.\n"),
  219. aa_argv0, rcs_revision, AUSAPI_VERSION, buf);
  220. if (argc <= 1) {
  221. PRINT_USAGE:
  222. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 4,
  223. "\nUSAGE: %s -o|w|ow [-v] [-t<N> | -p<N>] dbname\n"
  224. " Reads DtSearch key files and prints summary report.\n"
  225. " -o Keys examined are OBJECT record keys.\n"
  226. " -w Keys examined are inverted index WORDS.\n"
  227. " -v VERBOSE mode, lists every key.\n"
  228. " -t<N> Threshold. Sets w and v options, and lists only words\n"
  229. " with >= <N> addresses. All words will be listed if <N> = 1.\n"
  230. " -p<N> Another threshold. Same as -t except <N> is percent\n"
  231. " of the entire database (<N> may include a decimal point).\n"
  232. " For example -p99.9 prints out every word that occurs\n"
  233. " in 99.9%% or more of the records--an excellent way to find\n"
  234. " candidates for the stop list.\n"
  235. " If w and v are set without threshold, default is -t%d.\n"
  236. " <dbname> 1 - 8 character database name with optional path prefix.\n")
  237. ,aa_argv0
  238. ,MIN_THRESHOLD
  239. );
  240. DtSearchExit (2);
  241. }
  242. /* parse options */
  243. else { /* argc >= 2 */
  244. for (;;) {
  245. /* each pass grabs new token with "-xxx" format */
  246. --argc;
  247. ++argv;
  248. if (argc <= 0)
  249. break; /* no more tokens of any kind */
  250. ptr = argv[0];
  251. if (*ptr != '-')
  252. break; /* no more option tokens */
  253. /* examine each char in this -xxx token */
  254. while (*(++ptr) != 0) {
  255. switch (*ptr) {
  256. case 'o':
  257. do_objkeys = TRUE;
  258. break;
  259. case 'w':
  260. do_wordkeys = TRUE;
  261. break;
  262. case 'v':
  263. do_verbose = TRUE;
  264. break;
  265. case 'p':
  266. do_verbose = TRUE;
  267. do_wordkeys = TRUE;
  268. percent = atof (ptr + 1);
  269. if (percent <= 0.0 || percent > 100.0) {
  270. fprintf (stderr,
  271. CATGETS(dtsearch_catd, MS_dtsrkdump, 5,
  272. "%s Invalid percent value %lf.\a\n"),
  273. PROGNAME"195", percent);
  274. goto PRINT_USAGE;
  275. }
  276. ptr[1] = 0; /* terminate parse */
  277. break;
  278. case 't':
  279. do_verbose = TRUE;
  280. do_wordkeys = TRUE;
  281. if ((min_threshold = atol (ptr + 1)) <= 0L) {
  282. fprintf (stderr,
  283. CATGETS(dtsearch_catd, MS_dtsrkdump, 53,
  284. "%s Invalid threshold value.\a\n"),
  285. PROGNAME"198");
  286. goto PRINT_USAGE;
  287. }
  288. ptr[1] = 0; /* terminate parse */
  289. break;
  290. default:
  291. fprintf (stderr,
  292. CATGETS(dtsearch_catd, MS_dtsrkdump, 55,
  293. "%s Unknown command line argument '%c'.\a\n"),
  294. PROGNAME"278", *ptr);
  295. goto PRINT_USAGE;
  296. } /* end switch */
  297. } /* end while-loop for each char of -xxx token */
  298. } /* end for-loop for each -xxx token */
  299. } /* end of options parse altogether */
  300. oops = FALSE;
  301. if (argc <= 0) {
  302. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 56,
  303. "%s Missing required database name.\a\n"),
  304. PROGNAME"267");
  305. oops = TRUE;
  306. }
  307. if (!do_wordkeys && !do_objkeys) {
  308. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 57,
  309. "%s Either -o or -w must be specified.\a\n"),
  310. PROGNAME"271");
  311. oops = TRUE;
  312. }
  313. if (oops)
  314. goto PRINT_USAGE;
  315. /* Database name may have a long path prefix.
  316. * If so, we need to segregate the two.
  317. * Set 'ptr' to just the 8 char dictionary name by moving
  318. * it backwards until first non-alphanumeric character
  319. * (such as a ":" in the dos drive id or a slash between directories),
  320. * or to the beginning of string.
  321. */
  322. strncpy (dbpath, argv[0], sizeof (dbpath));
  323. dbpath[sizeof (dbpath) - 1] = 0;
  324. for (ptr = dbpath + strlen (dbpath) - 1; ptr >= dbpath; ptr--)
  325. if (!isalnum (*ptr)) {
  326. ptr++;
  327. break;
  328. }
  329. if (ptr < dbpath)
  330. ptr = dbpath;
  331. /* test for valid database name */
  332. i = strlen (ptr);
  333. if (i < 1 || i > 8) {
  334. fprintf (stderr, CATGETS(dtsearch_catd, MS_dtsrkdump, 58,
  335. "%s Invalid database name '%s'.\a\n"),
  336. PROGNAME"297", ptr);
  337. goto PRINT_USAGE;
  338. }
  339. strcpy (dbname, ptr);
  340. *ptr = 0; /* truncate dbname off of full path/dbname */
  341. /* Open database in read-only mode. */
  342. db_oflag = O_RDONLY;
  343. if (!austext_dopen (dbname, dbpath, NULL, 0, &dbrec)) {
  344. fprintf (stderr, "%s\n", DtSearchGetMessages());
  345. DtSearchExit (3);
  346. }
  347. maxdba = dbrec.or_maxdba;
  348. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 60,
  349. "%s: '%s' reccount=%ld maxdba=%ld recslots=%hd minw=%hd maxw=%hd\n"),
  350. aa_argv0, dbname, dbrec.or_reccount,
  351. dbrec.or_maxdba, dbrec.or_recslots,
  352. dbrec.or_minwordsz, dbrec.or_maxwordsz);
  353. /* Adjust threshold if necessary */
  354. if (percent > 0.0)
  355. min_threshold = (long)
  356. ((float) percent * (float) dbrec.or_reccount / 100.0);
  357. if (min_threshold > dbrec.or_reccount)
  358. min_threshold = dbrec.or_reccount;
  359. if (do_wordkeys && do_verbose) {
  360. if (min_threshold > 1 && min_threshold < dbrec.or_reccount) {
  361. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 70,
  362. "%s Will only list words occurring "
  363. "in %ld or more records.\n"),
  364. aa_argv0, min_threshold);
  365. listing_most_words =
  366. (float) min_threshold / (float) dbrec.or_reccount > .90;
  367. }
  368. else {
  369. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 80,
  370. "%s: Listing all words in database.\n"),
  371. aa_argv0);
  372. listing_most_words = TRUE;
  373. }
  374. }
  375. if (do_objkeys) {
  376. /*
  377. * Allocate and initialize an array of keytype counters, one for
  378. * each possible ascii keytype char (256).
  379. */
  380. counters = austext_malloc (258 * sizeof(long), PROGNAME"113", NULL);
  381. memset (counters, 0, 258 * sizeof(long));
  382. dotcount = 0;
  383. keycount = 0;
  384. KEYFRST (PROGNAME"111", OR_OBJKEY, 0);
  385. while (db_status == S_OKAY) {
  386. KEYREAD (PROGNAME"288", buf);
  387. (counters[buf[0]])++;
  388. CRGET (PROGNAME"251", &dba, 0);
  389. if (maxdba < (dba & 0xffffff))
  390. maxdba = dba;
  391. if (do_verbose) {
  392. /* Mark control and nonascii chars with a period. */
  393. i = 0;
  394. putchar ('\"');
  395. for (ptr = buf; *ptr != 0; ptr++) {
  396. if (*ptr < 32 | *ptr >= 127) {
  397. putchar ('.');
  398. i++;
  399. }
  400. else {
  401. putchar (*ptr);
  402. i++;
  403. }
  404. }
  405. printf ("\" ");
  406. while (i++ < DtSrMAX_DB_KEYSIZE)
  407. putchar (' ');
  408. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 100,
  409. "dba x%08lx, %6ld\n"), dba, dba);
  410. } /* end verbose */
  411. else { /* !verbose */
  412. if (++keycount % KEYS_PER_DOT == 0) {
  413. putchar ('.');
  414. if (++dotcount % 10 == 0)
  415. putchar (' ');
  416. if (dotcount % 50 == 0) {
  417. putchar ('\n');
  418. dotcount = 0;
  419. }
  420. fflush (stdout);
  421. }
  422. } /* end !verbose dot printing */
  423. KEYNEXT (PROGNAME"291", OR_OBJKEY, 0);
  424. } /* end object key read loop */
  425. /* Print objkey summary report */
  426. if (dotcount)
  427. putchar ('\n');
  428. if (dbpath[0] == 0)
  429. buf[0] = 0;
  430. else
  431. sprintf (buf, CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
  432. " in %s"), dbpath);
  433. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 120,
  434. "Object Summary for '%s'%s:\n"), dbname, buf);
  435. puts (CATGETS(dtsearch_catd, MS_dtsrkdump, 130,
  436. "Object Count by Keytypes:"));
  437. total = 0L;
  438. for (i = 0; i < 256; i++) {
  439. if (counters[i] > 0L) {
  440. total += counters[i];
  441. if (i > 32 && i < 127)
  442. printf (" '%c' %6ld\n", i, counters[i]);
  443. else
  444. printf (" x%02x %6ld\n", i, counters[i]);
  445. }
  446. }
  447. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 160,
  448. "TOTAL Objects Count = %ld\n"), total);
  449. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 170,
  450. "Largest Object DBA = %ld\n"), maxdba);
  451. free (counters);
  452. } /* end do_objkeys */
  453. if (do_wordkeys) {
  454. if (listing_most_words)
  455. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 180,
  456. "%s: * Words marked with asterisk occur in every record.\n"),
  457. aa_argv0);
  458. /*
  459. * Allocate and initialize word and stem counters. First is for
  460. * short stems (those beginning with STEM_CH), next is for short
  461. * words (everything else). Next are for long stems, long words,
  462. * huge stems, and huge words (6 in all).
  463. */
  464. counters = austext_malloc (8 * sizeof (long), PROGNAME"113", NULL);
  465. memset (counters, 0, 6 * sizeof(long));
  466. count_words (0); /* short */
  467. count_words (2); /* long */
  468. count_words (4); /* huge */
  469. /* print wordkey summary report */
  470. if (do_objkeys)
  471. putchar ('\n'); /* separate from last report */
  472. if (dbpath[0] == 0)
  473. buf[0] = 0;
  474. else
  475. sprintf (buf, CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
  476. " in %s"), dbpath);
  477. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 200,
  478. "Words Summary for '%s'%s:\n"), dbname, buf);
  479. total = 0L;
  480. for (i = 0; i < 6; i++) {
  481. printf (word_labels[i], counters[i]);
  482. total += counters[i];
  483. }
  484. printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 210,
  485. "TOTAL Words Count = %ld\n"), total);
  486. free (counters);
  487. } /* end do_wordkeys */
  488. DtSearchExit (0);
  489. } /* main() */
  490. /*********************** DTSRKDUMP.C *************************/