123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539 |
- /*
- * CDE - Common Desktop Environment
- *
- * Copyright (c) 1993-2012, The Open Group. All rights reserved.
- *
- * These libraries and programs are free software; you can
- * redistribute them and/or modify them under the terms of the GNU
- * Lesser General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * These libraries and programs are distributed in the hope that
- * they will be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU Lesser General Public License for more
- * details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with these libraries and programs; if not, write
- * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
- * Floor, Boston, MA 02110-1301 USA
- */
- /* $XConsortium: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
- *
- * (c) Copyright 1996 Digital Equipment Corporation.
- * (c) Copyright 1996 Hewlett-Packard Company.
- * (c) Copyright 1996 International Business Machines Corp.
- * (c) Copyright 1996 Sun Microsystems, Inc.
- * (c) Copyright 1996 Novell, Inc.
- * (c) Copyright 1996 FUJITSU LIMITED.
- * (c) Copyright 1996 Hitachi.
- */
- /*
- * COMPONENT_NAME: austext
- *
- * FUNCTIONS: count_words
- * main
- *
- * ORIGINS: 27
- *
- *
- * (C) COPYRIGHT International Business Machines Corp. 1994,1996
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- */
- /*********************** DTSRKDUMP.C *************************
- * $Id: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
- * April 1994.
- * Dumps a DtSearch/AusText keyfile to stdout.
- * Renamed from auskdump for DtSearch.
- *
- * $Log$
- * Revision 2.3 1996/04/10 21:19:28 miker
- * Program renamed from auskdump with minor cleanup.
- *
- *
- * *** Log: auskdump.c,v ***
- * Revision 2.2 1995/10/19 20:29:37 miker
- * Permit accessing of read-only databases.
- * Revision 2.1 1995/09/22 18:55:59 miker
- * Freeze DtSearch 0.1, AusText 2.1.8
- * Revision 1.11 1995/09/19 21:47:26 miker
- * Added explanation of '*' in report.
- * Revision 1.10 1995/09/06 14:18:33 miker
- * Fixed bug: -p value incorrectly converted to double because
- * atof() function prototype was not provided from stdlib.h.
- * Revision 1.9 1995/09/01 23:58:57 miker
- * Minor name changes for DtSearch.
- * Print err msgs when databases fail to open.
- * Revision 1.8 1995/05/30 18:40:12 miker
- * Print progress dots and some additional dbrec info.
- */
#include "SearchP.h"
#include <ctype.h>
#include <fcntl.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "vista.h"
- #define PROGNAME "DTSRKDUMP"
- #define MIN_THRESHOLD 100L
- #define KEYS_PER_DOT 1000
- #define MS_dtsrkdump 25
- /*----------------- GLOBALS -------------------*/
- char buf[2048];
- static long *counters = NULL; /* allocated array */
- static int do_verbose = FALSE;
- static DB_ADDR dba;
- static long min_threshold = MIN_THRESHOLD;
- static long maxdba = 0L;
- static struct or_dbrec
- dbrec;
- /****************************************/
- /* */
- /* count_words */
- /* */
- /****************************************/
- void count_words (int index)
- {
- long vista_field = 0;
- UCHAR *ptr;
- DtSrINT32 offset, free, addrs;
- int tabstop;
- long keycount = 0;
- int dotcount = 0;
- if (index == 0)
- vista_field = OR_SWORDKEY;
- else if (index == 2)
- vista_field = OR_LWORDKEY;
- else if (index == 4)
- vista_field = OR_HWORDKEY;
- else {
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 1,
- "%s Program Error Abort.\a\n"),
- PROGNAME"030");
- DtSearchExit (4);
- }
- KEYFRST (PROGNAME"36", vista_field, 0);
- while (db_status == S_OKAY) {
- KEYREAD (PROGNAME"48", buf);
- if (buf[0] == STEM_CH)
- (counters[index])++;
- else
- (counters[index + 1])++;
- if (do_verbose) {
- CRGET (PROGNAME"58", &dba, 0);
- switch (index) {
- case 0:
- CRREAD (PROGNAME"66", OR_SWOFFSET, &offset, 0);
- CRREAD (PROGNAME"67", OR_SWFREE, &free, 0);
- CRREAD (PROGNAME"68", OR_SWADDRS, &addrs, 0);
- break;
- case 2:
- CRREAD (PROGNAME"76", OR_LWOFFSET, &offset, 0);
- CRREAD (PROGNAME"77", OR_LWFREE, &free, 0);
- CRREAD (PROGNAME"78", OR_LWADDRS, &addrs, 0);
- break;
- case 4:
- CRREAD (PROGNAME"86", OR_HWOFFSET, &offset, 0);
- CRREAD (PROGNAME"87", OR_HWFREE, &free, 0);
- CRREAD (PROGNAME"88", OR_HWADDRS, &addrs, 0);
- break;
- }
- NTOHL (offset);
- NTOHL (free);
- NTOHL (addrs);
- if (addrs >= min_threshold) {
- printf (" \"");
- tabstop = 0;
- for (ptr = (UCHAR *) buf; *ptr != 0; ptr++) {
- putchar ((*ptr >= 32) ? *ptr : '~');
- tabstop++;
- }
- printf ("\" ");
- while (tabstop++ < 22)
- putchar (' ');
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 2,
- "%c dba=%d:%-7ld ofs=%-9ld adr=%-6ld fre=%ld\n"),
- (addrs >= dbrec.or_reccount) ? '*' : ' ',
- dba >> 24, dba & 0xffffff, offset, addrs, free);
- }
- } /* end verbose */
- else { /* !verbose */
- if (++keycount % KEYS_PER_DOT == 0) {
- putchar ('.');
- if (++dotcount % 10 == 0)
- putchar (' ');
- if (dotcount % 50 == 0) {
- putchar ('\n');
- dotcount = 0;
- }
- fflush (stdout);
- }
- } /* end !verbose dot printing */
- KEYNEXT (PROGNAME"98", vista_field, 0);
- } /* end object key read loop */
- if (dotcount)
- putchar ('\n');
- return;
- } /* count_words() */
- /****************************************/
- /* */
- /* main */
- /* */
- /****************************************/
- int main (int argc, char *argv[])
- {
- int i;
- int oops;
- int dotcount;
- long keycount;
- long total;
- char *ptr;
- int do_objkeys = FALSE;
- int do_wordkeys = FALSE;
- char dbpath[2048];
- char rcs_revision [8];
- char dbname[12];
- time_t now;
- double percent = 0.0;
- int listing_most_words = FALSE;
- static char *word_labels[6] =
- {
- "Short Stems = %8ld\n", "Short Words = %8ld\n",
- "Long Stems = %8ld\n", "Long Words = %8ld\n",
- "Huge Stems = %8ld\n", "Huge Words = %8ld\n"
- };
- aa_argv0 = argv[0];
- time (&now);
- sscanf ("$Revision: /main/3 $", "%*s %s", rcs_revision);
- setlocale (LC_ALL, "");
- dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);
- strftime (buf, sizeof (buf), "%m/%d/%Y, %I:%M %p",
- localtime (&now));
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 3,
- "%s %s, engine %s. %s.\n"),
- aa_argv0, rcs_revision, AUSAPI_VERSION, buf);
- if (argc <= 1) {
- PRINT_USAGE:
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 4,
- "\nUSAGE: %s -o|w|ow [-v] [-t<N> | -p<N>] dbname\n"
- " Reads DtSearch key files and prints summary report.\n"
- " -o Keys examined are OBJECT record keys.\n"
- " -w Keys examined are inverted index WORDS.\n"
- " -v VERBOSE mode, lists every key.\n"
- " -t<N> Threshold. Sets w and v options, and lists only words\n"
- " with >= <N> addresses. All words will be listed if <N> = 1.\n"
- " -p<N> Another threshold. Same as -t except <N> is percent\n"
- " of the entire database (<N> may include a decimal point).\n"
- " For example -p99.9 prints out every word that occurs\n"
- " in 99.9%% or more of the records--an excellent way to find\n"
- " candidates for the stop list.\n"
- " If w and v are set without threshold, default is -t%d.\n"
- " <dbname> 1 - 8 character database name with optional path prefix.\n")
- ,aa_argv0
- ,MIN_THRESHOLD
- );
- DtSearchExit (2);
- }
- /* parse options */
- else { /* argc >= 2 */
- for (;;) {
- /* each pass grabs new token with "-xxx" format */
- --argc;
- ++argv;
- if (argc <= 0)
- break; /* no more tokens of any kind */
- ptr = argv[0];
- if (*ptr != '-')
- break; /* no more option tokens */
- /* examine each char in this -xxx token */
- while (*(++ptr) != 0) {
- switch (*ptr) {
- case 'o':
- do_objkeys = TRUE;
- break;
- case 'w':
- do_wordkeys = TRUE;
- break;
- case 'v':
- do_verbose = TRUE;
- break;
- case 'p':
- do_verbose = TRUE;
- do_wordkeys = TRUE;
- percent = atof (ptr + 1);
- if (percent <= 0.0 || percent > 100.0) {
- fprintf (stderr,
- CATGETS(dtsearch_catd, MS_dtsrkdump, 5,
- "%s Invalid percent value %lf.\a\n"),
- PROGNAME"195", percent);
- goto PRINT_USAGE;
- }
- ptr[1] = 0; /* terminate parse */
- break;
- case 't':
- do_verbose = TRUE;
- do_wordkeys = TRUE;
- if ((min_threshold = atol (ptr + 1)) <= 0L) {
- fprintf (stderr,
- CATGETS(dtsearch_catd, MS_dtsrkdump, 53,
- "%s Invalid threshold value.\a\n"),
- PROGNAME"198");
- goto PRINT_USAGE;
- }
- ptr[1] = 0; /* terminate parse */
- break;
- default:
- fprintf (stderr,
- CATGETS(dtsearch_catd, MS_dtsrkdump, 55,
- "%s Unknown command line argument '%c'.\a\n"),
- PROGNAME"278", *ptr);
- goto PRINT_USAGE;
- } /* end switch */
- } /* end while-loop for each char of -xxx token */
- } /* end for-loop for each -xxx token */
- } /* end of options parse altogether */
- oops = FALSE;
- if (argc <= 0) {
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 56,
- "%s Missing required database name.\a\n"),
- PROGNAME"267");
- oops = TRUE;
- }
- if (!do_wordkeys && !do_objkeys) {
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 57,
- "%s Either -o or -w must be specified.\a\n"),
- PROGNAME"271");
- oops = TRUE;
- }
- if (oops)
- goto PRINT_USAGE;
- /* Database name may have a long path prefix.
- * If so, we need to segregate the two.
- * Set 'ptr' to just the 8 char dictionary name by moving
- * it backwards until first non-alphanumeric character
- * (such as a ":" in the dos drive id or a slash between directories),
- * or to the beginning of string.
- */
- strncpy (dbpath, argv[0], sizeof (dbpath));
- dbpath[sizeof (dbpath) - 1] = 0;
- for (ptr = dbpath + strlen (dbpath) - 1; ptr >= dbpath; ptr--)
- if (!isalnum (*ptr)) {
- ptr++;
- break;
- }
- if (ptr < dbpath)
- ptr = dbpath;
- /* test for valid database name */
- i = strlen (ptr);
- if (i < 1 || i > 8) {
- fprintf (stderr, CATGETS(dtsearch_catd, MS_dtsrkdump, 58,
- "%s Invalid database name '%s'.\a\n"),
- PROGNAME"297", ptr);
- goto PRINT_USAGE;
- }
- strcpy (dbname, ptr);
- *ptr = 0; /* truncate dbname off of full path/dbname */
- /* Open database in read-only mode. */
- db_oflag = O_RDONLY;
- if (!austext_dopen (dbname, dbpath, NULL, 0, &dbrec)) {
- fprintf (stderr, "%s\n", DtSearchGetMessages());
- DtSearchExit (3);
- }
- maxdba = dbrec.or_maxdba;
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 60,
- "%s: '%s' reccount=%ld maxdba=%ld recslots=%hd minw=%hd maxw=%hd\n"),
- aa_argv0, dbname, dbrec.or_reccount,
- dbrec.or_maxdba, dbrec.or_recslots,
- dbrec.or_minwordsz, dbrec.or_maxwordsz);
- /* Adjust threshold if necessary */
- if (percent > 0.0)
- min_threshold = (long)
- ((float) percent * (float) dbrec.or_reccount / 100.0);
- if (min_threshold > dbrec.or_reccount)
- min_threshold = dbrec.or_reccount;
- if (do_wordkeys && do_verbose) {
- if (min_threshold > 1 && min_threshold < dbrec.or_reccount) {
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 70,
- "%s Will only list words occurring "
- "in %ld or more records.\n"),
- aa_argv0, min_threshold);
- listing_most_words =
- (float) min_threshold / (float) dbrec.or_reccount > .90;
- }
- else {
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 80,
- "%s: Listing all words in database.\n"),
- aa_argv0);
- listing_most_words = TRUE;
- }
- }
- if (do_objkeys) {
- /*
- * Allocate and initialize an array of keytype counters, one for
- * each possible ascii keytype char (256).
- */
- counters = austext_malloc (258 * sizeof(long), PROGNAME"113", NULL);
- memset (counters, 0, 258 * sizeof(long));
- dotcount = 0;
- keycount = 0;
- KEYFRST (PROGNAME"111", OR_OBJKEY, 0);
- while (db_status == S_OKAY) {
- KEYREAD (PROGNAME"288", buf);
- (counters[buf[0]])++;
- CRGET (PROGNAME"251", &dba, 0);
- if (maxdba < (dba & 0xffffff))
- maxdba = dba;
- if (do_verbose) {
- /* Mark control and nonascii chars with a period. */
- i = 0;
- putchar ('\"');
- for (ptr = buf; *ptr != 0; ptr++) {
- if (*ptr < 32 | *ptr >= 127) {
- putchar ('.');
- i++;
- }
- else {
- putchar (*ptr);
- i++;
- }
- }
- printf ("\" ");
- while (i++ < DtSrMAX_DB_KEYSIZE)
- putchar (' ');
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 100,
- "dba x%08lx, %6ld\n"), dba, dba);
- } /* end verbose */
- else { /* !verbose */
- if (++keycount % KEYS_PER_DOT == 0) {
- putchar ('.');
- if (++dotcount % 10 == 0)
- putchar (' ');
- if (dotcount % 50 == 0) {
- putchar ('\n');
- dotcount = 0;
- }
- fflush (stdout);
- }
- } /* end !verbose dot printing */
- KEYNEXT (PROGNAME"291", OR_OBJKEY, 0);
- } /* end object key read loop */
- /* Print objkey summary report */
- if (dotcount)
- putchar ('\n');
- if (dbpath[0] == 0)
- buf[0] = 0;
- else
- sprintf (buf, CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
- " in %s"), dbpath);
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 120,
- "Object Summary for '%s'%s:\n"), dbname, buf);
- puts (CATGETS(dtsearch_catd, MS_dtsrkdump, 130,
- "Object Count by Keytypes:"));
- total = 0L;
- for (i = 0; i < 256; i++) {
- if (counters[i] > 0L) {
- total += counters[i];
- if (i > 32 && i < 127)
- printf (" '%c' %6ld\n", i, counters[i]);
- else
- printf (" x%02x %6ld\n", i, counters[i]);
- }
- }
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 160,
- "TOTAL Objects Count = %ld\n"), total);
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 170,
- "Largest Object DBA = %ld\n"), maxdba);
- free (counters);
- } /* end do_objkeys */
- if (do_wordkeys) {
- if (listing_most_words)
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 180,
- "%s: * Words marked with asterisk occur in every record.\n"),
- aa_argv0);
- /*
- * Allocate and initialize word and stem counters. First is for
- * short stems (those beginning with STEM_CH), next is for short
- * words (everything else). Next are for long stems, long words,
- * huge stems, and huge words (6 in all).
- */
- counters = austext_malloc (8 * sizeof (long), PROGNAME"113", NULL);
- memset (counters, 0, 6 * sizeof(long));
- count_words (0); /* short */
- count_words (2); /* long */
- count_words (4); /* huge */
- /* print wordkey summary report */
- if (do_objkeys)
- putchar ('\n'); /* separate from last report */
- if (dbpath[0] == 0)
- buf[0] = 0;
- else
- sprintf (buf, CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
- " in %s"), dbpath);
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 200,
- "Words Summary for '%s'%s:\n"), dbname, buf);
- total = 0L;
- for (i = 0; i < 6; i++) {
- printf (word_labels[i], counters[i]);
- total += counters[i];
- }
- printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 210,
- "TOTAL Words Count = %ld\n"), total);
- free (counters);
- } /* end do_wordkeys */
- DtSearchExit (0);
- } /* main() */
- /*********************** DTSRKDUMP.C *************************/
|