isduprec.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * COMPONENT_NAME: austext
  25. *
  26. * FUNCTIONS: dump_hashtab
  27. * is_duprec
  28. * main
  29. *
  30. * ORIGINS: 27
  31. *
  32. *
  33. * (C) COPYRIGHT International Business Machines Corp. 1993,1995
  34. * All Rights Reserved
  35. * Licensed Materials - Property of IBM
  36. * US Government Users Restricted Rights - Use, duplication or
  37. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  38. */
  39. /******************* ISDUPREC.C *******************
  40. * $XConsortium: isduprec.c /main/5 1996/05/07 13:37:35 drk $
  41. * June 1993.
  42. * Is_duprec() returns 0 (FALSE) for every record id it is passed
  43. * unless one is passed that duplicates a previous one,
  44. * in which case it returns 1 (TRUE).
  45. * It ensures that duplicate record ids in an .fzk file
  46. * are not processed by either ravel or borodin.
  47. * It does it by storing each recid into a hash table and
  48. * searching the table before storing a new recid.
  49. * Returns 2 on errors (malloc out of space, etc);
  50. *
  51. * Global 'duprec_hashsize' can be changed to any rational value
  52. * for a hash table size (say 1000 to 30,000) prior to the first call
  53. * of is_duprec(). It should be roughly => to the total number of
  54. * different record ids expected to be passed to is_duprec().
  55. * If initialized to 0 before the first call, that will disable
  56. * duplicate checking, i.e. is_duprec() will allocate no memory
  57. * and always return 0.
  58. *
  59. * $Log$
  60. * Revision 2.2 1995/10/25 17:22:48 miker
  61. * Added prolog.
  62. *
  63. * Revision 2.1 1995/09/22 20:56:44 miker
  64. * Freeze DtSearch 0.1, AusText 2.1.8
  65. *
  66. * Revision 1.3 1995/09/05 18:11:45 miker
  67. * Minor changes so ansi c compilers won't whine.
  68. */
  69. #include <stdlib.h>
  70. #include <string.h>
  71. #ifdef TEST
  72. #include <stdio.h>
  73. #include <errno.h>
  74. #endif
  75. #define PROGNAME "ISDUPREC"
  76. #define HASHSIZE 3000L
  77. #define NOT_A_DUP 0
  78. #define IS_A_DUP 1
  79. #define OUT_OF_MEM 2
  80. unsigned long duprec_hashsize = HASHSIZE;
  81. /************************************************/
  82. /* */
  83. /* HASHNODE */
  84. /* */
  85. /************************************************/
  86. /* The hash table is a HASHSIZE array of pointers to these structures.
  87. * Each pointer is initialized to NULL.
  88. * Additions are handled by filling in a HASHNODE pointed to
  89. * by the table pointer. The 'recid' is NOT a char array of length
  90. * 1, but a string whose length varies depending on the actual
  91. * length of the passed record id. Each hashnode is malloced
  92. * for exactly the right length. Collisions are handled by linking
  93. * additional nodes off of the original one.
  94. */
  95. typedef struct hash_tag {
  96. struct hash_tag *link;
  97. char recid[2]; /* actual array size varies */
  98. } HASHNODE;
  99. #ifdef TEST
  100. /************************************************/
  101. /* */
  102. /* dump_hashtab() */
  103. /* */
  104. /************************************************/
  105. /* For debugging, prints out all recids in hashtab, skipping empty bkts */
  106. static void dump_hashtab (HASHNODE ** hashtab)
  107. {
  108. HASHNODE *hp, **hpp;
  109. int i;
  110. printf (PROGNAME "67 dump_hashtab(%p):\n", hashtab);
  111. for (i = 0, hpp = hashtab; i < duprec_hashsize; i++, hpp++) {
  112. if (*hpp) {
  113. printf (" %4d:", i);
  114. fflush (stdout);
  115. for (hp = *hpp; hp != NULL; hp = hp->link)
  116. printf (" '%s'", hp->recid);
  117. putchar ('\n');
  118. fflush (stdout);
  119. }
  120. }
  121. return;
  122. } /* dump_hashtab() */
  123. #endif /* TEST */
  124. /************************************************/
  125. /* */
  126. /* is_duprec() */
  127. /* */
  128. /************************************************/
  129. /* Normal return is 0 indicating that passed record id is unique.
  130. * Also immediately returns 0 if duplicate checking has been
  131. * turned off by setting global 'duprec_hashsize' to zero.
  132. * Returns 1 if record id is a duplicate.
  133. * Returns 2 if out of memory.
  134. * First call uses 'duprec_hashsize' to create hash table.
  135. */
  136. int is_duprec (char *recid)
  137. {
  138. static HASHNODE **hashtab = NULL;
  139. static unsigned long primes[10] =
  140. {1013, 1511, 2203, 3511, 5003, 10007, 15013, 20011, 25013, 30001};
  141. unsigned long i;
  142. char *cp;
  143. unsigned long sum;
  144. HASHNODE *hp, **hpp;
  145. if (duprec_hashsize == 0UL)
  146. return NOT_A_DUP;
  147. /* Generate hash table at first call only */
  148. if (hashtab == NULL) {
  149. /*
  150. * adjust table size upward to nearest preordained prime
  151. * number
  152. */
  153. for (i = 0; i < 9 && primes[i] < duprec_hashsize; i++);
  154. duprec_hashsize = primes[i];
  155. #ifdef TEST
  156. printf (PROGNAME "117 Create hash table, duprec_hashsize set = %ld.\n",
  157. duprec_hashsize);
  158. #endif
  159. hashtab = malloc ((duprec_hashsize + 2L) * sizeof (HASHNODE *));
  160. if (hashtab == NULL)
  161. return OUT_OF_MEM;
  162. /* init table to all NULL pointers. */
  163. hpp = hashtab;
  164. for (i = duprec_hashsize + 2L; i > 0L; i--)
  165. *hpp++ = NULL;
  166. }
  167. /*****dump_hashtab(hashtab);******/
  168. /* HASH FUNCTION: H(recid) = (SUM(i*recid[i])) mod M,
  169. * where M is table size (prime), and SUM is calculated
  170. * for i=1 to end of recid. Multiplying the position by the character
  171. * value at that position minimizes the influence of identical
  172. * characters at the beginnings and ends of recids,
  173. * and also usually yields a number larger than M.
  174. * Not skipping over the first position (the keytype char) helps
  175. * efficiently catch recids that are blank after the keytype.
  176. */
  177. sum = 0UL;
  178. i = 1;
  179. cp = recid;
  180. while (*cp != 0)
  181. sum += i++ * (*cp++);
  182. hpp = &(hashtab[sum % duprec_hashsize]); /* hpp = head of linked
  183. * list */
  184. #ifdef TEST
  185. printf (PROGNAME "150 is_duprec('%s')=hashtab[%lu]=%p: ",
  186. recid, sum % duprec_hashsize, *hpp);
  187. fflush (stdout);
  188. i = 0;
  189. #endif
  190. /* Search linked list (if any) for hashnode containing recid */
  191. for (hp = *hpp; hp != NULL; hp = hp->link) {
  192. #ifdef TEST
  193. i++;
  194. #endif
  195. if (strcmp (hp->recid, recid) == 0) {
  196. #ifdef TEST
  197. printf ("DUP!@listpos=%d\n", i);
  198. #endif
  199. return IS_A_DUP;
  200. }
  201. hpp = &hp->link; /* now hpp = tail of linked list */
  202. }
  203. #ifdef TEST
  204. printf ("miss@listlen=%d\n", i);
  205. #endif
  206. /* Not a duplicate. Add current recid to hash table. */
  207. if ((hp = malloc (sizeof (HASHNODE) + strlen (recid) + 2)) == NULL)
  208. return OUT_OF_MEM;
  209. strcpy (hp->recid, recid);
  210. hp->link = NULL;
  211. /*****hp->link = *hpp;******/
  212. *hpp = hp;
  213. return NOT_A_DUP;
  214. } /* is_duprec() */
  215. #ifdef MAIN
  216. /************************************************/
  217. /* */
  218. /* main() */
  219. /* */
  220. /************************************************/
  221. main (int argc, char *argv[])
  222. {
  223. int i;
  224. FILE *f;
  225. char buf[2048];
  226. if (argc < 2) {
  227. printf ("USAGE: %s <file> [n]\n"
  228. "where file contains list of char strings\n"
  229. "and optional n changes hash table size.\n",
  230. argv[0]);
  231. return;
  232. }
  233. if ((f = fopen (argv[1], "r")) == NULL) {
  234. printf ("Can't open %s: %s\n", argv[1], strerror (errno));
  235. return;
  236. }
  237. if (argc >= 3)
  238. duprec_hashsize = atol (argv[2]);
  239. while (fgets (buf, sizeof (buf), f) != NULL) {
  240. buf[sizeof (buf) - 1] = 0;
  241. i = is_duprec (buf);
  242. printf ("%s", buf); /* each buf should end in \n */
  243. if (i > 1)
  244. break;
  245. }
  246. return;
  247. }
  248. #endif
  249. /******************* ISDUPREC.C *******************/