fast_mphf.C 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * $XConsortium: fast_mphf.cc /main/5 1996/07/18 14:35:57 drk $
  25. *
  26. * Copyright (c) 1992 HAL Computer Systems International, Ltd.
  27. * All rights reserved. Unpublished -- rights reserved under
  28. * the Copyright Laws of the United States. USE OF A COPYRIGHT
  29. * NOTICE IS PRECAUTIONARY ONLY AND DOES NOT IMPLY PUBLICATION
  30. * OR DISCLOSURE.
  31. *
  32. * THIS SOFTWARE CONTAINS CONFIDENTIAL INFORMATION AND TRADE
  33. * SECRETS OF HAL COMPUTER SYSTEMS INTERNATIONAL, LTD. USE,
  34. * DISCLOSURE, OR REPRODUCTION IS PROHIBITED WITHOUT THE
  35. * PRIOR EXPRESS WRITTEN PERMISSION OF HAL COMPUTER SYSTEMS
  36. * INTERNATIONAL, LTD.
  37. *
  38. * RESTRICTED RIGHTS LEGEND
  39. * Use, duplication, or disclosure by the Government is subject
  40. * to the restrictions as set forth in subparagraph (c)(l)(ii)
  41. * of the Rights in Technical Data and Computer Software clause
  42. * at DFARS 252.227-7013.
  43. *
  44. * HAL COMPUTER SYSTEMS INTERNATIONAL, LTD.
  45. * 1315 Dell Avenue
  46. * Campbell, CA 95008
  47. *
  48. */
  49. #define NUM_BITS_INCREASES 5
  50. #include "index/fast_mphf.h"
  51. #ifdef C_API
  52. tbl_cache *fast_mphf::v_tbl_cache_ptr = 0;
  53. #define v_tbl_cache (*v_tbl_cache_ptr)
  54. #else
  55. tbl_cache fast_mphf::v_tbl_cache;
  56. #endif
  57. tbl_record::~tbl_record()
  58. {
  59. delete v_tbl0;
  60. delete v_tbl1;
  61. }
  62. tbl_cache::tbl_cache() : f_array(10)
  63. {
  64. }
  65. tbl_cache::~tbl_cache()
  66. {
  67. for ( int v=0; v<f_array.no_elmts(); v++ ) {
  68. delete (tbl_record*)f_array[v];
  69. }
  70. }
  71. void tbl_cache::init_table(int hash_tbl_sz, int seed, atoi_pearson*& t1, atoi_pearson*& t2)
  72. {
  73. int x = f_array.no_elmts();
  74. tbl_record *y = 0;
  75. for ( int v=0; v<x; v++ ) {
  76. y = (tbl_record*)f_array[v];
  77. if ( y -> v_tbl0 != 0 && y -> v_seed == seed )
  78. {
  79. t1 = y -> v_tbl0;
  80. t2 = y -> v_tbl1;
  81. /*
  82. MESSAGE(cerr, "USE cached TABLE!");
  83. debug(cerr, int(v_tbl0));
  84. debug(cerr, int(v_tbl1));
  85. */
  86. return ;
  87. }
  88. }
  89. pm_random z(seed);
  90. t1 = new atoi_pearson(hash_tbl_sz, 128, z);
  91. t2 = new atoi_pearson(hash_tbl_sz, 128, z);
  92. y = new tbl_record(seed, t1, t2);
  93. f_array.insert(y, x);
  94. f_array.reset_elmts(x+1);
  95. }
  96. fast_mphf::fast_mphf(c_code_t c_cd): long_pstring(c_cd),
  97. v_long_string_core_indexed(false),
  98. v_no_ps(0), v_p1(0), v_p2(0),
  99. r(0), v_seed(0), t(0)
  100. {
  101. #ifdef C_API
  102. if ( v_tbl_cache_ptr == 0 ) {
  103. v_tbl_cache_ptr = new tbl_cache;
  104. }
  105. #endif
  106. v_tbl0 = 0;
  107. v_tbl1 = 0;
  108. }
  109. void fast_mphf::init_persistent_info(persistent_info* x)
  110. {
  111. long_pstring::init_persistent_info(x);
  112. if ( get_mode(OLD_OBJECT) == false ) {
  113. v_hash_tbl_sz = 0;
  114. v_no_ps = 0;
  115. v_p1 = 0;
  116. v_p2 = 0;
  117. r = 0;
  118. v_seed = 0;
  119. t = 0;
  120. ihash::init_data_member();
  121. }
  122. }
  123. fast_mphf::~fast_mphf()
  124. {
  125. /*
  126. delete v_tbl0;
  127. delete v_tbl1;
  128. #ifdef C_API
  129. if ( v_tbl_cache_ptr ) {
  130. for (int i=0; i<10; i++ )
  131. delete v_tbl_cache_ptr[i];
  132. delete v_tbl_cache_ptr;
  133. v_tbl_cache_ptr = 0;
  134. }
  135. #endif
  136. */
  137. }
  138. io_status fast_mphf::asciiIn(istream& in)
  139. {
  140. in >> v_hash_tbl_sz;
  141. in >> v_no_ps;
  142. in >> v_p1;
  143. in >> v_p2;
  144. in >> r;
  145. in >> v_seed;
  146. /*
  147. MESSAGE(cerr, "in fast_mphf::asciiIn()");
  148. debug(cerr, my_oid());
  149. debug(cerr, v_hash_tbl_sz);
  150. debug(cerr, v_no_ps);
  151. debug(cerr, v_p1);
  152. debug(cerr, v_p2);
  153. debug(cerr, r);
  154. debug(cerr, v_seed);
  155. */
  156. v_key_set_sz = v_hash_tbl_sz ;
  157. in.get(); // skip the '\n' after seed
  158. long_pstring::asciiIn(in);
  159. if ( v_key_set_sz > 0 ) {
  160. t = (int)(flog2(v_key_set_sz)) + 1; // bits of each g value.
  161. if ( floor(flog2(v_key_set_sz)) < flog2(v_key_set_sz) )
  162. t++;
  163. //MESSAGE(cerr, "compacted array:");
  164. //debug(cerr, long_pstring::size());
  165. //for (int z=0; z<long_pstring::size()/4; z++) {
  166. // cerr << ((unsigned*)long_pstring::get())[z] << " ";
  167. //}
  168. //cerr << "\n";
  169. v_hash_func_sz = v_no_ps*sizeof(unsigned)+128*2+6*sizeof(int);
  170. init_map_tbls();
  171. } else {
  172. v_hash_func_sz = 0;
  173. t = 0;
  174. }
  175. set_mode(UPDATE, true);
  176. return done;
  177. }
  178. Boolean fast_mphf::init_map_tbls()
  179. {
  180. /*
  181. MESSAGE(cerr, "in fast_mphf::init_map_tbls()");
  182. debug(cerr, (void*)this);
  183. debug(cerr, my_oid());
  184. debug(cerr, v_key_set_sz);
  185. debug(cerr, v_hash_tbl_sz);
  186. debug(cerr, int(this));
  187. debug(cerr, v_no_ps);
  188. debug(cerr, v_p1);
  189. debug(cerr, v_p2);
  190. debug(cerr, r);
  191. debug(cerr, v_seed);
  192. debug(cerr, t);
  193. //debug(cerr, (void*)&v_tbl_cache);
  194. */
  195. if ( v_key_set_sz > 0 ) {
  196. v_tbl_cache.init_table(v_hash_tbl_sz, v_seed, v_tbl0, v_tbl1);
  197. }
  198. //long_pstring::init_run_data();
  199. return true;
  200. }
  201. int fast_mphf::hashTo(const key_type& k)
  202. {
  203. unsigned int i;
  204. if ( v_long_string_core_indexed == false ) {
  205. v_long_string_core_indexed = true;
  206. }
  207. /*
  208. cerr << "\n";
  209. MESSAGE(cerr, "fast_mphf:: hashTO()");
  210. debug(cerr, k);
  211. */
  212. if ( v_hash_tbl_sz == 0 ) {
  213. throw(stringException("hash table empty"));
  214. }
  215. i = v_tbl0 -> atoi(k.get(), k.size(), r, v_key_set_sz); // for halmphf
  216. if ( i < v_p1 ) {
  217. i %= v_p2;
  218. } else {
  219. i %= v_no_ps - v_p2;
  220. i += v_p2;
  221. }
  222. int gv, c_bit;
  223. gValue(i, gv, c_bit);
  224. //i = v_tbl1 -> atoi(k, c_bit+r+1) + gv;
  225. //debug(cerr, c_bit+r);
  226. i = v_tbl1 -> atoi(k.get(), k.size(), c_bit+r+1, v_hash_tbl_sz) + gv; // for halmphf
  227. return i % v_hash_tbl_sz;
  228. }
  229. int fast_mphf::gValue(int i, int& gvalue, int& ctl_bit)
  230. {
  231. if ( !INRANGE(i, 0, (int) v_no_ps-1) ) {
  232. throw(boundaryException(0, v_no_ps-1, i));
  233. }
  234. int a, b;
  235. unsigned un_compacted, un_compacted1;
  236. a = b = t * i ;
  237. a /= BITS_IN(unsigned);
  238. b %= BITS_IN(unsigned);
  239. unsigned value_at_a, value_at_a_plus;
  240. /*
  241. debug(cerr, a);
  242. debug(cerr, b);
  243. debug(cerr, t);
  244. */
  245. char x_buf[10];
  246. long_pstring::extract(a*sizeof(a), (a+1)*sizeof(a), x_buf);
  247. memcpy((char*)&value_at_a, x_buf, sizeof(value_at_a));
  248. //cerr << "Extract: at " << a << "; the int =" << value_at_a << endl;
  249. #ifdef PORTABLE_DB
  250. if ( swap_order() == true )
  251. ORDER_SWAP_UINT(value_at_a);
  252. #endif
  253. //cerr << "after swap " << a << "; the int =" << value_at_a << endl;
  254. //debug(cerr, hex(value_at_a));
  255. if ( BITS_IN(unsigned) - b >= t ) {
  256. un_compacted = getbits(value_at_a, BITS_IN(unsigned) - b, t);
  257. } else {
  258. long_pstring::extract((a+1)*sizeof(a), (a+2)*sizeof(a), x_buf);
  259. memcpy((char*)&value_at_a_plus, x_buf, sizeof(value_at_a_plus));
  260. //cerr << "Extract+1: at " << (a+1) << "; the int =" << value_at_a_plus << endl;
  261. #ifdef PORTABLE_DB
  262. if ( swap_order() == true )
  263. ORDER_SWAP_UINT(value_at_a_plus);
  264. #endif
  265. //cerr << "after swap " << (a+1) << "; the int =" << value_at_a_plus << endl;
  266. //debug(cerr, hex(value_at_a_plus));
  267. un_compacted1 =
  268. getbits(value_at_a, BITS_IN(unsigned) - b, BITS_IN(unsigned) - b);
  269. un_compacted =
  270. getbits(value_at_a_plus, BITS_IN(unsigned), t - BITS_IN(unsigned) + b);
  271. un_compacted1 <<= ( t - BITS_IN(unsigned) + b);
  272. un_compacted |= un_compacted1;
  273. }
  274. //debug(cerr, hex(un_compacted));
  275. ctl_bit = un_compacted & (unsigned)1;
  276. gvalue = un_compacted >> 1;
  277. return 0;
  278. }
  279. Boolean fast_mphf::build(const char *from)
  280. {
  281. fstream in(from, ios::in);
  282. return build(in);
  283. }
  284. Boolean fast_mphf::build(istream& in)
  285. {
  286. int ok = -1;
  287. sorter stor(in);
  288. params pms;
  289. pms.v_n = stor.no_unique_keys();
  290. pms.select_value();
  291. buffer mphf_spec(LBUFSIZ);
  292. //buffer& mphf_spec = get_store() -> aux_buf();
  293. mphf_spec.set_swap_order(swap_order());
  294. int i=0;
  295. while ( i<NUM_BITS_INCREASES && ok != 0 ) {
  296. ok = compute_a_mphf(stor.unique_keys(), pms, mphf_spec);
  297. switch (ok) {
  298. case 0:
  299. break;
  300. case 1:
  301. pms.re_select_value();
  302. break;
  303. case -1:
  304. throw(stringException("finding a mphf failed"));
  305. }
  306. i++;
  307. }
  308. if ( ok != 0 ) {
  309. set_mode(HEALTH, false);
  310. throw(stringException("finding a mphf failed"));
  311. }
  312. stringstream strin;
  313. if ( !strin ) {
  314. throw(streamException(strin.rdstate()));
  315. }
  316. else {
  317. strin.write(mphf_spec.get_base(), mphf_spec.content_sz());
  318. }
  319. asciiIn(strin);
  320. set_mode(HEALTH, true);
  321. return true;
  322. }
  323. void
  324. fast_mphf::print_mapping(const char *key_file, int option)
  325. {
  326. //debug(cerr, option);
  327. MESSAGE(cerr, "print_mapping()");
  328. char string[LBUFSIZ];
  329. fstream in(key_file, ios::in);
  330. if ( !in ) {
  331. throw(streamException(in.rdstate()));
  332. }
  333. char *hash_table = new char[v_hash_tbl_sz];
  334. for (unsigned int i = 0; i < v_hash_tbl_sz; hash_table[i++] = 0 );
  335. ostring lbuf(LBUFSIZ);
  336. while ( in.getline(string, LBUFSIZ, '\n') ) {
  337. // string[strlen(string)-1] = '\0';
  338. //debug(cerr, string);
  339. //debug(cerr, strlen(string));
  340. lbuf.reset();
  341. lbuf.set(string, strlen(string));
  342. int hash = hashTo(lbuf) ;
  343. if ( option == 1 ) {
  344. cout << " string = " << string;
  345. cout << ", hash = " << hash << "\n";
  346. }
  347. if ( hash_table[hash] == 1 )
  348. MESSAGE(cerr, "mapping_print(): panic: mphf hash collision");
  349. else
  350. hash_table[hash] = 1;
  351. in.getline(string, LBUFSIZ, '\n');
  352. }
  353. MESSAGE(cerr, "print_mapping() done");
  354. }
  355. void fast_mphf::print_tbls(ostream& out)
  356. {
  357. debug(out, *v_tbl0);
  358. debug(out, *v_tbl1);
  359. }
  360. void fast_mphf::print_gvalues(ostream& out)
  361. {
  362. int gv, cbit;
  363. for (unsigned int i = 0; i<v_no_ps; i++ ) {
  364. out << i;
  365. gValue(i, gv, cbit);
  366. out << " " << gv << " " << cbit << "\n";
  367. }
  368. }
  369. int fast_mphf::print_bits(unsigned x, ostream& out)
  370. {
  371. for ( unsigned int i=0; i<8*sizeof(unsigned); i++ ) {
  372. if ( BIT_TEST(x, 0x80000000) )
  373. out << "1";
  374. else
  375. out << "0";
  376. x = x << 1;
  377. }
  378. out << "\n";
  379. return 0;
  380. }
  381. int fast_mphf::cdr_sizeof()
  382. {
  383. return long_pstring::cdr_sizeof() + ihash::cdr_sizeof() +
  384. 6*sizeof(unsigned int);
  385. }
  386. io_status fast_mphf::cdrOut(buffer& buf)
  387. {
  388. long_pstring::cdrOut(buf);
  389. ihash::cdrOut(buf);
  390. buf.put(v_no_ps);
  391. buf.put(v_p1);
  392. buf.put(v_p2);
  393. buf.put(r);
  394. buf.put(v_seed);
  395. buf.put(t);
  396. return done;
  397. }
  398. io_status fast_mphf::cdrIn(buffer& buf)
  399. {
  400. long_pstring::cdrIn(buf);
  401. ihash::cdrIn(buf);
  402. buf.get(v_no_ps);
  403. buf.get(v_p1);
  404. buf.get(v_p2);
  405. buf.get(r);
  406. buf.get(v_seed);
  407. buf.get(t);
  408. init_map_tbls();
  409. return done;
  410. }
  411. MMDB_BODIES(fast_mphf)
  412. HANDLER_BODIES(fast_mphf)