huffman.C 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * $XConsortium: huffman.cc /main/3 1996/06/11 17:15:06 cde-hal $
  25. *
  26. * Copyright (c) 1993 HAL Computer Systems International, Ltd.
  27. * All rights reserved. Unpublished -- rights reserved under
  28. * the Copyright Laws of the United States. USE OF A COPYRIGHT
  29. * NOTICE IS PRECAUTIONARY ONLY AND DOES NOT IMPLY PUBLICATION
  30. * OR DISCLOSURE.
  31. *
  32. * THIS SOFTWARE CONTAINS CONFIDENTIAL INFORMATION AND TRADE
  33. * SECRETS OF HAL COMPUTER SYSTEMS INTERNATIONAL, LTD. USE,
  34. * DISCLOSURE, OR REPRODUCTION IS PROHIBITED WITHOUT THE
  35. * PRIOR EXPRESS WRITTEN PERMISSION OF HAL COMPUTER SYSTEMS
  36. * INTERNATIONAL, LTD.
  37. *
  38. * RESTRICTED RIGHTS LEGEND
  39. * Use, duplication, or disclosure by the Government is subject
  40. * to the restrictions as set forth in subparagraph (c)(l)(ii)
  41. * of the Rights in Technical Data and Computer Software clause
  42. * at DFARS 252.227-7013.
  43. *
  44. * HAL COMPUTER SYSTEMS INTERNATIONAL, LTD.
  45. * 1315 Dell Avenue
  46. * Campbell, CA 95008
  47. *
  48. */
  49. #include "compression/huffman.h"
  50. #include "dstr/heap.h"
  51. ////////////////////////////////////////
  52. //
  53. ////////////////////////////////////////
  54. htr_node::htr_node(encoding_unit* e, htr_node* lt, htr_node* rt):
  55. left(lt), right(rt), eu(e), freq(e->freq), parent(0)
  56. {
  57. }
  58. htr_node::htr_node(unsigned long f, htr_node* lt, htr_node* rt):
  59. left(lt), right(rt), eu(0), freq(f), parent(0)
  60. {
  61. }
  62. htr_node::~htr_node()
  63. {
  64. delete left;
  65. delete right;
  66. }
  67. ////////////////////////////////////////
  68. //
  69. ////////////////////////////////////////
  70. Boolean htr_eq(const void* n1, const void* n2)
  71. {
  72. if ( ((htr_node*)n1) -> freq == ((htr_node*)n2) -> freq )
  73. return true;
  74. else
  75. return false;
  76. }
  77. Boolean htr_ls(const void* n1, const void* n2)
  78. {
  79. if ( ((htr_node*)n1) -> freq > ((htr_node*)n2) -> freq )
  80. return true;
  81. else
  82. return false;
  83. }
  84. ////////////////////////////////////////
  85. //
  86. ////////////////////////////////////////
  87. huff::huff(): compress_agent(HUFFMAN_AGENT_CODE),
  88. e_units(0), cts(0), tri(new trie(26)), htr_root(0)
  89. {
  90. }
  91. huff::~huff()
  92. {
  93. delete tri;
  94. delete htr_root;
  95. }
  96. void huff::build_tree()
  97. {
  98. heap htr_node_set(htr_eq, htr_ls, cts);
  99. htr_node* x ;
  100. for (unsigned int i=0; i<cts; i++ ) {
  101. if ( e_units[i] ) {
  102. x = new htr_node(e_units[i]);
  103. e_units[i] -> leaf_htr_node = x;
  104. htr_node_set.insert(x);
  105. }
  106. }
  107. htr_node_set.heapify();
  108. htr_node *n1, *n2, *n3;
  109. while ( htr_node_set.count() > 1 ) {
  110. // max is the smallest element. see htr_ls()
  111. n1 = (htr_node*)htr_node_set.max_elm() ;
  112. htr_node_set.delete_max() ;
  113. // max is the smallest element. see htr_ls()
  114. n2 = (htr_node*)htr_node_set.max_elm() ;
  115. htr_node_set.delete_max() ;
  116. n3 = new htr_node(n1->freq+n2->freq, n1, n2);
  117. n1 -> parent = n2 -> parent = n3;
  118. htr_node_set.insert_heapify(n3);
  119. }
  120. htr_root = (htr_node*)htr_node_set.max_elm();
  121. htr_node_set.delete_max() ;
  122. }
  123. void huff::calculate_code()
  124. {
  125. htr_node* x ;
  126. htr_node* parent;
  127. for (unsigned int i=0; i<cts; i++ ) {
  128. if ( e_units[i] == 0 )
  129. continue;
  130. e_units[i] -> code = 0;
  131. e_units[i] -> bits = 0;
  132. x = e_units[i] -> leaf_htr_node;
  133. while ( x ) {
  134. parent = x -> parent;
  135. if ( parent == 0 )
  136. break;
  137. e_units[i] -> code >>= 1;
  138. if ( parent -> left == x ) {
  139. e_units[i] -> code |= 0x80000000;
  140. } else
  141. if ( parent -> right != x ) {
  142. debug(cerr, i);
  143. throw(stringException("huffman tree corrupted"));
  144. }
  145. x = parent;
  146. e_units[i] -> bits++;
  147. if ( e_units[i] -> bits > (int) BITS_IN(unsigned long) ) {
  148. debug(cerr, e_units[i] -> bits);
  149. throw(stringException("huffman tree too deep"));
  150. }
  151. }
  152. e_units[i] -> code >>= ( 32 - e_units[i] -> bits );
  153. //debug(cerr, hex(e_units[i] -> code));
  154. }
  155. }
  156. ostream& huff::print_alphabet(ostream& out)
  157. {
  158. unsigned long total_uncmp = 0;
  159. unsigned long int total_cmp = 0;
  160. for (unsigned int i=0; i<cts; i++ ) {
  161. if ( e_units[i] == 0 )
  162. continue;
  163. total_uncmp += (e_units[i] -> word -> size()) * (e_units[i] -> freq);
  164. total_cmp += (e_units[i] -> bits) * (e_units[i] -> freq);
  165. out << *(e_units[i] -> word) << ":" << e_units[i]->bits << "\n";
  166. }
  167. total_cmp = total_cmp / 8 + total_cmp % 8;
  168. /*
  169. debug(cerr, total_uncmp);
  170. debug(cerr, total_cmp);
  171. debug(cerr, 1 - float(total_cmp) / float(total_uncmp) );
  172. */
  173. return out;
  174. }
  175. // self modifying buf ptr after taking an encoding unit.
  176. encoding_unit* huff::get_e_unit(unsigned char*& buf, int len)
  177. {
  178. encoding_unit* x = tri -> parse(buf, len) ;
  179. //debug(cerr, *(x -> word));
  180. buf += x -> word -> size();
  181. return x;
  182. }
  183. int total_uncomp = 0;
  184. int total_comp = 0;
  185. void huff::compress(const buffer& uncompressed, buffer& compressed)
  186. {
  187. //debug(cerr, *(buffer*)&uncompressed);
  188. if ( compressed.buf_sz() < uncompressed.buf_sz() )
  189. compressed.expand_chunk(uncompressed.buf_sz());
  190. unsigned short total_bits = 0;
  191. int uncmp_sz = uncompressed.content_sz();
  192. unsigned char* buf = (unsigned char*)uncompressed.get_base();
  193. unsigned int code_buf = 0;
  194. unsigned int rem_long = 0;
  195. int rem_bits = 0;
  196. encoding_unit *e_ptr = 0;
  197. while ( uncmp_sz > 0 ) {
  198. //e_ptr = get_e_unit(buf, uncmp_sz);
  199. e_ptr = tri -> parse(buf, uncmp_sz);
  200. buf += e_ptr -> word -> size();
  201. uncmp_sz -= e_ptr -> word -> size();
  202. if ( rem_bits + e_ptr -> bits > 32 ) {
  203. code_buf = e_ptr -> code; // shift bits to the higher end
  204. rem_long <<= 32-rem_bits;
  205. rem_bits += e_ptr -> bits - 32; // new rem_bits
  206. code_buf >>= rem_bits; // get padding part
  207. rem_long |= code_buf; // padding
  208. compressed.put( rem_long );
  209. // save remaining (rem_bits + e_ptr -> bits - 32) bits to rem_bits.
  210. rem_long = e_ptr -> code & (~0L >> (32 - rem_bits));
  211. } else {
  212. rem_long <<= e_ptr -> bits;
  213. rem_long |= e_ptr -> code;
  214. rem_bits += e_ptr -> bits;
  215. //debug(cerr, hex(rem_long));
  216. }
  217. total_bits += e_ptr -> bits;
  218. total_bits &= 0x1f; // take the mod on 32
  219. }
  220. if ( rem_bits > 0 ) {
  221. rem_long <<= 32 - rem_bits;
  222. //MESSAGE(cerr, "PUT");
  223. //debug(cerr, hex(rem_long));
  224. compressed.put( rem_long );
  225. }
  226. //debug(cerr, total_bits);
  227. compressed.put(char(total_bits));
  228. // total_uncomp += uncompressed.content_sz();
  229. // total_comp += compressed.content_sz();
  230. /*
  231. debug(cerr, total_uncomp);
  232. debug(cerr, total_comp);
  233. debug(cerr,
  234. 1-float(compressed.content_sz()-1)/float(uncompressed.content_sz())
  235. );
  236. */
  237. }
  238. void huff::decompress(buffer& compressed, buffer& uncompressed)
  239. {
  240. char* buf_base = uncompressed.get_base();
  241. char* str;
  242. int str_len;
  243. char rem_bits;
  244. int ct = (compressed.content_sz() - 1) >> 2;
  245. unsigned int c;
  246. int bits_bound = 32;
  247. htr_node *node_ptr = htr_root;
  248. do {
  249. compressed.get(c); ct--;
  250. if ( ct == 0 ) {
  251. compressed.get(rem_bits);
  252. //debug(cerr, int(rem_bits));
  253. bits_bound = rem_bits ;
  254. }
  255. for ( int i=0;i<bits_bound; i++ ) {
  256. if ( node_ptr -> left == 0 && node_ptr -> right == 0 ) {
  257. //for ( int j=0; j<node_ptr -> eu -> word -> size(); j++ ) {
  258. // cerr << (node_ptr -> eu -> word -> get())[j];
  259. //}
  260. str_len = node_ptr -> eu -> word -> size();
  261. str = node_ptr -> eu -> word -> get();
  262. if ( str_len == 1 ) {
  263. *buf_base = str[0];
  264. buf_base++;
  265. // uncompressed.put((node_ptr -> eu -> word -> get())[0]);
  266. } else {
  267. memcpy(buf_base, str, str_len);
  268. buf_base += str_len;
  269. /*
  270. uncompressed.put( node_ptr -> eu -> word -> get(),
  271. node_ptr -> eu -> word -> size()
  272. );
  273. */
  274. }
  275. node_ptr = htr_root;
  276. }
  277. if ( c & 0x80000000 )
  278. node_ptr = node_ptr -> left;
  279. else
  280. node_ptr = node_ptr -> right;
  281. c <<= 1;
  282. }
  283. } while ( ct>0 );
  284. //debug(cerr, buf_base-uncompressed.get_base());
  285. uncompressed.set_content_sz(buf_base-uncompressed.get_base());
  286. if ( rem_bits > 0 )
  287. uncompressed.put( node_ptr -> eu -> word -> get(),
  288. node_ptr -> eu -> word -> size()
  289. );
  290. }
  291. //////////////////////////////////////////////////////////
  292. //
  293. //////////////////////////////////////////////////////////
  294. MMDB_BODIES(huff)
  295. int huff::cdr_sizeof()
  296. {
  297. return pstring::cdr_sizeof();
  298. }
  299. io_status huff::cdrOut(buffer& buf)
  300. {
  301. //MESSAGE(cerr, "huff::cdrOut");
  302. //debug(cerr, my_oid());
  303. static buffer v_out_buf(LBUFSIZ);
  304. unsigned int i;
  305. if ( cts > 0 ) {
  306. //MESSAGE(cerr, "huff::cdrOut: dict out");
  307. int sz = sizeof(int);
  308. for ( i=0; i<cts; i++ ) {
  309. sz += ( e_units[i] -> word -> size() +
  310. sizeof(unsigned int) +
  311. sizeof(char)
  312. );
  313. }
  314. v_out_buf.expand_chunk(sz);
  315. v_out_buf.put(cts);
  316. int word_sz;
  317. for ( i=0; i<cts; i++ ) {
  318. word_sz = e_units[i] -> word -> size();
  319. v_out_buf.put(char(word_sz));
  320. v_out_buf.put(e_units[i] -> word -> get(), word_sz);
  321. v_out_buf.put(e_units[i] -> freq);
  322. }
  323. pstring::update(v_out_buf.get_base(), v_out_buf.content_sz());
  324. }
  325. return pstring::cdrOut(buf);
  326. }
  327. // format:
  328. // entries_int
  329. // (len_byte word_chars freq_int)+
  330. //
  331. io_status huff::cdrIn(buffer& buf)
  332. {
  333. static buffer v_in_buf(0);
  334. pstring::cdrIn(buf);
  335. if ( pstring::size() > 0 ) {
  336. v_in_buf.set_chunk(pstring::get(), pstring::size());
  337. v_in_buf.set_content_sz(pstring::size());
  338. v_in_buf.get(cts);
  339. char word_buf[BUFSIZ];
  340. char word_sz;
  341. unsigned int word_freq;
  342. //ostring *z = 0;
  343. for ( unsigned int i=0; i<cts; i++ ) {
  344. v_in_buf.get(word_sz);
  345. v_in_buf.get(word_buf, int(word_sz));
  346. v_in_buf.get(word_freq);
  347. /*
  348. z = new ostring((char*)word_buf, word_sz);
  349. extend_alphabet();
  350. alphabet[alphabet_sz++] = new encoding_unit(z, word_freq);
  351. */
  352. tri -> add_to_alphabet((unsigned char*)word_buf, word_sz, word_freq);
  353. }
  354. e_units = tri -> get_alphabet(cts);
  355. build_tree();
  356. calculate_code();
  357. delete tri; tri = 0;
  358. //print_alphabet(cerr);
  359. }
  360. return done;
  361. }
  362. trie* alphabet = 0;
  363. void trie_add_wrap(unsigned char* buf, int len, int action_num)
  364. {
  365. switch ( action_num ) {
  366. case 1:
  367. alphabet -> add(buf, len);
  368. break;
  369. case 2:
  370. alphabet -> add_letters(buf, len);
  371. break;
  372. default:
  373. debug(cerr, action_num);
  374. throw(stringException("unknown action number"));
  375. }
  376. }
  377. io_status huff::build_dict(lex_func_t f_lex, getchar_func_t f_getchar)
  378. {
  379. MESSAGE(cerr, "get to huff build dict");
  380. fill_buf_func = f_getchar;
  381. alphabet = tri;
  382. lex_action_func = trie_add_wrap;
  383. if ( (*f_lex)() != 0 )
  384. throw(stringException("huff::asciiIn(): Parsing input failed"));
  385. e_units = tri -> get_alphabet(cts);
  386. //debug(cerr, *tri);
  387. build_tree();
  388. calculate_code();
  389. //print_alphabet(cerr);
  390. set_mode(UPDATE, true);
  391. return done;
  392. }