blocksort.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079
  1. /*
  2. * bzip2 is written by Julian Seward <jseward@bzip.org>.
  3. * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
  4. * See README and LICENSE files in this directory for more information.
  5. */
  6. /*-------------------------------------------------------------*/
  7. /*--- Block sorting machinery ---*/
  8. /*--- blocksort.c ---*/
  9. /*-------------------------------------------------------------*/
  10. /* ------------------------------------------------------------------
  11. This file is part of bzip2/libbzip2, a program and library for
  12. lossless, block-sorting data compression.
  13. bzip2/libbzip2 version 1.0.4 of 20 December 2006
  14. Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
  15. Please read the WARNING, DISCLAIMER and PATENTS sections in the
  16. README file.
  17. This program is released under the terms of the license contained
  18. in the file LICENSE.
  19. ------------------------------------------------------------------ */
  20. /* #include "bzlib_private.h" */
  21. #define mswap(zz1, zz2) \
  22. { \
  23. int32_t zztmp = zz1; \
  24. zz1 = zz2; \
  25. zz2 = zztmp; \
  26. }
  27. static
  28. /* No measurable speed gain with inlining */
  29. /* ALWAYS_INLINE */
  30. void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
  31. {
  32. while (zzn > 0) {
  33. mswap(ptr[zzp1], ptr[zzp2]);
  34. zzp1++;
  35. zzp2++;
  36. zzn--;
  37. }
  38. }
  39. static
  40. ALWAYS_INLINE
  41. int32_t mmin(int32_t a, int32_t b)
  42. {
  43. return (a < b) ? a : b;
  44. }
  45. /*---------------------------------------------*/
  46. /*--- Fallback O(N log(N)^2) sorting ---*/
  47. /*--- algorithm, for repetitive blocks ---*/
  48. /*---------------------------------------------*/
  49. /*---------------------------------------------*/
  50. static
  51. inline
  52. void fallbackSimpleSort(uint32_t* fmap,
  53. uint32_t* eclass,
  54. int32_t lo,
  55. int32_t hi)
  56. {
  57. int32_t i, j, tmp;
  58. uint32_t ec_tmp;
  59. if (lo == hi) return;
  60. if (hi - lo > 3) {
  61. for (i = hi-4; i >= lo; i--) {
  62. tmp = fmap[i];
  63. ec_tmp = eclass[tmp];
  64. for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
  65. fmap[j-4] = fmap[j];
  66. fmap[j-4] = tmp;
  67. }
  68. }
  69. for (i = hi-1; i >= lo; i--) {
  70. tmp = fmap[i];
  71. ec_tmp = eclass[tmp];
  72. for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
  73. fmap[j-1] = fmap[j];
  74. fmap[j-1] = tmp;
  75. }
  76. }
  77. /*---------------------------------------------*/
  78. #define fpush(lz,hz) { \
  79. stackLo[sp] = lz; \
  80. stackHi[sp] = hz; \
  81. sp++; \
  82. }
  83. #define fpop(lz,hz) { \
  84. sp--; \
  85. lz = stackLo[sp]; \
  86. hz = stackHi[sp]; \
  87. }
  88. #define FALLBACK_QSORT_SMALL_THRESH 10
  89. #define FALLBACK_QSORT_STACK_SIZE 100
  90. static
  91. void fallbackQSort3(uint32_t* fmap,
  92. uint32_t* eclass,
  93. int32_t loSt,
  94. int32_t hiSt)
  95. {
  96. int32_t sp;
  97. uint32_t r;
  98. int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
  99. int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
  100. r = 0;
  101. sp = 0;
  102. fpush(loSt, hiSt);
  103. while (sp > 0) {
  104. int32_t unLo, unHi, ltLo, gtHi, n, m;
  105. int32_t lo, hi;
  106. uint32_t med;
  107. uint32_t r3;
  108. AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
  109. fpop(lo, hi);
  110. if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
  111. fallbackSimpleSort(fmap, eclass, lo, hi);
  112. continue;
  113. }
  114. /* Random partitioning. Median of 3 sometimes fails to
  115. * avoid bad cases. Median of 9 seems to help but
  116. * looks rather expensive. This too seems to work but
  117. * is cheaper. Guidance for the magic constants
  118. * 7621 and 32768 is taken from Sedgewick's algorithms
  119. * book, chapter 35.
  120. */
  121. r = ((r * 7621) + 1) % 32768;
  122. r3 = r % 3;
  123. if (r3 == 0)
  124. med = eclass[fmap[lo]];
  125. else if (r3 == 1)
  126. med = eclass[fmap[(lo+hi)>>1]];
  127. else
  128. med = eclass[fmap[hi]];
  129. unLo = ltLo = lo;
  130. unHi = gtHi = hi;
  131. while (1) {
  132. while (1) {
  133. if (unLo > unHi) break;
  134. n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
  135. if (n == 0) {
  136. mswap(fmap[unLo], fmap[ltLo]);
  137. ltLo++;
  138. unLo++;
  139. continue;
  140. }
  141. if (n > 0) break;
  142. unLo++;
  143. }
  144. while (1) {
  145. if (unLo > unHi) break;
  146. n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
  147. if (n == 0) {
  148. mswap(fmap[unHi], fmap[gtHi]);
  149. gtHi--; unHi--;
  150. continue;
  151. }
  152. if (n < 0) break;
  153. unHi--;
  154. }
  155. if (unLo > unHi) break;
  156. mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
  157. }
  158. AssertD(unHi == unLo-1, "fallbackQSort3(2)");
  159. if (gtHi < ltLo) continue;
  160. n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
  161. m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
  162. n = lo + unLo - ltLo - 1;
  163. m = hi - (gtHi - unHi) + 1;
  164. if (n - lo > hi - m) {
  165. fpush(lo, n);
  166. fpush(m, hi);
  167. } else {
  168. fpush(m, hi);
  169. fpush(lo, n);
  170. }
  171. }
  172. }
  173. #undef fpush
  174. #undef fpop
  175. #undef FALLBACK_QSORT_SMALL_THRESH
  176. #undef FALLBACK_QSORT_STACK_SIZE
  177. /*---------------------------------------------*/
  178. /* Pre:
  179. * nblock > 0
  180. * eclass exists for [0 .. nblock-1]
  181. * ((uint8_t*)eclass) [0 .. nblock-1] holds block
  182. * ptr exists for [0 .. nblock-1]
  183. *
  184. * Post:
  185. * ((uint8_t*)eclass) [0 .. nblock-1] holds block
  186. * All other areas of eclass destroyed
  187. * fmap [0 .. nblock-1] holds sorted order
  188. * bhtab[0 .. 2+(nblock/32)] destroyed
  189. */
  190. #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
  191. #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
  192. #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
  193. #define WORD_BH(zz) bhtab[(zz) >> 5]
  194. #define UNALIGNED_BH(zz) ((zz) & 0x01f)
  195. static
  196. void fallbackSort(EState* state)
  197. {
  198. int32_t ftab[257];
  199. int32_t ftabCopy[256];
  200. int32_t H, i, j, k, l, r, cc, cc1;
  201. int32_t nNotDone;
  202. int32_t nBhtab;
  203. /* params */
  204. uint32_t *const fmap = state->arr1;
  205. uint32_t *const eclass = state->arr2;
  206. #define eclass8 ((uint8_t*)eclass)
  207. uint32_t *const bhtab = state->ftab;
  208. const int32_t nblock = state->nblock;
  209. /*
  210. * Initial 1-char radix sort to generate
  211. * initial fmap and initial BH bits.
  212. */
  213. for (i = 0; i < 257; i++) ftab[i] = 0;
  214. for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
  215. for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
  216. j = ftab[0]; /* bbox: optimized */
  217. for (i = 1; i < 257; i++) {
  218. j += ftab[i];
  219. ftab[i] = j;
  220. }
  221. for (i = 0; i < nblock; i++) {
  222. j = eclass8[i];
  223. k = ftab[j] - 1;
  224. ftab[j] = k;
  225. fmap[k] = i;
  226. }
  227. nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
  228. for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
  229. for (i = 0; i < 256; i++) SET_BH(ftab[i]);
  230. /*
  231. * Inductively refine the buckets. Kind-of an
  232. * "exponential radix sort" (!), inspired by the
  233. * Manber-Myers suffix array construction algorithm.
  234. */
  235. /*-- set sentinel bits for block-end detection --*/
  236. for (i = 0; i < 32; i++) {
  237. SET_BH(nblock + 2*i);
  238. CLEAR_BH(nblock + 2*i + 1);
  239. }
  240. /*-- the log(N) loop --*/
  241. H = 1;
  242. while (1) {
  243. j = 0;
  244. for (i = 0; i < nblock; i++) {
  245. if (ISSET_BH(i))
  246. j = i;
  247. k = fmap[i] - H;
  248. if (k < 0)
  249. k += nblock;
  250. eclass[k] = j;
  251. }
  252. nNotDone = 0;
  253. r = -1;
  254. while (1) {
  255. /*-- find the next non-singleton bucket --*/
  256. k = r + 1;
  257. while (ISSET_BH(k) && UNALIGNED_BH(k))
  258. k++;
  259. if (ISSET_BH(k)) {
  260. while (WORD_BH(k) == 0xffffffff) k += 32;
  261. while (ISSET_BH(k)) k++;
  262. }
  263. l = k - 1;
  264. if (l >= nblock)
  265. break;
  266. while (!ISSET_BH(k) && UNALIGNED_BH(k))
  267. k++;
  268. if (!ISSET_BH(k)) {
  269. while (WORD_BH(k) == 0x00000000) k += 32;
  270. while (!ISSET_BH(k)) k++;
  271. }
  272. r = k - 1;
  273. if (r >= nblock)
  274. break;
  275. /*-- now [l, r] bracket current bucket --*/
  276. if (r > l) {
  277. nNotDone += (r - l + 1);
  278. fallbackQSort3(fmap, eclass, l, r);
  279. /*-- scan bucket and generate header bits-- */
  280. cc = -1;
  281. for (i = l; i <= r; i++) {
  282. cc1 = eclass[fmap[i]];
  283. if (cc != cc1) {
  284. SET_BH(i);
  285. cc = cc1;
  286. }
  287. }
  288. }
  289. }
  290. H *= 2;
  291. if (H > nblock || nNotDone == 0)
  292. break;
  293. }
  294. /*
  295. * Reconstruct the original block in
  296. * eclass8 [0 .. nblock-1], since the
  297. * previous phase destroyed it.
  298. */
  299. j = 0;
  300. for (i = 0; i < nblock; i++) {
  301. while (ftabCopy[j] == 0)
  302. j++;
  303. ftabCopy[j]--;
  304. eclass8[fmap[i]] = (uint8_t)j;
  305. }
  306. AssertH(j < 256, 1005);
  307. #undef eclass8
  308. }
  309. #undef SET_BH
  310. #undef CLEAR_BH
  311. #undef ISSET_BH
  312. #undef WORD_BH
  313. #undef UNALIGNED_BH
  314. /*---------------------------------------------*/
  315. /*--- The main, O(N^2 log(N)) sorting ---*/
  316. /*--- algorithm. Faster for "normal" ---*/
  317. /*--- non-repetitive blocks. ---*/
  318. /*---------------------------------------------*/
  319. /*---------------------------------------------*/
  320. static
  321. NOINLINE
  322. int mainGtU(EState* state,
  323. uint32_t i1,
  324. uint32_t i2)
  325. {
  326. int32_t k;
  327. uint8_t c1, c2;
  328. uint16_t s1, s2;
  329. uint8_t *const block = state->block;
  330. uint16_t *const quadrant = state->quadrant;
  331. const int32_t nblock = state->nblock;
  332. /* Loop unrolling here is actually very useful
  333. * (generated code is much simpler),
  334. * code size increase is only 270 bytes (i386)
  335. * but speeds up compression 10% overall
  336. */
  337. #if BZIP2_SPEED >= 1
  338. #define TIMES_8(code) \
  339. code; code; code; code; \
  340. code; code; code; code;
  341. #define TIMES_12(code) \
  342. code; code; code; code; \
  343. code; code; code; code; \
  344. code; code; code; code;
  345. #else
  346. #define TIMES_8(code) \
  347. { \
  348. int nn = 8; \
  349. do { \
  350. code; \
  351. } while (--nn); \
  352. }
  353. #define TIMES_12(code) \
  354. { \
  355. int nn = 12; \
  356. do { \
  357. code; \
  358. } while (--nn); \
  359. }
  360. #endif
  361. AssertD(i1 != i2, "mainGtU");
  362. TIMES_12(
  363. c1 = block[i1]; c2 = block[i2];
  364. if (c1 != c2) return (c1 > c2);
  365. i1++; i2++;
  366. )
  367. k = nblock + 8;
  368. do {
  369. TIMES_8(
  370. c1 = block[i1]; c2 = block[i2];
  371. if (c1 != c2) return (c1 > c2);
  372. s1 = quadrant[i1]; s2 = quadrant[i2];
  373. if (s1 != s2) return (s1 > s2);
  374. i1++; i2++;
  375. )
  376. if (i1 >= nblock) i1 -= nblock;
  377. if (i2 >= nblock) i2 -= nblock;
  378. state->budget--;
  379. k -= 8;
  380. } while (k >= 0);
  381. return False;
  382. }
  383. #undef TIMES_8
  384. #undef TIMES_12
  385. /*---------------------------------------------*/
  386. /*
  387. * Knuth's increments seem to work better
  388. * than Incerpi-Sedgewick here. Possibly
  389. * because the number of elems to sort is
  390. * usually small, typically <= 20.
  391. */
  392. static
  393. const uint32_t incs[14] ALIGN4 = {
  394. 1, 4, 13, 40, 121, 364, 1093, 3280,
  395. 9841, 29524, 88573, 265720,
  396. 797161, 2391484
  397. };
  398. static
  399. void mainSimpleSort(EState* state,
  400. int32_t lo,
  401. int32_t hi,
  402. int32_t d)
  403. {
  404. uint32_t *const ptr = state->ptr;
  405. /* At which increment to start? */
  406. int hp = 0;
  407. {
  408. int bigN = hi - lo;
  409. if (bigN <= 0)
  410. return;
  411. while (incs[hp] <= bigN)
  412. hp++;
  413. hp--;
  414. }
  415. for (; hp >= 0; hp--) {
  416. int32_t i;
  417. unsigned h;
  418. h = incs[hp];
  419. i = lo + h;
  420. while (1) {
  421. unsigned j;
  422. unsigned v;
  423. if (i > hi) break;
  424. v = ptr[i];
  425. j = i;
  426. while (mainGtU(state, ptr[j-h]+d, v+d)) {
  427. ptr[j] = ptr[j-h];
  428. j = j - h;
  429. if (j <= (lo + h - 1)) break;
  430. }
  431. ptr[j] = v;
  432. i++;
  433. /* 1.5% overall speedup, +290 bytes */
  434. #if BZIP2_SPEED >= 3
  435. /*-- copy 2 --*/
  436. if (i > hi) break;
  437. v = ptr[i];
  438. j = i;
  439. while (mainGtU(state, ptr[j-h]+d, v+d)) {
  440. ptr[j] = ptr[j-h];
  441. j = j - h;
  442. if (j <= (lo + h - 1)) break;
  443. }
  444. ptr[j] = v;
  445. i++;
  446. /*-- copy 3 --*/
  447. if (i > hi) break;
  448. v = ptr[i];
  449. j = i;
  450. while (mainGtU(state, ptr[j-h]+d, v+d)) {
  451. ptr[j] = ptr[j-h];
  452. j = j - h;
  453. if (j <= (lo + h - 1)) break;
  454. }
  455. ptr[j] = v;
  456. i++;
  457. #endif
  458. if (state->budget < 0) return;
  459. }
  460. }
  461. }
  462. /*---------------------------------------------*/
  463. /*
  464. * The following is an implementation of
  465. * an elegant 3-way quicksort for strings,
  466. * described in a paper "Fast Algorithms for
  467. * Sorting and Searching Strings", by Robert
  468. * Sedgewick and Jon L. Bentley.
  469. */
  470. static
  471. ALWAYS_INLINE
  472. uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
  473. {
  474. uint8_t t;
  475. if (a > b) {
  476. t = a;
  477. a = b;
  478. b = t;
  479. }
  480. /* here b >= a */
  481. if (b > c) {
  482. b = c;
  483. if (a > b)
  484. b = a;
  485. }
  486. return b;
  487. }
  488. #define mpush(lz,hz,dz) \
  489. { \
  490. stackLo[sp] = lz; \
  491. stackHi[sp] = hz; \
  492. stackD [sp] = dz; \
  493. sp++; \
  494. }
  495. #define mpop(lz,hz,dz) \
  496. { \
  497. sp--; \
  498. lz = stackLo[sp]; \
  499. hz = stackHi[sp]; \
  500. dz = stackD [sp]; \
  501. }
  502. #define mnextsize(az) (nextHi[az] - nextLo[az])
  503. #define mnextswap(az,bz) \
  504. { \
  505. int32_t tz; \
  506. tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
  507. tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
  508. tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
  509. }
  510. #define MAIN_QSORT_SMALL_THRESH 20
  511. #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
  512. #define MAIN_QSORT_STACK_SIZE 100
  513. static NOINLINE
  514. void mainQSort3(EState* state,
  515. int32_t loSt,
  516. int32_t hiSt
  517. /*int32_t dSt*/)
  518. {
  519. enum { dSt = BZ_N_RADIX };
  520. int32_t unLo, unHi, ltLo, gtHi, n, m, med;
  521. int32_t sp, lo, hi, d;
  522. int32_t stackLo[MAIN_QSORT_STACK_SIZE];
  523. int32_t stackHi[MAIN_QSORT_STACK_SIZE];
  524. int32_t stackD [MAIN_QSORT_STACK_SIZE];
  525. int32_t nextLo[3];
  526. int32_t nextHi[3];
  527. int32_t nextD [3];
  528. uint32_t *const ptr = state->ptr;
  529. uint8_t *const block = state->block;
  530. sp = 0;
  531. mpush(loSt, hiSt, dSt);
  532. while (sp > 0) {
  533. AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
  534. mpop(lo, hi, d);
  535. if (hi - lo < MAIN_QSORT_SMALL_THRESH
  536. || d > MAIN_QSORT_DEPTH_THRESH
  537. ) {
  538. mainSimpleSort(state, lo, hi, d);
  539. if (state->budget < 0)
  540. return;
  541. continue;
  542. }
  543. med = (int32_t) mmed3(block[ptr[lo ] + d],
  544. block[ptr[hi ] + d],
  545. block[ptr[(lo+hi) >> 1] + d]);
  546. unLo = ltLo = lo;
  547. unHi = gtHi = hi;
  548. while (1) {
  549. while (1) {
  550. if (unLo > unHi)
  551. break;
  552. n = ((int32_t)block[ptr[unLo]+d]) - med;
  553. if (n == 0) {
  554. mswap(ptr[unLo], ptr[ltLo]);
  555. ltLo++;
  556. unLo++;
  557. continue;
  558. }
  559. if (n > 0) break;
  560. unLo++;
  561. }
  562. while (1) {
  563. if (unLo > unHi)
  564. break;
  565. n = ((int32_t)block[ptr[unHi]+d]) - med;
  566. if (n == 0) {
  567. mswap(ptr[unHi], ptr[gtHi]);
  568. gtHi--;
  569. unHi--;
  570. continue;
  571. }
  572. if (n < 0) break;
  573. unHi--;
  574. }
  575. if (unLo > unHi)
  576. break;
  577. mswap(ptr[unLo], ptr[unHi]);
  578. unLo++;
  579. unHi--;
  580. }
  581. AssertD(unHi == unLo-1, "mainQSort3(2)");
  582. if (gtHi < ltLo) {
  583. mpush(lo, hi, d + 1);
  584. continue;
  585. }
  586. n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
  587. m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
  588. n = lo + unLo - ltLo - 1;
  589. m = hi - (gtHi - unHi) + 1;
  590. nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
  591. nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
  592. nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
  593. if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
  594. if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
  595. if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
  596. AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
  597. AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
  598. mpush(nextLo[0], nextHi[0], nextD[0]);
  599. mpush(nextLo[1], nextHi[1], nextD[1]);
  600. mpush(nextLo[2], nextHi[2], nextD[2]);
  601. }
  602. }
  603. #undef mpush
  604. #undef mpop
  605. #undef mnextsize
  606. #undef mnextswap
  607. #undef MAIN_QSORT_SMALL_THRESH
  608. #undef MAIN_QSORT_DEPTH_THRESH
  609. #undef MAIN_QSORT_STACK_SIZE
  610. /*---------------------------------------------*/
  611. /* Pre:
  612. * nblock > N_OVERSHOOT
  613. * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
  614. * ((uint8_t*)block32) [0 .. nblock-1] holds block
  615. * ptr exists for [0 .. nblock-1]
  616. *
  617. * Post:
  618. * ((uint8_t*)block32) [0 .. nblock-1] holds block
  619. * All other areas of block32 destroyed
  620. * ftab[0 .. 65536] destroyed
  621. * ptr [0 .. nblock-1] holds sorted order
  622. * if (*budget < 0), sorting was abandoned
  623. */
  624. #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
  625. #define SETMASK (1 << 21)
  626. #define CLEARMASK (~(SETMASK))
  627. static NOINLINE
  628. void mainSort(EState* state)
  629. {
  630. int32_t i, j;
  631. Bool bigDone[256];
  632. uint8_t runningOrder[256];
  633. /* bbox: moved to EState to save stack
  634. int32_t copyStart[256];
  635. int32_t copyEnd [256];
  636. */
  637. #define copyStart (state->mainSort__copyStart)
  638. #define copyEnd (state->mainSort__copyEnd)
  639. uint32_t *const ptr = state->ptr;
  640. uint8_t *const block = state->block;
  641. uint32_t *const ftab = state->ftab;
  642. const int32_t nblock = state->nblock;
  643. uint16_t *const quadrant = state->quadrant;
  644. /*-- set up the 2-byte frequency table --*/
  645. /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
  646. memset(ftab, 0, 65537 * sizeof(ftab[0]));
  647. j = block[0] << 8;
  648. i = nblock - 1;
  649. /* 3%, +300 bytes */
  650. #if BZIP2_SPEED >= 2
  651. for (; i >= 3; i -= 4) {
  652. quadrant[i] = 0;
  653. j = (j >> 8) | (((unsigned)block[i]) << 8);
  654. ftab[j]++;
  655. quadrant[i-1] = 0;
  656. j = (j >> 8) | (((unsigned)block[i-1]) << 8);
  657. ftab[j]++;
  658. quadrant[i-2] = 0;
  659. j = (j >> 8) | (((unsigned)block[i-2]) << 8);
  660. ftab[j]++;
  661. quadrant[i-3] = 0;
  662. j = (j >> 8) | (((unsigned)block[i-3]) << 8);
  663. ftab[j]++;
  664. }
  665. #endif
  666. for (; i >= 0; i--) {
  667. quadrant[i] = 0;
  668. j = (j >> 8) | (((unsigned)block[i]) << 8);
  669. ftab[j]++;
  670. }
  671. /*-- (emphasises close relationship of block & quadrant) --*/
  672. for (i = 0; i < BZ_N_OVERSHOOT; i++) {
  673. block [nblock+i] = block[i];
  674. quadrant[nblock+i] = 0;
  675. }
  676. /*-- Complete the initial radix sort --*/
  677. j = ftab[0]; /* bbox: optimized */
  678. for (i = 1; i <= 65536; i++) {
  679. j += ftab[i];
  680. ftab[i] = j;
  681. }
  682. {
  683. unsigned s;
  684. s = block[0] << 8;
  685. i = nblock - 1;
  686. #if BZIP2_SPEED >= 2
  687. for (; i >= 3; i -= 4) {
  688. s = (s >> 8) | (block[i] << 8);
  689. j = ftab[s] - 1;
  690. ftab[s] = j;
  691. ptr[j] = i;
  692. s = (s >> 8) | (block[i-1] << 8);
  693. j = ftab[s] - 1;
  694. ftab[s] = j;
  695. ptr[j] = i-1;
  696. s = (s >> 8) | (block[i-2] << 8);
  697. j = ftab[s] - 1;
  698. ftab[s] = j;
  699. ptr[j] = i-2;
  700. s = (s >> 8) | (block[i-3] << 8);
  701. j = ftab[s] - 1;
  702. ftab[s] = j;
  703. ptr[j] = i-3;
  704. }
  705. #endif
  706. for (; i >= 0; i--) {
  707. s = (s >> 8) | (block[i] << 8);
  708. j = ftab[s] - 1;
  709. ftab[s] = j;
  710. ptr[j] = i;
  711. }
  712. }
  713. /*
  714. * Now ftab contains the first loc of every small bucket.
  715. * Calculate the running order, from smallest to largest
  716. * big bucket.
  717. */
  718. for (i = 0; i <= 255; i++) {
  719. bigDone [i] = False;
  720. runningOrder[i] = i;
  721. }
  722. {
  723. /* bbox: was: int32_t h = 1; */
  724. /* do h = 3 * h + 1; while (h <= 256); */
  725. unsigned h = 364;
  726. do {
  727. /*h = h / 3;*/
  728. h = (h * 171) >> 9; /* bbox: fast h/3 */
  729. for (i = h; i <= 255; i++) {
  730. unsigned vv, jh;
  731. vv = runningOrder[i]; /* uint8[] */
  732. j = i;
  733. while (jh = j - h, BIGFREQ(runningOrder[jh]) > BIGFREQ(vv)) {
  734. runningOrder[j] = runningOrder[jh];
  735. j = jh;
  736. if (j < h)
  737. break;
  738. }
  739. runningOrder[j] = vv;
  740. }
  741. } while (h != 1);
  742. }
  743. /*
  744. * The main sorting loop.
  745. */
  746. for (i = 0; /*i <= 255*/; i++) {
  747. unsigned ss;
  748. /*
  749. * Process big buckets, starting with the least full.
  750. * Basically this is a 3-step process in which we call
  751. * mainQSort3 to sort the small buckets [ss, j], but
  752. * also make a big effort to avoid the calls if we can.
  753. */
  754. ss = runningOrder[i];
  755. /*
  756. * Step 1:
  757. * Complete the big bucket [ss] by quicksorting
  758. * any unsorted small buckets [ss, j], for j != ss.
  759. * Hopefully previous pointer-scanning phases have already
  760. * completed many of the small buckets [ss, j], so
  761. * we don't have to sort them at all.
  762. */
  763. for (j = 0; j <= 255; j++) {
  764. if (j != ss) {
  765. unsigned sb;
  766. sb = (ss << 8) + j;
  767. if (!(ftab[sb] & SETMASK)) {
  768. int32_t lo = ftab[sb] /*& CLEARMASK (redundant)*/;
  769. int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
  770. if (hi > lo) {
  771. mainQSort3(state, lo, hi /*,BZ_N_RADIX*/);
  772. if (state->budget < 0) return;
  773. }
  774. }
  775. ftab[sb] |= SETMASK;
  776. }
  777. }
  778. AssertH(!bigDone[ss], 1006);
  779. /*
  780. * Step 2:
  781. * Now scan this big bucket [ss] so as to synthesise the
  782. * sorted order for small buckets [t, ss] for all t,
  783. * including, magically, the bucket [ss,ss] too.
  784. * This will avoid doing Real Work in subsequent Step 1's.
  785. */
  786. {
  787. for (j = 0; j <= 255; j++) {
  788. copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
  789. copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
  790. }
  791. for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
  792. unsigned c1;
  793. int32_t k;
  794. k = ptr[j] - 1;
  795. if (k < 0)
  796. k += nblock;
  797. c1 = block[k];
  798. if (!bigDone[c1])
  799. ptr[copyStart[c1]++] = k;
  800. }
  801. for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
  802. unsigned c1;
  803. int32_t k;
  804. k = ptr[j]-1;
  805. if (k < 0)
  806. k += nblock;
  807. c1 = block[k];
  808. if (!bigDone[c1])
  809. ptr[copyEnd[c1]--] = k;
  810. }
  811. }
  812. /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
  813. * Necessity for this case is demonstrated by compressing
  814. * a sequence of approximately 48.5 million of character
  815. * 251; 1.0.0/1.0.1 will then die here. */
  816. AssertH((copyStart[ss]-1 == copyEnd[ss]) \
  817. || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
  818. for (j = 0; j <= 255; j++)
  819. ftab[(j << 8) + ss] |= SETMASK;
  820. if (i == 255)
  821. break;
  822. /*
  823. * Step 3:
  824. * The [ss] big bucket is now done. Record this fact,
  825. * and update the quadrant descriptors. Remember to
  826. * update quadrants in the overshoot area too, if
  827. * necessary. The "if (i < 255)" test merely skips
  828. * this updating for the last bucket processed, since
  829. * updating for the last bucket is pointless.
  830. *
  831. * The quadrant array provides a way to incrementally
  832. * cache sort orderings, as they appear, so as to
  833. * make subsequent comparisons in fullGtU() complete
  834. * faster. For repetitive blocks this makes a big
  835. * difference (but not big enough to be able to avoid
  836. * the fallback sorting mechanism, exponential radix sort).
  837. *
  838. * The precise meaning is: at all times:
  839. *
  840. * for 0 <= i < nblock and 0 <= j <= nblock
  841. *
  842. * if block[i] != block[j],
  843. *
  844. * then the relative values of quadrant[i] and
  845. * quadrant[j] are meaningless.
  846. *
  847. * else {
  848. * if quadrant[i] < quadrant[j]
  849. * then the string starting at i lexicographically
  850. * precedes the string starting at j
  851. *
  852. * else if quadrant[i] > quadrant[j]
  853. * then the string starting at j lexicographically
  854. * precedes the string starting at i
  855. *
  856. * else
  857. * the relative ordering of the strings starting
  858. * at i and j has not yet been determined.
  859. * }
  860. */
  861. bigDone[ss] = True;
  862. {
  863. unsigned bbStart = ftab[ss << 8] & CLEARMASK;
  864. unsigned bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
  865. unsigned shifts = 0;
  866. while ((bbSize >> shifts) > 65534) shifts++;
  867. for (j = bbSize-1; j >= 0; j--) {
  868. unsigned a2update = ptr[bbStart + j]; /* uint32[] */
  869. uint16_t qVal = (uint16_t)(j >> shifts);
  870. quadrant[a2update] = qVal;
  871. if (a2update < BZ_N_OVERSHOOT)
  872. quadrant[a2update + nblock] = qVal;
  873. }
  874. AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
  875. }
  876. }
  877. #undef runningOrder
  878. #undef copyStart
  879. #undef copyEnd
  880. }
  881. #undef BIGFREQ
  882. #undef SETMASK
  883. #undef CLEARMASK
  884. /*---------------------------------------------*/
  885. /* Pre:
  886. * nblock > 0
  887. * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
  888. * ((uint8_t*)arr2)[0 .. nblock-1] holds block
  889. * arr1 exists for [0 .. nblock-1]
  890. *
  891. * Post:
  892. * ((uint8_t*)arr2) [0 .. nblock-1] holds block
  893. * All other areas of block destroyed
  894. * ftab[0 .. 65536] destroyed
  895. * arr1[0 .. nblock-1] holds sorted order
  896. */
  897. static NOINLINE
  898. int32_t BZ2_blockSort(EState* state)
  899. {
  900. /* In original bzip2 1.0.4, it's a parameter, but 30
  901. * (which was the default) should work ok. */
  902. enum { wfact = 30 };
  903. unsigned i;
  904. int32_t origPtr = origPtr;
  905. if (state->nblock >= 10000) {
  906. /* Calculate the location for quadrant, remembering to get
  907. * the alignment right. Assumes that &(block[0]) is at least
  908. * 2-byte aligned -- this should be ok since block is really
  909. * the first section of arr2.
  910. */
  911. i = state->nblock + BZ_N_OVERSHOOT;
  912. if (i & 1)
  913. i++;
  914. state->quadrant = (uint16_t*) &(state->block[i]);
  915. /* (wfact-1) / 3 puts the default-factor-30
  916. * transition point at very roughly the same place as
  917. * with v0.1 and v0.9.0.
  918. * Not that it particularly matters any more, since the
  919. * resulting compressed stream is now the same regardless
  920. * of whether or not we use the main sort or fallback sort.
  921. */
  922. state->budget = state->nblock * ((wfact-1) / 3);
  923. mainSort(state);
  924. if (state->budget >= 0)
  925. goto good;
  926. }
  927. fallbackSort(state);
  928. good:
  929. #if BZ_LIGHT_DEBUG
  930. origPtr = -1;
  931. #endif
  932. for (i = 0; i < state->nblock; i++) {
  933. if (state->ptr[i] == 0) {
  934. origPtr = i;
  935. break;
  936. }
  937. }
  938. AssertH(origPtr != -1, 1003);
  939. return origPtr;
  940. }
  941. /*-------------------------------------------------------------*/
  942. /*--- end blocksort.c ---*/
  943. /*-------------------------------------------------------------*/