blocksort.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072
  1. /*
  2. * bzip2 is written by Julian Seward <jseward@bzip.org>.
  3. * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
  4. * See README and LICENSE files in this directory for more information.
  5. */
  6. /*-------------------------------------------------------------*/
  7. /*--- Block sorting machinery ---*/
  8. /*--- blocksort.c ---*/
  9. /*-------------------------------------------------------------*/
  10. /* ------------------------------------------------------------------
  11. This file is part of bzip2/libbzip2, a program and library for
  12. lossless, block-sorting data compression.
  13. bzip2/libbzip2 version 1.0.4 of 20 December 2006
  14. Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
  15. Please read the WARNING, DISCLAIMER and PATENTS sections in the
  16. README file.
  17. This program is released under the terms of the license contained
  18. in the file LICENSE.
  19. ------------------------------------------------------------------ */
  20. /* #include "bzlib_private.h" */
  /*
   * Swap the two lvalues zz1 and zz2 in place.
   *
   * Wrapped in do { } while (0) so that "mswap(a, b);" is exactly one
   * statement and can be used as the unbraced body of an if/else.
   * Each argument is evaluated twice, so do not pass expressions with
   * side effects (e.g. arr[i++]).
   *
   * The temporary is int32_t although every use in this file swaps
   * uint32_t array elements -- presumably relies on the values (block
   * indices) fitting in 31 bits; confirm before reusing it elsewhere.
   */
  #define mswap(zz1, zz2) \
  do { \
      int32_t zztmp = (zz1); \
      (zz1) = (zz2); \
      (zz2) = zztmp; \
  } while (0)
  /*
   * Exchange the zzn elements starting at ptr[zzp1] with the zzn elements
   * starting at ptr[zzp2], pairwise and in ascending index order.
   * Does nothing when zzn <= 0.
   */
  static
  /* No measurable speed gain with inlining */
  /* ALWAYS_INLINE */
  void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
  {
      int32_t off;

      for (off = 0; off < zzn; off++) {
          int32_t held = ptr[zzp1 + off];

          ptr[zzp1 + off] = ptr[zzp2 + off];
          ptr[zzp2 + off] = held;
      }
  }
  39. static
  40. ALWAYS_INLINE
  41. int32_t mmin(int32_t a, int32_t b)
  42. {
  43. return (a < b) ? a : b;
  44. }
  45. /*---------------------------------------------*/
  46. /*--- Fallback O(N log(N)^2) sorting ---*/
  47. /*--- algorithm, for repetitive blocks ---*/
  48. /*---------------------------------------------*/
  49. /*---------------------------------------------*/
  /*
   * Sort fmap[lo .. hi] (inclusive) into ascending order of
   * eclass[fmap[x]] using insertion sort.
   *
   * Two passes are made: a stride-4 pass, only when the range holds
   * more than four elements, followed by the ordinary stride-1 pass
   * that finishes the job. Only fmap is modified; eclass is read-only.
   */
  static
  inline
  void fallbackSimpleSort(uint32_t* fmap,
          uint32_t* eclass,
          int32_t lo,
          int32_t hi)
  {
      int32_t gap;

      if (lo == hi)
          return;

      /* gap takes the values 4, then 1 */
      for (gap = 4; gap >= 1; gap -= 3) {
          int32_t pos;

          /* the coarse pass is skipped for short ranges */
          if (gap == 4 && hi - lo <= 3)
              continue;

          for (pos = hi - gap; pos >= lo; pos--) {
              int32_t moving = fmap[pos];
              uint32_t key = eclass[moving];
              int32_t probe = pos + gap;

              /* shift smaller-keyed entries down by one stride */
              while (probe <= hi && key > eclass[fmap[probe]]) {
                  fmap[probe - gap] = fmap[probe];
                  probe += gap;
              }
              fmap[probe - gap] = moving;
          }
      }
  }
  77. /*---------------------------------------------*/
  /*
   * Explicit work stack for fallbackQSort3. Both macros use the local
   * variables stackLo, stackHi and sp of that function.
   */
  #define fpush(lz,hz) do { \
      stackLo[sp] = lz; \
      stackHi[sp] = hz; \
      sp++; \
  } while (0)

  #define fpop(lz,hz) do { \
      sp--; \
      lz = stackLo[sp]; \
      hz = stackHi[sp]; \
  } while (0)

  #define FALLBACK_QSORT_SMALL_THRESH 10
  #define FALLBACK_QSORT_STACK_SIZE 100

  /*
   * Sort fmap[loSt .. hiSt] (inclusive) into ascending order of
   * eclass[fmap[x]] with a non-recursive three-way quicksort.
   *
   * Ranges spanning fewer than FALLBACK_QSORT_SMALL_THRESH positions
   * are handed to fallbackSimpleSort. Only fmap is modified.
   */
  static
  void fallbackQSort3(uint32_t* fmap,
          uint32_t* eclass,
          int32_t loSt,
          int32_t hiSt)
  {
      int32_t unLo, unHi, ltLo, gtHi, n, m;
      int32_t sp, lo, hi;
      uint32_t med, r;
      int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
      int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];

      r = 0;
      sp = 0;
      fpush(loSt, hiSt);

      while (sp > 0) {
          AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);

          fpop(lo, hi);
          if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
              fallbackSimpleSort(fmap, eclass, lo, hi);
              continue;
          }

          /* Random partitioning. Median of 3 sometimes fails to
           * avoid bad cases. Median of 9 seems to help but
           * looks rather expensive. This too seems to work but
           * is cheaper. Guidance for the magic constants
           * 7621 and 32768 is taken from Sedgewick's algorithms
           * book, chapter 35.
           */
          r = ((r * 7621) + 1) % 32768;
          switch (r % 3) {
          case 0:
              med = eclass[fmap[lo]];
              break;
          case 1:
              med = eclass[fmap[(lo+hi)>>1]];
              break;
          default:
              med = eclass[fmap[hi]];
              break;
          }

          unLo = ltLo = lo;
          unHi = gtHi = hi;

          for (;;) {
              /* scan upwards; keys equal to med are parked at the low end */
              while (unLo <= unHi) {
                  n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
                  if (n > 0)
                      break;
                  if (n == 0) {
                      mswap(fmap[unLo], fmap[ltLo]);
                      ltLo++;
                  }
                  unLo++;
              }
              /* scan downwards; keys equal to med are parked at the high end */
              while (unLo <= unHi) {
                  n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
                  if (n < 0)
                      break;
                  if (n == 0) {
                      mswap(fmap[unHi], fmap[gtHi]);
                      gtHi--;
                  }
                  unHi--;
              }
              if (unLo > unHi)
                  break;
              /* fmap[unLo] is > med and fmap[unHi] is < med: exchange them */
              mswap(fmap[unLo], fmap[unHi]);
              unLo++;
              unHi--;
          }

          AssertD(unHi == unLo-1, "fallbackQSort3(2)");

          /* nothing more to do for this range */
          if (gtHi < ltLo)
              continue;

          /* move the parked "equal" runs from both ends into the middle */
          n = mmin(ltLo-lo, unLo-ltLo);
          mvswap(fmap, lo, unLo-n, n);
          m = mmin(hi-gtHi, gtHi-unHi);
          mvswap(fmap, unLo, hi-m+1, m);

          /* [lo, n] holds the smaller keys, [m, hi] the larger ones */
          n = lo + unLo - ltLo - 1;
          m = hi - (gtHi - unHi) + 1;

          /* push the larger side first so the smaller one is popped next */
          if (n - lo > hi - m) {
              fpush(lo, n);
              fpush(m, hi);
          } else {
              fpush(m, hi);
              fpush(lo, n);
          }
      }
  }

  #undef fpush
  #undef fpop
  #undef FALLBACK_QSORT_SMALL_THRESH
  #undef FALLBACK_QSORT_STACK_SIZE
  174. /*---------------------------------------------*/
  /* Pre:
   *   nblock > 0
   *   eclass exists for [0 .. nblock-1]
   *   ((uint8_t*)eclass) [0 .. nblock-1] holds block
   *   fmap exists for [0 .. nblock-1]
   *
   * Post:
   *   ((uint8_t*)eclass) [0 .. nblock-1] holds block
   *   All other areas of eclass destroyed
   *   fmap [0 .. nblock-1] holds sorted order
   *   bhtab[0 .. 2+(nblock/32)] destroyed
   */

  /*
   * Bucket-header bitmap helpers; bit zz of bhtab marks position zz as
   * the first element of a bucket.
   *
   * The mask is built from (uint32_t)1, not plain 1: left-shifting a
   * signed int by 31 (zz & 31 == 31) is undefined behaviour in C.
   */
  #define SET_BH(zz) bhtab[(zz) >> 5] |= ((uint32_t)1 << ((zz) & 31))
  #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~((uint32_t)1 << ((zz) & 31))
  #define ISSET_BH(zz) (bhtab[(zz) >> 5] & ((uint32_t)1 << ((zz) & 31)))
  #define WORD_BH(zz) bhtab[(zz) >> 5]
  #define UNALIGNED_BH(zz) ((zz) & 0x01f)

  /*
   * Fallback block sort.
   *
   * fmap   - receives the sorted order of the nblock positions
   * eclass - on entry its first nblock bytes hold the block; used as
   *          32-bit scratch during sorting, the block bytes are rebuilt
   *          before returning
   * bhtab  - scratch bitmap of bucket-header bits
   * nblock - number of bytes in the block
   */
  static
  void fallbackSort(uint32_t* fmap,
          uint32_t* eclass,
          uint32_t* bhtab,
          int32_t nblock)
  {
      int32_t ftab[257];
      int32_t ftabCopy[256];
      int32_t H, i, j, k, l, r, cc, cc1;
      int32_t nNotDone;
      int32_t nBhtab;
      uint8_t* eclass8 = (uint8_t*)eclass;

      /*
       * Initial 1-char radix sort to generate
       * initial fmap and initial BH bits.
       */
      for (i = 0; i < 257; i++) ftab[i] = 0;
      for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
      /* keep the per-byte counts: they are needed to rebuild the block */
      for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];

      /* turn the counts into running totals */
      j = ftab[0]; /* bbox: optimized */
      for (i = 1; i < 257; i++) {
          j += ftab[i];
          ftab[i] = j;
      }

      /* scatter positions into fmap; afterwards ftab[c] is the start of bucket c */
      for (i = 0; i < nblock; i++) {
          j = eclass8[i];
          k = ftab[j] - 1;
          ftab[j] = k;
          fmap[k] = i;
      }

      nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
      for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
      for (i = 0; i < 256; i++) SET_BH(ftab[i]);

      /*
       * Inductively refine the buckets. Kind-of an
       * "exponential radix sort" (!), inspired by the
       * Manber-Myers suffix array construction algorithm.
       */

      /*-- set sentinel bits for block-end detection --*/
      for (i = 0; i < 32; i++) {
          SET_BH(nblock + 2*i);
          CLEAR_BH(nblock + 2*i + 1);
      }

      /*-- the log(N) loop --*/
      H = 1;
      while (1) {
          /* label each position (shifted back by H, with wrap-around)
           * with the start index of the bucket it currently sits in */
          j = 0;
          for (i = 0; i < nblock; i++) {
              if (ISSET_BH(i))
                  j = i;
              k = fmap[i] - H;
              if (k < 0)
                  k += nblock;
              eclass[k] = j;
          }

          nNotDone = 0;
          r = -1;
          while (1) {

              /*-- find the next non-singleton bucket --*/
              k = r + 1;
              while (ISSET_BH(k) && UNALIGNED_BH(k))
                  k++;
              if (ISSET_BH(k)) {
                  /* skip whole words of set bits, then finish bit by bit */
                  while (WORD_BH(k) == 0xffffffff) k += 32;
                  while (ISSET_BH(k)) k++;
              }
              l = k - 1;
              if (l >= nblock)
                  break;
              while (!ISSET_BH(k) && UNALIGNED_BH(k))
                  k++;
              if (!ISSET_BH(k)) {
                  /* skip whole words of clear bits, then finish bit by bit */
                  while (WORD_BH(k) == 0x00000000) k += 32;
                  while (!ISSET_BH(k)) k++;
              }
              r = k - 1;
              if (r >= nblock)
                  break;

              /*-- now [l, r] bracket current bucket --*/
              if (r > l) {
                  nNotDone += (r - l + 1);
                  fallbackQSort3(fmap, eclass, l, r);

                  /*-- scan bucket and generate header bits-- */
                  cc = -1;
                  for (i = l; i <= r; i++) {
                      cc1 = eclass[fmap[i]];
                      if (cc != cc1) {
                          SET_BH(i);
                          cc = cc1;
                      }
                  }
              }
          }

          H *= 2;
          if (H > nblock || nNotDone == 0)
              break;
      }

      /*
       * Reconstruct the original block in
       * eclass8 [0 .. nblock-1], since the
       * previous phase destroyed it.
       */
      j = 0;
      for (i = 0; i < nblock; i++) {
          while (ftabCopy[j] == 0)
              j++;
          ftabCopy[j]--;
          eclass8[fmap[i]] = (uint8_t)j;
      }
      AssertH(j < 256, 1005);
  }

  #undef SET_BH
  #undef CLEAR_BH
  #undef ISSET_BH
  #undef WORD_BH
  #undef UNALIGNED_BH
  308. /*---------------------------------------------*/
  309. /*--- The main, O(N^2 log(N)) sorting ---*/
  310. /*--- algorithm. Faster for "normal" ---*/
  311. /*--- non-repetitive blocks. ---*/
  312. /*---------------------------------------------*/
  313. /*---------------------------------------------*/
  314. static
  315. NOINLINE
  316. int mainGtU(
  317. uint32_t i1,
  318. uint32_t i2,
  319. uint8_t* block,
  320. uint16_t* quadrant,
  321. uint32_t nblock,
  322. int32_t* budget)
  323. {
  324. int32_t k;
  325. uint8_t c1, c2;
  326. uint16_t s1, s2;
  327. /* Loop unrolling here is actually very useful
  328. * (generated code is much simpler),
  329. * code size increase is only 270 bytes (i386)
  330. * but speeds up compression 10% overall
  331. */
  332. #if CONFIG_BZIP2_FEATURE_SPEED >= 1
  333. #define TIMES_8(code) \
  334. code; code; code; code; \
  335. code; code; code; code;
  336. #define TIMES_12(code) \
  337. code; code; code; code; \
  338. code; code; code; code; \
  339. code; code; code; code;
  340. #else
  341. #define TIMES_8(code) \
  342. { \
  343. int nn = 8; \
  344. do { \
  345. code; \
  346. } while (--nn); \
  347. }
  348. #define TIMES_12(code) \
  349. { \
  350. int nn = 12; \
  351. do { \
  352. code; \
  353. } while (--nn); \
  354. }
  355. #endif
  356. AssertD(i1 != i2, "mainGtU");
  357. TIMES_12(
  358. c1 = block[i1]; c2 = block[i2];
  359. if (c1 != c2) return (c1 > c2);
  360. i1++; i2++;
  361. )
  362. k = nblock + 8;
  363. do {
  364. TIMES_8(
  365. c1 = block[i1]; c2 = block[i2];
  366. if (c1 != c2) return (c1 > c2);
  367. s1 = quadrant[i1]; s2 = quadrant[i2];
  368. if (s1 != s2) return (s1 > s2);
  369. i1++; i2++;
  370. )
  371. if (i1 >= nblock) i1 -= nblock;
  372. if (i2 >= nblock) i2 -= nblock;
  373. (*budget)--;
  374. k -= 8;
  375. } while (k >= 0);
  376. return False;
  377. }
  378. #undef TIMES_8
  379. #undef TIMES_12
  380. /*---------------------------------------------*/
  /*
   * Shell-sort gap table read by mainSimpleSort.
   * Each entry is 3 * previous + 1, starting from 1.
   *
   * Knuth's increments seem to work better
   * than Incerpi-Sedgewick here. Possibly
   * because the number of elems to sort is
   * usually small, typically <= 20.
   */
  static
  const int32_t incs[14] = {
      1,
      4,
      13,
      40,
      121,
      364,
      1093,
      3280,
      9841,
      29524,
      88573,
      265720,
      797161,
      2391484
  };
  393. static
  394. void mainSimpleSort(uint32_t* ptr,
  395. uint8_t* block,
  396. uint16_t* quadrant,
  397. int32_t nblock,
  398. int32_t lo,
  399. int32_t hi,
  400. int32_t d,
  401. int32_t* budget)
  402. {
  403. int32_t i, j, h, bigN, hp;
  404. uint32_t v;
  405. bigN = hi - lo + 1;
  406. if (bigN < 2) return;
  407. hp = 0;
  408. while (incs[hp] < bigN) hp++;
  409. hp--;
  410. for (; hp >= 0; hp--) {
  411. h = incs[hp];
  412. i = lo + h;
  413. while (1) {
  414. /*-- copy 1 --*/
  415. if (i > hi) break;
  416. v = ptr[i];
  417. j = i;
  418. while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
  419. ptr[j] = ptr[j-h];
  420. j = j - h;
  421. if (j <= (lo + h - 1)) break;
  422. }
  423. ptr[j] = v;
  424. i++;
  425. /* 1.5% overall speedup, +290 bytes */
  426. #if CONFIG_BZIP2_FEATURE_SPEED >= 3
  427. /*-- copy 2 --*/
  428. if (i > hi) break;
  429. v = ptr[i];
  430. j = i;
  431. while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
  432. ptr[j] = ptr[j-h];
  433. j = j - h;
  434. if (j <= (lo + h - 1)) break;
  435. }
  436. ptr[j] = v;
  437. i++;
  438. /*-- copy 3 --*/
  439. if (i > hi) break;
  440. v = ptr[i];
  441. j = i;
  442. while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
  443. ptr[j] = ptr[j-h];
  444. j = j - h;
  445. if (j <= (lo + h - 1)) break;
  446. }
  447. ptr[j] = v;
  448. i++;
  449. #endif
  450. if (*budget < 0) return;
  451. }
  452. }
  453. }
  454. /*---------------------------------------------*/
  455. /*
  456. * The following is an implementation of
  457. * an elegant 3-way quicksort for strings,
  458. * described in a paper "Fast Algorithms for
  459. * Sorting and Searching Strings", by Robert
  460. * Sedgewick and Jon L. Bentley.
  461. */
  462. static
  463. ALWAYS_INLINE
  464. uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
  465. {
  466. uint8_t t;
  467. if (a > b) {
  468. t = a;
  469. a = b;
  470. b = t;
  471. };
  472. /* here b >= a */
  473. if (b > c) {
  474. b = c;
  475. if (a > b)
  476. b = a;
  477. }
  478. return b;
  479. }
  480. #define mpush(lz,hz,dz) \
  481. { \
  482. stackLo[sp] = lz; \
  483. stackHi[sp] = hz; \
  484. stackD [sp] = dz; \
  485. sp++; \
  486. }
  487. #define mpop(lz,hz,dz) \
  488. { \
  489. sp--; \
  490. lz = stackLo[sp]; \
  491. hz = stackHi[sp]; \
  492. dz = stackD [sp]; \
  493. }
  494. #define mnextsize(az) (nextHi[az] - nextLo[az])
  495. #define mnextswap(az,bz) \
  496. { \
  497. int32_t tz; \
  498. tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
  499. tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
  500. tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
  501. }
  502. #define MAIN_QSORT_SMALL_THRESH 20
  503. #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
  504. #define MAIN_QSORT_STACK_SIZE 100
  505. static NOINLINE
  506. void mainQSort3(uint32_t* ptr,
  507. uint8_t* block,
  508. uint16_t* quadrant,
  509. int32_t nblock,
  510. int32_t loSt,
  511. int32_t hiSt,
  512. int32_t dSt,
  513. int32_t* budget)
  514. {
  515. int32_t unLo, unHi, ltLo, gtHi, n, m, med;
  516. int32_t sp, lo, hi, d;
  517. int32_t stackLo[MAIN_QSORT_STACK_SIZE];
  518. int32_t stackHi[MAIN_QSORT_STACK_SIZE];
  519. int32_t stackD [MAIN_QSORT_STACK_SIZE];
  520. int32_t nextLo[3];
  521. int32_t nextHi[3];
  522. int32_t nextD [3];
  523. sp = 0;
  524. mpush(loSt, hiSt, dSt);
  525. while (sp > 0) {
  526. AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
  527. mpop(lo, hi, d);
  528. if (hi - lo < MAIN_QSORT_SMALL_THRESH
  529. || d > MAIN_QSORT_DEPTH_THRESH
  530. ) {
  531. mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
  532. if (*budget < 0)
  533. return;
  534. continue;
  535. }
  536. med = (int32_t) mmed3(block[ptr[lo ] + d],
  537. block[ptr[hi ] + d],
  538. block[ptr[(lo+hi) >> 1] + d]);
  539. unLo = ltLo = lo;
  540. unHi = gtHi = hi;
  541. while (1) {
  542. while (1) {
  543. if (unLo > unHi)
  544. break;
  545. n = ((int32_t)block[ptr[unLo]+d]) - med;
  546. if (n == 0) {
  547. mswap(ptr[unLo], ptr[ltLo]);
  548. ltLo++;
  549. unLo++;
  550. continue;
  551. };
  552. if (n > 0) break;
  553. unLo++;
  554. }
  555. while (1) {
  556. if (unLo > unHi)
  557. break;
  558. n = ((int32_t)block[ptr[unHi]+d]) - med;
  559. if (n == 0) {
  560. mswap(ptr[unHi], ptr[gtHi]);
  561. gtHi--;
  562. unHi--;
  563. continue;
  564. };
  565. if (n < 0) break;
  566. unHi--;
  567. }
  568. if (unLo > unHi)
  569. break;
  570. mswap(ptr[unLo], ptr[unHi]);
  571. unLo++;
  572. unHi--;
  573. }
  574. AssertD(unHi == unLo-1, "mainQSort3(2)");
  575. if (gtHi < ltLo) {
  576. mpush(lo, hi, d + 1);
  577. continue;
  578. }
  579. n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
  580. m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
  581. n = lo + unLo - ltLo - 1;
  582. m = hi - (gtHi - unHi) + 1;
  583. nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
  584. nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
  585. nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
  586. if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
  587. if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
  588. if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
  589. AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
  590. AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
  591. mpush(nextLo[0], nextHi[0], nextD[0]);
  592. mpush(nextLo[1], nextHi[1], nextD[1]);
  593. mpush(nextLo[2], nextHi[2], nextD[2]);
  594. }
  595. }
  596. #undef mpush
  597. #undef mpop
  598. #undef mnextsize
  599. #undef mnextswap
  600. #undef MAIN_QSORT_SMALL_THRESH
  601. #undef MAIN_QSORT_DEPTH_THRESH
  602. #undef MAIN_QSORT_STACK_SIZE
  603. /*---------------------------------------------*/
  604. /* Pre:
  605. * nblock > N_OVERSHOOT
  606. * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
  607. * ((uint8_t*)block32) [0 .. nblock-1] holds block
  608. * ptr exists for [0 .. nblock-1]
  609. *
  610. * Post:
  611. * ((uint8_t*)block32) [0 .. nblock-1] holds block
  612. * All other areas of block32 destroyed
  613. * ftab[0 .. 65536] destroyed
  614. * ptr [0 .. nblock-1] holds sorted order
  615. * if (*budget < 0), sorting was abandoned
  616. */
  617. #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
  618. #define SETMASK (1 << 21)
  619. #define CLEARMASK (~(SETMASK))
  620. static NOINLINE
  621. void mainSort(EState* state,
  622. uint32_t* ptr,
  623. uint8_t* block,
  624. uint16_t* quadrant,
  625. uint32_t* ftab,
  626. int32_t nblock,
  627. int32_t* budget)
  628. {
  629. int32_t i, j, k, ss, sb;
  630. uint8_t c1;
  631. int32_t numQSorted;
  632. uint16_t s;
  633. Bool bigDone[256];
  634. /* bbox: moved to EState to save stack
  635. int32_t runningOrder[256];
  636. int32_t copyStart[256];
  637. int32_t copyEnd [256];
  638. */
  639. #define runningOrder (state->mainSort__runningOrder)
  640. #define copyStart (state->mainSort__copyStart)
  641. #define copyEnd (state->mainSort__copyEnd)
  642. /*-- set up the 2-byte frequency table --*/
  643. /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
  644. memset(ftab, 0, 65537 * sizeof(ftab[0]));
  645. j = block[0] << 8;
  646. i = nblock - 1;
  647. /* 3%, +300 bytes */
  648. #if CONFIG_BZIP2_FEATURE_SPEED >= 2
  649. for (; i >= 3; i -= 4) {
  650. quadrant[i] = 0;
  651. j = (j >> 8) | (((uint16_t)block[i]) << 8);
  652. ftab[j]++;
  653. quadrant[i-1] = 0;
  654. j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
  655. ftab[j]++;
  656. quadrant[i-2] = 0;
  657. j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
  658. ftab[j]++;
  659. quadrant[i-3] = 0;
  660. j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
  661. ftab[j]++;
  662. }
  663. #endif
  664. for (; i >= 0; i--) {
  665. quadrant[i] = 0;
  666. j = (j >> 8) | (((uint16_t)block[i]) << 8);
  667. ftab[j]++;
  668. }
  669. /*-- (emphasises close relationship of block & quadrant) --*/
  670. for (i = 0; i < BZ_N_OVERSHOOT; i++) {
  671. block [nblock+i] = block[i];
  672. quadrant[nblock+i] = 0;
  673. }
  674. /*-- Complete the initial radix sort --*/
  675. j = ftab[0]; /* bbox: optimized */
  676. for (i = 1; i <= 65536; i++) {
  677. j += ftab[i];
  678. ftab[i] = j;
  679. }
  680. s = block[0] << 8;
  681. i = nblock - 1;
  682. #if CONFIG_BZIP2_FEATURE_SPEED >= 2
  683. for (; i >= 3; i -= 4) {
  684. s = (s >> 8) | (block[i] << 8);
  685. j = ftab[s] - 1;
  686. ftab[s] = j;
  687. ptr[j] = i;
  688. s = (s >> 8) | (block[i-1] << 8);
  689. j = ftab[s] - 1;
  690. ftab[s] = j;
  691. ptr[j] = i-1;
  692. s = (s >> 8) | (block[i-2] << 8);
  693. j = ftab[s] - 1;
  694. ftab[s] = j;
  695. ptr[j] = i-2;
  696. s = (s >> 8) | (block[i-3] << 8);
  697. j = ftab[s] - 1;
  698. ftab[s] = j;
  699. ptr[j] = i-3;
  700. }
  701. #endif
  702. for (; i >= 0; i--) {
  703. s = (s >> 8) | (block[i] << 8);
  704. j = ftab[s] - 1;
  705. ftab[s] = j;
  706. ptr[j] = i;
  707. }
  708. /*
  709. * Now ftab contains the first loc of every small bucket.
  710. * Calculate the running order, from smallest to largest
  711. * big bucket.
  712. */
  713. for (i = 0; i <= 255; i++) {
  714. bigDone [i] = False;
  715. runningOrder[i] = i;
  716. }
  717. {
  718. int32_t vv;
  719. /* bbox: was: int32_t h = 1; */
  720. /* do h = 3 * h + 1; while (h <= 256); */
  721. uint32_t h = 364;
  722. do {
  723. /*h = h / 3;*/
  724. h = (h * 171) >> 9; /* bbox: fast h/3 */
  725. for (i = h; i <= 255; i++) {
  726. vv = runningOrder[i];
  727. j = i;
  728. while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
  729. runningOrder[j] = runningOrder[j-h];
  730. j = j - h;
  731. if (j <= (h - 1))
  732. goto zero;
  733. }
  734. zero:
  735. runningOrder[j] = vv;
  736. }
  737. } while (h != 1);
  738. }
  739. /*
  740. * The main sorting loop.
  741. */
  742. numQSorted = 0;
  743. for (i = 0; i <= 255; i++) {
  744. /*
  745. * Process big buckets, starting with the least full.
  746. * Basically this is a 3-step process in which we call
  747. * mainQSort3 to sort the small buckets [ss, j], but
  748. * also make a big effort to avoid the calls if we can.
  749. */
  750. ss = runningOrder[i];
  751. /*
  752. * Step 1:
  753. * Complete the big bucket [ss] by quicksorting
  754. * any unsorted small buckets [ss, j], for j != ss.
  755. * Hopefully previous pointer-scanning phases have already
  756. * completed many of the small buckets [ss, j], so
  757. * we don't have to sort them at all.
  758. */
  759. for (j = 0; j <= 255; j++) {
  760. if (j != ss) {
  761. sb = (ss << 8) + j;
  762. if (!(ftab[sb] & SETMASK)) {
  763. int32_t lo = ftab[sb] & CLEARMASK;
  764. int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
  765. if (hi > lo) {
  766. mainQSort3(
  767. ptr, block, quadrant, nblock,
  768. lo, hi, BZ_N_RADIX, budget
  769. );
  770. if (*budget < 0) return;
  771. numQSorted += (hi - lo + 1);
  772. }
  773. }
  774. ftab[sb] |= SETMASK;
  775. }
  776. }
  777. AssertH(!bigDone[ss], 1006);
  778. /*
  779. * Step 2:
  780. * Now scan this big bucket [ss] so as to synthesise the
  781. * sorted order for small buckets [t, ss] for all t,
  782. * including, magically, the bucket [ss,ss] too.
  783. * This will avoid doing Real Work in subsequent Step 1's.
  784. */
  785. {
  786. for (j = 0; j <= 255; j++) {
  787. copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
  788. copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
  789. }
  790. for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
  791. k = ptr[j] - 1;
  792. if (k < 0)
  793. k += nblock;
  794. c1 = block[k];
  795. if (!bigDone[c1])
  796. ptr[copyStart[c1]++] = k;
  797. }
  798. for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
  799. k = ptr[j]-1;
  800. if (k < 0)
  801. k += nblock;
  802. c1 = block[k];
  803. if (!bigDone[c1])
  804. ptr[copyEnd[c1]--] = k;
  805. }
  806. }
  807. /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
  808. * Necessity for this case is demonstrated by compressing
  809. * a sequence of approximately 48.5 million of character
  810. * 251; 1.0.0/1.0.1 will then die here. */
  811. AssertH((copyStart[ss]-1 == copyEnd[ss]) \
  812. || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
  813. for (j = 0; j <= 255; j++)
  814. ftab[(j << 8) + ss] |= SETMASK;
  815. /*
  816. * Step 3:
  817. * The [ss] big bucket is now done. Record this fact,
  818. * and update the quadrant descriptors. Remember to
  819. * update quadrants in the overshoot area too, if
  820. * necessary. The "if (i < 255)" test merely skips
  821. * this updating for the last bucket processed, since
  822. * updating for the last bucket is pointless.
  823. *
  824. * The quadrant array provides a way to incrementally
  825. * cache sort orderings, as they appear, so as to
  826. * make subsequent comparisons in fullGtU() complete
  827. * faster. For repetitive blocks this makes a big
  828. * difference (but not big enough to be able to avoid
  829. * the fallback sorting mechanism, exponential radix sort).
  830. *
  831. * The precise meaning is: at all times:
  832. *
  833. * for 0 <= i < nblock and 0 <= j <= nblock
  834. *
  835. * if block[i] != block[j],
  836. *
  837. * then the relative values of quadrant[i] and
  838. * quadrant[j] are meaningless.
  839. *
  840. * else {
  841. * if quadrant[i] < quadrant[j]
  842. * then the string starting at i lexicographically
  843. * precedes the string starting at j
  844. *
  845. * else if quadrant[i] > quadrant[j]
  846. * then the string starting at j lexicographically
  847. * precedes the string starting at i
  848. *
  849. * else
  850. * the relative ordering of the strings starting
  851. * at i and j has not yet been determined.
  852. * }
  853. */
  854. bigDone[ss] = True;
  855. if (i < 255) {
  856. int32_t bbStart = ftab[ss << 8] & CLEARMASK;
  857. int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
  858. int32_t shifts = 0;
  859. while ((bbSize >> shifts) > 65534) shifts++;
  860. for (j = bbSize-1; j >= 0; j--) {
  861. int32_t a2update = ptr[bbStart + j];
  862. uint16_t qVal = (uint16_t)(j >> shifts);
  863. quadrant[a2update] = qVal;
  864. if (a2update < BZ_N_OVERSHOOT)
  865. quadrant[a2update + nblock] = qVal;
  866. }
  867. AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
  868. }
  869. }
  870. #undef runningOrder
  871. #undef copyStart
  872. #undef copyEnd
  873. }
  874. #undef BIGFREQ
  875. #undef SETMASK
  876. #undef CLEARMASK
  877. /*---------------------------------------------*/
  878. /* Pre:
  879. * nblock > 0
  880. * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
  881. * ((uint8_t*)arr2)[0 .. nblock-1] holds block
  882. * arr1 exists for [0 .. nblock-1]
  883. *
  884. * Post:
  885. * ((uint8_t*)arr2) [0 .. nblock-1] holds block
  886. * All other areas of block destroyed
  887. * ftab[0 .. 65536] destroyed
  888. * arr1[0 .. nblock-1] holds sorted order
  889. */
  890. static NOINLINE
  891. void BZ2_blockSort(EState* s)
  892. {
  893. /* In original bzip2 1.0.4, it's a parameter, but 30
  894. * (which was the default) should work ok. */
  895. enum { wfact = 30 };
  896. uint32_t* ptr = s->ptr;
  897. uint8_t* block = s->block;
  898. uint32_t* ftab = s->ftab;
  899. int32_t nblock = s->nblock;
  900. uint16_t* quadrant;
  901. int32_t budget;
  902. int32_t i;
  903. if (nblock < 10000) {
  904. fallbackSort(s->arr1, s->arr2, ftab, nblock);
  905. } else {
  906. /* Calculate the location for quadrant, remembering to get
  907. * the alignment right. Assumes that &(block[0]) is at least
  908. * 2-byte aligned -- this should be ok since block is really
  909. * the first section of arr2.
  910. */
  911. i = nblock + BZ_N_OVERSHOOT;
  912. if (i & 1) i++;
  913. quadrant = (uint16_t*)(&(block[i]));
  914. /* (wfact-1) / 3 puts the default-factor-30
  915. * transition point at very roughly the same place as
  916. * with v0.1 and v0.9.0.
  917. * Not that it particularly matters any more, since the
  918. * resulting compressed stream is now the same regardless
  919. * of whether or not we use the main sort or fallback sort.
  920. */
  921. budget = nblock * ((wfact-1) / 3);
  922. mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
  923. if (budget < 0) {
  924. fallbackSort(s->arr1, s->arr2, ftab, nblock);
  925. }
  926. }
  927. s->origPtr = -1;
  928. for (i = 0; i < s->nblock; i++)
  929. if (ptr[i] == 0) {
  930. s->origPtr = i;
  931. break;
  932. };
  933. AssertH(s->origPtr != -1, 1003);
  934. }
  935. /*-------------------------------------------------------------*/
  936. /*--- end blocksort.c ---*/
  937. /*-------------------------------------------------------------*/