123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580 |
- /*
- * Branch/Call/Jump (BCJ) filter decoders
- *
- * Authors: Lasse Collin <lasse.collin@tukaani.org>
- * Igor Pavlov <http://7-zip.org/>
- *
- * This file has been put into the public domain.
- * You can do whatever you want with this file.
- */
- #include "xz_private.h"
- /*
- * The rest of the file is inside this ifdef. It makes things a little more
- * convenient when building without support for any BCJ filters.
- */
- #ifdef XZ_DEC_BCJ
- struct xz_dec_bcj {
- /* Type of the BCJ filter being used */
- enum {
- BCJ_X86 = 4, /* x86 or x86-64 */
- BCJ_POWERPC = 5, /* Big endian only */
- BCJ_IA64 = 6, /* Big or little endian */
- BCJ_ARM = 7, /* Little endian only */
- BCJ_ARMTHUMB = 8, /* Little endian only */
- BCJ_SPARC = 9 /* Big or little endian */
- } type;
- /*
- * Return value of the next filter in the chain. We need to preserve
- * this information across calls, because we must not call the next
- * filter anymore once it has returned XZ_STREAM_END.
- */
- enum xz_ret ret;
- /* True if we are operating in single-call mode. */
- bool single_call;
- /*
- * Absolute position relative to the beginning of the uncompressed
- * data (in a single .xz Block). We care only about the lowest 32
- * bits so this doesn't need to be uint64_t even with big files.
- */
- uint32_t pos;
- /* x86 filter state */
- uint32_t x86_prev_mask;
- /* Temporary space to hold the variables from struct xz_buf */
- uint8_t *out;
- size_t out_pos;
- size_t out_size;
- struct {
- /* Amount of already filtered data in the beginning of buf */
- size_t filtered;
- /* Total amount of data currently stored in buf */
- size_t size;
- /*
- * Buffer to hold a mix of filtered and unfiltered data. This
- * needs to be big enough to hold Alignment + 2 * Look-ahead:
- *
- * Type Alignment Look-ahead
- * x86 1 4
- * PowerPC 4 0
- * IA-64 16 0
- * ARM 4 0
- * ARM-Thumb 2 2
- * SPARC 4 0
- */
- uint8_t buf[16];
- } temp;
- };
- #ifdef XZ_DEC_X86
/*
 * Check the most significant byte of a memory address in an x86
 * instruction: returns 1 when the byte is 0x00 or 0xFF, otherwise 0.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	switch (b) {
	case 0x00:
	case 0xFF:
		return 1;
	default:
		return 0;
	}
}
- static noinline_for_stack size_t XZ_FUNC bcj_x86(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- static const bool mask_to_allowed_status[8]
- = { true, true, true, false, true, false, false, false };
- static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
- size_t i;
- size_t prev_pos = (size_t)-1;
- uint32_t prev_mask = s->x86_prev_mask;
- uint32_t src;
- uint32_t dest;
- uint32_t j;
- uint8_t b;
- if (size <= 4)
- return 0;
- size -= 4;
- for (i = 0; i < size; ++i) {
- if ((buf[i] & 0xFE) != 0xE8)
- continue;
- prev_pos = i - prev_pos;
- if (prev_pos > 3) {
- prev_mask = 0;
- } else {
- prev_mask = (prev_mask << (prev_pos - 1)) & 7;
- if (prev_mask != 0) {
- b = buf[i + 4 - mask_to_bit_num[prev_mask]];
- if (!mask_to_allowed_status[prev_mask]
- || bcj_x86_test_msbyte(b)) {
- prev_pos = i;
- prev_mask = (prev_mask << 1) | 1;
- continue;
- }
- }
- }
- prev_pos = i;
- if (bcj_x86_test_msbyte(buf[i + 4])) {
- src = get_unaligned_le32(buf + i + 1);
- while (true) {
- dest = src - (s->pos + (uint32_t)i + 5);
- if (prev_mask == 0)
- break;
- j = mask_to_bit_num[prev_mask] * 8;
- b = (uint8_t)(dest >> (24 - j));
- if (!bcj_x86_test_msbyte(b))
- break;
- src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
- }
- dest &= 0x01FFFFFF;
- dest |= (uint32_t)0 - (dest & 0x01000000);
- put_unaligned_le32(dest, buf + i + 1);
- i += 4;
- } else {
- prev_mask = (prev_mask << 1) | 1;
- }
- }
- prev_pos = i - prev_pos;
- s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
- return i;
- }
- #endif
- #ifdef XZ_DEC_POWERPC
- static noinline_for_stack size_t XZ_FUNC bcj_powerpc(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- size_t i;
- uint32_t instr;
- for (i = 0; i + 4 <= size; i += 4) {
- instr = get_unaligned_be32(buf + i);
- if ((instr & 0xFC000003) == 0x48000001) {
- instr &= 0x03FFFFFC;
- instr -= s->pos + (uint32_t)i;
- instr &= 0x03FFFFFC;
- instr |= 0x48000001;
- put_unaligned_be32(instr, buf + i);
- }
- }
- return i;
- }
- #endif
- #ifdef XZ_DEC_IA64
- static noinline_for_stack size_t XZ_FUNC bcj_ia64(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- static const uint8_t branch_table[32] = {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 4, 4, 6, 6, 0, 0, 7, 7,
- 4, 4, 0, 0, 4, 4, 0, 0
- };
- /*
- * The local variables take a little bit stack space, but it's less
- * than what LZMA2 decoder takes, so it doesn't make sense to reduce
- * stack usage here without doing that for the LZMA2 decoder too.
- */
- /* Loop counters */
- size_t i;
- size_t j;
- /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
- uint32_t slot;
- /* Bitwise offset of the instruction indicated by slot */
- uint32_t bit_pos;
- /* bit_pos split into byte and bit parts */
- uint32_t byte_pos;
- uint32_t bit_res;
- /* Address part of an instruction */
- uint32_t addr;
- /* Mask used to detect which instructions to convert */
- uint32_t mask;
- /* 41-bit instruction stored somewhere in the lowest 48 bits */
- uint64_t instr;
- /* Instruction normalized with bit_res for easier manipulation */
- uint64_t norm;
- for (i = 0; i + 16 <= size; i += 16) {
- mask = branch_table[buf[i] & 0x1F];
- for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
- if (((mask >> slot) & 1) == 0)
- continue;
- byte_pos = bit_pos >> 3;
- bit_res = bit_pos & 7;
- instr = 0;
- for (j = 0; j < 6; ++j)
- instr |= (uint64_t)(buf[i + j + byte_pos])
- << (8 * j);
- norm = instr >> bit_res;
- if (((norm >> 37) & 0x0F) == 0x05
- && ((norm >> 9) & 0x07) == 0) {
- addr = (norm >> 13) & 0x0FFFFF;
- addr |= ((uint32_t)(norm >> 36) & 1) << 20;
- addr <<= 4;
- addr -= s->pos + (uint32_t)i;
- addr >>= 4;
- norm &= ~((uint64_t)0x8FFFFF << 13);
- norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
- norm |= (uint64_t)(addr & 0x100000)
- << (36 - 20);
- instr &= (1 << bit_res) - 1;
- instr |= norm << bit_res;
- for (j = 0; j < 6; j++)
- buf[i + j + byte_pos]
- = (uint8_t)(instr >> (8 * j));
- }
- }
- }
- return i;
- }
- #endif
- #ifdef XZ_DEC_ARM
- static noinline_for_stack size_t XZ_FUNC bcj_arm(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- size_t i;
- uint32_t addr;
- for (i = 0; i + 4 <= size; i += 4) {
- if (buf[i + 3] == 0xEB) {
- addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
- | ((uint32_t)buf[i + 2] << 16);
- addr <<= 2;
- addr -= s->pos + (uint32_t)i + 8;
- addr >>= 2;
- buf[i] = (uint8_t)addr;
- buf[i + 1] = (uint8_t)(addr >> 8);
- buf[i + 2] = (uint8_t)(addr >> 16);
- }
- }
- return i;
- }
- #endif
- #ifdef XZ_DEC_ARMTHUMB
- static noinline_for_stack size_t XZ_FUNC bcj_armthumb(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- size_t i;
- uint32_t addr;
- for (i = 0; i + 4 <= size; i += 2) {
- if ((buf[i + 1] & 0xF8) == 0xF0
- && (buf[i + 3] & 0xF8) == 0xF8) {
- addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
- | ((uint32_t)buf[i] << 11)
- | (((uint32_t)buf[i + 3] & 0x07) << 8)
- | (uint32_t)buf[i + 2];
- addr <<= 1;
- addr -= s->pos + (uint32_t)i + 4;
- addr >>= 1;
- buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
- buf[i] = (uint8_t)(addr >> 11);
- buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
- buf[i + 2] = (uint8_t)addr;
- i += 2;
- }
- }
- return i;
- }
- #endif
- #ifdef XZ_DEC_SPARC
- static noinline_for_stack size_t XZ_FUNC bcj_sparc(
- struct xz_dec_bcj *s, uint8_t *buf, size_t size)
- {
- size_t i;
- uint32_t instr;
- for (i = 0; i + 4 <= size; i += 4) {
- instr = get_unaligned_be32(buf + i);
- if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
- instr <<= 2;
- instr -= s->pos + (uint32_t)i;
- instr >>= 2;
- instr = ((uint32_t)0x40000000 - (instr & 0x400000))
- | 0x40000000 | (instr & 0x3FFFFF);
- put_unaligned_be32(instr, buf + i);
- }
- }
- return i;
- }
- #endif
- /*
- * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
- * of data that got filtered.
- *
- * NOTE: This is implemented as a switch statement to avoid using function
- * pointers, which could be problematic in the kernel boot code, which must
- * avoid pointers to static data (at least on x86).
- */
- static void XZ_FUNC bcj_apply(struct xz_dec_bcj *s,
- uint8_t *buf, size_t *pos, size_t size)
- {
- size_t filtered;
- buf += *pos;
- size -= *pos;
- switch (s->type) {
- #ifdef XZ_DEC_X86
- case BCJ_X86:
- filtered = bcj_x86(s, buf, size);
- break;
- #endif
- #ifdef XZ_DEC_POWERPC
- case BCJ_POWERPC:
- filtered = bcj_powerpc(s, buf, size);
- break;
- #endif
- #ifdef XZ_DEC_IA64
- case BCJ_IA64:
- filtered = bcj_ia64(s, buf, size);
- break;
- #endif
- #ifdef XZ_DEC_ARM
- case BCJ_ARM:
- filtered = bcj_arm(s, buf, size);
- break;
- #endif
- #ifdef XZ_DEC_ARMTHUMB
- case BCJ_ARMTHUMB:
- filtered = bcj_armthumb(s, buf, size);
- break;
- #endif
- #ifdef XZ_DEC_SPARC
- case BCJ_SPARC:
- filtered = bcj_sparc(s, buf, size);
- break;
- #endif
- default:
- /* Never reached but silence compiler warnings. */
- filtered = 0;
- break;
- }
- *pos += filtered;
- s->pos += filtered;
- }
- /*
- * Flush pending filtered data from temp to the output buffer.
- * Move the remaining mixture of possibly filtered and unfiltered
- * data to the beginning of temp.
- */
- static void XZ_FUNC bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
- {
- size_t copy_size;
- copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
- memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
- b->out_pos += copy_size;
- s->temp.filtered -= copy_size;
- s->temp.size -= copy_size;
- memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
- }
- /*
- * The BCJ filter functions are primitive in sense that they process the
- * data in chunks of 1-16 bytes. To hide this issue, this function does
- * some buffering.
- */
- XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_run(struct xz_dec_bcj *s,
- struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
- {
- size_t out_start;
- /*
- * Flush pending already filtered data to the output buffer. Return
- * immediatelly if we couldn't flush everything, or if the next
- * filter in the chain had already returned XZ_STREAM_END.
- */
- if (s->temp.filtered > 0) {
- bcj_flush(s, b);
- if (s->temp.filtered > 0)
- return XZ_OK;
- if (s->ret == XZ_STREAM_END)
- return XZ_STREAM_END;
- }
- /*
- * If we have more output space than what is currently pending in
- * temp, copy the unfiltered data from temp to the output buffer
- * and try to fill the output buffer by decoding more data from the
- * next filter in the chain. Apply the BCJ filter on the new data
- * in the output buffer. If everything cannot be filtered, copy it
- * to temp and rewind the output buffer position accordingly.
- *
- * This needs to be always run when temp.size == 0 to handle a special
- * case where the output buffer is full and the next filter has no
- * more output coming but hasn't returned XZ_STREAM_END yet.
- */
- if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
- out_start = b->out_pos;
- memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
- b->out_pos += s->temp.size;
- s->ret = xz_dec_lzma2_run(lzma2, b);
- if (s->ret != XZ_STREAM_END
- && (s->ret != XZ_OK || s->single_call))
- return s->ret;
- bcj_apply(s, b->out, &out_start, b->out_pos);
- /*
- * As an exception, if the next filter returned XZ_STREAM_END,
- * we can do that too, since the last few bytes that remain
- * unfiltered are meant to remain unfiltered.
- */
- if (s->ret == XZ_STREAM_END)
- return XZ_STREAM_END;
- s->temp.size = b->out_pos - out_start;
- b->out_pos -= s->temp.size;
- memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
- /*
- * If there wasn't enough input to the next filter to fill
- * the output buffer with unfiltered data, there's no point
- * to try decoding more data to temp.
- */
- if (b->out_pos + s->temp.size < b->out_size)
- return XZ_OK;
- }
- /*
- * We have unfiltered data in temp. If the output buffer isn't full
- * yet, try to fill the temp buffer by decoding more data from the
- * next filter. Apply the BCJ filter on temp. Then we hopefully can
- * fill the actual output buffer by copying filtered data from temp.
- * A mix of filtered and unfiltered data may be left in temp; it will
- * be taken care on the next call to this function.
- */
- if (b->out_pos < b->out_size) {
- /* Make b->out{,_pos,_size} temporarily point to s->temp. */
- s->out = b->out;
- s->out_pos = b->out_pos;
- s->out_size = b->out_size;
- b->out = s->temp.buf;
- b->out_pos = s->temp.size;
- b->out_size = sizeof(s->temp.buf);
- s->ret = xz_dec_lzma2_run(lzma2, b);
- s->temp.size = b->out_pos;
- b->out = s->out;
- b->out_pos = s->out_pos;
- b->out_size = s->out_size;
- if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
- return s->ret;
- bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
- /*
- * If the next filter returned XZ_STREAM_END, we mark that
- * everything is filtered, since the last unfiltered bytes
- * of the stream are meant to be left as is.
- */
- if (s->ret == XZ_STREAM_END)
- s->temp.filtered = s->temp.size;
- bcj_flush(s, b);
- if (s->temp.filtered > 0)
- return XZ_OK;
- }
- return s->ret;
- }
- XZ_EXTERN struct xz_dec_bcj * XZ_FUNC xz_dec_bcj_create(bool single_call)
- {
- struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s != NULL)
- s->single_call = single_call;
- return s;
- }
- XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_reset(
- struct xz_dec_bcj *s, uint8_t id)
- {
- switch (id) {
- #ifdef XZ_DEC_X86
- case BCJ_X86:
- #endif
- #ifdef XZ_DEC_POWERPC
- case BCJ_POWERPC:
- #endif
- #ifdef XZ_DEC_IA64
- case BCJ_IA64:
- #endif
- #ifdef XZ_DEC_ARM
- case BCJ_ARM:
- #endif
- #ifdef XZ_DEC_ARMTHUMB
- case BCJ_ARMTHUMB:
- #endif
- #ifdef XZ_DEC_SPARC
- case BCJ_SPARC:
- #endif
- break;
- default:
- /* Unsupported Filter ID */
- return XZ_OPTIONS_ERROR;
- }
- s->type = id;
- s->ret = XZ_OK;
- s->pos = 0;
- s->x86_prev_mask = 0;
- s->temp.filtered = 0;
- s->temp.size = 0;
- return XZ_OK;
- }
- #endif
|