/*
 * findtext.c
 * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Find the blocks that contain the text of MS Word files
 */
- #include <stdio.h>
- #include <stdlib.h>
- #include "antiword.h"
- /*
- * bAddTextBlocks - Add the blocks to the text block list
- *
- * Returns TRUE when successful, FALSE if not
- */
- BOOL
- bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
- BOOL bUsesUnicode, USHORT usPropMod,
- ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
- {
- text_block_type tTextBlock;
- ULONG ulCharPos, ulOffset, ulIndex;
- long lToGo;
- fail(ulTotalLength > (ULONG)LONG_MAX / 2);
- fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
- fail(aulBBD == NULL);
- NO_DBG_HEX(ulCharPosFirst);
- NO_DBG_DEC(ulTotalLength);
- if (bUsesUnicode) {
- /* One character equals two bytes */
- NO_DBG_MSG("Uses Unicode");
- lToGo = (long)ulTotalLength * 2;
- } else {
- /* One character equals one byte */
- NO_DBG_MSG("Uses ASCII");
- lToGo = (long)ulTotalLength;
- }
- ulCharPos = ulCharPosFirst;
- ulOffset = ulCharPosFirst;
- for (ulIndex = ulStartBlock;
- ulIndex != END_OF_CHAIN && lToGo > 0;
- ulIndex = aulBBD[ulIndex]) {
- if (ulIndex >= (ULONG)tBBDLen) {
- DBG_DEC(ulIndex);
- DBG_DEC(tBBDLen);
- werr(1, "The Big Block Depot is damaged");
- }
- if (ulOffset >= BIG_BLOCK_SIZE) {
- ulOffset -= BIG_BLOCK_SIZE;
- continue;
- }
- tTextBlock.ulFileOffset =
- (ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
- tTextBlock.ulCharPos = ulCharPos;
- tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
- (ULONG)lToGo);
- tTextBlock.bUsesUnicode = bUsesUnicode;
- tTextBlock.usPropMod = usPropMod;
- ulOffset = 0;
- if (!bAdd2TextBlockList(&tTextBlock)) {
- DBG_HEX(tTextBlock.ulFileOffset);
- DBG_HEX(tTextBlock.ulCharPos);
- DBG_DEC(tTextBlock.ulLength);
- DBG_DEC(tTextBlock.bUsesUnicode);
- DBG_DEC(tTextBlock.usPropMod);
- return FALSE;
- }
- ulCharPos += tTextBlock.ulLength;
- lToGo -= (long)tTextBlock.ulLength;
- }
- DBG_DEC_C(lToGo != 0, lToGo);
- return lToGo == 0;
- } /* end of bAddTextBlocks */
- /*
- * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
- *
- * Code for "fast saved" files.
- *
- * Returns TRUE when successful, FALSE if not
- */
- BOOL
- bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
- const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
- {
- UCHAR *aucBuffer;
- ULONG ulBeginTextInfo, ulTextOffset, ulTotLength;
- size_t tTextInfoLen;
- int iIndex, iType, iOff, iLen, iPieces;
- USHORT usPropMod;
- DBG_MSG("bGet6DocumentText");
- fail(pFile == NULL);
- fail(aulBBD == NULL);
- fail(aucHeader == NULL);
- ulBeginTextInfo = ulGetLong(0x160, aucHeader); /* fcClx */
- DBG_HEX(ulBeginTextInfo);
- tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader); /* lcbClx */
- DBG_DEC(tTextInfoLen);
- aucBuffer = xmalloc(tTextInfoLen);
- if (!bReadBuffer(pFile, ulStartBlock,
- aulBBD, tBBDLen, BIG_BLOCK_SIZE,
- aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
- iOff = 0;
- while ((size_t)iOff < tTextInfoLen) {
- iType = (int)ucGetByte(iOff, aucBuffer);
- iOff++;
- if (iType == 0) {
- DBG_FIXME();
- iOff++;
- continue;
- }
- if (iType == 1) {
- iLen = (int)usGetWord(iOff, aucBuffer);
- vAdd2PropModList(aucBuffer + iOff);
- iOff += iLen + 2;
- continue;
- }
- if (iType != 2) {
- werr(0, "Unknown type of 'fastsaved' format");
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- /* Type 2 */
- iLen = (int)usGetWord(iOff, aucBuffer);
- NO_DBG_DEC(iLen);
- iOff += 4;
- iPieces = (iLen - 4) / 12;
- DBG_DEC(iPieces);
- for (iIndex = 0; iIndex < iPieces; iIndex++) {
- ulTextOffset = ulGetLong(
- iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
- aucBuffer);
- usPropMod = usGetWord(
- iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
- aucBuffer);
- ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
- aucBuffer) -
- ulGetLong(iOff + iIndex * 4,
- aucBuffer);
- NO_DBG_HEX_C(usPropMod != 0, usPropMod);
- if (!bAddTextBlocks(ulTextOffset, ulTotLength,
- bUsesUnicode, usPropMod,
- ulStartBlock,
- aulBBD, tBBDLen)) {
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- }
- break;
- }
- aucBuffer = xfree(aucBuffer);
- return TRUE;
- } /* end of bGet6DocumentText */
- /*
- * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
- *
- * Returns TRUE when successful, FALSE if not
- */
- BOOL
- bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
- const ULONG *aulBBD, size_t tBBDLen,
- const ULONG *aulSBD, size_t tSBDLen,
- const UCHAR *aucHeader)
- {
- const ULONG *aulBlockDepot;
- UCHAR *aucBuffer;
- ULONG ulTextOffset, ulBeginTextInfo;
- ULONG ulTotLength, ulLen;
- long lIndex, lPieces, lOff;
- size_t tTextInfoLen, tBlockDepotLen, tBlockSize;
- int iType, iLen;
- BOOL bUsesUnicode;
- USHORT usPropMod;
- DBG_MSG("bGet8DocumentText");
- fail(pFile == NULL || pPPS == NULL);
- fail(aulBBD == NULL || aulSBD == NULL);
- fail(aucHeader == NULL);
- ulBeginTextInfo = ulGetLong(0x1a2, aucHeader); /* fcClx */
- DBG_HEX(ulBeginTextInfo);
- tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader); /* lcbClx */
- DBG_DEC(tTextInfoLen);
- DBG_DEC(pPPS->tTable.ulSB);
- DBG_HEX(pPPS->tTable.ulSize);
- if (pPPS->tTable.ulSize == 0) {
- return FALSE;
- }
- if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
- /* Use the Small Block Depot */
- aulBlockDepot = aulSBD;
- tBlockDepotLen = tSBDLen;
- tBlockSize = SMALL_BLOCK_SIZE;
- } else {
- /* Use the Big Block Depot */
- aulBlockDepot = aulBBD;
- tBlockDepotLen = tBBDLen;
- tBlockSize = BIG_BLOCK_SIZE;
- }
- aucBuffer = xmalloc(tTextInfoLen);
- if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
- aulBlockDepot, tBlockDepotLen, tBlockSize,
- aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
- lOff = 0;
- while (lOff < (long)tTextInfoLen) {
- iType = (int)ucGetByte(lOff, aucBuffer);
- lOff++;
- if (iType == 0) {
- DBG_FIXME();
- lOff++;
- continue;
- }
- if (iType == 1) {
- iLen = (int)usGetWord(lOff, aucBuffer);
- vAdd2PropModList(aucBuffer + lOff);
- lOff += (long)iLen + 2;
- continue;
- }
- if (iType != 2) {
- werr(0, "Unknown type of 'fastsaved' format");
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- /* Type 2 */
- ulLen = ulGetLong(lOff, aucBuffer);
- if (ulLen < 4) {
- DBG_DEC(ulLen);
- return FALSE;
- }
- lOff += 4;
- lPieces = (long)((ulLen - 4) / 12);
- DBG_DEC(lPieces);
- for (lIndex = 0; lIndex < lPieces; lIndex++) {
- ulTextOffset = ulGetLong(
- lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
- aucBuffer);
- usPropMod = usGetWord(
- lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
- aucBuffer);
- ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
- aucBuffer) -
- ulGetLong(lOff + lIndex * 4,
- aucBuffer);
- if ((ulTextOffset & BIT(30)) == 0) {
- bUsesUnicode = TRUE;
- } else {
- bUsesUnicode = FALSE;
- ulTextOffset &= ~BIT(30);
- ulTextOffset /= 2;
- }
- NO_DBG_HEX_C(usPropMod != 0, usPropMod);
- if (!bAddTextBlocks(ulTextOffset, ulTotLength,
- bUsesUnicode, usPropMod,
- pPPS->tWordDocument.ulSB,
- aulBBD, tBBDLen)) {
- aucBuffer = xfree(aucBuffer);
- return FALSE;
- }
- }
- break;
- }
- aucBuffer = xfree(aucBuffer);
- return TRUE;
- } /* end of bGet8DocumentText */
|