findtext.c 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. /*
  2. * findtext.c
  3. * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
  4. *
  5. * Description:
  6. * Find the blocks that contain the text of MS Word files
  7. */
  8. #include <stdio.h>
  9. #include <stdlib.h>
  10. #include "antiword.h"
  11. /*
  12. * bAddTextBlocks - Add the blocks to the text block list
  13. *
  14. * Returns TRUE when successful, FALSE if not
  15. */
  16. BOOL
  17. bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
  18. BOOL bUsesUnicode, USHORT usPropMod,
  19. ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
  20. {
  21. text_block_type tTextBlock;
  22. ULONG ulCharPos, ulOffset, ulIndex;
  23. long lToGo;
  24. fail(ulTotalLength > (ULONG)LONG_MAX / 2);
  25. fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
  26. fail(aulBBD == NULL);
  27. NO_DBG_HEX(ulCharPosFirst);
  28. NO_DBG_DEC(ulTotalLength);
  29. if (bUsesUnicode) {
  30. /* One character equals two bytes */
  31. NO_DBG_MSG("Uses Unicode");
  32. lToGo = (long)ulTotalLength * 2;
  33. } else {
  34. /* One character equals one byte */
  35. NO_DBG_MSG("Uses ASCII");
  36. lToGo = (long)ulTotalLength;
  37. }
  38. ulCharPos = ulCharPosFirst;
  39. ulOffset = ulCharPosFirst;
  40. for (ulIndex = ulStartBlock;
  41. ulIndex != END_OF_CHAIN && lToGo > 0;
  42. ulIndex = aulBBD[ulIndex]) {
  43. if (ulIndex >= (ULONG)tBBDLen) {
  44. DBG_DEC(ulIndex);
  45. DBG_DEC(tBBDLen);
  46. werr(1, "The Big Block Depot is damaged");
  47. }
  48. if (ulOffset >= BIG_BLOCK_SIZE) {
  49. ulOffset -= BIG_BLOCK_SIZE;
  50. continue;
  51. }
  52. tTextBlock.ulFileOffset =
  53. (ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
  54. tTextBlock.ulCharPos = ulCharPos;
  55. tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
  56. (ULONG)lToGo);
  57. tTextBlock.bUsesUnicode = bUsesUnicode;
  58. tTextBlock.usPropMod = usPropMod;
  59. ulOffset = 0;
  60. if (!bAdd2TextBlockList(&tTextBlock)) {
  61. DBG_HEX(tTextBlock.ulFileOffset);
  62. DBG_HEX(tTextBlock.ulCharPos);
  63. DBG_DEC(tTextBlock.ulLength);
  64. DBG_DEC(tTextBlock.bUsesUnicode);
  65. DBG_DEC(tTextBlock.usPropMod);
  66. return FALSE;
  67. }
  68. ulCharPos += tTextBlock.ulLength;
  69. lToGo -= (long)tTextBlock.ulLength;
  70. }
  71. DBG_DEC_C(lToGo != 0, lToGo);
  72. return lToGo == 0;
  73. } /* end of bAddTextBlocks */
  74. /*
  75. * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
  76. *
  77. * Code for "fast saved" files.
  78. *
  79. * Returns TRUE when successful, FALSE if not
  80. */
  81. BOOL
  82. bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
  83. const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
  84. {
  85. UCHAR *aucBuffer;
  86. ULONG ulBeginTextInfo, ulTextOffset, ulTotLength;
  87. size_t tTextInfoLen;
  88. int iIndex, iType, iOff, iLen, iPieces;
  89. USHORT usPropMod;
  90. DBG_MSG("bGet6DocumentText");
  91. fail(pFile == NULL);
  92. fail(aulBBD == NULL);
  93. fail(aucHeader == NULL);
  94. ulBeginTextInfo = ulGetLong(0x160, aucHeader); /* fcClx */
  95. DBG_HEX(ulBeginTextInfo);
  96. tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader); /* lcbClx */
  97. DBG_DEC(tTextInfoLen);
  98. aucBuffer = xmalloc(tTextInfoLen);
  99. if (!bReadBuffer(pFile, ulStartBlock,
  100. aulBBD, tBBDLen, BIG_BLOCK_SIZE,
  101. aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
  102. aucBuffer = xfree(aucBuffer);
  103. return FALSE;
  104. }
  105. NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
  106. iOff = 0;
  107. while ((size_t)iOff < tTextInfoLen) {
  108. iType = (int)ucGetByte(iOff, aucBuffer);
  109. iOff++;
  110. if (iType == 0) {
  111. DBG_FIXME();
  112. iOff++;
  113. continue;
  114. }
  115. if (iType == 1) {
  116. iLen = (int)usGetWord(iOff, aucBuffer);
  117. vAdd2PropModList(aucBuffer + iOff);
  118. iOff += iLen + 2;
  119. continue;
  120. }
  121. if (iType != 2) {
  122. werr(0, "Unknown type of 'fastsaved' format");
  123. aucBuffer = xfree(aucBuffer);
  124. return FALSE;
  125. }
  126. /* Type 2 */
  127. iLen = (int)usGetWord(iOff, aucBuffer);
  128. NO_DBG_DEC(iLen);
  129. iOff += 4;
  130. iPieces = (iLen - 4) / 12;
  131. DBG_DEC(iPieces);
  132. for (iIndex = 0; iIndex < iPieces; iIndex++) {
  133. ulTextOffset = ulGetLong(
  134. iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
  135. aucBuffer);
  136. usPropMod = usGetWord(
  137. iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
  138. aucBuffer);
  139. ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
  140. aucBuffer) -
  141. ulGetLong(iOff + iIndex * 4,
  142. aucBuffer);
  143. NO_DBG_HEX_C(usPropMod != 0, usPropMod);
  144. if (!bAddTextBlocks(ulTextOffset, ulTotLength,
  145. bUsesUnicode, usPropMod,
  146. ulStartBlock,
  147. aulBBD, tBBDLen)) {
  148. aucBuffer = xfree(aucBuffer);
  149. return FALSE;
  150. }
  151. }
  152. break;
  153. }
  154. aucBuffer = xfree(aucBuffer);
  155. return TRUE;
  156. } /* end of bGet6DocumentText */
  157. /*
  158. * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
  159. *
  160. * Returns TRUE when successful, FALSE if not
  161. */
  162. BOOL
  163. bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
  164. const ULONG *aulBBD, size_t tBBDLen,
  165. const ULONG *aulSBD, size_t tSBDLen,
  166. const UCHAR *aucHeader)
  167. {
  168. const ULONG *aulBlockDepot;
  169. UCHAR *aucBuffer;
  170. ULONG ulTextOffset, ulBeginTextInfo;
  171. ULONG ulTotLength, ulLen;
  172. long lIndex, lPieces, lOff;
  173. size_t tTextInfoLen, tBlockDepotLen, tBlockSize;
  174. int iType, iLen;
  175. BOOL bUsesUnicode;
  176. USHORT usPropMod;
  177. DBG_MSG("bGet8DocumentText");
  178. fail(pFile == NULL || pPPS == NULL);
  179. fail(aulBBD == NULL || aulSBD == NULL);
  180. fail(aucHeader == NULL);
  181. ulBeginTextInfo = ulGetLong(0x1a2, aucHeader); /* fcClx */
  182. DBG_HEX(ulBeginTextInfo);
  183. tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader); /* lcbClx */
  184. DBG_DEC(tTextInfoLen);
  185. DBG_DEC(pPPS->tTable.ulSB);
  186. DBG_HEX(pPPS->tTable.ulSize);
  187. if (pPPS->tTable.ulSize == 0) {
  188. return FALSE;
  189. }
  190. if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
  191. /* Use the Small Block Depot */
  192. aulBlockDepot = aulSBD;
  193. tBlockDepotLen = tSBDLen;
  194. tBlockSize = SMALL_BLOCK_SIZE;
  195. } else {
  196. /* Use the Big Block Depot */
  197. aulBlockDepot = aulBBD;
  198. tBlockDepotLen = tBBDLen;
  199. tBlockSize = BIG_BLOCK_SIZE;
  200. }
  201. aucBuffer = xmalloc(tTextInfoLen);
  202. if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
  203. aulBlockDepot, tBlockDepotLen, tBlockSize,
  204. aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
  205. aucBuffer = xfree(aucBuffer);
  206. return FALSE;
  207. }
  208. NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
  209. lOff = 0;
  210. while (lOff < (long)tTextInfoLen) {
  211. iType = (int)ucGetByte(lOff, aucBuffer);
  212. lOff++;
  213. if (iType == 0) {
  214. DBG_FIXME();
  215. lOff++;
  216. continue;
  217. }
  218. if (iType == 1) {
  219. iLen = (int)usGetWord(lOff, aucBuffer);
  220. vAdd2PropModList(aucBuffer + lOff);
  221. lOff += (long)iLen + 2;
  222. continue;
  223. }
  224. if (iType != 2) {
  225. werr(0, "Unknown type of 'fastsaved' format");
  226. aucBuffer = xfree(aucBuffer);
  227. return FALSE;
  228. }
  229. /* Type 2 */
  230. ulLen = ulGetLong(lOff, aucBuffer);
  231. if (ulLen < 4) {
  232. DBG_DEC(ulLen);
  233. return FALSE;
  234. }
  235. lOff += 4;
  236. lPieces = (long)((ulLen - 4) / 12);
  237. DBG_DEC(lPieces);
  238. for (lIndex = 0; lIndex < lPieces; lIndex++) {
  239. ulTextOffset = ulGetLong(
  240. lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
  241. aucBuffer);
  242. usPropMod = usGetWord(
  243. lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
  244. aucBuffer);
  245. ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
  246. aucBuffer) -
  247. ulGetLong(lOff + lIndex * 4,
  248. aucBuffer);
  249. if ((ulTextOffset & BIT(30)) == 0) {
  250. bUsesUnicode = TRUE;
  251. } else {
  252. bUsesUnicode = FALSE;
  253. ulTextOffset &= ~BIT(30);
  254. ulTextOffset /= 2;
  255. }
  256. NO_DBG_HEX_C(usPropMod != 0, usPropMod);
  257. if (!bAddTextBlocks(ulTextOffset, ulTotLength,
  258. bUsesUnicode, usPropMod,
  259. pPPS->tWordDocument.ulSB,
  260. aulBBD, tBBDLen)) {
  261. aucBuffer = xfree(aucBuffer);
  262. return FALSE;
  263. }
  264. }
  265. break;
  266. }
  267. aucBuffer = xfree(aucBuffer);
  268. return TRUE;
  269. } /* end of bGet8DocumentText */