readchar.c 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * COMPONENT_NAME: austext
  25. *
  26. * FUNCTIONS: discard_to_ETX
  27. * readchar_ftext
  28. * readchar_string
  29. *
  30. * ORIGINS: 27
  31. *
  32. *
  33. * (C) COPYRIGHT International Business Machines Corp. 1996
  34. * All Rights Reserved
  35. * Licensed Materials - Property of IBM
  36. * US Government Users Restricted Rights - Use, duplication or
  37. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  38. */
  39. /************************ READCHAR.C *******************************
  40. * $XConsortium: readchar.c /main/3 1996/05/07 13:47:58 drk $
  41. * January 1996.
  42. * Character reading cofunctions for language parsers.
  43. *
  44. * $Log$
  45. * Revision 1.5 1996/03/25 17:01:19 miker
  46. * Clean up compiler warning.
  47. *
  48. * Revision 1.4 1996/03/13 22:59:39 miker
  49. * Added prolog. Changed char to UCHAR several places.
  50. *
  51. * Revision 1.3 1996/03/05 18:39:34 miker
  52. * Make *all* char ptrs unsigned.
  53. *
  54. * Revision 1.2 1996/03/05 18:08:03 miker
  55. * Readchar functions return unsigned chars for compatibility
  56. * with compilers whose default char type is signed.
  57. *
  58. * Revision 1.1 1996/02/01 19:20:39 miker
  59. * Initial revision
  60. */
  61. #include "SearchP.h"
  62. #include <stdlib.h>
  63. #include <string.h>
  64. #define PROGNAME "READCHAR"
  65. /************************************************/
  66. /* */
  67. /* readchar_string */
  68. /* */
  69. /************************************************/
  70. /* Generic readchar cofunction for parsers when the
  71. * text block is a string. Used for example when
  72. * parsing queries.
  73. */
  74. UCHAR readchar_string (UCHAR *the_string)
  75. {
  76. static UCHAR *strp = (UCHAR *) "";
  77. if (the_string)
  78. strp = the_string;
  79. return ((*strp)? *strp++ : 0);
  80. }
  81. /************************************************/
  82. /* */
  83. /* discard_to_ETX */
  84. /* */
  85. /************************************************/
  86. /* Called when dtsrload or dtsrindex wants to skip to next
  87. * .fzk record by reading and discarding all text to either
  88. * end of record marker or end of file.
  89. * Usually called after some error condition in the .fzk file,
  90. * such as recid not found in database, or when blobs not
  91. * used in dtsrload so they can be discarded.
  92. * Wraps around readchar_ftext(), which does the actual read
  93. * and checks for ETX with a read-ahead buffer.
  94. */
  95. void discard_to_ETX (PARG *parg)
  96. {
  97. if (!readchar_ftext (parg))
  98. return;
  99. while (readchar_ftext (NULL))
  100. ;
  101. return;
  102. } /* discard_to_ETX() */
  103. /************************************************/
  104. /* */
  105. /* readchar_ftext */
  106. /* */
  107. /************************************************/
  108. /* Returns next char in a text file. Called in 2 different situations:
  109. * It's a character reader cofunction called from
  110. * linguistic parser functions for supported languages.
  111. * It's also called directly from discard_to_ETX() in offline
  112. * build tools for *all* languages when for some reason the
  113. * current record being indexed must be discarded all the way
  114. * to end of text block (ETX).
  115. * ETX is when etxdelim string detected, or at end-of-file.
  116. *
  117. * The first call, which passes parg, is a reset trigger
  118. * to clear ETX. The globals are then set and used in
  119. * subsequent calls (passing NULL). This technique is
  120. * used because it will be called many times in a time
  121. * critical loop while indexing.
  122. *
  123. * Returns '\0' at ETX, and keeps returning '\0'
  124. * without further reads until the ETX flag is reset.
  125. * Returns '\0' forever at end-of-file.
  126. */
  127. UCHAR readchar_ftext (PARG *parg)
  128. {
  129. static FILE *ftext = NULL;
  130. static UCHAR *etxdelim = NULL;
  131. static UCHAR *rabuf = NULL;
  132. static int ETX_flag = TRUE;
  133. static int delimsz = 0;
  134. static int bufcount = 0;
  135. static int i;
  136. static UCHAR *head, *tail, *cptr, *rabufend;
  137. /* I'm always going to read ahead just enough chars
  138. * to test the delim string. The string is expected
  139. * to be small, typically just a few chars.
  140. * (A single \0 char indicates there is no record
  141. * delimiter--record ends only at end of file.)
  142. * I use a circular read ahead buffer with head and tail ptrs.
  143. * Bufcount contains current number of chars in buf.
  144. * Head is next file read point, ahead of youngest char in buf.
  145. * Tail is next char to return, ie oldest char in buf.
  146. */
  147. if (parg) {
  148. ftext = parg->ftext;
  149. if (feof(ftext)) {
  150. ETX_flag = TRUE;
  151. return 0;
  152. }
  153. if (etxdelim) {
  154. free (etxdelim);
  155. etxdelim = NULL;
  156. }
  157. if (parg->etxdelim)
  158. etxdelim = (UCHAR *) strdup (parg->etxdelim);
  159. ETX_flag = FALSE;
  160. delimsz = (etxdelim)? strlen((char*) etxdelim) : 0;
  161. if (!rabuf) {
  162. rabuf = austext_malloc (MAX_ETXDELIM + 2, PROGNAME"479", NULL);
  163. rabufend = rabuf + MAX_ETXDELIM;
  164. }
  165. if (delimsz >= MAX_ETXDELIM) {
  166. fprintf (aa_stderr, PROGNAME"505 Record delimiter too long.\n");
  167. DtSearchExit (2);
  168. }
  169. head = tail = rabuf;
  170. bufcount = 0;
  171. }
  172. if (ETX_flag)
  173. return 0;
  174. /* Read chars into read ahead buf until we
  175. * have enough to compare for etxdelim.
  176. * If possible, always read in at least one char.
  177. */
  178. while (bufcount == 0 || bufcount < delimsz) {
  179. if (feof(ftext))
  180. break;
  181. if ((i = fgetc (ftext)) == EOF)
  182. break;
  183. *head++ = i;
  184. bufcount++;
  185. if (head >= rabufend)
  186. head = rabuf;
  187. }
  188. /* There are now 3 possibilities.
  189. * (1) If bufcount == 0 we got EOF and there
  190. * are no chars remaining in buffer, quit now.
  191. * (2) Most likely bufcount is nonzero and equals delimsz.
  192. * Do a wrap-around strcmp looking for delim string.
  193. * (3) If bufcount is positive but less than delimsz,
  194. * we got EOF before the last record ended.
  195. * We'll fall through to the code that returns the next
  196. * char in the buffer, returning the remaining chars one
  197. * at a time until exhausted.
  198. * Note this sequence also handles the case where delimsz == 0.
  199. */
  200. if (bufcount <= 0) {
  201. ETX_flag = TRUE;
  202. return 0;
  203. }
  204. /* Compare chars in read ahead buf for delim string.
  205. * (Note that if the compare succeeds, both bufcount
  206. * and delimsz must be > 0).
  207. */
  208. if (bufcount == delimsz) {
  209. cptr = tail;
  210. for (i = 0; i < delimsz; i++) {
  211. if (etxdelim[i] != *cptr)
  212. break;
  213. cptr++;
  214. if (cptr >= rabufend)
  215. cptr = rabuf;
  216. }
  217. if (i == delimsz) {
  218. ETX_flag = TRUE;
  219. return 0;
  220. }
  221. }
  222. /* No ETX yet. Return the oldest char in read ahead buffer. */
  223. cptr = tail++;
  224. if (tail >= rabufend)
  225. tail = rabuf;
  226. bufcount--;
  227. return *cptr;
  228. } /* readchar_ftext */
  229. /*************************** READCHAR.C ****************************/