re.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /****************************************************************
  2. Copyright (C) Lucent Technologies 1997
  3. All Rights Reserved
  4. Permission to use, copy, modify, and distribute this software and
  5. its documentation for any purpose and without fee is hereby
  6. granted, provided that the above copyright notice appear in all
  7. copies and that both that the copyright notice and this
  8. permission notice and warranty disclaimer appear in supporting
  9. documentation, and that the name Lucent Technologies or any of
  10. its entities not be used in advertising or publicity pertaining
  11. to distribution of the software without specific, written prior
  12. permission.
  13. LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  14. INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  15. IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  16. SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  17. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  18. IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  19. ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  20. THIS SOFTWARE.
  21. ****************************************************************/
  22. #include <u.h>
  23. #include <lib9.h>
  24. #include <chartypes.h>
  25. #include <bio.h>
  26. #include <regexp.h>
  27. #include "awk.h"
  28. #include "y.tab.h"
  29. /* This file provides the interface between the main body of
  30. * awk and the pattern matching package. It preprocesses
  31. * patterns prior to compilation to provide awk-like semantics
  32. * to character sequences not supported by the pattern package.
  33. * The following conversions are performed:
  34. *
  35. * "()" -> "[]*"
  36. * "[-" -> "[\-"
  37. * "[^-" -> "[^\-"
  38. * "-]" -> "\-]"
  39. * "[]" -> "[]*"
  40. * "\xdddd" -> "\z" where 'z' is the UTF sequence
  41. * for the hex value
  42. * "\ddd" -> "\o" where 'o' is a char octal value
  43. * "\b" -> "\B" where 'B' is backspace
  44. * "\t" -> "\T" where 'T' is tab
  45. * "\f" -> "\F" where 'F' is form feed
  46. * "\n" -> "\N" where 'N' is newline
  47. * "\r" -> "\C" where 'C' is cr
  48. */
  49. #define MAXRE 512 /* size of the copy buffer, terminating NUL included */
  50. static char re[MAXRE]; /* copy buffer: holds the preprocessed pattern given to regcomp */
  51. char *patbeg; /* start of the last match, or the start argument when pmatch/nematch fail */
  52. int patlen; /* length of the last match (m.ep - m.sp), or -1 when pmatch/nematch fail */
  53. #define NPATS 20 /* number of slots in pattern cache */
  54. static struct pat_list /* dynamic pattern cache, used by compre when compile_time is zero */
  55. {
  56. char *re; /* copy of the unprocessed pattern text; compared with strcmp on lookup */
  57. int use; /* number of uses; the slot with the lowest count is replaced when the cache is full */
  58. Reprog *program; /* compiled program returned by regcomp for this pattern */
  59. } pattern[NPATS];
  60. static int npats; /* cache fill level */
  61. /* Compile a pattern */
  62. void
  63. *compre(char *pat)
  64. {
  65. int i, j, inclass;
  66. char c, *p, *s;
  67. Reprog *program;
  68. if (!compile_time) { /* search cache for dynamic pattern */
  69. for (i = 0; i < npats; i++)
  70. if (!strcmp(pat, pattern[i].re)) {
  71. pattern[i].use++;
  72. return((void *) pattern[i].program);
  73. }
  74. }
  75. /* Preprocess Pattern for compilation */
  76. p = re;
  77. s = pat;
  78. inclass = 0;
  79. while (c = *s++) {
  80. if (c == '\\') {
  81. quoted(&s, &p, re+MAXRE);
  82. continue;
  83. }
  84. else if (!inclass && c == '(' && *s == ')') {
  85. if (p < re+MAXRE-2) { /* '()' -> '[]*' */
  86. *p++ = '[';
  87. *p++ = ']';
  88. c = '*';
  89. s++;
  90. }
  91. else overflow();
  92. }
  93. else if (c == '['){ /* '[-' -> '[\-' */
  94. inclass = 1;
  95. if (*s == '-') {
  96. if (p < re+MAXRE-2) {
  97. *p++ = '[';
  98. *p++ = '\\';
  99. c = *s++;
  100. }
  101. else overflow();
  102. } /* '[^-' -> '[^\-'*/
  103. else if (*s == '^' && s[1] == '-'){
  104. if (p < re+MAXRE-3) {
  105. *p++ = '[';
  106. *p++ = *s++;
  107. *p++ = '\\';
  108. c = *s++;
  109. }
  110. else overflow();
  111. }
  112. else if (*s == '['){ /* skip '[[' */
  113. if (p < re+MAXRE-1)
  114. *p++ = c;
  115. else overflow();
  116. c = *s++;
  117. }
  118. else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
  119. if (p < re+MAXRE-2) {
  120. *p++ = c;
  121. *p++ = *s++;
  122. c = *s++;
  123. }
  124. else overflow();
  125. }
  126. else if (*s == ']') { /* '[]' -> '[]*' */
  127. if (p < re+MAXRE-2) {
  128. *p++ = c;
  129. *p++ = *s++;
  130. c = '*';
  131. inclass = 0;
  132. }
  133. else overflow();
  134. }
  135. }
  136. else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
  137. if (p < re+MAXRE-1)
  138. *p++ = '\\';
  139. else overflow();
  140. }
  141. else if (c == ']')
  142. inclass = 0;
  143. if (p < re+MAXRE-1)
  144. *p++ = c;
  145. else overflow();
  146. }
  147. *p = 0;
  148. program = regcomp(re); /* compile pattern */
  149. if (!compile_time) {
  150. if (npats < NPATS) /* Room in cache */
  151. i = npats++;
  152. else { /* Throw out least used */
  153. int use = pattern[0].use;
  154. i = 0;
  155. for (j = 1; j < NPATS; j++) {
  156. if (pattern[j].use < use) {
  157. use = pattern[j].use;
  158. i = j;
  159. }
  160. }
  161. xfree(pattern[i].program);
  162. xfree(pattern[i].re);
  163. }
  164. pattern[i].re = tostring(pat);
  165. pattern[i].program = program;
  166. pattern[i].use = 1;
  167. }
  168. return((void *) program);
  169. }
  170. /* T/F match indication - matched string not exported */
  171. int
  172. match(void *p, char *s, char * _)
  173. {
  174. return regexec((Reprog *) p, (char *) s, 0, 0);
  175. }
  176. /* match and delimit the matched string */
  177. int
  178. pmatch(void *p, char *s, char *start)
  179. {
  180. Resub m;
  181. m.sp = start;
  182. m.ep = 0;
  183. if (regexec((Reprog *) p, (char *) s, &m, 1)) {
  184. patbeg = m.sp;
  185. patlen = m.ep-m.sp;
  186. return 1;
  187. }
  188. patlen = -1;
  189. patbeg = start;
  190. return 0;
  191. }
  192. /* perform a non-empty match */
  193. int
  194. nematch(void *p, char *s, char *start)
  195. {
  196. if (pmatch(p, s, start) == 1 && patlen > 0)
  197. return 1;
  198. patlen = -1;
  199. patbeg = start;
  200. return 0;
  201. }
  /* in the parsing of regular expressions, metacharacters like . have */
  /* to be seen literally; \056 is not a metacharacter. */

  /*
   * Evaluate the hexadecimal digits at *pp.
   *
   * At most four digits are consumed; scanning stops at the first
   * character that is not a hex digit.  *pp is advanced past the digits
   * that were used and their value is returned (0 when there are none,
   * in which case *pp is left unchanged).
   */
  int
  hexstr(char **pp) /* find and eval hex string at pp, return new p */
  {
      const char *p = *pp;
      int n = 0;
      int i;

      for (i = 0; i < 4; i++) {
          /*
           * Classify through unsigned char: handing a negative plain
           * char (a byte >= 0x80 where char is signed) to the ctype
           * functions is undefined.
           */
          unsigned char c = (unsigned char) p[i];

          if (!isxdigit(c))
              break;
          if (isdigit(c))
              n = 16 * n + (c - '0');
          else if ('a' <= c && c <= 'f')
              n = 16 * n + (c - 'a' + 10);
          else if ('A' <= c && c <= 'F')
              n = 16 * n + (c - 'A' + 10);
      }
      *pp += i;
      return n;
  }
  221. /* look for awk-specific escape sequences */
  222. #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
  223. void
  224. quoted(char **s, char **to, char *end) /* handle escaped sequence */
  225. {
  226. char *p = *s;
  227. char *t = *to;
  228. Rune c;
  229. switch(c = *p++) {
  230. case 't':
  231. c = '\t';
  232. break;
  233. case 'n':
  234. c = '\n';
  235. break;
  236. case 'f':
  237. c = '\f';
  238. break;
  239. case 'r':
  240. c = '\r';
  241. break;
  242. case 'b':
  243. c = '\b';
  244. break;
  245. default:
  246. if (t < end-1) /* all else must be escaped */
  247. *t++ = '\\';
  248. if (c == 'x') { /* hexadecimal goo follows */
  249. c = hexstr(&p);
  250. if (t < end-UTFmax)
  251. t += runelen(c);
  252. else overflow();
  253. *to = t;
  254. *s = p;
  255. return;
  256. } else if (isoctdigit(c)) { /* \d \dd \ddd */
  257. c -= '0';
  258. if (isoctdigit(*p)) {
  259. c = 8 * c + *p++ - '0';
  260. if (isoctdigit(*p))
  261. c = 8 * c + *p++ - '0';
  262. }
  263. }
  264. break;
  265. }
  266. if (t < end-1)
  267. *t++ = c;
  268. *s = p;
  269. *to = t;
  270. }
  271. /* pattern package error handler: hands the message s to FATAL */
  272. void
  273. regerror(char *s)
  274. {
  275. FATAL("%s", s); /* s is passed as the argument of "%s", not as the format itself */
  276. }
  277. void
  278. overflow(void) /* called when a preprocessed pattern does not fit in the copy buffer */
  279. {
  280. FATAL("%s", "regular expression too big");
  281. }