re.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. /****************************************************************
  10. Copyright (C) Lucent Technologies 1997
  11. All Rights Reserved
  12. Permission to use, copy, modify, and distribute this software and
  13. its documentation for any purpose and without fee is hereby
  14. granted, provided that the above copyright notice appear in all
  15. copies and that both that the copyright notice and this
  16. permission notice and warranty disclaimer appear in supporting
  17. documentation, and that the name Lucent Technologies or any of
  18. its entities not be used in advertising or publicity pertaining
  19. to distribution of the software without specific, written prior
  20. permission.
  21. LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  22. INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  23. IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  24. SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  25. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  26. IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  27. ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  28. THIS SOFTWARE.
  29. ****************************************************************/
  30. #define DEBUG
  31. #include <stdio.h>
  32. #include <ctype.h>
  33. #include <setjmp.h>
  34. #include <math.h>
  35. #include <string.h>
  36. #include <stdlib.h>
  37. #include <time.h>
  38. #include "awk.h"
  39. #include "y.tab.h"
  40. #include "regexp.h"
  41. /* This file provides the interface between the main body of
  42. * awk and the pattern matching package. It preprocesses
  43. * patterns prior to compilation to provide awk-like semantics
  44. * to character sequences not supported by the pattern package.
  45. * The following conversions are performed:
  46. *
  47. * "()" -> "[]"
  48. * "[-" -> "[\-"
  49. * "[^-" -> "[^\-"
  50. * "-]" -> "\-]"
  51. * "[]" -> "[]*"
  52. * "\xdddd" -> "\z" where 'z' is the UTF sequence
  53. * for the hex value
  54. * "\ddd" -> "\o" where 'o' is a char octal value
  55. * "\b" -> "\B" where 'B' is backspace
  56. * "\t" -> "\T" where 'T' is tab
  57. * "\f" -> "\F" where 'F' is form feed
  58. * "\n" -> "\N" where 'N' is newline
  59. * "\r" -> "\r" where 'C' is cr
  60. */
  61. #define MAXRE 512
  62. static char re[MAXRE]; /* copy buffer */
  63. char *patbeg;
  64. int patlen; /* number of chars in pattern */
  65. #define NPATS 20 /* number of slots in pattern cache */
  66. static struct pat_list /* dynamic pattern cache */
  67. {
  68. char *re;
  69. int use;
  70. Reprog *program;
  71. } pattern[NPATS];
  72. static int npats; /* cache fill level */
  73. /* Compile a pattern */
  74. void
  75. *compre(char *pat)
  76. {
  77. int i, j, inclass;
  78. char c, *p, *s;
  79. Reprog *program;
  80. if (!compile_time) { /* search cache for dynamic pattern */
  81. for (i = 0; i < npats; i++)
  82. if (!strcmp(pat, pattern[i].re)) {
  83. pattern[i].use++;
  84. return((void *) pattern[i].program);
  85. }
  86. }
  87. /* Preprocess Pattern for compilation */
  88. p = re;
  89. s = pat;
  90. inclass = 0;
  91. while (c = *s++) {
  92. if (c == '\\') {
  93. quoted(&s, &p, re+MAXRE);
  94. continue;
  95. }
  96. else if (!inclass && c == '(' && *s == ')') {
  97. if (p < re+MAXRE-2) { /* '()' -> '[]*' */
  98. *p++ = '[';
  99. *p++ = ']';
  100. c = '*';
  101. s++;
  102. }
  103. else overflow();
  104. }
  105. else if (c == '['){ /* '[-' -> '[\-' */
  106. inclass = 1;
  107. if (*s == '-') {
  108. if (p < re+MAXRE-2) {
  109. *p++ = '[';
  110. *p++ = '\\';
  111. c = *s++;
  112. }
  113. else overflow();
  114. } /* '[^-' -> '[^\-'*/
  115. else if (*s == '^' && s[1] == '-'){
  116. if (p < re+MAXRE-3) {
  117. *p++ = '[';
  118. *p++ = *s++;
  119. *p++ = '\\';
  120. c = *s++;
  121. }
  122. else overflow();
  123. }
  124. else if (*s == '['){ /* skip '[[' */
  125. if (p < re+MAXRE-1)
  126. *p++ = c;
  127. else overflow();
  128. c = *s++;
  129. }
  130. else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
  131. if (p < re+MAXRE-2) {
  132. *p++ = c;
  133. *p++ = *s++;
  134. c = *s++;
  135. }
  136. else overflow();
  137. }
  138. else if (*s == ']') { /* '[]' -> '[]*' */
  139. if (p < re+MAXRE-2) {
  140. *p++ = c;
  141. *p++ = *s++;
  142. c = '*';
  143. inclass = 0;
  144. }
  145. else overflow();
  146. }
  147. }
  148. else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
  149. if (p < re+MAXRE-1)
  150. *p++ = '\\';
  151. else overflow();
  152. }
  153. else if (c == ']')
  154. inclass = 0;
  155. if (p < re+MAXRE-1)
  156. *p++ = c;
  157. else overflow();
  158. }
  159. *p = 0;
  160. program = regcomp(re); /* compile pattern */
  161. if (!compile_time) {
  162. if (npats < NPATS) /* Room in cache */
  163. i = npats++;
  164. else { /* Throw out least used */
  165. int use = pattern[0].use;
  166. i = 0;
  167. for (j = 1; j < NPATS; j++) {
  168. if (pattern[j].use < use) {
  169. use = pattern[j].use;
  170. i = j;
  171. }
  172. }
  173. xfree(pattern[i].program);
  174. xfree(pattern[i].re);
  175. }
  176. pattern[i].re = tostring(pat);
  177. pattern[i].program = program;
  178. pattern[i].use = 1;
  179. }
  180. return((void *) program);
  181. }
  182. /* T/F match indication - matched string not exported */
  183. int
  184. match(void *p, char *s, char *)
  185. {
  186. return regexec((Reprog *) p, (char *) s, 0, 0);
  187. }
  188. /* match and delimit the matched string */
  189. int
  190. pmatch(void *p, char *s, char *start)
  191. {
  192. Resub m;
  193. m.s.sp = start;
  194. m.e.ep = 0;
  195. if (regexec((Reprog *) p, (char *) s, &m, 1)) {
  196. patbeg = m.s.sp;
  197. patlen = m.e.ep-m.s.sp;
  198. return 1;
  199. }
  200. patlen = -1;
  201. patbeg = start;
  202. return 0;
  203. }
  204. /* perform a non-empty match */
  205. int
  206. nematch(void *p, char *s, char *start)
  207. {
  208. if (pmatch(p, s, start) == 1 && patlen > 0)
  209. return 1;
  210. patlen = -1;
  211. patbeg = start;
  212. return 0;
  213. }
  214. /* in the parsing of regular expressions, metacharacters like . have */
  215. /* to be seen literally; \056 is not a metacharacter. */
  216. hexstr(char **pp) /* find and eval hex string at pp, return new p */
  217. {
  218. char c;
  219. int n = 0;
  220. int i;
  221. for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
  222. if (isdigit(c))
  223. n = 16 * n + c - '0';
  224. else if ('a' <= c && c <= 'f')
  225. n = 16 * n + c - 'a' + 10;
  226. else if ('A' <= c && c <= 'F')
  227. n = 16 * n + c - 'A' + 10;
  228. }
  229. *pp += i;
  230. return n;
  231. }
  232. /* look for awk-specific escape sequences */
  233. #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
  234. void
  235. quoted(char **s, char **to, char *end) /* handle escaped sequence */
  236. {
  237. char *p = *s;
  238. char *t = *to;
  239. wchar_t c;
  240. switch(c = *p++) {
  241. case 't':
  242. c = '\t';
  243. break;
  244. case 'n':
  245. c = '\n';
  246. break;
  247. case 'f':
  248. c = '\f';
  249. break;
  250. case 'r':
  251. c = '\r';
  252. break;
  253. case 'b':
  254. c = '\b';
  255. break;
  256. default:
  257. if (t < end-1) /* all else must be escaped */
  258. *t++ = '\\';
  259. if (c == 'x') { /* hexadecimal goo follows */
  260. c = hexstr(&p);
  261. if (t < end-MB_CUR_MAX)
  262. t += wctomb(t, c);
  263. else overflow();
  264. *to = t;
  265. *s = p;
  266. return;
  267. } else if (isoctdigit(c)) { /* \d \dd \ddd */
  268. c -= '0';
  269. if (isoctdigit(*p)) {
  270. c = 8 * c + *p++ - '0';
  271. if (isoctdigit(*p))
  272. c = 8 * c + *p++ - '0';
  273. }
  274. }
  275. break;
  276. }
  277. if (t < end-1)
  278. *t++ = c;
  279. *s = p;
  280. *to = t;
  281. }
  282. /* count rune positions */
  283. int
  284. countposn(char *s, int n)
  285. {
  286. int i, j;
  287. char *end;
  288. for (i = 0, end = s+n; *s && s < end; i++){
  289. j = mblen(s, n);
  290. if(j <= 0)
  291. j = 1;
  292. s += j;
  293. }
  294. return(i);
  295. }
  296. /* pattern package error handler */
  297. void
  298. regerror(char *s)
  299. {
  300. FATAL("%s", s);
  301. }
  302. void
  303. overflow(void)
  304. {
  305. FATAL("%s", "regular expression too big");
  306. }