123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
- /*
- * This file is part of the UCB release of Plan 9. It is subject to the license
- * terms in the LICENSE file found in the top-level directory of this
- * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
- * part of the UCB release of Plan 9, including this file, may be copied,
- * modified, propagated, or distributed except according to the terms contained
- * in the LICENSE file.
- */
- /****************************************************************
- Copyright (C) Lucent Technologies 1997
- All Rights Reserved
- Permission to use, copy, modify, and distribute this software and
- its documentation for any purpose and without fee is hereby
- granted, provided that the above copyright notice appear in all
- copies and that both that the copyright notice and this
- permission notice and warranty disclaimer appear in supporting
- documentation, and that the name Lucent Technologies or any of
- its entities not be used in advertising or publicity pertaining
- to distribution of the software without specific, written prior
- permission.
- LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
- IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
- SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
- IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
- ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
- THIS SOFTWARE.
- ****************************************************************/
- #define DEBUG
- #include <stdio.h>
- #include <ctype.h>
- #include <setjmp.h>
- #include <math.h>
- #include <string.h>
- #include <stdlib.h>
- #include <time.h>
- #include "awk.h"
- #include "y.tab.h"
- #include "regexp.h"
- /* This file provides the interface between the main body of
- * awk and the pattern matching package. It preprocesses
- * patterns prior to compilation to provide awk-like semantics
- * to character sequences not supported by the pattern package.
- * The following conversions are performed:
- *
- * "()" -> "[]*"
- * "[-" -> "[\-"
- * "[^-" -> "[^\-"
- * "-]" -> "\-]"
- * "[]" -> "[]*"
- * "\xdddd" -> "\z" where 'z' is the UTF sequence
- * for the hex value
- * "\ddd" -> "\o" where 'o' is a char octal value
- * "\b" -> "\B" where 'B' is backspace
- * "\t" -> "\T" where 'T' is tab
- * "\f" -> "\F" where 'F' is form feed
- * "\n" -> "\N" where 'N' is newline
- * "\r" -> "\C" where 'C' is cr
- */
- #define MAXRE 512
- static char re[MAXRE]; /* copy buffer */
- char *patbeg;
- int patlen; /* number of chars in pattern */
- #define NPATS 20 /* number of slots in pattern cache */
- static struct pat_list /* dynamic pattern cache */
- {
- char *re;
- int use;
- Reprog *program;
- } pattern[NPATS];
- static int npats; /* cache fill level */
- /* Compile a pattern */
- void
- *compre(char *pat)
- {
- int i, j, inclass;
- char c, *p, *s;
- Reprog *program;
- if (!compile_time) { /* search cache for dynamic pattern */
- for (i = 0; i < npats; i++)
- if (!strcmp(pat, pattern[i].re)) {
- pattern[i].use++;
- return((void *) pattern[i].program);
- }
- }
- /* Preprocess Pattern for compilation */
- p = re;
- s = pat;
- inclass = 0;
- while (c = *s++) {
- if (c == '\\') {
- quoted(&s, &p, re+MAXRE);
- continue;
- }
- else if (!inclass && c == '(' && *s == ')') {
- if (p < re+MAXRE-2) { /* '()' -> '[]*' */
- *p++ = '[';
- *p++ = ']';
- c = '*';
- s++;
- }
- else overflow();
- }
- else if (c == '['){ /* '[-' -> '[\-' */
- inclass = 1;
- if (*s == '-') {
- if (p < re+MAXRE-2) {
- *p++ = '[';
- *p++ = '\\';
- c = *s++;
- }
- else overflow();
- } /* '[^-' -> '[^\-'*/
- else if (*s == '^' && s[1] == '-'){
- if (p < re+MAXRE-3) {
- *p++ = '[';
- *p++ = *s++;
- *p++ = '\\';
- c = *s++;
- }
- else overflow();
- }
- else if (*s == '['){ /* skip '[[' */
- if (p < re+MAXRE-1)
- *p++ = c;
- else overflow();
- c = *s++;
- }
- else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
- if (p < re+MAXRE-2) {
- *p++ = c;
- *p++ = *s++;
- c = *s++;
- }
- else overflow();
- }
- else if (*s == ']') { /* '[]' -> '[]*' */
- if (p < re+MAXRE-2) {
- *p++ = c;
- *p++ = *s++;
- c = '*';
- inclass = 0;
- }
- else overflow();
- }
- }
- else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
- if (p < re+MAXRE-1)
- *p++ = '\\';
- else overflow();
- }
- else if (c == ']')
- inclass = 0;
- if (p < re+MAXRE-1)
- *p++ = c;
- else overflow();
- }
- *p = 0;
- program = regcomp(re); /* compile pattern */
- if (!compile_time) {
- if (npats < NPATS) /* Room in cache */
- i = npats++;
- else { /* Throw out least used */
- int use = pattern[0].use;
- i = 0;
- for (j = 1; j < NPATS; j++) {
- if (pattern[j].use < use) {
- use = pattern[j].use;
- i = j;
- }
- }
- xfree(pattern[i].program);
- xfree(pattern[i].re);
- }
- pattern[i].re = tostring(pat);
- pattern[i].program = program;
- pattern[i].use = 1;
- }
- return((void *) program);
- }
- /* T/F match indication - matched string not exported */
- int
- match(void *p, char *s, char *)
- {
- return regexec((Reprog *) p, (char *) s, 0, 0);
- }
- /* match and delimit the matched string */
- int
- pmatch(void *p, char *s, char *start)
- {
- Resub m;
- m.s.sp = start;
- m.e.ep = 0;
- if (regexec((Reprog *) p, (char *) s, &m, 1)) {
- patbeg = m.s.sp;
- patlen = m.e.ep-m.s.sp;
- return 1;
- }
- patlen = -1;
- patbeg = start;
- return 0;
- }
- /* perform a non-empty match */
- int
- nematch(void *p, char *s, char *start)
- {
- if (pmatch(p, s, start) == 1 && patlen > 0)
- return 1;
- patlen = -1;
- patbeg = start;
- return 0;
- }
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally; \056 is not a metacharacter. */

/*
 * hexstr: find and eval hex string at pp, return new p.
 *
 * Reads at most four hexadecimal digits starting at *pp, advances *pp
 * past the digits that were consumed, and returns their value.  When
 * *pp does not begin with a hex digit, *pp is left alone and 0 is
 * returned.
 */
int
hexstr(char **pp)
{
	unsigned char c;	/* unsigned: <ctype.h> is undefined for negative values */
	int n = 0;
	int i;

	for (i = 0; i < 4; i++) {
		c = (unsigned char) (*pp)[i];
		if (!isxdigit(c))
			break;
		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
/* look for awk-specific escape sequences */

#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

/*
 * quoted: handle escaped sequence.
 *
 * On entry *s points at the character following a backslash in the
 * source pattern and *to at the next free byte of the output buffer,
 * whose end is `end`.  On return both have been advanced past what was
 * consumed and produced.
 *
 *   \t \n \f \r \b   the single control character, with no backslash
 *   \xh..            backslash + the multibyte encoding of up to four
 *                    hex digits, as produced by wctomb()
 *   \d \dd \ddd      backslash + one byte holding the octal value
 *   anything else    backslash + that character, unchanged
 *   end of string    a lone backslash; *s is left on the terminating
 *                    NUL so the caller's scan stops there
 *
 * overflow() is called whenever the output would not fit.
 */
void
quoted(char **s, char **to, char *end)
{
	char *p = *s;
	char *t = *to;
	wchar_t c;
	int len;

	if (*p == '\0') {
		/*
		 * Pattern ends in a backslash.  Do not step over the
		 * terminator: the caller continues scanning from *s.
		 */
		if (t < end-1)
			*t++ = '\\';
		else
			overflow();
		*to = t;
		return;
	}

	switch (c = *p++) {
	case 't':
		c = '\t';
		break;
	case 'n':
		c = '\n';
		break;
	case 'f':
		c = '\f';
		break;
	case 'r':
		c = '\r';
		break;
	case 'b':
		c = '\b';
		break;
	default:
		if (t < end-1)		/* all else must be escaped */
			*t++ = '\\';
		else
			overflow();
		if (c == 'x') {		/* hexadecimal goo follows */
			c = hexstr(&p);
			if (t < end-MB_CUR_MAX) {
				len = wctomb(t, c);
				if (len < 0) {
					/*
					 * No encoding in the current locale.
					 * Adding -1 would back up over the
					 * backslash just written, so store the
					 * low-order byte instead, the same way
					 * the octal branch stores its value.
					 */
					*t = (char) c;
					len = 1;
				}
				t += len;
			} else
				overflow();
			*to = t;
			*s = p;
			return;
		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
			c -= '0';
			if (isoctdigit(*p)) {
				c = 8 * c + *p++ - '0';
				if (isoctdigit(*p))
					c = 8 * c + *p++ - '0';
			}
		}
		break;
	}

	if (t < end-1)
		*t++ = c;
	else
		overflow();	/* never drop the escaped character silently */
	*s = p;
	*to = t;
}
/*
 * count rune positions
 *
 * Returns the number of character positions in the first n bytes of s,
 * stopping early at a NUL byte.  Each step advances by the length that
 * mblen() reports for the current locale; a byte sequence that mblen()
 * rejects is counted as a single one-byte position.  A non-positive n
 * yields 0.
 */
int
countposn(char *s, int n)
{
	char *end;
	int count, len;

	if (n <= 0)
		return 0;

	count = 0;
	/* bound check first so that s[n] itself is never read */
	for (end = s + n; s < end && *s; s += len) {
		/* let mblen look only at the bytes that remain before end */
		len = mblen(s, (size_t) (end - s));
		if (len <= 0)
			len = 1;
		count++;
	}
	return count;
}
/*
 * pattern package error handler
 *
 * Forwards the message s to FATAL() verbatim; the "%s" format keeps any
 * '%' characters in s from being interpreted as conversions.
 */
void
regerror(char *s)
{
	FATAL("%s", s);
}
/*
 * Report through FATAL() that a rewritten pattern does not fit in the
 * fixed-size copy buffer.
 */
void
overflow(void)
{
	FATAL("%s", "regular expression too big");
}
|