123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 |
- #include "grep.h"
- /*
- * incremental compiler.
- * add the branch c to the
- * state s.
- */
- void
- increment(State *s, int c)
- {
- int i;
- State *t, **tt;
- Re *re1, *re2;
- nfollow = 0;
- gen++;
- matched = 0;
- for(i=0; i<s->count; i++)
- fol1(s->re[i], c);
- qsort(follow, nfollow, sizeof(*follow), fcmp);
- for(tt=&state0; t = *tt;) {
- if(t->count > nfollow) {
- tt = &t->linkleft;
- goto cont;
- }
- if(t->count < nfollow) {
- tt = &t->linkright;
- goto cont;
- }
- for(i=0; i<nfollow; i++) {
- re1 = t->re[i];
- re2 = follow[i];
- if(re1 > re2) {
- tt = &t->linkleft;
- goto cont;
- }
- if(re1 < re2) {
- tt = &t->linkright;
- goto cont;
- }
- }
- if(!!matched && !t->match) {
- tt = &t->linkleft;
- goto cont;
- }
- if(!matched && !!t->match) {
- tt = &t->linkright;
- goto cont;
- }
- s->next[c] = t;
- return;
- cont:;
- }
- t = sal(nfollow);
- *tt = t;
- for(i=0; i<nfollow; i++) {
- re1 = follow[i];
- t->re[i] = re1;
- }
- s->next[c] = t;
- t->match = matched;
- }
- int
- fcmp(void *va, void *vb)
- {
- Re **aa, **bb;
- Re *a, *b;
- aa = va;
- bb = vb;
- a = *aa;
- b = *bb;
- if(a > b)
- return 1;
- if(a < b)
- return -1;
- return 0;
- }
- void
- fol1(Re *r, int c)
- {
- Re *r1;
- loop:
- if(r->gen == gen)
- return;
- if(nfollow >= maxfollow)
- error("nfollow");
- r->gen = gen;
- switch(r->type) {
- default:
- error("fol1");
- case Tcase:
- if(c >= 0 && c < 256)
- if(r1 = r->cases[c])
- follow[nfollow++] = r1;
- if(r = r->next)
- goto loop;
- break;
- case Talt:
- case Tor:
- fol1(r->alt, c);
- r = r->next;
- goto loop;
- case Tbegin:
- if(c == '\n' || c == Cbegin)
- follow[nfollow++] = r->next;
- break;
- case Tend:
- if(c == '\n') {
- if(r->next == 0) {
- matched = 1;
- break;
- }
- r = r->next;
- goto loop;
- }
- break;
- case Tclass:
- if(c >= r->lo && c <= r->hi)
- follow[nfollow++] = r->next;
- break;
- }
- }
- Rune tab1[] =
- {
- 0x007f,
- 0x07ff,
- };
- Rune tab2[] =
- {
- 0x003f,
- 0x0fff,
- };
- Re2
- rclass(Rune p0, Rune p1)
- {
- char xc0[6], xc1[6];
- int i, n, m;
- Re2 x;
- if(p0 > p1)
- return re2char(0xff, 0xff); // no match
- /*
- * bust range into same length
- * character sequences
- */
- for(i=0; i<nelem(tab1); i++) {
- m = tab1[i];
- if(p0 <= m && p1 > m)
- return re2or(rclass(p0, m), rclass(m+1, p1));
- }
- /*
- * bust range into part of a single page
- * or into full pages
- */
- for(i=0; i<nelem(tab2); i++) {
- m = tab2[i];
- if((p0 & ~m) != (p1 & ~m)) {
- if((p0 & m) != 0)
- return re2or(rclass(p0, p0|m), rclass((p0|m)+1, p1));
- if((p1 & m) != m)
- return re2or(rclass(p0, (p1&~m)-1), rclass(p1&~m, p1));
- }
- }
- n = runetochar(xc0, &p0);
- i = runetochar(xc1, &p1);
- if(i != n)
- error("length");
- x = re2char(xc0[0], xc1[0]);
- for(i=1; i<n; i++)
- x = re2cat(x, re2char(xc0[i], xc1[i]));
- return x;
- }
- int
- pcmp(void *va, void *vb)
- {
- int n;
- Rune *a, *b;
- a = va;
- b = vb;
- n = a[0] - b[0];
- if(n)
- return n;
- return a[1] - b[1];
- }
- /*
- * convert character chass into
- * run-pair ranges of matches.
- * this is 10646/utf specific and
- * needs to be changed for some
- * other input character set.
- * this is the key to a fast
- * regular search of characters
- * by looking at sequential bytes.
- */
- Re2
- re2class(char *s)
- {
- Rune pairs[200], *p, *q, ov;
- int nc;
- Re2 x;
- nc = 0;
- if(*s == '^') {
- nc = 1;
- s++;
- }
- p = pairs;
- s += chartorune(p, s);
- for(;;) {
- if(*p == '\\')
- s += chartorune(p, s);
- if(*p == 0)
- break;
- p[1] = *p;
- p += 2;
- s += chartorune(p, s);
- if(*p != '-')
- continue;
- s += chartorune(p, s);
- if(*p == '\\')
- s += chartorune(p, s);
- if(*p == 0)
- break;
- p[-1] = *p;
- s += chartorune(p, s);
- }
- *p = 0;
- qsort(pairs, (p-pairs)/2, 2*sizeof(*pairs), pcmp);
- q = pairs;
- for(p=pairs+2; *p; p+=2) {
- if(p[0] > p[1])
- continue;
- if(p[0] > q[1] || p[1] < q[0]) {
- q[2] = p[0];
- q[3] = p[1];
- q += 2;
- continue;
- }
- if(p[0] < q[0])
- q[0] = p[0];
- if(p[1] > q[1])
- q[1] = p[1];
- }
- q[2] = 0;
- p = pairs;
- if(nc) {
- x = rclass(0, p[0]-1);
- ov = p[1]+1;
- for(p+=2; *p; p+=2) {
- x = re2or(x, rclass(ov, p[0]-1));
- ov = p[1]+1;
- }
- x = re2or(x, rclass(ov, 0xffff));
- } else {
- x = rclass(p[0], p[1]);
- for(p+=2; *p; p+=2)
- x = re2or(x, rclass(p[0], p[1]));
- }
- return x;
- }
|