123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802 |
- #include "sam.h"
- Rangeset sel;
- String lastregexp;
- /*
- * Machine Information
- */
- typedef struct Inst Inst;
- struct Inst
- {
- long type; /* < 0x10000 ==> literal, otherwise action */
- union {
- int rsid;
- int rsubid;
- int class;
- struct Inst *rother;
- struct Inst *rright;
- } r;
- union{
- struct Inst *lleft;
- struct Inst *lnext;
- } l;
- };
- #define sid r.rsid
- #define subid r.rsubid
- #define rclass r.class
- #define other r.rother
- #define right r.rright
- #define left l.lleft
- #define next l.lnext
- #define NPROG 1024
- Inst program[NPROG];
- Inst *progp;
- Inst *startinst; /* First inst. of program; might not be program[0] */
- Inst *bstartinst; /* same for backwards machine */
- typedef struct Ilist Ilist;
- struct Ilist
- {
- Inst *inst; /* Instruction of the thread */
- Rangeset se;
- Posn startp; /* first char of match */
- };
- #define NLIST 127
- Ilist *tl, *nl; /* This list, next list */
- Ilist list[2][NLIST+1]; /* +1 for trailing null */
- static Rangeset sempty;
- /*
- * Actions and Tokens
- *
- * 0x100xx are operators, value == precedence
- * 0x200xx are tokens, i.e. operands for operators
- */
- #define OPERATOR 0x10000 /* Bitmask of all operators */
- #define START 0x10000 /* Start, used for marker on stack */
- #define RBRA 0x10001 /* Right bracket, ) */
- #define LBRA 0x10002 /* Left bracket, ( */
- #define OR 0x10003 /* Alternation, | */
- #define CAT 0x10004 /* Concatentation, implicit operator */
- #define STAR 0x10005 /* Closure, * */
- #define PLUS 0x10006 /* a+ == aa* */
- #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
- #define ANY 0x20000 /* Any character but newline, . */
- #define NOP 0x20001 /* No operation, internal use only */
- #define BOL 0x20002 /* Beginning of line, ^ */
- #define EOL 0x20003 /* End of line, $ */
- #define CCLASS 0x20004 /* Character class, [] */
- #define NCCLASS 0x20005 /* Negated character class, [^] */
- #define END 0x20077 /* Terminate: match found */
- #define ISATOR 0x10000
- #define ISAND 0x20000
- /*
- * Parser Information
- */
- typedef struct Node Node;
- struct Node
- {
- Inst *first;
- Inst *last;
- };
- #define NSTACK 20
- Node andstack[NSTACK];
- Node *andp;
- int atorstack[NSTACK];
- int *atorp;
- int lastwasand; /* Last token was operand */
- int cursubid;
- int subidstack[NSTACK];
- int *subidp;
- int backwards;
- int nbra;
- Rune *exprp; /* pointer to next character in source expression */
- #define DCLASS 10 /* allocation increment */
- int nclass; /* number active */
- int Nclass; /* high water mark */
- Rune **class;
- int negateclass;
- int addinst(Ilist *l, Inst *inst, Rangeset *sep);
- void newmatch(Rangeset*);
- void bnewmatch(Rangeset*);
- void pushand(Inst*, Inst*);
- void pushator(int);
- Node *popand(int);
- int popator(void);
- void startlex(Rune*);
- int lex(void);
- void operator(int);
- void operand(int);
- void evaluntil(int);
- void optimize(Inst*);
- void bldcclass(void);
- void
- regerror(Err e)
- {
- Strzero(&lastregexp);
- error(e);
- }
- void
- regerror_c(Err e, int c)
- {
- Strzero(&lastregexp);
- error_c(e, c);
- }
- Inst *
- newinst(int t)
- {
- if(progp >= &program[NPROG])
- regerror(Etoolong);
- progp->type = t;
- progp->left = 0;
- progp->right = 0;
- return progp++;
- }
- Inst *
- realcompile(Rune *s)
- {
- int token;
- startlex(s);
- atorp = atorstack;
- andp = andstack;
- subidp = subidstack;
- cursubid = 0;
- lastwasand = FALSE;
- /* Start with a low priority operator to prime parser */
- pushator(START-1);
- while((token=lex()) != END){
- if((token&ISATOR) == OPERATOR)
- operator(token);
- else
- operand(token);
- }
- /* Close with a low priority operator */
- evaluntil(START);
- /* Force END */
- operand(END);
- evaluntil(START);
- if(nbra)
- regerror(Eleftpar);
- --andp; /* points to first and only operand */
- return andp->first;
- }
- void
- compile(String *s)
- {
- int i;
- Inst *oprogp;
- if(Strcmp(s, &lastregexp)==0)
- return;
- for(i=0; i<nclass; i++)
- free(class[i]);
- nclass = 0;
- progp = program;
- backwards = FALSE;
- startinst = realcompile(s->s);
- optimize(program);
- oprogp = progp;
- backwards = TRUE;
- bstartinst = realcompile(s->s);
- optimize(oprogp);
- Strduplstr(&lastregexp, s);
- }
- void
- operand(int t)
- {
- Inst *i;
- if(lastwasand)
- operator(CAT); /* catenate is implicit */
- i = newinst(t);
- if(t == CCLASS){
- if(negateclass)
- i->type = NCCLASS; /* UGH */
- i->rclass = nclass-1; /* UGH */
- }
- pushand(i, i);
- lastwasand = TRUE;
- }
- void
- operator(int t)
- {
- if(t==RBRA && --nbra<0)
- regerror(Erightpar);
- if(t==LBRA){
- /*
- * if(++cursubid >= NSUBEXP)
- * regerror(Esubexp);
- */
- cursubid++; /* silently ignored */
- nbra++;
- if(lastwasand)
- operator(CAT);
- }else
- evaluntil(t);
- if(t!=RBRA)
- pushator(t);
- lastwasand = FALSE;
- if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
- lastwasand = TRUE; /* these look like operands */
- }
- void
- cant(char *s)
- {
- char buf[100];
- sprint(buf, "regexp: can't happen: %s", s);
- panic(buf);
- }
- void
- pushand(Inst *f, Inst *l)
- {
- if(andp >= &andstack[NSTACK])
- cant("operand stack overflow");
- andp->first = f;
- andp->last = l;
- andp++;
- }
- void
- pushator(int t)
- {
- if(atorp >= &atorstack[NSTACK])
- cant("operator stack overflow");
- *atorp++=t;
- if(cursubid >= NSUBEXP)
- *subidp++= -1;
- else
- *subidp++=cursubid;
- }
- Node *
- popand(int op)
- {
- if(andp <= &andstack[0])
- if(op)
- regerror_c(Emissop, op);
- else
- regerror(Ebadregexp);
- return --andp;
- }
- int
- popator(void)
- {
- if(atorp <= &atorstack[0])
- cant("operator stack underflow");
- --subidp;
- return *--atorp;
- }
- void
- evaluntil(int pri)
- {
- Node *op1, *op2, *t;
- Inst *inst1, *inst2;
- while(pri==RBRA || atorp[-1]>=pri){
- switch(popator()){
- case LBRA:
- op1 = popand('(');
- inst2 = newinst(RBRA);
- inst2->subid = *subidp;
- op1->last->next = inst2;
- inst1 = newinst(LBRA);
- inst1->subid = *subidp;
- inst1->next = op1->first;
- pushand(inst1, inst2);
- return; /* must have been RBRA */
- default:
- panic("unknown regexp operator");
- break;
- case OR:
- op2 = popand('|');
- op1 = popand('|');
- inst2 = newinst(NOP);
- op2->last->next = inst2;
- op1->last->next = inst2;
- inst1 = newinst(OR);
- inst1->right = op1->first;
- inst1->left = op2->first;
- pushand(inst1, inst2);
- break;
- case CAT:
- op2 = popand(0);
- op1 = popand(0);
- if(backwards && op2->first->type!=END)
- t = op1, op1 = op2, op2 = t;
- op1->last->next = op2->first;
- pushand(op1->first, op2->last);
- break;
- case STAR:
- op2 = popand('*');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(inst1, inst1);
- break;
- case PLUS:
- op2 = popand('+');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(op2->first, inst1);
- break;
- case QUEST:
- op2 = popand('?');
- inst1 = newinst(OR);
- inst2 = newinst(NOP);
- inst1->left = inst2;
- inst1->right = op2->first;
- op2->last->next = inst2;
- pushand(inst1, inst2);
- break;
- }
- }
- }
- void
- optimize(Inst *start)
- {
- Inst *inst, *target;
- for(inst=start; inst->type!=END; inst++){
- target = inst->next;
- while(target->type == NOP)
- target = target->next;
- inst->next = target;
- }
- }
- #ifdef DEBUG
- void
- dumpstack(void){
- Node *stk;
- int *ip;
- dprint("operators\n");
- for(ip = atorstack; ip<atorp; ip++)
- dprint("0%o\n", *ip);
- dprint("operands\n");
- for(stk = andstack; stk<andp; stk++)
- dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
- }
- void
- dump(void){
- Inst *l;
- l = program;
- do{
- dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
- l->left-program, l->right-program);
- }while(l++->type);
- }
- #endif
- void
- startlex(Rune *s)
- {
- exprp = s;
- nbra = 0;
- }
- int
- lex(void){
- int c= *exprp++;
- switch(c){
- case '\\':
- if(*exprp)
- if((c= *exprp++)=='n')
- c='\n';
- break;
- case 0:
- c = END;
- --exprp; /* In case we come here again */
- break;
- case '*':
- c = STAR;
- break;
- case '?':
- c = QUEST;
- break;
- case '+':
- c = PLUS;
- break;
- case '|':
- c = OR;
- break;
- case '.':
- c = ANY;
- break;
- case '(':
- c = LBRA;
- break;
- case ')':
- c = RBRA;
- break;
- case '^':
- c = BOL;
- break;
- case '$':
- c = EOL;
- break;
- case '[':
- c = CCLASS;
- bldcclass();
- break;
- }
- return c;
- }
- long
- nextrec(void){
- if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
- regerror(Ebadclass);
- if(exprp[0] == '\\'){
- exprp++;
- if(*exprp=='n'){
- exprp++;
- return '\n';
- }
- return *exprp++|0x10000;
- }
- return *exprp++;
- }
- void
- bldcclass(void)
- {
- long c1, c2, n, na;
- Rune *classp;
- classp = emalloc(DCLASS*RUNESIZE);
- n = 0;
- na = DCLASS;
- /* we have already seen the '[' */
- if(*exprp == '^'){
- classp[n++] = '\n'; /* don't match newline in negate case */
- negateclass = TRUE;
- exprp++;
- }else
- negateclass = FALSE;
- while((c1 = nextrec()) != ']'){
- if(c1 == '-'){
- Error:
- free(classp);
- regerror(Ebadclass);
- }
- if(n+4 >= na){ /* 3 runes plus NUL */
- na += DCLASS;
- classp = erealloc(classp, na*RUNESIZE);
- }
- if(*exprp == '-'){
- exprp++; /* eat '-' */
- if((c2 = nextrec()) == ']')
- goto Error;
- classp[n+0] = 0xFFFF;
- classp[n+1] = c1;
- classp[n+2] = c2;
- n += 3;
- }else
- classp[n++] = c1;
- }
- classp[n] = 0;
- if(nclass == Nclass){
- Nclass += DCLASS;
- class = erealloc(class, Nclass*sizeof(Rune*));
- }
- class[nclass++] = classp;
- }
- int
- classmatch(int classno, int c, int negate)
- {
- Rune *p;
- p = class[classno];
- while(*p){
- if(*p == 0xFFFF){
- if(p[1]<=c && c<=p[2])
- return !negate;
- p += 3;
- }else if(*p++ == c)
- return !negate;
- }
- return negate;
- }
- /*
- * Note optimization in addinst:
- * *l must be pending when addinst called; if *l has been looked
- * at already, the optimization is a bug.
- */
- int
- addinst(Ilist *l, Inst *inst, Rangeset *sep)
- {
- Ilist *p;
- for(p = l; p->inst; p++){
- if(p->inst==inst){
- if((sep)->p[0].p1 < p->se.p[0].p1)
- p->se= *sep; /* this would be bug */
- return 0; /* It's already there */
- }
- }
- p->inst = inst;
- p->se= *sep;
- (p+1)->inst = 0;
- return 1;
- }
- int
- execute(File *f, Posn startp, Posn eof)
- {
- int flag = 0;
- Inst *inst;
- Ilist *tlp;
- Posn p = startp;
- int nnl = 0, ntl;
- int c;
- int wrapped = 0;
- int startchar = startinst->type<OPERATOR? startinst->type : 0;
- list[0][0].inst = list[1][0].inst = 0;
- sel.p[0].p1 = -1;
- /* Execute machine once for each character */
- for(;;p++){
- doloop:
- c = filereadc(f, p);
- if(p>=eof || c<0){
- switch(wrapped++){
- case 0: /* let loop run one more click */
- case 2:
- break;
- case 1: /* expired; wrap to beginning */
- if(sel.p[0].p1>=0 || eof!=INFINITY)
- goto Return;
- list[0][0].inst = list[1][0].inst = 0;
- p = 0;
- goto doloop;
- default:
- goto Return;
- }
- }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
- break;
- /* fast check for first char */
- if(startchar && nnl==0 && c!=startchar)
- continue;
- tl = list[flag];
- nl = list[flag^=1];
- nl->inst = 0;
- ntl = nnl;
- nnl = 0;
- if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
- /* Add first instruction to this list */
- sempty.p[0].p1 = p;
- if(addinst(tl, startinst, &sempty))
- if(++ntl >= NLIST)
- Overflow:
- error(Eoverflow);
- }
- /* Execute machine until this list is empty */
- for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
- Switchstmt:
- switch(inst->type){
- default: /* regular character */
- if(inst->type==c){
- Addinst:
- if(addinst(nl, inst->next, &tlp->se))
- if(++nnl >= NLIST)
- goto Overflow;
- }
- break;
- case LBRA:
- if(inst->subid>=0)
- tlp->se.p[inst->subid].p1 = p;
- inst = inst->next;
- goto Switchstmt;
- case RBRA:
- if(inst->subid>=0)
- tlp->se.p[inst->subid].p2 = p;
- inst = inst->next;
- goto Switchstmt;
- case ANY:
- if(c!='\n')
- goto Addinst;
- break;
- case BOL:
- if(p==0 || filereadc(f, p - 1)=='\n'){
- Step:
- inst = inst->next;
- goto Switchstmt;
- }
- break;
- case EOL:
- if(c == '\n')
- goto Step;
- break;
- case CCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 0))
- goto Addinst;
- break;
- case NCCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 1))
- goto Addinst;
- break;
- case OR:
- /* evaluate right choice later */
- if(addinst(tl, inst->right, &tlp->se))
- if(++ntl >= NLIST)
- goto Overflow;
- /* efficiency: advance and re-evaluate */
- inst = inst->left;
- goto Switchstmt;
- case END: /* Match! */
- tlp->se.p[0].p2 = p;
- newmatch(&tlp->se);
- break;
- }
- }
- }
- Return:
- return sel.p[0].p1>=0;
- }
- void
- newmatch(Rangeset *sp)
- {
- int i;
- if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
- (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
- for(i = 0; i<NSUBEXP; i++)
- sel.p[i] = sp->p[i];
- }
- int
- bexecute(File *f, Posn startp)
- {
- int flag = 0;
- Inst *inst;
- Ilist *tlp;
- Posn p = startp;
- int nnl = 0, ntl;
- int c;
- int wrapped = 0;
- int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
- list[0][0].inst = list[1][0].inst = 0;
- sel.p[0].p1= -1;
- /* Execute machine once for each character, including terminal NUL */
- for(;;--p){
- doloop:
- if((c = filereadc(f, p - 1))==-1){
- switch(wrapped++){
- case 0: /* let loop run one more click */
- case 2:
- break;
- case 1: /* expired; wrap to end */
- if(sel.p[0].p1>=0)
- case 3:
- goto Return;
- list[0][0].inst = list[1][0].inst = 0;
- p = f->nc;
- goto doloop;
- default:
- goto Return;
- }
- }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
- break;
- /* fast check for first char */
- if(startchar && nnl==0 && c!=startchar)
- continue;
- tl = list[flag];
- nl = list[flag^=1];
- nl->inst = 0;
- ntl = nnl;
- nnl = 0;
- if(sel.p[0].p1<0 && (!wrapped || p>startp)){
- /* Add first instruction to this list */
- /* the minus is so the optimizations in addinst work */
- sempty.p[0].p1 = -p;
- if(addinst(tl, bstartinst, &sempty))
- if(++ntl >= NLIST)
- Overflow:
- error(Eoverflow);
- }
- /* Execute machine until this list is empty */
- for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
- Switchstmt:
- switch(inst->type){
- default: /* regular character */
- if(inst->type == c){
- Addinst:
- if(addinst(nl, inst->next, &tlp->se))
- if(++nnl >= NLIST)
- goto Overflow;
- }
- break;
- case LBRA:
- if(inst->subid>=0)
- tlp->se.p[inst->subid].p1 = p;
- inst = inst->next;
- goto Switchstmt;
- case RBRA:
- if(inst->subid >= 0)
- tlp->se.p[inst->subid].p2 = p;
- inst = inst->next;
- goto Switchstmt;
- case ANY:
- if(c != '\n')
- goto Addinst;
- break;
- case BOL:
- if(c=='\n' || p==0){
- Step:
- inst = inst->next;
- goto Switchstmt;
- }
- break;
- case EOL:
- if(p==f->nc || filereadc(f, p)=='\n')
- goto Step;
- break;
- case CCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 0))
- goto Addinst;
- break;
- case NCCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 1))
- goto Addinst;
- break;
- case OR:
- /* evaluate right choice later */
- if(addinst(tl, inst->right, &tlp->se))
- if(++ntl >= NLIST)
- goto Overflow;
- /* efficiency: advance and re-evaluate */
- inst = inst->left;
- goto Switchstmt;
- case END: /* Match! */
- tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
- tlp->se.p[0].p2 = p;
- bnewmatch(&tlp->se);
- break;
- }
- }
- }
- Return:
- return sel.p[0].p1>=0;
- }
- void
- bnewmatch(Rangeset *sp)
- {
- int i;
- if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
- for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
- sel.p[i].p1 = sp->p[i].p2;
- sel.p[i].p2 = sp->p[i].p1;
- }
- }
|