123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849 |
- /*
- * This file is part of the UCB release of Plan 9. It is subject to the license
- * terms in the LICENSE file found in the top-level directory of this
- * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
- * part of the UCB release of Plan 9, including this file, may be copied,
- * modified, propagated, or distributed except according to the terms contained
- * in the LICENSE file.
- */
- #include <u.h>
- #include <libc.h>
- #include <draw.h>
- #include <thread.h>
- #include <cursor.h>
- #include <mouse.h>
- #include <keyboard.h>
- #include <frame.h>
- #include <fcall.h>
- #include <plumb.h>
- #include "dat.h"
- #include "fns.h"
- Rangeset sel;
- Rune *lastregexp;
- /*
- * Machine Information
- */
- typedef struct Inst Inst;
- struct Inst
- {
- uint type; /* <= Runemax+1 ==> literal, otherwise action */
- union {
- int sid;
- int subid;
- int class;
- Inst *other;
- Inst *right;
- };
- union{
- Inst *left;
- Inst *next;
- };
- };
- #define NPROG 1024
- Inst program[NPROG];
- Inst *progp;
- Inst *startinst; /* First inst. of program; might not be program[0] */
- Inst *bstartinst; /* same for backwards machine */
- Channel *rechan; /* chan(Inst*) */
- typedef struct Ilist Ilist;
- struct Ilist
- {
- Inst *inst; /* Instruction of the thread */
- Rangeset se;
- uint startp; /* first char of match */
- };
- #define NLIST 127
- Ilist *tl, *nl; /* This list, next list */
- Ilist list[2][NLIST+1]; /* +1 for trailing null */
- static Rangeset sempty;
- /*
- * Actions and Tokens
- *
- * 0x100xx are operators, value == precedence
- * 0x200xx are tokens, i.e. operands for operators
- */
- enum {
- OPERATOR = Runemask+1, /* Bitmask of all operators */
- START = OPERATOR, /* Start, used for marker on stack */
- RBRA, /* Right bracket, ) */
- LBRA, /* Left bracket, ( */
- OR, /* Alternation, | */
- CAT, /* Concatentation, implicit operator */
- STAR, /* Closure, * */
- PLUS, /* a+ == aa* */
- QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
- ANY = OPERATOR<<1, /* Any character but newline, . */
- NOP, /* No operation, internal use only */
- BOL, /* Beginning of line, ^ */
- EOL, /* End of line, $ */
- CCLASS, /* Character class, [] */
- NCCLASS, /* Negated character class, [^] */
- END, /* Terminate: match found */
- ISATOR = OPERATOR,
- ISAND = OPERATOR<<1,
- };
- /*
- * Parser Information
- */
- typedef struct Node Node;
- struct Node
- {
- Inst *first;
- Inst *last;
- };
- #define NSTACK 20
- Node andstack[NSTACK];
- Node *andp;
- int atorstack[NSTACK];
- int *atorp;
- int lastwasand; /* Last token was operand */
- int cursubid;
- int subidstack[NSTACK];
- int *subidp;
- int backwards;
- int nbra;
- Rune *exprp; /* pointer to next character in source expression */
- #define DCLASS 10 /* allocation increment */
- int nclass; /* number active */
- int Nclass; /* high water mark */
- Rune **class;
- int negateclass;
- int addinst(Ilist *l, Inst *inst, Rangeset *sep);
- void newmatch(Rangeset*);
- void bnewmatch(Rangeset*);
- void pushand(Inst*, Inst*);
- void pushator(int);
- Node *popand(int);
- int popator(void);
- void startlex(Rune*);
- int lex(void);
- void operator(int);
- void operand(int);
- void evaluntil(int);
- void optimize(Inst*);
- void bldcclass(void);
- void
- rxinit(void)
- {
- rechan = chancreate(sizeof(Inst*), 0);
- lastregexp = runemalloc(1);
- }
- void
- regerror(char *e)
- {
- lastregexp[0] = 0;
- warning(nil, "regexp: %s\n", e);
- sendp(rechan, nil);
- threadexits(nil);
- }
- Inst *
- newinst(int t)
- {
- if(progp >= &program[NPROG])
- regerror("expression too long");
- progp->type = t;
- progp->left = nil;
- progp->right = nil;
- return progp++;
- }
- void
- realcompile(void *arg)
- {
- int token;
- Rune *s;
- threadsetname("regcomp");
- s = arg;
- startlex(s);
- atorp = atorstack;
- andp = andstack;
- subidp = subidstack;
- cursubid = 0;
- lastwasand = FALSE;
- /* Start with a low priority operator to prime parser */
- pushator(START-1);
- while((token=lex()) != END){
- if((token&ISATOR) == OPERATOR)
- operator(token);
- else
- operand(token);
- }
- /* Close with a low priority operator */
- evaluntil(START);
- /* Force END */
- operand(END);
- evaluntil(START);
- if(nbra)
- regerror("unmatched `('");
- --andp; /* points to first and only operand */
- sendp(rechan, andp->first);
- threadexits(nil);
- }
- /* r is null terminated */
- int
- rxcompile(Rune *r)
- {
- int i, nr;
- Inst *oprogp;
- nr = runestrlen(r)+1;
- if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
- return TRUE;
- lastregexp[0] = 0;
- for(i=0; i<nclass; i++)
- free(class[i]);
- nclass = 0;
- progp = program;
- backwards = FALSE;
- bstartinst = nil;
- threadcreate(realcompile, r, STACK);
- startinst = recvp(rechan);
- if(startinst == nil)
- return FALSE;
- optimize(program);
- oprogp = progp;
- backwards = TRUE;
- threadcreate(realcompile, r, STACK);
- bstartinst = recvp(rechan);
- if(bstartinst == nil)
- return FALSE;
- optimize(oprogp);
- lastregexp = runerealloc(lastregexp, nr);
- runemove(lastregexp, r, nr);
- return TRUE;
- }
- void
- operand(int t)
- {
- Inst *i;
- if(lastwasand)
- operator(CAT); /* catenate is implicit */
- i = newinst(t);
- if(t == CCLASS){
- if(negateclass)
- i->type = NCCLASS; /* UGH */
- i->class = nclass-1; /* UGH */
- }
- pushand(i, i);
- lastwasand = TRUE;
- }
- void
- operator(int t)
- {
- if(t==RBRA && --nbra<0)
- regerror("unmatched `)'");
- if(t==LBRA){
- cursubid++; /* silently ignored */
- nbra++;
- if(lastwasand)
- operator(CAT);
- }else
- evaluntil(t);
- if(t!=RBRA)
- pushator(t);
- lastwasand = FALSE;
- if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
- lastwasand = TRUE; /* these look like operands */
- }
- void
- pushand(Inst *f, Inst *l)
- {
- if(andp >= &andstack[NSTACK])
- error("operand stack overflow");
- andp->first = f;
- andp->last = l;
- andp++;
- }
- void
- pushator(int t)
- {
- if(atorp >= &atorstack[NSTACK])
- error("operator stack overflow");
- *atorp++=t;
- if(cursubid >= NRange)
- *subidp++= -1;
- else
- *subidp++=cursubid;
- }
- Node *
- popand(int op)
- {
- char buf[64];
- if(andp <= &andstack[0]){
- if(op){
- sprint(buf, "missing operand for %c", op);
- regerror(buf);
- }else
- regerror("malformed regexp");
- }
- return --andp;
- }
- int
- popator()
- {
- if(atorp <= &atorstack[0])
- error("operator stack underflow");
- --subidp;
- return *--atorp;
- }
- void
- evaluntil(int pri)
- {
- Node *op1, *op2, *t;
- Inst *inst1, *inst2;
- while(pri==RBRA || atorp[-1]>=pri){
- switch(popator()){
- case LBRA:
- op1 = popand('(');
- inst2 = newinst(RBRA);
- inst2->subid = *subidp;
- op1->last->next = inst2;
- inst1 = newinst(LBRA);
- inst1->subid = *subidp;
- inst1->next = op1->first;
- pushand(inst1, inst2);
- return; /* must have been RBRA */
- default:
- error("unknown regexp operator");
- break;
- case OR:
- op2 = popand('|');
- op1 = popand('|');
- inst2 = newinst(NOP);
- op2->last->next = inst2;
- op1->last->next = inst2;
- inst1 = newinst(OR);
- inst1->right = op1->first;
- inst1->left = op2->first;
- pushand(inst1, inst2);
- break;
- case CAT:
- op2 = popand(0);
- op1 = popand(0);
- if(backwards && op2->first->type!=END){
- t = op1;
- op1 = op2;
- op2 = t;
- }
- op1->last->next = op2->first;
- pushand(op1->first, op2->last);
- break;
- case STAR:
- op2 = popand('*');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(inst1, inst1);
- break;
- case PLUS:
- op2 = popand('+');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(op2->first, inst1);
- break;
- case QUEST:
- op2 = popand('?');
- inst1 = newinst(OR);
- inst2 = newinst(NOP);
- inst1->left = inst2;
- inst1->right = op2->first;
- op2->last->next = inst2;
- pushand(inst1, inst2);
- break;
- }
- }
- }
- void
- optimize(Inst *start)
- {
- Inst *inst, *target;
- for(inst=start; inst->type!=END; inst++){
- target = inst->next;
- while(target->type == NOP)
- target = target->next;
- inst->next = target;
- }
- }
- void
- startlex(Rune *s)
- {
- exprp = s;
- nbra = 0;
- }
- int
- lex(void){
- int c;
- c = *exprp++;
- switch(c){
- case '\\':
- if(*exprp)
- if((c= *exprp++)=='n')
- c='\n';
- break;
- case 0:
- c = END;
- --exprp; /* In case we come here again */
- break;
- case '*':
- c = STAR;
- break;
- case '?':
- c = QUEST;
- break;
- case '+':
- c = PLUS;
- break;
- case '|':
- c = OR;
- break;
- case '.':
- c = ANY;
- break;
- case '(':
- c = LBRA;
- break;
- case ')':
- c = RBRA;
- break;
- case '^':
- c = BOL;
- break;
- case '$':
- c = EOL;
- break;
- case '[':
- c = CCLASS;
- bldcclass();
- break;
- }
- return c;
- }
- int
- nextrec(void)
- {
- if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
- regerror("malformed `[]'");
- if(exprp[0] == '\\'){
- exprp++;
- if(*exprp=='n'){
- exprp++;
- return '\n';
- }
- return *exprp++|(Runemax+1);
- }
- return *exprp++;
- }
- void
- bldcclass(void)
- {
- int c1, c2, n, na;
- Rune *classp;
- classp = runemalloc(DCLASS);
- n = 0;
- na = DCLASS;
- /* we have already seen the '[' */
- if(*exprp == '^'){
- classp[n++] = '\n'; /* don't match newline in negate case */
- negateclass = TRUE;
- exprp++;
- }else
- negateclass = FALSE;
- while((c1 = nextrec()) != ']'){
- if(c1 == '-'){
- Error:
- free(classp);
- regerror("malformed `[]'");
- }
- if(n+4 >= na){ /* 3 runes plus NUL */
- na += DCLASS;
- classp = runerealloc(classp, na);
- }
- if(*exprp == '-'){
- exprp++; /* eat '-' */
- if((c2 = nextrec()) == ']')
- goto Error;
- classp[n+0] = Runemax;
- classp[n+1] = c1;
- classp[n+2] = c2;
- n += 3;
- }else
- classp[n++] = c1;
- }
- classp[n] = 0;
- if(nclass == Nclass){
- Nclass += DCLASS;
- class = realloc(class, Nclass*sizeof(Rune*));
- }
- class[nclass++] = classp;
- }
- int
- classmatch(int classno, int c, int negate)
- {
- Rune *p;
- p = class[classno];
- while(*p){
- if(*p == Runemax){
- if(p[1]<=c && c<=p[2])
- return !negate;
- p += 3;
- }else if(*p++ == c)
- return !negate;
- }
- return negate;
- }
- /*
- * Note optimization in addinst:
- * *l must be pending when addinst called; if *l has been looked
- * at already, the optimization is a bug.
- */
- int
- addinst(Ilist *l, Inst *inst, Rangeset *sep)
- {
- Ilist *p;
- for(p = l; p->inst; p++){
- if(p->inst==inst){
- if((sep)->r[0].q0 < p->se.r[0].q0)
- p->se= *sep; /* this would be bug */
- return 0; /* It's already there */
- }
- }
- p->inst = inst;
- p->se= *sep;
- (p+1)->inst = nil;
- return 1;
- }
- int
- rxnull(void)
- {
- return startinst==nil || bstartinst==nil;
- }
- /* either t!=nil or r!=nil, and we match the string in the appropriate place */
- int
- rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
- {
- int flag;
- Inst *inst;
- Ilist *tlp;
- uint p;
- int nnl, ntl;
- int nc, c;
- int wrapped;
- int startchar;
- flag = 0;
- p = startp;
- startchar = 0;
- wrapped = 0;
- nnl = 0;
- if(startinst->type<OPERATOR)
- startchar = startinst->type;
- list[0][0].inst = list[1][0].inst = nil;
- sel.r[0].q0 = -1;
- if(t != nil)
- nc = t->file->Buffer.nc;
- else
- nc = runestrlen(r);
- /* Execute machine once for each character */
- for(;;p++){
- doloop:
- if(p>=eof || p>=nc){
- switch(wrapped++){
- case 0: /* let loop run one more click */
- case 2:
- break;
- case 1: /* expired; wrap to beginning */
- if(sel.r[0].q0>=0 || eof!=Infinity)
- goto Return;
- list[0][0].inst = list[1][0].inst = nil;
- p = 0;
- goto doloop;
- default:
- goto Return;
- }
- c = 0;
- }else{
- if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
- break;
- if(t != nil)
- c = textreadc(t, p);
- else
- c = r[p];
- }
- /* fast check for first char */
- if(startchar && nnl==0 && c!=startchar)
- continue;
- tl = list[flag];
- nl = list[flag^=1];
- nl->inst = nil;
- ntl = nnl;
- nnl = 0;
- if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
- /* Add first instruction to this list */
- sempty.r[0].q0 = p;
- if(addinst(tl, startinst, &sempty))
- if(++ntl >= NLIST){
- Overflow:
- warning(nil, "regexp list overflow\n");
- sel.r[0].q0 = -1;
- goto Return;
- }
- }
- /* Execute machine until this list is empty */
- for(tlp = tl; (inst = tlp->inst) != nil; tlp++){
- Switchstmt:
- switch(inst->type){
- default: /* regular character */
- if(inst->type==c){
- Addinst:
- if(addinst(nl, inst->next, &tlp->se))
- if(++nnl >= NLIST)
- goto Overflow;
- }
- break;
- case LBRA:
- if(inst->subid>=0)
- tlp->se.r[inst->subid].q0 = p;
- inst = inst->next;
- goto Switchstmt;
- case RBRA:
- if(inst->subid>=0)
- tlp->se.r[inst->subid].q1 = p;
- inst = inst->next;
- goto Switchstmt;
- case ANY:
- if(c!='\n')
- goto Addinst;
- break;
- case BOL:
- if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
- Step:
- inst = inst->next;
- goto Switchstmt;
- }
- break;
- case EOL:
- if(c == '\n')
- goto Step;
- break;
- case CCLASS:
- if(c>=0 && classmatch(inst->class, c, 0))
- goto Addinst;
- break;
- case NCCLASS:
- if(c>=0 && classmatch(inst->class, c, 1))
- goto Addinst;
- break;
- case OR:
- /* evaluate right choice later */
- if(addinst(tlp, inst->right, &tlp->se))
- if(++ntl >= NLIST)
- goto Overflow;
- /* efficiency: advance and re-evaluate */
- inst = inst->left;
- goto Switchstmt;
- case END: /* Match! */
- tlp->se.r[0].q1 = p;
- newmatch(&tlp->se);
- break;
- }
- }
- }
- Return:
- *rp = sel;
- return sel.r[0].q0 >= 0;
- }
- void
- newmatch(Rangeset *sp)
- {
- if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
- (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
- sel = *sp;
- }
- int
- rxbexecute(Text *t, uint startp, Rangeset *rp)
- {
- int flag;
- Inst *inst;
- Ilist *tlp;
- int p;
- int nnl, ntl;
- int c;
- int wrapped;
- int startchar;
- flag = 0;
- nnl = 0;
- wrapped = 0;
- p = startp;
- startchar = 0;
- if(bstartinst->type<OPERATOR)
- startchar = bstartinst->type;
- list[0][0].inst = list[1][0].inst = nil;
- sel.r[0].q0= -1;
- /* Execute machine once for each character, including terminal NUL */
- for(;;--p){
- doloop:
- if(p <= 0){
- switch(wrapped++){
- case 0: /* let loop run one more click */
- case 2:
- break;
- case 1: /* expired; wrap to end */
- if(sel.r[0].q0>=0)
- goto Return;
- list[0][0].inst = list[1][0].inst = nil;
- p = t->file->Buffer.nc;
- goto doloop;
- case 3:
- default:
- goto Return;
- }
- c = 0;
- }else{
- if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
- break;
- c = textreadc(t, p-1);
- }
- /* fast check for first char */
- if(startchar && nnl==0 && c!=startchar)
- continue;
- tl = list[flag];
- nl = list[flag^=1];
- nl->inst = nil;
- ntl = nnl;
- nnl = 0;
- if(sel.r[0].q0<0 && (!wrapped || p>startp)){
- /* Add first instruction to this list */
- /* the minus is so the optimizations in addinst work */
- sempty.r[0].q0 = -p;
- if(addinst(tl, bstartinst, &sempty))
- if(++ntl >= NLIST){
- Overflow:
- warning(nil, "regexp list overflow\n");
- sel.r[0].q0 = -1;
- goto Return;
- }
- }
- /* Execute machine until this list is empty */
- for(tlp = tl; (inst = tlp->inst) != nil; tlp++){
- Switchstmt:
- switch(inst->type){
- default: /* regular character */
- if(inst->type == c){
- Addinst:
- if(addinst(nl, inst->next, &tlp->se))
- if(++nnl >= NLIST)
- goto Overflow;
- }
- break;
- case LBRA:
- if(inst->subid>=0)
- tlp->se.r[inst->subid].q0 = p;
- inst = inst->next;
- goto Switchstmt;
- case RBRA:
- if(inst->subid >= 0)
- tlp->se.r[inst->subid].q1 = p;
- inst = inst->next;
- goto Switchstmt;
- case ANY:
- if(c != '\n')
- goto Addinst;
- break;
- case BOL:
- if(c=='\n' || p==0){
- Step:
- inst = inst->next;
- goto Switchstmt;
- }
- break;
- case EOL:
- if(p<t->file->Buffer.nc && textreadc(t, p)=='\n')
- goto Step;
- break;
- case CCLASS:
- if(c>0 && classmatch(inst->class, c, 0))
- goto Addinst;
- break;
- case NCCLASS:
- if(c>0 && classmatch(inst->class, c, 1))
- goto Addinst;
- break;
- case OR:
- /* evaluate right choice later */
- if(addinst(tl, inst->right, &tlp->se))
- if(++ntl >= NLIST)
- goto Overflow;
- /* efficiency: advance and re-evaluate */
- inst = inst->left;
- goto Switchstmt;
- case END: /* Match! */
- tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
- tlp->se.r[0].q1 = p;
- bnewmatch(&tlp->se);
- break;
- }
- }
- }
- Return:
- *rp = sel;
- return sel.r[0].q0 >= 0;
- }
- void
- bnewmatch(Rangeset *sp)
- {
- int i;
- if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
- for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
- sel.r[i].q0 = sp->r[i].q1;
- sel.r[i].q1 = sp->r[i].q0;
- }
- }
|