123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050 |
- implement Regx;
- include "common.m";
- sys : Sys;
- utils : Utils;
- textm : Textm;
- FALSE, TRUE, XXX : import Dat;
- NRange : import Dat;
- Range, Rangeset : import Dat;
- error, warning, tgetc, rgetc : import utils;
- Text : import textm;
- init(mods : ref Dat->Mods)
- {
- sys = mods.sys;
- utils = mods.utils;
- textm = mods.textm;
- }
- None : con 0;
- Fore : con '+';
- Back : con '-';
- Char : con 0;
- Line : con 1;
- isaddrc(r : int) : int
- {
- if (utils->strchr("0123456789+-/$.#", r) >= 0)
- return TRUE;
- return FALSE;
- }
- #
- # quite hard: could be almost anything but white space, but we are a little conservative,
- # aiming for regular expressions of alphanumerics and no white space
- #
- isregexc(r : int) : int
- {
- if(r == 0)
- return FALSE;
- if(utils->isalnum(r))
- return TRUE;
- if(utils->strchr("^+-.*?#,;[]()$", r)>=0)
- return TRUE;
- return FALSE;
- }
- number(md: ref Dat->Mntdir, t : ref Text, r : Range, line : int, dir : int, size : int) : (int, Range)
- {
- q0, q1 : int;
- {
- if(size == Char){
- if(dir == Fore)
- line = r.q1+line; # was t.file.buf.nc+line;
- else if(dir == Back){
- if(r.q0==0 && line > 0)
- r.q0 = t.file.buf.nc;
- line = r.q0-line; # was t.file.buf.nc - line;
- }
- if(line<0 || line>t.file.buf.nc)
- raise "e";
- return (TRUE, (line, line));
- }
- (q0, q1) = r;
- case(dir){
- None =>
- q0 = 0;
- q1 = 0;
- while(line>0 && q1<t.file.buf.nc)
- if(t.readc(q1++) == '\n')
- if(--line > 0)
- q0 = q1;
- if(line==1 && t.readc(q1-1)!='\n') # no newline at end - count it
- ;
- else if(line > 0)
- raise "e";
- Fore =>
- if(q1 > 0)
- while(t.readc(q1-1) != '\n')
- q1++;
- q0 = q1;
- while(line>0 && q1<t.file.buf.nc)
- if(t.readc(q1++) == '\n')
- if(--line > 0)
- q0 = q1;
- if(line > 0)
- raise "e";
- Back =>
- if(q0 < t.file.buf.nc)
- while(q0>0 && t.readc(q0-1)!='\n')
- q0--;
- q1 = q0;
- while(line>0 && q0>0){
- if(t.readc(q0-1) == '\n'){
- if(--line >= 0)
- q1 = q0;
- }
- --q0;
- }
- if(line > 0)
- raise "e";
- while(q0>0 && t.readc(q0-1)!='\n')
- --q0;
- }
- return (TRUE, (q0, q1));
- }
- exception{
- * =>
- if(md != nil)
- warning(nil, "address out of range\n");
- return (FALSE, r);
- }
- return (FALSE, r);
- }
- regexp(md: ref Dat->Mntdir, t : ref Text, lim : Range, r : Range, pat : string, dir : int) : (int, Range)
- {
- found : int;
- sel : Rangeset;
- q : int;
- if(pat == nil && rxnull()){
- warning(md, "no previous regular expression");
- return (FALSE, r);
- }
- if(pat == nil || !rxcompile(pat))
- return (FALSE, r);
- if(dir == Back)
- (found, sel) = rxbexecute(t, r.q0);
- else{
- if(lim.q0 < 0)
- q = Dat->Infinity;
- else
- q = lim.q1;
- (found, sel) = rxexecute(t, nil, r.q1, q);
- }
- if(!found && md == nil)
- warning(nil, "no match for regexp\n");
- return (found, sel[0]);
- }
- xgetc(a0 : ref Text, a1 : string, n : int) : int
- {
- if (a0 == nil)
- return rgetc(a1, n);
- return tgetc(a0, n);
- }
- address(md: ref Dat->Mntdir, t : ref Text, lim : Range, ar : Range, a0 : ref Text, a1 : string, q0 : int, q1 : int, eval : int) : (int, int, Range)
- {
- dir, size : int;
- prevc, c, n : int;
- q : int;
- pat : string;
- r, nr : Range;
- r = ar;
- q = q0;
- dir = None;
- size = Line;
- c = 0;
- while(q < q1){
- prevc = c;
- c = xgetc(a0, a1, q++);
- case(c){
- ';' =>
- ar = r;
- if(prevc == 0) # lhs defaults to 0
- r.q0 = 0;
- if(q>=q1 && t!=nil && t.file!=nil) # rhs defaults to $
- r.q1 = t.file.buf.nc;
- else{
- (eval, q, nr) = address(md, t, lim, ar, a0, a1, q, q1, eval);
- r.q1 = nr.q1;
- }
- return (eval, q, r);
- ',' =>
- if(prevc == 0) # lhs defaults to 0
- r.q0 = 0;
- if(q>=q1 && t!=nil && t.file!=nil) # rhs defaults to $
- r.q1 = t.file.buf.nc;
- else{
- (eval, q, nr) = address(md, t, lim, ar, a0, a1, q, q1, eval);
- r.q1 = nr.q1;
- }
- return (eval, q, r);
- '+' or '-' =>
- if(eval && (prevc=='+' || prevc=='-')){
- if((nc := xgetc(a0, a1, q)) != '#' && nc != '/' && nc != '?')
- (eval, r) = number(md, t, r, 1, prevc, Line); # do previous one
- }
- dir = c;
- '.' or '$' =>
- if(q != q0+1)
- return (eval, q-1, r);
- if(eval)
- if(c == '.')
- r = ar;
- else
- r = (t.file.buf.nc, t.file.buf.nc);
- if(q < q1)
- dir = Fore;
- else
- dir = None;
- '#' =>
- if(q==q1 || (c=xgetc(a0, a1, q++))<'0' || '9'<c)
- return (eval, q-1, r);
- size = Char;
- n = c -'0';
- while(q<q1){
- c = xgetc(a0, a1, q++);
- if(c<'0' || '9'<c){
- q--;
- break;
- }
- n = n*10+(c-'0');
- }
- if(eval)
- (eval, r) = number(md, t, r, n, dir, size);
- dir = None;
- size = Line;
- '0' to '9' =>
- n = c -'0';
- while(q<q1){
- c = xgetc(a0, a1, q++);
- if(c<'0' || '9'<c){
- q--;
- break;
- }
- n = n*10+(c-'0');
- }
- if(eval)
- (eval, r) = number(md, t, r, n, dir, size);
- dir = None;
- size = Line;
- '/' =>
- pat = nil;
- break2 := 0; # Ow !
- while(q<q1){
- c = xgetc(a0, a1, q++);
- case(c){
- '\n' =>
- --q;
- break2 = 1;
- '\\' =>
- pat[len pat] = c;
- if(q == q1)
- break2 = 1;
- else
- c = xgetc(a0, a1, q++);
- '/' =>
- break2 = 1;
- }
- if (break2)
- break;
- pat[len pat] = c;
- }
- if(eval)
- (eval, r) = regexp(md, t, lim, r, pat, dir);
- pat = nil;
- dir = None;
- size = Line;
- * =>
- return (eval, q-1, r);
- }
- }
- if(eval && dir != None)
- (eval, r) = number(md, t, r, 1, dir, Line); # do previous one
- return (eval, q, r);
- }
- sel : Rangeset = array[NRange] of Range;
- lastregexp : string;
- # Machine Information
-
- Inst : adt {
- typex : int; # < 16r10000 ==> literal, otherwise action
- # sid : int;
- subid : int;
- class : int;
- # other : cyclic ref Inst;
- right : cyclic ref Inst;
- # left : cyclic ref Inst;
- next : cyclic ref Inst;
- };
- NPROG : con 1024;
- program := array[NPROG] of ref Inst;
- progp : int;
- startinst : ref Inst; # First inst. of program; might not be program[0]
- bstartinst : ref Inst; # same for backwards machine
- Ilist : adt {
- inst : ref Inst; # Instruction of the thread
- se : Rangeset;
- startp : int; # first char of match
- };
- NLIST : con 128;
- thl, nl : array of Ilist; # This list, next list
- listx := array[2] of array of Ilist;
- sempty : Rangeset = array[NRange] of Range;
- #
- # Actions and Tokens
- #
- # 0x100xx are operators, value == precedence
- # 0x200xx are tokens, i.e. operands for operators
- #
- OPERATOR : con 16r10000; # Bitmask of all operators
- START : con 16r10000; # Start, used for marker on stack
- RBRA : con 16r10001; # Right bracket, )
- LBRA : con 16r10002; # Left bracket, (
- OR : con 16r10003; # Alternation, |
- CAT : con 16r10004; # Concatentation, implicit operator
- STAR : con 16r10005; # Closure, *
- PLUS : con 16r10006; # a+ == aa*
- QUEST : con 16r10007; # a? == a|nothing, i.e. 0 or 1 a's
- ANY : con 16r20000; # Any character but newline, .
- NOP : con 16r20001; # No operation, internal use only
- BOL : con 16r20002; # Beginning of line, ^
- EOL : con 16r20003; # End of line, $
- CCLASS : con 16r20004; # Character class, []
- NCCLASS : con 16r20005; # Negated character class, [^]
- END : con 16r20077; # Terminate: match found
- ISATOR : con 16r10000;
- ISAND : con 16r20000;
- # Parser Information
-
- Node : adt {
- first : ref Inst;
- last : ref Inst;
- };
- NSTACK : con 20;
- andstack := array[NSTACK] of ref Node;
- andp : int;
- atorstack := array[NSTACK] of int;
- atorp : int;
- lastwasand : int; # Last token was operand
- cursubid : int;
- subidstack := array[NSTACK] of int;
- subidp : int;
- backwards : int;
- nbra : int;
- exprs : string;
- exprp : int; # pointer to next character in source expression
- DCLASS : con 10; # allocation increment
- nclass : int; # number active
- Nclass : int = 0; # high water mark
- class : array of string;
- negateclass : int;
- nilnode : Node;
- nilinst : Inst;
- rxinit()
- {
- lastregexp = nil;
- for (k := 0; k < NPROG; k++)
- program[k] = ref nilinst;
- for (k = 0; k < NSTACK; k++)
- andstack[k] = ref nilnode;
- for (k = 0; k < 2; k++) {
- listx[k] = array[NLIST] of Ilist;
- for (i := 0; i < NLIST; i++) {
- listx[k][i].inst = nil;
- listx[k][i].startp = 0;
- listx[k][i].se = array[NRange] of Range;
- for (j := 0; j < NRange; j++)
- listx[k][i].se[j].q0 = listx[k][i].se[j].q1 = 0;
- }
- }
- }
- regerror(e : string)
- {
- lastregexp = nil;
- buf := sys->sprint("regexp: %s\n", e);
- warning(nil, buf);
- raise "regerror";
- }
- newinst(t : int) : ref Inst
- {
- if(progp >= NPROG)
- regerror("expression too long");
- program[progp].typex = t;
- program[progp].next = nil; # next was left
- program[progp].right = nil;
- return program[progp++];
- }
- realcompile(s : string) : ref Inst
- {
- token : int;
- {
- startlex(s);
- atorp = 0;
- andp = 0;
- subidp = 0;
- cursubid = 0;
- lastwasand = FALSE;
- # Start with a low priority operator to prime parser
- pushator(START-1);
- while((token=lex()) != END){
- if((token&ISATOR) == OPERATOR)
- operator(token);
- else
- operand(token);
- }
- # Close with a low priority operator
- evaluntil(START);
- # Force END
- operand(END);
- evaluntil(START);
- if(nbra)
- regerror("unmatched `('");
- --andp; # points to first and only operand
- return andstack[andp].first;
- }
- exception{
- "regerror" =>
- return nil;
- }
- return nil;
- }
- rxcompile(r : string) : int
- {
- oprogp : int;
- if(lastregexp == r)
- return TRUE;
- lastregexp = nil;
- for (i := 0; i < nclass; i++)
- class[i] = nil;
- nclass = 0;
- progp = 0;
- backwards = FALSE;
- bstartinst = nil;
- startinst = realcompile(r);
- if(startinst == nil)
- return FALSE;
- optimize(0);
- oprogp = progp;
- backwards = TRUE;
- bstartinst = realcompile(r);
- if(bstartinst == nil)
- return FALSE;
- optimize(oprogp);
- lastregexp = r;
- return TRUE;
- }
- operand(t : int)
- {
- i : ref Inst;
- if(lastwasand)
- operator(CAT); # catenate is implicit
- i = newinst(t);
- if(t == CCLASS){
- if(negateclass)
- i.typex = NCCLASS; # UGH
- i.class = nclass-1; # UGH
- }
- pushand(i, i);
- lastwasand = TRUE;
- }
- operator(t : int)
- {
- if(t==RBRA && --nbra<0)
- regerror("unmatched `)'");
- if(t==LBRA){
- cursubid++; # silently ignored
- nbra++;
- if(lastwasand)
- operator(CAT);
- }else
- evaluntil(t);
- if(t!=RBRA)
- pushator(t);
- lastwasand = FALSE;
- if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
- lastwasand = TRUE; # these look like operands
- }
- pushand(f : ref Inst, l : ref Inst)
- {
- if(andp >= NSTACK)
- error("operand stack overflow");
- andstack[andp].first = f;
- andstack[andp].last = l;
- andp++;
- }
- pushator(t : int)
- {
- if(atorp >= NSTACK)
- error("operator stack overflow");
- atorstack[atorp++]=t;
- if(cursubid >= NRange)
- subidstack[subidp++]= -1;
- else
- subidstack[subidp++]=cursubid;
- }
- popand(op : int) : ref Node
- {
- if(andp <= 0)
- if(op){
- buf := sys->sprint("missing operand for %c", op);
- regerror(buf);
- }else
- regerror("malformed regexp");
- return andstack[--andp];
- }
- popator() : int
- {
- if(atorp <= 0)
- error("operator stack underflow");
- --subidp;
- return atorstack[--atorp];
- }
- evaluntil(pri : int)
- {
- op1, op2 : ref Node;
- inst1, inst2 : ref Inst;
- while(pri==RBRA || atorstack[atorp-1]>=pri){
- case(popator()){
- LBRA =>
- op1 = popand('(');
- inst2 = newinst(RBRA);
- inst2.subid = subidstack[subidp];
- op1.last.next = inst2;
- inst1 = newinst(LBRA);
- inst1.subid = subidstack[subidp];
- inst1.next = op1.first;
- pushand(inst1, inst2);
- return; # must have been RBRA
- OR =>
- op2 = popand('|');
- op1 = popand('|');
- inst2 = newinst(NOP);
- op2.last.next = inst2;
- op1.last.next = inst2;
- inst1 = newinst(OR);
- inst1.right = op1.first;
- inst1.next = op2.first; # next was left
- pushand(inst1, inst2);
- CAT =>
- op2 = popand(0);
- op1 = popand(0);
- if(backwards && op2.first.typex!=END)
- (op1, op2) = (op2, op1);
- op1.last.next = op2.first;
- pushand(op1.first, op2.last);
- STAR =>
- op2 = popand('*');
- inst1 = newinst(OR);
- op2.last.next = inst1;
- inst1.right = op2.first;
- pushand(inst1, inst1);
- PLUS =>
- op2 = popand('+');
- inst1 = newinst(OR);
- op2.last.next = inst1;
- inst1.right = op2.first;
- pushand(op2.first, inst1);
- QUEST =>
- op2 = popand('?');
- inst1 = newinst(OR);
- inst2 = newinst(NOP);
- inst1.next = inst2; # next was left
- inst1.right = op2.first;
- op2.last.next = inst2;
- pushand(inst1, inst2);
- * =>
- error("unknown regexp operator");
- }
- }
- }
- optimize(start : int)
- {
- inst : int;
- target : ref Inst;
- for(inst=start; program[inst].typex!=END; inst++){
- target = program[inst].next;
- while(target.typex == NOP)
- target = target.next;
- program[inst].next = target;
- }
- }
- startlex(s : string)
- {
- exprs = s;
- exprp = 0;
- nbra = 0;
- }
- lex() : int
- {
- c : int;
- if (exprp == len exprs)
- return END;
- c = exprs[exprp++];
- case(c){
- '\\' =>
- if(exprp < len exprs)
- if((c= exprs[exprp++])=='n')
- c='\n';
- '*' =>
- c = STAR;
- '?' =>
- c = QUEST;
- '+' =>
- c = PLUS;
- '|' =>
- c = OR;
- '.' =>
- c = ANY;
- '(' =>
- c = LBRA;
- ')' =>
- c = RBRA;
- '^' =>
- c = BOL;
- '$' =>
- c = EOL;
- '[' =>
- c = CCLASS;
- bldcclass();
- }
- return c;
- }
- nextrec() : int
- {
- if(exprp == len exprs || (exprp == len exprs-1 && exprs[exprp]=='\\'))
- regerror("malformed `[]'");
- if(exprs[exprp] == '\\'){
- exprp++;
- if(exprs[exprp]=='n'){
- exprp++;
- return '\n';
- }
- return exprs[exprp++] | 16r10000;
- }
- return exprs[exprp++];
- }
- bldcclass()
- {
- c1, c2 : int;
- classp : string;
- # we have already seen the '['
- if(exprp < len exprs && exprs[exprp] == '^'){
- classp[len classp] = '\n'; # don't match newline in negate case
- negateclass = TRUE;
- exprp++;
- }else
- negateclass = FALSE;
- while((c1 = nextrec()) != ']'){
- if(c1 == '-'){
- classp = nil;
- regerror("malformed `[]'");
- }
- if(exprp < len exprs && exprs[exprp] == '-'){
- exprp++; # eat '-'
- if((c2 = nextrec()) == ']') {
- classp = nil;
- regerror("malformed '[]'");
- }
- classp[len classp] = 16rFFFF;
- classp[len classp] = c1;
- classp[len classp] = c2;
- }else
- classp[len classp] = c1;
- }
- if(nclass == Nclass){
- Nclass += DCLASS;
- oc := class;
- class = array[Nclass] of string;
- if (oc != nil) {
- class[0:] = oc[0:Nclass-DCLASS];
- oc = nil;
- }
- }
- class[nclass++] = classp;
- }
- classmatch(classno : int, c : int, negate : int) : int
- {
- p : string;
- p = class[classno];
- for (i := 0; i < len p; ) {
- if(p[i] == 16rFFFF){
- if(p[i+1]<=c && c<=p[i+2])
- return !negate;
- i += 3;
- }else if(p[i++] == c)
- return !negate;
- }
- return negate;
- }
- #
- # Note optimization in addinst:
- # *l must be pending when addinst called; if *l has been looked
- # at already, the optimization is a bug.
- #
- addinst(l : array of Ilist, inst : ref Inst, sep : Rangeset)
- {
- p : int;
- for(p = 0; l[p].inst != nil; p++){
- if(l[p].inst==inst){
- if(sep[0].q0 < l[p].se[0].q0)
- l[p].se[0:] = sep[0:NRange]; # this would be bug
- return; # It's already there
- }
- }
- l[p].inst = inst;
- l[p].se[0:]= sep[0:NRange];
- l[p+1].inst = nil;
- }
- rxnull() : int
- {
- return startinst==nil || bstartinst==nil;
- }
- OVERFLOW : con "overflow";
- # either t!=nil or r!=nil, and we match the string in the appropriate place
- rxexecute(t : ref Text, r: string, startp : int, eof : int) : (int, Rangeset)
- {
- flag : int;
- inst : ref Inst;
- tlp : int;
- p : int;
- nnl, ntl : int;
- nc, c : int;
- wrapped : int;
- startchar : int;
- flag = 0;
- p = startp;
- startchar = 0;
- wrapped = 0;
- nnl = 0;
- if(startinst.typex<OPERATOR)
- startchar = startinst.typex;
- listx[0][0].inst = listx[1][0].inst = nil;
- sel[0].q0 = -1;
-
- {
- if(t != nil)
- nc = t.file.buf.nc;
- else
- nc = len r;
- # Execute machine once for each character
- for(;;p++){
- if(p>=eof || p>=nc){
- case(wrapped++){
- 0 or 2 => # let loop run one more click
- ;
- 1 => # expired; wrap to beginning
- if(sel[0].q0>=0 || eof!=Dat->Infinity)
- return (sel[0].q0>=0, sel);
- listx[0][0].inst = listx[1][0].inst = nil;
- p = -1;
- continue;
- * =>
- return (sel[0].q0>=0, sel);
- }
- c = 0;
- }else{
- if(((wrapped && p>=startp) || sel[0].q0>0) && nnl==0)
- break;
- if(t != nil)
- c = t.readc(p);
- else
- c = r[p];
- }
- # fast check for first char
- if(startchar && nnl==0 && c!=startchar)
- continue;
- thl = listx[flag];
- nl = listx[flag^=1];
- nl[0].inst = nil;
- ntl = nnl;
- nnl = 0;
- if(sel[0].q0<0 && (!wrapped || p<startp || startp==eof)){
- # Add first instruction to this list
- if(++ntl >= NLIST)
- raise OVERFLOW;
- sempty[0].q0 = p;
- addinst(thl, startinst, sempty);
- }
- # Execute machine until this list is empty
- tlp = 0;
- inst = thl[0].inst;
- while(inst != nil){ # assignment =
- case(inst.typex){
- LBRA =>
- if(inst.subid>=0)
- thl[tlp].se[inst.subid].q0 = p;
- inst = inst.next;
- continue;
- RBRA =>
- if(inst.subid>=0)
- thl[tlp].se[inst.subid].q1 = p;
- inst = inst.next;
- continue;
- ANY =>
- if(c!='\n') {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- BOL =>
- if(p==0 || (t != nil && t.readc(p-1)=='\n') || (r != nil && r[p-1] == '\n')){
- inst = inst.next;
- continue;
- }
- EOL =>
- if(c == '\n') {
- inst = inst.next;
- continue;
- }
- CCLASS =>
- if(c>=0 && classmatch(inst.class, c, 0)) {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- NCCLASS =>
- if(c>=0 && classmatch(inst.class, c, 1)) {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- OR =>
- # evaluate right choice later
- if(++ntl >= NLIST)
- raise OVERFLOW;
- addinst(thl[tlp:], inst.right, thl[tlp].se);
- # efficiency: advance and re-evaluate
- inst = inst.next; # next was left
- continue;
- END => # Match!
- thl[tlp].se[0].q1 = p;
- newmatch(thl[tlp].se);
- * => # regular character
- if(inst.typex==c){
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- }
- tlp++;
- inst = thl[tlp].inst;
- }
- }
- return (sel[0].q0>=0, sel);
- }
- exception{
- OVERFLOW =>
- error("regexp list overflow");
- sel[0].q0 = -1;
- return (0, sel);
- }
- return (0, sel);
- }
- newmatch(sp : Rangeset)
- {
- if(sel[0].q0<0 || sp[0].q0<sel[0].q0 ||
- (sp[0].q0==sel[0].q0 && sp[0].q1>sel[0].q1))
- sel[0:] = sp[0:NRange];
- }
- rxbexecute(t : ref Text, startp : int) : (int, Rangeset)
- {
- flag : int;
- inst : ref Inst;
- tlp : int;
- p : int;
- nnl, ntl : int;
- c : int;
- wrapped : int;
- startchar : int;
- flag = 0;
- nnl = 0;
- wrapped = 0;
- p = startp;
- startchar = 0;
- if(bstartinst.typex<OPERATOR)
- startchar = bstartinst.typex;
- listx[0][0].inst = listx[1][0].inst = nil;
- sel[0].q0= -1;
-
- {
- # Execute machine once for each character, including terminal NUL
- for(;;--p){
- if(p <= 0){
- case(wrapped++){
- 0 or 2 => # let loop run one more click
- ;
- 1 => # expired; wrap to end
- if(sel[0].q0>=0)
- return (sel[0].q0>=0, sel);
- listx[0][0].inst = listx[1][0].inst = nil;
- p = t.file.buf.nc+1;
- continue;
- 3 or * =>
- return (sel[0].q0>=0, sel);
- }
- c = 0;
- }else{
- if(((wrapped && p<=startp) || sel[0].q0>0) && nnl==0)
- break;
- c = t.readc(p-1);
- }
- # fast check for first char
- if(startchar && nnl==0 && c!=startchar)
- continue;
- thl = listx[flag];
- nl = listx[flag^=1];
- nl[0].inst = nil;
- ntl = nnl;
- nnl = 0;
- if(sel[0].q0<0 && (!wrapped || p>startp)){
- # Add first instruction to this list
- if(++ntl >= NLIST)
- raise OVERFLOW;
- # the minus is so the optimizations in addinst work
- sempty[0].q0 = -p;
- addinst(thl, bstartinst, sempty);
- }
- # Execute machine until this list is empty
- tlp = 0;
- inst = thl[0].inst;
- while(inst != nil){ # assignment =
- case(inst.typex){
- LBRA =>
- if(inst.subid>=0)
- thl[tlp].se[inst.subid].q0 = p;
- inst = inst.next;
- continue;
- RBRA =>
- if(inst.subid >= 0)
- thl[tlp].se[inst.subid].q1 = p;
- inst = inst.next;
- continue;
- ANY =>
- if(c != '\n') {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- BOL =>
- if(c=='\n' || p==0){
- inst = inst.next;
- continue;
- }
- EOL =>
- if(p<t.file.buf.nc && t.readc(p)=='\n') {
- inst = inst.next;
- continue;
- }
- CCLASS =>
- if(c>0 && classmatch(inst.class, c, 0)) {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- NCCLASS =>
- if(c>0 && classmatch(inst.class, c, 1)) {
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- OR =>
- # evaluate right choice later
- if(++ntl >= NLIST)
- raise OVERFLOW;
- addinst(thl[tlp:], inst.right, thl[tlp].se);
- # efficiency: advance and re-evaluate
- inst = inst.next; # next was left
- continue;
- END => # Match!
- thl[tlp].se[0].q0 = -thl[tlp].se[0].q0; # minus sign
- thl[tlp].se[0].q1 = p;
- bnewmatch(thl[tlp].se);
- * => # regular character
- if(inst.typex == c){
- if(++nnl >= NLIST)
- raise OVERFLOW;
- addinst(nl, inst.next, thl[tlp].se);
- }
- }
- tlp++;
- inst = thl[tlp].inst;
- }
- }
- return (sel[0].q0>=0, sel);
- }
- exception{
- OVERFLOW =>
- error("regexp list overflow");
- sel[0].q0 = -1;
- return (0, sel);
- }
- return (0, sel);
- }
- bnewmatch(sp : Rangeset)
- {
- i : int;
- if(sel[0].q0<0 || sp[0].q0>sel[0].q1 || (sp[0].q0==sel[0].q1 && sp[0].q1<sel[0].q0))
- for(i = 0; i<NRange; i++){ # note the reversal; q0<=q1
- sel[i].q0 = sp[i].q1;
- sel[i].q1 = sp[i].q0;
- }
- }
|