regx.b 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050
  1. implement Regx;
  2. include "common.m";
  3. sys : Sys;
  4. utils : Utils;
  5. textm : Textm;
  6. FALSE, TRUE, XXX : import Dat;
  7. NRange : import Dat;
  8. Range, Rangeset : import Dat;
  9. error, warning, tgetc, rgetc : import utils;
  10. Text : import textm;
  11. init(mods : ref Dat->Mods)
  12. {
  13. sys = mods.sys;
  14. utils = mods.utils;
  15. textm = mods.textm;
  16. }
  17. None : con 0;
  18. Fore : con '+';
  19. Back : con '-';
  20. Char : con 0;
  21. Line : con 1;
  22. isaddrc(r : int) : int
  23. {
  24. if (utils->strchr("0123456789+-/$.#", r) >= 0)
  25. return TRUE;
  26. return FALSE;
  27. }
  28. #
  29. # quite hard: could be almost anything but white space, but we are a little conservative,
  30. # aiming for regular expressions of alphanumerics and no white space
  31. #
  32. isregexc(r : int) : int
  33. {
  34. if(r == 0)
  35. return FALSE;
  36. if(utils->isalnum(r))
  37. return TRUE;
  38. if(utils->strchr("^+-.*?#,;[]()$", r)>=0)
  39. return TRUE;
  40. return FALSE;
  41. }
  42. number(md: ref Dat->Mntdir, t : ref Text, r : Range, line : int, dir : int, size : int) : (int, Range)
  43. {
  44. q0, q1 : int;
  45. {
  46. if(size == Char){
  47. if(dir == Fore)
  48. line = r.q1+line; # was t.file.buf.nc+line;
  49. else if(dir == Back){
  50. if(r.q0==0 && line > 0)
  51. r.q0 = t.file.buf.nc;
  52. line = r.q0-line; # was t.file.buf.nc - line;
  53. }
  54. if(line<0 || line>t.file.buf.nc)
  55. raise "e";
  56. return (TRUE, (line, line));
  57. }
  58. (q0, q1) = r;
  59. case(dir){
  60. None =>
  61. q0 = 0;
  62. q1 = 0;
  63. while(line>0 && q1<t.file.buf.nc)
  64. if(t.readc(q1++) == '\n')
  65. if(--line > 0)
  66. q0 = q1;
  67. if(line==1 && t.readc(q1-1)!='\n') # no newline at end - count it
  68. ;
  69. else if(line > 0)
  70. raise "e";
  71. Fore =>
  72. if(q1 > 0)
  73. while(t.readc(q1-1) != '\n')
  74. q1++;
  75. q0 = q1;
  76. while(line>0 && q1<t.file.buf.nc)
  77. if(t.readc(q1++) == '\n')
  78. if(--line > 0)
  79. q0 = q1;
  80. if(line > 0)
  81. raise "e";
  82. Back =>
  83. if(q0 < t.file.buf.nc)
  84. while(q0>0 && t.readc(q0-1)!='\n')
  85. q0--;
  86. q1 = q0;
  87. while(line>0 && q0>0){
  88. if(t.readc(q0-1) == '\n'){
  89. if(--line >= 0)
  90. q1 = q0;
  91. }
  92. --q0;
  93. }
  94. if(line > 0)
  95. raise "e";
  96. while(q0>0 && t.readc(q0-1)!='\n')
  97. --q0;
  98. }
  99. return (TRUE, (q0, q1));
  100. }
  101. exception{
  102. * =>
  103. if(md != nil)
  104. warning(nil, "address out of range\n");
  105. return (FALSE, r);
  106. }
  107. return (FALSE, r);
  108. }
  109. regexp(md: ref Dat->Mntdir, t : ref Text, lim : Range, r : Range, pat : string, dir : int) : (int, Range)
  110. {
  111. found : int;
  112. sel : Rangeset;
  113. q : int;
  114. if(pat == nil && rxnull()){
  115. warning(md, "no previous regular expression");
  116. return (FALSE, r);
  117. }
  118. if(pat == nil || !rxcompile(pat))
  119. return (FALSE, r);
  120. if(dir == Back)
  121. (found, sel) = rxbexecute(t, r.q0);
  122. else{
  123. if(lim.q0 < 0)
  124. q = Dat->Infinity;
  125. else
  126. q = lim.q1;
  127. (found, sel) = rxexecute(t, nil, r.q1, q);
  128. }
  129. if(!found && md == nil)
  130. warning(nil, "no match for regexp\n");
  131. return (found, sel[0]);
  132. }
  133. xgetc(a0 : ref Text, a1 : string, n : int) : int
  134. {
  135. if (a0 == nil)
  136. return rgetc(a1, n);
  137. return tgetc(a0, n);
  138. }
  139. address(md: ref Dat->Mntdir, t : ref Text, lim : Range, ar : Range, a0 : ref Text, a1 : string, q0 : int, q1 : int, eval : int) : (int, int, Range)
  140. {
  141. dir, size : int;
  142. prevc, c, n : int;
  143. q : int;
  144. pat : string;
  145. r, nr : Range;
  146. r = ar;
  147. q = q0;
  148. dir = None;
  149. size = Line;
  150. c = 0;
  151. while(q < q1){
  152. prevc = c;
  153. c = xgetc(a0, a1, q++);
  154. case(c){
  155. ';' =>
  156. ar = r;
  157. if(prevc == 0) # lhs defaults to 0
  158. r.q0 = 0;
  159. if(q>=q1 && t!=nil && t.file!=nil) # rhs defaults to $
  160. r.q1 = t.file.buf.nc;
  161. else{
  162. (eval, q, nr) = address(md, t, lim, ar, a0, a1, q, q1, eval);
  163. r.q1 = nr.q1;
  164. }
  165. return (eval, q, r);
  166. ',' =>
  167. if(prevc == 0) # lhs defaults to 0
  168. r.q0 = 0;
  169. if(q>=q1 && t!=nil && t.file!=nil) # rhs defaults to $
  170. r.q1 = t.file.buf.nc;
  171. else{
  172. (eval, q, nr) = address(md, t, lim, ar, a0, a1, q, q1, eval);
  173. r.q1 = nr.q1;
  174. }
  175. return (eval, q, r);
  176. '+' or '-' =>
  177. if(eval && (prevc=='+' || prevc=='-')){
  178. if((nc := xgetc(a0, a1, q)) != '#' && nc != '/' && nc != '?')
  179. (eval, r) = number(md, t, r, 1, prevc, Line); # do previous one
  180. }
  181. dir = c;
  182. '.' or '$' =>
  183. if(q != q0+1)
  184. return (eval, q-1, r);
  185. if(eval)
  186. if(c == '.')
  187. r = ar;
  188. else
  189. r = (t.file.buf.nc, t.file.buf.nc);
  190. if(q < q1)
  191. dir = Fore;
  192. else
  193. dir = None;
  194. '#' =>
  195. if(q==q1 || (c=xgetc(a0, a1, q++))<'0' || '9'<c)
  196. return (eval, q-1, r);
  197. size = Char;
  198. n = c -'0';
  199. while(q<q1){
  200. c = xgetc(a0, a1, q++);
  201. if(c<'0' || '9'<c){
  202. q--;
  203. break;
  204. }
  205. n = n*10+(c-'0');
  206. }
  207. if(eval)
  208. (eval, r) = number(md, t, r, n, dir, size);
  209. dir = None;
  210. size = Line;
  211. '0' to '9' =>
  212. n = c -'0';
  213. while(q<q1){
  214. c = xgetc(a0, a1, q++);
  215. if(c<'0' || '9'<c){
  216. q--;
  217. break;
  218. }
  219. n = n*10+(c-'0');
  220. }
  221. if(eval)
  222. (eval, r) = number(md, t, r, n, dir, size);
  223. dir = None;
  224. size = Line;
  225. '/' =>
  226. pat = nil;
  227. break2 := 0; # Ow !
  228. while(q<q1){
  229. c = xgetc(a0, a1, q++);
  230. case(c){
  231. '\n' =>
  232. --q;
  233. break2 = 1;
  234. '\\' =>
  235. pat[len pat] = c;
  236. if(q == q1)
  237. break2 = 1;
  238. else
  239. c = xgetc(a0, a1, q++);
  240. '/' =>
  241. break2 = 1;
  242. }
  243. if (break2)
  244. break;
  245. pat[len pat] = c;
  246. }
  247. if(eval)
  248. (eval, r) = regexp(md, t, lim, r, pat, dir);
  249. pat = nil;
  250. dir = None;
  251. size = Line;
  252. * =>
  253. return (eval, q-1, r);
  254. }
  255. }
  256. if(eval && dir != None)
  257. (eval, r) = number(md, t, r, 1, dir, Line); # do previous one
  258. return (eval, q, r);
  259. }
  260. sel : Rangeset = array[NRange] of Range;
  261. lastregexp : string;
  262. # Machine Information
  263. Inst : adt {
  264. typex : int; # < 16r10000 ==> literal, otherwise action
  265. # sid : int;
  266. subid : int;
  267. class : int;
  268. # other : cyclic ref Inst;
  269. right : cyclic ref Inst;
  270. # left : cyclic ref Inst;
  271. next : cyclic ref Inst;
  272. };
  273. NPROG : con 1024;
  274. program := array[NPROG] of ref Inst;
  275. progp : int;
  276. startinst : ref Inst; # First inst. of program; might not be program[0]
  277. bstartinst : ref Inst; # same for backwards machine
  278. Ilist : adt {
  279. inst : ref Inst; # Instruction of the thread
  280. se : Rangeset;
  281. startp : int; # first char of match
  282. };
  283. NLIST : con 128;
  284. thl, nl : array of Ilist; # This list, next list
  285. listx := array[2] of array of Ilist;
  286. sempty : Rangeset = array[NRange] of Range;
  287. #
  288. # Actions and Tokens
  289. #
  290. # 0x100xx are operators, value == precedence
  291. # 0x200xx are tokens, i.e. operands for operators
  292. #
  293. OPERATOR : con 16r10000; # Bitmask of all operators
  294. START : con 16r10000; # Start, used for marker on stack
  295. RBRA : con 16r10001; # Right bracket, )
  296. LBRA : con 16r10002; # Left bracket, (
  297. OR : con 16r10003; # Alternation, |
  298. CAT : con 16r10004; # Concatentation, implicit operator
  299. STAR : con 16r10005; # Closure, *
  300. PLUS : con 16r10006; # a+ == aa*
  301. QUEST : con 16r10007; # a? == a|nothing, i.e. 0 or 1 a's
  302. ANY : con 16r20000; # Any character but newline, .
  303. NOP : con 16r20001; # No operation, internal use only
  304. BOL : con 16r20002; # Beginning of line, ^
  305. EOL : con 16r20003; # End of line, $
  306. CCLASS : con 16r20004; # Character class, []
  307. NCCLASS : con 16r20005; # Negated character class, [^]
  308. END : con 16r20077; # Terminate: match found
  309. ISATOR : con 16r10000;
  310. ISAND : con 16r20000;
  311. # Parser Information
  312. Node : adt {
  313. first : ref Inst;
  314. last : ref Inst;
  315. };
  316. NSTACK : con 20;
  317. andstack := array[NSTACK] of ref Node;
  318. andp : int;
  319. atorstack := array[NSTACK] of int;
  320. atorp : int;
  321. lastwasand : int; # Last token was operand
  322. cursubid : int;
  323. subidstack := array[NSTACK] of int;
  324. subidp : int;
  325. backwards : int;
  326. nbra : int;
  327. exprs : string;
  328. exprp : int; # pointer to next character in source expression
  329. DCLASS : con 10; # allocation increment
  330. nclass : int; # number active
  331. Nclass : int = 0; # high water mark
  332. class : array of string;
  333. negateclass : int;
  334. nilnode : Node;
  335. nilinst : Inst;
  336. rxinit()
  337. {
  338. lastregexp = nil;
  339. for (k := 0; k < NPROG; k++)
  340. program[k] = ref nilinst;
  341. for (k = 0; k < NSTACK; k++)
  342. andstack[k] = ref nilnode;
  343. for (k = 0; k < 2; k++) {
  344. listx[k] = array[NLIST] of Ilist;
  345. for (i := 0; i < NLIST; i++) {
  346. listx[k][i].inst = nil;
  347. listx[k][i].startp = 0;
  348. listx[k][i].se = array[NRange] of Range;
  349. for (j := 0; j < NRange; j++)
  350. listx[k][i].se[j].q0 = listx[k][i].se[j].q1 = 0;
  351. }
  352. }
  353. }
  354. regerror(e : string)
  355. {
  356. lastregexp = nil;
  357. buf := sys->sprint("regexp: %s\n", e);
  358. warning(nil, buf);
  359. raise "regerror";
  360. }
  361. newinst(t : int) : ref Inst
  362. {
  363. if(progp >= NPROG)
  364. regerror("expression too long");
  365. program[progp].typex = t;
  366. program[progp].next = nil; # next was left
  367. program[progp].right = nil;
  368. return program[progp++];
  369. }
  370. realcompile(s : string) : ref Inst
  371. {
  372. token : int;
  373. {
  374. startlex(s);
  375. atorp = 0;
  376. andp = 0;
  377. subidp = 0;
  378. cursubid = 0;
  379. lastwasand = FALSE;
  380. # Start with a low priority operator to prime parser
  381. pushator(START-1);
  382. while((token=lex()) != END){
  383. if((token&ISATOR) == OPERATOR)
  384. operator(token);
  385. else
  386. operand(token);
  387. }
  388. # Close with a low priority operator
  389. evaluntil(START);
  390. # Force END
  391. operand(END);
  392. evaluntil(START);
  393. if(nbra)
  394. regerror("unmatched `('");
  395. --andp; # points to first and only operand
  396. return andstack[andp].first;
  397. }
  398. exception{
  399. "regerror" =>
  400. return nil;
  401. }
  402. return nil;
  403. }
  404. rxcompile(r : string) : int
  405. {
  406. oprogp : int;
  407. if(lastregexp == r)
  408. return TRUE;
  409. lastregexp = nil;
  410. for (i := 0; i < nclass; i++)
  411. class[i] = nil;
  412. nclass = 0;
  413. progp = 0;
  414. backwards = FALSE;
  415. bstartinst = nil;
  416. startinst = realcompile(r);
  417. if(startinst == nil)
  418. return FALSE;
  419. optimize(0);
  420. oprogp = progp;
  421. backwards = TRUE;
  422. bstartinst = realcompile(r);
  423. if(bstartinst == nil)
  424. return FALSE;
  425. optimize(oprogp);
  426. lastregexp = r;
  427. return TRUE;
  428. }
  429. operand(t : int)
  430. {
  431. i : ref Inst;
  432. if(lastwasand)
  433. operator(CAT); # catenate is implicit
  434. i = newinst(t);
  435. if(t == CCLASS){
  436. if(negateclass)
  437. i.typex = NCCLASS; # UGH
  438. i.class = nclass-1; # UGH
  439. }
  440. pushand(i, i);
  441. lastwasand = TRUE;
  442. }
  443. operator(t : int)
  444. {
  445. if(t==RBRA && --nbra<0)
  446. regerror("unmatched `)'");
  447. if(t==LBRA){
  448. cursubid++; # silently ignored
  449. nbra++;
  450. if(lastwasand)
  451. operator(CAT);
  452. }else
  453. evaluntil(t);
  454. if(t!=RBRA)
  455. pushator(t);
  456. lastwasand = FALSE;
  457. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  458. lastwasand = TRUE; # these look like operands
  459. }
  460. pushand(f : ref Inst, l : ref Inst)
  461. {
  462. if(andp >= NSTACK)
  463. error("operand stack overflow");
  464. andstack[andp].first = f;
  465. andstack[andp].last = l;
  466. andp++;
  467. }
  468. pushator(t : int)
  469. {
  470. if(atorp >= NSTACK)
  471. error("operator stack overflow");
  472. atorstack[atorp++]=t;
  473. if(cursubid >= NRange)
  474. subidstack[subidp++]= -1;
  475. else
  476. subidstack[subidp++]=cursubid;
  477. }
  478. popand(op : int) : ref Node
  479. {
  480. if(andp <= 0)
  481. if(op){
  482. buf := sys->sprint("missing operand for %c", op);
  483. regerror(buf);
  484. }else
  485. regerror("malformed regexp");
  486. return andstack[--andp];
  487. }
  488. popator() : int
  489. {
  490. if(atorp <= 0)
  491. error("operator stack underflow");
  492. --subidp;
  493. return atorstack[--atorp];
  494. }
  495. evaluntil(pri : int)
  496. {
  497. op1, op2 : ref Node;
  498. inst1, inst2 : ref Inst;
  499. while(pri==RBRA || atorstack[atorp-1]>=pri){
  500. case(popator()){
  501. LBRA =>
  502. op1 = popand('(');
  503. inst2 = newinst(RBRA);
  504. inst2.subid = subidstack[subidp];
  505. op1.last.next = inst2;
  506. inst1 = newinst(LBRA);
  507. inst1.subid = subidstack[subidp];
  508. inst1.next = op1.first;
  509. pushand(inst1, inst2);
  510. return; # must have been RBRA
  511. OR =>
  512. op2 = popand('|');
  513. op1 = popand('|');
  514. inst2 = newinst(NOP);
  515. op2.last.next = inst2;
  516. op1.last.next = inst2;
  517. inst1 = newinst(OR);
  518. inst1.right = op1.first;
  519. inst1.next = op2.first; # next was left
  520. pushand(inst1, inst2);
  521. CAT =>
  522. op2 = popand(0);
  523. op1 = popand(0);
  524. if(backwards && op2.first.typex!=END)
  525. (op1, op2) = (op2, op1);
  526. op1.last.next = op2.first;
  527. pushand(op1.first, op2.last);
  528. STAR =>
  529. op2 = popand('*');
  530. inst1 = newinst(OR);
  531. op2.last.next = inst1;
  532. inst1.right = op2.first;
  533. pushand(inst1, inst1);
  534. PLUS =>
  535. op2 = popand('+');
  536. inst1 = newinst(OR);
  537. op2.last.next = inst1;
  538. inst1.right = op2.first;
  539. pushand(op2.first, inst1);
  540. QUEST =>
  541. op2 = popand('?');
  542. inst1 = newinst(OR);
  543. inst2 = newinst(NOP);
  544. inst1.next = inst2; # next was left
  545. inst1.right = op2.first;
  546. op2.last.next = inst2;
  547. pushand(inst1, inst2);
  548. * =>
  549. error("unknown regexp operator");
  550. }
  551. }
  552. }
  553. optimize(start : int)
  554. {
  555. inst : int;
  556. target : ref Inst;
  557. for(inst=start; program[inst].typex!=END; inst++){
  558. target = program[inst].next;
  559. while(target.typex == NOP)
  560. target = target.next;
  561. program[inst].next = target;
  562. }
  563. }
  564. startlex(s : string)
  565. {
  566. exprs = s;
  567. exprp = 0;
  568. nbra = 0;
  569. }
  570. lex() : int
  571. {
  572. c : int;
  573. if (exprp == len exprs)
  574. return END;
  575. c = exprs[exprp++];
  576. case(c){
  577. '\\' =>
  578. if(exprp < len exprs)
  579. if((c= exprs[exprp++])=='n')
  580. c='\n';
  581. '*' =>
  582. c = STAR;
  583. '?' =>
  584. c = QUEST;
  585. '+' =>
  586. c = PLUS;
  587. '|' =>
  588. c = OR;
  589. '.' =>
  590. c = ANY;
  591. '(' =>
  592. c = LBRA;
  593. ')' =>
  594. c = RBRA;
  595. '^' =>
  596. c = BOL;
  597. '$' =>
  598. c = EOL;
  599. '[' =>
  600. c = CCLASS;
  601. bldcclass();
  602. }
  603. return c;
  604. }
  605. nextrec() : int
  606. {
  607. if(exprp == len exprs || (exprp == len exprs-1 && exprs[exprp]=='\\'))
  608. regerror("malformed `[]'");
  609. if(exprs[exprp] == '\\'){
  610. exprp++;
  611. if(exprs[exprp]=='n'){
  612. exprp++;
  613. return '\n';
  614. }
  615. return exprs[exprp++] | 16r10000;
  616. }
  617. return exprs[exprp++];
  618. }
  619. bldcclass()
  620. {
  621. c1, c2 : int;
  622. classp : string;
  623. # we have already seen the '['
  624. if(exprp < len exprs && exprs[exprp] == '^'){
  625. classp[len classp] = '\n'; # don't match newline in negate case
  626. negateclass = TRUE;
  627. exprp++;
  628. }else
  629. negateclass = FALSE;
  630. while((c1 = nextrec()) != ']'){
  631. if(c1 == '-'){
  632. classp = nil;
  633. regerror("malformed `[]'");
  634. }
  635. if(exprp < len exprs && exprs[exprp] == '-'){
  636. exprp++; # eat '-'
  637. if((c2 = nextrec()) == ']') {
  638. classp = nil;
  639. regerror("malformed '[]'");
  640. }
  641. classp[len classp] = 16rFFFF;
  642. classp[len classp] = c1;
  643. classp[len classp] = c2;
  644. }else
  645. classp[len classp] = c1;
  646. }
  647. if(nclass == Nclass){
  648. Nclass += DCLASS;
  649. oc := class;
  650. class = array[Nclass] of string;
  651. if (oc != nil) {
  652. class[0:] = oc[0:Nclass-DCLASS];
  653. oc = nil;
  654. }
  655. }
  656. class[nclass++] = classp;
  657. }
  658. classmatch(classno : int, c : int, negate : int) : int
  659. {
  660. p : string;
  661. p = class[classno];
  662. for (i := 0; i < len p; ) {
  663. if(p[i] == 16rFFFF){
  664. if(p[i+1]<=c && c<=p[i+2])
  665. return !negate;
  666. i += 3;
  667. }else if(p[i++] == c)
  668. return !negate;
  669. }
  670. return negate;
  671. }
  672. #
  673. # Note optimization in addinst:
  674. # *l must be pending when addinst called; if *l has been looked
  675. # at already, the optimization is a bug.
  676. #
  677. addinst(l : array of Ilist, inst : ref Inst, sep : Rangeset)
  678. {
  679. p : int;
  680. for(p = 0; l[p].inst != nil; p++){
  681. if(l[p].inst==inst){
  682. if(sep[0].q0 < l[p].se[0].q0)
  683. l[p].se[0:] = sep[0:NRange]; # this would be bug
  684. return; # It's already there
  685. }
  686. }
  687. l[p].inst = inst;
  688. l[p].se[0:]= sep[0:NRange];
  689. l[p+1].inst = nil;
  690. }
  691. rxnull() : int
  692. {
  693. return startinst==nil || bstartinst==nil;
  694. }
  695. OVERFLOW : con "overflow";
  696. # either t!=nil or r!=nil, and we match the string in the appropriate place
  697. rxexecute(t : ref Text, r: string, startp : int, eof : int) : (int, Rangeset)
  698. {
  699. flag : int;
  700. inst : ref Inst;
  701. tlp : int;
  702. p : int;
  703. nnl, ntl : int;
  704. nc, c : int;
  705. wrapped : int;
  706. startchar : int;
  707. flag = 0;
  708. p = startp;
  709. startchar = 0;
  710. wrapped = 0;
  711. nnl = 0;
  712. if(startinst.typex<OPERATOR)
  713. startchar = startinst.typex;
  714. listx[0][0].inst = listx[1][0].inst = nil;
  715. sel[0].q0 = -1;
  716. {
  717. if(t != nil)
  718. nc = t.file.buf.nc;
  719. else
  720. nc = len r;
  721. # Execute machine once for each character
  722. for(;;p++){
  723. if(p>=eof || p>=nc){
  724. case(wrapped++){
  725. 0 or 2 => # let loop run one more click
  726. ;
  727. 1 => # expired; wrap to beginning
  728. if(sel[0].q0>=0 || eof!=Dat->Infinity)
  729. return (sel[0].q0>=0, sel);
  730. listx[0][0].inst = listx[1][0].inst = nil;
  731. p = -1;
  732. continue;
  733. * =>
  734. return (sel[0].q0>=0, sel);
  735. }
  736. c = 0;
  737. }else{
  738. if(((wrapped && p>=startp) || sel[0].q0>0) && nnl==0)
  739. break;
  740. if(t != nil)
  741. c = t.readc(p);
  742. else
  743. c = r[p];
  744. }
  745. # fast check for first char
  746. if(startchar && nnl==0 && c!=startchar)
  747. continue;
  748. thl = listx[flag];
  749. nl = listx[flag^=1];
  750. nl[0].inst = nil;
  751. ntl = nnl;
  752. nnl = 0;
  753. if(sel[0].q0<0 && (!wrapped || p<startp || startp==eof)){
  754. # Add first instruction to this list
  755. if(++ntl >= NLIST)
  756. raise OVERFLOW;
  757. sempty[0].q0 = p;
  758. addinst(thl, startinst, sempty);
  759. }
  760. # Execute machine until this list is empty
  761. tlp = 0;
  762. inst = thl[0].inst;
  763. while(inst != nil){ # assignment =
  764. case(inst.typex){
  765. LBRA =>
  766. if(inst.subid>=0)
  767. thl[tlp].se[inst.subid].q0 = p;
  768. inst = inst.next;
  769. continue;
  770. RBRA =>
  771. if(inst.subid>=0)
  772. thl[tlp].se[inst.subid].q1 = p;
  773. inst = inst.next;
  774. continue;
  775. ANY =>
  776. if(c!='\n') {
  777. if(++nnl >= NLIST)
  778. raise OVERFLOW;
  779. addinst(nl, inst.next, thl[tlp].se);
  780. }
  781. BOL =>
  782. if(p==0 || (t != nil && t.readc(p-1)=='\n') || (r != nil && r[p-1] == '\n')){
  783. inst = inst.next;
  784. continue;
  785. }
  786. EOL =>
  787. if(c == '\n') {
  788. inst = inst.next;
  789. continue;
  790. }
  791. CCLASS =>
  792. if(c>=0 && classmatch(inst.class, c, 0)) {
  793. if(++nnl >= NLIST)
  794. raise OVERFLOW;
  795. addinst(nl, inst.next, thl[tlp].se);
  796. }
  797. NCCLASS =>
  798. if(c>=0 && classmatch(inst.class, c, 1)) {
  799. if(++nnl >= NLIST)
  800. raise OVERFLOW;
  801. addinst(nl, inst.next, thl[tlp].se);
  802. }
  803. OR =>
  804. # evaluate right choice later
  805. if(++ntl >= NLIST)
  806. raise OVERFLOW;
  807. addinst(thl[tlp:], inst.right, thl[tlp].se);
  808. # efficiency: advance and re-evaluate
  809. inst = inst.next; # next was left
  810. continue;
  811. END => # Match!
  812. thl[tlp].se[0].q1 = p;
  813. newmatch(thl[tlp].se);
  814. * => # regular character
  815. if(inst.typex==c){
  816. if(++nnl >= NLIST)
  817. raise OVERFLOW;
  818. addinst(nl, inst.next, thl[tlp].se);
  819. }
  820. }
  821. tlp++;
  822. inst = thl[tlp].inst;
  823. }
  824. }
  825. return (sel[0].q0>=0, sel);
  826. }
  827. exception{
  828. OVERFLOW =>
  829. error("regexp list overflow");
  830. sel[0].q0 = -1;
  831. return (0, sel);
  832. }
  833. return (0, sel);
  834. }
  835. newmatch(sp : Rangeset)
  836. {
  837. if(sel[0].q0<0 || sp[0].q0<sel[0].q0 ||
  838. (sp[0].q0==sel[0].q0 && sp[0].q1>sel[0].q1))
  839. sel[0:] = sp[0:NRange];
  840. }
  841. rxbexecute(t : ref Text, startp : int) : (int, Rangeset)
  842. {
  843. flag : int;
  844. inst : ref Inst;
  845. tlp : int;
  846. p : int;
  847. nnl, ntl : int;
  848. c : int;
  849. wrapped : int;
  850. startchar : int;
  851. flag = 0;
  852. nnl = 0;
  853. wrapped = 0;
  854. p = startp;
  855. startchar = 0;
  856. if(bstartinst.typex<OPERATOR)
  857. startchar = bstartinst.typex;
  858. listx[0][0].inst = listx[1][0].inst = nil;
  859. sel[0].q0= -1;
  860. {
  861. # Execute machine once for each character, including terminal NUL
  862. for(;;--p){
  863. if(p <= 0){
  864. case(wrapped++){
  865. 0 or 2 => # let loop run one more click
  866. ;
  867. 1 => # expired; wrap to end
  868. if(sel[0].q0>=0)
  869. return (sel[0].q0>=0, sel);
  870. listx[0][0].inst = listx[1][0].inst = nil;
  871. p = t.file.buf.nc+1;
  872. continue;
  873. 3 or * =>
  874. return (sel[0].q0>=0, sel);
  875. }
  876. c = 0;
  877. }else{
  878. if(((wrapped && p<=startp) || sel[0].q0>0) && nnl==0)
  879. break;
  880. c = t.readc(p-1);
  881. }
  882. # fast check for first char
  883. if(startchar && nnl==0 && c!=startchar)
  884. continue;
  885. thl = listx[flag];
  886. nl = listx[flag^=1];
  887. nl[0].inst = nil;
  888. ntl = nnl;
  889. nnl = 0;
  890. if(sel[0].q0<0 && (!wrapped || p>startp)){
  891. # Add first instruction to this list
  892. if(++ntl >= NLIST)
  893. raise OVERFLOW;
  894. # the minus is so the optimizations in addinst work
  895. sempty[0].q0 = -p;
  896. addinst(thl, bstartinst, sempty);
  897. }
  898. # Execute machine until this list is empty
  899. tlp = 0;
  900. inst = thl[0].inst;
  901. while(inst != nil){ # assignment =
  902. case(inst.typex){
  903. LBRA =>
  904. if(inst.subid>=0)
  905. thl[tlp].se[inst.subid].q0 = p;
  906. inst = inst.next;
  907. continue;
  908. RBRA =>
  909. if(inst.subid >= 0)
  910. thl[tlp].se[inst.subid].q1 = p;
  911. inst = inst.next;
  912. continue;
  913. ANY =>
  914. if(c != '\n') {
  915. if(++nnl >= NLIST)
  916. raise OVERFLOW;
  917. addinst(nl, inst.next, thl[tlp].se);
  918. }
  919. BOL =>
  920. if(c=='\n' || p==0){
  921. inst = inst.next;
  922. continue;
  923. }
  924. EOL =>
  925. if(p<t.file.buf.nc && t.readc(p)=='\n') {
  926. inst = inst.next;
  927. continue;
  928. }
  929. CCLASS =>
  930. if(c>0 && classmatch(inst.class, c, 0)) {
  931. if(++nnl >= NLIST)
  932. raise OVERFLOW;
  933. addinst(nl, inst.next, thl[tlp].se);
  934. }
  935. NCCLASS =>
  936. if(c>0 && classmatch(inst.class, c, 1)) {
  937. if(++nnl >= NLIST)
  938. raise OVERFLOW;
  939. addinst(nl, inst.next, thl[tlp].se);
  940. }
  941. OR =>
  942. # evaluate right choice later
  943. if(++ntl >= NLIST)
  944. raise OVERFLOW;
  945. addinst(thl[tlp:], inst.right, thl[tlp].se);
  946. # efficiency: advance and re-evaluate
  947. inst = inst.next; # next was left
  948. continue;
  949. END => # Match!
  950. thl[tlp].se[0].q0 = -thl[tlp].se[0].q0; # minus sign
  951. thl[tlp].se[0].q1 = p;
  952. bnewmatch(thl[tlp].se);
  953. * => # regular character
  954. if(inst.typex == c){
  955. if(++nnl >= NLIST)
  956. raise OVERFLOW;
  957. addinst(nl, inst.next, thl[tlp].se);
  958. }
  959. }
  960. tlp++;
  961. inst = thl[tlp].inst;
  962. }
  963. }
  964. return (sel[0].q0>=0, sel);
  965. }
  966. exception{
  967. OVERFLOW =>
  968. error("regexp list overflow");
  969. sel[0].q0 = -1;
  970. return (0, sel);
  971. }
  972. return (0, sel);
  973. }
  974. bnewmatch(sp : Rangeset)
  975. {
  976. i : int;
  977. if(sel[0].q0<0 || sp[0].q0>sel[0].q1 || (sp[0].q0==sel[0].q1 && sp[0].q1<sel[0].q0))
  978. for(i = 0; i<NRange; i++){ # note the reversal; q0<=q1
  979. sel[i].q0 = sp[i].q1;
  980. sel[i].q1 = sp[i].q0;
  981. }
  982. }