1
0

regcomp.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include <u.h>
  10. #include <libc.h>
  11. #include "regexp.h"
  12. #include "regcomp.h"
  13. #define TRUE 1
  14. #define FALSE 0
  15. /*
  16. * Parser Information
  17. */
  18. typedef
  19. struct Node
  20. {
  21. Reinst* first;
  22. Reinst* last;
  23. }Node;
  24. /* max character classes per program is nelem(reprog->class) */
  25. static Reprog *reprog;
  26. /* max rune ranges per character class is nelem(classp->spans)/2 */
  27. #define NCCRUNE nelem(classp->spans)
  28. #define NSTACK 20
  29. static Node andstack[NSTACK];
  30. static Node *andp;
  31. static int atorstack[NSTACK];
  32. static int* atorp;
  33. static int cursubid; /* id of current subexpression */
  34. static int subidstack[NSTACK]; /* parallel to atorstack */
  35. static int* subidp;
  36. static int lastwasand; /* Last token was operand */
  37. static int nbra;
  38. static char* exprp; /* pointer to next character in source expression */
  39. static int lexdone;
  40. static int nclass;
  41. static Reclass*classp;
  42. static Reinst* freep;
  43. static int errors;
  44. static Rune yyrune; /* last lex'd rune */
  45. static Reclass*yyclassp; /* last lex'd class */
  46. /* predeclared crap */
  47. static void operator(int);
  48. static void pushand(Reinst*, Reinst*);
  49. static void pushator(int);
  50. static void evaluntil(int);
  51. static int bldcclass(void);
  52. static jmp_buf regkaboom;
  53. static void
  54. rcerror(char *s)
  55. {
  56. errors++;
  57. regerror(s);
  58. longjmp(regkaboom, 1);
  59. }
  60. static Reinst*
  61. newinst(int t)
  62. {
  63. freep->type = t;
  64. freep->left = 0;
  65. freep->right = 0;
  66. return freep++;
  67. }
  68. static void
  69. operand(int t)
  70. {
  71. Reinst *i;
  72. if(lastwasand)
  73. operator(CAT); /* catenate is implicit */
  74. i = newinst(t);
  75. if(t == CCLASS || t == NCCLASS)
  76. i->cp = yyclassp;
  77. if(t == RUNE)
  78. i->r = yyrune;
  79. pushand(i, i);
  80. lastwasand = TRUE;
  81. }
  82. static void
  83. operator(int t)
  84. {
  85. if(t==RBRA && --nbra<0)
  86. rcerror("unmatched right paren");
  87. if(t==LBRA){
  88. if(++cursubid >= NSUBEXP)
  89. rcerror ("too many subexpressions");
  90. nbra++;
  91. if(lastwasand)
  92. operator(CAT);
  93. } else
  94. evaluntil(t);
  95. if(t != RBRA)
  96. pushator(t);
  97. lastwasand = FALSE;
  98. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  99. lastwasand = TRUE; /* these look like operands */
  100. }
  101. static void
  102. regerr2(char *s, int c)
  103. {
  104. char buf[100];
  105. char *cp = buf;
  106. while(*s)
  107. *cp++ = *s++;
  108. *cp++ = c;
  109. *cp = '\0';
  110. rcerror(buf);
  111. }
  112. static void
  113. cant(char *s)
  114. {
  115. char buf[100];
  116. strcpy(buf, "can't happen: ");
  117. strcat(buf, s);
  118. rcerror(buf);
  119. }
  120. static void
  121. pushand(Reinst *f, Reinst *l)
  122. {
  123. if(andp >= &andstack[NSTACK])
  124. cant("operand stack overflow");
  125. andp->first = f;
  126. andp->last = l;
  127. andp++;
  128. }
  129. static void
  130. pushator(int t)
  131. {
  132. if(atorp >= &atorstack[NSTACK])
  133. cant("operator stack overflow");
  134. *atorp++ = t;
  135. *subidp++ = cursubid;
  136. }
  137. static Node*
  138. popand(int op)
  139. {
  140. Reinst *inst;
  141. if(andp <= &andstack[0]){
  142. regerr2("missing operand for ", op);
  143. inst = newinst(NOP);
  144. pushand(inst,inst);
  145. }
  146. return --andp;
  147. }
  148. static int
  149. popator(void)
  150. {
  151. if(atorp <= &atorstack[0])
  152. cant("operator stack underflow");
  153. --subidp;
  154. return *--atorp;
  155. }
  156. static void
  157. evaluntil(int pri)
  158. {
  159. Node *op1, *op2;
  160. Reinst *inst1, *inst2;
  161. while(pri==RBRA || atorp[-1]>=pri){
  162. switch(popator()){
  163. default:
  164. rcerror("unknown operator in evaluntil");
  165. break;
  166. case LBRA: /* must have been RBRA */
  167. op1 = popand('(');
  168. inst2 = newinst(RBRA);
  169. inst2->subid = *subidp;
  170. op1->last->next = inst2;
  171. inst1 = newinst(LBRA);
  172. inst1->subid = *subidp;
  173. inst1->next = op1->first;
  174. pushand(inst1, inst2);
  175. return;
  176. case OR:
  177. op2 = popand('|');
  178. op1 = popand('|');
  179. inst2 = newinst(NOP);
  180. op2->last->next = inst2;
  181. op1->last->next = inst2;
  182. inst1 = newinst(OR);
  183. inst1->right = op1->first;
  184. inst1->left = op2->first;
  185. pushand(inst1, inst2);
  186. break;
  187. case CAT:
  188. op2 = popand(0);
  189. op1 = popand(0);
  190. op1->last->next = op2->first;
  191. pushand(op1->first, op2->last);
  192. break;
  193. case STAR:
  194. op2 = popand('*');
  195. inst1 = newinst(OR);
  196. op2->last->next = inst1;
  197. inst1->right = op2->first;
  198. pushand(inst1, inst1);
  199. break;
  200. case PLUS:
  201. op2 = popand('+');
  202. inst1 = newinst(OR);
  203. op2->last->next = inst1;
  204. inst1->right = op2->first;
  205. pushand(op2->first, inst1);
  206. break;
  207. case QUEST:
  208. op2 = popand('?');
  209. inst1 = newinst(OR);
  210. inst2 = newinst(NOP);
  211. inst1->left = inst2;
  212. inst1->right = op2->first;
  213. op2->last->next = inst2;
  214. pushand(inst1, inst2);
  215. break;
  216. }
  217. }
  218. }
  219. static Reprog*
  220. optimize(Reprog *pp)
  221. {
  222. Reinst *inst, *target;
  223. int size;
  224. Reprog *npp;
  225. Reclass *cl;
  226. int diff;
  227. /*
  228. * get rid of NOOP chains
  229. */
  230. for(inst=pp->firstinst; inst->type!=END; inst++){
  231. target = inst->next;
  232. while(target->type == NOP)
  233. target = target->next;
  234. inst->next = target;
  235. }
  236. /*
  237. * The original allocation is for an area larger than
  238. * necessary. Reallocate to the actual space used
  239. * and then relocate the code.
  240. */
  241. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  242. npp = realloc(pp, size);
  243. if(npp==0 || npp==pp)
  244. return pp;
  245. diff = (char *)npp - (char *)pp;
  246. freep = (Reinst *)((char *)freep + diff);
  247. for(inst=npp->firstinst; inst<freep; inst++){
  248. switch(inst->type){
  249. case OR:
  250. case STAR:
  251. case PLUS:
  252. case QUEST:
  253. *(char **)&inst->right += diff;
  254. break;
  255. case CCLASS:
  256. case NCCLASS:
  257. *(char **)&inst->right += diff;
  258. cl = inst->cp;
  259. *(char **)&cl->end += diff;
  260. break;
  261. }
  262. *(char **)&inst->left += diff;
  263. }
  264. *(char **)&npp->startinst += diff;
  265. return npp;
  266. }
  267. #ifdef DEBUG
  268. static void
  269. dumpstack(void){
  270. Node *stk;
  271. int *ip;
  272. print("operators\n");
  273. for(ip=atorstack; ip<atorp; ip++)
  274. print("0%o\n", *ip);
  275. print("operands\n");
  276. for(stk=andstack; stk<andp; stk++)
  277. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  278. }
  279. static void
  280. dump(Reprog *pp)
  281. {
  282. Reinst *l;
  283. Rune *p;
  284. l = pp->firstinst;
  285. do{
  286. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  287. l->left-pp->firstinst, l->right-pp->firstinst);
  288. if(l->type == RUNE)
  289. print("\t%C\n", l->r);
  290. else if(l->type == CCLASS || l->type == NCCLASS){
  291. print("\t[");
  292. if(l->type == NCCLASS)
  293. print("^");
  294. for(p = l->cp->spans; p < l->cp->end; p += 2)
  295. if(p[0] == p[1])
  296. print("%C", p[0]);
  297. else
  298. print("%C-%C", p[0], p[1]);
  299. print("]\n");
  300. } else
  301. print("\n");
  302. }while(l++->type);
  303. }
  304. #endif
  305. static Reclass*
  306. newclass(void)
  307. {
  308. if(nclass >= nelem(reprog->class))
  309. rcerror("too many character classes; increase Reprog.class size");
  310. return &(classp[nclass++]);
  311. }
  312. static int
  313. nextc(Rune *rp)
  314. {
  315. if(lexdone){
  316. *rp = 0;
  317. return 1;
  318. }
  319. exprp += chartorune(rp, exprp);
  320. if(*rp == L'\\'){
  321. exprp += chartorune(rp, exprp);
  322. return 1;
  323. }
  324. if(*rp == 0)
  325. lexdone = 1;
  326. return 0;
  327. }
  328. static int
  329. lex(int literal, int dot_type)
  330. {
  331. int quoted;
  332. quoted = nextc(&yyrune);
  333. if(literal || quoted){
  334. if(yyrune == 0)
  335. return END;
  336. return RUNE;
  337. }
  338. switch(yyrune){
  339. case 0:
  340. return END;
  341. case L'*':
  342. return STAR;
  343. case L'?':
  344. return QUEST;
  345. case L'+':
  346. return PLUS;
  347. case L'|':
  348. return OR;
  349. case L'.':
  350. return dot_type;
  351. case L'(':
  352. return LBRA;
  353. case L')':
  354. return RBRA;
  355. case L'^':
  356. return BOL;
  357. case L'$':
  358. return EOL;
  359. case L'[':
  360. return bldcclass();
  361. }
  362. return RUNE;
  363. }
  364. static int
  365. bldcclass(void)
  366. {
  367. int type;
  368. Rune r[NCCRUNE];
  369. Rune *p, *ep, *np;
  370. Rune rune;
  371. int quoted;
  372. /* we have already seen the '[' */
  373. type = CCLASS;
  374. yyclassp = newclass();
  375. /* look ahead for negation */
  376. /* SPECIAL CASE!!! negated classes don't match \n */
  377. ep = r;
  378. quoted = nextc(&rune);
  379. if(!quoted && rune == L'^'){
  380. type = NCCLASS;
  381. quoted = nextc(&rune);
  382. *ep++ = L'\n';
  383. *ep++ = L'\n';
  384. }
  385. /* parse class into a set of spans */
  386. while(ep < &r[NCCRUNE-1]){
  387. if(rune == 0){
  388. rcerror("malformed '[]'");
  389. return 0;
  390. }
  391. if(!quoted && rune == L']')
  392. break;
  393. if(!quoted && rune == L'-'){
  394. if(ep == r){
  395. rcerror("malformed '[]'");
  396. return 0;
  397. }
  398. quoted = nextc(&rune);
  399. if((!quoted && rune == L']') || rune == 0){
  400. rcerror("malformed '[]'");
  401. return 0;
  402. }
  403. *(ep-1) = rune;
  404. } else {
  405. *ep++ = rune;
  406. *ep++ = rune;
  407. }
  408. quoted = nextc(&rune);
  409. }
  410. if(ep >= &r[NCCRUNE-1]) {
  411. rcerror("char class too large; increase Reclass.spans size");
  412. return 0;
  413. }
  414. /* sort on span start */
  415. for(p = r; p < ep; p += 2){
  416. for(np = p; np < ep; np += 2)
  417. if(*np < *p){
  418. rune = np[0];
  419. np[0] = p[0];
  420. p[0] = rune;
  421. rune = np[1];
  422. np[1] = p[1];
  423. p[1] = rune;
  424. }
  425. }
  426. /* merge spans */
  427. np = yyclassp->spans;
  428. p = r;
  429. if(r == ep)
  430. yyclassp->end = np;
  431. else {
  432. np[0] = *p++;
  433. np[1] = *p++;
  434. for(; p < ep; p += 2)
  435. /* overlapping or adjacent ranges? */
  436. if(p[0] <= np[1] + 1){
  437. if(p[1] >= np[1])
  438. np[1] = p[1]; /* coalesce */
  439. } else {
  440. np += 2;
  441. np[0] = p[0];
  442. np[1] = p[1];
  443. }
  444. yyclassp->end = np+2;
  445. }
  446. return type;
  447. }
  448. static Reprog*
  449. regcomp1(char *s, int literal, int dot_type)
  450. {
  451. int token;
  452. Reprog *pp;
  453. /* get memory for the program */
  454. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  455. if(pp == 0){
  456. regerror("out of memory");
  457. return 0;
  458. }
  459. freep = pp->firstinst;
  460. classp = pp->class;
  461. errors = 0;
  462. if(setjmp(regkaboom))
  463. goto out;
  464. /* go compile the sucker */
  465. lexdone = 0;
  466. exprp = s;
  467. nclass = 0;
  468. nbra = 0;
  469. atorp = atorstack;
  470. andp = andstack;
  471. subidp = subidstack;
  472. lastwasand = FALSE;
  473. cursubid = 0;
  474. /* Start with a low priority operator to prime parser */
  475. pushator(START-1);
  476. while((token = lex(literal, dot_type)) != END){
  477. if((token&0300) == OPERATOR)
  478. operator(token);
  479. else
  480. operand(token);
  481. }
  482. /* Close with a low priority operator */
  483. evaluntil(START);
  484. /* Force END */
  485. operand(END);
  486. evaluntil(START);
  487. #ifdef DEBUG
  488. dumpstack();
  489. #endif
  490. if(nbra)
  491. rcerror("unmatched left paren");
  492. --andp; /* points to first and only operand */
  493. pp->startinst = andp->first;
  494. #ifdef DEBUG
  495. dump(pp);
  496. #endif
  497. pp = optimize(pp);
  498. #ifdef DEBUG
  499. print("start: %d\n", andp->first-pp->firstinst);
  500. dump(pp);
  501. #endif
  502. out:
  503. if(errors){
  504. free(pp);
  505. pp = 0;
  506. }
  507. return pp;
  508. }
  509. extern Reprog*
  510. regcomp(char *s)
  511. {
  512. return regcomp1(s, 0, ANY);
  513. }
  514. extern Reprog*
  515. regcomplit(char *s)
  516. {
  517. return regcomp1(s, 1, ANY);
  518. }
  519. extern Reprog*
  520. regcompnl(char *s)
  521. {
  522. return regcomp1(s, 0, ANYNL);
  523. }