regcomp.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include <u.h>
  10. #include <libc.h>
  11. #include "regexp.h"
  12. #include "regcomp.h"
  13. #define TRUE 1
  14. #define FALSE 0
  15. /*
  16. * Parser Information
  17. */
  18. typedef
  19. struct Node
  20. {
  21. Reinst* first;
  22. Reinst* last;
  23. }Node;
  24. /* max character classes per program is nelem(reprog->class) */
  25. /* TODO: why does clang think this is never used if we declare it static? */
  26. /*static */Reprog *reprog;
  27. /* max rune ranges per character class is nelem(classp->spans)/2 */
  28. #define NCCRUNE nelem(classp->spans)
  29. #define NSTACK 20
  30. static Node andstack[NSTACK];
  31. static Node *andp;
  32. static int atorstack[NSTACK];
  33. static int* atorp;
  34. static int cursubid; /* id of current subexpression */
  35. static int subidstack[NSTACK]; /* parallel to atorstack */
  36. static int* subidp;
  37. static int lastwasand; /* Last token was operand */
  38. static int nbra;
  39. static char* exprp; /* pointer to next character in source expression */
  40. static int lexdone;
  41. static int nclass;
  42. static Reclass*classp;
  43. static Reinst* freep;
  44. static int errors;
  45. static Rune yyrune; /* last lex'd rune */
  46. static Reclass*yyclassp; /* last lex'd class */
  47. /* predeclared crap */
  48. static void operator(int);
  49. static void pushand(Reinst*, Reinst*);
  50. static void pushator(int);
  51. static void evaluntil(int);
  52. static int bldcclass(void);
  53. static jmp_buf regkaboom;
  54. static void
  55. rcerror(char *s)
  56. {
  57. errors++;
  58. regerror(s);
  59. longjmp(regkaboom, 1);
  60. }
  61. static Reinst*
  62. newinst(int t)
  63. {
  64. freep->type = t;
  65. freep->left = 0;
  66. freep->right = 0;
  67. return freep++;
  68. }
  69. static void
  70. operand(int t)
  71. {
  72. Reinst *i;
  73. if(lastwasand)
  74. operator(CAT); /* catenate is implicit */
  75. i = newinst(t);
  76. if(t == CCLASS || t == NCCLASS)
  77. i->cp = yyclassp;
  78. if(t == RUNE)
  79. i->r = yyrune;
  80. pushand(i, i);
  81. lastwasand = TRUE;
  82. }
  83. static void
  84. operator(int t)
  85. {
  86. if(t==RBRA && --nbra<0)
  87. rcerror("unmatched right paren");
  88. if(t==LBRA){
  89. if(++cursubid >= NSUBEXP)
  90. rcerror ("too many subexpressions");
  91. nbra++;
  92. if(lastwasand)
  93. operator(CAT);
  94. } else
  95. evaluntil(t);
  96. if(t != RBRA)
  97. pushator(t);
  98. lastwasand = FALSE;
  99. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  100. lastwasand = TRUE; /* these look like operands */
  101. }
  102. static void
  103. regerr2(char *s, int c)
  104. {
  105. char buf[100];
  106. char *cp = buf;
  107. while(*s)
  108. *cp++ = *s++;
  109. *cp++ = c;
  110. *cp = '\0';
  111. rcerror(buf);
  112. }
  113. static void
  114. cant(char *s)
  115. {
  116. char buf[100];
  117. strcpy(buf, "can't happen: ");
  118. strcat(buf, s);
  119. rcerror(buf);
  120. }
  121. static void
  122. pushand(Reinst *f, Reinst *l)
  123. {
  124. if(andp >= &andstack[NSTACK])
  125. cant("operand stack overflow");
  126. andp->first = f;
  127. andp->last = l;
  128. andp++;
  129. }
  130. static void
  131. pushator(int t)
  132. {
  133. if(atorp >= &atorstack[NSTACK])
  134. cant("operator stack overflow");
  135. *atorp++ = t;
  136. *subidp++ = cursubid;
  137. }
  138. static Node*
  139. popand(int op)
  140. {
  141. Reinst *inst;
  142. if(andp <= &andstack[0]){
  143. regerr2("missing operand for ", op);
  144. inst = newinst(NOP);
  145. pushand(inst,inst);
  146. }
  147. return --andp;
  148. }
  149. static int
  150. popator(void)
  151. {
  152. if(atorp <= &atorstack[0])
  153. cant("operator stack underflow");
  154. --subidp;
  155. return *--atorp;
  156. }
  157. static void
  158. evaluntil(int pri)
  159. {
  160. Node *op1, *op2;
  161. Reinst *inst1, *inst2;
  162. while(pri==RBRA || atorp[-1]>=pri){
  163. switch(popator()){
  164. default:
  165. rcerror("unknown operator in evaluntil");
  166. break;
  167. case LBRA: /* must have been RBRA */
  168. op1 = popand('(');
  169. inst2 = newinst(RBRA);
  170. inst2->subid = *subidp;
  171. op1->last->next = inst2;
  172. inst1 = newinst(LBRA);
  173. inst1->subid = *subidp;
  174. inst1->next = op1->first;
  175. pushand(inst1, inst2);
  176. return;
  177. case OR:
  178. op2 = popand('|');
  179. op1 = popand('|');
  180. inst2 = newinst(NOP);
  181. op2->last->next = inst2;
  182. op1->last->next = inst2;
  183. inst1 = newinst(OR);
  184. inst1->right = op1->first;
  185. inst1->left = op2->first;
  186. pushand(inst1, inst2);
  187. break;
  188. case CAT:
  189. op2 = popand(0);
  190. op1 = popand(0);
  191. op1->last->next = op2->first;
  192. pushand(op1->first, op2->last);
  193. break;
  194. case STAR:
  195. op2 = popand('*');
  196. inst1 = newinst(OR);
  197. op2->last->next = inst1;
  198. inst1->right = op2->first;
  199. pushand(inst1, inst1);
  200. break;
  201. case PLUS:
  202. op2 = popand('+');
  203. inst1 = newinst(OR);
  204. op2->last->next = inst1;
  205. inst1->right = op2->first;
  206. pushand(op2->first, inst1);
  207. break;
  208. case QUEST:
  209. op2 = popand('?');
  210. inst1 = newinst(OR);
  211. inst2 = newinst(NOP);
  212. inst1->left = inst2;
  213. inst1->right = op2->first;
  214. op2->last->next = inst2;
  215. pushand(inst1, inst2);
  216. break;
  217. }
  218. }
  219. }
  220. static Reprog*
  221. optimize(Reprog *pp)
  222. {
  223. Reinst *inst, *target;
  224. int size;
  225. Reprog *npp;
  226. Reclass *cl;
  227. int diff;
  228. /*
  229. * get rid of NOOP chains
  230. */
  231. for(inst=pp->firstinst; inst->type!=END; inst++){
  232. target = inst->next;
  233. while(target->type == NOP)
  234. target = target->next;
  235. inst->next = target;
  236. }
  237. /*
  238. * The original allocation is for an area larger than
  239. * necessary. Reallocate to the actual space used
  240. * and then relocate the code.
  241. */
  242. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  243. npp = realloc(pp, size);
  244. if(npp==0 || npp==pp)
  245. return pp;
  246. diff = (char *)npp - (char *)pp;
  247. freep = (Reinst *)((char *)freep + diff);
  248. for(inst=npp->firstinst; inst<freep; inst++){
  249. switch(inst->type){
  250. case OR:
  251. case STAR:
  252. case PLUS:
  253. case QUEST:
  254. *(char **)&inst->right += diff;
  255. break;
  256. case CCLASS:
  257. case NCCLASS:
  258. *(char **)&inst->right += diff;
  259. cl = inst->cp;
  260. *(char **)&cl->end += diff;
  261. break;
  262. }
  263. *(char **)&inst->left += diff;
  264. }
  265. *(char **)&npp->startinst += diff;
  266. return npp;
  267. }
  268. #ifdef DEBUG
  269. static void
  270. dumpstack(void){
  271. Node *stk;
  272. int *ip;
  273. print("operators\n");
  274. for(ip=atorstack; ip<atorp; ip++)
  275. print("0%o\n", *ip);
  276. print("operands\n");
  277. for(stk=andstack; stk<andp; stk++)
  278. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  279. }
  280. static void
  281. dump(Reprog *pp)
  282. {
  283. Reinst *l;
  284. Rune *p;
  285. l = pp->firstinst;
  286. do{
  287. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  288. l->left-pp->firstinst, l->right-pp->firstinst);
  289. if(l->type == RUNE)
  290. print("\t%C\n", l->r);
  291. else if(l->type == CCLASS || l->type == NCCLASS){
  292. print("\t[");
  293. if(l->type == NCCLASS)
  294. print("^");
  295. for(p = l->cp->spans; p < l->cp->end; p += 2)
  296. if(p[0] == p[1])
  297. print("%C", p[0]);
  298. else
  299. print("%C-%C", p[0], p[1]);
  300. print("]\n");
  301. } else
  302. print("\n");
  303. }while(l++->type);
  304. }
  305. #endif
  306. static Reclass*
  307. newclass(void)
  308. {
  309. if(nclass >= nelem(reprog->class))
  310. rcerror("too many character classes; increase Reprog.class size");
  311. return &(classp[nclass++]);
  312. }
  313. static int
  314. nextc(Rune *rp)
  315. {
  316. if(lexdone){
  317. *rp = 0;
  318. return 1;
  319. }
  320. exprp += chartorune(rp, exprp);
  321. if(*rp == L'\\'){
  322. exprp += chartorune(rp, exprp);
  323. return 1;
  324. }
  325. if(*rp == 0)
  326. lexdone = 1;
  327. return 0;
  328. }
  329. static int
  330. lex(int literal, int dot_type)
  331. {
  332. int quoted;
  333. quoted = nextc(&yyrune);
  334. if(literal || quoted){
  335. if(yyrune == 0)
  336. return END;
  337. return RUNE;
  338. }
  339. switch(yyrune){
  340. case 0:
  341. return END;
  342. case L'*':
  343. return STAR;
  344. case L'?':
  345. return QUEST;
  346. case L'+':
  347. return PLUS;
  348. case L'|':
  349. return OR;
  350. case L'.':
  351. return dot_type;
  352. case L'(':
  353. return LBRA;
  354. case L')':
  355. return RBRA;
  356. case L'^':
  357. return BOL;
  358. case L'$':
  359. return EOL;
  360. case L'[':
  361. return bldcclass();
  362. }
  363. return RUNE;
  364. }
  365. static int
  366. bldcclass(void)
  367. {
  368. int type;
  369. Rune r[NCCRUNE];
  370. Rune *p, *ep, *np;
  371. Rune rune;
  372. int quoted;
  373. /* we have already seen the '[' */
  374. type = CCLASS;
  375. yyclassp = newclass();
  376. /* look ahead for negation */
  377. /* SPECIAL CASE!!! negated classes don't match \n */
  378. ep = r;
  379. quoted = nextc(&rune);
  380. if(!quoted && rune == L'^'){
  381. type = NCCLASS;
  382. quoted = nextc(&rune);
  383. *ep++ = L'\n';
  384. *ep++ = L'\n';
  385. }
  386. /* parse class into a set of spans */
  387. while(ep < &r[NCCRUNE-1]){
  388. if(rune == 0){
  389. rcerror("malformed '[]'");
  390. return 0;
  391. }
  392. if(!quoted && rune == L']')
  393. break;
  394. if(!quoted && rune == L'-'){
  395. if(ep == r){
  396. rcerror("malformed '[]'");
  397. return 0;
  398. }
  399. quoted = nextc(&rune);
  400. if((!quoted && rune == L']') || rune == 0){
  401. rcerror("malformed '[]'");
  402. return 0;
  403. }
  404. *(ep-1) = rune;
  405. } else {
  406. *ep++ = rune;
  407. *ep++ = rune;
  408. }
  409. quoted = nextc(&rune);
  410. }
  411. if(ep >= &r[NCCRUNE-1]) {
  412. rcerror("char class too large; increase Reclass.spans size");
  413. return 0;
  414. }
  415. /* sort on span start */
  416. for(p = r; p < ep; p += 2){
  417. for(np = p; np < ep; np += 2)
  418. if(*np < *p){
  419. rune = np[0];
  420. np[0] = p[0];
  421. p[0] = rune;
  422. rune = np[1];
  423. np[1] = p[1];
  424. p[1] = rune;
  425. }
  426. }
  427. /* merge spans */
  428. np = yyclassp->spans;
  429. p = r;
  430. if(r == ep)
  431. yyclassp->end = np;
  432. else {
  433. np[0] = *p++;
  434. np[1] = *p++;
  435. for(; p < ep; p += 2)
  436. /* overlapping or adjacent ranges? */
  437. if(p[0] <= np[1] + 1){
  438. if(p[1] >= np[1])
  439. np[1] = p[1]; /* coalesce */
  440. } else {
  441. np += 2;
  442. np[0] = p[0];
  443. np[1] = p[1];
  444. }
  445. yyclassp->end = np+2;
  446. }
  447. return type;
  448. }
  449. static Reprog*
  450. regcomp1(char *s, int literal, int dot_type)
  451. {
  452. int token;
  453. Reprog *pp;
  454. /* get memory for the program */
  455. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  456. if(pp == 0){
  457. regerror("out of memory");
  458. return 0;
  459. }
  460. freep = pp->firstinst;
  461. classp = pp->class;
  462. errors = 0;
  463. if(setjmp(regkaboom))
  464. goto out;
  465. /* go compile the sucker */
  466. lexdone = 0;
  467. exprp = s;
  468. nclass = 0;
  469. nbra = 0;
  470. atorp = atorstack;
  471. andp = andstack;
  472. subidp = subidstack;
  473. lastwasand = FALSE;
  474. cursubid = 0;
  475. /* Start with a low priority operator to prime parser */
  476. pushator(START-1);
  477. while((token = lex(literal, dot_type)) != END){
  478. if((token&0300) == OPERATOR)
  479. operator(token);
  480. else
  481. operand(token);
  482. }
  483. /* Close with a low priority operator */
  484. evaluntil(START);
  485. /* Force END */
  486. operand(END);
  487. evaluntil(START);
  488. #ifdef DEBUG
  489. dumpstack();
  490. #endif
  491. if(nbra)
  492. rcerror("unmatched left paren");
  493. --andp; /* points to first and only operand */
  494. pp->startinst = andp->first;
  495. #ifdef DEBUG
  496. dump(pp);
  497. #endif
  498. pp = optimize(pp);
  499. #ifdef DEBUG
  500. print("start: %d\n", andp->first-pp->firstinst);
  501. dump(pp);
  502. #endif
  503. out:
  504. if(errors){
  505. free(pp);
  506. pp = 0;
  507. }
  508. return pp;
  509. }
  510. extern Reprog*
  511. regcomp(char *s)
  512. {
  513. return regcomp1(s, 0, ANY);
  514. }
  515. extern Reprog*
  516. regcomplit(char *s)
  517. {
  518. return regcomp1(s, 1, ANY);
  519. }
  520. extern Reprog*
  521. regcompnl(char *s)
  522. {
  523. return regcomp1(s, 0, ANYNL);
  524. }