regcomp.c 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. #include <u.h>
  2. #include <libc.h>
  3. #include "regexp.h"
  4. #include "regcomp.h"
  5. #define TRUE 1
  6. #define FALSE 0
  7. /*
  8. * Parser Information
  9. */
  10. typedef
  11. struct Node
  12. {
  13. Reinst* first;
  14. Reinst* last;
  15. }Node;
  16. /* max character classes per program is nelem(reprog->class) */
  17. static Reprog *reprog;
  18. /* max rune ranges per character class is nelem(classp->spans)/2 */
  19. #define NCCRUNE nelem(classp->spans)
  20. #define NSTACK 20
  21. static Node andstack[NSTACK];
  22. static Node *andp;
  23. static int atorstack[NSTACK];
  24. static int* atorp;
  25. static int cursubid; /* id of current subexpression */
  26. static int subidstack[NSTACK]; /* parallel to atorstack */
  27. static int* subidp;
  28. static int lastwasand; /* Last token was operand */
  29. static int nbra;
  30. static char* exprp; /* pointer to next character in source expression */
  31. static int lexdone;
  32. static int nclass;
  33. static Reclass*classp;
  34. static Reinst* freep;
  35. static int errors;
  36. static Rune yyrune; /* last lex'd rune */
  37. static Reclass*yyclassp; /* last lex'd class */
  38. /* predeclared crap */
  39. static void operator(int);
  40. static void pushand(Reinst*, Reinst*);
  41. static void pushator(int);
  42. static void evaluntil(int);
  43. static int bldcclass(void);
  44. static jmp_buf regkaboom;
  45. static void
  46. rcerror(char *s)
  47. {
  48. errors++;
  49. regerror(s);
  50. longjmp(regkaboom, 1);
  51. }
  52. static Reinst*
  53. newinst(int t)
  54. {
  55. freep->type = t;
  56. freep->left = 0;
  57. freep->right = 0;
  58. return freep++;
  59. }
  60. static void
  61. operand(int t)
  62. {
  63. Reinst *i;
  64. if(lastwasand)
  65. operator(CAT); /* catenate is implicit */
  66. i = newinst(t);
  67. if(t == CCLASS || t == NCCLASS)
  68. i->cp = yyclassp;
  69. if(t == RUNE)
  70. i->r = yyrune;
  71. pushand(i, i);
  72. lastwasand = TRUE;
  73. }
  74. static void
  75. operator(int t)
  76. {
  77. if(t==RBRA && --nbra<0)
  78. rcerror("unmatched right paren");
  79. if(t==LBRA){
  80. if(++cursubid >= NSUBEXP)
  81. rcerror ("too many subexpressions");
  82. nbra++;
  83. if(lastwasand)
  84. operator(CAT);
  85. } else
  86. evaluntil(t);
  87. if(t != RBRA)
  88. pushator(t);
  89. lastwasand = FALSE;
  90. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  91. lastwasand = TRUE; /* these look like operands */
  92. }
  93. static void
  94. regerr2(char *s, int c)
  95. {
  96. char buf[100];
  97. char *cp = buf;
  98. while(*s)
  99. *cp++ = *s++;
  100. *cp++ = c;
  101. *cp = '\0';
  102. rcerror(buf);
  103. }
  104. static void
  105. cant(char *s)
  106. {
  107. char buf[100];
  108. strcpy(buf, "can't happen: ");
  109. strcat(buf, s);
  110. rcerror(buf);
  111. }
  112. static void
  113. pushand(Reinst *f, Reinst *l)
  114. {
  115. if(andp >= &andstack[NSTACK])
  116. cant("operand stack overflow");
  117. andp->first = f;
  118. andp->last = l;
  119. andp++;
  120. }
  121. static void
  122. pushator(int t)
  123. {
  124. if(atorp >= &atorstack[NSTACK])
  125. cant("operator stack overflow");
  126. *atorp++ = t;
  127. *subidp++ = cursubid;
  128. }
  129. static Node*
  130. popand(int op)
  131. {
  132. Reinst *inst;
  133. if(andp <= &andstack[0]){
  134. regerr2("missing operand for ", op);
  135. inst = newinst(NOP);
  136. pushand(inst,inst);
  137. }
  138. return --andp;
  139. }
  140. static int
  141. popator(void)
  142. {
  143. if(atorp <= &atorstack[0])
  144. cant("operator stack underflow");
  145. --subidp;
  146. return *--atorp;
  147. }
  148. static void
  149. evaluntil(int pri)
  150. {
  151. Node *op1, *op2;
  152. Reinst *inst1, *inst2;
  153. while(pri==RBRA || atorp[-1]>=pri){
  154. switch(popator()){
  155. default:
  156. rcerror("unknown operator in evaluntil");
  157. break;
  158. case LBRA: /* must have been RBRA */
  159. op1 = popand('(');
  160. inst2 = newinst(RBRA);
  161. inst2->subid = *subidp;
  162. op1->last->next = inst2;
  163. inst1 = newinst(LBRA);
  164. inst1->subid = *subidp;
  165. inst1->next = op1->first;
  166. pushand(inst1, inst2);
  167. return;
  168. case OR:
  169. op2 = popand('|');
  170. op1 = popand('|');
  171. inst2 = newinst(NOP);
  172. op2->last->next = inst2;
  173. op1->last->next = inst2;
  174. inst1 = newinst(OR);
  175. inst1->right = op1->first;
  176. inst1->left = op2->first;
  177. pushand(inst1, inst2);
  178. break;
  179. case CAT:
  180. op2 = popand(0);
  181. op1 = popand(0);
  182. op1->last->next = op2->first;
  183. pushand(op1->first, op2->last);
  184. break;
  185. case STAR:
  186. op2 = popand('*');
  187. inst1 = newinst(OR);
  188. op2->last->next = inst1;
  189. inst1->right = op2->first;
  190. pushand(inst1, inst1);
  191. break;
  192. case PLUS:
  193. op2 = popand('+');
  194. inst1 = newinst(OR);
  195. op2->last->next = inst1;
  196. inst1->right = op2->first;
  197. pushand(op2->first, inst1);
  198. break;
  199. case QUEST:
  200. op2 = popand('?');
  201. inst1 = newinst(OR);
  202. inst2 = newinst(NOP);
  203. inst1->left = inst2;
  204. inst1->right = op2->first;
  205. op2->last->next = inst2;
  206. pushand(inst1, inst2);
  207. break;
  208. }
  209. }
  210. }
  211. static Reprog*
  212. optimize(Reprog *pp)
  213. {
  214. Reinst *inst, *target;
  215. int size;
  216. Reprog *npp;
  217. Reclass *cl;
  218. int diff;
  219. /*
  220. * get rid of NOOP chains
  221. */
  222. for(inst=pp->firstinst; inst->type!=END; inst++){
  223. target = inst->next;
  224. while(target->type == NOP)
  225. target = target->next;
  226. inst->next = target;
  227. }
  228. /*
  229. * The original allocation is for an area larger than
  230. * necessary. Reallocate to the actual space used
  231. * and then relocate the code.
  232. */
  233. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  234. npp = realloc(pp, size);
  235. if(npp==0 || npp==pp)
  236. return pp;
  237. diff = (char *)npp - (char *)pp;
  238. freep = (Reinst *)((char *)freep + diff);
  239. for(inst=npp->firstinst; inst<freep; inst++){
  240. switch(inst->type){
  241. case OR:
  242. case STAR:
  243. case PLUS:
  244. case QUEST:
  245. *(char **)&inst->right += diff;
  246. break;
  247. case CCLASS:
  248. case NCCLASS:
  249. *(char **)&inst->right += diff;
  250. cl = inst->cp;
  251. *(char **)&cl->end += diff;
  252. break;
  253. }
  254. *(char **)&inst->left += diff;
  255. }
  256. *(char **)&npp->startinst += diff;
  257. return npp;
  258. }
  259. #ifdef DEBUG
  260. static void
  261. dumpstack(void){
  262. Node *stk;
  263. int *ip;
  264. print("operators\n");
  265. for(ip=atorstack; ip<atorp; ip++)
  266. print("0%o\n", *ip);
  267. print("operands\n");
  268. for(stk=andstack; stk<andp; stk++)
  269. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  270. }
  271. static void
  272. dump(Reprog *pp)
  273. {
  274. Reinst *l;
  275. Rune *p;
  276. l = pp->firstinst;
  277. do{
  278. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  279. l->left-pp->firstinst, l->right-pp->firstinst);
  280. if(l->type == RUNE)
  281. print("\t%C\n", l->r);
  282. else if(l->type == CCLASS || l->type == NCCLASS){
  283. print("\t[");
  284. if(l->type == NCCLASS)
  285. print("^");
  286. for(p = l->cp->spans; p < l->cp->end; p += 2)
  287. if(p[0] == p[1])
  288. print("%C", p[0]);
  289. else
  290. print("%C-%C", p[0], p[1]);
  291. print("]\n");
  292. } else
  293. print("\n");
  294. }while(l++->type);
  295. }
  296. #endif
  297. static Reclass*
  298. newclass(void)
  299. {
  300. if(nclass >= nelem(reprog->class))
  301. rcerror("too many character classes; increase Reprog.class size");
  302. return &(classp[nclass++]);
  303. }
  304. static int
  305. nextc(Rune *rp)
  306. {
  307. if(lexdone){
  308. *rp = 0;
  309. return 1;
  310. }
  311. exprp += chartorune(rp, exprp);
  312. if(*rp == L'\\'){
  313. exprp += chartorune(rp, exprp);
  314. return 1;
  315. }
  316. if(*rp == 0)
  317. lexdone = 1;
  318. return 0;
  319. }
  320. static int
  321. lex(int literal, int dot_type)
  322. {
  323. int quoted;
  324. quoted = nextc(&yyrune);
  325. if(literal || quoted){
  326. if(yyrune == 0)
  327. return END;
  328. return RUNE;
  329. }
  330. switch(yyrune){
  331. case 0:
  332. return END;
  333. case L'*':
  334. return STAR;
  335. case L'?':
  336. return QUEST;
  337. case L'+':
  338. return PLUS;
  339. case L'|':
  340. return OR;
  341. case L'.':
  342. return dot_type;
  343. case L'(':
  344. return LBRA;
  345. case L')':
  346. return RBRA;
  347. case L'^':
  348. return BOL;
  349. case L'$':
  350. return EOL;
  351. case L'[':
  352. return bldcclass();
  353. }
  354. return RUNE;
  355. }
  356. static int
  357. bldcclass(void)
  358. {
  359. int type;
  360. Rune r[NCCRUNE];
  361. Rune *p, *ep, *np;
  362. Rune rune;
  363. int quoted;
  364. /* we have already seen the '[' */
  365. type = CCLASS;
  366. yyclassp = newclass();
  367. /* look ahead for negation */
  368. /* SPECIAL CASE!!! negated classes don't match \n */
  369. ep = r;
  370. quoted = nextc(&rune);
  371. if(!quoted && rune == L'^'){
  372. type = NCCLASS;
  373. quoted = nextc(&rune);
  374. *ep++ = L'\n';
  375. *ep++ = L'\n';
  376. }
  377. /* parse class into a set of spans */
  378. while(ep < &r[NCCRUNE-1]){
  379. if(rune == 0){
  380. rcerror("malformed '[]'");
  381. return 0;
  382. }
  383. if(!quoted && rune == L']')
  384. break;
  385. if(!quoted && rune == L'-'){
  386. if(ep == r){
  387. rcerror("malformed '[]'");
  388. return 0;
  389. }
  390. quoted = nextc(&rune);
  391. if((!quoted && rune == L']') || rune == 0){
  392. rcerror("malformed '[]'");
  393. return 0;
  394. }
  395. *(ep-1) = rune;
  396. } else {
  397. *ep++ = rune;
  398. *ep++ = rune;
  399. }
  400. quoted = nextc(&rune);
  401. }
  402. if(ep >= &r[NCCRUNE-1]) {
  403. rcerror("char class too large; increase Reclass.spans size");
  404. return 0;
  405. }
  406. /* sort on span start */
  407. for(p = r; p < ep; p += 2){
  408. for(np = p; np < ep; np += 2)
  409. if(*np < *p){
  410. rune = np[0];
  411. np[0] = p[0];
  412. p[0] = rune;
  413. rune = np[1];
  414. np[1] = p[1];
  415. p[1] = rune;
  416. }
  417. }
  418. /* merge spans */
  419. np = yyclassp->spans;
  420. p = r;
  421. if(r == ep)
  422. yyclassp->end = np;
  423. else {
  424. np[0] = *p++;
  425. np[1] = *p++;
  426. for(; p < ep; p += 2)
  427. /* overlapping or adjacent ranges? */
  428. if(p[0] <= np[1] + 1){
  429. if(p[1] >= np[1])
  430. np[1] = p[1]; /* coalesce */
  431. } else {
  432. np += 2;
  433. np[0] = p[0];
  434. np[1] = p[1];
  435. }
  436. yyclassp->end = np+2;
  437. }
  438. return type;
  439. }
  440. static Reprog*
  441. regcomp1(char *s, int literal, int dot_type)
  442. {
  443. int token;
  444. Reprog *pp;
  445. /* get memory for the program */
  446. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  447. if(pp == 0){
  448. regerror("out of memory");
  449. return 0;
  450. }
  451. freep = pp->firstinst;
  452. classp = pp->class;
  453. errors = 0;
  454. if(setjmp(regkaboom))
  455. goto out;
  456. /* go compile the sucker */
  457. lexdone = 0;
  458. exprp = s;
  459. nclass = 0;
  460. nbra = 0;
  461. atorp = atorstack;
  462. andp = andstack;
  463. subidp = subidstack;
  464. lastwasand = FALSE;
  465. cursubid = 0;
  466. /* Start with a low priority operator to prime parser */
  467. pushator(START-1);
  468. while((token = lex(literal, dot_type)) != END){
  469. if((token&0300) == OPERATOR)
  470. operator(token);
  471. else
  472. operand(token);
  473. }
  474. /* Close with a low priority operator */
  475. evaluntil(START);
  476. /* Force END */
  477. operand(END);
  478. evaluntil(START);
  479. #ifdef DEBUG
  480. dumpstack();
  481. #endif
  482. if(nbra)
  483. rcerror("unmatched left paren");
  484. --andp; /* points to first and only operand */
  485. pp->startinst = andp->first;
  486. #ifdef DEBUG
  487. dump(pp);
  488. #endif
  489. pp = optimize(pp);
  490. #ifdef DEBUG
  491. print("start: %d\n", andp->first-pp->firstinst);
  492. dump(pp);
  493. #endif
  494. out:
  495. if(errors){
  496. free(pp);
  497. pp = 0;
  498. }
  499. return pp;
  500. }
  501. extern Reprog*
  502. regcomp(char *s)
  503. {
  504. return regcomp1(s, 0, ANY);
  505. }
  506. extern Reprog*
  507. regcomplit(char *s)
  508. {
  509. return regcomp1(s, 1, ANY);
  510. }
  511. extern Reprog*
  512. regcompnl(char *s)
  513. {
  514. return regcomp1(s, 0, ANYNL);
  515. }