regcomp.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. #include <u.h>
  2. #include <libc.h>
  3. #include "regexp.h"
  4. #include "regcomp.h"
  5. #define TRUE 1
  6. #define FALSE 0
  7. /*
  8. * Parser Information
  9. */
  10. typedef
  11. struct Node
  12. {
  13. Reinst* first;
  14. Reinst* last;
  15. }Node;
  16. #define NSTACK 20
  17. static Node andstack[NSTACK];
  18. static Node *andp;
  19. static int atorstack[NSTACK];
  20. static int* atorp;
  21. static int cursubid; /* id of current subexpression */
  22. static int subidstack[NSTACK]; /* parallel to atorstack */
  23. static int* subidp;
  24. static int lastwasand; /* Last token was operand */
  25. static int nbra;
  26. static char* exprp; /* pointer to next character in source expression */
  27. static int lexdone;
  28. static int nclass;
  29. static Reclass*classp;
  30. static Reinst* freep;
  31. static int errors;
  32. static Rune yyrune; /* last lex'd rune */
  33. static Reclass*yyclassp; /* last lex'd class */
  34. /* predeclared crap */
  35. static void operator(int);
  36. static void pushand(Reinst*, Reinst*);
  37. static void pushator(int);
  38. static void evaluntil(int);
  39. static int bldcclass(void);
  40. static jmp_buf regkaboom;
  41. static void
  42. rcerror(char *s)
  43. {
  44. errors++;
  45. regerror(s);
  46. longjmp(regkaboom, 1);
  47. }
  48. static Reinst*
  49. newinst(int t)
  50. {
  51. freep->type = t;
  52. freep->left = 0;
  53. freep->right = 0;
  54. return freep++;
  55. }
  56. static void
  57. operand(int t)
  58. {
  59. Reinst *i;
  60. if(lastwasand)
  61. operator(CAT); /* catenate is implicit */
  62. i = newinst(t);
  63. if(t == CCLASS || t == NCCLASS)
  64. i->cp = yyclassp;
  65. if(t == RUNE)
  66. i->r = yyrune;
  67. pushand(i, i);
  68. lastwasand = TRUE;
  69. }
  70. static void
  71. operator(int t)
  72. {
  73. if(t==RBRA && --nbra<0)
  74. rcerror("unmatched right paren");
  75. if(t==LBRA){
  76. if(++cursubid >= NSUBEXP)
  77. rcerror ("too many subexpressions");
  78. nbra++;
  79. if(lastwasand)
  80. operator(CAT);
  81. } else
  82. evaluntil(t);
  83. if(t != RBRA)
  84. pushator(t);
  85. lastwasand = FALSE;
  86. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  87. lastwasand = TRUE; /* these look like operands */
  88. }
  89. static void
  90. regerr2(char *s, int c)
  91. {
  92. char buf[100];
  93. char *cp = buf;
  94. while(*s)
  95. *cp++ = *s++;
  96. *cp++ = c;
  97. *cp = '\0';
  98. rcerror(buf);
  99. }
  100. static void
  101. cant(char *s)
  102. {
  103. char buf[100];
  104. strcpy(buf, "can't happen: ");
  105. strcat(buf, s);
  106. rcerror(buf);
  107. }
  108. static void
  109. pushand(Reinst *f, Reinst *l)
  110. {
  111. if(andp >= &andstack[NSTACK])
  112. cant("operand stack overflow");
  113. andp->first = f;
  114. andp->last = l;
  115. andp++;
  116. }
  117. static void
  118. pushator(int t)
  119. {
  120. if(atorp >= &atorstack[NSTACK])
  121. cant("operator stack overflow");
  122. *atorp++ = t;
  123. *subidp++ = cursubid;
  124. }
  125. static Node*
  126. popand(int op)
  127. {
  128. Reinst *inst;
  129. if(andp <= &andstack[0]){
  130. regerr2("missing operand for ", op);
  131. inst = newinst(NOP);
  132. pushand(inst,inst);
  133. }
  134. return --andp;
  135. }
  136. static int
  137. popator(void)
  138. {
  139. if(atorp <= &atorstack[0])
  140. cant("operator stack underflow");
  141. --subidp;
  142. return *--atorp;
  143. }
  144. static void
  145. evaluntil(int pri)
  146. {
  147. Node *op1, *op2;
  148. Reinst *inst1, *inst2;
  149. while(pri==RBRA || atorp[-1]>=pri){
  150. switch(popator()){
  151. default:
  152. rcerror("unknown operator in evaluntil");
  153. break;
  154. case LBRA: /* must have been RBRA */
  155. op1 = popand('(');
  156. inst2 = newinst(RBRA);
  157. inst2->subid = *subidp;
  158. op1->last->next = inst2;
  159. inst1 = newinst(LBRA);
  160. inst1->subid = *subidp;
  161. inst1->next = op1->first;
  162. pushand(inst1, inst2);
  163. return;
  164. case OR:
  165. op2 = popand('|');
  166. op1 = popand('|');
  167. inst2 = newinst(NOP);
  168. op2->last->next = inst2;
  169. op1->last->next = inst2;
  170. inst1 = newinst(OR);
  171. inst1->right = op1->first;
  172. inst1->left = op2->first;
  173. pushand(inst1, inst2);
  174. break;
  175. case CAT:
  176. op2 = popand(0);
  177. op1 = popand(0);
  178. op1->last->next = op2->first;
  179. pushand(op1->first, op2->last);
  180. break;
  181. case STAR:
  182. op2 = popand('*');
  183. inst1 = newinst(OR);
  184. op2->last->next = inst1;
  185. inst1->right = op2->first;
  186. pushand(inst1, inst1);
  187. break;
  188. case PLUS:
  189. op2 = popand('+');
  190. inst1 = newinst(OR);
  191. op2->last->next = inst1;
  192. inst1->right = op2->first;
  193. pushand(op2->first, inst1);
  194. break;
  195. case QUEST:
  196. op2 = popand('?');
  197. inst1 = newinst(OR);
  198. inst2 = newinst(NOP);
  199. inst1->left = inst2;
  200. inst1->right = op2->first;
  201. op2->last->next = inst2;
  202. pushand(inst1, inst2);
  203. break;
  204. }
  205. }
  206. }
  207. static Reprog*
  208. optimize(Reprog *pp)
  209. {
  210. Reinst *inst, *target;
  211. int size;
  212. Reprog *npp;
  213. Reclass *cl;
  214. int diff;
  215. /*
  216. * get rid of NOOP chains
  217. */
  218. for(inst=pp->firstinst; inst->type!=END; inst++){
  219. target = inst->next;
  220. while(target->type == NOP)
  221. target = target->next;
  222. inst->next = target;
  223. }
  224. /*
  225. * The original allocation is for an area larger than
  226. * necessary. Reallocate to the actual space used
  227. * and then relocate the code.
  228. */
  229. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  230. npp = realloc(pp, size);
  231. if(npp==0 || npp==pp)
  232. return pp;
  233. diff = (char *)npp - (char *)pp;
  234. freep = (Reinst *)((char *)freep + diff);
  235. for(inst=npp->firstinst; inst<freep; inst++){
  236. switch(inst->type){
  237. case OR:
  238. case STAR:
  239. case PLUS:
  240. case QUEST:
  241. *(char **)&inst->right += diff;
  242. break;
  243. case CCLASS:
  244. case NCCLASS:
  245. *(char **)&inst->right += diff;
  246. cl = inst->cp;
  247. *(char **)&cl->end += diff;
  248. break;
  249. }
  250. *(char **)&inst->left += diff;
  251. }
  252. *(char **)&npp->startinst += diff;
  253. return npp;
  254. }
  255. #ifdef DEBUG
  256. static void
  257. dumpstack(void){
  258. Node *stk;
  259. int *ip;
  260. print("operators\n");
  261. for(ip=atorstack; ip<atorp; ip++)
  262. print("0%o\n", *ip);
  263. print("operands\n");
  264. for(stk=andstack; stk<andp; stk++)
  265. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  266. }
  267. static void
  268. dump(Reprog *pp)
  269. {
  270. Reinst *l;
  271. Rune *p;
  272. l = pp->firstinst;
  273. do{
  274. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  275. l->left-pp->firstinst, l->right-pp->firstinst);
  276. if(l->type == RUNE)
  277. print("\t%C\n", l->r);
  278. else if(l->type == CCLASS || l->type == NCCLASS){
  279. print("\t[");
  280. if(l->type == NCCLASS)
  281. print("^");
  282. for(p = l->cp->spans; p < l->cp->end; p += 2)
  283. if(p[0] == p[1])
  284. print("%C", p[0]);
  285. else
  286. print("%C-%C", p[0], p[1]);
  287. print("]\n");
  288. } else
  289. print("\n");
  290. }while(l++->type);
  291. }
  292. #endif
  293. static Reclass*
  294. newclass(void)
  295. {
  296. if(nclass >= NCLASS)
  297. regerr2("too many character classes; limit", NCLASS+'0');
  298. return &(classp[nclass++]);
  299. }
  300. static int
  301. nextc(Rune *rp)
  302. {
  303. if(lexdone){
  304. *rp = 0;
  305. return 1;
  306. }
  307. exprp += chartorune(rp, exprp);
  308. if(*rp == L'\\'){
  309. exprp += chartorune(rp, exprp);
  310. return 1;
  311. }
  312. if(*rp == 0)
  313. lexdone = 1;
  314. return 0;
  315. }
  316. static int
  317. lex(int literal, int dot_type)
  318. {
  319. int quoted;
  320. quoted = nextc(&yyrune);
  321. if(literal || quoted){
  322. if(yyrune == 0)
  323. return END;
  324. return RUNE;
  325. }
  326. switch(yyrune){
  327. case 0:
  328. return END;
  329. case L'*':
  330. return STAR;
  331. case L'?':
  332. return QUEST;
  333. case L'+':
  334. return PLUS;
  335. case L'|':
  336. return OR;
  337. case L'.':
  338. return dot_type;
  339. case L'(':
  340. return LBRA;
  341. case L')':
  342. return RBRA;
  343. case L'^':
  344. return BOL;
  345. case L'$':
  346. return EOL;
  347. case L'[':
  348. return bldcclass();
  349. }
  350. return RUNE;
  351. }
  352. static int
  353. bldcclass(void)
  354. {
  355. int type;
  356. Rune r[NCCRUNE];
  357. Rune *p, *ep, *np;
  358. Rune rune;
  359. int quoted;
  360. /* we have already seen the '[' */
  361. type = CCLASS;
  362. yyclassp = newclass();
  363. /* look ahead for negation */
  364. /* SPECIAL CASE!!! negated classes don't match \n */
  365. ep = r;
  366. quoted = nextc(&rune);
  367. if(!quoted && rune == L'^'){
  368. type = NCCLASS;
  369. quoted = nextc(&rune);
  370. *ep++ = L'\n';
  371. *ep++ = L'\n';
  372. }
  373. /* parse class into a set of spans */
  374. for(; ep<&r[NCCRUNE];){
  375. if(rune == 0){
  376. rcerror("malformed '[]'");
  377. return 0;
  378. }
  379. if(!quoted && rune == L']')
  380. break;
  381. if(!quoted && rune == L'-'){
  382. if(ep == r){
  383. rcerror("malformed '[]'");
  384. return 0;
  385. }
  386. quoted = nextc(&rune);
  387. if((!quoted && rune == L']') || rune == 0){
  388. rcerror("malformed '[]'");
  389. return 0;
  390. }
  391. *(ep-1) = rune;
  392. } else {
  393. *ep++ = rune;
  394. *ep++ = rune;
  395. }
  396. quoted = nextc(&rune);
  397. }
  398. /* sort on span start */
  399. for(p = r; p < ep; p += 2){
  400. for(np = p; np < ep; np += 2)
  401. if(*np < *p){
  402. rune = np[0];
  403. np[0] = p[0];
  404. p[0] = rune;
  405. rune = np[1];
  406. np[1] = p[1];
  407. p[1] = rune;
  408. }
  409. }
  410. /* merge spans */
  411. np = yyclassp->spans;
  412. p = r;
  413. if(r == ep)
  414. yyclassp->end = np;
  415. else {
  416. np[0] = *p++;
  417. np[1] = *p++;
  418. for(; p < ep; p += 2)
  419. if(p[0] <= np[1]){
  420. if(p[1] > np[1])
  421. np[1] = p[1];
  422. } else {
  423. np += 2;
  424. np[0] = p[0];
  425. np[1] = p[1];
  426. }
  427. yyclassp->end = np+2;
  428. }
  429. return type;
  430. }
  431. static Reprog*
  432. regcomp1(char *s, int literal, int dot_type)
  433. {
  434. int token;
  435. Reprog *pp;
  436. /* get memory for the program */
  437. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  438. if(pp == 0){
  439. regerror("out of memory");
  440. return 0;
  441. }
  442. freep = pp->firstinst;
  443. classp = pp->class;
  444. errors = 0;
  445. if(setjmp(regkaboom))
  446. goto out;
  447. /* go compile the sucker */
  448. lexdone = 0;
  449. exprp = s;
  450. nclass = 0;
  451. nbra = 0;
  452. atorp = atorstack;
  453. andp = andstack;
  454. subidp = subidstack;
  455. lastwasand = FALSE;
  456. cursubid = 0;
  457. /* Start with a low priority operator to prime parser */
  458. pushator(START-1);
  459. while((token = lex(literal, dot_type)) != END){
  460. if((token&0300) == OPERATOR)
  461. operator(token);
  462. else
  463. operand(token);
  464. }
  465. /* Close with a low priority operator */
  466. evaluntil(START);
  467. /* Force END */
  468. operand(END);
  469. evaluntil(START);
  470. #ifdef DEBUG
  471. dumpstack();
  472. #endif
  473. if(nbra)
  474. rcerror("unmatched left paren");
  475. --andp; /* points to first and only operand */
  476. pp->startinst = andp->first;
  477. #ifdef DEBUG
  478. dump(pp);
  479. #endif
  480. pp = optimize(pp);
  481. #ifdef DEBUG
  482. print("start: %d\n", andp->first-pp->firstinst);
  483. dump(pp);
  484. #endif
  485. out:
  486. if(errors){
  487. free(pp);
  488. pp = 0;
  489. }
  490. return pp;
  491. }
  492. extern Reprog*
  493. regcomp(char *s)
  494. {
  495. return regcomp1(s, 0, ANY);
  496. }
  497. extern Reprog*
  498. regcomplit(char *s)
  499. {
  500. return regcomp1(s, 1, ANY);
  501. }
  502. extern Reprog*
  503. regcompnl(char *s)
  504. {
  505. return regcomp1(s, 0, ANYNL);
  506. }