regcomp.c 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <setjmp.h>
  4. #include <string.h>
  5. #include "regexp.h"
  6. #include "regcomp.h"
  7. #define TRUE 1
  8. #define FALSE 0
  9. /*
  10. * Parser Information
  11. */
  12. typedef
  13. struct Node
  14. {
  15. Reinst* first;
  16. Reinst* last;
  17. }Node;
  18. #define NSTACK 20
  19. static Node andstack[NSTACK];
  20. static Node *andp;
  21. static int atorstack[NSTACK];
  22. static int* atorp;
  23. static int cursubid; /* id of current subexpression */
  24. static int subidstack[NSTACK]; /* parallel to atorstack */
  25. static int* subidp;
  26. static int lastwasand; /* Last token was operand */
  27. static int nbra;
  28. static char* exprp; /* pointer to next character in source expression */
  29. static int lexdone;
  30. static int nclass;
  31. static Reclass*classp;
  32. static Reinst* freep;
  33. static int errors;
  34. static wchar_t yyrune; /* last lex'd rune */
  35. static Reclass*yyclassp; /* last lex'd class */
  36. /* predeclared crap */
  37. static void operator(int);
  38. static void pushand(Reinst*, Reinst*);
  39. static void pushator(int);
  40. static void evaluntil(int);
  41. static int bldcclass(void);
  42. static jmp_buf regkaboom;
  43. static void
  44. rcerror(char *s)
  45. {
  46. errors++;
  47. regerror(s);
  48. longjmp(regkaboom, 1);
  49. }
  50. static Reinst*
  51. newinst(int t)
  52. {
  53. freep->type = t;
  54. freep->l.left = 0;
  55. freep->r.right = 0;
  56. return freep++;
  57. }
  58. static void
  59. operand(int t)
  60. {
  61. Reinst *i;
  62. if(lastwasand)
  63. operator(CAT); /* catenate is implicit */
  64. i = newinst(t);
  65. if(t == CCLASS || t == NCCLASS)
  66. i->r.cp = yyclassp;
  67. if(t == RUNE)
  68. i->r.r = yyrune;
  69. pushand(i, i);
  70. lastwasand = TRUE;
  71. }
  72. static void
  73. operator(int t)
  74. {
  75. if(t==RBRA && --nbra<0)
  76. rcerror("unmatched right paren");
  77. if(t==LBRA){
  78. if(++cursubid >= NSUBEXP)
  79. rcerror ("too many subexpressions");
  80. nbra++;
  81. if(lastwasand)
  82. operator(CAT);
  83. } else
  84. evaluntil(t);
  85. if(t != RBRA)
  86. pushator(t);
  87. lastwasand = FALSE;
  88. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  89. lastwasand = TRUE; /* these look like operands */
  90. }
  91. static void
  92. regerr2(char *s, int c)
  93. {
  94. char buf[100];
  95. char *cp = buf;
  96. while(*s)
  97. *cp++ = *s++;
  98. *cp++ = c;
  99. *cp = '\0';
  100. rcerror(buf);
  101. }
  102. static void
  103. cant(char *s)
  104. {
  105. char buf[100];
  106. strcpy(buf, "can't happen: ");
  107. strcat(buf, s);
  108. rcerror(buf);
  109. }
  110. static void
  111. pushand(Reinst *f, Reinst *l)
  112. {
  113. if(andp >= &andstack[NSTACK])
  114. cant("operand stack overflow");
  115. andp->first = f;
  116. andp->last = l;
  117. andp++;
  118. }
  119. static void
  120. pushator(int t)
  121. {
  122. if(atorp >= &atorstack[NSTACK])
  123. cant("operator stack overflow");
  124. *atorp++ = t;
  125. *subidp++ = cursubid;
  126. }
  127. static Node*
  128. popand(int op)
  129. {
  130. Reinst *inst;
  131. if(andp <= &andstack[0]){
  132. regerr2("missing operand for ", op);
  133. inst = newinst(NOP);
  134. pushand(inst,inst);
  135. }
  136. return --andp;
  137. }
  138. static int
  139. popator(void)
  140. {
  141. if(atorp <= &atorstack[0])
  142. cant("operator stack underflow");
  143. --subidp;
  144. return *--atorp;
  145. }
  146. static void
  147. evaluntil(int pri)
  148. {
  149. Node *op1, *op2;
  150. Reinst *inst1, *inst2;
  151. while(pri==RBRA || atorp[-1]>=pri){
  152. switch(popator()){
  153. default:
  154. rcerror("unknown operator in evaluntil");
  155. break;
  156. case LBRA: /* must have been RBRA */
  157. op1 = popand('(');
  158. inst2 = newinst(RBRA);
  159. inst2->r.subid = *subidp;
  160. op1->last->l.next = inst2;
  161. inst1 = newinst(LBRA);
  162. inst1->r.subid = *subidp;
  163. inst1->l.next = op1->first;
  164. pushand(inst1, inst2);
  165. return;
  166. case OR:
  167. op2 = popand('|');
  168. op1 = popand('|');
  169. inst2 = newinst(NOP);
  170. op2->last->l.next = inst2;
  171. op1->last->l.next = inst2;
  172. inst1 = newinst(OR);
  173. inst1->r.right = op1->first;
  174. inst1->l.left = op2->first;
  175. pushand(inst1, inst2);
  176. break;
  177. case CAT:
  178. op2 = popand(0);
  179. op1 = popand(0);
  180. op1->last->l.next = op2->first;
  181. pushand(op1->first, op2->last);
  182. break;
  183. case STAR:
  184. op2 = popand('*');
  185. inst1 = newinst(OR);
  186. op2->last->l.next = inst1;
  187. inst1->r.right = op2->first;
  188. pushand(inst1, inst1);
  189. break;
  190. case PLUS:
  191. op2 = popand('+');
  192. inst1 = newinst(OR);
  193. op2->last->l.next = inst1;
  194. inst1->r.right = op2->first;
  195. pushand(op2->first, inst1);
  196. break;
  197. case QUEST:
  198. op2 = popand('?');
  199. inst1 = newinst(OR);
  200. inst2 = newinst(NOP);
  201. inst1->l.left = inst2;
  202. inst1->r.right = op2->first;
  203. op2->last->l.next = inst2;
  204. pushand(inst1, inst2);
  205. break;
  206. }
  207. }
  208. }
  209. static Reprog*
  210. optimize(Reprog *pp)
  211. {
  212. Reinst *inst, *target;
  213. int size;
  214. Reprog *npp;
  215. int diff;
  216. /*
  217. * get rid of NOOP chains
  218. */
  219. for(inst=pp->firstinst; inst->type!=END; inst++){
  220. target = inst->l.next;
  221. while(target->type == NOP)
  222. target = target->l.next;
  223. inst->l.next = target;
  224. }
  225. /*
  226. * The original allocation is for an area larger than
  227. * necessary. Reallocate to the actual space used
  228. * and then relocate the code.
  229. */
  230. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  231. npp = realloc(pp, size);
  232. if(npp==0 || npp==pp)
  233. return pp;
  234. diff = (char *)npp - (char *)pp;
  235. freep = (Reinst *)((char *)freep + diff);
  236. for(inst=npp->firstinst; inst<freep; inst++){
  237. switch(inst->type){
  238. case OR:
  239. case STAR:
  240. case PLUS:
  241. case QUEST:
  242. case CCLASS:
  243. case NCCLASS:
  244. *(char **)&inst->r.right += diff;
  245. break;
  246. }
  247. *(char **)&inst->l.left += diff;
  248. }
  249. *(char **)&npp->startinst += diff;
  250. return npp;
  251. }
  252. #ifdef DEBUG
  253. static void
  254. dumpstack(void){
  255. Node *stk;
  256. int *ip;
  257. print("operators\n");
  258. for(ip=atorstack; ip<atorp; ip++)
  259. print("0%o\n", *ip);
  260. print("operands\n");
  261. for(stk=andstack; stk<andp; stk++)
  262. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  263. }
  264. static void
  265. dump(Reprog *pp)
  266. {
  267. Reinst *l;
  268. wchar_t *p;
  269. l = pp->firstinst;
  270. do{
  271. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  272. l->l.left-pp->firstinst, l->l.right-pp->firstinst);
  273. if(l->type == RUNE)
  274. print("\t%C\n", l->r);
  275. else if(l->type == CCLASS || l->type == NCCLASS){
  276. print("\t[");
  277. if(l->type == NCCLASS)
  278. print("^");
  279. for(p = l->r.cp->spans; p < l->r.cp->end; p += 2)
  280. if(p[0] == p[1])
  281. print("%C", p[0]);
  282. else
  283. print("%C-%C", p[0], p[1]);
  284. print("]\n");
  285. } else
  286. print("\n");
  287. }while(l++->type);
  288. }
  289. #endif
  290. static Reclass*
  291. newclass(void)
  292. {
  293. if(nclass >= NCLASS)
  294. regerr2("too many character classes; limit", NCLASS+'0');
  295. return &(classp[nclass++]);
  296. }
  297. static int
  298. nextc(wchar_t *rp)
  299. {
  300. int n;
  301. if(lexdone){
  302. *rp = 0;
  303. return 1;
  304. }
  305. n = mbtowc(rp, exprp, MB_CUR_MAX);
  306. if (n <= 0)
  307. n = 1;
  308. exprp += n;
  309. if(*rp == L'\\'){
  310. n = mbtowc(rp, exprp, MB_CUR_MAX);
  311. if (n <= 0)
  312. n = 1;
  313. exprp += n;
  314. return 1;
  315. }
  316. if(*rp == 0)
  317. lexdone = 1;
  318. return 0;
  319. }
  320. static int
  321. lex(int literal, int dot_type)
  322. {
  323. int quoted;
  324. quoted = nextc(&yyrune);
  325. if(literal || quoted){
  326. if(yyrune == 0)
  327. return END;
  328. return RUNE;
  329. }
  330. switch(yyrune){
  331. case 0:
  332. return END;
  333. case L'*':
  334. return STAR;
  335. case L'?':
  336. return QUEST;
  337. case L'+':
  338. return PLUS;
  339. case L'|':
  340. return OR;
  341. case L'.':
  342. return dot_type;
  343. case L'(':
  344. return LBRA;
  345. case L')':
  346. return RBRA;
  347. case L'^':
  348. return BOL;
  349. case L'$':
  350. return EOL;
  351. case L'[':
  352. return bldcclass();
  353. }
  354. return RUNE;
  355. }
  356. static int
  357. bldcclass(void)
  358. {
  359. int type;
  360. wchar_t r[NCCRUNE];
  361. wchar_t *p, *ep, *np;
  362. wchar_t rune;
  363. int quoted;
  364. /* we have already seen the '[' */
  365. type = CCLASS;
  366. yyclassp = newclass();
  367. /* look ahead for negation */
  368. ep = r;
  369. quoted = nextc(&rune);
  370. if(!quoted && rune == L'^'){
  371. type = NCCLASS;
  372. quoted = nextc(&rune);
  373. *ep++ = L'\n';
  374. *ep++ = L'\n';
  375. }
  376. /* parse class into a set of spans */
  377. for(; ep<&r[NCCRUNE];){
  378. if(rune == 0){
  379. rcerror("malformed '[]'");
  380. return 0;
  381. }
  382. if(!quoted && rune == L']')
  383. break;
  384. if(!quoted && rune == L'-'){
  385. if(ep == r){
  386. rcerror("malformed '[]'");
  387. return 0;
  388. }
  389. quoted = nextc(&rune);
  390. if((!quoted && rune == L']') || rune == 0){
  391. rcerror("malformed '[]'");
  392. return 0;
  393. }
  394. *(ep-1) = rune;
  395. } else {
  396. *ep++ = rune;
  397. *ep++ = rune;
  398. }
  399. quoted = nextc(&rune);
  400. }
  401. /* sort on span start */
  402. for(p = r; p < ep; p += 2){
  403. for(np = p; np < ep; np += 2)
  404. if(*np < *p){
  405. rune = np[0];
  406. np[0] = p[0];
  407. p[0] = rune;
  408. rune = np[1];
  409. np[1] = p[1];
  410. p[1] = rune;
  411. }
  412. }
  413. /* merge spans */
  414. np = yyclassp->spans;
  415. p = r;
  416. if(r == ep)
  417. yyclassp->end = np;
  418. else {
  419. np[0] = *p++;
  420. np[1] = *p++;
  421. for(; p < ep; p += 2)
  422. if(p[0] <= np[1]){
  423. if(p[1] > np[1])
  424. np[1] = p[1];
  425. } else {
  426. np += 2;
  427. np[0] = p[0];
  428. np[1] = p[1];
  429. }
  430. yyclassp->end = np+2;
  431. }
  432. return type;
  433. }
  434. static Reprog*
  435. regcomp1(char *s, int literal, int dot_type)
  436. {
  437. int token;
  438. Reprog *pp;
  439. /* get memory for the program */
  440. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  441. if(pp == 0){
  442. regerror("out of memory");
  443. return 0;
  444. }
  445. freep = pp->firstinst;
  446. classp = pp->class;
  447. errors = 0;
  448. if(setjmp(regkaboom))
  449. goto out;
  450. /* go compile the sucker */
  451. lexdone = 0;
  452. exprp = s;
  453. nclass = 0;
  454. nbra = 0;
  455. atorp = atorstack;
  456. andp = andstack;
  457. subidp = subidstack;
  458. lastwasand = FALSE;
  459. cursubid = 0;
  460. /* Start with a low priority operator to prime parser */
  461. pushator(START-1);
  462. while((token = lex(literal, dot_type)) != END){
  463. if((token&0300) == OPERATOR)
  464. operator(token);
  465. else
  466. operand(token);
  467. }
  468. /* Close with a low priority operator */
  469. evaluntil(START);
  470. /* Force END */
  471. operand(END);
  472. evaluntil(START);
  473. #ifdef DEBUG
  474. dumpstack();
  475. #endif
  476. if(nbra)
  477. rcerror("unmatched left paren");
  478. --andp; /* points to first and only operand */
  479. pp->startinst = andp->first;
  480. #ifdef DEBUG
  481. dump(pp);
  482. #endif
  483. pp = optimize(pp);
  484. #ifdef DEBUG
  485. print("start: %d\n", andp->first-pp->firstinst);
  486. dump(pp);
  487. #endif
  488. out:
  489. if(errors){
  490. free(pp);
  491. pp = 0;
  492. }
  493. return pp;
  494. }
  495. extern Reprog*
  496. regcomp(char *s)
  497. {
  498. return regcomp1(s, 0, ANY);
  499. }
  500. extern Reprog*
  501. regcomplit(char *s)
  502. {
  503. return regcomp1(s, 1, ANY);
  504. }
  505. extern Reprog*
  506. regcompnl(char *s)
  507. {
  508. return regcomp1(s, 0, ANYNL);
  509. }