regcomp.c 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. /* From libregexp but leaks extra classes when it runs out */
  2. #include <u.h>
  3. #include <libc.h>
  4. #include "regexp.h"
  5. #include "/sys/src/libregexp/regcomp.h"
  6. #define TRUE 1
  7. #define FALSE 0
  8. /*
  9. * Parser Information
  10. */
  11. typedef
  12. struct Node
  13. {
  14. Reinst* first;
  15. Reinst* last;
  16. }Node;
  17. #define NSTACK 20
  18. static Node andstack[NSTACK];
  19. static Node *andp;
  20. static int atorstack[NSTACK];
  21. static int* atorp;
  22. static int cursubid; /* id of current subexpression */
  23. static int subidstack[NSTACK]; /* parallel to atorstack */
  24. static int* subidp;
  25. static int lastwasand; /* Last token was operand */
  26. static int nbra;
  27. static char* exprp; /* pointer to next character in source expression */
  28. static int lexdone;
  29. static int nclass;
  30. static Reclass*classp;
  31. static Reinst* freep;
  32. static int errors;
  33. static Rune yyrune; /* last lex'd rune */
  34. static Reclass*yyclassp; /* last lex'd class */
  35. /* predeclared crap */
  36. static void operator(int);
  37. static void pushand(Reinst*, Reinst*);
  38. static void pushator(int);
  39. static void evaluntil(int);
  40. static int bldcclass(void);
  41. static jmp_buf regkaboom;
  42. static void
  43. rcerror(char *s)
  44. {
  45. errors++;
  46. regerror(s);
  47. longjmp(regkaboom, 1);
  48. }
  49. static Reinst*
  50. newinst(int t)
  51. {
  52. freep->type = t;
  53. freep->left = 0;
  54. freep->right = 0;
  55. return freep++;
  56. }
  57. static void
  58. operand(int t)
  59. {
  60. Reinst *i;
  61. if(lastwasand)
  62. operator(CAT); /* catenate is implicit */
  63. i = newinst(t);
  64. if(t == CCLASS || t == NCCLASS)
  65. i->cp = yyclassp;
  66. if(t == RUNE)
  67. i->r = yyrune;
  68. pushand(i, i);
  69. lastwasand = TRUE;
  70. }
  71. static void
  72. operator(int t)
  73. {
  74. if(t==RBRA && --nbra<0)
  75. rcerror("unmatched right paren");
  76. if(t==LBRA){
  77. if(++cursubid >= NSUBEXP)
  78. rcerror ("too many subexpressions");
  79. nbra++;
  80. if(lastwasand)
  81. operator(CAT);
  82. } else
  83. evaluntil(t);
  84. if(t != RBRA)
  85. pushator(t);
  86. lastwasand = FALSE;
  87. if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
  88. lastwasand = TRUE; /* these look like operands */
  89. }
  90. static void
  91. regerr2(char *s, int c)
  92. {
  93. char buf[100];
  94. char *cp = buf;
  95. while(*s)
  96. *cp++ = *s++;
  97. *cp++ = c;
  98. *cp = '\0';
  99. rcerror(buf);
  100. }
  101. static void
  102. cant(char *s)
  103. {
  104. char buf[100];
  105. strcpy(buf, "can't happen: ");
  106. strcat(buf, s);
  107. rcerror(buf);
  108. }
  109. static void
  110. pushand(Reinst *f, Reinst *l)
  111. {
  112. if(andp >= &andstack[NSTACK])
  113. cant("operand stack overflow");
  114. andp->first = f;
  115. andp->last = l;
  116. andp++;
  117. }
  118. static void
  119. pushator(int t)
  120. {
  121. if(atorp >= &atorstack[NSTACK])
  122. cant("operator stack overflow");
  123. *atorp++ = t;
  124. *subidp++ = cursubid;
  125. }
  126. static Node*
  127. popand(int op)
  128. {
  129. Reinst *inst;
  130. if(andp <= &andstack[0]){
  131. regerr2("missing operand for ", op);
  132. inst = newinst(NOP);
  133. pushand(inst,inst);
  134. }
  135. return --andp;
  136. }
  137. static int
  138. popator(void)
  139. {
  140. if(atorp <= &atorstack[0])
  141. cant("operator stack underflow");
  142. --subidp;
  143. return *--atorp;
  144. }
  145. static void
  146. evaluntil(int pri)
  147. {
  148. Node *op1, *op2;
  149. Reinst *inst1, *inst2;
  150. while(pri==RBRA || atorp[-1]>=pri){
  151. switch(popator()){
  152. default:
  153. rcerror("unknown operator in evaluntil");
  154. break;
  155. case LBRA: /* must have been RBRA */
  156. op1 = popand('(');
  157. inst2 = newinst(RBRA);
  158. inst2->subid = *subidp;
  159. op1->last->next = inst2;
  160. inst1 = newinst(LBRA);
  161. inst1->subid = *subidp;
  162. inst1->next = op1->first;
  163. pushand(inst1, inst2);
  164. return;
  165. case OR:
  166. op2 = popand('|');
  167. op1 = popand('|');
  168. inst2 = newinst(NOP);
  169. op2->last->next = inst2;
  170. op1->last->next = inst2;
  171. inst1 = newinst(OR);
  172. inst1->right = op1->first;
  173. inst1->left = op2->first;
  174. pushand(inst1, inst2);
  175. break;
  176. case CAT:
  177. op2 = popand(0);
  178. op1 = popand(0);
  179. op1->last->next = op2->first;
  180. pushand(op1->first, op2->last);
  181. break;
  182. case STAR:
  183. op2 = popand('*');
  184. inst1 = newinst(OR);
  185. op2->last->next = inst1;
  186. inst1->right = op2->first;
  187. pushand(inst1, inst1);
  188. break;
  189. case PLUS:
  190. op2 = popand('+');
  191. inst1 = newinst(OR);
  192. op2->last->next = inst1;
  193. inst1->right = op2->first;
  194. pushand(op2->first, inst1);
  195. break;
  196. case QUEST:
  197. op2 = popand('?');
  198. inst1 = newinst(OR);
  199. inst2 = newinst(NOP);
  200. inst1->left = inst2;
  201. inst1->right = op2->first;
  202. op2->last->next = inst2;
  203. pushand(inst1, inst2);
  204. break;
  205. }
  206. }
  207. }
  208. static Reprog*
  209. optimize(Reprog *pp)
  210. {
  211. Reinst *inst, *target;
  212. int size;
  213. Reprog *npp;
  214. Reclass *cl;
  215. int diff;
  216. /*
  217. * get rid of NOOP chains
  218. */
  219. for(inst=pp->firstinst; inst->type!=END; inst++){
  220. target = inst->next;
  221. while(target->type == NOP)
  222. target = target->next;
  223. inst->next = target;
  224. }
  225. /*
  226. * The original allocation is for an area larger than
  227. * necessary. Reallocate to the actual space used
  228. * and then relocate the code.
  229. */
  230. size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
  231. npp = realloc(pp, size);
  232. if(npp==0 || npp==pp)
  233. return pp;
  234. diff = (char *)npp - (char *)pp;
  235. freep = (Reinst *)((char *)freep + diff);
  236. for(inst=npp->firstinst; inst<freep; inst++){
  237. switch(inst->type){
  238. case OR:
  239. case STAR:
  240. case PLUS:
  241. case QUEST:
  242. *(char **)&inst->right += diff;
  243. break;
  244. case CCLASS:
  245. case NCCLASS:
  246. *(char **)&inst->right += diff;
  247. cl = inst->cp;
  248. *(char **)&cl->end += diff;
  249. break;
  250. }
  251. *(char **)&inst->left += diff;
  252. }
  253. *(char **)&npp->startinst += diff;
  254. return npp;
  255. }
  256. #ifdef DEBUG
  257. static void
  258. dumpstack(void){
  259. Node *stk;
  260. int *ip;
  261. print("operators\n");
  262. for(ip=atorstack; ip<atorp; ip++)
  263. print("0%o\n", *ip);
  264. print("operands\n");
  265. for(stk=andstack; stk<andp; stk++)
  266. print("0%o\t0%o\n", stk->first->type, stk->last->type);
  267. }
  268. static void
  269. dump(Reprog *pp)
  270. {
  271. Reinst *l;
  272. Rune *p;
  273. l = pp->firstinst;
  274. do{
  275. print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
  276. l->left-pp->firstinst, l->right-pp->firstinst);
  277. if(l->type == RUNE)
  278. print("\t%C\n", l->r);
  279. else if(l->type == CCLASS || l->type == NCCLASS){
  280. print("\t[");
  281. if(l->type == NCCLASS)
  282. print("^");
  283. for(p = l->cp->spans; p < l->cp->end; p += 2)
  284. if(p[0] == p[1])
  285. print("%C", p[0]);
  286. else
  287. print("%C-%C", p[0], p[1]);
  288. print("]\n");
  289. } else
  290. print("\n");
  291. }while(l++->type);
  292. }
  293. #endif
  294. static Reclass*
  295. newclass(void)
  296. {
  297. if(nclass <= 0){
  298. classp = mallocz(128*sizeof(Reclass), 1);
  299. if(classp == nil)
  300. regerror("out of memory");
  301. nclass = 128;
  302. }
  303. return &classp[--nclass];
  304. }
  305. static int
  306. nextc(Rune *rp)
  307. {
  308. if(lexdone){
  309. *rp = 0;
  310. return 1;
  311. }
  312. exprp += chartorune(rp, exprp);
  313. if(*rp == L'\\'){
  314. exprp += chartorune(rp, exprp);
  315. return 1;
  316. }
  317. if(*rp == 0)
  318. lexdone = 1;
  319. return 0;
  320. }
  321. static int
  322. lex(int literal, int dot_type)
  323. {
  324. int quoted;
  325. quoted = nextc(&yyrune);
  326. if(literal || quoted){
  327. if(yyrune == 0)
  328. return END;
  329. return RUNE;
  330. }
  331. switch(yyrune){
  332. case 0:
  333. return END;
  334. case L'*':
  335. return STAR;
  336. case L'?':
  337. return QUEST;
  338. case L'+':
  339. return PLUS;
  340. case L'|':
  341. return OR;
  342. case L'.':
  343. return dot_type;
  344. case L'(':
  345. return LBRA;
  346. case L')':
  347. return RBRA;
  348. case L'^':
  349. return BOL;
  350. case L'$':
  351. return EOL;
  352. case L'[':
  353. return bldcclass();
  354. }
  355. return RUNE;
  356. }
  357. static int
  358. bldcclass(void)
  359. {
  360. int type;
  361. Rune r[NCCRUNE];
  362. Rune *p, *ep, *np;
  363. Rune rune;
  364. int quoted;
  365. /* we have already seen the '[' */
  366. type = CCLASS;
  367. yyclassp = newclass();
  368. /* look ahead for negation */
  369. /* SPECIAL CASE!!! negated classes don't match \n */
  370. ep = r;
  371. quoted = nextc(&rune);
  372. if(!quoted && rune == L'^'){
  373. type = NCCLASS;
  374. quoted = nextc(&rune);
  375. *ep++ = L'\n';
  376. *ep++ = L'\n';
  377. }
  378. /* parse class into a set of spans */
  379. for(; ep<&r[NCCRUNE];){
  380. if(rune == 0){
  381. rcerror("malformed '[]'");
  382. return 0;
  383. }
  384. if(!quoted && rune == L']')
  385. break;
  386. if(!quoted && rune == L'-'){
  387. if(ep == r){
  388. rcerror("malformed '[]'");
  389. return 0;
  390. }
  391. quoted = nextc(&rune);
  392. if((!quoted && rune == L']') || rune == 0){
  393. rcerror("malformed '[]'");
  394. return 0;
  395. }
  396. *(ep-1) = rune;
  397. } else {
  398. *ep++ = rune;
  399. *ep++ = rune;
  400. }
  401. quoted = nextc(&rune);
  402. }
  403. /* sort on span start */
  404. for(p = r; p < ep; p += 2){
  405. for(np = p; np < ep; np += 2)
  406. if(*np < *p){
  407. rune = np[0];
  408. np[0] = p[0];
  409. p[0] = rune;
  410. rune = np[1];
  411. np[1] = p[1];
  412. p[1] = rune;
  413. }
  414. }
  415. /* merge spans */
  416. np = yyclassp->spans;
  417. p = r;
  418. if(r == ep)
  419. yyclassp->end = np;
  420. else {
  421. np[0] = *p++;
  422. np[1] = *p++;
  423. for(; p < ep; p += 2)
  424. if(p[0] <= np[1]){
  425. if(p[1] > np[1])
  426. np[1] = p[1];
  427. } else {
  428. np += 2;
  429. np[0] = p[0];
  430. np[1] = p[1];
  431. }
  432. yyclassp->end = np+2;
  433. }
  434. return type;
  435. }
  436. static Reprog*
  437. regcomp1(char *s, int literal, int dot_type)
  438. {
  439. int token;
  440. Reprog *pp;
  441. /* get memory for the program */
  442. pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
  443. if(pp == 0){
  444. regerror("out of memory");
  445. return 0;
  446. }
  447. freep = pp->firstinst;
  448. classp = pp->class;
  449. errors = 0;
  450. if(setjmp(regkaboom))
  451. goto out;
  452. /* go compile the sucker */
  453. lexdone = 0;
  454. exprp = s;
  455. nclass = NCLASS;
  456. nbra = 0;
  457. atorp = atorstack;
  458. andp = andstack;
  459. subidp = subidstack;
  460. lastwasand = FALSE;
  461. cursubid = 0;
  462. /* Start with a low priority operator to prime parser */
  463. pushator(START-1);
  464. while((token = lex(literal, dot_type)) != END){
  465. if((token&0300) == OPERATOR)
  466. operator(token);
  467. else
  468. operand(token);
  469. }
  470. /* Close with a low priority operator */
  471. evaluntil(START);
  472. /* Force END */
  473. operand(END);
  474. evaluntil(START);
  475. #ifdef DEBUG
  476. dumpstack();
  477. #endif
  478. if(nbra)
  479. rcerror("unmatched left paren");
  480. --andp; /* points to first and only operand */
  481. pp->startinst = andp->first;
  482. #ifdef DEBUG
  483. dump(pp);
  484. #endif
  485. pp = optimize(pp);
  486. #ifdef DEBUG
  487. print("start: %d\n", andp->first-pp->firstinst);
  488. dump(pp);
  489. #endif
  490. out:
  491. if(errors){
  492. free(pp);
  493. pp = 0;
  494. }
  495. return pp;
  496. }
  497. extern Reprog*
  498. regcomp(char *s)
  499. {
  500. return regcomp1(s, 0, ANY);
  501. }
  502. extern Reprog*
  503. regcomplit(char *s)
  504. {
  505. return regcomp1(s, 1, ANY);
  506. }
  507. extern Reprog*
  508. regcompnl(char *s)
  509. {
  510. return regcomp1(s, 0, ANYNL);
  511. }