regexec.c 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. #include <u.h>
  2. #include <libc.h>
  3. #include "regexp.h"
  4. #include "regcomp.h"
  5. /*
  6. * return 0 if no match
  7. * >0 if a match
  8. * <0 if we ran out of _relist space
  9. */
  10. static int
  11. regexec1(Reprog *progp, /* program to run */
  12. char *bol, /* string to run machine on */
  13. Resub *mp, /* subexpression elements */
  14. int ms, /* number of elements at mp */
  15. Reljunk *j
  16. )
  17. {
  18. int flag=0;
  19. Reinst *inst;
  20. Relist *tlp;
  21. char *s;
  22. int i, checkstart;
  23. Rune r, *rp, *ep;
  24. int n;
  25. Relist* tl; /* This list, next list */
  26. Relist* nl;
  27. Relist* tle; /* ends of this and next list */
  28. Relist* nle;
  29. int match;
  30. char *p;
  31. match = 0;
  32. checkstart = j->starttype;
  33. if(mp)
  34. for(i=0; i<ms; i++) {
  35. mp[i].sp = 0;
  36. mp[i].ep = 0;
  37. }
  38. j->relist[0][0].inst = 0;
  39. j->relist[1][0].inst = 0;
  40. /* Execute machine once for each character, including terminal NUL */
  41. s = j->starts;
  42. do{
  43. /* fast check for first char */
  44. if(checkstart) {
  45. switch(j->starttype) {
  46. case RUNE:
  47. p = utfrune(s, j->startchar);
  48. if(p == 0 || s == j->eol)
  49. return match;
  50. s = p;
  51. break;
  52. case BOL:
  53. if(s == bol)
  54. break;
  55. p = utfrune(s, '\n');
  56. if(p == 0 || s == j->eol)
  57. return match;
  58. s = p;
  59. break;
  60. }
  61. }
  62. r = *(uchar*)s;
  63. if(r < Runeself)
  64. n = 1;
  65. else
  66. n = chartorune(&r, s);
  67. /* switch run lists */
  68. tl = j->relist[flag];
  69. tle = j->reliste[flag];
  70. nl = j->relist[flag^=1];
  71. nle = j->reliste[flag];
  72. nl->inst = 0;
  73. /* Add first instruction to current list */
  74. if(match == 0)
  75. _renewemptythread(tl, progp->startinst, ms, s);
  76. /* Execute machine until current list is empty */
  77. for(tlp=tl; tlp->inst; tlp++){ /* assignment = */
  78. for(inst = tlp->inst; ; inst = inst->next){
  79. switch(inst->type){
  80. case RUNE: /* regular character */
  81. if(inst->r == r){
  82. if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
  83. return -1;
  84. }
  85. break;
  86. case LBRA:
  87. tlp->se.m[inst->subid].sp = s;
  88. continue;
  89. case RBRA:
  90. tlp->se.m[inst->subid].ep = s;
  91. continue;
  92. case ANY:
  93. if(r != '\n')
  94. if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
  95. return -1;
  96. break;
  97. case ANYNL:
  98. if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
  99. return -1;
  100. break;
  101. case BOL:
  102. if(s == bol || *(s-1) == '\n')
  103. continue;
  104. break;
  105. case EOL:
  106. if(s == j->eol || r == 0 || r == '\n')
  107. continue;
  108. break;
  109. case CCLASS:
  110. ep = inst->cp->end;
  111. for(rp = inst->cp->spans; rp < ep; rp += 2)
  112. if(r >= rp[0] && r <= rp[1]){
  113. if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
  114. return -1;
  115. break;
  116. }
  117. break;
  118. case NCCLASS:
  119. ep = inst->cp->end;
  120. for(rp = inst->cp->spans; rp < ep; rp += 2)
  121. if(r >= rp[0] && r <= rp[1])
  122. break;
  123. if(rp == ep)
  124. if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
  125. return -1;
  126. break;
  127. case OR:
  128. /* evaluate right choice later */
  129. if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle)
  130. return -1;
  131. /* efficiency: advance and re-evaluate */
  132. continue;
  133. case END: /* Match! */
  134. match = 1;
  135. tlp->se.m[0].ep = s;
  136. if(mp != 0)
  137. _renewmatch(mp, ms, &tlp->se);
  138. break;
  139. }
  140. break;
  141. }
  142. }
  143. if(s == j->eol)
  144. break;
  145. checkstart = j->starttype && nl->inst==0;
  146. s += n;
  147. }while(r);
  148. return match;
  149. }
  150. static int
  151. regexec2(Reprog *progp, /* program to run */
  152. char *bol, /* string to run machine on */
  153. Resub *mp, /* subexpression elements */
  154. int ms, /* number of elements at mp */
  155. Reljunk *j
  156. )
  157. {
  158. int rv;
  159. Relist *relist0, *relist1;
  160. /* mark space */
  161. relist0 = malloc(BIGLISTSIZE*sizeof(Relist));
  162. if(relist0 == nil)
  163. return -1;
  164. relist1 = malloc(BIGLISTSIZE*sizeof(Relist));
  165. if(relist1 == nil){
  166. free(relist1);
  167. return -1;
  168. }
  169. j->relist[0] = relist0;
  170. j->relist[1] = relist1;
  171. j->reliste[0] = relist0 + BIGLISTSIZE - 2;
  172. j->reliste[1] = relist1 + BIGLISTSIZE - 2;
  173. rv = regexec1(progp, bol, mp, ms, j);
  174. free(relist0);
  175. free(relist1);
  176. return rv;
  177. }
  178. extern int
  179. regexec(Reprog *progp, /* program to run */
  180. char *bol, /* string to run machine on */
  181. Resub *mp, /* subexpression elements */
  182. int ms) /* number of elements at mp */
  183. {
  184. Reljunk j;
  185. Relist relist0[LISTSIZE], relist1[LISTSIZE];
  186. int rv;
  187. /*
  188. * use user-specified starting/ending location if specified
  189. */
  190. j.starts = bol;
  191. j.eol = 0;
  192. if(mp && ms>0){
  193. if(mp->sp)
  194. j.starts = mp->sp;
  195. if(mp->ep)
  196. j.eol = mp->ep;
  197. }
  198. j.starttype = 0;
  199. j.startchar = 0;
  200. if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
  201. j.starttype = RUNE;
  202. j.startchar = progp->startinst->r;
  203. }
  204. if(progp->startinst->type == BOL)
  205. j.starttype = BOL;
  206. /* mark space */
  207. j.relist[0] = relist0;
  208. j.relist[1] = relist1;
  209. j.reliste[0] = relist0 + nelem(relist0) - 2;
  210. j.reliste[1] = relist1 + nelem(relist1) - 2;
  211. rv = regexec1(progp, bol, mp, ms, &j);
  212. if(rv >= 0)
  213. return rv;
  214. rv = regexec2(progp, bol, mp, ms, &j);
  215. if(rv >= 0)
  216. return rv;
  217. return -1;
  218. }