comp.c 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. #include "grep.h"
  2. /*
  3. * incremental compiler.
  4. * add the branch c to the
  5. * state s.
  6. */
  7. void
  8. increment(State *s, int c)
  9. {
  10. int i;
  11. State *t, **tt;
  12. Re *re1, *re2;
  13. nfollow = 0;
  14. gen++;
  15. matched = 0;
  16. for(i=0; i<s->count; i++)
  17. fol1(s->re[i], c);
  18. qsort(follow, nfollow, sizeof(*follow), fcmp);
  19. for(tt=&state0; t = *tt;) {
  20. if(t->count > nfollow) {
  21. tt = &t->linkleft;
  22. goto cont;
  23. }
  24. if(t->count < nfollow) {
  25. tt = &t->linkright;
  26. goto cont;
  27. }
  28. for(i=0; i<nfollow; i++) {
  29. re1 = t->re[i];
  30. re2 = follow[i];
  31. if(re1 > re2) {
  32. tt = &t->linkleft;
  33. goto cont;
  34. }
  35. if(re1 < re2) {
  36. tt = &t->linkright;
  37. goto cont;
  38. }
  39. }
  40. if(!!matched && !t->match) {
  41. tt = &t->linkleft;
  42. goto cont;
  43. }
  44. if(!matched && !!t->match) {
  45. tt = &t->linkright;
  46. goto cont;
  47. }
  48. s->next[c] = t;
  49. return;
  50. cont:;
  51. }
  52. t = sal(nfollow);
  53. *tt = t;
  54. for(i=0; i<nfollow; i++) {
  55. re1 = follow[i];
  56. t->re[i] = re1;
  57. }
  58. s->next[c] = t;
  59. t->match = matched;
  60. }
  61. int
  62. fcmp(void *va, void *vb)
  63. {
  64. Re **aa, **bb;
  65. Re *a, *b;
  66. aa = va;
  67. bb = vb;
  68. a = *aa;
  69. b = *bb;
  70. if(a > b)
  71. return 1;
  72. if(a < b)
  73. return -1;
  74. return 0;
  75. }
  76. void
  77. fol1(Re *r, int c)
  78. {
  79. Re *r1;
  80. loop:
  81. if(r->gen == gen)
  82. return;
  83. if(nfollow >= maxfollow)
  84. error("nfollow");
  85. r->gen = gen;
  86. switch(r->type) {
  87. default:
  88. error("fol1");
  89. case Tcase:
  90. if(c >= 0 && c < 256)
  91. if(r1 = r->cases[c])
  92. follow[nfollow++] = r1;
  93. if(r = r->next)
  94. goto loop;
  95. break;
  96. case Talt:
  97. case Tor:
  98. fol1(r->alt, c);
  99. r = r->next;
  100. goto loop;
  101. case Tbegin:
  102. if(c == '\n' || c == Cbegin)
  103. follow[nfollow++] = r->next;
  104. break;
  105. case Tend:
  106. if(c == '\n')
  107. matched = 1;
  108. break;
  109. case Tclass:
  110. if(c >= r->lo && c <= r->hi)
  111. follow[nfollow++] = r->next;
  112. break;
  113. }
  114. }
  115. Rune tab1[] =
  116. {
  117. 0x007f,
  118. 0x07ff,
  119. };
  120. Rune tab2[] =
  121. {
  122. 0x003f,
  123. 0x0fff,
  124. };
  125. Re2
  126. rclass(Rune p0, Rune p1)
  127. {
  128. char xc0[6], xc1[6];
  129. int i, n, m;
  130. Re2 x;
  131. if(p0 > p1)
  132. return re2char(0xff, 0xff); // no match
  133. /*
  134. * bust range into same length
  135. * character sequences
  136. */
  137. for(i=0; i<nelem(tab1); i++) {
  138. m = tab1[i];
  139. if(p0 <= m && p1 > m)
  140. return re2or(rclass(p0, m), rclass(m+1, p1));
  141. }
  142. /*
  143. * bust range into part of a single page
  144. * or into full pages
  145. */
  146. for(i=0; i<nelem(tab2); i++) {
  147. m = tab2[i];
  148. if((p0 & ~m) != (p1 & ~m)) {
  149. if((p0 & m) != 0)
  150. return re2or(rclass(p0, p0|m), rclass((p0|m)+1, p1));
  151. if((p1 & m) != m)
  152. return re2or(rclass(p0, (p1&~m)-1), rclass(p1&~m, p1));
  153. }
  154. }
  155. n = runetochar(xc0, &p0);
  156. i = runetochar(xc1, &p1);
  157. if(i != n)
  158. error("length");
  159. x = re2char(xc0[0], xc1[0]);
  160. for(i=1; i<n; i++)
  161. x = re2cat(x, re2char(xc0[i], xc1[i]));
  162. return x;
  163. }
  164. int
  165. pcmp(void *va, void *vb)
  166. {
  167. int n;
  168. Rune *a, *b;
  169. a = va;
  170. b = vb;
  171. n = a[0] - b[0];
  172. if(n)
  173. return n;
  174. return a[1] - b[1];
  175. }
  176. /*
  177. * convert character chass into
  178. * run-pair ranges of matches.
  179. * this is 10646/utf specific and
  180. * needs to be changed for some
  181. * other input character set.
  182. * this is the key to a fast
  183. * regular search of characters
  184. * by looking at sequential bytes.
  185. */
  186. Re2
  187. re2class(char *s)
  188. {
  189. Rune pairs[200], *p, *q, ov;
  190. int nc;
  191. Re2 x;
  192. nc = 0;
  193. if(*s == '^') {
  194. nc = 1;
  195. s++;
  196. }
  197. p = pairs;
  198. s += chartorune(p, s);
  199. for(;;) {
  200. if(*p == '\\')
  201. s += chartorune(p, s);
  202. if(*p == 0)
  203. break;
  204. p[1] = *p;
  205. p += 2;
  206. s += chartorune(p, s);
  207. if(*p != '-')
  208. continue;
  209. s += chartorune(p, s);
  210. if(*p == '\\')
  211. s += chartorune(p, s);
  212. if(*p == 0)
  213. break;
  214. p[-1] = *p;
  215. s += chartorune(p, s);
  216. }
  217. *p = 0;
  218. qsort(pairs, (p-pairs)/2, 2*sizeof(*pairs), pcmp);
  219. q = pairs;
  220. for(p=pairs+2; *p; p+=2) {
  221. if(p[0] > p[1])
  222. continue;
  223. if(p[0] > q[1] || p[1] < q[0]) {
  224. q[2] = p[0];
  225. q[3] = p[1];
  226. q += 2;
  227. continue;
  228. }
  229. if(p[0] < q[0])
  230. q[0] = p[0];
  231. if(p[1] > q[1])
  232. q[1] = p[1];
  233. }
  234. q[2] = 0;
  235. p = pairs;
  236. if(nc) {
  237. x = rclass(0, p[0]-1);
  238. ov = p[1]+1;
  239. for(p+=2; *p; p+=2) {
  240. x = re2or(x, rclass(ov, p[0]-1));
  241. ov = p[1]+1;
  242. }
  243. x = re2or(x, rclass(ov, 0xffff));
  244. } else {
  245. x = rclass(p[0], p[1]);
  246. for(p+=2; *p; p+=2)
  247. x = re2or(x, rclass(p[0], p[1]));
  248. }
  249. return x;
  250. }