dict.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <regexp.h>
  5. #include <ctype.h>
  6. #include "dict.h"
  7. /*
  8. * Assumed index file structure: lines of form
  9. * [^\t]+\t[0-9]+
  10. * First field is key, second is byte offset into dictionary.
  11. * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
  12. */
  13. typedef struct Addr Addr;
  14. struct Addr {
  15. int n; /* number of offsets */
  16. int cur; /* current position within doff array */
  17. int maxn; /* actual current size of doff array */
  18. ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
  19. };
  20. Biobuf binbuf;
  21. Biobuf boutbuf;
  22. Biobuf *bin = &binbuf; /* user cmd input */
  23. Biobuf *bout = &boutbuf; /* output */
  24. Biobuf *bdict; /* dictionary */
  25. Biobuf *bindex; /* index file */
  26. long indextop; /* index offset at end of file */
  27. int lastcmd; /* last executed command */
  28. Addr *dot; /* "current" address */
  29. Dict *dict; /* current dictionary */
  30. int linelen;
  31. int breaklen = 60;
  32. int outinhibit;
  33. int debug;
  34. void execcmd(int);
  35. int getpref(char*, Rune*);
  36. Entry getentry(int);
  37. int getfield(Rune*);
  38. long locate(Rune*);
  39. int parseaddr(char*, char**);
  40. int parsecmd(char*);
  41. int search(char*, int);
  42. long seeknextline(Biobuf*, long);
  43. void setdotnext(void);
  44. void setdotprev(void);
  45. void sortaddr(Addr*);
  46. void usage(void);
  47. enum {
  48. Plen=300, /* max length of a search pattern */
  49. Fieldlen=200, /* max length of an index field */
  50. Aslots=10, /* initial number of slots in an address */
  51. };
  52. void
  53. main(int argc, char **argv)
  54. {
  55. int i, cmd, kflag;
  56. char *line, *p;
  57. Binit(&binbuf, 0, OREAD);
  58. Binit(&boutbuf, 1, OWRITE);
  59. kflag = 0;
  60. line = 0;
  61. dict = &dicts[0];
  62. ARGBEGIN {
  63. case 'd':
  64. p = ARGF();
  65. dict = 0;
  66. if(p) {
  67. for(i=0; dicts[i].name; i++)
  68. if(strcmp(p, dicts[i].name)==0) {
  69. dict = &dicts[i];
  70. break;
  71. }
  72. }
  73. if(!dict)
  74. usage();
  75. break;
  76. case 'c':
  77. line = ARGF();
  78. if(!line)
  79. usage();
  80. break;
  81. case 'k':
  82. kflag++;
  83. break;
  84. case 'D':
  85. debug++;
  86. break;
  87. default:
  88. usage();
  89. ARGEND }
  90. if(kflag) {
  91. (*dict->printkey)();
  92. exits(0);
  93. }
  94. if(argc > 1)
  95. usage();
  96. else if(argc == 1) {
  97. if(line)
  98. usage();
  99. p = argv[0];
  100. line = malloc(strlen(p)+5);
  101. sprint(line, "/%s/P\n", p);
  102. }
  103. bdict = Bopen(dict->path, OREAD);
  104. if(!bdict) {
  105. err("can't open dictionary %s", dict->path);
  106. exits("nodict");
  107. }
  108. bindex = Bopen(dict->indexpath, OREAD);
  109. if(!bindex) {
  110. err("can't open index %s", dict->indexpath);
  111. exits("noindex");
  112. }
  113. indextop = Bseek(bindex, 0L, 2);
  114. dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
  115. dot->n = 0;
  116. dot->cur = 0;
  117. dot->maxn = Aslots;
  118. lastcmd = 0;
  119. if(line) {
  120. cmd = parsecmd(line);
  121. if(cmd)
  122. execcmd(cmd);
  123. } else {
  124. for(;;) {
  125. Bprint(bout, "*");
  126. Bflush(bout);
  127. line = Brdline(bin, '\n');
  128. linelen = 0;
  129. if(!line)
  130. break;
  131. cmd = parsecmd(line);
  132. if(cmd) {
  133. execcmd(cmd);
  134. lastcmd = cmd;
  135. }
  136. }
  137. }
  138. exits(0);
  139. }
  140. void
  141. usage(void)
  142. {
  143. int i;
  144. Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
  145. Bprint(bout, "available dictionaries:\n");
  146. for(i = 0; dicts[i].name; i++)
  147. Bprint(bout, " %s\t%s\n", dicts[i].name, dicts[i].desc);
  148. exits("usage");
  149. }
  150. int
  151. parsecmd(char *line)
  152. {
  153. char *e;
  154. int cmd, ans;
  155. if(parseaddr(line, &e) >= 0)
  156. line = e;
  157. else
  158. return 0;
  159. cmd = *line;
  160. ans = cmd;
  161. if(isupper(cmd))
  162. cmd = tolower(cmd);
  163. if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
  164. cmd == '\n')) {
  165. err("unknown command %c", cmd);
  166. return 0;
  167. }
  168. if(cmd == '\n')
  169. switch(lastcmd) {
  170. case 0: ans = 'H'; break;
  171. case 'H': ans = 'p'; break;
  172. default : ans = lastcmd; break;
  173. }
  174. else if(line[1] != '\n' && line[1] != 0)
  175. err("extra stuff after command %c ignored", cmd);
  176. return ans;
  177. }
  178. void
  179. execcmd(int cmd)
  180. {
  181. Entry e;
  182. int cur, doall;
  183. if(isupper(cmd)) {
  184. doall = 1;
  185. cmd = tolower(cmd);
  186. cur = 0;
  187. } else {
  188. doall = 0;
  189. cur = dot->cur;
  190. }
  191. if(debug && doall && cmd == 'a')
  192. Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
  193. for(;;){
  194. if(cur >= dot->n)
  195. break;
  196. if(doall) {
  197. Bprint(bout, "%d\t", cur+1);
  198. linelen += 4 + (cur >= 10);
  199. }
  200. switch(cmd) {
  201. case 'a':
  202. Bprint(bout, "#%lud\n", dot->doff[cur]);
  203. break;
  204. case 'h':
  205. case 'p':
  206. case 'r':
  207. e = getentry(cur);
  208. (*dict->printentry)(e, cmd);
  209. break;
  210. }
  211. cur++;
  212. if(doall) {
  213. if(cmd == 'p' || cmd == 'r') {
  214. Bputc(bout, '\n');
  215. linelen = 0;
  216. }
  217. } else
  218. break;
  219. }
  220. if(cur >= dot->n)
  221. cur = 0;
  222. dot->cur = cur;
  223. }
  224. /*
  225. * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
  226. * Answer goes in dot.
  227. * Return -1 if address starts, but get error.
  228. * Return 0 if no address.
  229. */
  230. int
  231. parseaddr(char *line, char **eptr)
  232. {
  233. int delim, plen;
  234. ulong v;
  235. char *e;
  236. char pat[Plen];
  237. if(*line == '/' || *line == '!') {
  238. /* anchored regular expression match; '!' means no folding */
  239. if(*line == '/') {
  240. delim = '/';
  241. e = strpbrk(line+1, "/\n");
  242. } else {
  243. delim = '!';
  244. e = strpbrk(line+1, "!\n");
  245. }
  246. plen = e-line-1;
  247. if(plen >= Plen-3) {
  248. err("pattern too big");
  249. return -1;
  250. }
  251. pat[0] = '^';
  252. memcpy(pat+1, line+1, plen);
  253. pat[plen+1] = '$';
  254. pat[plen+2] = 0;
  255. if(*e == '\n')
  256. line = e;
  257. else
  258. line = e+1;
  259. if(!search(pat, delim == '/')) {
  260. err("pattern not found");
  261. return -1;
  262. }
  263. } else if(*line == '#') {
  264. /* absolute byte offset into dictionary */
  265. line++;
  266. if(!isdigit(*line))
  267. return -1;
  268. v = strtoul(line, &e, 10);
  269. line = e;
  270. dot->doff[0] = v;
  271. dot->n = 1;
  272. dot->cur = 0;
  273. } else if(isdigit(*line)) {
  274. v = strtoul(line, &e, 10);
  275. line = e;
  276. if(v < 1 || v > dot->n)
  277. err(".%d not in range [1,%d], ignored",
  278. v, dot->n);
  279. else
  280. dot->cur = v-1;
  281. } else if(*line == '.') {
  282. line++;
  283. } else {
  284. *eptr = line;
  285. return 0;
  286. }
  287. while(*line == '+' || *line == '-') {
  288. if(*line == '+')
  289. setdotnext();
  290. else
  291. setdotprev();
  292. line++;
  293. }
  294. *eptr = line;
  295. return 1;
  296. }
  297. /*
  298. * Index file is sorted by folded field1.
  299. * Method: find pre, a folded prefix of r.e. pat,
  300. * and then low = offset to beginning of
  301. * line in index file where first match of prefix occurs.
  302. * Then go through index until prefix no longer matches,
  303. * adding each line that matches real pattern to dot.
  304. * Finally, sort dot offsets (uniquing).
  305. * We know pat len < Plen, and that it is surrounded by ^..$
  306. */
  307. int
  308. search(char *pat, int dofold)
  309. {
  310. int needre, prelen, match, n;
  311. Reprog *re;
  312. long ioff, v;
  313. Rune pre[Plen];
  314. Rune lit[Plen];
  315. Rune entry[Fieldlen];
  316. char fpat[Plen];
  317. prelen = getpref(pat+1, pre);
  318. if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
  319. runescpy(lit, pre);
  320. if(dofold)
  321. fold(lit);
  322. needre = 0;
  323. SET(re);
  324. } else {
  325. needre = 1;
  326. if(dofold) {
  327. foldre(fpat, pat);
  328. re = regcomp(fpat);
  329. } else
  330. re = regcomp(pat);
  331. }
  332. fold(pre);
  333. ioff = locate(pre);
  334. if(ioff < 0)
  335. return 0;
  336. dot->n = 0;
  337. Bseek(bindex, ioff, 0);
  338. for(;;) {
  339. if(!getfield(entry))
  340. break;
  341. if(dofold)
  342. fold(entry);
  343. if(needre)
  344. match = rregexec(re, entry, 0, 0);
  345. else
  346. match = (acomp(lit, entry) == 0);
  347. if(match) {
  348. if(!getfield(entry))
  349. break;
  350. v = runetol(entry);
  351. if(dot->n >= dot->maxn) {
  352. n = 2*dot->maxn;
  353. dot = realloc(dot,
  354. sizeof(Addr)+(n-1)*sizeof(long));
  355. if(!dot) {
  356. err("out of memory");
  357. exits("nomem");
  358. }
  359. dot->maxn = n;
  360. }
  361. dot->doff[dot->n++] = v;
  362. } else {
  363. if(!dofold)
  364. fold(entry);
  365. if(*pre) {
  366. n = acomp(pre, entry);
  367. if(n < -1 || (!needre && n < 0))
  368. break;
  369. }
  370. /* get to next index entry */
  371. if(!getfield(entry))
  372. break;
  373. }
  374. }
  375. sortaddr(dot);
  376. dot->cur = 0;
  377. return dot->n;
  378. }
  379. /*
  380. * Return offset in index file of first line whose folded
  381. * first field has pre as a prefix. -1 if none found.
  382. */
  383. long
  384. locate(Rune *pre)
  385. {
  386. long top, bot, mid;
  387. Rune entry[Fieldlen];
  388. if(*pre == 0)
  389. return 0;
  390. bot = 0;
  391. top = indextop;
  392. if(debug>1)
  393. fprint(2, "locate looking for prefix %S\n", pre);
  394. for(;;) {
  395. /*
  396. * Loop invariant: foldkey(bot) < pre <= foldkey(top)
  397. * and bot < top, and bot,top point at beginning of lines
  398. */
  399. mid = (top+bot) / 2;
  400. mid = seeknextline(bindex, mid);
  401. if(debug > 1)
  402. fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
  403. bot, (top+bot) / 2, mid, top);
  404. if(mid == top || !getfield(entry))
  405. break;
  406. if(debug > 1)
  407. fprint(2, "key=%S\n", entry);
  408. /*
  409. * here mid is strictly between bot and top
  410. */
  411. fold(entry);
  412. if(acomp(pre, entry) <= 0)
  413. top = mid;
  414. else
  415. bot = mid;
  416. }
  417. /*
  418. * bot < top, but they don't necessarily point at successive lines
  419. * Use linear search from bot to find first line that pre is a
  420. * prefix of
  421. */
  422. while((bot = seeknextline(bindex, bot)) <= top) {
  423. if(!getfield(entry))
  424. return -1;
  425. if(debug > 1)
  426. fprint(2, "key=%S\n", entry);
  427. fold(entry);
  428. switch(acomp(pre, entry)) {
  429. case -2:
  430. return -1;
  431. case -1:
  432. case 0:
  433. return bot;
  434. case 1:
  435. case 2:
  436. continue;
  437. }
  438. }
  439. return -1;
  440. }
  441. /*
  442. * Get prefix of non re-metacharacters, runified, into pre,
  443. * and return length
  444. */
  445. int
  446. getpref(char *pat, Rune *pre)
  447. {
  448. int n, r;
  449. char *p;
  450. p = pat;
  451. while(*p) {
  452. n = chartorune(pre, p);
  453. r = *pre;
  454. switch(r) {
  455. case L'.': case L'*': case L'+': case L'?':
  456. case L'[': case L']': case L'(': case ')':
  457. case L'|': case L'^': case L'$':
  458. *pre = 0;
  459. return p-pat;
  460. case L'\\':
  461. p += n;
  462. p += chartorune(++pre, p);
  463. pre++;
  464. break;
  465. default:
  466. p += n;
  467. pre++;
  468. }
  469. }
  470. return p-pat;
  471. }
  472. long
  473. seeknextline(Biobuf *b, long off)
  474. {
  475. long c;
  476. Bseek(b, off, 0);
  477. do {
  478. c = Bgetrune(b);
  479. } while(c>=0 && c!='\n');
  480. return Boffset(b);
  481. }
  482. /*
  483. * Get next field out of index file (either tab- or nl- terminated)
  484. * Answer in *rp, assumed to be Fieldlen long.
  485. * Return 0 if read error first.
  486. */
  487. int
  488. getfield(Rune *rp)
  489. {
  490. long c;
  491. int n;
  492. for(n=Fieldlen; n-- > 0; ) {
  493. if ((c = Bgetrune(bindex)) < 0)
  494. return 0;
  495. if(c == '\t' || c == '\n') {
  496. *rp = L'\0';
  497. return 1;
  498. }
  499. *rp++ = c;
  500. }
  501. err("word too long");
  502. return 0;
  503. }
  504. /*
  505. * A compare longs function suitable for qsort
  506. */
  507. static int
  508. longcmp(void *av, void *bv)
  509. {
  510. long v;
  511. long *a, *b;
  512. a = av;
  513. b = bv;
  514. v = *a - *b;
  515. if(v < 0)
  516. return -1;
  517. else if(v == 0)
  518. return 0;
  519. else
  520. return 1;
  521. }
  522. void
  523. sortaddr(Addr *a)
  524. {
  525. int i, j;
  526. long v;
  527. if(a->n <= 1)
  528. return;
  529. qsort(a->doff, a->n, sizeof(long), longcmp);
  530. /* remove duplicates */
  531. for(i=0, j=0; j < a->n; j++) {
  532. v = a->doff[j];
  533. if(i > 0 && v == a->doff[i-1])
  534. continue;
  535. a->doff[i++] = v;
  536. }
  537. a->n = i;
  538. }
  539. Entry
  540. getentry(int i)
  541. {
  542. long b, e, n;
  543. static Entry ans;
  544. static int anslen = 0;
  545. b = dot->doff[i];
  546. e = (*dict->nextoff)(b+1);
  547. ans.doff = b;
  548. if(e < 0) {
  549. err("couldn't seek to entry");
  550. ans.start = 0;
  551. ans.end = 0;
  552. } else {
  553. n = e-b;
  554. if(n+1 > anslen) {
  555. ans.start = realloc(ans.start, n+1);
  556. if(!ans.start) {
  557. err("out of memory");
  558. exits("nomem");
  559. }
  560. anslen = n+1;
  561. }
  562. Bseek(bdict, b, 0);
  563. n = Bread(bdict, ans.start, n);
  564. ans.end = ans.start + n;
  565. *ans.end = 0;
  566. }
  567. return ans;
  568. }
  569. void
  570. setdotnext(void)
  571. {
  572. long b;
  573. b = (*dict->nextoff)(dot->doff[dot->cur]+1);
  574. if(b < 0) {
  575. err("couldn't find a next entry");
  576. return;
  577. }
  578. dot->doff[0] = b;
  579. dot->n = 1;
  580. dot->cur = 0;
  581. }
  582. void
  583. setdotprev(void)
  584. {
  585. int tryback;
  586. long here, last, p;
  587. if(dot->cur < 0 || dot->cur >= dot->n)
  588. return;
  589. tryback = 2000;
  590. here = dot->doff[dot->cur];
  591. last = 0;
  592. while(last == 0) {
  593. p = here - tryback;
  594. if(p < 0)
  595. p = 0;
  596. for(;;) {
  597. p = (*dict->nextoff)(p+1);
  598. if(p < 0)
  599. return; /* shouldn't happen */
  600. if(p >= here)
  601. break;
  602. last = p;
  603. }
  604. if(!last) {
  605. if(here - tryback < 0) {
  606. err("can't find a previous entry");
  607. return;
  608. }
  609. tryback = 2*tryback;
  610. }
  611. }
  612. dot->doff[0] = last;
  613. dot->n = 1;
  614. dot->cur = 0;
  615. }