dict.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <regexp.h>
  5. #include <ctype.h>
  6. #include "dict.h"
  7. /*
  8. * Assumed index file structure: lines of form
  9. * [^\t]+\t[0-9]+
  10. * First field is key, second is byte offset into dictionary.
  11. * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
  12. */
  13. typedef struct Addr Addr;
  14. struct Addr {
  15. int n; /* number of offsets */
  16. int cur; /* current position within doff array */
  17. int maxn; /* actual current size of doff array */
  18. ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
  19. };
  20. Biobuf binbuf;
  21. Biobuf boutbuf;
  22. Biobuf *bin = &binbuf; /* user cmd input */
  23. Biobuf *bout = &boutbuf; /* output */
  24. Biobuf *bdict; /* dictionary */
  25. Biobuf *bindex; /* index file */
  26. long indextop; /* index offset at end of file */
  27. int lastcmd; /* last executed command */
  28. Addr *dot; /* "current" address */
  29. Dict *dict; /* current dictionary */
  30. int linelen;
  31. int breaklen = 60;
  32. int outinhibit;
  33. int debug;
  34. void execcmd(int);
  35. int getpref(char*, Rune*);
  36. Entry getentry(int);
  37. int getfield(Rune*);
  38. long locate(Rune*);
  39. int parseaddr(char*, char**);
  40. int parsecmd(char*);
  41. int search(char*, int);
  42. long seeknextline(Biobuf*, long);
  43. void setdotnext(void);
  44. void setdotprev(void);
  45. void sortaddr(Addr*);
  46. void usage(void);
  47. enum {
  48. Plen=300, /* max length of a search pattern */
  49. Fieldlen=200, /* max length of an index field */
  50. Aslots=10, /* initial number of slots in an address */
  51. };
  52. void
  53. main(int argc, char **argv)
  54. {
  55. int i, cmd, kflag;
  56. char *line, *p;
  57. Binit(&binbuf, 0, OREAD);
  58. Binit(&boutbuf, 1, OWRITE);
  59. kflag = 0;
  60. line = 0;
  61. dict = 0;
  62. for(i=0; dicts[i].name; i++){
  63. if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
  64. dict = &dicts[i];
  65. break;
  66. }
  67. }
  68. ARGBEGIN {
  69. case 'd':
  70. p = ARGF();
  71. dict = 0;
  72. if(p) {
  73. for(i=0; dicts[i].name; i++)
  74. if(strcmp(p, dicts[i].name)==0) {
  75. dict = &dicts[i];
  76. break;
  77. }
  78. }
  79. if(!dict)
  80. usage();
  81. break;
  82. case 'c':
  83. line = ARGF();
  84. if(!line)
  85. usage();
  86. break;
  87. case 'k':
  88. kflag++;
  89. break;
  90. case 'D':
  91. debug++;
  92. break;
  93. default:
  94. usage();
  95. ARGEND }
  96. if(dict == 0){
  97. err("no dictionaries present on this system");
  98. exits("nodict");
  99. }
  100. if(kflag) {
  101. (*dict->printkey)();
  102. exits(0);
  103. }
  104. if(argc > 1)
  105. usage();
  106. else if(argc == 1) {
  107. if(line)
  108. usage();
  109. p = argv[0];
  110. line = malloc(strlen(p)+5);
  111. sprint(line, "/%s/P\n", p);
  112. }
  113. bdict = Bopen(dict->path, OREAD);
  114. if(!bdict) {
  115. err("can't open dictionary %s", dict->path);
  116. exits("nodict");
  117. }
  118. bindex = Bopen(dict->indexpath, OREAD);
  119. if(!bindex) {
  120. err("can't open index %s", dict->indexpath);
  121. exits("noindex");
  122. }
  123. indextop = Bseek(bindex, 0L, 2);
  124. dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
  125. dot->n = 0;
  126. dot->cur = 0;
  127. dot->maxn = Aslots;
  128. lastcmd = 0;
  129. if(line) {
  130. cmd = parsecmd(line);
  131. if(cmd)
  132. execcmd(cmd);
  133. } else {
  134. for(;;) {
  135. Bprint(bout, "*");
  136. Bflush(bout);
  137. line = Brdline(bin, '\n');
  138. linelen = 0;
  139. if(!line)
  140. break;
  141. cmd = parsecmd(line);
  142. if(cmd) {
  143. execcmd(cmd);
  144. lastcmd = cmd;
  145. }
  146. }
  147. }
  148. exits(0);
  149. }
  150. void
  151. usage(void)
  152. {
  153. int i;
  154. char *a, *b;
  155. Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
  156. Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
  157. for(i = 0; dicts[i].name; i++){
  158. a = b = "";
  159. if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
  160. a = "[";
  161. b = "]";
  162. }
  163. Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
  164. }
  165. exits("usage");
  166. }
  167. int
  168. parsecmd(char *line)
  169. {
  170. char *e;
  171. int cmd, ans;
  172. if(parseaddr(line, &e) >= 0)
  173. line = e;
  174. else
  175. return 0;
  176. cmd = *line;
  177. ans = cmd;
  178. if(isupper(cmd))
  179. cmd = tolower(cmd);
  180. if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
  181. cmd == '\n')) {
  182. err("unknown command %c", cmd);
  183. return 0;
  184. }
  185. if(cmd == '\n')
  186. switch(lastcmd) {
  187. case 0: ans = 'H'; break;
  188. case 'H': ans = 'p'; break;
  189. default : ans = lastcmd; break;
  190. }
  191. else if(line[1] != '\n' && line[1] != 0)
  192. err("extra stuff after command %c ignored", cmd);
  193. return ans;
  194. }
  195. void
  196. execcmd(int cmd)
  197. {
  198. Entry e;
  199. int cur, doall;
  200. if(isupper(cmd)) {
  201. doall = 1;
  202. cmd = tolower(cmd);
  203. cur = 0;
  204. } else {
  205. doall = 0;
  206. cur = dot->cur;
  207. }
  208. if(debug && doall && cmd == 'a')
  209. Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
  210. for(;;){
  211. if(cur >= dot->n)
  212. break;
  213. if(doall) {
  214. Bprint(bout, "%d\t", cur+1);
  215. linelen += 4 + (cur >= 10);
  216. }
  217. switch(cmd) {
  218. case 'a':
  219. Bprint(bout, "#%lud\n", dot->doff[cur]);
  220. break;
  221. case 'h':
  222. case 'p':
  223. case 'r':
  224. e = getentry(cur);
  225. (*dict->printentry)(e, cmd);
  226. break;
  227. }
  228. cur++;
  229. if(doall) {
  230. if(cmd == 'p' || cmd == 'r') {
  231. Bputc(bout, '\n');
  232. linelen = 0;
  233. }
  234. } else
  235. break;
  236. }
  237. if(cur >= dot->n)
  238. cur = 0;
  239. dot->cur = cur;
  240. }
  241. /*
  242. * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
  243. * Answer goes in dot.
  244. * Return -1 if address starts, but get error.
  245. * Return 0 if no address.
  246. */
  247. int
  248. parseaddr(char *line, char **eptr)
  249. {
  250. int delim, plen;
  251. ulong v;
  252. char *e;
  253. char pat[Plen];
  254. if(*line == '/' || *line == '!') {
  255. /* anchored regular expression match; '!' means no folding */
  256. if(*line == '/') {
  257. delim = '/';
  258. e = strpbrk(line+1, "/\n");
  259. } else {
  260. delim = '!';
  261. e = strpbrk(line+1, "!\n");
  262. }
  263. plen = e-line-1;
  264. if(plen >= Plen-3) {
  265. err("pattern too big");
  266. return -1;
  267. }
  268. pat[0] = '^';
  269. memcpy(pat+1, line+1, plen);
  270. pat[plen+1] = '$';
  271. pat[plen+2] = 0;
  272. if(*e == '\n')
  273. line = e;
  274. else
  275. line = e+1;
  276. if(!search(pat, delim == '/')) {
  277. err("pattern not found");
  278. return -1;
  279. }
  280. } else if(*line == '#') {
  281. /* absolute byte offset into dictionary */
  282. line++;
  283. if(!isdigit(*line))
  284. return -1;
  285. v = strtoul(line, &e, 10);
  286. line = e;
  287. dot->doff[0] = v;
  288. dot->n = 1;
  289. dot->cur = 0;
  290. } else if(isdigit(*line)) {
  291. v = strtoul(line, &e, 10);
  292. line = e;
  293. if(v < 1 || v > dot->n)
  294. err(".%d not in range [1,%d], ignored",
  295. v, dot->n);
  296. else
  297. dot->cur = v-1;
  298. } else if(*line == '.') {
  299. line++;
  300. } else {
  301. *eptr = line;
  302. return 0;
  303. }
  304. while(*line == '+' || *line == '-') {
  305. if(*line == '+')
  306. setdotnext();
  307. else
  308. setdotprev();
  309. line++;
  310. }
  311. *eptr = line;
  312. return 1;
  313. }
  314. /*
  315. * Index file is sorted by folded field1.
  316. * Method: find pre, a folded prefix of r.e. pat,
  317. * and then low = offset to beginning of
  318. * line in index file where first match of prefix occurs.
  319. * Then go through index until prefix no longer matches,
  320. * adding each line that matches real pattern to dot.
  321. * Finally, sort dot offsets (uniquing).
  322. * We know pat len < Plen, and that it is surrounded by ^..$
  323. */
  324. int
  325. search(char *pat, int dofold)
  326. {
  327. int needre, prelen, match, n;
  328. Reprog *re;
  329. long ioff, v;
  330. Rune pre[Plen];
  331. Rune lit[Plen];
  332. Rune entry[Fieldlen];
  333. char fpat[Plen];
  334. prelen = getpref(pat+1, pre);
  335. if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
  336. runescpy(lit, pre);
  337. if(dofold)
  338. fold(lit);
  339. needre = 0;
  340. SET(re);
  341. } else {
  342. needre = 1;
  343. if(dofold) {
  344. foldre(fpat, pat);
  345. re = regcomp(fpat);
  346. } else
  347. re = regcomp(pat);
  348. }
  349. fold(pre);
  350. ioff = locate(pre);
  351. if(ioff < 0)
  352. return 0;
  353. dot->n = 0;
  354. Bseek(bindex, ioff, 0);
  355. for(;;) {
  356. if(!getfield(entry))
  357. break;
  358. if(dofold)
  359. fold(entry);
  360. if(needre)
  361. match = rregexec(re, entry, 0, 0);
  362. else
  363. match = (acomp(lit, entry) == 0);
  364. if(match) {
  365. if(!getfield(entry))
  366. break;
  367. v = runetol(entry);
  368. if(dot->n >= dot->maxn) {
  369. n = 2*dot->maxn;
  370. dot = realloc(dot,
  371. sizeof(Addr)+(n-1)*sizeof(long));
  372. if(!dot) {
  373. err("out of memory");
  374. exits("nomem");
  375. }
  376. dot->maxn = n;
  377. }
  378. dot->doff[dot->n++] = v;
  379. } else {
  380. if(!dofold)
  381. fold(entry);
  382. if(*pre) {
  383. n = acomp(pre, entry);
  384. if(n < -1 || (!needre && n < 0))
  385. break;
  386. }
  387. /* get to next index entry */
  388. if(!getfield(entry))
  389. break;
  390. }
  391. }
  392. sortaddr(dot);
  393. dot->cur = 0;
  394. return dot->n;
  395. }
  396. /*
  397. * Return offset in index file of first line whose folded
  398. * first field has pre as a prefix. -1 if none found.
  399. */
  400. long
  401. locate(Rune *pre)
  402. {
  403. long top, bot, mid;
  404. Rune entry[Fieldlen];
  405. if(*pre == 0)
  406. return 0;
  407. bot = 0;
  408. top = indextop;
  409. if(debug>1)
  410. fprint(2, "locate looking for prefix %S\n", pre);
  411. for(;;) {
  412. /*
  413. * Loop invariant: foldkey(bot) < pre <= foldkey(top)
  414. * and bot < top, and bot,top point at beginning of lines
  415. */
  416. mid = (top+bot) / 2;
  417. mid = seeknextline(bindex, mid);
  418. if(debug > 1)
  419. fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
  420. bot, (top+bot) / 2, mid, top);
  421. if(mid == top || !getfield(entry))
  422. break;
  423. if(debug > 1)
  424. fprint(2, "key=%S\n", entry);
  425. /*
  426. * here mid is strictly between bot and top
  427. */
  428. fold(entry);
  429. if(acomp(pre, entry) <= 0)
  430. top = mid;
  431. else
  432. bot = mid;
  433. }
  434. /*
  435. * bot < top, but they don't necessarily point at successive lines
  436. * Use linear search from bot to find first line that pre is a
  437. * prefix of
  438. */
  439. while((bot = seeknextline(bindex, bot)) <= top) {
  440. if(!getfield(entry))
  441. return -1;
  442. if(debug > 1)
  443. fprint(2, "key=%S\n", entry);
  444. fold(entry);
  445. switch(acomp(pre, entry)) {
  446. case -2:
  447. return -1;
  448. case -1:
  449. case 0:
  450. return bot;
  451. case 1:
  452. case 2:
  453. continue;
  454. }
  455. }
  456. return -1;
  457. }
  458. /*
  459. * Get prefix of non re-metacharacters, runified, into pre,
  460. * and return length
  461. */
  462. int
  463. getpref(char *pat, Rune *pre)
  464. {
  465. int n, r;
  466. char *p;
  467. p = pat;
  468. while(*p) {
  469. n = chartorune(pre, p);
  470. r = *pre;
  471. switch(r) {
  472. case L'.': case L'*': case L'+': case L'?':
  473. case L'[': case L']': case L'(': case ')':
  474. case L'|': case L'^': case L'$':
  475. *pre = 0;
  476. return p-pat;
  477. case L'\\':
  478. p += n;
  479. p += chartorune(++pre, p);
  480. pre++;
  481. break;
  482. default:
  483. p += n;
  484. pre++;
  485. }
  486. }
  487. return p-pat;
  488. }
  489. long
  490. seeknextline(Biobuf *b, long off)
  491. {
  492. long c;
  493. Bseek(b, off, 0);
  494. do {
  495. c = Bgetrune(b);
  496. } while(c>=0 && c!='\n');
  497. return Boffset(b);
  498. }
  499. /*
  500. * Get next field out of index file (either tab- or nl- terminated)
  501. * Answer in *rp, assumed to be Fieldlen long.
  502. * Return 0 if read error first.
  503. */
  504. int
  505. getfield(Rune *rp)
  506. {
  507. long c;
  508. int n;
  509. for(n=Fieldlen; n-- > 0; ) {
  510. if ((c = Bgetrune(bindex)) < 0)
  511. return 0;
  512. if(c == '\t' || c == '\n') {
  513. *rp = L'\0';
  514. return 1;
  515. }
  516. *rp++ = c;
  517. }
  518. err("word too long");
  519. return 0;
  520. }
  521. /*
  522. * A compare longs function suitable for qsort
  523. */
  524. static int
  525. longcmp(void *av, void *bv)
  526. {
  527. long v;
  528. long *a, *b;
  529. a = av;
  530. b = bv;
  531. v = *a - *b;
  532. if(v < 0)
  533. return -1;
  534. else if(v == 0)
  535. return 0;
  536. else
  537. return 1;
  538. }
  539. void
  540. sortaddr(Addr *a)
  541. {
  542. int i, j;
  543. long v;
  544. if(a->n <= 1)
  545. return;
  546. qsort(a->doff, a->n, sizeof(long), longcmp);
  547. /* remove duplicates */
  548. for(i=0, j=0; j < a->n; j++) {
  549. v = a->doff[j];
  550. if(i > 0 && v == a->doff[i-1])
  551. continue;
  552. a->doff[i++] = v;
  553. }
  554. a->n = i;
  555. }
  556. Entry
  557. getentry(int i)
  558. {
  559. long b, e, n;
  560. static Entry ans;
  561. static int anslen = 0;
  562. b = dot->doff[i];
  563. e = (*dict->nextoff)(b+1);
  564. ans.doff = b;
  565. if(e < 0) {
  566. err("couldn't seek to entry");
  567. ans.start = 0;
  568. ans.end = 0;
  569. } else {
  570. n = e-b;
  571. if(n+1 > anslen) {
  572. ans.start = realloc(ans.start, n+1);
  573. if(!ans.start) {
  574. err("out of memory");
  575. exits("nomem");
  576. }
  577. anslen = n+1;
  578. }
  579. Bseek(bdict, b, 0);
  580. n = Bread(bdict, ans.start, n);
  581. ans.end = ans.start + n;
  582. *ans.end = 0;
  583. }
  584. return ans;
  585. }
  586. void
  587. setdotnext(void)
  588. {
  589. long b;
  590. b = (*dict->nextoff)(dot->doff[dot->cur]+1);
  591. if(b < 0) {
  592. err("couldn't find a next entry");
  593. return;
  594. }
  595. dot->doff[0] = b;
  596. dot->n = 1;
  597. dot->cur = 0;
  598. }
  599. void
  600. setdotprev(void)
  601. {
  602. int tryback;
  603. long here, last, p;
  604. if(dot->cur < 0 || dot->cur >= dot->n)
  605. return;
  606. tryback = 2000;
  607. here = dot->doff[dot->cur];
  608. last = 0;
  609. while(last == 0) {
  610. p = here - tryback;
  611. if(p < 0)
  612. p = 0;
  613. for(;;) {
  614. p = (*dict->nextoff)(p+1);
  615. if(p < 0)
  616. return; /* shouldn't happen */
  617. if(p >= here)
  618. break;
  619. last = p;
  620. }
  621. if(!last) {
  622. if(here - tryback < 0) {
  623. err("can't find a previous entry");
  624. return;
  625. }
  626. tryback = 2*tryback;
  627. }
  628. }
  629. dot->doff[0] = last;
  630. dot->n = 1;
  631. dot->cur = 0;
  632. }