dict.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include <u.h>
  10. #include <libc.h>
  11. #include <bio.h>
  12. #include <regexp.h>
  13. #include <ctype.h>
  14. #include "dict.h"
  15. /*
  16. * Assumed index file structure: lines of form
  17. * [^\t]+\t[0-9]+
  18. * First field is key, second is byte offset into dictionary.
  19. * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
  20. */
  21. typedef struct Addr Addr;
  22. struct Addr {
  23. int n; /* number of offsets */
  24. int cur; /* current position within doff array */
  25. int maxn; /* actual current size of doff array */
  26. uint32_t doff[1]; /* doff[maxn], with 0..n-1 significant */
  27. };
  28. Biobuf binbuf;
  29. Biobuf boutbuf;
  30. Biobuf *bin = &binbuf; /* user cmd input */
  31. Biobuf *bout = &boutbuf; /* output */
  32. Biobuf *bdict; /* dictionary */
  33. Biobuf *bindex; /* index file */
  34. int32_t indextop; /* index offset at end of file */
  35. int lastcmd; /* last executed command */
  36. Addr *dot; /* "current" address */
  37. Dict *dict; /* current dictionary */
  38. int linelen;
  39. int breaklen = 60;
  40. int outinhibit;
  41. int debug;
  42. void execcmd(int);
  43. int getpref(char*, Rune*);
  44. Entry getentry(int);
  45. int getfield(Rune*);
  46. int32_t locate(Rune*);
  47. int parseaddr(char*, char**);
  48. int parsecmd(char*);
  49. int search(char*, int);
  50. int32_t seeknextline(Biobuf*, int32_t);
  51. void setdotnext(void);
  52. void setdotprev(void);
  53. void sortaddr(Addr*);
  54. void usage(void);
  55. enum {
  56. Plen=300, /* max length of a search pattern */
  57. Fieldlen=200, /* max length of an index field */
  58. Aslots=10, /* initial number of slots in an address */
  59. };
  60. void
  61. main(int argc, char **argv)
  62. {
  63. int i, cmd, kflag;
  64. char *line, *p;
  65. Binit(&binbuf, 0, OREAD);
  66. Binit(&boutbuf, 1, OWRITE);
  67. kflag = 0;
  68. line = 0;
  69. dict = 0;
  70. for(i=0; dicts[i].name; i++){
  71. if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
  72. dict = &dicts[i];
  73. break;
  74. }
  75. }
  76. ARGBEGIN {
  77. case 'd':
  78. p = ARGF();
  79. dict = 0;
  80. if(p) {
  81. for(i=0; dicts[i].name; i++)
  82. if(strcmp(p, dicts[i].name)==0) {
  83. dict = &dicts[i];
  84. break;
  85. }
  86. }
  87. if(!dict)
  88. usage();
  89. break;
  90. case 'c':
  91. line = ARGF();
  92. if(!line)
  93. usage();
  94. break;
  95. case 'k':
  96. kflag++;
  97. break;
  98. case 'D':
  99. debug++;
  100. break;
  101. default:
  102. usage();
  103. ARGEND }
  104. if(dict == 0){
  105. err("no dictionaries present on this system");
  106. exits("nodict");
  107. }
  108. if(kflag) {
  109. (*dict->printkey)();
  110. exits(0);
  111. }
  112. if(argc > 1)
  113. usage();
  114. else if(argc == 1) {
  115. if(line)
  116. usage();
  117. p = argv[0];
  118. line = malloc(strlen(p)+5);
  119. sprint(line, "/%s/P\n", p);
  120. }
  121. bdict = Bopen(dict->path, OREAD);
  122. if(!bdict) {
  123. err("can't open dictionary %s", dict->path);
  124. exits("nodict");
  125. }
  126. bindex = Bopen(dict->indexpath, OREAD);
  127. if(!bindex) {
  128. err("can't open index %s", dict->indexpath);
  129. exits("noindex");
  130. }
  131. indextop = Bseek(bindex, 0L, 2);
  132. dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
  133. dot->n = 0;
  134. dot->cur = 0;
  135. dot->maxn = Aslots;
  136. lastcmd = 0;
  137. if(line) {
  138. cmd = parsecmd(line);
  139. if(cmd)
  140. execcmd(cmd);
  141. } else {
  142. for(;;) {
  143. Bprint(bout, "*");
  144. Bflush(bout);
  145. line = Brdline(bin, '\n');
  146. linelen = 0;
  147. if(!line)
  148. break;
  149. cmd = parsecmd(line);
  150. if(cmd) {
  151. execcmd(cmd);
  152. lastcmd = cmd;
  153. }
  154. }
  155. }
  156. exits(0);
  157. }
  158. void
  159. usage(void)
  160. {
  161. int i;
  162. char *a, *b;
  163. Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
  164. Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
  165. for(i = 0; dicts[i].name; i++){
  166. a = b = "";
  167. if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
  168. a = "[";
  169. b = "]";
  170. }
  171. Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
  172. }
  173. exits("usage");
  174. }
  175. int
  176. parsecmd(char *line)
  177. {
  178. char *e;
  179. int cmd, ans;
  180. if(parseaddr(line, &e) >= 0)
  181. line = e;
  182. else
  183. return 0;
  184. cmd = *line;
  185. ans = cmd;
  186. if(isupper(cmd))
  187. cmd = tolower(cmd);
  188. if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
  189. cmd == '\n')) {
  190. err("unknown command %c", cmd);
  191. return 0;
  192. }
  193. if(cmd == '\n')
  194. switch(lastcmd) {
  195. case 0: ans = 'H'; break;
  196. case 'H': ans = 'p'; break;
  197. default : ans = lastcmd; break;
  198. }
  199. else if(line[1] != '\n' && line[1] != 0)
  200. err("extra stuff after command %c ignored", cmd);
  201. return ans;
  202. }
  203. void
  204. execcmd(int cmd)
  205. {
  206. Entry e;
  207. int cur, doall;
  208. if(isupper(cmd)) {
  209. doall = 1;
  210. cmd = tolower(cmd);
  211. cur = 0;
  212. } else {
  213. doall = 0;
  214. cur = dot->cur;
  215. }
  216. if(debug && doall && cmd == 'a')
  217. Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
  218. for(;;){
  219. if(cur >= dot->n)
  220. break;
  221. if(doall) {
  222. Bprint(bout, "%d\t", cur+1);
  223. linelen += 4 + (cur >= 10);
  224. }
  225. switch(cmd) {
  226. case 'a':
  227. Bprint(bout, "#%lud\n", dot->doff[cur]);
  228. break;
  229. case 'h':
  230. case 'p':
  231. case 'r':
  232. e = getentry(cur);
  233. (*dict->printentry)(e, cmd);
  234. break;
  235. }
  236. cur++;
  237. if(doall) {
  238. if(cmd == 'p' || cmd == 'r') {
  239. Bputc(bout, '\n');
  240. linelen = 0;
  241. }
  242. } else
  243. break;
  244. }
  245. if(cur >= dot->n)
  246. cur = 0;
  247. dot->cur = cur;
  248. }
  249. /*
  250. * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
  251. * Answer goes in dot.
  252. * Return -1 if address starts, but get error.
  253. * Return 0 if no address.
  254. */
  255. int
  256. parseaddr(char *line, char **eptr)
  257. {
  258. int delim, plen;
  259. uint32_t v;
  260. char *e;
  261. char pat[Plen];
  262. if(*line == '/' || *line == '!') {
  263. /* anchored regular expression match; '!' means no folding */
  264. if(*line == '/') {
  265. delim = '/';
  266. e = strpbrk(line+1, "/\n");
  267. } else {
  268. delim = '!';
  269. e = strpbrk(line+1, "!\n");
  270. }
  271. plen = e-line-1;
  272. if(plen >= Plen-3) {
  273. err("pattern too big");
  274. return -1;
  275. }
  276. pat[0] = '^';
  277. memcpy(pat+1, line+1, plen);
  278. pat[plen+1] = '$';
  279. pat[plen+2] = 0;
  280. if(*e == '\n')
  281. line = e;
  282. else
  283. line = e+1;
  284. if(!search(pat, delim == '/')) {
  285. err("pattern not found");
  286. return -1;
  287. }
  288. } else if(*line == '#') {
  289. /* absolute byte offset into dictionary */
  290. line++;
  291. if(!isdigit(*line))
  292. return -1;
  293. v = strtoul(line, &e, 10);
  294. line = e;
  295. dot->doff[0] = v;
  296. dot->n = 1;
  297. dot->cur = 0;
  298. } else if(isdigit(*line)) {
  299. v = strtoul(line, &e, 10);
  300. line = e;
  301. if(v < 1 || v > dot->n)
  302. err(".%d not in range [1,%d], ignored",
  303. v, dot->n);
  304. else
  305. dot->cur = v-1;
  306. } else if(*line == '.') {
  307. line++;
  308. } else {
  309. *eptr = line;
  310. return 0;
  311. }
  312. while(*line == '+' || *line == '-') {
  313. if(*line == '+')
  314. setdotnext();
  315. else
  316. setdotprev();
  317. line++;
  318. }
  319. *eptr = line;
  320. return 1;
  321. }
  322. /*
  323. * Index file is sorted by folded field1.
  324. * Method: find pre, a folded prefix of r.e. pat,
  325. * and then low = offset to beginning of
  326. * line in index file where first match of prefix occurs.
  327. * Then go through index until prefix no longer matches,
  328. * adding each line that matches real pattern to dot.
  329. * Finally, sort dot offsets (uniquing).
  330. * We know pat len < Plen, and that it is surrounded by ^..$
  331. */
  332. int
  333. search(char *pat, int dofold)
  334. {
  335. int needre, prelen, match, n;
  336. Reprog *re;
  337. int32_t ioff, v;
  338. Rune pre[Plen];
  339. Rune lit[Plen];
  340. Rune entry[Fieldlen];
  341. char fpat[Plen];
  342. prelen = getpref(pat+1, pre);
  343. if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
  344. runescpy(lit, pre);
  345. if(dofold)
  346. fold(lit);
  347. needre = 0;
  348. SET(re);
  349. } else {
  350. needre = 1;
  351. if(dofold) {
  352. foldre(fpat, pat);
  353. re = regcomp(fpat);
  354. } else
  355. re = regcomp(pat);
  356. }
  357. fold(pre);
  358. ioff = locate(pre);
  359. if(ioff < 0)
  360. return 0;
  361. dot->n = 0;
  362. Bseek(bindex, ioff, 0);
  363. for(;;) {
  364. if(!getfield(entry))
  365. break;
  366. if(dofold)
  367. fold(entry);
  368. if(needre)
  369. match = rregexec(re, entry, 0, 0);
  370. else
  371. match = (acomp(lit, entry) == 0);
  372. if(match) {
  373. if(!getfield(entry))
  374. break;
  375. v = runetol(entry);
  376. if(dot->n >= dot->maxn) {
  377. n = 2*dot->maxn;
  378. dot = realloc(dot,
  379. sizeof(Addr)+(n-1)*sizeof(int32_t));
  380. if(!dot) {
  381. err("out of memory");
  382. exits("nomem");
  383. }
  384. dot->maxn = n;
  385. }
  386. dot->doff[dot->n++] = v;
  387. } else {
  388. if(!dofold)
  389. fold(entry);
  390. if(*pre) {
  391. n = acomp(pre, entry);
  392. if(n < -1 || (!needre && n < 0))
  393. break;
  394. }
  395. /* get to next index entry */
  396. if(!getfield(entry))
  397. break;
  398. }
  399. }
  400. sortaddr(dot);
  401. dot->cur = 0;
  402. return dot->n;
  403. }
  404. /*
  405. * Return offset in index file of first line whose folded
  406. * first field has pre as a prefix. -1 if none found.
  407. */
  408. int32_t
  409. locate(Rune *pre)
  410. {
  411. int32_t top, bot, mid;
  412. Rune entry[Fieldlen];
  413. if(*pre == 0)
  414. return 0;
  415. bot = 0;
  416. top = indextop;
  417. if(debug>1)
  418. fprint(2, "locate looking for prefix %S\n", pre);
  419. for(;;) {
  420. /*
  421. * Loop invariant: foldkey(bot) < pre <= foldkey(top)
  422. * and bot < top, and bot,top point at beginning of lines
  423. */
  424. mid = (top+bot) / 2;
  425. mid = seeknextline(bindex, mid);
  426. if(debug > 1)
  427. fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
  428. bot, (top+bot) / 2, mid, top);
  429. if(mid == top || !getfield(entry))
  430. break;
  431. if(debug > 1)
  432. fprint(2, "key=%S\n", entry);
  433. /*
  434. * here mid is strictly between bot and top
  435. */
  436. fold(entry);
  437. if(acomp(pre, entry) <= 0)
  438. top = mid;
  439. else
  440. bot = mid;
  441. }
  442. /*
  443. * bot < top, but they don't necessarily point at successive lines
  444. * Use linear search from bot to find first line that pre is a
  445. * prefix of
  446. */
  447. while((bot = seeknextline(bindex, bot)) <= top) {
  448. if(!getfield(entry))
  449. return -1;
  450. if(debug > 1)
  451. fprint(2, "key=%S\n", entry);
  452. fold(entry);
  453. switch(acomp(pre, entry)) {
  454. case -2:
  455. return -1;
  456. case -1:
  457. case 0:
  458. return bot;
  459. case 1:
  460. case 2:
  461. continue;
  462. }
  463. }
  464. return -1;
  465. }
  466. /*
  467. * Get prefix of non re-metacharacters, runified, into pre,
  468. * and return length
  469. */
  470. int
  471. getpref(char *pat, Rune *pre)
  472. {
  473. int n, r;
  474. char *p;
  475. p = pat;
  476. while(*p) {
  477. n = chartorune(pre, p);
  478. r = *pre;
  479. switch(r) {
  480. case L'.': case L'*': case L'+': case L'?':
  481. case L'[': case L']': case L'(': case ')':
  482. case L'|': case L'^': case L'$':
  483. *pre = 0;
  484. return p-pat;
  485. case L'\\':
  486. p += n;
  487. p += chartorune(++pre, p);
  488. pre++;
  489. break;
  490. default:
  491. p += n;
  492. pre++;
  493. }
  494. }
  495. return p-pat;
  496. }
  497. int32_t
  498. seeknextline(Biobuf *b, int32_t off)
  499. {
  500. int32_t c;
  501. Bseek(b, off, 0);
  502. do {
  503. c = Bgetrune(b);
  504. } while(c>=0 && c!='\n');
  505. return Boffset(b);
  506. }
  507. /*
  508. * Get next field out of index file (either tab- or nl- terminated)
  509. * Answer in *rp, assumed to be Fieldlen long.
  510. * Return 0 if read error first.
  511. */
  512. int
  513. getfield(Rune *rp)
  514. {
  515. int32_t c;
  516. int n;
  517. for(n=Fieldlen; n-- > 0; ) {
  518. if ((c = Bgetrune(bindex)) < 0)
  519. return 0;
  520. if(c == '\t' || c == '\n') {
  521. *rp = L'\0';
  522. return 1;
  523. }
  524. *rp++ = c;
  525. }
  526. err("word too long");
  527. return 0;
  528. }
  529. /*
  530. * A compare longs function suitable for qsort
  531. */
  532. static int
  533. longcmp(const void *av, const void *bv)
  534. {
  535. int32_t v;
  536. int32_t *a, *b;
  537. a = av;
  538. b = bv;
  539. v = *a - *b;
  540. if(v < 0)
  541. return -1;
  542. else if(v == 0)
  543. return 0;
  544. else
  545. return 1;
  546. }
  547. void
  548. sortaddr(Addr *a)
  549. {
  550. int i, j;
  551. int32_t v;
  552. if(a->n <= 1)
  553. return;
  554. qsort(a->doff, a->n, sizeof(int32_t), longcmp);
  555. /* remove duplicates */
  556. for(i=0, j=0; j < a->n; j++) {
  557. v = a->doff[j];
  558. if(i > 0 && v == a->doff[i-1])
  559. continue;
  560. a->doff[i++] = v;
  561. }
  562. a->n = i;
  563. }
  564. Entry
  565. getentry(int i)
  566. {
  567. int32_t b, e, n;
  568. static Entry ans;
  569. static int anslen = 0;
  570. b = dot->doff[i];
  571. e = (*dict->nextoff)(b+1);
  572. ans.doff = b;
  573. if(e < 0) {
  574. err("couldn't seek to entry");
  575. ans.start = 0;
  576. ans.end = 0;
  577. } else {
  578. n = e-b;
  579. if(n+1 > anslen) {
  580. ans.start = realloc(ans.start, n+1);
  581. if(!ans.start) {
  582. err("out of memory");
  583. exits("nomem");
  584. }
  585. anslen = n+1;
  586. }
  587. Bseek(bdict, b, 0);
  588. n = Bread(bdict, ans.start, n);
  589. ans.end = ans.start + n;
  590. *ans.end = 0;
  591. }
  592. return ans;
  593. }
  594. void
  595. setdotnext(void)
  596. {
  597. int32_t b;
  598. b = (*dict->nextoff)(dot->doff[dot->cur]+1);
  599. if(b < 0) {
  600. err("couldn't find a next entry");
  601. return;
  602. }
  603. dot->doff[0] = b;
  604. dot->n = 1;
  605. dot->cur = 0;
  606. }
  607. void
  608. setdotprev(void)
  609. {
  610. int tryback;
  611. int32_t here, last, p;
  612. if(dot->cur < 0 || dot->cur >= dot->n)
  613. return;
  614. tryback = 2000;
  615. here = dot->doff[dot->cur];
  616. last = 0;
  617. while(last == 0) {
  618. p = here - tryback;
  619. if(p < 0)
  620. p = 0;
  621. for(;;) {
  622. p = (*dict->nextoff)(p+1);
  623. if(p < 0)
  624. return; /* shouldn't happen */
  625. if(p >= here)
  626. break;
  627. last = p;
  628. }
  629. if(!last) {
  630. if(here - tryback < 0) {
  631. err("can't find a previous entry");
  632. return;
  633. }
  634. tryback = 2*tryback;
  635. }
  636. }
  637. dot->doff[0] = last;
  638. dot->n = 1;
  639. dot->cur = 0;
  640. }