utils.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "dict.h"
  5. Dict dicts[] = {
  6. {"oed", "Oxford English Dictionary, 2nd Ed.",
  7. "/lib/dict/oed2", "/lib/dict/oed2index",
  8. oednextoff, oedprintentry, oedprintkey},
  9. {"ahd", "American Heritage Dictionary, 2nd College Ed.",
  10. "/lib/ahd/DICT.DB", "/lib/ahd/index",
  11. ahdnextoff, ahdprintentry, ahdprintkey},
  12. {"pgw", "Project Gutenberg Webster Dictionary",
  13. "/lib/dict/pgw", "/lib/dict/pgwindex",
  14. pgwnextoff, pgwprintentry, pgwprintkey},
  15. {"thesaurus", "Collins Thesaurus",
  16. "/lib/dict/thesaurus", "/lib/dict/thesindex",
  17. thesnextoff, thesprintentry, thesprintkey},
  18. {"ce", "Gendai Chinese->English",
  19. "/lib/dict/world/sansdata/sandic24.dat",
  20. "/lib/dict/world/sansdata/ceindex",
  21. worldnextoff, worldprintentry, worldprintkey},
  22. {"ceh", "Gendai Chinese->English (Hanzi index)",
  23. "/lib/dict/world/sansdata/sandic24.dat",
  24. "/lib/dict/world/sansdata/cehindex",
  25. worldnextoff, worldprintentry, worldprintkey},
  26. {"ec", "Gendai English->Chinese",
  27. "/lib/dict/world/sansdata/sandic24.dat",
  28. "/lib/dict/world/sansdata/ecindex",
  29. worldnextoff, worldprintentry, worldprintkey},
  30. {"dae", "Gyldendal Danish->English",
  31. "/lib/dict/world/gylddata/sandic30.dat",
  32. "/lib/dict/world/gylddata/daeindex",
  33. worldnextoff, worldprintentry, worldprintkey},
  34. {"eda", "Gyldendal English->Danish",
  35. "/lib/dict/world/gylddata/sandic29.dat",
  36. "/lib/dict/world/gylddata/edaindex",
  37. worldnextoff, worldprintentry, worldprintkey},
  38. {"due", "Wolters-Noordhoff Dutch->English",
  39. "/lib/dict/world/woltdata/sandic07.dat",
  40. "/lib/dict/world/woltdata/deindex",
  41. worldnextoff, worldprintentry, worldprintkey},
  42. {"edu", "Wolters-Noordhoff English->Dutch",
  43. "/lib/dict/world/woltdata/sandic06.dat",
  44. "/lib/dict/world/woltdata/edindex",
  45. worldnextoff, worldprintentry, worldprintkey},
  46. {"fie", "WSOY Finnish->English",
  47. "/lib/dict/world/werndata/sandic32.dat",
  48. "/lib/dict/world/werndata/fieindex",
  49. worldnextoff, worldprintentry, worldprintkey},
  50. {"efi", "WSOY English->Finnish",
  51. "/lib/dict/world/werndata/sandic31.dat",
  52. "/lib/dict/world/werndata/efiindex",
  53. worldnextoff, worldprintentry, worldprintkey},
  54. {"fe", "Collins French->English",
  55. "/lib/dict/fe", "/lib/dict/feindex",
  56. pcollnextoff, pcollprintentry, pcollprintkey},
  57. {"ef", "Collins English->French",
  58. "/lib/dict/ef", "/lib/dict/efindex",
  59. pcollnextoff, pcollprintentry, pcollprintkey},
  60. {"ge", "Collins German->English",
  61. "/lib/dict/ge", "/lib/dict/geindex",
  62. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  63. {"eg", "Collins English->German",
  64. "/lib/dict/eg", "/lib/dict/egindex",
  65. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  66. {"ie", "Collins Italian->English",
  67. "/lib/dict/ie", "/lib/dict/ieindex",
  68. pcollnextoff, pcollprintentry, pcollprintkey},
  69. {"ei", "Collins English->Italian",
  70. "/lib/dict/ei", "/lib/dict/eiindex",
  71. pcollnextoff, pcollprintentry, pcollprintkey},
  72. {"je", "Sanshusha Japanese->English",
  73. "/lib/dict/world/sansdata/sandic18.dat",
  74. "/lib/dict/world/sansdata/jeindex",
  75. worldnextoff, worldprintentry, worldprintkey},
  76. {"jek", "Sanshusha Japanese->English (Kanji index)",
  77. "/lib/dict/world/sansdata/sandic18.dat",
  78. "/lib/dict/world/sansdata/jekindex",
  79. worldnextoff, worldprintentry, worldprintkey},
  80. {"ej", "Sanshusha English->Japanese",
  81. "/lib/dict/world/sansdata/sandic18.dat",
  82. "/lib/dict/world/sansdata/ejindex",
  83. worldnextoff, worldprintentry, worldprintkey},
  84. {"tjeg", "Sanshusha technical Japanese->English,German",
  85. "/lib/dict/world/sansdata/sandic16.dat",
  86. "/lib/dict/world/sansdata/tjegindex",
  87. worldnextoff, worldprintentry, worldprintkey},
  88. {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)",
  89. "/lib/dict/world/sansdata/sandic16.dat",
  90. "/lib/dict/world/sansdata/tjegkindex",
  91. worldnextoff, worldprintentry, worldprintkey},
  92. {"tegj", "Sanshusha technical English->German,Japanese",
  93. "/lib/dict/world/sansdata/sandic16.dat",
  94. "/lib/dict/world/sansdata/tegjindex",
  95. worldnextoff, worldprintentry, worldprintkey},
  96. {"tgje", "Sanshusha technical German->Japanese,English",
  97. "/lib/dict/world/sansdata/sandic16.dat",
  98. "/lib/dict/world/sansdata/tgjeindex",
  99. worldnextoff, worldprintentry, worldprintkey},
  100. {"ne", "Kunnskapforlaget Norwegian->English",
  101. "/lib/dict/world/kunndata/sandic28.dat",
  102. "/lib/dict/world/kunndata/neindex",
  103. worldnextoff, worldprintentry, worldprintkey},
  104. {"en", "Kunnskapforlaget English->Norwegian",
  105. "/lib/dict/world/kunndata/sandic27.dat",
  106. "/lib/dict/world/kunndata/enindex",
  107. worldnextoff, worldprintentry, worldprintkey},
  108. {"re", "Leon Ungier Russian->English",
  109. "/lib/dict/re", "/lib/dict/reindex",
  110. simplenextoff, simpleprintentry, simpleprintkey},
  111. {"er", "Leon Ungier English->Russian",
  112. "/lib/dict/re", "/lib/dict/erindex",
  113. simplenextoff, simpleprintentry, simpleprintkey},
  114. {"se", "Collins Spanish->English",
  115. "/lib/dict/se", "/lib/dict/seindex",
  116. pcollnextoff, pcollprintentry, pcollprintkey},
  117. {"es", "Collins English->Spanish",
  118. "/lib/dict/es", "/lib/dict/esindex",
  119. pcollnextoff, pcollprintentry, pcollprintkey},
  120. {"swe", "Esselte Studium Swedish->English",
  121. "/lib/dict/world/essedata/sandic34.dat",
  122. "/lib/dict/world/essedata/sweindex",
  123. worldnextoff, worldprintentry, worldprintkey},
  124. {"esw", "Esselte Studium English->Swedish",
  125. "/lib/dict/world/essedata/sandic33.dat",
  126. "/lib/dict/world/essedata/eswindex",
  127. worldnextoff, worldprintentry, worldprintkey},
  128. {"movie", "Movies -- by title",
  129. "/lib/movie/data", "/lib/dict/movtindex",
  130. movienextoff, movieprintentry, movieprintkey},
  131. {"moviea", "Movies -- by actor",
  132. "/lib/movie/data", "/lib/dict/movaindex",
  133. movienextoff, movieprintentry, movieprintkey},
  134. {"movied", "Movies -- by director",
  135. "/lib/movie/data", "/lib/dict/movdindex",
  136. movienextoff, movieprintentry, movieprintkey},
  137. {"slang", "English Slang",
  138. "/lib/dict/slang", "/lib/dict/slangindex",
  139. slangnextoff, slangprintentry, slangprintkey},
  140. {"robert", "Robert Électronique",
  141. "/lib/dict/robert/_pointers", "/lib/dict/robert/_index",
  142. robertnextoff, robertindexentry, robertprintkey},
  143. {"robertv", "Robert Électronique - formes des verbes",
  144. "/lib/dict/robert/flex.rob", "/lib/dict/robert/_flexindex",
  145. robertnextflex, robertflexentry, robertprintkey},
  146. {0, 0, 0, 0, 0}
  147. };
  148. typedef struct Lig Lig;
  149. struct Lig {
  150. Rune start; /* accent rune */
  151. Rune *pairs; /* <char,accented version> pairs */
  152. };
  153. static Lig ligtab[Nligs] = {
  154. [LACU-LIGS] {L'´', L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
  155. [LGRV-LIGS] {L'ˋ', L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
  156. [LUML-LIGS] {L'¨', L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
  157. [LCED-LIGS] {L'¸', L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
  158. [LTIL-LIGS] {L'˜', L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
  159. [LBRV-LIGS] {L'˘', L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
  160. [LRNG-LIGS] {L'˚', L"AÅaåUŮuů"},
  161. [LDOT-LIGS] {L'˙', L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
  162. [LDTB-LIGS] {L'.', L""},
  163. [LFRN-LIGS] {L'⌢', L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
  164. [LFRB-LIGS] {L'̯', L""},
  165. [LOGO-LIGS] {L'˛', L"AĄaąEĘeęIĮiįıįUŲuų"},
  166. [LMAC-LIGS] {L'¯', L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
  167. [LHCK-LIGS] {L'ˇ', L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
  168. [LASP-LIGS] {L'ʽ', L""},
  169. [LLEN-LIGS] {L'ʼ', L""},
  170. [LBRB-LIGS] {L'̮', L""}
  171. };
  172. Rune *multitab[Nmulti] = {
  173. [MAAS-MULTI] L"ʽα",
  174. [MALN-MULTI] L"ʼα",
  175. [MAND-MULTI] L"and",
  176. [MAOQ-MULTI] L"a/q",
  177. [MBRA-MULTI] L"<|",
  178. [MDD-MULTI] L"..",
  179. [MDDD-MULTI] L"...",
  180. [MEAS-MULTI] L"ʽε",
  181. [MELN-MULTI] L"ʼε",
  182. [MEMM-MULTI] L"——",
  183. [MHAS-MULTI] L"ʽη",
  184. [MHLN-MULTI] L"ʼη",
  185. [MIAS-MULTI] L"ʽι",
  186. [MILN-MULTI] L"ʼι",
  187. [MLCT-MULTI] L"ct",
  188. [MLFF-MULTI] L"ff",
  189. [MLFFI-MULTI] L"ffi",
  190. [MLFFL-MULTI] L"ffl",
  191. [MLFL-MULTI] L"fl",
  192. [MLFI-MULTI] L"fi",
  193. [MLLS-MULTI] L"ɫɫ",
  194. [MLST-MULTI] L"st",
  195. [MOAS-MULTI] L"ʽο",
  196. [MOLN-MULTI] L"ʼο",
  197. [MOR-MULTI] L"or",
  198. [MRAS-MULTI] L"ʽρ",
  199. [MRLN-MULTI] L"ʼρ",
  200. [MTT-MULTI] L"~~",
  201. [MUAS-MULTI] L"ʽυ",
  202. [MULN-MULTI] L"ʼυ",
  203. [MWAS-MULTI] L"ʽω",
  204. [MWLN-MULTI] L"ʼω",
  205. [MOE-MULTI] L"oe",
  206. [MES-MULTI] L" ",
  207. };
  /*
   * Rune classification helpers.  risupper and rislatin1 expand
   * their argument twice, so pass only side-effect-free expressions.
   */
  #define risupper(r) ((r) >= L'A' && (r) <= L'Z')    /* ASCII capital letter */
  #define rislatin1(r) ((r) >= 0xC0 && (r) <= 0xFF)   /* within 0xC0..0xFF */
  #define rtolower(r) ((r) + ('a' - 'A'))             /* meaningful only when risupper(r) */
  211. static Rune latin_fold_tab[] =
  212. {
  213. /* Table to fold latin 1 characters to ASCII equivalents
  214. based at Rune value 0xc0
  215. À Á Â Ã Ä Å Æ Ç
  216. È É Ê Ë Ì Í Î Ï
  217. Ð Ñ Ò Ó Ô Õ Ö ×
  218. Ø Ù Ú Û Ü Ý Þ ß
  219. à á â ã ä å æ ç
  220. è é ê ë ì í î ï
  221. ð ñ ò ó ô õ ö ÷
  222. ø ù ú û ü ý þ ÿ
  223. */
  224. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  225. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  226. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  227. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 ,
  228. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  229. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  230. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  231. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y',
  232. };
  233. static Rune *ttabstack[20];
  234. static int ntt;
  235. /*
  236. * tab is an array of n Assoc's, sorted by key.
  237. * Look for key in tab, and return corresponding val
  238. * or -1 if not there
  239. */
  240. long
  241. lookassoc(Assoc *tab, int n, char *key)
  242. {
  243. Assoc *q;
  244. long i, low, high;
  245. int r;
  246. for(low = -1, high = n; high > low+1; ){
  247. i = (high+low)/2;
  248. q = &tab[i];
  249. if((r=strcmp(key, q->key))<0)
  250. high = i;
  251. else if(r == 0)
  252. return q->val;
  253. else
  254. low=i;
  255. }
  256. return -1;
  257. }
  258. long
  259. looknassoc(Nassoc *tab, int n, long key)
  260. {
  261. Nassoc *q;
  262. long i, low, high;
  263. for(low = -1, high = n; high > low+1; ){
  264. i = (high+low)/2;
  265. q = &tab[i];
  266. if(key < q->key)
  267. high = i;
  268. else if(key == q->key)
  269. return q->val;
  270. else
  271. low=i;
  272. }
  273. return -1;
  274. }
  275. void
  276. err(char *fmt, ...)
  277. {
  278. char buf[1000];
  279. va_list v;
  280. va_start(v, fmt);
  281. vsnprint(buf, sizeof(buf), fmt, v);
  282. va_end(v);
  283. fprint(2, "%s: %s\n", argv0, buf);
  284. }
  285. /*
  286. * Write the rune r to bout, keeping track of line length
  287. * and breaking the lines (at blanks) when they get too long
  288. */
  289. void
  290. outrune(long r)
  291. {
  292. if(outinhibit)
  293. return;
  294. if(++linelen > breaklen && r == L' ') {
  295. Bputc(bout, '\n');
  296. linelen = 0;
  297. } else
  298. Bputrune(bout, r);
  299. }
  300. void
  301. outrunes(Rune *rp)
  302. {
  303. Rune r;
  304. while((r = *rp++) != 0)
  305. outrune(r);
  306. }
  307. /* like outrune, but when arg is know to be a char */
  308. void
  309. outchar(int c)
  310. {
  311. if(outinhibit)
  312. return;
  313. if(++linelen > breaklen && c == ' ') {
  314. c ='\n';
  315. linelen = 0;
  316. }
  317. Bputc(bout, c);
  318. }
  /* Pass each char of the NUL-terminated string s to outchar, in order. */
  void
  outchars(char *s)
  {
      for(; *s != 0; s++)
          outchar(*s);
  }
  /*
   * Format print-style into a local buffer (bounded by its size)
   * and send the result through outchars.
   */
  void
  outprint(char *fmt, ...)
  {
      va_list ap;
      char text[1000];

      va_start(ap, fmt);
      vsnprint(text, sizeof text, fmt, ap);
      va_end(ap);

      outchars(text);
  }
  /*
   * Output the bytes in [b, e) through outchar.  Newlines are
   * treated as blanks, and a blank that directly follows another
   * blank is dropped, so runs of white space come out as one blank.
   */
  void
  outpiece(char *b, char *e)
  {
      int cur, prev;

      for(prev = 0; b < e; b++){
          cur = *b;
          if(cur == '\n')
              cur = ' ';
          if(cur != ' ' || prev != ' ')
              outchar(cur);
          prev = cur;
      }
  }
  350. /*
  351. * Go to new line if not already there; indent if ind != 0.
  352. * If ind > 1, leave a blank line too.
  353. * Slight hack: assume if current line is only one or two
  354. * characters long, then they were spaces.
  355. */
  356. void
  357. outnl(int ind)
  358. {
  359. if(outinhibit)
  360. return;
  361. if(ind) {
  362. if(ind > 1) {
  363. if(linelen > 2)
  364. Bputc(bout, '\n');
  365. Bprint(bout, "\n ");
  366. } else if(linelen == 0)
  367. Bprint(bout, " ");
  368. else if(linelen == 1)
  369. Bputc(bout, ' ');
  370. else if(linelen != 2)
  371. Bprint(bout, "\n ");
  372. linelen = 2;
  373. } else {
  374. if(linelen) {
  375. Bputc(bout, '\n');
  376. linelen = 0;
  377. }
  378. }
  379. }
  380. /*
  381. * Fold the runes in null-terminated rp.
  382. * Use the sort(1) definition of folding (uppercase to lowercase,
  383. * latin1-accented characters to corresponding unaccented chars)
  384. */
  385. void
  386. fold(Rune *rp)
  387. {
  388. Rune r;
  389. while((r = *rp) != 0) {
  390. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  391. r = latin_fold_tab[r-0xc0];
  392. if(risupper(r))
  393. r = rtolower(r);
  394. *rp++ = r;
  395. }
  396. }
  397. /*
  398. * Like fold, but put folded result into new
  399. * (assumed to have enough space).
  400. * old is a regular expression, but we know that
  401. * metacharacters aren't affected
  402. */
  403. void
  404. foldre(char *new, char *old)
  405. {
  406. Rune r;
  407. while(*old) {
  408. old += chartorune(&r, old);
  409. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  410. r = latin_fold_tab[r-0xc0];
  411. if(risupper(r))
  412. r = rtolower(r);
  413. new += runetochar(new, &r);
  414. }
  415. *new = 0;
  416. }
  417. /*
  418. * acomp(s, t) returns:
  419. * -2 if s strictly precedes t
  420. * -1 if s is a prefix of t
  421. * 0 if s is the same as t
  422. * 1 if t is a prefix of s
  423. * 2 if t strictly precedes s
  424. */
  425. int
  426. acomp(Rune *s, Rune *t)
  427. {
  428. int cs, ct;
  429. for(;;) {
  430. cs = *s;
  431. ct = *t;
  432. if(cs != ct)
  433. break;
  434. if(cs == 0)
  435. return 0;
  436. s++;
  437. t++;
  438. }
  439. if(cs == 0)
  440. return -1;
  441. if(ct == 0)
  442. return 1;
  443. if(cs < ct)
  444. return -2;
  445. return 2;
  446. }
  447. /*
  448. * Copy null terminated Runes from 'from' to 'to'.
  449. */
  450. void
  451. runescpy(Rune *to, Rune *from)
  452. {
  453. while((*to++ = *from++) != 0)
  454. continue;
  455. }
  456. /*
  457. * Conversion of unsigned number to long, no overflow detection
  458. */
  459. long
  460. runetol(Rune *r)
  461. {
  462. int c;
  463. long n;
  464. n = 0;
  465. for(;; r++){
  466. c = *r;
  467. if(L'0'<=c && c<=L'9')
  468. c -= '0';
  469. else
  470. break;
  471. n = n*10 + c;
  472. }
  473. return n;
  474. }
  475. /*
  476. * See if there is a rune corresponding to the accented
  477. * version of r with accent acc (acc in [LIGS..LIGE-1]),
  478. * and return it if so, else return NONE.
  479. */
  480. Rune
  481. liglookup(Rune acc, Rune r)
  482. {
  483. Rune *p;
  484. if(acc < LIGS || acc >= LIGE)
  485. return NONE;
  486. for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
  487. if(*p == r)
  488. return *(p+1);
  489. return NONE;
  490. }
  491. /*
  492. * Maintain a translation table stack (a translation table
  493. * is an array of Runes indexed by bytes or 7-bit bytes).
  494. * If starting is true, push the curtab onto the stack
  495. * and return newtab; else pop the top of the stack and
  496. * return it.
  497. * If curtab is 0, initialize the stack and return.
  498. */
  499. Rune *
  500. changett(Rune *curtab, Rune *newtab, int starting)
  501. {
  502. if(curtab == 0) {
  503. ntt = 0;
  504. return 0;
  505. }
  506. if(starting) {
  507. if(ntt >= asize(ttabstack)) {
  508. if(debug)
  509. err("translation stack overflow");
  510. return curtab;
  511. }
  512. ttabstack[ntt++] = curtab;
  513. return newtab;
  514. } else {
  515. if(ntt == 0) {
  516. if(debug)
  517. err("translation stack underflow");
  518. return curtab;
  519. }
  520. return ttabstack[--ntt];
  521. }
  522. }