utils.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "dict.h"
  5. Dict dicts[] = {
  6. {"oed", "Oxford English Dictionary, 2nd Ed.",
  7. "/lib/dict/oed2", "/lib/dict/oed2index",
  8. oednextoff, oedprintentry, oedprintkey},
  9. {"ahd", "American Heritage Dictionary, 2nd College Ed.",
  10. "/lib/ahd/DICT.DB", "/lib/ahd/index",
  11. ahdnextoff, ahdprintentry, ahdprintkey},
  12. {"thesaurus", "Collins Thesaurus",
  13. "/lib/dict/thesaurus", "/lib/dict/thesindex",
  14. thesnextoff, thesprintentry, thesprintkey},
  15. {"ce", "Gendai Chinese->English",
  16. "/lib/dict/world/sansdata/sandic24.dat",
  17. "/lib/dict/world/sansdata/ceindex",
  18. worldnextoff, worldprintentry, worldprintkey},
  19. {"ceh", "Gendai Chinese->English (Hanzi index)",
  20. "/lib/dict/world/sansdata/sandic24.dat",
  21. "/lib/dict/world/sansdata/cehindex",
  22. worldnextoff, worldprintentry, worldprintkey},
  23. {"ec", "Gendai English->Chinese",
  24. "/lib/dict/world/sansdata/sandic24.dat",
  25. "/lib/dict/world/sansdata/ecindex",
  26. worldnextoff, worldprintentry, worldprintkey},
  27. {"dae", "Gyldendal Danish->English",
  28. "/lib/dict/world/gylddata/sandic30.dat",
  29. "/lib/dict/world/gylddata/daeindex",
  30. worldnextoff, worldprintentry, worldprintkey},
  31. {"eda", "Gyldendal English->Danish",
  32. "/lib/dict/world/gylddata/sandic29.dat",
  33. "/lib/dict/world/gylddata/edaindex",
  34. worldnextoff, worldprintentry, worldprintkey},
  35. {"due", "Wolters-Noordhoff Dutch->English",
  36. "/lib/dict/world/woltdata/sandic07.dat",
  37. "/lib/dict/world/woltdata/deindex",
  38. worldnextoff, worldprintentry, worldprintkey},
  39. {"edu", "Wolters-Noordhoff English->Dutch",
  40. "/lib/dict/world/woltdata/sandic06.dat",
  41. "/lib/dict/world/woltdata/edindex",
  42. worldnextoff, worldprintentry, worldprintkey},
  43. {"fie", "WSOY Finnish->English",
  44. "/lib/dict/world/werndata/sandic32.dat",
  45. "/lib/dict/world/werndata/fieindex",
  46. worldnextoff, worldprintentry, worldprintkey},
  47. {"efi", "WSOY English->Finnish",
  48. "/lib/dict/world/werndata/sandic31.dat",
  49. "/lib/dict/world/werndata/efiindex",
  50. worldnextoff, worldprintentry, worldprintkey},
  51. {"fe", "Collins French->English",
  52. "/lib/dict/fe", "/lib/dict/feindex",
  53. pcollnextoff, pcollprintentry, pcollprintkey},
  54. {"ef", "Collins English->French",
  55. "/lib/dict/ef", "/lib/dict/efindex",
  56. pcollnextoff, pcollprintentry, pcollprintkey},
  57. {"ge", "Collins German->English",
  58. "/lib/dict/ge", "/lib/dict/geindex",
  59. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  60. {"eg", "Collins English->German",
  61. "/lib/dict/eg", "/lib/dict/egindex",
  62. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  63. {"ie", "Collins Italian->English",
  64. "/lib/dict/ie", "/lib/dict/ieindex",
  65. pcollnextoff, pcollprintentry, pcollprintkey},
  66. {"ei", "Collins English->Italian",
  67. "/lib/dict/ei", "/lib/dict/eiindex",
  68. pcollnextoff, pcollprintentry, pcollprintkey},
  69. {"je", "Sanshusha Japanese->English",
  70. "/lib/dict/world/sansdata/sandic18.dat",
  71. "/lib/dict/world/sansdata/jeindex",
  72. worldnextoff, worldprintentry, worldprintkey},
  73. {"jek", "Sanshusha Japanese->English (Kanji index)",
  74. "/lib/dict/world/sansdata/sandic18.dat",
  75. "/lib/dict/world/sansdata/jekindex",
  76. worldnextoff, worldprintentry, worldprintkey},
  77. {"ej", "Sanshusha English->Japanese",
  78. "/lib/dict/world/sansdata/sandic18.dat",
  79. "/lib/dict/world/sansdata/ejindex",
  80. worldnextoff, worldprintentry, worldprintkey},
  81. {"tjeg", "Sanshusha technical Japanese->English,German",
  82. "/lib/dict/world/sansdata/sandic16.dat",
  83. "/lib/dict/world/sansdata/tjegindex",
  84. worldnextoff, worldprintentry, worldprintkey},
  85. {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)",
  86. "/lib/dict/world/sansdata/sandic16.dat",
  87. "/lib/dict/world/sansdata/tjegkindex",
  88. worldnextoff, worldprintentry, worldprintkey},
  89. {"tegj", "Sanshusha technical English->German,Japanese",
  90. "/lib/dict/world/sansdata/sandic16.dat",
  91. "/lib/dict/world/sansdata/tegjindex",
  92. worldnextoff, worldprintentry, worldprintkey},
  93. {"tgje", "Sanshusha technical German->Japanese,English",
  94. "/lib/dict/world/sansdata/sandic16.dat",
  95. "/lib/dict/world/sansdata/tgjeindex",
  96. worldnextoff, worldprintentry, worldprintkey},
  97. {"ne", "Kunnskapforlaget Norwegian->English",
  98. "/lib/dict/world/kunndata/sandic28.dat",
  99. "/lib/dict/world/kunndata/neindex",
  100. worldnextoff, worldprintentry, worldprintkey},
  101. {"en", "Kunnskapforlaget English->Norwegian",
  102. "/lib/dict/world/kunndata/sandic27.dat",
  103. "/lib/dict/world/kunndata/enindex",
  104. worldnextoff, worldprintentry, worldprintkey},
  105. {"re", "Leon Ungier Russian->English",
  106. "/lib/dict/re", "/lib/dict/reindex",
  107. simplenextoff, simpleprintentry, simpleprintkey},
  108. {"er", "Leon Ungier English->Russian",
  109. "/lib/dict/re", "/lib/dict/erindex",
  110. simplenextoff, simpleprintentry, simpleprintkey},
  111. {"se", "Collins Spanish->English",
  112. "/lib/dict/se", "/lib/dict/seindex",
  113. pcollnextoff, pcollprintentry, pcollprintkey},
  114. {"es", "Collins English->Spanish",
  115. "/lib/dict/es", "/lib/dict/esindex",
  116. pcollnextoff, pcollprintentry, pcollprintkey},
  117. {"swe", "Esselte Studium Swedish->English",
  118. "/lib/dict/world/essedata/sandic34.dat",
  119. "/lib/dict/world/essedata/sweindex",
  120. worldnextoff, worldprintentry, worldprintkey},
  121. {"esw", "Esselte Studium English->Swedish",
  122. "/lib/dict/world/essedata/sandic33.dat",
  123. "/lib/dict/world/essedata/eswindex",
  124. worldnextoff, worldprintentry, worldprintkey},
  125. {"movie", "Movies -- by title",
  126. "/lib/movie/data", "/lib/dict/movtindex",
  127. movienextoff, movieprintentry, movieprintkey},
  128. {"moviea", "Movies -- by actor",
  129. "/lib/movie/data", "/lib/dict/movaindex",
  130. movienextoff, movieprintentry, movieprintkey},
  131. {"movied", "Movies -- by director",
  132. "/lib/movie/data", "/lib/dict/movdindex",
  133. movienextoff, movieprintentry, movieprintkey},
  134. {"slang", "English Slang",
  135. "/lib/dict/slang", "/lib/dict/slangindex",
  136. slangnextoff, slangprintentry, slangprintkey},
  137. {"robert", "Robert Électronique",
  138. "/lib/dict/robert/_pointers", "/lib/dict/robert/_index",
  139. robertnextoff, robertindexentry, robertprintkey},
  140. {"robertv", "Robert Électronique - formes des verbes",
  141. "/lib/dict/robert/flex.rob", "/lib/dict/robert/_flexindex",
  142. robertnextflex, robertflexentry, robertprintkey},
  143. {0, 0, 0, 0, 0}
  144. };
  145. typedef struct Lig Lig;
  146. struct Lig {
  147. Rune start; /* accent rune */
  148. Rune *pairs; /* <char,accented version> pairs */
  149. };
  150. static Lig ligtab[Nligs] = {
  151. [LACU-LIGS] {L'´', L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
  152. [LGRV-LIGS] {L'ˋ', L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
  153. [LUML-LIGS] {L'¨', L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
  154. [LCED-LIGS] {L'¸', L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
  155. [LTIL-LIGS] {L'˜', L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
  156. [LBRV-LIGS] {L'˘', L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
  157. [LRNG-LIGS] {L'˚', L"AÅaåUŮuů"},
  158. [LDOT-LIGS] {L'˙', L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
  159. [LDTB-LIGS] {L'.', L""},
  160. [LFRN-LIGS] {L'⌢', L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
  161. [LFRB-LIGS] {L'̯', L""},
  162. [LOGO-LIGS] {L'˛', L"AĄaąEĘeęIĮiįıįUŲuų"},
  163. [LMAC-LIGS] {L'¯', L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
  164. [LHCK-LIGS] {L'ˇ', L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
  165. [LASP-LIGS] {L'ʽ', L""},
  166. [LLEN-LIGS] {L'ʼ', L""},
  167. [LBRB-LIGS] {L'̮', L""}
  168. };
  169. Rune *multitab[Nmulti] = {
  170. [MAAS-MULTI] L"ʽα",
  171. [MALN-MULTI] L"ʼα",
  172. [MAND-MULTI] L"and",
  173. [MAOQ-MULTI] L"a/q",
  174. [MBRA-MULTI] L"<|",
  175. [MDD-MULTI] L"..",
  176. [MDDD-MULTI] L"...",
  177. [MEAS-MULTI] L"ʽε",
  178. [MELN-MULTI] L"ʼε",
  179. [MEMM-MULTI] L"——",
  180. [MHAS-MULTI] L"ʽη",
  181. [MHLN-MULTI] L"ʼη",
  182. [MIAS-MULTI] L"ʽι",
  183. [MILN-MULTI] L"ʼι",
  184. [MLCT-MULTI] L"ct",
  185. [MLFF-MULTI] L"ff",
  186. [MLFFI-MULTI] L"ffi",
  187. [MLFFL-MULTI] L"ffl",
  188. [MLFL-MULTI] L"fl",
  189. [MLFI-MULTI] L"fi",
  190. [MLLS-MULTI] L"ɫɫ",
  191. [MLST-MULTI] L"st",
  192. [MOAS-MULTI] L"ʽο",
  193. [MOLN-MULTI] L"ʼο",
  194. [MOR-MULTI] L"or",
  195. [MRAS-MULTI] L"ʽρ",
  196. [MRLN-MULTI] L"ʼρ",
  197. [MTT-MULTI] L"~~",
  198. [MUAS-MULTI] L"ʽυ",
  199. [MULN-MULTI] L"ʼυ",
  200. [MWAS-MULTI] L"ʽω",
  201. [MWLN-MULTI] L"ʼω",
  202. [MOE-MULTI] L"oe",
  203. [MES-MULTI] L" ",
  204. };
  205. #define risupper(r) (L'A' <= (r) && (r) <= L'Z')
  206. #define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF)
  207. #define rtolower(r) ((r)-'A'+'a')
  208. static Rune latin_fold_tab[] =
  209. {
  210. /* Table to fold latin 1 characters to ASCII equivalents
  211. based at Rune value 0xc0
  212. À Á Â Ã Ä Å Æ Ç
  213. È É Ê Ë Ì Í Î Ï
  214. Ð Ñ Ò Ó Ô Õ Ö ×
  215. Ø Ù Ú Û Ü Ý Þ ß
  216. à á â ã ä å æ ç
  217. è é ê ë ì í î ï
  218. ð ñ ò ó ô õ ö ÷
  219. ø ù ú û ü ý þ ÿ
  220. */
  221. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  222. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  223. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  224. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 ,
  225. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  226. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  227. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  228. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y',
  229. };
  230. static Rune *ttabstack[20];
  231. static int ntt;
  232. /*
  233. * tab is an array of n Assoc's, sorted by key.
  234. * Look for key in tab, and return corresponding val
  235. * or -1 if not there
  236. */
  237. long
  238. lookassoc(Assoc *tab, int n, char *key)
  239. {
  240. Assoc *q;
  241. long i, low, high;
  242. int r;
  243. for(low = -1, high = n; high > low+1; ){
  244. i = (high+low)/2;
  245. q = &tab[i];
  246. if((r=strcmp(key, q->key))<0)
  247. high = i;
  248. else if(r == 0)
  249. return q->val;
  250. else
  251. low=i;
  252. }
  253. return -1;
  254. }
  255. long
  256. looknassoc(Nassoc *tab, int n, long key)
  257. {
  258. Nassoc *q;
  259. long i, low, high;
  260. for(low = -1, high = n; high > low+1; ){
  261. i = (high+low)/2;
  262. q = &tab[i];
  263. if(key < q->key)
  264. high = i;
  265. else if(key == q->key)
  266. return q->val;
  267. else
  268. low=i;
  269. }
  270. return -1;
  271. }
  272. void
  273. err(char *fmt, ...)
  274. {
  275. char buf[1000];
  276. va_list v;
  277. va_start(v, fmt);
  278. vsnprint(buf, sizeof(buf), fmt, v);
  279. va_end(v);
  280. fprint(2, "%s: %s\n", argv0, buf);
  281. }
  282. /*
  283. * Write the rune r to bout, keeping track of line length
  284. * and breaking the lines (at blanks) when they get too long
  285. */
  286. void
  287. outrune(long r)
  288. {
  289. if(outinhibit)
  290. return;
  291. if(++linelen > breaklen && r == L' ') {
  292. Bputc(bout, '\n');
  293. linelen = 0;
  294. } else
  295. Bputrune(bout, r);
  296. }
  297. void
  298. outrunes(Rune *rp)
  299. {
  300. Rune r;
  301. while((r = *rp++) != 0)
  302. outrune(r);
  303. }
  304. /* like outrune, but when arg is know to be a char */
  305. void
  306. outchar(int c)
  307. {
  308. if(outinhibit)
  309. return;
  310. if(++linelen > breaklen && c == ' ') {
  311. c ='\n';
  312. linelen = 0;
  313. }
  314. Bputc(bout, c);
  315. }
  316. void
  317. outchars(char *s)
  318. {
  319. char c;
  320. while((c = *s++) != 0)
  321. outchar(c);
  322. }
  323. void
  324. outprint(char *fmt, ...)
  325. {
  326. char buf[1000];
  327. va_list v;
  328. va_start(v, fmt);
  329. vsnprint(buf, sizeof(buf), fmt, v);
  330. va_end(v);
  331. outchars(buf);
  332. }
  333. void
  334. outpiece(char *b, char *e)
  335. {
  336. int c, lastc;
  337. lastc = 0;
  338. while(b < e) {
  339. c = *b++;
  340. if(c == '\n')
  341. c = ' ';
  342. if(!(c == ' ' && lastc == ' '))
  343. outchar(c);
  344. lastc = c;
  345. }
  346. }
  347. /*
  348. * Go to new line if not already there; indent if ind != 0.
  349. * If ind > 1, leave a blank line too.
  350. * Slight hack: assume if current line is only one or two
  351. * characters long, then they were spaces.
  352. */
  353. void
  354. outnl(int ind)
  355. {
  356. if(outinhibit)
  357. return;
  358. if(ind) {
  359. if(ind > 1) {
  360. if(linelen > 2)
  361. Bputc(bout, '\n');
  362. Bprint(bout, "\n ");
  363. } else if(linelen == 0)
  364. Bprint(bout, " ");
  365. else if(linelen == 1)
  366. Bputc(bout, ' ');
  367. else if(linelen != 2)
  368. Bprint(bout, "\n ");
  369. linelen = 2;
  370. } else {
  371. if(linelen) {
  372. Bputc(bout, '\n');
  373. linelen = 0;
  374. }
  375. }
  376. }
  377. /*
  378. * Fold the runes in null-terminated rp.
  379. * Use the sort(1) definition of folding (uppercase to lowercase,
  380. * latin1-accented characters to corresponding unaccented chars)
  381. */
  382. void
  383. fold(Rune *rp)
  384. {
  385. Rune r;
  386. while((r = *rp) != 0) {
  387. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  388. r = latin_fold_tab[r-0xc0];
  389. if(risupper(r))
  390. r = rtolower(r);
  391. *rp++ = r;
  392. }
  393. }
  394. /*
  395. * Like fold, but put folded result into new
  396. * (assumed to have enough space).
  397. * old is a regular expression, but we know that
  398. * metacharacters aren't affected
  399. */
  400. void
  401. foldre(char *new, char *old)
  402. {
  403. Rune r;
  404. while(*old) {
  405. old += chartorune(&r, old);
  406. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  407. r = latin_fold_tab[r-0xc0];
  408. if(risupper(r))
  409. r = rtolower(r);
  410. new += runetochar(new, &r);
  411. }
  412. *new = 0;
  413. }
  414. /*
  415. * acomp(s, t) returns:
  416. * -2 if s strictly precedes t
  417. * -1 if s is a prefix of t
  418. * 0 if s is the same as t
  419. * 1 if t is a prefix of s
  420. * 2 if t strictly precedes s
  421. */
  422. int
  423. acomp(Rune *s, Rune *t)
  424. {
  425. int cs, ct;
  426. for(;;) {
  427. cs = *s;
  428. ct = *t;
  429. if(cs != ct)
  430. break;
  431. if(cs == 0)
  432. return 0;
  433. s++;
  434. t++;
  435. }
  436. if(cs == 0)
  437. return -1;
  438. if(ct == 0)
  439. return 1;
  440. if(cs < ct)
  441. return -2;
  442. return 2;
  443. }
  444. /*
  445. * Copy null terminated Runes from 'from' to 'to'.
  446. */
  447. void
  448. runescpy(Rune *to, Rune *from)
  449. {
  450. while((*to++ = *from++) != 0)
  451. continue;
  452. }
  453. /*
  454. * Conversion of unsigned number to long, no overflow detection
  455. */
  456. long
  457. runetol(Rune *r)
  458. {
  459. int c;
  460. long n;
  461. n = 0;
  462. for(;; r++){
  463. c = *r;
  464. if(L'0'<=c && c<=L'9')
  465. c -= '0';
  466. else
  467. break;
  468. n = n*10 + c;
  469. }
  470. return n;
  471. }
  472. /*
  473. * See if there is a rune corresponding to the accented
  474. * version of r with accent acc (acc in [LIGS..LIGE-1]),
  475. * and return it if so, else return NONE.
  476. */
  477. Rune
  478. liglookup(Rune acc, Rune r)
  479. {
  480. Rune *p;
  481. if(acc < LIGS || acc >= LIGE)
  482. return NONE;
  483. for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
  484. if(*p == r)
  485. return *(p+1);
  486. return NONE;
  487. }
  488. /*
  489. * Maintain a translation table stack (a translation table
  490. * is an array of Runes indexed by bytes or 7-bit bytes).
  491. * If starting is true, push the curtab onto the stack
  492. * and return newtab; else pop the top of the stack and
  493. * return it.
  494. * If curtab is 0, initialize the stack and return.
  495. */
  496. Rune *
  497. changett(Rune *curtab, Rune *newtab, int starting)
  498. {
  499. if(curtab == 0) {
  500. ntt = 0;
  501. return 0;
  502. }
  503. if(starting) {
  504. if(ntt >= asize(ttabstack)) {
  505. if(debug)
  506. err("translation stack overflow");
  507. return curtab;
  508. }
  509. ttabstack[ntt++] = curtab;
  510. return newtab;
  511. } else {
  512. if(ntt == 0) {
  513. if(debug)
  514. err("translation stack underflow");
  515. return curtab;
  516. }
  517. return ttabstack[--ntt];
  518. }
  519. }