utils.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "dict.h"
  5. Dict dicts[] = {
  6. {"oed", "Oxford English Dictionary, 2nd Ed.",
  7. "/lib/dict/oed2", "/lib/dict/oed2index",
  8. oednextoff, oedprintentry, oedprintkey},
  9. {"ahd", "American Heritage Dictionary, 2nd College Ed.",
  10. "/lib/ahd/DICT.DB", "/lib/ahd/index",
  11. ahdnextoff, ahdprintentry, ahdprintkey},
  12. {"pgw", "Project Gutenberg Webster Dictionary",
  13. "/lib/dict/pgw", "/lib/dict/pgwindex",
  14. pgwnextoff, pgwprintentry, pgwprintkey},
  15. {"thesaurus", "Collins Thesaurus",
  16. "/lib/dict/thesaurus", "/lib/dict/thesindex",
  17. thesnextoff, thesprintentry, thesprintkey},
  18. {"roget", "Project Gutenberg Roget's Thesaurus",
  19. "/lib/dict/roget", "/lib/dict/rogetindex",
  20. rogetnextoff, rogetprintentry, rogetprintkey},
  21. {"ce", "Gendai Chinese->English",
  22. "/lib/dict/world/sansdata/sandic24.dat",
  23. "/lib/dict/world/sansdata/ceindex",
  24. worldnextoff, worldprintentry, worldprintkey},
  25. {"ceh", "Gendai Chinese->English (Hanzi index)",
  26. "/lib/dict/world/sansdata/sandic24.dat",
  27. "/lib/dict/world/sansdata/cehindex",
  28. worldnextoff, worldprintentry, worldprintkey},
  29. {"ec", "Gendai English->Chinese",
  30. "/lib/dict/world/sansdata/sandic24.dat",
  31. "/lib/dict/world/sansdata/ecindex",
  32. worldnextoff, worldprintentry, worldprintkey},
  33. {"dae", "Gyldendal Danish->English",
  34. "/lib/dict/world/gylddata/sandic30.dat",
  35. "/lib/dict/world/gylddata/daeindex",
  36. worldnextoff, worldprintentry, worldprintkey},
  37. {"eda", "Gyldendal English->Danish",
  38. "/lib/dict/world/gylddata/sandic29.dat",
  39. "/lib/dict/world/gylddata/edaindex",
  40. worldnextoff, worldprintentry, worldprintkey},
  41. {"due", "Wolters-Noordhoff Dutch->English",
  42. "/lib/dict/world/woltdata/sandic07.dat",
  43. "/lib/dict/world/woltdata/deindex",
  44. worldnextoff, worldprintentry, worldprintkey},
  45. {"edu", "Wolters-Noordhoff English->Dutch",
  46. "/lib/dict/world/woltdata/sandic06.dat",
  47. "/lib/dict/world/woltdata/edindex",
  48. worldnextoff, worldprintentry, worldprintkey},
  49. {"fie", "WSOY Finnish->English",
  50. "/lib/dict/world/werndata/sandic32.dat",
  51. "/lib/dict/world/werndata/fieindex",
  52. worldnextoff, worldprintentry, worldprintkey},
  53. {"efi", "WSOY English->Finnish",
  54. "/lib/dict/world/werndata/sandic31.dat",
  55. "/lib/dict/world/werndata/efiindex",
  56. worldnextoff, worldprintentry, worldprintkey},
  57. {"fe", "Collins French->English",
  58. "/lib/dict/fe", "/lib/dict/feindex",
  59. pcollnextoff, pcollprintentry, pcollprintkey},
  60. {"ef", "Collins English->French",
  61. "/lib/dict/ef", "/lib/dict/efindex",
  62. pcollnextoff, pcollprintentry, pcollprintkey},
  63. {"ge", "Collins German->English",
  64. "/lib/dict/ge", "/lib/dict/geindex",
  65. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  66. {"eg", "Collins English->German",
  67. "/lib/dict/eg", "/lib/dict/egindex",
  68. pcollgnextoff, pcollgprintentry, pcollgprintkey},
  69. {"ie", "Collins Italian->English",
  70. "/lib/dict/ie", "/lib/dict/ieindex",
  71. pcollnextoff, pcollprintentry, pcollprintkey},
  72. {"ei", "Collins English->Italian",
  73. "/lib/dict/ei", "/lib/dict/eiindex",
  74. pcollnextoff, pcollprintentry, pcollprintkey},
  75. {"je", "Sanshusha Japanese->English",
  76. "/lib/dict/world/sansdata/sandic18.dat",
  77. "/lib/dict/world/sansdata/jeindex",
  78. worldnextoff, worldprintentry, worldprintkey},
  79. {"jek", "Sanshusha Japanese->English (Kanji index)",
  80. "/lib/dict/world/sansdata/sandic18.dat",
  81. "/lib/dict/world/sansdata/jekindex",
  82. worldnextoff, worldprintentry, worldprintkey},
  83. {"ej", "Sanshusha English->Japanese",
  84. "/lib/dict/world/sansdata/sandic18.dat",
  85. "/lib/dict/world/sansdata/ejindex",
  86. worldnextoff, worldprintentry, worldprintkey},
  87. {"tjeg", "Sanshusha technical Japanese->English,German",
  88. "/lib/dict/world/sansdata/sandic16.dat",
  89. "/lib/dict/world/sansdata/tjegindex",
  90. worldnextoff, worldprintentry, worldprintkey},
  91. {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)",
  92. "/lib/dict/world/sansdata/sandic16.dat",
  93. "/lib/dict/world/sansdata/tjegkindex",
  94. worldnextoff, worldprintentry, worldprintkey},
  95. {"tegj", "Sanshusha technical English->German,Japanese",
  96. "/lib/dict/world/sansdata/sandic16.dat",
  97. "/lib/dict/world/sansdata/tegjindex",
  98. worldnextoff, worldprintentry, worldprintkey},
  99. {"tgje", "Sanshusha technical German->Japanese,English",
  100. "/lib/dict/world/sansdata/sandic16.dat",
  101. "/lib/dict/world/sansdata/tgjeindex",
  102. worldnextoff, worldprintentry, worldprintkey},
  103. {"ne", "Kunnskapforlaget Norwegian->English",
  104. "/lib/dict/world/kunndata/sandic28.dat",
  105. "/lib/dict/world/kunndata/neindex",
  106. worldnextoff, worldprintentry, worldprintkey},
  107. {"en", "Kunnskapforlaget English->Norwegian",
  108. "/lib/dict/world/kunndata/sandic27.dat",
  109. "/lib/dict/world/kunndata/enindex",
  110. worldnextoff, worldprintentry, worldprintkey},
  111. {"re", "Leon Ungier Russian->English",
  112. "/lib/dict/re", "/lib/dict/reindex",
  113. simplenextoff, simpleprintentry, simpleprintkey},
  114. {"er", "Leon Ungier English->Russian",
  115. "/lib/dict/re", "/lib/dict/erindex",
  116. simplenextoff, simpleprintentry, simpleprintkey},
  117. {"se", "Collins Spanish->English",
  118. "/lib/dict/se", "/lib/dict/seindex",
  119. pcollnextoff, pcollprintentry, pcollprintkey},
  120. {"es", "Collins English->Spanish",
  121. "/lib/dict/es", "/lib/dict/esindex",
  122. pcollnextoff, pcollprintentry, pcollprintkey},
  123. {"swe", "Esselte Studium Swedish->English",
  124. "/lib/dict/world/essedata/sandic34.dat",
  125. "/lib/dict/world/essedata/sweindex",
  126. worldnextoff, worldprintentry, worldprintkey},
  127. {"esw", "Esselte Studium English->Swedish",
  128. "/lib/dict/world/essedata/sandic33.dat",
  129. "/lib/dict/world/essedata/eswindex",
  130. worldnextoff, worldprintentry, worldprintkey},
  131. {"movie", "Movies -- by title",
  132. "/lib/movie/data", "/lib/dict/movtindex",
  133. movienextoff, movieprintentry, movieprintkey},
  134. {"moviea", "Movies -- by actor",
  135. "/lib/movie/data", "/lib/dict/movaindex",
  136. movienextoff, movieprintentry, movieprintkey},
  137. {"movied", "Movies -- by director",
  138. "/lib/movie/data", "/lib/dict/movdindex",
  139. movienextoff, movieprintentry, movieprintkey},
  140. {"slang", "English Slang",
  141. "/lib/dict/slang", "/lib/dict/slangindex",
  142. slangnextoff, slangprintentry, slangprintkey},
  143. {"robert", "Robert Électronique",
  144. "/lib/dict/robert/_pointers", "/lib/dict/robert/_index",
  145. robertnextoff, robertindexentry, robertprintkey},
  146. {"robertv", "Robert Électronique - formes des verbes",
  147. "/lib/dict/robert/flex.rob", "/lib/dict/robert/_flexindex",
  148. robertnextflex, robertflexentry, robertprintkey},
  149. {0, 0, 0, 0, 0}
  150. };
  151. typedef struct Lig Lig;
  152. struct Lig {
  153. Rune start; /* accent rune */
  154. Rune *pairs; /* <char,accented version> pairs */
  155. };
  156. static Lig ligtab[Nligs] = {
  157. [LACU-LIGS] {L'´', L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
  158. [LGRV-LIGS] {L'ˋ', L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
  159. [LUML-LIGS] {L'¨', L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
  160. [LCED-LIGS] {L'¸', L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
  161. [LTIL-LIGS] {L'˜', L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
  162. [LBRV-LIGS] {L'˘', L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
  163. [LRNG-LIGS] {L'˚', L"AÅaåUŮuů"},
  164. [LDOT-LIGS] {L'˙', L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
  165. [LDTB-LIGS] {L'.', L""},
  166. [LFRN-LIGS] {L'⌢', L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
  167. [LFRB-LIGS] {L'̯', L""},
  168. [LOGO-LIGS] {L'˛', L"AĄaąEĘeęIĮiįıįUŲuų"},
  169. [LMAC-LIGS] {L'¯', L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
  170. [LHCK-LIGS] {L'ˇ', L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
  171. [LASP-LIGS] {L'ʽ', L""},
  172. [LLEN-LIGS] {L'ʼ', L""},
  173. [LBRB-LIGS] {L'̮', L""}
  174. };
  175. Rune *multitab[Nmulti] = {
  176. [MAAS-MULTI] L"ʽα",
  177. [MALN-MULTI] L"ʼα",
  178. [MAND-MULTI] L"and",
  179. [MAOQ-MULTI] L"a/q",
  180. [MBRA-MULTI] L"<|",
  181. [MDD-MULTI] L"..",
  182. [MDDD-MULTI] L"...",
  183. [MEAS-MULTI] L"ʽε",
  184. [MELN-MULTI] L"ʼε",
  185. [MEMM-MULTI] L"——",
  186. [MHAS-MULTI] L"ʽη",
  187. [MHLN-MULTI] L"ʼη",
  188. [MIAS-MULTI] L"ʽι",
  189. [MILN-MULTI] L"ʼι",
  190. [MLCT-MULTI] L"ct",
  191. [MLFF-MULTI] L"ff",
  192. [MLFFI-MULTI] L"ffi",
  193. [MLFFL-MULTI] L"ffl",
  194. [MLFL-MULTI] L"fl",
  195. [MLFI-MULTI] L"fi",
  196. [MLLS-MULTI] L"ɫɫ",
  197. [MLST-MULTI] L"st",
  198. [MOAS-MULTI] L"ʽο",
  199. [MOLN-MULTI] L"ʼο",
  200. [MOR-MULTI] L"or",
  201. [MRAS-MULTI] L"ʽρ",
  202. [MRLN-MULTI] L"ʼρ",
  203. [MTT-MULTI] L"~~",
  204. [MUAS-MULTI] L"ʽυ",
  205. [MULN-MULTI] L"ʼυ",
  206. [MWAS-MULTI] L"ʽω",
  207. [MWLN-MULTI] L"ʼω",
  208. [MOE-MULTI] L"oe",
  209. [MES-MULTI] L" ",
  210. };
  /*
   * Rune helpers used by the folding routines.
   * risupper and rislatin1 evaluate their argument twice, so
   * pass only side-effect-free expressions.
   */
  #define risupper(r)     ((r) >= L'A' && (r) <= L'Z')    /* ASCII capital? */
  #define rislatin1(r)    ((r) >= 0xC0 && (r) <= 0xFF)    /* covered by latin_fold_tab? */
  #define rtolower(r)     ((r) + ('a' - 'A'))             /* valid only when risupper(r) */
  214. static Rune latin_fold_tab[] =
  215. {
  216. /* Table to fold latin 1 characters to ASCII equivalents
  217. based at Rune value 0xc0
  218. À Á Â Ã Ä Å Æ Ç
  219. È É Ê Ë Ì Í Î Ï
  220. Ð Ñ Ò Ó Ô Õ Ö ×
  221. Ø Ù Ú Û Ü Ý Þ ß
  222. à á â ã ä å æ ç
  223. è é ê ë ì í î ï
  224. ð ñ ò ó ô õ ö ÷
  225. ø ù ú û ü ý þ ÿ
  226. */
  227. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  228. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  229. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  230. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 ,
  231. 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
  232. 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  233. 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
  234. 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y',
  235. };
  236. static Rune *ttabstack[20];
  237. static int ntt;
  238. /*
  239. * tab is an array of n Assoc's, sorted by key.
  240. * Look for key in tab, and return corresponding val
  241. * or -1 if not there
  242. */
  243. long
  244. lookassoc(Assoc *tab, int n, char *key)
  245. {
  246. Assoc *q;
  247. long i, low, high;
  248. int r;
  249. for(low = -1, high = n; high > low+1; ){
  250. i = (high+low)/2;
  251. q = &tab[i];
  252. if((r=strcmp(key, q->key))<0)
  253. high = i;
  254. else if(r == 0)
  255. return q->val;
  256. else
  257. low=i;
  258. }
  259. return -1;
  260. }
  261. long
  262. looknassoc(Nassoc *tab, int n, long key)
  263. {
  264. Nassoc *q;
  265. long i, low, high;
  266. for(low = -1, high = n; high > low+1; ){
  267. i = (high+low)/2;
  268. q = &tab[i];
  269. if(key < q->key)
  270. high = i;
  271. else if(key == q->key)
  272. return q->val;
  273. else
  274. low=i;
  275. }
  276. return -1;
  277. }
  278. void
  279. err(char *fmt, ...)
  280. {
  281. char buf[1000];
  282. va_list v;
  283. va_start(v, fmt);
  284. vsnprint(buf, sizeof(buf), fmt, v);
  285. va_end(v);
  286. fprint(2, "%s: %s\n", argv0, buf);
  287. }
  288. /*
  289. * Write the rune r to bout, keeping track of line length
  290. * and breaking the lines (at blanks) when they get too long
  291. */
  292. void
  293. outrune(long r)
  294. {
  295. if(outinhibit)
  296. return;
  297. if(++linelen > breaklen && r == L' ') {
  298. Bputc(bout, '\n');
  299. linelen = 0;
  300. } else
  301. Bputrune(bout, r);
  302. }
  303. void
  304. outrunes(Rune *rp)
  305. {
  306. Rune r;
  307. while((r = *rp++) != 0)
  308. outrune(r);
  309. }
  310. /* like outrune, but when arg is know to be a char */
  311. void
  312. outchar(int c)
  313. {
  314. if(outinhibit)
  315. return;
  316. if(++linelen > breaklen && c == ' ') {
  317. c ='\n';
  318. linelen = 0;
  319. }
  320. Bputc(bout, c);
  321. }
  /*
   * Pass every char of the zero-terminated string s to outchar.
   */
  void
  outchars(char *s)
  {
      for(; *s != 0; s++)
          outchar(*s);
  }
  /*
   * print-style formatted output routed through outchars, so
   * the result takes part in line-length tracking and breaking.
   * The formatted text is limited by a 1000-byte buffer.
   */
  void
  outprint(char *fmt, ...)
  {
      va_list ap;
      char text[1000];

      va_start(ap, fmt);
      vsnprint(text, sizeof text, fmt, ap);
      va_end(ap);

      outchars(text);
  }
  /*
   * Output the bytes in [b, e) through outchar.  Newlines are
   * treated as blanks, and a blank that directly follows another
   * blank is dropped, so runs of blanks/newlines collapse to one.
   */
  void
  outpiece(char *b, char *e)
  {
      int prev, cur;

      for(prev = 0; b < e; prev = cur){
          cur = *b++;
          if(cur == '\n')
              cur = ' ';
          if(cur != ' ' || prev != ' ')
              outchar(cur);
      }
  }
  353. /*
  354. * Go to new line if not already there; indent if ind != 0.
  355. * If ind > 1, leave a blank line too.
  356. * Slight hack: assume if current line is only one or two
  357. * characters long, then they were spaces.
  358. */
  359. void
  360. outnl(int ind)
  361. {
  362. if(outinhibit)
  363. return;
  364. if(ind) {
  365. if(ind > 1) {
  366. if(linelen > 2)
  367. Bputc(bout, '\n');
  368. Bprint(bout, "\n ");
  369. } else if(linelen == 0)
  370. Bprint(bout, " ");
  371. else if(linelen == 1)
  372. Bputc(bout, ' ');
  373. else if(linelen != 2)
  374. Bprint(bout, "\n ");
  375. linelen = 2;
  376. } else {
  377. if(linelen) {
  378. Bputc(bout, '\n');
  379. linelen = 0;
  380. }
  381. }
  382. }
  383. /*
  384. * Fold the runes in null-terminated rp.
  385. * Use the sort(1) definition of folding (uppercase to lowercase,
  386. * latin1-accented characters to corresponding unaccented chars)
  387. */
  388. void
  389. fold(Rune *rp)
  390. {
  391. Rune r;
  392. while((r = *rp) != 0) {
  393. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  394. r = latin_fold_tab[r-0xc0];
  395. if(risupper(r))
  396. r = rtolower(r);
  397. *rp++ = r;
  398. }
  399. }
  400. /*
  401. * Like fold, but put folded result into new
  402. * (assumed to have enough space).
  403. * old is a regular expression, but we know that
  404. * metacharacters aren't affected
  405. */
  406. void
  407. foldre(char *new, char *old)
  408. {
  409. Rune r;
  410. while(*old) {
  411. old += chartorune(&r, old);
  412. if (rislatin1(r) && latin_fold_tab[r-0xc0])
  413. r = latin_fold_tab[r-0xc0];
  414. if(risupper(r))
  415. r = rtolower(r);
  416. new += runetochar(new, &r);
  417. }
  418. *new = 0;
  419. }
  420. /*
  421. * acomp(s, t) returns:
  422. * -2 if s strictly precedes t
  423. * -1 if s is a prefix of t
  424. * 0 if s is the same as t
  425. * 1 if t is a prefix of s
  426. * 2 if t strictly precedes s
  427. */
  428. int
  429. acomp(Rune *s, Rune *t)
  430. {
  431. int cs, ct;
  432. for(;;) {
  433. cs = *s;
  434. ct = *t;
  435. if(cs != ct)
  436. break;
  437. if(cs == 0)
  438. return 0;
  439. s++;
  440. t++;
  441. }
  442. if(cs == 0)
  443. return -1;
  444. if(ct == 0)
  445. return 1;
  446. if(cs < ct)
  447. return -2;
  448. return 2;
  449. }
  450. /*
  451. * Copy null terminated Runes from 'from' to 'to'.
  452. */
  453. void
  454. runescpy(Rune *to, Rune *from)
  455. {
  456. while((*to++ = *from++) != 0)
  457. continue;
  458. }
  459. /*
  460. * Conversion of unsigned number to long, no overflow detection
  461. */
  462. long
  463. runetol(Rune *r)
  464. {
  465. int c;
  466. long n;
  467. n = 0;
  468. for(;; r++){
  469. c = *r;
  470. if(L'0'<=c && c<=L'9')
  471. c -= '0';
  472. else
  473. break;
  474. n = n*10 + c;
  475. }
  476. return n;
  477. }
  478. /*
  479. * See if there is a rune corresponding to the accented
  480. * version of r with accent acc (acc in [LIGS..LIGE-1]),
  481. * and return it if so, else return NONE.
  482. */
  483. Rune
  484. liglookup(Rune acc, Rune r)
  485. {
  486. Rune *p;
  487. if(acc < LIGS || acc >= LIGE)
  488. return NONE;
  489. for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
  490. if(*p == r)
  491. return *(p+1);
  492. return NONE;
  493. }
  494. /*
  495. * Maintain a translation table stack (a translation table
  496. * is an array of Runes indexed by bytes or 7-bit bytes).
  497. * If starting is true, push the curtab onto the stack
  498. * and return newtab; else pop the top of the stack and
  499. * return it.
  500. * If curtab is 0, initialize the stack and return.
  501. */
  502. Rune *
  503. changett(Rune *curtab, Rune *newtab, int starting)
  504. {
  505. if(curtab == 0) {
  506. ntt = 0;
  507. return 0;
  508. }
  509. if(starting) {
  510. if(ntt >= asize(ttabstack)) {
  511. if(debug)
  512. err("translation stack overflow");
  513. return curtab;
  514. }
  515. ttabstack[ntt++] = curtab;
  516. return newtab;
  517. } else {
  518. if(ntt == 0) {
  519. if(debug)
  520. err("translation stack underflow");
  521. return curtab;
  522. }
  523. return ttabstack[--ntt];
  524. }
  525. }