n8.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include "tdef.h"
  10. #include "fns.h"
  11. #include "ext.h"
  12. #include <assert.h>
  13. #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
  14. /* this value is used (as a literal) in suftab.c */
  15. /* to encode possible hyphenation points in suffixes. */
  16. /* it could be changed, by widening the tables */
  17. /* to be shorts instead of chars. */
  18. /*
  19. * troff8.c
  20. *
  21. * hyphenation
  22. */
  23. int hexsize = 0; /* hyphenation exception list size */
  24. char *hbufp = NULL; /* base of list */
  25. char *nexth = NULL; /* first free slot in list */
  26. Tchar *hyend;
  27. #define THRESH 160 /* digram goodness threshold */
  28. int thresh = THRESH;
  29. int texhyphen(void);
  30. static int alpha(Tchar);
  31. void hyphen(Tchar *wp)
  32. {
  33. int j;
  34. Tchar *i;
  35. i = wp;
  36. while (punct((*i++)))
  37. ;
  38. if (!alpha(*--i))
  39. return;
  40. wdstart = i++;
  41. while (alpha(*i++))
  42. ;
  43. hyend = wdend = --i - 1;
  44. while (punct((*i++)))
  45. ;
  46. if (*--i)
  47. return;
  48. if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
  49. return;
  50. hyp = hyptr;
  51. *hyp = 0;
  52. hyoff = 2;
  53. /* for now, try exceptions first, then tex (if hyphalg is non-zero),
  54. then suffix and digram if tex didn't hyphenate it at all.
  55. */
  56. if (!exword() && !texhyphen() && !suffix())
  57. digram();
  58. /* this appears to sort hyphenation points into increasing order */
  59. *hyp++ = 0;
  60. if (*hyptr)
  61. for (j = 1; j; ) {
  62. j = 0;
  63. for (hyp = hyptr + 1; *hyp != 0; hyp++) {
  64. if (*(hyp - 1) > *hyp) {
  65. j++;
  66. i = *hyp;
  67. *hyp = *(hyp - 1);
  68. *(hyp - 1) = i;
  69. }
  70. }
  71. }
  72. }
  73. static alpha(Tchar i) /* non-zero if really alphabetic */
  74. {
  75. if (ismot(i))
  76. return 0;
  77. else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
  78. return 0; /* no good way to make sure i is in range for */
  79. else /* the call of isalpha */
  80. return isalpha(cbits(i));
  81. }
  82. punct(Tchar i)
  83. {
  84. if (!i || alpha(i))
  85. return(0);
  86. else
  87. return(1);
  88. }
  89. void caseha(void) /* set hyphenation algorithm */
  90. {
  91. hyphalg = HYPHALG;
  92. if (skip())
  93. return;
  94. noscale++;
  95. hyphalg = atoi0();
  96. noscale = 0;
  97. }
  98. void caseht(void) /* set hyphenation threshold; not in manual! */
  99. {
  100. thresh = THRESH;
  101. if (skip())
  102. return;
  103. noscale++;
  104. thresh = atoi0();
  105. noscale = 0;
  106. }
  107. char *growh(char *where)
  108. {
  109. char *new;
  110. hexsize += NHEX;
  111. if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
  112. return NULL;
  113. if (new == hbufp) {
  114. return where;
  115. } else {
  116. int diff;
  117. diff = where - hbufp;
  118. hbufp = new;
  119. return new + diff;
  120. }
  121. }
  122. void casehw(void)
  123. {
  124. int i, k;
  125. char *j;
  126. Tchar t;
  127. if (nexth == NULL) {
  128. if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
  129. ERROR "No space for exception word list." WARN;
  130. return;
  131. }
  132. hexsize = NHEX;
  133. }
  134. k = 0;
  135. while (!skip()) {
  136. if ((j = nexth) >= hbufp + hexsize - 2)
  137. if ((j = nexth = growh(j)) == NULL)
  138. goto full;
  139. for (;;) {
  140. if (ismot(t = getch()))
  141. continue;
  142. i = cbits(t);
  143. if (i == ' ' || i == '\n') {
  144. *j++ = 0;
  145. nexth = j;
  146. *j = 0;
  147. if (i == ' ')
  148. break;
  149. else
  150. return;
  151. }
  152. if (i == '-') {
  153. k = HY_BIT;
  154. continue;
  155. }
  156. *j++ = maplow(i) | k;
  157. k = 0;
  158. if (j >= hbufp + hexsize - 2)
  159. if ((j = growh(j)) == NULL)
  160. goto full;
  161. }
  162. }
  163. return;
  164. full:
  165. ERROR "Cannot grow exception word list." WARN;
  166. *nexth = 0;
  167. }
  168. int exword(void)
  169. {
  170. Tchar *w;
  171. char *e, *save;
  172. e = hbufp;
  173. while (1) {
  174. save = e;
  175. if (e == NULL || *e == 0)
  176. return(0);
  177. w = wdstart;
  178. while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
  179. e++;
  180. w++;
  181. }
  182. if (!*e) {
  183. if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
  184. w = wdstart;
  185. for (e = save; *e; e++) {
  186. if (*e & HY_BIT)
  187. *hyp++ = w;
  188. if (hyp > hyptr + NHYP - 1)
  189. hyp = hyptr + NHYP - 1;
  190. w++;
  191. }
  192. return(1);
  193. } else {
  194. e++;
  195. continue;
  196. }
  197. } else
  198. while (*e++)
  199. ;
  200. }
  201. }
  202. suffix(void)
  203. {
  204. Tchar *w;
  205. char *s, *s0;
  206. Tchar i;
  207. extern char *suftab[];
  208. again:
  209. i = cbits(*hyend);
  210. if (!alpha(i))
  211. return(0);
  212. if (i < 'a')
  213. i -= 'A' - 'a';
  214. if ((s0 = suftab[i-'a']) == 0)
  215. return(0);
  216. for (;;) {
  217. if ((i = *s0 & 017) == 0)
  218. return(0);
  219. s = s0 + i - 1;
  220. w = hyend - 1;
  221. while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
  222. s--;
  223. w--;
  224. }
  225. if (s == s0)
  226. break;
  227. s0 += i;
  228. }
  229. s = s0 + i - 1;
  230. w = hyend;
  231. if (*s0 & HY_BIT)
  232. goto mark;
  233. while (s > s0) {
  234. w--;
  235. if (*s-- & HY_BIT) {
  236. mark:
  237. hyend = w - 1;
  238. if (*s0 & 0100) /* 0100 used in suftab to encode something too */
  239. continue;
  240. if (!chkvow(w))
  241. return(0);
  242. *hyp++ = w;
  243. }
  244. }
  245. if (*s0 & 040)
  246. return(0);
  247. if (exword())
  248. return(1);
  249. goto again;
  250. }
  251. maplow(int i)
  252. {
  253. if (isupper(i))
  254. i = tolower(i);
  255. return(i);
  256. }
  257. vowel(int i)
  258. {
  259. switch (i) {
  260. case 'a': case 'A':
  261. case 'e': case 'E':
  262. case 'i': case 'I':
  263. case 'o': case 'O':
  264. case 'u': case 'U':
  265. case 'y': case 'Y':
  266. return(1);
  267. default:
  268. return(0);
  269. }
  270. }
  271. Tchar *chkvow(Tchar *w)
  272. {
  273. while (--w >= wdstart)
  274. if (vowel(cbits(*w)))
  275. return(w);
  276. return(0);
  277. }
  278. void digram(void)
  279. {
  280. int maxval, val;
  281. Tchar *nhyend, *maxw, *w;
  282. extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
  283. maxw = 0;
  284. again:
  285. if (!(w = chkvow(hyend + 1)))
  286. return;
  287. hyend = w;
  288. if (!(w = chkvow(hyend)))
  289. return;
  290. nhyend = w;
  291. maxval = 0;
  292. w--;
  293. while (++w < hyend && w < wdend - 1) {
  294. val = 1;
  295. if (w == wdstart)
  296. val *= dilook('a', cbits(*w), bxh);
  297. else if (w == wdstart + 1)
  298. val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
  299. else
  300. val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
  301. val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
  302. val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
  303. if (val > maxval) {
  304. maxval = val;
  305. maxw = w + 1;
  306. }
  307. }
  308. hyend = nhyend;
  309. if (maxval > thresh)
  310. *hyp++ = maxw;
  311. goto again;
  312. }
  313. dilook(int a, int b, char t[26][13])
  314. {
  315. int i, j;
  316. i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
  317. if (!(j & 01))
  318. i >>= 4;
  319. return(i & 017);
  320. }
  321. /* here beginneth the tex hyphenation code, as interpreted freely */
  322. /* the main difference is that there is no attempt to squeeze space */
  323. /* as tightly at tex does. */
  324. static int texit(Tchar *, Tchar *);
  325. static int readpats(void);
  326. static void install(char *);
  327. static void fixup(void);
  328. static int trieindex(int, int);
  329. static char pats[50000]; /* size ought to be computed dynamically */
  330. static char *nextpat = pats;
  331. static char *trie[27*27]; /* english-specific sizes */
  332. int texhyphen(void)
  333. {
  334. static int loaded = 0; /* -1: couldn't find tex file */
  335. if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
  336. return 0;
  337. if (loaded == 0) {
  338. if (readpats())
  339. loaded = 1;
  340. else
  341. loaded = -1;
  342. }
  343. return texit(wdstart, wdend);
  344. }
  345. static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
  346. {
  347. int nw, i, k, equal, cnt[500];
  348. char w[500+1], *np, *pp, *wp, *xpp, *xwp;
  349. w[0] = '.';
  350. for (nw = 1; start <= end && nw < 500-1; nw++, start++)
  351. w[nw] = maplow(tolower(cbits(*start)));
  352. start -= (nw - 1);
  353. w[nw++] = '.';
  354. w[nw] = 0;
  355. /*
  356. * printf("try %s\n", w);
  357. */
  358. for (i = 0; i <= nw; i++)
  359. cnt[i] = '0';
  360. for (wp = w; wp+1 < w+nw; wp++) {
  361. for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
  362. if (pp == 0 /* no trie entry */
  363. || *pp != *wp /* no match on 1st letter */
  364. || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
  365. break; /* so move to next letter of word */
  366. equal = 1;
  367. for (xpp = pp+2, xwp = wp+2; *xpp; )
  368. if (*xpp++ != *xwp++) {
  369. equal = 0;
  370. break;
  371. }
  372. if (equal) {
  373. np = xpp+1; /* numpat */
  374. for (k = wp-w; *np; k++, np++)
  375. if (*np > cnt[k])
  376. cnt[k] = *np;
  377. /*
  378. * printf("match: %s %s\n", pp, xpp+1);
  379. */
  380. }
  381. pp += *(pp-1); /* skip over pattern and numbers to next */
  382. }
  383. }
  384. /*
  385. * for (i = 0; i < nw; i++) printf("%c", w[i]);
  386. * printf(" ");
  387. * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
  388. * printf("\n");
  389. */
  390. /*
  391. * for (i = 1; i < nw - 1; i++) {
  392. * if (i > 2 && i < nw - 3 && cnt[i] % 2)
  393. * printf("-");
  394. * if (cbits(start[i-1]) != '.')
  395. * printf("%c", cbits(start[i-1]));
  396. * }
  397. * printf("\n");
  398. */
  399. for (i = 1; i < nw -1; i++)
  400. if (i > 2 && i < nw - 3 && cnt[i] % 2)
  401. *hyp++ = start + i - 1;
  402. return hyp - hyptr; /* non-zero if a hyphen was found */
  403. }
  404. /*
  405. This code assumes that hyphen.tex looks like
  406. % some comments
  407. \patterns{ % more comments
  408. pat5ter4ns, 1 per line, SORTED, nothing else
  409. }
  410. more goo
  411. \hyphenation{ % more comments
  412. ex-cep-tions, one per line; i ignore this part for now
  413. }
  414. this code is NOT robust against variations. unfortunately,
  415. it looks like every local language version of this file has
  416. a different format. i have also made no provision for weird
  417. characters. sigh.
  418. */
  419. static int readpats(void)
  420. {
  421. FILE *fp;
  422. char buf[200], buf1[200];
  423. if ((fp = fopen(TEXHYPHENS, "r")) == NULL
  424. && (fp = fopen(DWBalthyphens, "r")) == NULL) {
  425. ERROR "warning: can't find hyphen.tex" WARN;
  426. return 0;
  427. }
  428. while (fgets(buf, sizeof buf, fp) != NULL) {
  429. sscanf(buf, "%s", buf1);
  430. if (strcmp(buf1, "\\patterns{") == 0)
  431. break;
  432. }
  433. while (fgets(buf, sizeof buf, fp) != NULL) {
  434. if (buf[0] == '}')
  435. break;
  436. install(buf);
  437. }
  438. fclose(fp);
  439. fixup();
  440. return 1;
  441. }
  442. static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
  443. {
  444. int npat, lastpat;
  445. char num[500], *onextpat = nextpat;
  446. num[0] = '0';
  447. *nextpat++ = ' '; /* fill in with count later */
  448. for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
  449. if (isdigit(*s)) {
  450. num[npat] = *s;
  451. lastpat = npat;
  452. } else {
  453. *nextpat++ = *s;
  454. npat++;
  455. num[npat] = '0';
  456. }
  457. }
  458. *nextpat++ = 0;
  459. if (nextpat > pats + sizeof(pats)-20) {
  460. ERROR "tex hyphenation table overflow, tail end ignored" WARN;
  461. nextpat = onextpat;
  462. }
  463. num[lastpat+1] = 0;
  464. strcat(nextpat, num);
  465. nextpat += strlen(nextpat) + 1;
  466. }
  467. static void fixup(void) /* build indexes of where . a b c ... start */
  468. {
  469. char *p, *lastc;
  470. int n;
  471. for (lastc = pats, p = pats+1; p < nextpat; p++)
  472. if (*p == ' ') {
  473. *lastc = p - lastc;
  474. lastc = p;
  475. }
  476. *lastc = p - lastc;
  477. for (p = pats+1; p < nextpat; ) {
  478. n = trieindex(p[0], p[1]);
  479. if (trie[n] == 0)
  480. trie[n] = p;
  481. p += p[-1];
  482. }
  483. /* printf("pats = %d\n", nextpat - pats); */
  484. }
  485. static int trieindex(int d1, int d2)
  486. {
  487. int i;
  488. i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
  489. assert(0 <= i && i < 27*27);
  490. return i;
  491. }