n8.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545
  1. #include "tdef.h"
  2. #include "fns.h"
  3. #include "ext.h"
  4. #include <assert.h>
  5. #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
  6. /* this value is used (as a literal) in suftab.c */
  7. /* to encode possible hyphenation points in suffixes. */
  8. /* it could be changed, by widening the tables */
  9. /* to be shorts instead of chars. */
  10. /*
  11. * troff8.c
  12. *
  13. * hyphenation
  14. */
/*
 * State of the hyphenation exception list built by casehw() and searched
 * by exword().  The list is a sequence of NUL-terminated words, closed by
 * an empty word.
 */
int hexsize = 0; /* hyphenation exception list size */
char *hbufp = NULL; /* base of list */
char *nexth = NULL; /* first free slot in list */
/*
 * Last character of the part of the current word still being examined.
 * hyphen() starts it at wdend; suffix() and digram() move it leftwards.
 */
Tchar *hyend;
#define THRESH 160 /* digram goodness threshold */
/* threshold actually used by digram(); caseht() can override the default */
int thresh = THRESH;
int texhyphen(void);
static int alpha(Tchar);
  23. void hyphen(Tchar *wp)
  24. {
  25. int j;
  26. Tchar *i;
  27. i = wp;
  28. while (punct((*i++)))
  29. ;
  30. if (!alpha(*--i))
  31. return;
  32. wdstart = i++;
  33. while (alpha(*i++))
  34. ;
  35. hyend = wdend = --i - 1;
  36. while (punct((*i++)))
  37. ;
  38. if (*--i)
  39. return;
  40. if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
  41. return;
  42. hyp = hyptr;
  43. *hyp = 0;
  44. hyoff = 2;
  45. /* for now, try exceptions first, then tex (if hyphalg is non-zero),
  46. then suffix and digram if tex didn't hyphenate it at all.
  47. */
  48. if (!exword() && !texhyphen() && !suffix())
  49. digram();
  50. /* this appears to sort hyphenation points into increasing order */
  51. *hyp++ = 0;
  52. if (*hyptr)
  53. for (j = 1; j; ) {
  54. j = 0;
  55. for (hyp = hyptr + 1; *hyp != 0; hyp++) {
  56. if (*(hyp - 1) > *hyp) {
  57. j++;
  58. i = *hyp;
  59. *hyp = *(hyp - 1);
  60. *(hyp - 1) = i;
  61. }
  62. }
  63. }
  64. }
  65. static alpha(Tchar i) /* non-zero if really alphabetic */
  66. {
  67. if (ismot(i))
  68. return 0;
  69. else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
  70. return 0; /* no good way to make sure i is in range for */
  71. else /* the call of isalpha */
  72. return isalpha(cbits(i));
  73. }
  74. punct(Tchar i)
  75. {
  76. if (!i || alpha(i))
  77. return(0);
  78. else
  79. return(1);
  80. }
  81. void caseha(void) /* set hyphenation algorithm */
  82. {
  83. hyphalg = HYPHALG;
  84. if (skip())
  85. return;
  86. noscale++;
  87. hyphalg = atoi0();
  88. noscale = 0;
  89. }
  90. void caseht(void) /* set hyphenation threshold; not in manual! */
  91. {
  92. thresh = THRESH;
  93. if (skip())
  94. return;
  95. noscale++;
  96. thresh = atoi0();
  97. noscale = 0;
  98. }
  99. char *growh(char *where)
  100. {
  101. char *new;
  102. hexsize += NHEX;
  103. if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
  104. return NULL;
  105. if (new == hbufp) {
  106. return where;
  107. } else {
  108. int diff;
  109. diff = where - hbufp;
  110. hbufp = new;
  111. return new + diff;
  112. }
  113. }
  114. void casehw(void)
  115. {
  116. int i, k;
  117. char *j;
  118. Tchar t;
  119. if (nexth == NULL) {
  120. if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
  121. ERROR "No space for exception word list." WARN;
  122. return;
  123. }
  124. hexsize = NHEX;
  125. }
  126. k = 0;
  127. while (!skip()) {
  128. if ((j = nexth) >= hbufp + hexsize - 2)
  129. if ((j = nexth = growh(j)) == NULL)
  130. goto full;
  131. for (;;) {
  132. if (ismot(t = getch()))
  133. continue;
  134. i = cbits(t);
  135. if (i == ' ' || i == '\n') {
  136. *j++ = 0;
  137. nexth = j;
  138. *j = 0;
  139. if (i == ' ')
  140. break;
  141. else
  142. return;
  143. }
  144. if (i == '-') {
  145. k = HY_BIT;
  146. continue;
  147. }
  148. *j++ = maplow(i) | k;
  149. k = 0;
  150. if (j >= hbufp + hexsize - 2)
  151. if ((j = growh(j)) == NULL)
  152. goto full;
  153. }
  154. }
  155. return;
  156. full:
  157. ERROR "Cannot grow exception word list." WARN;
  158. *nexth = 0;
  159. }
  160. int exword(void)
  161. {
  162. Tchar *w;
  163. char *e, *save;
  164. e = hbufp;
  165. while (1) {
  166. save = e;
  167. if (e == NULL || *e == 0)
  168. return(0);
  169. w = wdstart;
  170. while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
  171. e++;
  172. w++;
  173. }
  174. if (!*e) {
  175. if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
  176. w = wdstart;
  177. for (e = save; *e; e++) {
  178. if (*e & HY_BIT)
  179. *hyp++ = w;
  180. if (hyp > hyptr + NHYP - 1)
  181. hyp = hyptr + NHYP - 1;
  182. w++;
  183. }
  184. return(1);
  185. } else {
  186. e++;
  187. continue;
  188. }
  189. } else
  190. while (*e++)
  191. ;
  192. }
  193. }
  194. suffix(void)
  195. {
  196. Tchar *w;
  197. char *s, *s0;
  198. Tchar i;
  199. extern char *suftab[];
  200. again:
  201. i = cbits(*hyend);
  202. if (!alpha(i))
  203. return(0);
  204. if (i < 'a')
  205. i -= 'A' - 'a';
  206. if ((s0 = suftab[i-'a']) == 0)
  207. return(0);
  208. for (;;) {
  209. if ((i = *s0 & 017) == 0)
  210. return(0);
  211. s = s0 + i - 1;
  212. w = hyend - 1;
  213. while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
  214. s--;
  215. w--;
  216. }
  217. if (s == s0)
  218. break;
  219. s0 += i;
  220. }
  221. s = s0 + i - 1;
  222. w = hyend;
  223. if (*s0 & HY_BIT)
  224. goto mark;
  225. while (s > s0) {
  226. w--;
  227. if (*s-- & HY_BIT) {
  228. mark:
  229. hyend = w - 1;
  230. if (*s0 & 0100) /* 0100 used in suftab to encode something too */
  231. continue;
  232. if (!chkvow(w))
  233. return(0);
  234. *hyp++ = w;
  235. }
  236. }
  237. if (*s0 & 040)
  238. return(0);
  239. if (exword())
  240. return(1);
  241. goto again;
  242. }
  243. maplow(int i)
  244. {
  245. if (isupper(i))
  246. i = tolower(i);
  247. return(i);
  248. }
  249. vowel(int i)
  250. {
  251. switch (i) {
  252. case 'a': case 'A':
  253. case 'e': case 'E':
  254. case 'i': case 'I':
  255. case 'o': case 'O':
  256. case 'u': case 'U':
  257. case 'y': case 'Y':
  258. return(1);
  259. default:
  260. return(0);
  261. }
  262. }
  263. Tchar *chkvow(Tchar *w)
  264. {
  265. while (--w >= wdstart)
  266. if (vowel(cbits(*w)))
  267. return(w);
  268. return(0);
  269. }
  270. void digram(void)
  271. {
  272. Tchar *w;
  273. int val;
  274. Tchar *nhyend, *maxw;
  275. int maxval;
  276. extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
  277. again:
  278. if (!(w = chkvow(hyend + 1)))
  279. return;
  280. hyend = w;
  281. if (!(w = chkvow(hyend)))
  282. return;
  283. nhyend = w;
  284. maxval = 0;
  285. w--;
  286. while (++w < hyend && w < wdend - 1) {
  287. val = 1;
  288. if (w == wdstart)
  289. val *= dilook('a', cbits(*w), bxh);
  290. else if (w == wdstart + 1)
  291. val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
  292. else
  293. val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
  294. val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
  295. val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
  296. if (val > maxval) {
  297. maxval = val;
  298. maxw = w + 1;
  299. }
  300. }
  301. hyend = nhyend;
  302. if (maxval > thresh)
  303. *hyp++ = maxw;
  304. goto again;
  305. }
  306. dilook(int a, int b, char t[26][13])
  307. {
  308. int i, j;
  309. i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
  310. if (!(j & 01))
  311. i >>= 4;
  312. return(i & 017);
  313. }
  314. /* here beginneth the tex hyphenation code, as interpreted freely */
  315. /* the main difference is that there is no attempt to squeeze space */
  316. /* as tightly as tex does. */
static int texit(Tchar *, Tchar *);
static int readpats(void);
static void install(char *);
static void fixup(void);
static int trieindex(int, int);
/*
 * Pattern storage.  install() appends one record per pattern:
 * a count byte, the pattern letters, NUL, the digit string, NUL.
 * fixup() later fills in the count bytes.
 */
static char pats[50000]; /* size ought to be computed dynamically */
/* next free byte in pats; everything at or beyond it is unused */
static char *nextpat = pats;
/*
 * First pattern for each pair of leading characters, indexed by
 * trieindex(); 27 = '.' plus the letters a-z.  Filled in by fixup().
 */
static char *trie[27*27]; /* english-specific sizes */
  325. int texhyphen(void)
  326. {
  327. static int loaded = 0; /* -1: couldn't find tex file */
  328. if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
  329. return 0;
  330. if (loaded == 0) {
  331. if (readpats())
  332. loaded = 1;
  333. else
  334. loaded = -1;
  335. }
  336. return texit(wdstart, wdend);
  337. }
  338. static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
  339. {
  340. int nw, i, k, equal, cnt[500];
  341. char w[500+1], *np, *pp, *wp, *xpp, *xwp;
  342. w[0] = '.';
  343. for (nw = 1; start <= end && nw < 500-1; nw++, start++)
  344. w[nw] = maplow(tolower(cbits(*start)));
  345. start -= (nw - 1);
  346. w[nw++] = '.';
  347. w[nw] = 0;
  348. /*
  349. * printf("try %s\n", w);
  350. */
  351. for (i = 0; i <= nw; i++)
  352. cnt[i] = '0';
  353. for (wp = w; wp+1 < w+nw; wp++) {
  354. for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
  355. if (pp == 0 /* no trie entry */
  356. || *pp != *wp /* no match on 1st letter */
  357. || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
  358. break; /* so move to next letter of word */
  359. equal = 1;
  360. for (xpp = pp+2, xwp = wp+2; *xpp; )
  361. if (*xpp++ != *xwp++) {
  362. equal = 0;
  363. break;
  364. }
  365. if (equal) {
  366. np = xpp+1; /* numpat */
  367. for (k = wp-w; *np; k++, np++)
  368. if (*np > cnt[k])
  369. cnt[k] = *np;
  370. /*
  371. * printf("match: %s %s\n", pp, xpp+1);
  372. */
  373. }
  374. pp += *(pp-1); /* skip over pattern and numbers to next */
  375. }
  376. }
  377. /*
  378. * for (i = 0; i < nw; i++) printf("%c", w[i]);
  379. * printf(" ");
  380. * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
  381. * printf("\n");
  382. */
  383. /*
  384. * for (i = 1; i < nw - 1; i++) {
  385. * if (i > 2 && i < nw - 3 && cnt[i] % 2)
  386. * printf("-");
  387. * if (cbits(start[i-1]) != '.')
  388. * printf("%c", cbits(start[i-1]));
  389. * }
  390. * printf("\n");
  391. */
  392. for (i = 1; i < nw -1; i++)
  393. if (i > 2 && i < nw - 3 && cnt[i] % 2)
  394. *hyp++ = start + i - 1;
  395. return hyp - hyptr; /* non-zero if a hyphen was found */
  396. }
  397. /*
  398. This code assumes that hyphen.tex looks like
  399. % some comments
  400. \patterns{ % more comments
  401. pat5ter4ns, 1 per line, SORTED, nothing else
  402. }
  403. more goo
  404. \hyphenation{ % more comments
  405. ex-cep-tions, one per line; i ignore this part for now
  406. }
  407. this code is NOT robust against variations. unfortunately,
  408. it looks like every local language version of this file has
  409. a different format. i have also made no provision for weird
  410. characters. sigh.
  411. */
  412. static int readpats(void)
  413. {
  414. FILE *fp;
  415. char buf[200], buf1[200];
  416. if ((fp = fopen(TEXHYPHENS, "r")) == NULL
  417. && (fp = fopen(DWBalthyphens, "r")) == NULL) {
  418. ERROR "warning: can't find hyphen.tex" WARN;
  419. return 0;
  420. }
  421. while (fgets(buf, sizeof buf, fp) != NULL) {
  422. sscanf(buf, "%s", buf1);
  423. if (strcmp(buf1, "\\patterns{") == 0)
  424. break;
  425. }
  426. while (fgets(buf, sizeof buf, fp) != NULL) {
  427. if (buf[0] == '}')
  428. break;
  429. install(buf);
  430. }
  431. fclose(fp);
  432. fixup();
  433. return 1;
  434. }
  435. static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
  436. {
  437. int npat, lastpat;
  438. char num[500], *onextpat = nextpat;
  439. num[0] = '0';
  440. *nextpat++ = ' '; /* fill in with count later */
  441. for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
  442. if (isdigit(*s)) {
  443. num[npat] = *s;
  444. lastpat = npat;
  445. } else {
  446. *nextpat++ = *s;
  447. npat++;
  448. num[npat] = '0';
  449. }
  450. }
  451. *nextpat++ = 0;
  452. if (nextpat > pats + sizeof(pats)-20) {
  453. ERROR "tex hyphenation table overflow, tail end ignored" WARN;
  454. nextpat = onextpat;
  455. }
  456. num[lastpat+1] = 0;
  457. strcat(nextpat, num);
  458. nextpat += strlen(nextpat) + 1;
  459. }
  460. static void fixup(void) /* build indexes of where . a b c ... start */
  461. {
  462. char *p, *lastc;
  463. int n;
  464. for (lastc = pats, p = pats+1; p < nextpat; p++)
  465. if (*p == ' ') {
  466. *lastc = p - lastc;
  467. lastc = p;
  468. }
  469. *lastc = p - lastc;
  470. for (p = pats+1; p < nextpat; ) {
  471. n = trieindex(p[0], p[1]);
  472. if (trie[n] == 0)
  473. trie[n] = p;
  474. p += p[-1];
  475. }
  476. /* printf("pats = %d\n", nextpat - pats); */
  477. }
  478. static int trieindex(int d1, int d2)
  479. {
  480. int i;
  481. i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
  482. assert(0 <= i && i < 27*27);
  483. return i;
  484. }