123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545 |
- #include "tdef.h"
- #include "fns.h"
- #include "ext.h"
- #include <assert.h>
/*
 * HY_BIT flags a possible hyphenation point in an exception-list or
 * suffix-table byte.  Because it occupies the top bit of a char, the
 * code in this file only works for 7-bit ASCII.  suftab.c uses the
 * same value as a literal, so the two must stay in step; lifting the
 * restriction would mean widening the tables from chars to shorts.
 */
#define	HY_BIT	0200
- /*
- * troff8.c
- *
- * hyphenation
- */
- int hexsize = 0; /* hyphenation exception list size */
- char *hbufp = NULL; /* base of list */
- char *nexth = NULL; /* first free slot in list */
- Tchar *hyend;
- #define THRESH 160 /* digram goodness threshold */
- int thresh = THRESH;
- int texhyphen(void);
- static int alpha(Tchar);
- void hyphen(Tchar *wp)
- {
- int j;
- Tchar *i;
- i = wp;
- while (punct((*i++)))
- ;
- if (!alpha(*--i))
- return;
- wdstart = i++;
- while (alpha(*i++))
- ;
- hyend = wdend = --i - 1;
- while (punct((*i++)))
- ;
- if (*--i)
- return;
- if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
- return;
- hyp = hyptr;
- *hyp = 0;
- hyoff = 2;
- /* for now, try exceptions first, then tex (if hyphalg is non-zero),
- then suffix and digram if tex didn't hyphenate it at all.
- */
- if (!exword() && !texhyphen() && !suffix())
- digram();
- /* this appears to sort hyphenation points into increasing order */
- *hyp++ = 0;
- if (*hyptr)
- for (j = 1; j; ) {
- j = 0;
- for (hyp = hyptr + 1; *hyp != 0; hyp++) {
- if (*(hyp - 1) > *hyp) {
- j++;
- i = *hyp;
- *hyp = *(hyp - 1);
- *(hyp - 1) = i;
- }
- }
- }
- }
- static alpha(Tchar i) /* non-zero if really alphabetic */
- {
- if (ismot(i))
- return 0;
- else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
- return 0; /* no good way to make sure i is in range for */
- else /* the call of isalpha */
- return isalpha(cbits(i));
- }
- punct(Tchar i)
- {
- if (!i || alpha(i))
- return(0);
- else
- return(1);
- }
- void caseha(void) /* set hyphenation algorithm */
- {
- hyphalg = HYPHALG;
- if (skip())
- return;
- noscale++;
- hyphalg = atoi0();
- noscale = 0;
- }
- void caseht(void) /* set hyphenation threshold; not in manual! */
- {
- thresh = THRESH;
- if (skip())
- return;
- noscale++;
- thresh = atoi0();
- noscale = 0;
- }
- char *growh(char *where)
- {
- char *new;
- hexsize += NHEX;
- if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
- return NULL;
- if (new == hbufp) {
- return where;
- } else {
- int diff;
- diff = where - hbufp;
- hbufp = new;
- return new + diff;
- }
- }
- void casehw(void)
- {
- int i, k;
- char *j;
- Tchar t;
- if (nexth == NULL) {
- if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
- ERROR "No space for exception word list." WARN;
- return;
- }
- hexsize = NHEX;
- }
- k = 0;
- while (!skip()) {
- if ((j = nexth) >= hbufp + hexsize - 2)
- if ((j = nexth = growh(j)) == NULL)
- goto full;
- for (;;) {
- if (ismot(t = getch()))
- continue;
- i = cbits(t);
- if (i == ' ' || i == '\n') {
- *j++ = 0;
- nexth = j;
- *j = 0;
- if (i == ' ')
- break;
- else
- return;
- }
- if (i == '-') {
- k = HY_BIT;
- continue;
- }
- *j++ = maplow(i) | k;
- k = 0;
- if (j >= hbufp + hexsize - 2)
- if ((j = growh(j)) == NULL)
- goto full;
- }
- }
- return;
- full:
- ERROR "Cannot grow exception word list." WARN;
- *nexth = 0;
- }
- int exword(void)
- {
- Tchar *w;
- char *e, *save;
- e = hbufp;
- while (1) {
- save = e;
- if (e == NULL || *e == 0)
- return(0);
- w = wdstart;
- while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
- e++;
- w++;
- }
- if (!*e) {
- if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
- w = wdstart;
- for (e = save; *e; e++) {
- if (*e & HY_BIT)
- *hyp++ = w;
- if (hyp > hyptr + NHYP - 1)
- hyp = hyptr + NHYP - 1;
- w++;
- }
- return(1);
- } else {
- e++;
- continue;
- }
- } else
- while (*e++)
- ;
- }
- }
- suffix(void)
- {
- Tchar *w;
- char *s, *s0;
- Tchar i;
- extern char *suftab[];
- again:
- i = cbits(*hyend);
- if (!alpha(i))
- return(0);
- if (i < 'a')
- i -= 'A' - 'a';
- if ((s0 = suftab[i-'a']) == 0)
- return(0);
- for (;;) {
- if ((i = *s0 & 017) == 0)
- return(0);
- s = s0 + i - 1;
- w = hyend - 1;
- while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
- s--;
- w--;
- }
- if (s == s0)
- break;
- s0 += i;
- }
- s = s0 + i - 1;
- w = hyend;
- if (*s0 & HY_BIT)
- goto mark;
- while (s > s0) {
- w--;
- if (*s-- & HY_BIT) {
- mark:
- hyend = w - 1;
- if (*s0 & 0100) /* 0100 used in suftab to encode something too */
- continue;
- if (!chkvow(w))
- return(0);
- *hyp++ = w;
- }
- }
- if (*s0 & 040)
- return(0);
- if (exword())
- return(1);
- goto again;
- }
/*
 * maplow: return i mapped to lower case if it is an upper-case letter,
 * otherwise return i unchanged.
 *
 * isupper()/tolower() are only defined for arguments representable as
 * unsigned char (or EOF), so values outside 0..0377 are passed through
 * untouched rather than handed to them.
 */
int maplow(int i)
{
	if (i >= 0 && i <= 0377 && isupper(i))
		i = tolower(i);
	return i;
}
/*
 * vowel: return 1 if i is one of a, e, i, o, u or y in either case,
 * 0 otherwise.  The explicit int return type replaces the old
 * implicit one.
 */
int vowel(int i)
{
	switch (i) {
	case 'a': case 'A':
	case 'e': case 'E':
	case 'i': case 'I':
	case 'o': case 'O':
	case 'u': case 'U':
	case 'y': case 'Y':
		return 1;
	default:
		return 0;
	}
}
- Tchar *chkvow(Tchar *w)
- {
- while (--w >= wdstart)
- if (vowel(cbits(*w)))
- return(w);
- return(0);
- }
- void digram(void)
- {
- Tchar *w;
- int val;
- Tchar *nhyend, *maxw;
- int maxval;
- extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
- again:
- if (!(w = chkvow(hyend + 1)))
- return;
- hyend = w;
- if (!(w = chkvow(hyend)))
- return;
- nhyend = w;
- maxval = 0;
- w--;
- while (++w < hyend && w < wdend - 1) {
- val = 1;
- if (w == wdstart)
- val *= dilook('a', cbits(*w), bxh);
- else if (w == wdstart + 1)
- val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
- else
- val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
- val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
- val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
- if (val > maxval) {
- maxval = val;
- maxw = w + 1;
- }
- }
- hyend = nhyend;
- if (maxval > thresh)
- *hyp++ = maxw;
- goto again;
- }
/*
 * dilook: look up the 4-bit weight for the letter pair (a, b) in the
 * digram table t.
 *
 * Rows are indexed by a and columns by b, both mapped to lower case.
 * Two weights are packed into each byte: an even column uses the high
 * nibble, an odd column the low nibble.
 *
 * Returns the weight (0..15).  If either character is not in 'a'..'z'
 * after case mapping, 0 is returned instead of indexing outside the
 * table; digram() multiplies the weights, so such a position simply
 * scores zero.
 */
int dilook(int a, int b, char t[26][13])
{
	int row, col, v;

	row = maplow(a) - 'a';
	col = maplow(b) - 'a';
	if (row < 0 || row >= 26 || col < 0 || col >= 26)
		return 0;
	v = t[row][col/2];
	if (!(col & 01))
		v >>= 4;
	return v & 017;
}
- /* here beginneth the tex hyphenation code, as interpreted freely */
- /* the main difference is that there is no attempt to squeeze space */
- /* as tightly as tex does. */
- static int texit(Tchar *, Tchar *);
- static int readpats(void);
- static void install(char *);
- static void fixup(void);
- static int trieindex(int, int);
- static char pats[50000]; /* size ought to be computed dynamically */
- static char *nextpat = pats;
- static char *trie[27*27]; /* english-specific sizes */
- int texhyphen(void)
- {
- static int loaded = 0; /* -1: couldn't find tex file */
- if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
- return 0;
- if (loaded == 0) {
- if (readpats())
- loaded = 1;
- else
- loaded = -1;
- }
- return texit(wdstart, wdend);
- }
- static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
- {
- int nw, i, k, equal, cnt[500];
- char w[500+1], *np, *pp, *wp, *xpp, *xwp;
- w[0] = '.';
- for (nw = 1; start <= end && nw < 500-1; nw++, start++)
- w[nw] = maplow(tolower(cbits(*start)));
- start -= (nw - 1);
- w[nw++] = '.';
- w[nw] = 0;
- /*
- * printf("try %s\n", w);
- */
- for (i = 0; i <= nw; i++)
- cnt[i] = '0';
- for (wp = w; wp+1 < w+nw; wp++) {
- for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
- if (pp == 0 /* no trie entry */
- || *pp != *wp /* no match on 1st letter */
- || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
- break; /* so move to next letter of word */
- equal = 1;
- for (xpp = pp+2, xwp = wp+2; *xpp; )
- if (*xpp++ != *xwp++) {
- equal = 0;
- break;
- }
- if (equal) {
- np = xpp+1; /* numpat */
- for (k = wp-w; *np; k++, np++)
- if (*np > cnt[k])
- cnt[k] = *np;
- /*
- * printf("match: %s %s\n", pp, xpp+1);
- */
- }
- pp += *(pp-1); /* skip over pattern and numbers to next */
- }
- }
- /*
- * for (i = 0; i < nw; i++) printf("%c", w[i]);
- * printf(" ");
- * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
- * printf("\n");
- */
- /*
- * for (i = 1; i < nw - 1; i++) {
- * if (i > 2 && i < nw - 3 && cnt[i] % 2)
- * printf("-");
- * if (cbits(start[i-1]) != '.')
- * printf("%c", cbits(start[i-1]));
- * }
- * printf("\n");
- */
- for (i = 1; i < nw -1; i++)
- if (i > 2 && i < nw - 3 && cnt[i] % 2)
- *hyp++ = start + i - 1;
- return hyp - hyptr; /* non-zero if a hyphen was found */
- }
- /*
- This code assumes that hyphen.tex looks like
- % some comments
- \patterns{ % more comments
- pat5ter4ns, 1 per line, SORTED, nothing else
- }
- more goo
- \hyphenation{ % more comments
- ex-cep-tions, one per line; i ignore this part for now
- }
- this code is NOT robust against variations. unfortunately,
- it looks like every local language version of this file has
- a different format. i have also made no provision for weird
- characters. sigh.
- */
- static int readpats(void)
- {
- FILE *fp;
- char buf[200], buf1[200];
- if ((fp = fopen(TEXHYPHENS, "r")) == NULL
- && (fp = fopen(DWBalthyphens, "r")) == NULL) {
- ERROR "warning: can't find hyphen.tex" WARN;
- return 0;
- }
- while (fgets(buf, sizeof buf, fp) != NULL) {
- sscanf(buf, "%s", buf1);
- if (strcmp(buf1, "\\patterns{") == 0)
- break;
- }
- while (fgets(buf, sizeof buf, fp) != NULL) {
- if (buf[0] == '}')
- break;
- install(buf);
- }
- fclose(fp);
- fixup();
- return 1;
- }
- static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
- {
- int npat, lastpat;
- char num[500], *onextpat = nextpat;
- num[0] = '0';
- *nextpat++ = ' '; /* fill in with count later */
- for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
- if (isdigit(*s)) {
- num[npat] = *s;
- lastpat = npat;
- } else {
- *nextpat++ = *s;
- npat++;
- num[npat] = '0';
- }
- }
- *nextpat++ = 0;
- if (nextpat > pats + sizeof(pats)-20) {
- ERROR "tex hyphenation table overflow, tail end ignored" WARN;
- nextpat = onextpat;
- }
- num[lastpat+1] = 0;
- strcat(nextpat, num);
- nextpat += strlen(nextpat) + 1;
- }
- static void fixup(void) /* build indexes of where . a b c ... start */
- {
- char *p, *lastc;
- int n;
- for (lastc = pats, p = pats+1; p < nextpat; p++)
- if (*p == ' ') {
- *lastc = p - lastc;
- lastc = p;
- }
- *lastc = p - lastc;
- for (p = pats+1; p < nextpat; ) {
- n = trieindex(p[0], p[1]);
- if (trie[n] == 0)
- trie[n] = p;
- p += p[-1];
- }
- /* printf("pats = %d\n", nextpat - pats); */
- }
/*
 * trieindex: map a pair of pattern characters to a slot in trie[].
 * '.' counts as 0 and 'a'..'z' as 1..26; the result is 27 * first +
 * second.  An index outside 0 .. 27*27-1 trips the assertion.
 */
static int trieindex(int d1, int d2)
{
	int hi, lo, idx;

	hi = (d1 == '.') ? 0 : d1 - 'a' + 1;
	lo = (d2 == '.') ? 0 : d2 - 'a' + 1;
	idx = 27*hi + lo;
	assert(0 <= idx && idx < 27*27);
	return idx;
}
|