123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <ctype.h>
- #include "code.h"
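/*
 * Read an annotated spelling list whose lines have the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and print a re-encoded version
 *	octal <tab> word
 * NOTE(review): the output description above looks stale; pdict()
 * writes the binary layout documented in the comment that precedes it.
 */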
/*
 * One dictionary entry, filled in by readinput().
 */
typedef struct Dict Dict;
struct Dict
{
	char	*word;		/* NUL-terminated word; points into space[] */
	int	encode;		/* value returned by typecode() for this word's affix codes */
};
- Dict words[200000];
- char space[500000];
- long encodes[4094];
- long nspace;
- long nwords;
- int ncodes;
- Biobuf bout;
- void readinput(int f);
- long typecode(char *str);
- int wcmp(void*, void*);
- void pdict(void);
- void sput(int);
- void
- main(int argc, char *argv[])
- {
- int f;
- Binit(&bout, 1, OWRITE);
- nwords = 0;
- nspace = 0;
- ncodes = 0;
- if(argc <= 1)
- readinput(0);
- while(argc > 1) {
- f = open(argv[1], 0);
- if(f < 0) {
- fprint(2, "Cannot open %s\n", argv[1]);
- exits("open");
- }
- readinput(f);
- argc--;
- argv++;
- }
- fprint(2, "words = %ld; space = %ld; codes = %d\n",
- nwords, nspace, ncodes);
- qsort(words, nwords, sizeof(words[0]), wcmp);
- pdict();
- exits(0);
- }
- wcmp(void *a, void *b)
- {
- return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
- }
- void
- readinput(int f)
- {
- long i;
- char *code, *line, *bword;
- Biobuf buf;
- long lineno = 0;
- Binit(&buf, f, OREAD);
- while(line = Brdline(&buf, '\n')) {
- line[Blinelen(&buf)-1] = 0;
- lineno++;
- code = line;
- while(isspace(*code))
- code++;
- bword = code;
- while(*code && !isspace(*code))
- code++;
- i = code-bword;
- memmove(space+nspace, bword, i);
- words[nwords].word = space+nspace;
- nspace += i;
- space[nspace] = 0;
- nspace++;
- if(*code) {
- *code++ = 0;
- while(isspace(*code))
- code++;
- }
- words[nwords].encode = typecode(code);
- nwords++;
- if(nwords >= sizeof(words)/sizeof(words[0])) {
- fprint(2, "words array too small\n");
- exits("words");
- }
- if(nspace >= sizeof(space)/sizeof(space[0])) {
- fprint(2, "space array too small\n");
- exits("space");
- }
- }
- Bterm(&buf);
- }
/*
 * One affix code: the name as it is written in the input list and
 * the bits that typecode() ors into the word's code for it.
 */
typedef struct Class Class;
struct Class
{
	char	*codename;	/* spelling of the code; 0 ends a table */
	long	bits;		/* flag bits contributed by this code */
};
- Class codea[] =
- {
- { "a", ADJ },
- { "adv", ADV },
- 0
- };
- Class codec[] =
- {
- { "comp", COMP },
- 0
- };
- Class coded[] =
- {
- { "d", DONT_TOUCH},
- 0
- };
- Class codee[] =
- {
- { "ed", ED },
- { "er", ACTOR },
- 0
- };
- Class codei[] =
- {
- { "in", IN },
- { "ion", ION },
- 0
- };
- Class codem[] =
- {
- { "man", MAN },
- { "ms", MONO },
- 0
- };
- Class coden[] =
- {
- { "n", NOUN },
- { "na", N_AFFIX },
- { "nopref", NOPREF },
- 0
- };
- Class codep[] =
- {
- { "pc", PROP_COLLECT },
- 0
- };
- Class codes[] =
- {
- { "s", STOP },
- 0
- };
- Class codev[] =
- {
- { "v", VERB },
- { "va", V_AFFIX },
- { "vi", V_IRREG },
- 0
- };
- Class codey[] =
- {
- { "y", _Y },
- 0
- };
- Class codez[] =
- {
- 0
- };
- Class* codetab[] =
- {
- codea,
- codez,
- codec,
- coded,
- codee,
- codez,
- codez,
- codez,
- codei,
- codez,
- codez,
- codez,
- codem,
- coden,
- codez,
- codep,
- codez,
- codez,
- codes,
- codez,
- codez,
- codev,
- codez,
- codez,
- codey,
- codez,
- };
- long
- typecode(char *str)
- {
- Class *p;
- long code;
- int n, i;
- char *s, *sp, *st;
- code = 0;
- loop:
- for(s=str; *s != 0 && *s != ','; s++)
- ;
- for(p = codetab[*str-'a']; sp = p->codename; p++) {
- st = str;
- for(n=s-str;; st++,sp++) {
- if(*st != *sp)
- goto cont;
- n--;
- if(n == 0)
- break;
- }
- code |= p->bits;
- if(*s == 0)
- goto out;
- str = s+1;
- goto loop;
- cont:;
- }
- fprint(2, "Unknown affix code \"%s\"\n", str);
- return 0;
- out:
- for(i=0; i<ncodes; i++)
- if(encodes[i] == code)
- return i;
- encodes[i] = code;
- ncodes++;
- return i;
- }
- void
- sput(int s)
- {
- Bputc(&bout, s>>8);
- Bputc(&bout, s);
- }
- void
- lput(long l)
- {
- Bputc(&bout, l>>24);
- Bputc(&bout, l>>16);
- Bputc(&bout, l>>8);
- Bputc(&bout, l);
- }
- /*
- * spit out the encoded dictionary
- * all numbers are encoded big-endian.
- * struct
- * {
- * short ncodes;
- * long encodes[ncodes];
- * struct
- * {
- * short encode;
- * char word[*];
- * } words[*];
- * };
- * 0x8000 flag for code word
- * 0x7800 count of number of common bytes with previous word
- * 0x07ff index into codes array for affixes
- */
- void
- pdict(void)
- {
- long i, count;
- int encode, j, c;
- char *lastword, *thisword, *word;
- sput(ncodes);
- for(i=0; i<ncodes; i++)
- lput(encodes[i]);
- count = ncodes*4 + 2;
- lastword = "";
- for(i=0; i<nwords; i++) {
- word = words[i].word;
- thisword = word;
- for(j=0; *thisword == *lastword; j++) {
- if(*thisword == 0) {
- fprint(2, "identical words: %s\n", word);
- break;
- }
- thisword++;
- lastword++;
- }
- if(j > 15)
- j = 15;
- encode = words[i].encode;
- c = (1<<15) | (j<<11) | encode;
- sput(c);
- count += 2;
- for(thisword=word+j; c = *thisword; thisword++) {
- Bputc(&bout, c);
- count++;
- }
- lastword = word;
- }
- fprint(2, "output bytes = %ld\n", count);
- }
|