pcode.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include "code.h"
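  /*
   * read an annotated spelling list in the form
   *	word <tab> affixcode [ , affixcode ] ...
   * and write a re-encoded binary dictionary on standard output
   * (not the textual "octal <tab> word" form this comment used to
   * describe); the layout is given in the comment before pdict().
   */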
  /* One dictionary entry, filled in by readinput(). */
  typedef struct Dict
  {
      char *word;     /* NUL-terminated word text, stored inside space[] */
      int encode;     /* value returned by typecode() for this word's affix list */
  } Dict;
  17. Dict words[200000];
  18. char space[500000];
  19. long encodes[4094];
  20. long nspace;
  21. long nwords;
  22. int ncodes;
  23. Biobuf bout;
  24. void readinput(int f);
  25. long typecode(char *str);
  26. int wcmp(void*, void*);
  27. void pdict(void);
  28. void sput(int);
  29. void
  30. main(int argc, char *argv[])
  31. {
  32. int f;
  33. Binit(&bout, 1, OWRITE);
  34. nwords = 0;
  35. nspace = 0;
  36. ncodes = 0;
  37. if(argc <= 1)
  38. readinput(0);
  39. while(argc > 1) {
  40. f = open(argv[1], 0);
  41. if(f < 0) {
  42. fprint(2, "Cannot open %s\n", argv[1]);
  43. exits("open");
  44. }
  45. readinput(f);
  46. argc--;
  47. argv++;
  48. }
  49. fprint(2, "words = %ld; space = %ld; codes = %d\n",
  50. nwords, nspace, ncodes);
  51. qsort(words, nwords, sizeof(words[0]), wcmp);
  52. pdict();
  53. exits(0);
  54. }
  55. wcmp(void *a, void *b)
  56. {
  57. return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
  58. }
  59. void
  60. readinput(int f)
  61. {
  62. long i;
  63. char *code, *line, *bword;
  64. Biobuf buf;
  65. long lineno = 0;
  66. Binit(&buf, f, OREAD);
  67. while(line = Brdline(&buf, '\n')) {
  68. line[Blinelen(&buf)-1] = 0;
  69. lineno++;
  70. code = line;
  71. while(isspace(*code))
  72. code++;
  73. bword = code;
  74. while(*code && !isspace(*code))
  75. code++;
  76. i = code-bword;
  77. memmove(space+nspace, bword, i);
  78. words[nwords].word = space+nspace;
  79. nspace += i;
  80. space[nspace] = 0;
  81. nspace++;
  82. if(*code) {
  83. *code++ = 0;
  84. while(isspace(*code))
  85. code++;
  86. }
  87. words[nwords].encode = typecode(code);
  88. nwords++;
  89. if(nwords >= sizeof(words)/sizeof(words[0])) {
  90. fprint(2, "words array too small\n");
  91. exits("words");
  92. }
  93. if(nspace >= sizeof(space)/sizeof(space[0])) {
  94. fprint(2, "space array too small\n");
  95. exits("space");
  96. }
  97. }
  98. Bterm(&buf);
  99. }
  /* Associates one affix code name with the bits it stands for. */
  typedef struct Class
  {
      char *codename;     /* code as written in the input, e.g. "adv" */
      long bits;          /* value typecode() ORs into the word's code */
  } Class;
  106. Class codea[] =
  107. {
  108. { "a", ADJ },
  109. { "adv", ADV },
  110. 0
  111. };
  112. Class codec[] =
  113. {
  114. { "comp", COMP },
  115. 0
  116. };
  117. Class coded[] =
  118. {
  119. { "d", DONT_TOUCH},
  120. 0
  121. };
  122. Class codee[] =
  123. {
  124. { "ed", ED },
  125. { "er", ACTOR },
  126. 0
  127. };
  128. Class codei[] =
  129. {
  130. { "in", IN },
  131. { "ion", ION },
  132. 0
  133. };
  134. Class codem[] =
  135. {
  136. { "man", MAN },
  137. { "ms", MONO },
  138. 0
  139. };
  140. Class coden[] =
  141. {
  142. { "n", NOUN },
  143. { "na", N_AFFIX },
  144. { "nopref", NOPREF },
  145. 0
  146. };
  147. Class codep[] =
  148. {
  149. { "pc", PROP_COLLECT },
  150. 0
  151. };
  152. Class codes[] =
  153. {
  154. { "s", STOP },
  155. 0
  156. };
  157. Class codev[] =
  158. {
  159. { "v", VERB },
  160. { "va", V_AFFIX },
  161. { "vi", V_IRREG },
  162. 0
  163. };
  164. Class codey[] =
  165. {
  166. { "y", _Y },
  167. 0
  168. };
  169. Class codez[] =
  170. {
  171. 0
  172. };
  173. Class* codetab[] =
  174. {
  175. codea,
  176. codez,
  177. codec,
  178. coded,
  179. codee,
  180. codez,
  181. codez,
  182. codez,
  183. codei,
  184. codez,
  185. codez,
  186. codez,
  187. codem,
  188. coden,
  189. codez,
  190. codep,
  191. codez,
  192. codez,
  193. codes,
  194. codez,
  195. codez,
  196. codev,
  197. codez,
  198. codez,
  199. codey,
  200. codez,
  201. };
  202. long
  203. typecode(char *str)
  204. {
  205. Class *p;
  206. long code;
  207. int n, i;
  208. char *s, *sp, *st;
  209. code = 0;
  210. loop:
  211. for(s=str; *s != 0 && *s != ','; s++)
  212. ;
  213. for(p = codetab[*str-'a']; sp = p->codename; p++) {
  214. st = str;
  215. for(n=s-str;; st++,sp++) {
  216. if(*st != *sp)
  217. goto cont;
  218. n--;
  219. if(n == 0)
  220. break;
  221. }
  222. code |= p->bits;
  223. if(*s == 0)
  224. goto out;
  225. str = s+1;
  226. goto loop;
  227. cont:;
  228. }
  229. fprint(2, "Unknown affix code \"%s\"\n", str);
  230. return 0;
  231. out:
  232. for(i=0; i<ncodes; i++)
  233. if(encodes[i] == code)
  234. return i;
  235. encodes[i] = code;
  236. ncodes++;
  237. return i;
  238. }
  239. void
  240. sput(int s)
  241. {
  242. Bputc(&bout, s>>8);
  243. Bputc(&bout, s);
  244. }
  245. void
  246. lput(long l)
  247. {
  248. Bputc(&bout, l>>24);
  249. Bputc(&bout, l>>16);
  250. Bputc(&bout, l>>8);
  251. Bputc(&bout, l);
  252. }
  253. /*
  254. * spit out the encoded dictionary
  255. * all numbers are encoded big-endian.
  256. * struct
  257. * {
  258. * short ncodes;
  259. * long encodes[ncodes];
  260. * struct
  261. * {
  262. * short encode;
  263. * char word[*];
  264. * } words[*];
  265. * };
  266. * 0x8000 flag for code word
  267. * 0x7800 count of number of common bytes with previous word
  268. * 0x07ff index into codes array for affixes
  269. */
  270. void
  271. pdict(void)
  272. {
  273. long i, count;
  274. int encode, j, c;
  275. char *lastword, *thisword, *word;
  276. sput(ncodes);
  277. for(i=0; i<ncodes; i++)
  278. lput(encodes[i]);
  279. count = ncodes*4 + 2;
  280. lastword = "";
  281. for(i=0; i<nwords; i++) {
  282. word = words[i].word;
  283. thisword = word;
  284. for(j=0; *thisword == *lastword; j++) {
  285. if(*thisword == 0) {
  286. fprint(2, "identical words: %s\n", word);
  287. break;
  288. }
  289. thisword++;
  290. lastword++;
  291. }
  292. if(j > 15)
  293. j = 15;
  294. encode = words[i].encode;
  295. c = (1<<15) | (j<<11) | encode;
  296. sput(c);
  297. count += 2;
  298. for(thisword=word+j; c = *thisword; thisword++) {
  299. Bputc(&bout, c);
  300. count++;
  301. }
  302. lastword = word;
  303. }
  304. fprint(2, "output bytes = %ld\n", count);
  305. }