sprog.c 22 KB


  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include "code.h"
/*
 * fig leaves for possibly signed char quantities:
 * the argument is masked to 0..255 before it reaches <ctype.h>
 * or is used as an index into voweltab
 */
#define ISUPPER(c) isupper((c)&0xff)
#define ISLOWER(c) islower((c)&0xff)
#define ISALPHA(c) isalpha((c)&0xff)
#define ISDIGIT(c) isdigit((c)&0xff)
#define ISVOWEL(c) voweltab[(c)&0xff]
/* note: evaluates its argument more than once */
#define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c))
/* pack two characters into one integer, used for switch labels */
#define pair(a,b) (((a)<<8) | (b))
/* amount added to the derivation level on each trysuff call */
#define DLEV 2
/* derivations are recorded in deriv[] only for levels below DSIZ */
#define DSIZ 40
/* bit set of word/affix flags */
typedef long Bits;
/* the bits of f that are present in h (nonzero if any) */
#define Set(h, f) ((long)(h) & (f))

/*
 * suffix routines, called through Suftab.p1/p2 as
 * (ep, d, a, lev, flag)
 */
Bits nop(char*, char*, char*, int, int);
Bits strip(char*, char*, char*, int, int);
Bits ize(char*, char*, char*, int, int);
Bits i_to_y(char*, char*, char*, int, int);
Bits ily(char*, char*, char*, int, int);
Bits subst(char*, char*, char*, int, int);
Bits CCe(char*, char*, char*, int, int);
Bits tion(char*, char*, char*, int, int);
Bits an(char*, char*, char*, int, int);
Bits s(char*, char*, char*, int, int);
Bits es(char*, char*, char*, int, int);
Bits bility(char*, char*, char*, int, int);
Bits y_to_e(char*, char*, char*, int, int);
Bits VCe(char*, char*, char*, int, int);

/* lookup drivers and helpers */
Bits trypref(char*, char*, int, int);
Bits tryword(char*, char*, int, int);
Bits trysuff(char*, int, int);
Bits dict(char*, char*);
void typeprint(Bits);
void pcomma(char*);
void ise(void);
int ordinal(void);
char* skipv(char*);
int inun(char*, Bits);
char* ztos(char*);
void readdict(char*);
/*
 * one prefix: its spelling and a flag word
 * (trypref tests the flag against IN)
 */
typedef struct Ptab Ptab;
struct Ptab
{
	char* s;
	int flag;
};

/*
 * one suffix rule, as used by trysuff:
 *	suf	the suffix spelled backwards (it is matched from the
 *		end of the word toward the front)
 *	p1	first routine to try, called with ep-n1
 *	n1	number of characters to back up from the end of the word
 *	d1, a1	derivation messages handed to p1 as d and a
 *	flag	handed to the routine, or'ed with STOP
 *	affixable	must share a bit with the caller's flag,
 *		otherwise trysuff gives up
 *	p2, n2, d2, a2	optional second attempt, used when p1 fails
 */
typedef struct Suftab Suftab;
struct Suftab
{
	char *suf;
	Bits (*p1)(char*, char*, char*, int, int);
	int n1;
	char *d1;
	char *a1;
	int flag;
	int affixable;
	Bits (*p2)(char*, char*, char*, int, int);
	int n2;
	char *d2;
	char *a2;
};
/*
 * suffix tables, one per final letter of the word (stabX holds the
 * suffixes ending in X).  suffixes are spelled backwards, entries are
 * tried in order and the first one that matches is used, so longer
 * suffixes come before shorter ones they contain.  each table ends
 * with a zero entry.
 */
Suftab staba[] = {
	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
	0
};
Suftab stabc[] =
{
	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
	0
};
Suftab stabd[] =
{
	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
	0
};
Suftab stabe[] =
{
	/*
	 * V_affix for comment ->commence->commentment??
	 */
	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
	0
};
Suftab stabg[] =
{
	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
	{"gnikam",strip,6,"","+making",NOUN,NOUN},
	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
	0
};
Suftab stabl[] =
{
	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
	0
};
  119. Suftab stabm[] =
  120. {
  121. /* congregational + ism */
  122. {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
  123. {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
  124. 0
  125. };
/*
 * remaining suffix tables (suffixes ending in n, p, r, s, t, y);
 * same layout as the others: backwards spelling, first match wins,
 * zero entry at the end.
 */
Suftab stabn[] =
{
	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
	0
};
Suftab stabp[] =
{
	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	0
};
Suftab stabr[] =
{
	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
	{"reyhparg",nop,0,"","",0,NOUN},
	{"reyl",nop,0,"","",0,NOUN},
	{"rekam",strip,5,"","+maker",NOUN,NOUN},
	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
	0
};
Suftab stabs[] =
{
	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
	0
};
Suftab stabt[] =
{
	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
	{"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
	0
};
Suftab staby[] =
{
	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
	{"ytisuo",nop,0,"","",NOUN},
	{"ytilb",nop,0,"","",0,NOUN},
	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
	{"ylc",nop,0,"","",0},
	{"ylelb",nop,0,"","",0},
	{"ylelp",nop,0,"","",0},
	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
	0
};
/* empty table, shared by every letter that ends no known suffix */
Suftab stabz[] =
{
	0
};
/*
 * suffix tables indexed by (last letter of the word) - 'a';
 * 26 entries, 'a' through 'z'
 */
Suftab* suftab[] =
{
	staba,
	stabz,
	stabc,
	stabd,
	stabe,
	stabz,
	stabg,
	stabz,
	stabz,
	stabz,
	stabz,
	stabl,
	stabm,
	stabn,
	stabz,
	stabp,
	stabz,
	stabr,
	stabs,
	stabt,
	stabz,
	stabz,
	stabz,
	stabz,
	staby,
	stabz,
};
  225. Ptab ptaba[] =
  226. {
  227. "anti", 0,
  228. "auto", 0,
  229. 0
  230. };
  231. Ptab ptabb[] =
  232. {
  233. "bio", 0,
  234. 0
  235. };
  236. Ptab ptabc[] =
  237. {
  238. "counter", 0,
  239. 0
  240. };
  241. Ptab ptabd[] =
  242. {
  243. "dis", 0,
  244. 0
  245. };
  246. Ptab ptabe[] =
  247. {
  248. "electro", 0,
  249. 0
  250. };
  251. Ptab ptabf[] =
  252. {
  253. "femto", 0,
  254. 0
  255. };
  256. Ptab ptabg[] =
  257. {
  258. "geo", 0,
  259. "giga", 0,
  260. 0
  261. };
  262. Ptab ptabh[] =
  263. {
  264. "hyper", 0,
  265. 0
  266. };
  267. Ptab ptabi[] =
  268. {
  269. "immuno", 0,
  270. "im", IN,
  271. "intra", 0,
  272. "inter", 0,
  273. "in", IN,
  274. "ir", IN,
  275. "iso", 0,
  276. 0
  277. };
  278. Ptab ptabj[] =
  279. {
  280. 0
  281. };
  282. Ptab ptabk[] =
  283. {
  284. "kilo", 0,
  285. 0
  286. };
  287. Ptab ptabl[] =
  288. {
  289. 0
  290. };
  291. Ptab ptabm[] =
  292. {
  293. "magneto", 0,
  294. "mega", 0,
  295. "meta", 0,
  296. "micro", 0,
  297. "mid", 0,
  298. "milli", 0,
  299. "mini", 0,
  300. "mis", 0,
  301. "mono", 0,
  302. "multi", 0,
  303. 0
  304. };
  305. Ptab ptabn[] =
  306. {
  307. "nano", 0,
  308. "neuro", 0,
  309. "non", 0,
  310. 0
  311. };
  312. Ptab ptabo[] =
  313. {
  314. "out", 0,
  315. "over", 0,
  316. 0
  317. };
  318. Ptab ptabp[] =
  319. {
  320. "para", 0,
  321. "photo", 0,
  322. "pico", 0,
  323. "poly", 0,
  324. "pre", 0,
  325. "pseudo", 0,
  326. "psycho", 0,
  327. 0
  328. };
  329. Ptab ptabq[] =
  330. {
  331. "quasi", 0,
  332. 0
  333. };
  334. Ptab ptabr[] =
  335. {
  336. "radio", 0,
  337. "re", 0,
  338. 0
  339. };
  340. Ptab ptabs[] =
  341. {
  342. "semi", 0,
  343. "stereo", 0,
  344. "sub", 0,
  345. "super", 0,
  346. 0
  347. };
  348. Ptab ptabt[] =
  349. {
  350. "tele", 0,
  351. "tera", 0,
  352. "thermo", 0,
  353. 0
  354. };
  355. Ptab ptabu[] =
  356. {
  357. "ultra", 0,
  358. "under", 0, /*must precede un*/
  359. "un", IN,
  360. 0
  361. };
  362. Ptab ptabv[] =
  363. {
  364. 0
  365. };
  366. Ptab ptabw[] =
  367. {
  368. 0
  369. };
  370. Ptab ptabx[] =
  371. {
  372. 0
  373. };
  374. Ptab ptaby[] =
  375. {
  376. 0
  377. };
  378. Ptab ptabz[] =
  379. {
  380. 0
  381. };
  382. Ptab* preftab[] =
  383. {
  384. ptaba,
  385. ptabb,
  386. ptabc,
  387. ptabd,
  388. ptabe,
  389. ptabf,
  390. ptabg,
  391. ptabh,
  392. ptabi,
  393. ptabj,
  394. ptabk,
  395. ptabl,
  396. ptabm,
  397. ptabn,
  398. ptabo,
  399. ptabp,
  400. ptabq,
  401. ptabr,
  402. ptabs,
  403. ptabt,
  404. ptabu,
  405. ptabv,
  406. ptabw,
  407. ptabx,
  408. ptaby,
  409. ptabz,
  410. };
/*
 * one step of a derivation: the message to print and what kind of
 * affix it describes.  trypref adds PREF to type once per stripped
 * prefix, and tryword recovers the count as type/PREF.
 */
typedef struct {
	char *mesg;
	enum { NONE, SUFF, PREF} type;
} Deriv;

/* option flags, counted in main */
int aflag;	/* -a: input lines carry an id ending at the second ':' */
int cflag;	/* -c: print one character per word instead of the word */
int fflag;	/* -f: dictionary file given explicitly */
int vflag;	/* -v: collect and print derivations */
int xflag;	/* -x: dict reports every lookup on fd 2 */
int nflag;	/* not referenced in the code visible here */

char word[500];		/* working copy of the current word; edited in place */
char* original;		/* the word as read, used for output */
Deriv emptyderiv;	/* all zeros, used to clear deriv slots */
Deriv deriv[DSIZ+3];	/* derivation steps, indexed by level */
char affix[DSIZ*10]; /* 10 is longest affix message */
int prefcount;		/* prefixes in the last derivation collected */
int suffcount;		/* suffixes in the last derivation collected */
char* acmeid;		/* with -a, the id part of the current line */
char space[300000]; /* must be as large as "words"+"space" in pcode run */
Bits encode[2048]; /* must be as long as "codes" in pcode run */
int nencode;		/* number of entries read into encode */
char voweltab[256];	/* nonzero for aeiouy in either case; set in main */
char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
Biobuf bin;		/* standard input */
Biobuf bout;		/* standard output */
char* codefile = "/sys/lib/amspell";	/* default dictionary */
char* brfile = "/sys/lib/brspell";	/* dictionary selected by -b */
char* Usage = "usage";			/* exit status for bad arguments */
  439. void
  440. main(int argc, char *argv[])
  441. {
  442. char *ep, *cp;
  443. char *dp;
  444. int j, i, c;
  445. int low;
  446. Bits h;
  447. Binit(&bin, 0, OREAD);
  448. Binit(&bout, 1, OWRITE);
  449. for(i=0; c = "aeiouyAEIOUY"[i]; i++)
  450. voweltab[c] = 1;
  451. while(argc > 1) {
  452. if(argv[1][0] != '-')
  453. break;
  454. for(i=1; c = argv[1][i]; i++)
  455. switch(c) {
  456. default:
  457. fprint(2, "usage: spell [-bcCvx] [-f file]\n");
  458. exits(Usage);
  459. case 'a':
  460. aflag++;
  461. continue;
  462. case 'b':
  463. ise();
  464. if(!fflag)
  465. codefile = brfile;
  466. continue;
  467. case 'C': /* for "correct" */
  468. vflag++;
  469. case 'c': /* for ocr */
  470. cflag++;
  471. continue;
  472. case 'v':
  473. vflag++;
  474. continue;
  475. case 'x':
  476. xflag++;
  477. continue;
  478. case 'f':
  479. if(argc <= 2) {
  480. fprint(2, "spell: -f requires another argument\n");
  481. exits(Usage);
  482. }
  483. argv++;
  484. argc--;
  485. codefile = argv[1];
  486. fflag++;
  487. goto brk;
  488. }
  489. brk:
  490. argv++;
  491. argc--;
  492. }
  493. readdict(codefile);
  494. if(argc > 1) {
  495. fprint(2, "usage: spell [-bcCvx] [-f file]\n");
  496. exits(Usage);
  497. }
  498. if(aflag)
  499. cflag = vflag = 0;
  500. for(;;) {
  501. affix[0] = 0;
  502. original = Brdline(&bin, '\n');
  503. if(original == 0)
  504. exits(0);
  505. original[Blinelen(&bin)-1] = 0;
  506. low = 0;
  507. if(aflag) {
  508. acmeid = original;
  509. while(*original != ':')
  510. if(*original++ == 0)
  511. exits(0);
  512. while(*++original != ':')
  513. if(*original == 0)
  514. exits(0);
  515. *original++ = 0;
  516. }
  517. for(ep=word,dp=original; j = *dp; ep++,dp++) {
  518. if(ISLOWER(j))
  519. low++;
  520. if(ep >= word+sizeof(word)-1)
  521. break;
  522. *ep = j;
  523. }
  524. *ep = 0;
  525. if(ISDIGIT(word[0]) && ordinal())
  526. continue;
  527. h = 0;
  528. if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
  529. for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
  530. *dp = Tolower(*cp);
  531. if(!h)
  532. for(;;) { /* at most twice */
  533. if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
  534. break;
  535. if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
  536. break;
  537. if(!ISUPPER(word[0]))
  538. break;
  539. cp = original;
  540. dp = word;
  541. while(*dp = *cp++) {
  542. if(!low)
  543. *dp = Tolower(*dp);
  544. dp++;
  545. }
  546. word[0] = Tolower(word[0]);
  547. }
  548. if(cflag) {
  549. if(!h || Set(h,STOP))
  550. print("-");
  551. else if(!vflag)
  552. print("+");
  553. else
  554. print("%c",'0' + (suffcount>0) +
  555. (prefcount>4? 8: 2*prefcount));
  556. } else if(!h || Set(h,STOP)) {
  557. if(aflag)
  558. Bprint(&bout, "%s:%s\n", acmeid, original);
  559. else
  560. Bprint(&bout, "%s\n", original);
  561. } else if(affix[0] != 0 && affix[0] != '.')
  562. print("%s\t%s\n", affix, original);
  563. }
  564. exits(0);
  565. }
  566. /* strip exactly one suffix and do
  567. * indicated routine(s), which may recursively
  568. * strip suffixes
  569. */
  570. Bits
  571. trysuff(char* ep, int lev, int flag)
  572. {
  573. Suftab *t;
  574. char *cp, *sp;
  575. Bits h = 0;
  576. int initchar = ep[-1];
  577. flag &= ~MONO;
  578. lev += DLEV;
  579. if(lev < DSIZ) {
  580. deriv[lev] = emptyderiv;
  581. deriv[lev-1] = emptyderiv;
  582. }
  583. if(!ISLOWER(initchar))
  584. return h;
  585. for(t=suftab[initchar-'a']; sp=t->suf; t++) {
  586. cp = ep;
  587. while(*sp)
  588. if(*--cp != *sp++)
  589. goto next;
  590. for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
  591. ;
  592. if(sp < word)
  593. continue;
  594. if(!(t->affixable & flag))
  595. return 0;
  596. h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
  597. if(!h && t->p2!=0) {
  598. if(lev < DSIZ) {
  599. deriv[lev] = emptyderiv;
  600. deriv[lev+1] = emptyderiv;
  601. }
  602. h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
  603. }
  604. break;
  605. next:;
  606. }
  607. return h;
  608. }
  609. Bits
  610. nop(char* ep, char* d, char* a, int lev, int flag)
  611. {
  612. USED(ep, d, a, lev, flag);
  613. return 0;
  614. }
  615. Bits
  616. cstrip(char* ep, char* d, char* a, int lev, int flag)
  617. {
  618. int temp = ep[0];
  619. if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
  620. switch(pair(ep[-1],ep[0])) {
  621. case pair('a', 'a'):
  622. case pair('a', 'e'):
  623. case pair('a', 'i'):
  624. case pair('e', 'a'):
  625. case pair('e', 'e'):
  626. case pair('e', 'i'):
  627. case pair('i', 'i'):
  628. case pair('o', 'a'):
  629. return 0;
  630. }
  631. } else
  632. if(temp==ep[-1]&&temp==ep[-2])
  633. return 0;
  634. return strip(ep,d,a,lev,flag);
  635. }
  636. Bits
  637. strip(char* ep, char* d, char* a, int lev, int flag)
  638. {
  639. Bits h = trypref(ep, a, lev, flag);
  640. USED(d);
  641. if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
  642. h = 0;
  643. if(h)
  644. return h;
  645. if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
  646. h = trypref(ep-1,a,lev,flag|MONO);
  647. if(h)
  648. return h;
  649. }
  650. return trysuff(ep,lev,flag);
  651. }
  652. Bits
  653. s(char* ep, char* d, char* a, int lev, int flag)
  654. {
  655. if(lev > DLEV+1)
  656. return 0;
  657. if(*ep=='s') {
  658. switch(ep[-1]) {
  659. case 'y':
  660. if(ISVOWEL(ep[-2])||ISUPPER(*word))
  661. break; /*says Kennedys*/
  662. case 'x':
  663. case 'z':
  664. case 's':
  665. return 0;
  666. case 'h':
  667. switch(ep[-2]) {
  668. case 'c':
  669. case 's':
  670. return 0;
  671. }
  672. }
  673. }
  674. return strip(ep,d,a,lev,flag);
  675. }
  676. Bits
  677. an(char* ep, char* d, char* a, int lev, int flag)
  678. {
  679. USED(d);
  680. if(!ISUPPER(*word)) /*must be proper name*/
  681. return 0;
  682. return trypref(ep,a,lev,flag);
  683. }
  684. Bits
  685. ize(char* ep, char* d, char* a, int lev, int flag)
  686. {
  687. int temp = ep[-1];
  688. Bits h;
  689. USED(a);
  690. ep[-1] = 'e';
  691. h = strip(ep,"",d,lev,flag);
  692. ep[-1] = temp;
  693. return h;
  694. }
  695. Bits
  696. y_to_e(char* ep, char* d, char* a, int lev, int flag)
  697. {
  698. Bits h;
  699. int temp;
  700. USED(a);
  701. switch(ep[-1]) {
  702. case 'a':
  703. case 'e':
  704. case 'i':
  705. return 0;
  706. }
  707. temp = *ep;
  708. *ep++ = 'e';
  709. h = strip(ep,"",d,lev,flag);
  710. ep[-1] = temp;
  711. return h;
  712. }
  713. Bits
  714. ily(char* ep, char* d, char* a, int lev, int flag)
  715. {
  716. int temp = ep[0];
  717. char *cp = ep;
  718. if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
  719. return 0;
  720. if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
  721. while(cp>word)
  722. if(ISVOWEL(*--cp)) /* shyness */
  723. return 0;
  724. if(ep[-1]=='i')
  725. return i_to_y(ep,d,a,lev,flag);
  726. return cstrip(ep,d,a,lev,flag);
  727. }
  728. Bits
  729. bility(char* ep, char* d, char* a, int lev, int flag)
  730. {
  731. *ep++ = 'l';
  732. return y_to_e(ep,d,a,lev,flag);
  733. }
  734. Bits
  735. i_to_y(char* ep, char* d, char* a, int lev, int flag)
  736. {
  737. Bits h;
  738. int temp;
  739. if(ISUPPER(*word))
  740. return 0;
  741. if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
  742. ep[-1] = 'y';
  743. a = d;
  744. }
  745. h = cstrip(ep,"",a,lev,flag);
  746. ep[-1] = temp;
  747. return h;
  748. }
  749. Bits
  750. es(char* ep, char* d, char* a, int lev, int flag)
  751. {
  752. if(lev>DLEV)
  753. return 0;
  754. switch(ep[-1]) {
  755. default:
  756. return 0;
  757. case 'i':
  758. return i_to_y(ep,d,a,lev,flag);
  759. case 'h':
  760. switch(ep[-2]) {
  761. default:
  762. return 0;
  763. case 'c':
  764. case 's':
  765. break;
  766. }
  767. case 's':
  768. case 'z':
  769. case 'x':
  770. return strip(ep,d,a,lev,flag);
  771. }
  772. }
/*
 * substitute one ending for another.  d has the form "-old+new":
 * the characters between '-' and '+' are written over the end of
 * the stem (ending just before ep), the result is looked up with
 * strip, and then the characters after '+' are written back
 * starting at the same place.
 * refused when skipv, applied twice from ep-1, runs off the front
 * of the word.
 */
Bits
subst(char* ep, char* d, char* a, int lev, int flag)
{
	char *u,*t;
	Bits h;

	USED(a);
	if(skipv(skipv(ep-1)) < word)
		return 0;
	/* find the '+' that separates the two halves of d */
	for(t=d; *t!='+'; t++)
		continue;
	/* copy the "-old" part backwards so that it ends at ep */
	for(u=ep; *--t!='-';)
		*--u = *t;
	h = strip(ep,"",d,lev,flag);
	/* t is at the '-': move to the '+' and copy the "+new" part over */
	while(*++t != '+')
		continue;
	while(*++t)
		*u++ = *t;
	return h;
}
  792. Bits
  793. tion(char* ep, char* d, char* a, int lev, int flag)
  794. {
  795. switch(ep[-2]) {
  796. default:
  797. return trypref(ep,a,lev,flag);
  798. case 'a':
  799. case 'e':
  800. case 'i':
  801. case 'o':
  802. case 'u':
  803. return y_to_e(ep,d,a,lev,flag);
  804. }
  805. }
/*
 * possible consonant-consonant-e ending
 *
 * decides from the last letters of the stem whether to try it with
 * an 'e' restored (y_to_e) or to hand it on to VCe.  the cases
 * fall through on purpose.
 */
Bits
CCe(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;

	switch(ep[-1]) {
	case 'l':
		if(ISVOWEL(ep[-2]))
			break;
		switch(ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			/* any other consonant before the l: restore the e */
			return y_to_e(ep,d,a,lev,flag);
		}
		break;
	case 'c':
	case 'g':
		if(*ep == 'a')	/* prevent -able for -eable */
			return 0;
		/* fall through */
	case 's':
	case 'v':
	case 'z':
		/* a doubled letter or a vowel in front goes to VCe */
		if(ep[-2]==ep[-1])
			break;
		if(ISVOWEL(ep[-2]))
			break;
		/* fall through */
	case 'u':
		if(h = y_to_e(ep,d,a,lev,flag))
			return h;
		/* only a stem ending in "ng" gets a second chance in VCe */
		if(!(ep[-2]=='n' && ep[-1]=='g'))
			return 0;
	}
	return VCe(ep,d,a,lev,flag);
}
  845. /*
  846. * possible consonant-vowel-consonant-e ending
  847. */
  848. Bits
  849. VCe(char* ep, char* d, char* a, int lev, int flag)
  850. {
  851. int c;
  852. Bits h;
  853. c = ep[-1];
  854. if(c=='e')
  855. return 0;
  856. if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
  857. c = *ep;
  858. *ep++ = 'e';
  859. h = trypref(ep,d,lev,flag);
  860. if(!h)
  861. h = trysuff(ep,lev,flag);
  862. if(h)
  863. return h;
  864. ep--;
  865. *ep = c;
  866. }
  867. return cstrip(ep,d,a,lev,flag);
  868. }
  869. Ptab*
  870. lookuppref(uchar** wp, char* ep)
  871. {
  872. Ptab *sp;
  873. uchar *bp,*cp;
  874. unsigned int initchar = Tolower(**wp);
  875. if(!ISALPHA(initchar))
  876. return 0;
  877. for(sp=preftab[initchar-'a'];sp->s;sp++) {
  878. bp = *wp;
  879. for(cp= (uchar*)sp->s;*cp; )
  880. if(*bp++!=*cp++)
  881. goto next;
  882. for(cp=bp;cp<(uchar*)ep;cp++)
  883. if(ISVOWEL(*cp)) {
  884. *wp = bp;
  885. return sp;
  886. }
  887. next:;
  888. }
  889. return 0;
  890. }
  891. /* while word is not in dictionary try stripping
  892. * prefixes. Fail if no more prefixes.
  893. */
  894. Bits
  895. trypref(char* ep, char* a, int lev, int flag)
  896. {
  897. Ptab *tp;
  898. char *bp, *cp;
  899. char *pp;
  900. Bits h;
  901. char space[20];
  902. if(lev<DSIZ) {
  903. deriv[lev].mesg = a;
  904. deriv[lev].type = *a=='.'? NONE: SUFF;
  905. }
  906. if(h = tryword(word,ep,lev,flag)) {
  907. if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
  908. return h;
  909. h = 0;
  910. }
  911. bp = word;
  912. pp = space;
  913. if(lev<DSIZ) {
  914. deriv[lev+1].mesg = pp;
  915. deriv[lev+1].type = 0;
  916. }
  917. while(tp=lookuppref((uchar**)&bp,ep)) {
  918. *pp++ = '+';
  919. cp = tp->s;
  920. while(pp<space+sizeof(space) && (*pp = *cp++))
  921. pp++;
  922. deriv[lev+1].type += PREF;
  923. h = tryword(bp,ep,lev+1,flag);
  924. if(Set(h,NOPREF) ||
  925. ((tp->flag&IN) && inun(bp-2,h)==0)) {
  926. h = 0;
  927. break;
  928. }
  929. if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
  930. break;
  931. h = 0;
  932. }
  933. if(lev < DSIZ) {
  934. deriv[lev+1] = emptyderiv;
  935. deriv[lev+2] = emptyderiv;
  936. }
  937. return h;
  938. }
  939. Bits
  940. tryword(char* bp, char* ep, int lev, int flag)
  941. {
  942. int j;
  943. Bits h = 0;
  944. char duple[3];
  945. if(ep-bp <= 1)
  946. return h;
  947. if(flag&MONO) {
  948. if(lev<DSIZ) {
  949. deriv[++lev].mesg = duple;
  950. deriv[lev].type = SUFF;
  951. }
  952. duple[0] = '+';
  953. duple[1] = *ep;
  954. duple[2] = 0;
  955. }
  956. h = dict(bp, ep);
  957. if(vflag==0 || h==0)
  958. return h;
  959. /*
  960. * when derivations are wanted, collect them
  961. * for printing
  962. */
  963. j = lev;
  964. prefcount = suffcount = 0;
  965. do {
  966. if(j<DSIZ && deriv[j].type) {
  967. strcat(affix, deriv[j].mesg);
  968. if(deriv[j].type == SUFF)
  969. suffcount++;
  970. else if(deriv[j].type != NONE)
  971. prefcount = deriv[j].type/PREF;
  972. }
  973. } while(--j > 0);
  974. return h;
  975. }
  976. int
  977. inun(char* bp, Bits h)
  978. {
  979. if(*bp == 'u')
  980. return Set(h, IN) == 0;
  981. /* *bp == 'i' */
  982. if(Set(h, IN) == 0)
  983. return 0;
  984. switch(bp[2]) {
  985. case 'r':
  986. return bp[1] == 'r';
  987. case 'm':
  988. case 'p':
  989. return bp[1] == 'm';
  990. }
  991. return bp[1] == 'n';
  992. }
  993. char*
  994. skipv(char *s)
  995. {
  996. if(s >= word && ISVOWEL(*s))
  997. s--;
  998. while(s >= word && !ISVOWEL(*s))
  999. s--;
  1000. return s;
  1001. }
  1002. /*
  1003. * crummy way to Britishise
  1004. */
  1005. void
  1006. ise(void)
  1007. {
  1008. Suftab *p;
  1009. int i;
  1010. for(i=0; i<26; i++)
  1011. for(p = suftab[i]; p->suf; p++) {
  1012. p->suf = ztos(p->suf);
  1013. p->d1 = ztos(p->d1);
  1014. p->a1 = ztos(p->a1);
  1015. }
  1016. }
  1017. char*
  1018. ztos(char *as)
  1019. {
  1020. char *s, *ds;
  1021. for(s=as; *s; s++)
  1022. if(*s == 'z')
  1023. goto copy;
  1024. return as;
  1025. copy:
  1026. ds = strdup(as);
  1027. for(s=ds; *s; s++)
  1028. if(*s == 'z')
  1029. *s = 's';
  1030. return ds;
  1031. }
/*
 * look up the characters from bp up to ep in the expanded
 * dictionary built by readdict.  returns encode[] of the entry's
 * affix code when the word is there, 0 when it is not, and NOUN
 * for anything shorter than two characters.  with -x every lookup
 * is reported on fd 2.
 */
Bits
dict(char* bp, char* ep)
{
	char *cp, *cp1, *w, *wp, *we;
	int n, f;

	w = bp;
	we = ep;
	n = ep-bp;
	if(n <= 1)
		return NOUN;
	/* the first two characters (7 bits each) select a slice of space[] */
	f = w[0] & 0x7f;
	f *= 128;
	f += w[1] & 0x7f;
	bp = spacep[f];
	ep = spacep[f+1];
	/* binary search of the slice between bp and ep */
loop:
	if(bp >= ep) {
		if(xflag)
			fprint(2, "=%.*s\n", utfnlen(w, n), w);
		return 0;
	}
	/*
	 * find the beginning of some word in the middle
	 */
	cp = bp + (ep-bp)/2;
	/* back up to a byte with the 0x80 bit set ... */
	while(cp > bp && !(*cp & 0x80))
		cp--;
	/* ... and further while the byte before it has the bit set too */
	while(cp > bp && (cp[-1] & 0x80))
		cp--;
	wp = w + 2;	/* skip two letters */
	cp1 = cp + 2;	/* skip affix code */
	/* compare the rest of the word with the entry; f becomes the sign */
	for(;;) {
		if(wp >= we) {
			/* word used up: a match only if the entry ends here too */
			if(*cp1 & 0x80)
				goto found;
			else
				f = 1;
			break;
		}
		if(*cp1 & 0x80) {
			/* entry ended first */
			f = -1;
			break;
		}
		f = *cp1++ - *wp++;
		if(f != 0)
			break;
	}
	if(f < 0) {
		/* continue above this entry: skip to the start of the next one */
		while(!(*cp1 & 0x80))
			cp1++;
		bp = cp1;
		goto loop;
	}
	ep = cp;
	goto loop;
found:
	/* the affix code is the low 3 bits of byte 0 and all of byte 1 */
	f = ((cp[0] & 0x7) << 8) |
		(cp[1] & 0xff);
	if(xflag) {
		fprint(2, "=%.*s ", utfnlen(w, n), w);
		typeprint(encode[f]);
	}
	return encode[f];
}
  1096. void
  1097. typeprint(Bits h)
  1098. {
  1099. pcomma("");
  1100. if(h & NOUN)
  1101. pcomma("n");
  1102. if(h & PROP_COLLECT)
  1103. pcomma("pc");
  1104. if(h & VERB) {
  1105. if((h & VERB) == VERB)
  1106. pcomma("v");
  1107. else
  1108. if((h & VERB) == V_IRREG)
  1109. pcomma("vi");
  1110. else
  1111. if(h & ED)
  1112. pcomma("ed");
  1113. }
  1114. if(h & ADJ)
  1115. pcomma("a");
  1116. if(h & COMP) {
  1117. if((h & COMP) == ACTOR)
  1118. pcomma("er");
  1119. else
  1120. pcomma("comp");
  1121. }
  1122. if(h & DONT_TOUCH)
  1123. pcomma("d");
  1124. if(h & N_AFFIX)
  1125. pcomma("na");
  1126. if(h & ADV)
  1127. pcomma("adv");
  1128. if(h & ION)
  1129. pcomma("ion");
  1130. if(h & V_AFFIX)
  1131. pcomma("va");
  1132. if(h & MAN)
  1133. pcomma("man");
  1134. if(h & NOPREF)
  1135. pcomma("nopref");
  1136. if(h & MONO)
  1137. pcomma("ms");
  1138. if(h & IN)
  1139. pcomma("in");
  1140. if(h & _Y)
  1141. pcomma("y");
  1142. if(h & STOP)
  1143. pcomma("s");
  1144. fprint(2, "\n");
  1145. }
  1146. void
  1147. pcomma(char *s)
  1148. {
  1149. static flag;
  1150. if(*s == 0) {
  1151. flag = 0;
  1152. return;
  1153. }
  1154. if(!flag) {
  1155. fprint(2, "%s", s);
  1156. flag = 1;
  1157. } else
  1158. fprint(2, ",%s", s);
  1159. }
  1160. /*
  1161. * is the word on of the following
  1162. * 12th teen
  1163. * 21st end in 1
  1164. * 23rd end in 3
  1165. * 77th default
  1166. * called knowing word[0] is a digit
  1167. */
  1168. int
  1169. ordinal(void)
  1170. {
  1171. char *cp = word;
  1172. static char sp[4];
  1173. while(ISDIGIT(*cp))
  1174. cp++;
  1175. strncpy(sp,cp,3);
  1176. if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
  1177. sp[0] = Tolower(cp[0]);
  1178. sp[1] = Tolower(cp[1]);
  1179. }
  1180. return 0 == strncmp(sp,
  1181. cp[-2]=='1'? "th": /* out of bounds if 1 digit */
  1182. *--cp=='1'? "st": /* harmless */
  1183. *cp=='2'? "nd":
  1184. *cp=='3'? "rd":
  1185. "th", 3);
  1186. }
  1187. /*
  1188. * read in the dictionary.
  1189. * format is
  1190. * {
  1191. * short nencode;
  1192. * long encode[nencode];
  1193. * char space[*];
  1194. * };
  1195. *
  1196. * the encodings are a table all different
  1197. * affixes.
  1198. * the dictionary proper has 2 bytes
  1199. * that demark and then the rest of the
  1200. * word. the 2 bytes have the following
  1201. * 0x80 0x00 flag
  1202. * 0x78 0x00 count of prefix bytes
  1203. * common with prev word
  1204. * 0x07 0xff affix code
  1205. *
  1206. * all ints are big endians in the file.
  1207. */
  1208. void
  1209. readdict(char *file)
  1210. {
  1211. char *s, *is, *lasts, *ls;
  1212. int c, i, sp, p;
  1213. int f;
  1214. long l;
  1215. lasts = 0;
  1216. f = open(file, 0);
  1217. if(f == -1) {
  1218. fprint(2, "cannot open %s\n", file);
  1219. exits("open");
  1220. }
  1221. if(read(f, space, 2) != 2)
  1222. goto bad;
  1223. nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
  1224. if(read(f, space, 4*nencode) != 4*nencode)
  1225. goto bad;
  1226. s = space;
  1227. for(i=0; i<nencode; i++) {
  1228. l = (long)(s[0] & 0xff) << 24;
  1229. l |= (s[1] & 0xff) << 16;
  1230. l |= (s[2] & 0xff) << 8;
  1231. l |= s[3] & 0xff;
  1232. encode[i] = (Bits)l;
  1233. s += 4;
  1234. }
  1235. l = read(f, space, sizeof(space));
  1236. if(l == sizeof(space))
  1237. goto noroom;
  1238. is = space + (sizeof(space) - l);
  1239. memmove(is, space, l);
  1240. s = space;
  1241. c = *is++ & 0xff;
  1242. sp = -1;
  1243. i = 0;
  1244. loop:
  1245. if(s > is)
  1246. goto noroom;
  1247. if(c < 0) {
  1248. close(f);
  1249. while(sp < 128*128)
  1250. spacep[++sp] = s;
  1251. *s = 0x80; /* fence */
  1252. return;
  1253. }
  1254. p = (c>>3) & 0xf;
  1255. *s++ = c;
  1256. *s++ = *is++ & 0xff;
  1257. if(p <= 0)
  1258. i = (*is++ & 0xff)*128;
  1259. if(p <= 1) {
  1260. if(!(*is & 0x80))
  1261. i = i/128*128 + (*is++ & 0xff);
  1262. if(i <= sp) {
  1263. fprint(2, "the dict isnt sorted or \n");
  1264. fprint(2, "memmove didn't work\n");
  1265. goto bad;
  1266. }
  1267. while(sp < i)
  1268. spacep[++sp] = s-2;
  1269. }
  1270. ls = lasts;
  1271. lasts = s;
  1272. for(p-=2; p>0; p--)
  1273. *s++ = *ls++;
  1274. for(;;) {
  1275. if(is >= space+sizeof(space)) {
  1276. c = -1;
  1277. break;
  1278. }
  1279. c = *is++ & 0xff;
  1280. if(c & 0x80)
  1281. break;
  1282. *s++ = c;
  1283. }
  1284. *s = 0;
  1285. goto loop;
  1286. bad:
  1287. fprint(2, "trouble reading %s\n", file);
  1288. exits("read");
  1289. noroom:
  1290. fprint(2, "not enough space for dictionary\n");
  1291. exits("space");
  1292. }