sprog.c 23 KB


  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include <u.h>
  10. #include <libc.h>
  11. #include <bio.h>
  12. #include <ctype.h>
  13. #include "code.h"
  14. /* fig leaves for possibly signed char quantities */
  /*
   * Character-class helpers. Each masks its argument with 0xff before
   * calling the <ctype.h> routine or indexing voweltab, so a negative
   * char value is never passed through.
   */
  15. #define ISUPPER(c) isupper((c)&0xff)
  16. #define ISLOWER(c) islower((c)&0xff)
  17. #define ISALPHA(c) isalpha((c)&0xff)
  18. #define ISDIGIT(c) isdigit((c)&0xff)
  19. #define ISVOWEL(c) voweltab[(c)&0xff]
  /* Tolower expands c more than once; do not pass expressions with side effects. */
  20. #define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c))
  /* pair packs two characters into one int (used for switch case labels). */
  21. #define pair(a,b) (((a)<<8) | (b))
  /* DLEV is added to the derivation level on every trysuff call. */
  22. #define DLEV 2
  /* DSIZ is the limit below which a derivation level is recorded in deriv[]. */
  23. #define DSIZ 40
  /* Bits is the bit-mask type returned by the dictionary lookup and rule routines. */
  24. typedef int32_t Bits;
  /* Set(h, f) is nonzero when h and f have at least one bit in common. */
  25. #define Set(h, f) ((int32_t)(h) & (f))
  26. Bits nop(char*, char*, char*, int, int);
  27. Bits strip(char*, char*, char*, int, int);
  28. Bits ize(char*, char*, char*, int, int);
  29. Bits i_to_y(char*, char*, char*, int, int);
  30. Bits ily(char*, char*, char*, int, int);
  31. Bits subst(char*, char*, char*, int, int);
  32. Bits CCe(char*, char*, char*, int, int);
  33. Bits tion(char*, char*, char*, int, int);
  34. Bits an(char*, char*, char*, int, int);
  35. Bits s(char*, char*, char*, int, int);
  36. Bits es(char*, char*, char*, int, int);
  37. Bits bility(char*, char*, char*, int, int);
  38. Bits y_to_e(char*, char*, char*, int, int);
  39. Bits VCe(char*, char*, char*, int, int);
  40. Bits trypref(char*, char*, int, int);
  41. Bits tryword(char*, char*, int, int);
  42. Bits trysuff(char*, int, int);
  43. Bits dict(char*, char*);
  44. void typeprint(Bits);
  45. void pcomma(char*);
  46. void ise(void);
  47. int ordinal(void);
  48. char* skipv(char*);
  49. int inun(char*, Bits);
  50. char* ztos(char*);
  51. void readdict(char*);
  /*
   * One prefix-table entry. Tables of these are terminated by an entry
   * whose s is 0 (see the loop in lookuppref).
   */
  52. typedef struct Ptab Ptab;
  53. struct Ptab
  54. {
  /* prefix text, compared against the start of the word */
  55. char* s;
  /* trypref tests this against IN to decide whether inun() must approve the prefix */
  56. int flag;
  57. };
  /*
   * One suffix rule, as consumed by trysuff(). A table of rules ends with
   * an entry whose suf is 0. A rule has a primary handler (p1, n1, d1, a1)
   * and an optional fallback (p2, n2, d2, a2) that is tried only when the
   * primary returns 0.
   */
  58. typedef struct Suftab Suftab;
  59. struct Suftab
  60. {
  /* suffix spelled backwards; trysuff compares it against the word from the end */
  61. char *suf;
  /* primary handler, called as p1(ep-n1, d1, a1, lev+1, flag|STOP) */
  62. Bits (*p1)(char*, char*, char*, int, int);
  /* number of trailing characters the primary handler's end pointer is moved back */
  63. int n1;
  /* "d" message argument handed to p1 */
  64. char *d1;
  /* "a" message argument handed to p1 */
  65. char *a1;
  /* class bits handed (with STOP or-ed in) to the handler as its flag argument */
  66. int flag;
  /* trysuff gives up (returns 0) unless this shares a bit with the caller's flag */
  67. int affixable;
  /* optional fallback handler; 0 when absent */
  68. Bits (*p2)(char*, char*, char*, int, int);
  69. int n2;
  70. char *d2;
  71. char *a2;
  72. };
  /* Suffix rules for words ending in 'a'; suffix strings are stored reversed. */
  73. Suftab staba[] = {
  74. {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
  75. 0
  76. };
  /*
   * Suffix rules for words ending in 'c'; suffix strings are stored reversed.
   * trysuff stops at the first rule whose suffix matches, so entry order matters.
   */
  77. Suftab stabc[] =
  78. {
  79. {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
  80. {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
  81. {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
  82. {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
  83. {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
  84. {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
  85. {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
  86. {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
  87. {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
  88. 0
  89. };
  /* Suffix rules for words ending in 'd'; suffix strings are stored reversed. */
  90. Suftab stabd[] =
  91. {
  /* "-ed": primary handler strips "d"; the fallback (i_to_y) strips "ed" */
  92. {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
  93. {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
  94. 0
  95. };
  /*
   * Suffix rules for words ending in 'e'; suffix strings are stored reversed.
   * Longer suffixes ("elbaif") come before the shorter ones they contain ("elba").
   */
  96. Suftab stabe[] =
  97. {
  98. /*
  99. * V_affix for comment ->commence->commentment??
  100. */
  101. {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
  102. {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
  103. {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
  104. {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
  105. {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
  /* ise() rewrites the 'z' in this entry to 's' when -b is given */
  106. {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
  107. {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
  108. 0
  109. };
  /*
   * Suffix rules for words ending in 'g'; suffix strings are stored reversed.
   * The general "gni" rule is last so the more specific endings are tried first.
   */
  110. Suftab stabg[] =
  111. {
  112. {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
  113. {"gnikam",strip,6,"","+making",NOUN,NOUN},
  114. {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
  115. {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
  116. 0
  117. };
  /* Suffix rules for words ending in 'l'; suffix strings are stored reversed. */
  118. Suftab stabl[] =
  119. {
  120. {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
  121. {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
  122. {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
  123. {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
  124. {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
  125. 0
  126. };
  127. Suftab stabm[] =
  128. {
  129. /* congregational + ism */
  130. {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
  131. {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
  132. 0
  133. };
  /*
   * Suffix rules for words ending in 'n'; suffix strings are stored reversed.
   * Longer endings precede the shorter ones they contain (e.g. "namow" before "nam").
   */
  134. Suftab stabn[] =
  135. {
  136. {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
  /* ise() rewrites the 'z' in this entry to 's' when -b is given */
  137. {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
  138. {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
  139. {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
  140. {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
  141. {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
  142. {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
  143. {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
  /* NOTE(review): message is "+man" although the suffix is "men" ("nemow" uses "+women") - confirm intent */
  144. {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
  145. {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
  146. 0
  147. };
  /* Suffix rules for words ending in 'p'; suffix strings are stored reversed. */
  148. Suftab stabp[] =
  149. {
  150. {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
  151. 0
  152. };
  /*
   * Suffix rules for words ending in 'r'; suffix strings are stored reversed.
   * Entries whose handler is nop match the ending and always return 0,
   * which keeps the later, more general rules from being applied to it.
   */
  153. Suftab stabr[] =
  154. {
  155. {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
  156. {"reyhparg",nop,0,"","",0,NOUN},
  157. {"reyl",nop,0,"","",0,NOUN},
  158. {"rekam",strip,5,"","+maker",NOUN,NOUN},
  159. {"repeek",strip,6,"","+keeper",NOUN,NOUN},
  160. {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
  161. {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
  162. {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
  163. {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
  164. 0
  165. };
  /*
   * Suffix rules for words ending in 's'; suffix strings are stored reversed.
   * "se" and "s'" are listed before the bare "s" rule so they are matched first.
   */
  166. Suftab stabs[] =
  167. {
  168. {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
  169. {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
  /* "-es": primary handler strips only the "s"; the fallback es() strips "es" */
  170. {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
  171. {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
  172. {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
  173. 0
  174. };
  /* Suffix rules for words ending in 't'; suffix strings are stored reversed. */
  175. Suftab stabt[] =
  176. {
  177. {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
  /* "-est": primary handler strips "st"; the fallback (i_to_y) strips "est" */
  178. {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
  179. {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
  180. {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
  181. 0
  182. };
  /*
   * Suffix rules for words ending in 'y'; suffix strings are stored reversed.
   * Entries whose handler is nop match the ending and always return 0.
   * Initialisers that stop early leave the remaining members zero.
   */
  183. Suftab staby[] =
  184. {
  185. {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
  186. {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
  187. {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
  /* NOTE(review): NOUN lands in the flag slot here and affixable is 0, unlike "ytilb" below - confirm intent */
  188. {"ytisuo",nop,0,"","",NOUN},
  189. {"ytilb",nop,0,"","",0,NOUN},
  190. {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
  191. {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
  192. {"ylc",nop,0,"","",0},
  193. {"ylelb",nop,0,"","",0},
  194. {"ylelp",nop,0,"","",0},
  195. {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
  196. {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
  197. {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
  198. 0
  199. };
  /* Empty rule table; suftab uses it for every final letter that has no suffix rules. */
  200. Suftab stabz[] =
  201. {
  202. 0
  203. };
  /*
   * Suffix rule tables indexed by (last letter of the word) - 'a'.
   * There are 26 slots, 'a' through 'z'; letters without rules share stabz.
   */
  204. Suftab* suftab[] =
  205. {
  206. staba,
  207. stabz,
  208. stabc,
  209. stabd,
  210. stabe,
  211. stabz,
  212. stabg,
  213. stabz,
  214. stabz,
  215. stabz,
  216. stabz,
  217. stabl,
  218. stabm,
  219. stabn,
  220. stabz,
  221. stabp,
  222. stabz,
  223. stabr,
  224. stabs,
  225. stabt,
  226. stabz,
  227. stabz,
  228. stabz,
  229. stabz,
  230. staby,
  231. stabz,
  232. };
  /*
   * Prefix tables, one per initial letter (ptaba .. ptabz). Each entry is a
   * prefix string and a flag; a 0 entry ends the table. lookuppref scans a
   * table in order and accepts the first prefix that matches, so a longer
   * prefix must be listed before any shorter prefix it begins with.
   * Entries flagged IN are additionally checked by inun() in trypref.
   */
  233. Ptab ptaba[] =
  234. {
  235. "anti", 0,
  236. "auto", 0,
  237. 0
  238. };
  239. Ptab ptabb[] =
  240. {
  241. "bio", 0,
  242. 0
  243. };
  244. Ptab ptabc[] =
  245. {
  246. "counter", 0,
  247. 0
  248. };
  249. Ptab ptabd[] =
  250. {
  251. "dis", 0,
  252. 0
  253. };
  254. Ptab ptabe[] =
  255. {
  256. "electro", 0,
  257. 0
  258. };
  259. Ptab ptabf[] =
  260. {
  261. "femto", 0,
  262. 0
  263. };
  264. Ptab ptabg[] =
  265. {
  266. "geo", 0,
  267. "giga", 0,
  268. 0
  269. };
  270. Ptab ptabh[] =
  271. {
  272. "hyper", 0,
  273. 0
  274. };
  /* "immuno" precedes "im", and "intra"/"inter" precede "in" */
  275. Ptab ptabi[] =
  276. {
  277. "immuno", 0,
  278. "im", IN,
  279. "intra", 0,
  280. "inter", 0,
  281. "in", IN,
  282. "ir", IN,
  283. "iso", 0,
  284. 0
  285. };
  286. Ptab ptabj[] =
  287. {
  288. 0
  289. };
  290. Ptab ptabk[] =
  291. {
  292. "kilo", 0,
  293. 0
  294. };
  295. Ptab ptabl[] =
  296. {
  297. 0
  298. };
  299. Ptab ptabm[] =
  300. {
  301. "magneto", 0,
  302. "mega", 0,
  303. "meta", 0,
  304. "micro", 0,
  305. "mid", 0,
  306. "milli", 0,
  307. "mini", 0,
  308. "mis", 0,
  309. "mono", 0,
  310. "multi", 0,
  311. 0
  312. };
  313. Ptab ptabn[] =
  314. {
  315. "nano", 0,
  316. "neuro", 0,
  317. "non", 0,
  318. 0
  319. };
  320. Ptab ptabo[] =
  321. {
  322. "out", 0,
  323. "over", 0,
  324. 0
  325. };
  326. Ptab ptabp[] =
  327. {
  328. "para", 0,
  329. "photo", 0,
  330. "pico", 0,
  331. "poly", 0,
  332. "pre", 0,
  333. "pseudo", 0,
  334. "psycho", 0,
  335. 0
  336. };
  337. Ptab ptabq[] =
  338. {
  339. "quasi", 0,
  340. 0
  341. };
  342. Ptab ptabr[] =
  343. {
  344. "radio", 0,
  345. "re", 0,
  346. 0
  347. };
  348. Ptab ptabs[] =
  349. {
  350. "semi", 0,
  351. "stereo", 0,
  352. "sub", 0,
  353. "super", 0,
  354. 0
  355. };
  356. Ptab ptabt[] =
  357. {
  358. "tele", 0,
  359. "tera", 0,
  360. "thermo", 0,
  361. 0
  362. };
  363. Ptab ptabu[] =
  364. {
  365. "ultra", 0,
  366. "under", 0, /*must precede un*/
  367. "un", IN,
  368. 0
  369. };
  370. Ptab ptabv[] =
  371. {
  372. 0
  373. };
  374. Ptab ptabw[] =
  375. {
  376. 0
  377. };
  378. Ptab ptabx[] =
  379. {
  380. 0
  381. };
  382. Ptab ptaby[] =
  383. {
  384. 0
  385. };
  386. Ptab ptabz[] =
  387. {
  388. 0
  389. };
  /*
   * Prefix tables indexed by (lower-cased first letter of the word) - 'a';
   * see lookuppref. There is one slot for each of the 26 letters.
   */
  390. Ptab* preftab[] =
  391. {
  392. ptaba,
  393. ptabb,
  394. ptabc,
  395. ptabd,
  396. ptabe,
  397. ptabf,
  398. ptabg,
  399. ptabh,
  400. ptabi,
  401. ptabj,
  402. ptabk,
  403. ptabl,
  404. ptabm,
  405. ptabn,
  406. ptabo,
  407. ptabp,
  408. ptabq,
  409. ptabr,
  410. ptabs,
  411. ptabt,
  412. ptabu,
  413. ptabv,
  414. ptabw,
  415. ptabx,
  416. ptaby,
  417. ptabz,
  418. };
  /*
   * One step of a derivation. mesg is the text tryword appends to affix[];
   * type is NONE, SUFF, or a multiple of PREF (trypref adds PREF once per
   * prefix it strips).
   */
  419. typedef struct {
  420. char *mesg;
  421. enum { NONE, SUFF, PREF} type;
  422. } Deriv;
  /* option flags, incremented by main for -a, -c/-C, -f, -v/-C, -x */
  423. int aflag;
  424. int cflag;
  425. int fflag;
  426. int vflag;
  427. int xflag;
  /* nflag is not referenced by any code visible in this file */
  428. int nflag;
  /* working copy of the current input word; the rule routines edit it in place */
  429. char word[500];
  /* the input word as read (points into the Brdline buffer) */
  430. char* original;
  /* all-zero Deriv used to clear deriv[] slots */
  431. Deriv emptyderiv;
  /* derivation steps indexed by level; the 3 extra slots absorb the lev+1 / lev+2 accesses */
  432. Deriv deriv[DSIZ+3];
  433. char affix[DSIZ*10]; /* 10 is longest affix message */
  /* counts computed by tryword when -v is in effect */
  434. int prefcount;
  435. int suffcount;
  /* with -a: the part of the input line before the word, echoed back on output */
  436. char* acmeid;
  437. char space[300000]; /* must be as large as "words"+"space" in pcode run */
  438. Bits encode[2048]; /* must be as long as "codes" in pcode run */
  /* number of entries of encode[] read from the dictionary file */
  439. int nencode;
  /* voweltab[c] is 1 for a, e, i, o, u, y in either case; filled in by main */
  440. char voweltab[256];
  441. char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
  /* buffered standard input and output */
  442. Biobuf bin;
  443. Biobuf bout;
  /* default dictionary, and the one selected by -b */
  444. char* codefile = "/sys/lib/amspell";
  445. char* brfile = "/sys/lib/brspell";
  /* exit status string used for usage errors */
  446. char* Usage = "usage";
  447. void
  448. main(int argc, char *argv[])
  449. {
  450. char *ep, *cp;
  451. char *dp;
  452. int j, i, c;
  453. int low;
  454. Bits h;
  455. Binit(&bin, 0, OREAD);
  456. Binit(&bout, 1, OWRITE);
  457. for(i=0; c = "aeiouyAEIOUY"[i]; i++)
  458. voweltab[c] = 1;
  459. while(argc > 1) {
  460. if(argv[1][0] != '-')
  461. break;
  462. for(i=1; c = argv[1][i]; i++)
  463. switch(c) {
  464. default:
  465. fprint(2, "usage: spell [-bcCvx] [-f file]\n");
  466. exits(Usage);
  467. case 'a':
  468. aflag++;
  469. continue;
  470. case 'b':
  471. ise();
  472. if(!fflag)
  473. codefile = brfile;
  474. continue;
  475. case 'C': /* for "correct" */
  476. vflag++;
  477. case 'c': /* for ocr */
  478. cflag++;
  479. continue;
  480. case 'v':
  481. vflag++;
  482. continue;
  483. case 'x':
  484. xflag++;
  485. continue;
  486. case 'f':
  487. if(argc <= 2) {
  488. fprint(2, "spell: -f requires another argument\n");
  489. exits(Usage);
  490. }
  491. argv++;
  492. argc--;
  493. codefile = argv[1];
  494. fflag++;
  495. goto brk;
  496. }
  497. brk:
  498. argv++;
  499. argc--;
  500. }
  501. readdict(codefile);
  502. if(argc > 1) {
  503. fprint(2, "usage: spell [-bcCvx] [-f file]\n");
  504. exits(Usage);
  505. }
  506. if(aflag)
  507. cflag = vflag = 0;
  508. for(;;) {
  509. affix[0] = 0;
  510. original = Brdline(&bin, '\n');
  511. if(original == 0)
  512. exits(0);
  513. original[Blinelen(&bin)-1] = 0;
  514. low = 0;
  515. if(aflag) {
  516. acmeid = original;
  517. while(*original != ':')
  518. if(*original++ == 0)
  519. exits(0);
  520. while(*++original != ':')
  521. if(*original == 0)
  522. exits(0);
  523. *original++ = 0;
  524. }
  525. for(ep=word,dp=original; j = *dp; ep++,dp++) {
  526. if(ISLOWER(j))
  527. low++;
  528. if(ep >= word+sizeof(word)-1)
  529. break;
  530. *ep = j;
  531. }
  532. *ep = 0;
  533. if(ISDIGIT(word[0]) && ordinal())
  534. continue;
  535. h = 0;
  536. if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
  537. for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
  538. *dp = Tolower(*cp);
  539. if(!h)
  540. for(;;) { /* at most twice */
  541. if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
  542. break;
  543. if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
  544. break;
  545. if(!ISUPPER(word[0]))
  546. break;
  547. cp = original;
  548. dp = word;
  549. while(*dp = *cp++) {
  550. if(!low)
  551. *dp = Tolower(*dp);
  552. dp++;
  553. }
  554. word[0] = Tolower(word[0]);
  555. }
  556. if(cflag) {
  557. if(!h || Set(h,STOP))
  558. print("-");
  559. else if(!vflag)
  560. print("+");
  561. else
  562. print("%c",'0' + (suffcount>0) +
  563. (prefcount>4? 8: 2*prefcount));
  564. } else if(!h || Set(h,STOP)) {
  565. if(aflag)
  566. Bprint(&bout, "%s:%s\n", acmeid, original);
  567. else
  568. Bprint(&bout, "%s\n", original);
  569. } else if(affix[0] != 0 && affix[0] != '.')
  570. print("%s\t%s\n", affix, original);
  571. }
  572. /* not reached */
  573. }
  574. /* strip exactly one suffix and do
  575. * indicated routine(s), which may recursively
  576. * strip suffixes
  577. */
  578. Bits
  579. trysuff(char* ep, int lev, int flag)
  580. {
  581. Suftab *t;
  582. char *cp, *sp;
  583. Bits h = 0;
  584. int initchar = ep[-1];
  585. flag &= ~MONO;
  586. lev += DLEV;
  587. if(lev < DSIZ) {
  588. deriv[lev] = emptyderiv;
  589. deriv[lev-1] = emptyderiv;
  590. }
  591. if(!ISLOWER(initchar))
  592. return h;
  593. for(t=suftab[initchar-'a']; sp=t->suf; t++) {
  594. cp = ep;
  595. while(*sp)
  596. if(*--cp != *sp++)
  597. goto next;
  598. for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
  599. ;
  600. if(sp < word)
  601. continue;
  602. if(!(t->affixable & flag))
  603. return 0;
  604. h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
  605. if(!h && t->p2!=0) {
  606. if(lev < DSIZ) {
  607. deriv[lev] = emptyderiv;
  608. deriv[lev+1] = emptyderiv;
  609. }
  610. h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
  611. }
  612. break;
  613. next:;
  614. }
  615. return h;
  616. }
  /*
   * Rule handler that rejects unconditionally: it ignores every argument
   * and returns 0. Used by table entries that exist only to stop a more
   * general rule from matching the same ending.
   */
  617. Bits
  618. nop(char* ep, char* d, char* a, int lev, int flag)
  619. {
  620. USED(ep, d, a, lev, flag);
  621. return 0;
  622. }
  623. Bits
  624. cstrip(char* ep, char* d, char* a, int lev, int flag)
  625. {
  626. int temp = ep[0];
  627. if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
  628. switch(pair(ep[-1],ep[0])) {
  629. case pair('a', 'a'):
  630. case pair('a', 'e'):
  631. case pair('a', 'i'):
  632. case pair('e', 'a'):
  633. case pair('e', 'e'):
  634. case pair('e', 'i'):
  635. case pair('i', 'i'):
  636. case pair('o', 'a'):
  637. return 0;
  638. }
  639. } else
  640. if(temp==ep[-1]&&temp==ep[-2])
  641. return 0;
  642. return strip(ep,d,a,lev,flag);
  643. }
  /*
   * Basic rule handler: accept the stem word..ep if it can be found
   * (possibly after prefix stripping), else retry with a doubled final
   * consonant undone, else strip another suffix. d is unused.
   * Returns the bits of the accepted form, or 0.
   */
  644. Bits
  645. strip(char* ep, char* d, char* a, int lev, int flag)
  646. {
  647. Bits h = trypref(ep, a, lev, flag);
  648. USED(d);
  /* discard a MONO hit when the suffix starts with a vowel and a vowel sits two letters before it */
  649. if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
  650. h = 0;
  651. if(h)
  652. return h;
  /* vowel-initial suffix after a doubled consonant: look the stem up one letter shorter, demanding MONO */
  653. if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
  654. h = trypref(ep-1,a,lev,flag|MONO);
  655. if(h)
  656. return h;
  657. }
  /* last resort: the stem may itself carry a suffix */
  658. return trysuff(ep,lev,flag);
  659. }
  /*
   * Handler for the plain "-s" endings. Refuses when called deeper than
   * level DLEV+1, and refuses an "s" added directly after x, z, s, ch, sh,
   * or after consonant+y in a word that does not start with a capital.
   * Everything else is passed to strip().
   */
  660. Bits
  661. s(char* ep, char* d, char* a, int lev, int flag)
  662. {
  663. if(lev > DLEV+1)
  664. return 0;
  665. if(*ep=='s') {
  666. switch(ep[-1]) {
  667. case 'y':
  668. if(ISVOWEL(ep[-2])||ISUPPER(*word))
  669. break; /*says Kennedys*/
  /* consonant+y in a lower-case word falls through to the rejection below */
  670. case 'x':
  671. case 'z':
  672. case 's':
  673. return 0;
  674. case 'h':
  675. switch(ep[-2]) {
  676. case 'c':
  677. case 's':
  678. return 0;
  679. }
  680. }
  681. }
  682. return strip(ep,d,a,lev,flag);
  683. }
  684. Bits
  685. an(char* ep, char* d, char* a, int lev, int flag)
  686. {
  687. USED(d);
  688. if(!ISUPPER(*word)) /*must be proper name*/
  689. return 0;
  690. return trypref(ep,a,lev,flag);
  691. }
  692. Bits
  693. ize(char* ep, char* d, char* a, int lev, int flag)
  694. {
  695. int temp = ep[-1];
  696. Bits h;
  697. USED(a);
  698. ep[-1] = 'e';
  699. h = strip(ep,"",d,lev,flag);
  700. ep[-1] = temp;
  701. return h;
  702. }
  703. Bits
  704. y_to_e(char* ep, char* d, char* a, int lev, int flag)
  705. {
  706. Bits h;
  707. int temp;
  708. USED(a);
  709. switch(ep[-1]) {
  710. case 'a':
  711. case 'e':
  712. case 'i':
  713. return 0;
  714. }
  715. temp = *ep;
  716. *ep++ = 'e';
  717. h = strip(ep,"",d,lev,flag);
  718. ep[-1] = temp;
  719. return h;
  720. }
  /*
   * Handler for endings such as -ly, -ness, -ful, -hood, -less.
   * Rejects a tripled letter at the join and a consonant+y stem that has
   * an earlier vowel; an i-final stem goes to i_to_y(), the rest to cstrip().
   */
  721. Bits
  722. ily(char* ep, char* d, char* a, int lev, int flag)
  723. {
  724. int temp = ep[0];
  725. char *cp = ep;
  726. if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
  727. return 0;
  /* cp steps back once for the 'y' test and, only if that holds, once more for the consonant test */
  728. if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
  729. while(cp>word)
  730. if(ISVOWEL(*--cp)) /* shyness */
  731. return 0;
  732. if(ep[-1]=='i')
  733. return i_to_y(ep,d,a,lev,flag);
  734. return cstrip(ep,d,a,lev,flag);
  735. }
  736. Bits
  737. bility(char* ep, char* d, char* a, int lev, int flag)
  738. {
  739. *ep++ = 'l';
  740. return y_to_e(ep,d,a,lev,flag);
  741. }
  /*
   * Try the stem with a final 'i' turned back into 'y'. Never applied to
   * words starting with a capital letter. When the stem ends in
   * consonant+'i' the 'i' is replaced and d becomes the affix message;
   * otherwise the stem is tried unchanged with message a.
   * The last stem letter is restored before returning.
   */
  742. Bits
  743. i_to_y(char* ep, char* d, char* a, int lev, int flag)
  744. {
  745. Bits h;
  746. int temp;
  747. if(ISUPPER(*word))
  748. return 0;
  749. if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
  750. ep[-1] = 'y';
  751. a = d;
  752. }
  753. h = cstrip(ep,"",a,lev,flag);
  /* temp was saved in the condition above whether or not the letter was changed */
  754. ep[-1] = temp;
  755. return h;
  756. }
  757. Bits
  758. es(char* ep, char* d, char* a, int lev, int flag)
  759. {
  760. if(lev>DLEV)
  761. return 0;
  762. switch(ep[-1]) {
  763. default:
  764. return 0;
  765. case 'i':
  766. return i_to_y(ep,d,a,lev,flag);
  767. case 'h':
  768. switch(ep[-2]) {
  769. default:
  770. return 0;
  771. case 'c':
  772. case 's':
  773. break;
  774. }
  775. case 's':
  776. case 'z':
  777. case 'x':
  778. return strip(ep,d,a,lev,flag);
  779. }
  780. }
  /*
   * Substitution handler. d has the form "-old+new": the letters of "old"
   * are written over the characters just before ep, the result is tried
   * with strip(), and then the letters of "new" are written back starting
   * at the same place. a is unused.
   * Refused when skipv(skipv(ep-1)) runs off the front of word.
   */
  781. Bits
  782. subst(char* ep, char* d, char* a, int lev, int flag)
  783. {
  784. char *u,*t;
  785. Bits h;
  786. USED(a);
  787. if(skipv(skipv(ep-1)) < word)
  788. return 0;
  /* find the '+' that separates "old" from "new" */
  789. for(t=d; *t!='+'; t++)
  790. continue;
  /* copy "old" backwards so that it ends immediately before ep; u ends at its first letter */
  791. for(u=ep; *--t!='-';)
  792. *--u = *t;
  793. h = strip(ep,"",d,lev,flag);
  /* t is at '-' here: advance to '+' again, then write "new" back from u */
  794. while(*++t != '+')
  795. continue;
  796. while(*++t)
  797. *u++ = *t;
  798. return h;
  799. }
  800. Bits
  801. tion(char* ep, char* d, char* a, int lev, int flag)
  802. {
  803. switch(ep[-2]) {
  804. default:
  805. return trypref(ep,a,lev,flag);
  806. case 'a':
  807. case 'e':
  808. case 'i':
  809. case 'o':
  810. case 'u':
  811. return y_to_e(ep,d,a,lev,flag);
  812. }
  813. }
  814. /*
  815. * possible consonant-consonant-e ending
  816. */
  /*
   * Depending on the last letter(s) of the stem, either try it with an 'e'
   * appended (y_to_e) or hand it on to VCe(). The switch relies on
   * fallthrough between the case groups; cases that break out reach VCe().
   */
  817. Bits
  818. CCe(char* ep, char* d, char* a, int lev, int flag)
  819. {
  820. Bits h;
  821. switch(ep[-1]) {
  822. case 'l':
  823. if(ISVOWEL(ep[-2]))
  824. break;
  825. switch(ep[-2]) {
  826. case 'l':
  827. case 'r':
  828. case 'w':
  829. break;
  830. default:
  831. return y_to_e(ep,d,a,lev,flag);
  832. }
  833. break;
  834. case 'c':
  835. case 'g':
  836. if(*ep == 'a') /* prevent -able for -eable */
  837. return 0;
  /* falls through into the s/v/z checks */
  838. case 's':
  839. case 'v':
  840. case 'z':
  841. if(ep[-2]==ep[-1])
  842. break;
  843. if(ISVOWEL(ep[-2]))
  844. break;
  /* falls through: consonant before c/g/s/v/z is treated like the 'u' case */
  845. case 'u':
  846. if(h = y_to_e(ep,d,a,lev,flag))
  847. return h;
  /* only an "ng" stem goes on to VCe() after the 'e' attempt fails */
  848. if(!(ep[-2]=='n' && ep[-1]=='g'))
  849. return 0;
  850. }
  851. return VCe(ep,d,a,lev,flag);
  852. }
  853. /*
  854. * possible consonant-vowel-consonant-e ending
  855. */
  /*
   * A stem ending in 'e' is refused. For a stem ending vowel+consonant the
   * suffix's first letter is temporarily replaced by 'e' and the longer
   * stem is tried with trypref() and then trysuff(); on failure the letter
   * is put back. The final attempt is cstrip() on the unmodified stem.
   */
  856. Bits
  857. VCe(char* ep, char* d, char* a, int lev, int flag)
  858. {
  859. int c;
  860. Bits h;
  861. c = ep[-1];
  862. if(c=='e')
  863. return 0;
  864. if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
  /* c is reused to hold the overwritten letter */
  865. c = *ep;
  866. *ep++ = 'e';
  867. h = trypref(ep,d,lev,flag);
  868. if(!h)
  869. h = trysuff(ep,lev,flag);
  /* NOTE(review): on success the 'e' is left in word[]; only the failure path restores it */
  870. if(h)
  871. return h;
  872. ep--;
  873. *ep = c;
  874. }
  875. return cstrip(ep,d,a,lev,flag);
  876. }
  /*
   * Find a prefix at *wp. The table is chosen by the lower-cased first
   * letter; the first entry that matches the text at *wp and leaves at
   * least one vowel between the end of the prefix and ep is accepted.
   * On success *wp is advanced past the prefix and the entry is returned;
   * otherwise *wp is left alone and 0 is returned.
   */
  877. Ptab*
  878. lookuppref(uint8_t** wp, char* ep)
  879. {
  880. Ptab *sp;
  881. uint8_t *bp,*cp;
  882. unsigned int initchar = Tolower(**wp);
  883. if(!ISALPHA(initchar))
  884. return 0;
  885. for(sp=preftab[initchar-'a'];sp->s;sp++) {
  886. bp = *wp;
  /* compare the prefix text with the word; only the table index was case-folded */
  887. for(cp= (uint8_t*)sp->s;*cp; )
  888. if(*bp++!=*cp++)
  889. goto next;
  /* require a vowel in what remains of the word */
  890. for(cp=bp;cp<(uint8_t*)ep;cp++)
  891. if(ISVOWEL(*cp)) {
  892. *wp = bp;
  893. return sp;
  894. }
  895. next:;
  896. }
  897. return 0;
  898. }
  899. /* while word is not in dictionary try stripping
  900. * prefixes. Fail if no more prefixes.
  901. */
  902. Bits
  903. trypref(char* ep, char* a, int lev, int flag)
  904. {
  905. Ptab *tp;
  906. char *bp, *cp;
  907. char *pp;
  908. Bits h;
  909. char space[20];
  910. if(lev<DSIZ) {
  911. deriv[lev].mesg = a;
  912. deriv[lev].type = *a=='.'? NONE: SUFF;
  913. }
  914. if(h = tryword(word,ep,lev,flag)) {
  915. if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
  916. return h;
  917. h = 0;
  918. }
  919. bp = word;
  920. pp = space;
  921. if(lev<DSIZ) {
  922. deriv[lev+1].mesg = pp;
  923. deriv[lev+1].type = 0;
  924. }
  925. while(tp=lookuppref((uint8_t**)&bp,ep)) {
  926. *pp++ = '+';
  927. cp = tp->s;
  928. while(pp<space+sizeof(space) && (*pp = *cp++))
  929. pp++;
  930. deriv[lev+1].type += PREF;
  931. h = tryword(bp,ep,lev+1,flag);
  932. if(Set(h,NOPREF) ||
  933. ((tp->flag&IN) && inun(bp-2,h)==0)) {
  934. h = 0;
  935. break;
  936. }
  937. if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
  938. break;
  939. h = 0;
  940. }
  941. if(lev < DSIZ) {
  942. deriv[lev+1] = emptyderiv;
  943. deriv[lev+2] = emptyderiv;
  944. }
  945. return h;
  946. }
  /*
   * Look bp..ep up in the dictionary. Candidates of one character or less
   * return 0 without a lookup. With MONO in flag an extra derivation step
   * "+c" is recorded, c being the character at ep. When -v is in effect
   * and the lookup succeeds, the messages of all recorded steps from lev
   * down to 1 (down to 0 when lev is 0) are appended to affix[] and
   * prefcount/suffcount are recomputed.
   * Returns the dictionary bits, or 0 when the word is absent.
   */
  947. Bits
  948. tryword(char* bp, char* ep, int lev, int flag)
  949. {
  950. int j;
  951. Bits h = 0;
  952. char duple[3];
  953. if(ep-bp <= 1)
  954. return h;
  955. if(flag&MONO) {
  956. if(lev<DSIZ) {
  /* NOTE(review): duple is a local array; deriv[] still points at it after this function returns */
  957. deriv[++lev].mesg = duple;
  958. deriv[lev].type = SUFF;
  959. }
  960. duple[0] = '+';
  961. duple[1] = *ep;
  962. duple[2] = 0;
  963. }
  964. h = dict(bp, ep);
  965. if(vflag==0 || h==0)
  966. return h;
  967. /*
  968. * when derivations are wanted, collect them
  969. * for printing
  970. */
  971. j = lev;
  972. prefcount = suffcount = 0;
  973. do {
  974. if(j<DSIZ && deriv[j].type) {
  /* NOTE(review): strcat is not bounded by sizeof(affix) */
  975. strcat(affix, deriv[j].mesg);
  976. if(deriv[j].type == SUFF)
  977. suffcount++;
  978. else if(deriv[j].type != NONE)
  /* prefix entries hold PREF times the number of prefixes stripped */
  979. prefcount = deriv[j].type/PREF;
  980. }
  981. } while(--j > 0);
  982. return h;
  983. }
  984. int
  985. inun(char* bp, Bits h)
  986. {
  987. if(*bp == 'u')
  988. return Set(h, IN) == 0;
  989. /* *bp == 'i' */
  990. if(Set(h, IN) == 0)
  991. return 0;
  992. switch(bp[2]) {
  993. case 'r':
  994. return bp[1] == 'r';
  995. case 'm':
  996. case 'p':
  997. return bp[1] == 'm';
  998. }
  999. return bp[1] == 'n';
  1000. }
  1001. char*
  1002. skipv(char *s)
  1003. {
  1004. if(s >= word && ISVOWEL(*s))
  1005. s--;
  1006. while(s >= word && !ISVOWEL(*s))
  1007. s--;
  1008. return s;
  1009. }
  1010. /*
  1011. * crummy way to Britishise
  1012. */
  1013. void
  1014. ise(void)
  1015. {
  1016. Suftab *p;
  1017. int i;
  1018. for(i=0; i<26; i++)
  1019. for(p = suftab[i]; p->suf; p++) {
  1020. p->suf = ztos(p->suf);
  1021. p->d1 = ztos(p->d1);
  1022. p->a1 = ztos(p->a1);
  1023. }
  1024. }
  /*
   * Return a version of as in which every 'z' is replaced by 's'.
   *
   * A string without 'z' is returned as is (same pointer). Otherwise a
   * fresh copy is allocated and edited; the argument itself is never
   * modified, so string literals are safe to pass. If the allocation
   * fails the original string is returned unchanged instead of writing
   * through a null pointer.
   */
  char*
  ztos(char *as)
  {
      char *s, *ds;
      long n;

      for(s=as; *s; s++)
          if(*s == 'z')
              break;
      if(*s == 0)
          return as;      /* nothing to change */

      n = strlen(as);
      ds = malloc(n+1);
      if(ds == 0)
          return as;      /* out of memory: keep the unconverted spelling */
      memmove(ds, as, n+1);
      for(s=ds; *s; s++)
          if(*s == 'z')
              *s = 's';
      return ds;
  }
  /*
   * Look the word bp..ep up in the in-memory dictionary built by readdict().
   * Words of one character or less are reported as NOUN without a lookup.
   * The first two characters select a slice of space[] through spacep[];
   * the slice is then binary-searched on the remaining characters.
   * Returns the encode[] entry for the word's affix code, or 0 when the
   * word is absent. With -x each lookup is traced on standard error.
   */
  1040. Bits
  1041. dict(char* bp, char* ep)
  1042. {
  1043. char *cp, *cp1, *w, *wp, *we;
  1044. int n, f;
  1045. w = bp;
  1046. we = ep;
  1047. n = ep-bp;
  1048. if(n <= 1)
  1049. return NOUN;
  /* index into spacep[]: 7 bits from each of the first two characters */
  1050. f = w[0] & 0x7f;
  1051. f *= 128;
  1052. f += w[1] & 0x7f;
  /* from here on bp and ep delimit the dictionary slice, not the word */
  1053. bp = spacep[f];
  1054. ep = spacep[f+1];
  1055. loop:
  1056. if(bp >= ep) {
  1057. if(xflag)
  1058. fprint(2, "=%.*s\n", utfnlen(w, n), w);
  1059. return 0;
  1060. }
  1061. /*
  1062. * find the beginning of some word in the middle
  1063. */
  1064. cp = bp + (ep-bp)/2;
  /* back up to a byte with the 0x80 bit set, then over any preceding bytes that also have it */
  1065. while(cp > bp && !(*cp & 0x80))
  1066. cp--;
  1067. while(cp > bp && (cp[-1] & 0x80))
  1068. cp--;
  1069. wp = w + 2; /* skip two letters */
  1070. cp1 = cp + 2; /* skip affix code */
  /* compare the rest of the word with the entry; f ends up <0, >0, or we jump to found */
  1071. for(;;) {
  1072. if(wp >= we) {
  1073. if(*cp1 & 0x80)
  1074. goto found;
  1075. else
  1076. f = 1;
  1077. break;
  1078. }
  1079. if(*cp1 & 0x80) {
  1080. f = -1;
  1081. break;
  1082. }
  1083. f = *cp1++ - *wp++;
  1084. if(f != 0)
  1085. break;
  1086. }
  /* entry sorts before the word: continue after it */
  1087. if(f < 0) {
  1088. while(!(*cp1 & 0x80))
  1089. cp1++;
  1090. bp = cp1;
  1091. goto loop;
  1092. }
  /* entry sorts after the word: continue before it */
  1093. ep = cp;
  1094. goto loop;
  1095. found:
  /* affix code: low 3 bits of the first header byte, then the whole second byte */
  1096. f = ((cp[0] & 0x7) << 8) |
  1097. (cp[1] & 0xff);
  1098. if(xflag) {
  1099. fprint(2, "=%.*s ", utfnlen(w, n), w);
  1100. typeprint(encode[f]);
  1101. }
  1102. return encode[f];
  1103. }
  1104. void
  1105. typeprint(Bits h)
  1106. {
  1107. pcomma("");
  1108. if(h & NOUN)
  1109. pcomma("n");
  1110. if(h & PROP_COLLECT)
  1111. pcomma("pc");
  1112. if(h & VERB) {
  1113. if((h & VERB) == VERB)
  1114. pcomma("v");
  1115. else
  1116. if((h & VERB) == V_IRREG)
  1117. pcomma("vi");
  1118. else
  1119. if(h & ED)
  1120. pcomma("ed");
  1121. }
  1122. if(h & ADJ)
  1123. pcomma("a");
  1124. if(h & COMP) {
  1125. if((h & COMP) == ACTOR)
  1126. pcomma("er");
  1127. else
  1128. pcomma("comp");
  1129. }
  1130. if(h & DONT_TOUCH)
  1131. pcomma("d");
  1132. if(h & N_AFFIX)
  1133. pcomma("na");
  1134. if(h & ADV)
  1135. pcomma("adv");
  1136. if(h & ION)
  1137. pcomma("ion");
  1138. if(h & V_AFFIX)
  1139. pcomma("va");
  1140. if(h & MAN)
  1141. pcomma("man");
  1142. if(h & NOPREF)
  1143. pcomma("nopref");
  1144. if(h & MONO)
  1145. pcomma("ms");
  1146. if(h & IN)
  1147. pcomma("in");
  1148. if(h & _Y)
  1149. pcomma("y");
  1150. if(h & STOP)
  1151. pcomma("s");
  1152. fprint(2, "\n");
  1153. }
  /*
   * Print one item of a comma-separated list on standard error.
   * An empty string starts a new list and prints nothing; the first item
   * after that is printed bare, every later one with a leading comma.
   */
  void
  pcomma(char *s)
  {
      static int flag;    /* nonzero once the current list has an item */

      if(*s == 0) {
          flag = 0;
          return;
      }
      if(!flag) {
          fprint(2, "%s", s);
          flag = 1;
      } else
          fprint(2, ",%s", s);
  }
  1168. /*
  1169. * is the word on of the following
  1170. * 12th teen
  1171. * 21st end in 1
  1172. * 23rd end in 3
  1173. * 77th default
  1174. * called knowing word[0] is a digit
  1175. */
  1176. int
  1177. ordinal(void)
  1178. {
  1179. char *cp = word;
  1180. static char sp[4];
  1181. while(ISDIGIT(*cp))
  1182. cp++;
  1183. strncpy(sp,cp,3);
  1184. if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
  1185. sp[0] = Tolower(cp[0]);
  1186. sp[1] = Tolower(cp[1]);
  1187. }
  1188. return 0 == strncmp(sp,
  1189. cp[-2]=='1'? "th": /* out of bounds if 1 digit */
  1190. *--cp=='1'? "st": /* harmless */
  1191. *cp=='2'? "nd":
  1192. *cp=='3'? "rd":
  1193. "th", 3);
  1194. }
  1195. /*
  1196. * read in the dictionary.
  1197. * format is
  1198. * {
  1199. * short nencode;
  1200. * long encode[nencode];
  1201. * char space[*];
  1202. * };
  1203. *
  1204. * the encodings are a table all different
  1205. * affixes.
  1206. * the dictionary proper has 2 bytes
  1207. * that demark and then the rest of the
  1208. * word. the 2 bytes have the following
  1209. * 0x80 0x00 flag
  1210. * 0x78 0x00 count of prefix bytes
  1211. * common with prev word
  1212. * 0x07 0xff affix code
  1213. *
  1214. * all ints are big endians in the file.
  1215. */
  1216. void
  1217. readdict(char *file)
  1218. {
  1219. char *s, *is, *lasts, *ls;
  1220. int c, i, sp, p;
  1221. int f;
  1222. int32_t l;
  1223. lasts = 0;
  1224. f = open(file, 0);
  1225. if(f == -1) {
  1226. fprint(2, "cannot open %s\n", file);
  1227. exits("open");
  1228. }
  1229. if(read(f, space, 2) != 2)
  1230. goto bad;
  1231. nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
  1232. if(read(f, space, 4*nencode) != 4*nencode)
  1233. goto bad;
  1234. s = space;
  1235. for(i=0; i<nencode; i++) {
  1236. l = (int32_t)(s[0] & 0xff) << 24;
  1237. l |= (s[1] & 0xff) << 16;
  1238. l |= (s[2] & 0xff) << 8;
  1239. l |= s[3] & 0xff;
  1240. encode[i] = (Bits)l;
  1241. s += 4;
  1242. }
  1243. l = read(f, space, sizeof(space));
  1244. if(l == sizeof(space))
  1245. goto noroom;
  1246. is = space + (sizeof(space) - l);
  1247. memmove(is, space, l);
  1248. s = space;
  1249. c = *is++ & 0xff;
  1250. sp = -1;
  1251. i = 0;
  1252. loop:
  1253. if(s > is)
  1254. goto noroom;
  1255. if(c < 0) {
  1256. close(f);
  1257. while(sp < 128*128)
  1258. spacep[++sp] = s;
  1259. *s = 0x80; /* fence */
  1260. return;
  1261. }
  1262. p = (c>>3) & 0xf;
  1263. *s++ = c;
  1264. *s++ = *is++ & 0xff;
  1265. if(p <= 0)
  1266. i = (*is++ & 0xff)*128;
  1267. if(p <= 1) {
  1268. if(!(*is & 0x80))
  1269. i = i/128*128 + (*is++ & 0xff);
  1270. if(i <= sp) {
  1271. fprint(2, "the dict isnt sorted or \n");
  1272. fprint(2, "memmove didn't work\n");
  1273. goto bad;
  1274. }
  1275. while(sp < i)
  1276. spacep[++sp] = s-2;
  1277. }
  1278. ls = lasts;
  1279. lasts = s;
  1280. for(p-=2; p>0; p--)
  1281. *s++ = *ls++;
  1282. for(;;) {
  1283. if(is >= space+sizeof(space)) {
  1284. c = -1;
  1285. break;
  1286. }
  1287. c = *is++ & 0xff;
  1288. if(c & 0x80)
  1289. break;
  1290. *s++ = c;
  1291. }
  1292. *s = 0;
  1293. goto loop;
  1294. bad:
  1295. fprint(2, "trouble reading %s\n", file);
  1296. exits("read");
  1297. noroom:
  1298. fprint(2, "not enough space for dictionary\n");
  1299. exits("space");
  1300. }