12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <ctype.h>
- #include "code.h"
/*
 * Character-class helpers.  Every argument is masked to 0..255 first,
 * so a possibly signed char never reaches the ctype routines (or
 * voweltab) as a negative value.
 */
#define	ISUPPER(c)	isupper((c)&0xff)
#define	ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define	ISVOWEL(c)	voweltab[(c)&0xff]
#define	Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))

/* pack two character values into one int, usable as a case label */
#define	pair(a,b)	(((a)<<8) | (b))

#define	DLEV	2	/* trysuff raises the derivation level by this much */
#define	DSIZ	40	/* levels at or above this are not recorded in deriv[] */

/* word-class / affix flag set; Set(h, f) is non-zero when h has any bit of f */
typedef	long	Bits;
#define	Set(h, f)	((long)(h) & (f))
- Bits nop(char*, char*, char*, int, int);
- Bits strip(char*, char*, char*, int, int);
- Bits ize(char*, char*, char*, int, int);
- Bits i_to_y(char*, char*, char*, int, int);
- Bits ily(char*, char*, char*, int, int);
- Bits subst(char*, char*, char*, int, int);
- Bits CCe(char*, char*, char*, int, int);
- Bits tion(char*, char*, char*, int, int);
- Bits an(char*, char*, char*, int, int);
- Bits s(char*, char*, char*, int, int);
- Bits es(char*, char*, char*, int, int);
- Bits bility(char*, char*, char*, int, int);
- Bits y_to_e(char*, char*, char*, int, int);
- Bits VCe(char*, char*, char*, int, int);
- Bits trypref(char*, char*, int, int);
- Bits tryword(char*, char*, int, int);
- Bits trysuff(char*, int, int);
- Bits dict(char*, char*);
- void typeprint(Bits);
- void pcomma(char*);
- void ise(void);
- int ordinal(void);
- char* skipv(char*);
- int inun(char*, Bits);
- char* ztos(char*);
- void readdict(char*);
- typedef struct Ptab Ptab;
- struct Ptab
- {
- char* s;
- int flag;
- };
- typedef struct Suftab Suftab;
- struct Suftab
- {
- char *suf;
- Bits (*p1)(char*, char*, char*, int, int);
- int n1;
- char *d1;
- char *a1;
- int flag;
- int affixable;
- Bits (*p2)(char*, char*, char*, int, int);
- int n2;
- char *d2;
- char *a2;
- };
- Suftab staba[] = {
- {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
- 0
- };
- Suftab stabc[] =
- {
- {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
- {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
- {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
- {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
- {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
- {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
- {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
- {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
- {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
- 0
- };
- Suftab stabd[] =
- {
- {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
- {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
- 0
- };
- Suftab stabe[] =
- {
- /*
- * V_affix for comment ->commence->commentment??
- */
- {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
- {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
- {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
- {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
- {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
- {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
- {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
- 0
- };
- Suftab stabg[] =
- {
- {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
- {"gnikam",strip,6,"","+making",NOUN,NOUN},
- {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
- {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
- 0
- };
- Suftab stabl[] =
- {
- {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
- {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
- {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
- {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
- {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
- 0
- };
- Suftab stabm[] =
- {
- /* congregational + ism */
- {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
- {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
- 0
- };
- Suftab stabn[] =
- {
- {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
- {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
- {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
- {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
- {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
- {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
- {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
- {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
- {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
- {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
- 0
- };
- Suftab stabp[] =
- {
- {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
- 0
- };
- Suftab stabr[] =
- {
- {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
- {"reyhparg",nop,0,"","",0,NOUN},
- {"reyl",nop,0,"","",0,NOUN},
- {"rekam",strip,5,"","+maker",NOUN,NOUN},
- {"repeek",strip,6,"","+keeper",NOUN,NOUN},
- {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
- {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
- {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
- {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
- 0
- };
- Suftab stabs[] =
- {
- {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
- {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
- {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
- {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
- {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
- 0
- };
- Suftab stabt[] =
- {
- {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
- {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
- {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
- {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
- 0
- };
- Suftab staby[] =
- {
- {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
- {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
- {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
- {"ytisuo",nop,0,"","",NOUN},
- {"ytilb",nop,0,"","",0,NOUN},
- {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
- {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
- {"ylc",nop,0,"","",0},
- {"ylelb",nop,0,"","",0},
- {"ylelp",nop,0,"","",0},
- {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
- {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
- {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
- 0
- };
- Suftab stabz[] =
- {
- 0
- };
- Suftab* suftab[] =
- {
- staba,
- stabz,
- stabc,
- stabd,
- stabe,
- stabz,
- stabg,
- stabz,
- stabz,
- stabz,
- stabz,
- stabl,
- stabm,
- stabn,
- stabz,
- stabp,
- stabz,
- stabr,
- stabs,
- stabt,
- stabz,
- stabz,
- stabz,
- stabz,
- staby,
- stabz,
- };
- Ptab ptaba[] =
- {
- "anti", 0,
- "auto", 0,
- 0
- };
- Ptab ptabb[] =
- {
- "bio", 0,
- 0
- };
- Ptab ptabc[] =
- {
- "counter", 0,
- 0
- };
- Ptab ptabd[] =
- {
- "dis", 0,
- 0
- };
- Ptab ptabe[] =
- {
- "electro", 0,
- 0
- };
- Ptab ptabf[] =
- {
- "femto", 0,
- 0
- };
- Ptab ptabg[] =
- {
- "geo", 0,
- "giga", 0,
- 0
- };
- Ptab ptabh[] =
- {
- "hyper", 0,
- 0
- };
- Ptab ptabi[] =
- {
- "immuno", 0,
- "im", IN,
- "intra", 0,
- "inter", 0,
- "in", IN,
- "ir", IN,
- "iso", 0,
- 0
- };
- Ptab ptabj[] =
- {
- 0
- };
- Ptab ptabk[] =
- {
- "kilo", 0,
- 0
- };
- Ptab ptabl[] =
- {
- 0
- };
- Ptab ptabm[] =
- {
- "magneto", 0,
- "mega", 0,
- "meta", 0,
- "micro", 0,
- "mid", 0,
- "milli", 0,
- "mini", 0,
- "mis", 0,
- "mono", 0,
- "multi", 0,
- 0
- };
- Ptab ptabn[] =
- {
- "nano", 0,
- "neuro", 0,
- "non", 0,
- 0
- };
- Ptab ptabo[] =
- {
- "out", 0,
- "over", 0,
- 0
- };
- Ptab ptabp[] =
- {
- "para", 0,
- "photo", 0,
- "pico", 0,
- "poly", 0,
- "pre", 0,
- "pseudo", 0,
- "psycho", 0,
- 0
- };
- Ptab ptabq[] =
- {
- "quasi", 0,
- 0
- };
- Ptab ptabr[] =
- {
- "radio", 0,
- "re", 0,
- 0
- };
- Ptab ptabs[] =
- {
- "semi", 0,
- "stereo", 0,
- "sub", 0,
- "super", 0,
- 0
- };
- Ptab ptabt[] =
- {
- "tele", 0,
- "tera", 0,
- "thermo", 0,
- 0
- };
- Ptab ptabu[] =
- {
- "ultra", 0,
- "under", 0, /*must precede un*/
- "un", IN,
- 0
- };
- Ptab ptabv[] =
- {
- 0
- };
- Ptab ptabw[] =
- {
- 0
- };
- Ptab ptabx[] =
- {
- 0
- };
- Ptab ptaby[] =
- {
- 0
- };
- Ptab ptabz[] =
- {
- 0
- };
- Ptab* preftab[] =
- {
- ptaba,
- ptabb,
- ptabc,
- ptabd,
- ptabe,
- ptabf,
- ptabg,
- ptabh,
- ptabi,
- ptabj,
- ptabk,
- ptabl,
- ptabm,
- ptabn,
- ptabo,
- ptabp,
- ptabq,
- ptabr,
- ptabs,
- ptabt,
- ptabu,
- ptabv,
- ptabw,
- ptabx,
- ptaby,
- ptabz,
- };
- typedef struct {
- char *mesg;
- enum { NONE, SUFF, PREF} type;
- } Deriv;
- int aflag;
- int cflag;
- int fflag;
- int vflag;
- int xflag;
- int nflag;
- char word[500];
- char* original;
- Deriv emptyderiv;
- Deriv deriv[DSIZ+3];
- char affix[DSIZ*10]; /* 10 is longest affix message */
- int prefcount;
- int suffcount;
- char* acmeid;
- char space[300000]; /* must be as large as "words"+"space" in pcode run */
- Bits encode[2048]; /* must be as long as "codes" in pcode run */
- int nencode;
- char voweltab[256];
- char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
- Biobuf bin;
- Biobuf bout;
- char* codefile = "/sys/lib/amspell";
- char* brfile = "/sys/lib/brspell";
- char* Usage = "usage";
- void
- main(int argc, char *argv[])
- {
- char *ep, *cp;
- char *dp;
- int j, i, c;
- int low;
- Bits h;
- Binit(&bin, 0, OREAD);
- Binit(&bout, 1, OWRITE);
- for(i=0; c = "aeiouyAEIOUY"[i]; i++)
- voweltab[c] = 1;
- while(argc > 1) {
- if(argv[1][0] != '-')
- break;
- for(i=1; c = argv[1][i]; i++)
- switch(c) {
- default:
- fprint(2, "usage: spell [-bcCvx] [-f file]\n");
- exits(Usage);
- case 'a':
- aflag++;
- continue;
- case 'b':
- ise();
- if(!fflag)
- codefile = brfile;
- continue;
- case 'C': /* for "correct" */
- vflag++;
- case 'c': /* for ocr */
- cflag++;
- continue;
- case 'v':
- vflag++;
- continue;
- case 'x':
- xflag++;
- continue;
- case 'f':
- if(argc <= 2) {
- fprint(2, "spell: -f requires another argument\n");
- exits(Usage);
- }
- argv++;
- argc--;
- codefile = argv[1];
- fflag++;
- goto brk;
- }
- brk:
- argv++;
- argc--;
- }
- readdict(codefile);
- if(argc > 1) {
- fprint(2, "usage: spell [-bcCvx] [-f file]\n");
- exits(Usage);
- }
- if(aflag)
- cflag = vflag = 0;
- for(;;) {
- affix[0] = 0;
- original = Brdline(&bin, '\n');
- if(original == 0)
- exits(0);
- original[Blinelen(&bin)-1] = 0;
- low = 0;
- if(aflag) {
- acmeid = original;
- while(*original != ':')
- if(*original++ == 0)
- exits(0);
- while(*++original != ':')
- if(*original == 0)
- exits(0);
- *original++ = 0;
- }
- for(ep=word,dp=original; j = *dp; ep++,dp++) {
- if(ISLOWER(j))
- low++;
- if(ep >= word+sizeof(word)-1)
- break;
- *ep = j;
- }
- *ep = 0;
- if(ISDIGIT(word[0]) && ordinal())
- continue;
- h = 0;
- if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
- for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
- *dp = Tolower(*cp);
- if(!h)
- for(;;) { /* at most twice */
- if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
- break;
- if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
- break;
- if(!ISUPPER(word[0]))
- break;
- cp = original;
- dp = word;
- while(*dp = *cp++) {
- if(!low)
- *dp = Tolower(*dp);
- dp++;
- }
- word[0] = Tolower(word[0]);
- }
- if(cflag) {
- if(!h || Set(h,STOP))
- print("-");
- else if(!vflag)
- print("+");
- else
- print("%c",'0' + (suffcount>0) +
- (prefcount>4? 8: 2*prefcount));
- } else if(!h || Set(h,STOP)) {
- if(aflag)
- Bprint(&bout, "%s:%s\n", acmeid, original);
- else
- Bprint(&bout, "%s\n", original);
- } else if(affix[0] != 0 && affix[0] != '.')
- print("%s\t%s\n", affix, original);
- }
- exits(0);
- }
- /* strip exactly one suffix and do
- * indicated routine(s), which may recursively
- * strip suffixes
- */
- Bits
- trysuff(char* ep, int lev, int flag)
- {
- Suftab *t;
- char *cp, *sp;
- Bits h = 0;
- int initchar = ep[-1];
- flag &= ~MONO;
- lev += DLEV;
- if(lev < DSIZ) {
- deriv[lev] = emptyderiv;
- deriv[lev-1] = emptyderiv;
- }
- if(!ISLOWER(initchar))
- return h;
- for(t=suftab[initchar-'a']; sp=t->suf; t++) {
- cp = ep;
- while(*sp)
- if(*--cp != *sp++)
- goto next;
- for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
- ;
- if(sp < word)
- continue;
- if(!(t->affixable & flag))
- return 0;
- h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
- if(!h && t->p2!=0) {
- if(lev < DSIZ) {
- deriv[lev] = emptyderiv;
- deriv[lev+1] = emptyderiv;
- }
- h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
- }
- break;
- next:;
- }
- return h;
- }
- Bits
- nop(char* ep, char* d, char* a, int lev, int flag)
- {
- USED(ep, d, a, lev, flag);
- return 0;
- }
- Bits
- cstrip(char* ep, char* d, char* a, int lev, int flag)
- {
- int temp = ep[0];
- if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
- switch(pair(ep[-1],ep[0])) {
- case pair('a', 'a'):
- case pair('a', 'e'):
- case pair('a', 'i'):
- case pair('e', 'a'):
- case pair('e', 'e'):
- case pair('e', 'i'):
- case pair('i', 'i'):
- case pair('o', 'a'):
- return 0;
- }
- } else
- if(temp==ep[-1]&&temp==ep[-2])
- return 0;
- return strip(ep,d,a,lev,flag);
- }
- Bits
- strip(char* ep, char* d, char* a, int lev, int flag)
- {
- Bits h = trypref(ep, a, lev, flag);
- USED(d);
- if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
- h = 0;
- if(h)
- return h;
- if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
- h = trypref(ep-1,a,lev,flag|MONO);
- if(h)
- return h;
- }
- return trysuff(ep,lev,flag);
- }
- Bits
- s(char* ep, char* d, char* a, int lev, int flag)
- {
- if(lev > DLEV+1)
- return 0;
- if(*ep=='s') {
- switch(ep[-1]) {
- case 'y':
- if(ISVOWEL(ep[-2])||ISUPPER(*word))
- break; /*says Kennedys*/
- case 'x':
- case 'z':
- case 's':
- return 0;
- case 'h':
- switch(ep[-2]) {
- case 'c':
- case 's':
- return 0;
- }
- }
- }
- return strip(ep,d,a,lev,flag);
- }
- Bits
- an(char* ep, char* d, char* a, int lev, int flag)
- {
- USED(d);
- if(!ISUPPER(*word)) /*must be proper name*/
- return 0;
- return trypref(ep,a,lev,flag);
- }
- Bits
- ize(char* ep, char* d, char* a, int lev, int flag)
- {
- int temp = ep[-1];
- Bits h;
- USED(a);
- ep[-1] = 'e';
- h = strip(ep,"",d,lev,flag);
- ep[-1] = temp;
- return h;
- }
- Bits
- y_to_e(char* ep, char* d, char* a, int lev, int flag)
- {
- Bits h;
- int temp;
- USED(a);
- switch(ep[-1]) {
- case 'a':
- case 'e':
- case 'i':
- return 0;
- }
- temp = *ep;
- *ep++ = 'e';
- h = strip(ep,"",d,lev,flag);
- ep[-1] = temp;
- return h;
- }
- Bits
- ily(char* ep, char* d, char* a, int lev, int flag)
- {
- int temp = ep[0];
- char *cp = ep;
- if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
- return 0;
- if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
- while(cp>word)
- if(ISVOWEL(*--cp)) /* shyness */
- return 0;
- if(ep[-1]=='i')
- return i_to_y(ep,d,a,lev,flag);
- return cstrip(ep,d,a,lev,flag);
- }
- Bits
- bility(char* ep, char* d, char* a, int lev, int flag)
- {
- *ep++ = 'l';
- return y_to_e(ep,d,a,lev,flag);
- }
- Bits
- i_to_y(char* ep, char* d, char* a, int lev, int flag)
- {
- Bits h;
- int temp;
- if(ISUPPER(*word))
- return 0;
- if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
- ep[-1] = 'y';
- a = d;
- }
- h = cstrip(ep,"",a,lev,flag);
- ep[-1] = temp;
- return h;
- }
- Bits
- es(char* ep, char* d, char* a, int lev, int flag)
- {
- if(lev>DLEV)
- return 0;
- switch(ep[-1]) {
- default:
- return 0;
- case 'i':
- return i_to_y(ep,d,a,lev,flag);
- case 'h':
- switch(ep[-2]) {
- default:
- return 0;
- case 'c':
- case 's':
- break;
- }
- case 's':
- case 'z':
- case 'x':
- return strip(ep,d,a,lev,flag);
- }
- }
- Bits
- subst(char* ep, char* d, char* a, int lev, int flag)
- {
- char *u,*t;
- Bits h;
- USED(a);
- if(skipv(skipv(ep-1)) < word)
- return 0;
- for(t=d; *t!='+'; t++)
- continue;
- for(u=ep; *--t!='-';)
- *--u = *t;
- h = strip(ep,"",d,lev,flag);
- while(*++t != '+')
- continue;
- while(*++t)
- *u++ = *t;
- return h;
- }
- Bits
- tion(char* ep, char* d, char* a, int lev, int flag)
- {
- switch(ep[-2]) {
- default:
- return trypref(ep,a,lev,flag);
- case 'a':
- case 'e':
- case 'i':
- case 'o':
- case 'u':
- return y_to_e(ep,d,a,lev,flag);
- }
- }
- /*
- * possible consonant-consonant-e ending
- */
- Bits
- CCe(char* ep, char* d, char* a, int lev, int flag)
- {
- Bits h;
- switch(ep[-1]) {
- case 'l':
- if(ISVOWEL(ep[-2]))
- break;
- switch(ep[-2]) {
- case 'l':
- case 'r':
- case 'w':
- break;
- default:
- return y_to_e(ep,d,a,lev,flag);
- }
- break;
- case 'c':
- case 'g':
- if(*ep == 'a') /* prevent -able for -eable */
- return 0;
- case 's':
- case 'v':
- case 'z':
- if(ep[-2]==ep[-1])
- break;
- if(ISVOWEL(ep[-2]))
- break;
- case 'u':
- if(h = y_to_e(ep,d,a,lev,flag))
- return h;
- if(!(ep[-2]=='n' && ep[-1]=='g'))
- return 0;
- }
- return VCe(ep,d,a,lev,flag);
- }
- /*
- * possible consonant-vowel-consonant-e ending
- */
- Bits
- VCe(char* ep, char* d, char* a, int lev, int flag)
- {
- int c;
- Bits h;
- c = ep[-1];
- if(c=='e')
- return 0;
- if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
- c = *ep;
- *ep++ = 'e';
- h = trypref(ep,d,lev,flag);
- if(!h)
- h = trysuff(ep,lev,flag);
- if(h)
- return h;
- ep--;
- *ep = c;
- }
- return cstrip(ep,d,a,lev,flag);
- }
- Ptab*
- lookuppref(uchar** wp, char* ep)
- {
- Ptab *sp;
- uchar *bp,*cp;
- unsigned int initchar = Tolower(**wp);
- if(!ISALPHA(initchar))
- return 0;
- for(sp=preftab[initchar-'a'];sp->s;sp++) {
- bp = *wp;
- for(cp= (uchar*)sp->s;*cp; )
- if(*bp++!=*cp++)
- goto next;
- for(cp=bp;cp<(uchar*)ep;cp++)
- if(ISVOWEL(*cp)) {
- *wp = bp;
- return sp;
- }
- next:;
- }
- return 0;
- }
- /* while word is not in dictionary try stripping
- * prefixes. Fail if no more prefixes.
- */
- Bits
- trypref(char* ep, char* a, int lev, int flag)
- {
- Ptab *tp;
- char *bp, *cp;
- char *pp;
- Bits h;
- char space[20];
- if(lev<DSIZ) {
- deriv[lev].mesg = a;
- deriv[lev].type = *a=='.'? NONE: SUFF;
- }
- if(h = tryword(word,ep,lev,flag)) {
- if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
- return h;
- h = 0;
- }
- bp = word;
- pp = space;
- if(lev<DSIZ) {
- deriv[lev+1].mesg = pp;
- deriv[lev+1].type = 0;
- }
- while(tp=lookuppref((uchar**)&bp,ep)) {
- *pp++ = '+';
- cp = tp->s;
- while(pp<space+sizeof(space) && (*pp = *cp++))
- pp++;
- deriv[lev+1].type += PREF;
- h = tryword(bp,ep,lev+1,flag);
- if(Set(h,NOPREF) ||
- ((tp->flag&IN) && inun(bp-2,h)==0)) {
- h = 0;
- break;
- }
- if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
- break;
- h = 0;
- }
- if(lev < DSIZ) {
- deriv[lev+1] = emptyderiv;
- deriv[lev+2] = emptyderiv;
- }
- return h;
- }
- Bits
- tryword(char* bp, char* ep, int lev, int flag)
- {
- int j;
- Bits h = 0;
- char duple[3];
- if(ep-bp <= 1)
- return h;
- if(flag&MONO) {
- if(lev<DSIZ) {
- deriv[++lev].mesg = duple;
- deriv[lev].type = SUFF;
- }
- duple[0] = '+';
- duple[1] = *ep;
- duple[2] = 0;
- }
- h = dict(bp, ep);
- if(vflag==0 || h==0)
- return h;
- /*
- * when derivations are wanted, collect them
- * for printing
- */
- j = lev;
- prefcount = suffcount = 0;
- do {
- if(j<DSIZ && deriv[j].type) {
- strcat(affix, deriv[j].mesg);
- if(deriv[j].type == SUFF)
- suffcount++;
- else if(deriv[j].type != NONE)
- prefcount = deriv[j].type/PREF;
- }
- } while(--j > 0);
- return h;
- }
- int
- inun(char* bp, Bits h)
- {
- if(*bp == 'u')
- return Set(h, IN) == 0;
- /* *bp == 'i' */
- if(Set(h, IN) == 0)
- return 0;
- switch(bp[2]) {
- case 'r':
- return bp[1] == 'r';
- case 'm':
- case 'p':
- return bp[1] == 'm';
- }
- return bp[1] == 'n';
- }
- char*
- skipv(char *s)
- {
- if(s >= word && ISVOWEL(*s))
- s--;
- while(s >= word && !ISVOWEL(*s))
- s--;
- return s;
- }
- /*
- * crummy way to Britishise
- */
- void
- ise(void)
- {
- Suftab *p;
- int i;
- for(i=0; i<26; i++)
- for(p = suftab[i]; p->suf; p++) {
- p->suf = ztos(p->suf);
- p->d1 = ztos(p->d1);
- p->a1 = ztos(p->a1);
- }
- }
/*
 * Return as with every 'z' replaced by 's'.  A string containing no
 * 'z' is returned as is; otherwise the result is a freshly allocated
 * copy and as itself is left untouched.  If the allocation fails the
 * original string is returned unchanged rather than dereferencing a
 * nil pointer.
 */
char*
ztos(char *as)
{
	char *s, *ds;
	long i, n;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	n = strlen(as);
	ds = malloc(n+1);
	if(ds == 0)
		return as;
	for(i=0; i<=n; i++)	/* <= so the terminating NUL is copied too */
		ds[i] = as[i]=='z'? 's': as[i];
	return ds;
}
- Bits
- dict(char* bp, char* ep)
- {
- char *cp, *cp1, *w, *wp, *we;
- int n, f;
- w = bp;
- we = ep;
- n = ep-bp;
- if(n <= 1)
- return NOUN;
- f = w[0] & 0x7f;
- f *= 128;
- f += w[1] & 0x7f;
- bp = spacep[f];
- ep = spacep[f+1];
- loop:
- if(bp >= ep) {
- if(xflag)
- fprint(2, "=%.*s\n", utfnlen(w, n), w);
- return 0;
- }
- /*
- * find the beginning of some word in the middle
- */
- cp = bp + (ep-bp)/2;
- while(cp > bp && !(*cp & 0x80))
- cp--;
- while(cp > bp && (cp[-1] & 0x80))
- cp--;
- wp = w + 2; /* skip two letters */
- cp1 = cp + 2; /* skip affix code */
- for(;;) {
- if(wp >= we) {
- if(*cp1 & 0x80)
- goto found;
- else
- f = 1;
- break;
- }
- if(*cp1 & 0x80) {
- f = -1;
- break;
- }
- f = *cp1++ - *wp++;
- if(f != 0)
- break;
- }
- if(f < 0) {
- while(!(*cp1 & 0x80))
- cp1++;
- bp = cp1;
- goto loop;
- }
- ep = cp;
- goto loop;
- found:
- f = ((cp[0] & 0x7) << 8) |
- (cp[1] & 0xff);
- if(xflag) {
- fprint(2, "=%.*s ", utfnlen(w, n), w);
- typeprint(encode[f]);
- }
- return encode[f];
- }
- void
- typeprint(Bits h)
- {
- pcomma("");
- if(h & NOUN)
- pcomma("n");
- if(h & PROP_COLLECT)
- pcomma("pc");
- if(h & VERB) {
- if((h & VERB) == VERB)
- pcomma("v");
- else
- if((h & VERB) == V_IRREG)
- pcomma("vi");
- else
- if(h & ED)
- pcomma("ed");
- }
- if(h & ADJ)
- pcomma("a");
- if(h & COMP) {
- if((h & COMP) == ACTOR)
- pcomma("er");
- else
- pcomma("comp");
- }
- if(h & DONT_TOUCH)
- pcomma("d");
- if(h & N_AFFIX)
- pcomma("na");
- if(h & ADV)
- pcomma("adv");
- if(h & ION)
- pcomma("ion");
- if(h & V_AFFIX)
- pcomma("va");
- if(h & MAN)
- pcomma("man");
- if(h & NOPREF)
- pcomma("nopref");
- if(h & MONO)
- pcomma("ms");
- if(h & IN)
- pcomma("in");
- if(h & _Y)
- pcomma("y");
- if(h & STOP)
- pcomma("s");
- fprint(2, "\n");
- }
/*
 * Print s on fd 2 as the next item of a comma-separated list.
 * An empty string starts a new list, so that the next item is
 * printed without a leading comma.
 */
void
pcomma(char *s)
{
	static int flag;	/* non-zero once the current list has an item */

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}
- /*
- * is the word on of the following
- * 12th teen
- * 21st end in 1
- * 23rd end in 3
- * 77th default
- * called knowing word[0] is a digit
- */
- int
- ordinal(void)
- {
- char *cp = word;
- static char sp[4];
- while(ISDIGIT(*cp))
- cp++;
- strncpy(sp,cp,3);
- if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
- sp[0] = Tolower(cp[0]);
- sp[1] = Tolower(cp[1]);
- }
- return 0 == strncmp(sp,
- cp[-2]=='1'? "th": /* out of bounds if 1 digit */
- *--cp=='1'? "st": /* harmless */
- *cp=='2'? "nd":
- *cp=='3'? "rd":
- "th", 3);
- }
- /*
- * read in the dictionary.
- * format is
- * {
- * short nencode;
- * long encode[nencode];
- * char space[*];
- * };
- *
- * the encodings are a table all different
- * affixes.
- * the dictionary proper has 2 bytes
- * that demark and then the rest of the
- * word. the 2 bytes have the following
- * 0x80 0x00 flag
- * 0x78 0x00 count of prefix bytes
- * common with prev word
- * 0x07 0xff affix code
- *
- * all ints are big endians in the file.
- */
- void
- readdict(char *file)
- {
- char *s, *is, *lasts, *ls;
- int c, i, sp, p;
- int f;
- long l;
- lasts = 0;
- f = open(file, 0);
- if(f == -1) {
- fprint(2, "cannot open %s\n", file);
- exits("open");
- }
- if(read(f, space, 2) != 2)
- goto bad;
- nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
- if(read(f, space, 4*nencode) != 4*nencode)
- goto bad;
- s = space;
- for(i=0; i<nencode; i++) {
- l = (long)(s[0] & 0xff) << 24;
- l |= (s[1] & 0xff) << 16;
- l |= (s[2] & 0xff) << 8;
- l |= s[3] & 0xff;
- encode[i] = (Bits)l;
- s += 4;
- }
- l = read(f, space, sizeof(space));
- if(l == sizeof(space))
- goto noroom;
- is = space + (sizeof(space) - l);
- memmove(is, space, l);
- s = space;
- c = *is++ & 0xff;
- sp = -1;
- i = 0;
- loop:
- if(s > is)
- goto noroom;
- if(c < 0) {
- close(f);
- while(sp < 128*128)
- spacep[++sp] = s;
- *s = 0x80; /* fence */
- return;
- }
- p = (c>>3) & 0xf;
- *s++ = c;
- *s++ = *is++ & 0xff;
- if(p <= 0)
- i = (*is++ & 0xff)*128;
- if(p <= 1) {
- if(!(*is & 0x80))
- i = i/128*128 + (*is++ & 0xff);
- if(i <= sp) {
- fprint(2, "the dict isnt sorted or \n");
- fprint(2, "memmove didn't work\n");
- goto bad;
- }
- while(sp < i)
- spacep[++sp] = s-2;
- }
- ls = lasts;
- lasts = s;
- for(p-=2; p>0; p--)
- *s++ = *ls++;
- for(;;) {
- if(is >= space+sizeof(space)) {
- c = -1;
- break;
- }
- c = *is++ & 0xff;
- if(c & 0x80)
- break;
- *s++ = c;
- }
- *s = 0;
- goto loop;
- bad:
- fprint(2, "trouble reading %s\n", file);
- exits("read");
- noroom:
- fprint(2, "not enough space for dictionary\n");
- exits("space");
- }
|