123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <ctype.h>
- #include "msgdb.h"
/*
 * Write the command synopsis to file descriptor 2 and terminate the
 * program with exit status "usage".  Does not return.
 */
void
usage(void)
{
	static char synopsis[] =
		"usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
/* Fixed capacities used by the tables in this file. */
enum
{
	MAXBEST	= 32,	/* slots in best[] */
	MAXLEN	= 64,	/* bytes in Word.s, terminator included */
	MAXTAB	= 256,	/* slots in db[] and in each per-class Word array */
};
- typedef struct Ndb Ndb;
- struct Ndb
- {
- char *name;
- char *file;
- Msgdb *db;
- double p;
- long nmsg;
- };
- typedef struct Word Word;
- struct Word
- {
- char s[MAXLEN];
- int count[MAXTAB];
- double p[MAXTAB];
- double mp;
- int mi; /* w.p[w.mi] = w.mp */
- int nmsg;
- };
- Ndb db[MAXTAB];
- int ndb;
- int add;
- int mul;
- Msgdb *indb;
- Word best[MAXBEST];
- int mbest = 15;
- int nbest;
- void process(Biobuf*, char*);
- void lockfile(char*);
- void
- noteword(Word *w, char *s)
- {
- int i;
- for(i=nbest-1; i>=0; i--)
- if(w->mp < best[i].mp)
- break;
- i++;
- if(i >= mbest)
- return;
- if(nbest == mbest)
- nbest--;
- if(i < nbest)
- memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
- best[i] = *w;
- strecpy(best[i].s, best[i].s+MAXLEN, s);
- nbest++;
- }
- void
- main(int argc, char **argv)
- {
- int i, bad, m, tot, nn, j;
- Biobuf bin, *b, bout;
- char *s, *lf;
- double totp, p, thresh;
- long n;
- Word w;
- lf = nil;
- thresh = 0;
- ARGBEGIN{
- case 'a':
- add = 1;
- break;
- case 'd':
- if(ndb >= MAXTAB)
- sysfatal("too many db classes");
- db[ndb].name = EARGF(usage());
- db[ndb].file = EARGF(usage());
- ndb++;
- break;
- case 'l':
- lf = EARGF(usage());
- break;
- case 'm':
- mul = atoi(EARGF(usage()));
- break;
- case 't':
- thresh = atof(EARGF(usage()));
- break;
- default:
- usage();
- }ARGEND
- if(ndb == 0){
- fprint(2, "must have at least one -d option\n");
- usage();
- }
- indb = mdopen(nil, 1);
- if(argc == 0){
- Binit(&bin, 0, OREAD);
- process(&bin, "<stdin>");
- Bterm(&bin);
- }else{
- bad = 0;
- for(i=0; i<argc; i++){
- if((b = Bopen(argv[i], OREAD)) == nil){
- fprint(2, "opening %s: %r\n", argv[i]);
- bad = 1;
- continue;
- }
- process(b, argv[i]);
- Bterm(b);
- }
- if(bad)
- exits("open inputs");
- }
- lockfile(lf);
- bad = 0;
- for(i=0; i<ndb; i++){
- if((db[i].db = mdopen(db[i].file, 0)) == nil){
- fprint(2, "opendb %s: %r\n", db[i].file);
- bad = 1;
- }
- db[i].nmsg = mdget(db[i].db, "*From*");
- }
- if(bad)
- exits("open databases");
- /* run conditional probabilities of input words, getting 15 most specific */
- mdenum(indb);
- nbest = 0;
- while(mdnext(indb, &s, &n) >= 0){
- tot = 0;
- totp = 0.0;
- for(i=0; i<ndb; i++){
- nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
- tot += nn;
- w.count[i] = nn;
- p = w.count[i]/(double)db[i].nmsg;
- if(p >= 1.0)
- p = 1.0;
- w.p[i] = p;
- totp += p;
- }
- //fprint(2, "%s tot %d totp %g\n", s, tot, totp);
- if(tot < 2)
- continue;
- w.mp = 0.0;
- for(i=0; i<ndb; i++){
- p = w.p[i];
- p /= totp;
- if(p < 0.001)
- p = 0.001;
- else if(p > 0.999)
- p = 0.999;
- if(p > w.mp){
- w.mp = p;
- w.mi = i;
- }
- w.p[i] = p;
- }
- noteword(&w, s);
- }
- /* compute conditional probabilities of message classes using 15 most specific */
- totp = 0.0;
- for(i=0; i<ndb; i++){
- p = 1.0;
- for(j=0; j<nbest; j++)
- p *= best[j].p[i];
- db[i].p = p;
- totp += p;
- }
- for(i=0; i<ndb; i++)
- db[i].p /= totp;
- m = 0;
- for(i=1; i<ndb; i++)
- if(db[i].p > db[m].p)
- m = i;
- Binit(&bout, 1, OWRITE);
- if(db[m].p < thresh)
- m = -1;
- if(m >= 0)
- Bprint(&bout, "%s", db[m].name);
- else
- Bprint(&bout, "inconclusive");
- for(j=0; j<ndb; j++)
- Bprint(&bout, " %s=%g", db[j].name, db[j].p);
- Bprint(&bout, "\n");
- for(i=0; i<nbest; i++){
- Bprint(&bout, "%s", best[i].s);
- for(j=0; j<ndb; j++)
- Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
- Bprint(&bout, "\n");
- }
- Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]);
- Bterm(&bout);
- if(m >= 0 && add){
- mdenum(indb);
- while(mdnext(indb, &s, &n) >= 0)
- mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
- mdclose(db[m].db);
- }
- exits(nil);
- }
- void
- process(Biobuf *b, char*)
- {
- char *s;
- char *p;
- long n;
- while((s = Brdline(b, '\n')) != nil){
- s[Blinelen(b)-1] = 0;
- if((p = strrchr(s, ' ')) != nil){
- *p++ = 0;
- n = atoi(p);
- }else
- n = 1;
- mdput(indb, s, mdget(indb, s)+n);
- }
- }
- int tpid;
- void
- killtickle(void)
- {
- postnote(PNPROC, tpid, "die");
- }
- void
- lockfile(char *s)
- {
- int fd, t, w;
- char err[ERRMAX];
- if(s == nil)
- return;
- w = 50;
- t = 0;
- for(;;){
- fd = open(s, OREAD);
- if(fd >= 0)
- break;
- rerrstr(err, sizeof err);
- if(strstr(err, "file is locked") == nil)
- break;
- sleep(w);
- t += w;
- if(w < 1000)
- w = (w*3)/2;
- if(t > 120*1000)
- break;
- }
- if(fd < 0)
- sysfatal("could not lock %s", s);
- switch(tpid = fork()){
- case -1:
- sysfatal("fork: %r");
- case 0:
- for(;;){
- sleep(30*1000);
- free(dirfstat(fd));
- }
- _exits(nil);
- default:
- break;
- }
- close(fd);
- atexit(killtickle);
- }
|