123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245 |
/*
 * RFC822 message tokenizer (really a feature generator) for the spam filter.
 *
 * See Paul Graham's musings on spam filtering for the theory.
 */
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <regexp.h>
- #include <ctype.h>
- #include "dfa.h"
- void buildre(Dreprog*[3]);
- int debug;
- char *refile = "/mail/lib/classify.re";
- int maxtoklen = 20;
- int trim(char*);
/*
 * Print the command synopsis on standard error and exit with
 * status "usage".  Lists every option main accepts, including -n.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
- void
- main(int argc, char **argv)
- {
- int i, hdr, n, eof, off;
- Dreprog *re[3];
- int m[3];
- char *p, *ep, *tag;
- Biobuf bout, bin;
- char msg[1024+1];
- char buf[1024];
- buildre(re);
- ARGBEGIN{
- case 'D':
- debug = 1;
- break;
- case 'n':
- maxtoklen = atoi(EARGF(usage()));
- break;
- case 'r':
- refile = EARGF(usage());
- break;
- default:
- usage();
- }ARGEND;
- if(argc > 1)
- usage();
- if(argc == 1){
- close(0);
- if(open(argv[0], OREAD) < 0)
- sysfatal("open %s: %r", argv[0]);
- }
- tag = nil;
- Binit(&bin, 0, OREAD);
- Binit(&bout, 1, OWRITE);
- ep = msg;
- p = msg;
- eof = 0;
- off = 0;
- hdr = 1;
- for(;;){
- /* replenish buffer */
- if(ep - p < 512 && !eof){
- if(p > msg + 1){
- n = ep - p;
- memmove(msg, p-1, ep-(p-1));
- off += (p-1) - msg;
- p = msg+1;
- ep = p + n;
- }
- n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
- if(n < 0)
- sysfatal("read error: %r");
- if(n == 0)
- eof = 1;
- ep += n;
- *ep = 0;
- }
- if(p >= ep)
- break;
- if(*p == 0){
- p++;
- continue;
- }
- if(hdr && p[-1]=='\n'){
- if(p[0]=='\n')
- hdr = 0;
- else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
- tag = "From*";
- else if(cistrncmp(p-1, "\nto:", 4) == 0)
- tag = "To*";
- else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
- tag = "Subject*";
- else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
- tag = "Return-Path*";
- else
- tag = nil;
- }
- m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
- m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
- m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
- n = m[0];
- if(n < m[1])
- n = m[1];
- if(n < m[2])
- n = m[2];
- if(n <= 0){
- fprint(2, "«%s» %.2ux", p, p[0]);
- sysfatal("no regexps matched at %ld", off + (p-msg));
- }
- if(m[0] >= m[1] && m[0] >= m[2]){
- /* "From " marks start of new message */
- Bprint(&bout, "*From*\n");
- n = m[0];
- hdr = 1;
- }else if(m[2] > 1){
- /* ignore */
- n = m[2];
- }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
- /* keyword */
- /* should do UTF-aware lowercasing, too much bother */
- /*
- for(i=0; i<n; i++)
- if('A' <= p[i] && p[i] <= 'Z')
- p[i] += 'a' - 'A';
- */
- if(tag){
- i = strlen(tag);
- memmove(buf, tag, i);
- memmove(buf+i, p, m[1]);
- buf[i+m[1]] = 0;
- }else{
- memmove(buf, p, m[1]);
- buf[m[1]] = 0;
- }
- Bprint(&bout, "%s\n", buf);
- while(trim(buf) >= 0)
- Bprint(&bout, "stem*%s\n", buf);
- n = m[1];
- }else
- n = m[2];
- if(debug)
- fprint(2, "%.*s¦", utfnlen(p, n), p);
- p += n;
- }
- Bterm(&bout);
- exits(0);
- }
- void
- buildre(Dreprog *re[3])
- {
- Biobuf *b;
- if((b = Bopen(refile, OREAD)) == nil)
- sysfatal("open %s: %r", refile);
- re[0] = Breaddfa(b);
- re[1] = Breaddfa(b);
- re[2] = Breaddfa(b);
- if(re[0]==nil || re[1]==nil || re[2]==nil)
- sysfatal("Breaddfa: %r");
- Bterm(b);
- }
/* perhaps this belongs in the tokenizer */
/*
 * Rewrite the token in s, in place, as its next simpler variant.
 * Each call performs the first step that applies:
 *	1. a header tag (everything up to and including the first '*')
 *	   and leading ASCII non-letters are removed; this alone does
 *	   not count as a new variant;
 *	2. a run of trailing ASCII non-letters is cut to one character
 *	   (free!!! -> free!), or a single one is removed (free! -> free);
 *	3. upper-case letters after the first byte are lowered
 *	   (FREE -> Free);
 *	4. an upper-case first letter is lowered (Free -> free).
 * Bytes >= 0x80 are never stripped at either end, so UTF-8 sequences
 * stay whole, and they are never case-converted.
 *
 * Returns 0 if steps 2-4 produced a new variant in s, -1 if s starts
 * with '*', is shorter than two bytes after step 1, or cannot be
 * simplified any further.  s may have been modified by step 1 even
 * when -1 is returned.
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;
	long len;

	if(*s == '*')
		return -1;

	/* strip tag and leading punctuation, but not UTF-8 bytes */
	p = strchr(s, '*');
	if(p == nil)
		p = s;
	while(*p && (uchar)*p < 0x80 && !isalpha((uchar)*p))
		p++;
	len = strlen(p);
	if(len < 2)
		return -1;
	memmove(s, p, len+1);

	/* find suffix of punctuation */
	p = s+len;
	op = p;
	while(p > s && (uchar)p[-1] < 0x80 && !isalpha((uchar)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first byte is upper case; mix1: some later byte is */
	mix = mix1 = 0;
	if(isupper((uchar)s[0]))
		mix = 1;
	for(p=s+1; *p; p++)
		if(isupper((uchar)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper((uchar)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}
|