msgtok.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. /*
  2. * RFC822 message tokenizer (really feature generator) for spam filter.
  3. *
  4. * See Paul Graham's musings on spam filtering for theory.
  5. */
  6. #include <u.h>
  7. #include <libc.h>
  8. #include <bio.h>
  9. #include <regexp.h>
  10. #include <ctype.h>
  11. #include "dfa.h"
/* Reads three DFA programs from refile into the caller's array; defined below. */
void buildre(Dreprog*[3]);

/* Set by -D; when nonzero, main echoes every consumed match to standard error. */
int debug;

/* Path of the file buildre reads the DFA programs from; the -r option assigns it. */
char *refile = "/mail/lib/classify.re";

/* Upper bound on the match length that main will emit as a keyword; set by -n. */
int maxtoklen = 20;

/* One in-place stemming step; 0 after a chop or case fold, -1 when none applies. */
int trim(char*);
  17. void
  18. usage(void)
  19. {
  20. fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n");
  21. exits("usage");
  22. }
  23. void
  24. main(int argc, char **argv)
  25. {
  26. int i, hdr, n, eof, off;
  27. Dreprog *re[3];
  28. int m[3];
  29. char *p, *ep, *tag;
  30. Biobuf bout, bin;
  31. char msg[1024+1];
  32. char buf[1024];
  33. buildre(re);
  34. ARGBEGIN{
  35. case 'D':
  36. debug = 1;
  37. break;
  38. case 'n':
  39. maxtoklen = atoi(EARGF(usage()));
  40. break;
  41. case 'r':
  42. refile = EARGF(usage());
  43. break;
  44. default:
  45. usage();
  46. }ARGEND;
  47. if(argc > 1)
  48. usage();
  49. if(argc == 1){
  50. close(0);
  51. if(open(argv[0], OREAD) < 0)
  52. sysfatal("open %s: %r", argv[0]);
  53. }
  54. tag = nil;
  55. Binit(&bin, 0, OREAD);
  56. Binit(&bout, 1, OWRITE);
  57. ep = msg;
  58. p = msg;
  59. eof = 0;
  60. off = 0;
  61. hdr = 1;
  62. for(;;){
  63. /* replenish buffer */
  64. if(ep - p < 512 && !eof){
  65. if(p > msg + 1){
  66. n = ep - p;
  67. memmove(msg, p-1, ep-(p-1));
  68. off += (p-1) - msg;
  69. p = msg+1;
  70. ep = p + n;
  71. }
  72. n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
  73. if(n < 0)
  74. sysfatal("read error: %r");
  75. if(n == 0)
  76. eof = 1;
  77. ep += n;
  78. *ep = 0;
  79. }
  80. if(p >= ep)
  81. break;
  82. if(*p == 0){
  83. p++;
  84. continue;
  85. }
  86. if(hdr && p[-1]=='\n'){
  87. if(p[0]=='\n')
  88. hdr = 0;
  89. else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
  90. tag = "From*";
  91. else if(cistrncmp(p-1, "\nto:", 4) == 0)
  92. tag = "To*";
  93. else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
  94. tag = "Subject*";
  95. else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
  96. tag = "Return-Path*";
  97. else
  98. tag = nil;
  99. }
  100. m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
  101. m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
  102. m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
  103. n = m[0];
  104. if(n < m[1])
  105. n = m[1];
  106. if(n < m[2])
  107. n = m[2];
  108. if(n <= 0){
  109. fprint(2, "«%s» %.2ux", p, p[0]);
  110. sysfatal("no regexps matched at %ld", off + (p-msg));
  111. }
  112. if(m[0] >= m[1] && m[0] >= m[2]){
  113. /* "From " marks start of new message */
  114. Bprint(&bout, "*From*\n");
  115. n = m[0];
  116. hdr = 1;
  117. }else if(m[2] > 1){
  118. /* ignore */
  119. n = m[2];
  120. }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
  121. /* keyword */
  122. /* should do UTF-aware lowercasing, too much bother */
  123. /*
  124. for(i=0; i<n; i++)
  125. if('A' <= p[i] && p[i] <= 'Z')
  126. p[i] += 'a' - 'A';
  127. */
  128. if(tag){
  129. i = strlen(tag);
  130. memmove(buf, tag, i);
  131. memmove(buf+i, p, m[1]);
  132. buf[i+m[1]] = 0;
  133. }else{
  134. memmove(buf, p, m[1]);
  135. buf[m[1]] = 0;
  136. }
  137. Bprint(&bout, "%s\n", buf);
  138. while(trim(buf) >= 0)
  139. Bprint(&bout, "stem*%s\n", buf);
  140. n = m[1];
  141. }else
  142. n = m[2];
  143. if(debug)
  144. fprint(2, "%.*s¦", utfnlen(p, n), p);
  145. p += n;
  146. }
  147. Bterm(&bout);
  148. exits(0);
  149. }
  150. void
  151. buildre(Dreprog *re[3])
  152. {
  153. Biobuf *b;
  154. if((b = Bopen(refile, OREAD)) == nil)
  155. sysfatal("open %s: %r", refile);
  156. re[0] = Breaddfa(b);
  157. re[1] = Breaddfa(b);
  158. re[2] = Breaddfa(b);
  159. if(re[0]==nil || re[1]==nil || re[2]==nil)
  160. sysfatal("Breaddfa: %r");
  161. Bterm(b);
  162. }
  163. /* perhaps this belongs in the tokenizer */
  164. int
  165. trim(char *s)
  166. {
  167. char *p, *op;
  168. int mix, mix1;
  169. if(*s == '*')
  170. return -1;
  171. /* strip leading punctuation */
  172. p = strchr(s, '*');
  173. if(p == nil)
  174. p = s;
  175. while(*p && !isalpha(*p))
  176. p++;
  177. if(strlen(p) < 2)
  178. {
  179. return -1;
  180. }
  181. memmove(s, p, strlen(p)+1);
  182. /* strip suffix of punctuation */
  183. p = s+strlen(s);
  184. op = p;
  185. while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))
  186. p--;
  187. /* chop punctuation */
  188. if(p > s){
  189. /* free!!! -> free! */
  190. if(p+1 < op){
  191. p[1] = 0;
  192. return 0;
  193. }
  194. /* free! -> free */
  195. if(p < op){
  196. p[0] = 0;
  197. return 0;
  198. }
  199. }
  200. mix = mix1 = 0;
  201. if(isupper(s[0]))
  202. mix = 1;
  203. for(p=s+1; *p; p++)
  204. if(isupper(*p)){
  205. mix1 = 1;
  206. break;
  207. }
  208. /* turn FREE into Free */
  209. if(mix1){
  210. for(p=s+1; *p; p++)
  211. if(isupper(*p))
  212. *p += 'a'-'A';
  213. return 0;
  214. }
  215. /* turn Free into free */
  216. if(mix){
  217. *s += 'a'-'A';
  218. return 0;
  219. }
  220. return -1;
  221. }