common.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <regexp.h>
  5. #include "spam.h"
  /* Sizes used while reading and scanning a message. */
  enum {
      Quanta  = 8192,     /* bytes requested per Bread() call in readmsg() */
      Minbody = 6000,     /* readmsg() tries to have this many body bytes on hand */
      HdrMax  = 15,       /* size of endofhdr()'s scratch buffer for one header keyword */
  };
  /* A literal string paired with its length in bytes. */
  typedef struct word {
      char *string;
      int n;
  } Word;

  /* An action name paired with the action code it stands for. */
  typedef struct keyword {
      char *string;
      int value;
  } Keyword;
  21. Word htmlcmds[] =
  22. {
  23. "html", 4,
  24. "!doctype html", 13,
  25. 0,
  26. };
  27. Word hrefs[] =
  28. {
  29. "a href=", 7,
  30. "a title=", 8,
  31. "a target=", 9,
  32. "base href=", 10,
  33. "img src=", 8,
  34. "img border=", 11,
  35. "form action=", 12,
  36. "!--", 3,
  37. 0,
  38. };
  39. /*
  40. * RFC822 header keywords to look for for fractured header.
  41. * all lengths must be less than HdrMax defined above.
  42. */
  43. Word hdrwords[] =
  44. {
  45. "cc:", 3,
  46. "bcc:", 4,
  47. "to:", 3,
  48. 0, 0,
  49. };
  50. Keyword keywords[] =
  51. {
  52. "header", HoldHeader,
  53. "line", SaveLine,
  54. "hold", Hold,
  55. "dump", Dump,
  56. "loff", Lineoff,
  57. 0, Nactions,
  58. };
  59. Patterns patterns[] = {
  60. [Dump] { "DUMP:", 0, 0 },
  61. [HoldHeader] { "HEADER:", 0, 0 },
  62. [Hold] { "HOLD:", 0, 0 },
  63. [SaveLine] { "LINE:", 0, 0 },
  64. [Lineoff] { "LINEOFF:", 0, 0 },
  65. [Nactions] { 0, 0, 0 },
  66. };
  67. static char* endofhdr(char*, char*);
  68. static int escape(char**);
  69. static int extract(char*);
  70. static int findkey(char*);
  71. static int hash(int);
  72. static int isword(Word*, char*, int);
  73. static void parsealt(Biobuf*, char*, Spat**);
  74. /*
  75. * The canonicalizer: convert input to canonical representation
  76. */
  77. char*
  78. readmsg(Biobuf *bp, int *hsize, int *bufsize)
  79. {
  80. char *p, *buf;
  81. int n, offset, eoh, bsize, delta;
  82. buf = 0;
  83. offset = 0;
  84. if(bufsize)
  85. *bufsize = 0;
  86. if(hsize)
  87. *hsize = 0;
  88. for(;;) {
  89. buf = Realloc(buf, offset+Quanta+1);
  90. n = Bread(bp, buf+offset, Quanta);
  91. if(n < 0){
  92. free(buf);
  93. return 0;
  94. }
  95. p = buf+offset; /* start of this chunk */
  96. offset += n; /* end of this chunk */
  97. buf[offset] = 0;
  98. if(n == 0){
  99. if(offset == 0)
  100. return 0;
  101. break;
  102. }
  103. if(hsize == 0) /* don't process header */
  104. break;
  105. if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */
  106. p--;
  107. p = endofhdr(p, buf+offset);
  108. if(p)
  109. break;
  110. if(offset >= Maxread) /* gargantuan header - just punt*/
  111. {
  112. if(hsize)
  113. *hsize = offset;
  114. if(bufsize)
  115. *bufsize = offset;
  116. return buf;
  117. }
  118. }
  119. eoh = p-buf; /* End of header */
  120. bsize = offset - eoh; /* amount of body already read */
  121. /* Read at least Minbody bytes of the body */
  122. if (bsize < Minbody){
  123. delta = Minbody-bsize;
  124. buf = Realloc(buf, offset+delta+1);
  125. n = Bread(bp, buf+offset, delta);
  126. if(n > 0) {
  127. offset += n;
  128. buf[offset] = 0;
  129. }
  130. }
  131. if(hsize)
  132. *hsize = eoh;
  133. if(bufsize)
  134. *bufsize = offset;
  135. return buf;
  136. }
  137. static int
  138. isword(Word *wp, char *text, int len)
  139. {
  140. for(;wp->string; wp++)
  141. if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
  142. return 1;
  143. return 0;
  144. }
  145. static char*
  146. endofhdr(char *raw, char *end)
  147. {
  148. int i;
  149. char *p, *q;
  150. char buf[HdrMax];
  151. /*
  152. * can't use strchr to search for newlines because
  153. * there may be embedded NULL's.
  154. */
  155. for(p = raw; p < end; p++){
  156. if(*p != '\n' || p[1] != '\n')
  157. continue;
  158. p++;
  159. for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
  160. buf[i++] = tolower(*q);
  161. if(*q == ':' || *q == '\n')
  162. break;
  163. }
  164. if(!isword(hdrwords, buf, i))
  165. return p+1;
  166. }
  167. return 0;
  168. }
  169. static int
  170. htmlmatch(Word *wp, char *text, char *end, int *n)
  171. {
  172. char *cp;
  173. int i, c, lastc;
  174. char buf[MaxHtml];
  175. /*
  176. * extract a string up to '>'
  177. */
  178. i = lastc = 0;
  179. cp = text;
  180. while (cp < end && i < sizeof(buf)-1){
  181. c = *cp++;
  182. if(c == '=')
  183. c = escape(&cp);
  184. switch(c){
  185. case 0:
  186. case '\r':
  187. continue;
  188. case '>':
  189. goto out;
  190. case '\n':
  191. case ' ':
  192. case '\t':
  193. if(lastc == ' ')
  194. continue;
  195. c = ' ';
  196. break;
  197. default:
  198. c = tolower(c);
  199. break;
  200. }
  201. buf[i++] = lastc = c;
  202. }
  203. out:
  204. buf[i] = 0;
  205. if(n)
  206. *n = cp-text;
  207. return isword(wp, buf, i);
  208. }
  /*
   * Decode what follows an '=' in the text.  On entry *msg points at
   * the byte after the '='; on return it points past whatever was
   * consumed.
   *
   *  "=\n"  - the newline is skipped and the following byte is
   *           consumed and returned
   *  "=2e", "=2f", "=20" (hex digit in either case)
   *         - consumed; returns '.', '/' or ' '
   *  "=3d"  - consumed; returns '='
   *  anything else - nothing consumed; returns '='
   */
  static int
  escape(char **msg)
  {
      char *s;
      int r;

      s = *msg;
      r = '=';
      switch(*s){
      case '\n':
          r = s[1];
          s += 2;
          break;
      case '2':
          switch(tolower(s[1])){
          case 'e':
              r = '.';
              s += 2;
              break;
          case 'f':
              r = '/';
              s += 2;
              break;
          case '0':
              r = ' ';
              s += 2;
              break;
          default:
              break;
          }
          break;
      case '3':
          if(tolower(s[1]) == 'd')
              s += 2;
          break;
      default:
          break;
      }
      *msg = s;
      return r;
  }
  243. static int
  244. htmlchk(char **msg, char *end)
  245. {
  246. int n;
  247. char *p;
  248. static int ishtml;
  249. p = *msg;
  250. if(ishtml == 0){
  251. ishtml = htmlmatch(htmlcmds, p, end, &n);
  252. /* If not an HTML keyword, check if it's
  253. * an HTML comment (<!comment>). if so,
  254. * skip over it; otherwise copy it in.
  255. */
  256. if(ishtml == 0 && *p != '!') /* not comment */
  257. return '<'; /* copy it */
  258. } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */
  259. return '<'; /* copy it */
  260. /*
  261. * this is an uninteresting HTML command; skip over it.
  262. */
  263. p += n;
  264. *msg = p+1;
  265. return *p;
  266. }
  267. /*
  268. * decode a base 64 encode body
  269. */
  270. void
  271. conv64(char *msg, char *end, char *buf, int bufsize)
  272. {
  273. int len, i;
  274. char *cp;
  275. len = end - msg;
  276. i = (len*3)/4+1; // room for max chars + null
  277. cp = Malloc(i);
  278. len = dec64((uchar*)cp, i, msg, len);
  279. convert(cp, cp+len, buf, bufsize, 1);
  280. free(cp);
  281. }
  /*
   * Copy msg..end into buf in canonical form: NUL and '\r' are dropped,
   * each run of blanks, tabs and newlines becomes a single blank, and
   * letters are lower-cased.  When isbody is non-zero, HTML tags are
   * filtered through htmlchk() and "=" escapes decoded with escape().
   *
   * At most bufsize characters are stored, followed by a terminating
   * NUL, so buf must have room for bufsize+1 bytes.
   *
   * Returns 1 if, with isbody zero, the text contained
   * "content-transfer-encoding: base64"; otherwise 0.
   */
  int
  convert(char *msg, char *end, char *buf, int bufsize, int isbody)
  {
      char *mark;
      int ch, prev, isb64;

      prev = 0;
      isb64 = 0;
      while(msg < end && bufsize > 0){
          ch = *msg++;

          /*
           * In the body only, try to strip most HTML and replace
           * certain MIME escape sequences with the character.
           * Repeat for as long as a helper consumed input.
           */
          if(isbody){
              do{
                  mark = msg;
                  if(ch == '<')
                      ch = htmlchk(&msg, end);
                  if(ch == '=')
                      ch = escape(&msg);
              }while(mark != msg && mark < end);
          }

          if(ch == 0 || ch == '\r')
              continue;
          if(ch == '\t' || ch == ' ' || ch == '\n'){
              if(prev == ' ')
                  continue;
              ch = ' ';
          }else if(ch == 'C' || ch == 'c'){
              /* check for MIME base 64 encoding in header */
              if(isbody == 0)
              if(msg < end-32 && msg[0] == 'o' && msg[1] == 'n')
              if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
                  isb64 = 1;
              ch = 'c';
          }else
              ch = tolower(ch);

          *buf++ = ch;
          prev = ch;
          bufsize--;
      }
      *buf = 0;
      return isb64;
  }
  334. /*
  335. * The pattern parser: build data structures from the pattern file
  336. */
  /*
   * Bucket index for a character: its low seven bits, so the result
   * is always in the range 0..127.
   */
  static int
  hash(int c)
  {
      return c & 0x7f;
  }
  342. static int
  343. findkey(char *val)
  344. {
  345. Keyword *kp;
  346. for(kp = keywords; kp->string; kp++)
  347. if(strcmp(val, kp->string) == 0)
  348. break;
  349. return kp->value;
  350. }
  /* True for a blank or a tab.  The argument may be evaluated twice. */
  #define whitespace(c)   (((c) == ' ') || ((c) == '\t'))
  352. void
  353. parsepats(Biobuf *bp)
  354. {
  355. Pattern *p, *new;
  356. char *cp, *qp;
  357. int type, action, n, h;
  358. Spat *spat;
  359. for(;;){
  360. cp = Brdline(bp, '\n');
  361. if(cp == 0)
  362. break;
  363. cp[Blinelen(bp)-1] = 0;
  364. while(*cp == ' ' || *cp == '\t')
  365. cp++;
  366. if(*cp == '#' || *cp == 0)
  367. continue;
  368. type = regexp;
  369. if(*cp == '*'){
  370. type = string;
  371. cp++;
  372. }
  373. qp = strchr(cp, ':');
  374. if(qp == 0)
  375. continue;
  376. *qp = 0;
  377. if(debug)
  378. fprint(2, "action = %s\n", cp);
  379. action = findkey(cp);
  380. if(action >= Nactions)
  381. continue;
  382. cp = qp+1;
  383. n = extract(cp);
  384. if(n <= 0 || *cp == 0)
  385. continue;
  386. qp = strstr(cp, "~~");
  387. if(qp){
  388. *qp = 0;
  389. n = strlen(cp);
  390. }
  391. if(debug)
  392. fprint(2, " Pattern: `%s'\n", cp);
  393. /* Hook regexps into a chain */
  394. if(type == regexp) {
  395. new = Malloc(sizeof(Pattern));
  396. new->action = action;
  397. new->pat = regcomp(cp);
  398. if(new->pat == 0){
  399. free(new);
  400. continue;
  401. }
  402. new->type = regexp;
  403. new->alt = 0;
  404. new->next = 0;
  405. if(qp)
  406. parsealt(bp, qp+2, &new->alt);
  407. new->next = patterns[action].regexps;
  408. patterns[action].regexps = new;
  409. continue;
  410. }
  411. /* not a Regexp - hook strings into Pattern hash chain */
  412. spat = Malloc(sizeof(*spat));
  413. spat->next = 0;
  414. spat->alt = 0;
  415. spat->len = n;
  416. spat->string = Malloc(n+1);
  417. spat->c1 = cp[1];
  418. strcpy(spat->string, cp);
  419. if(qp)
  420. parsealt(bp, qp+2, &spat->alt);
  421. p = patterns[action].strings;
  422. if(p == 0) {
  423. p = Malloc(sizeof(Pattern));
  424. memset(p, 0, sizeof(*p));
  425. p->action = action;
  426. p->type = string;
  427. patterns[action].strings = p;
  428. }
  429. h = hash(*spat->string);
  430. spat->next = p->spat[h];
  431. p->spat[h] = spat;
  432. }
  433. }
  434. static void
  435. parsealt(Biobuf *bp, char *cp, Spat** head)
  436. {
  437. char *p;
  438. Spat *alt;
  439. while(cp){
  440. if(*cp == 0){ /*escaped newline*/
  441. do{
  442. cp = Brdline(bp, '\n');
  443. if(cp == 0)
  444. return;
  445. cp[Blinelen(bp)-1] = 0;
  446. } while(extract(cp) <= 0 || *cp == 0);
  447. }
  448. p = cp;
  449. cp = strstr(p, "~~");
  450. if(cp){
  451. *cp = 0;
  452. cp += 2;
  453. }
  454. if(strlen(p)){
  455. alt = Malloc(sizeof(*alt));
  456. alt->string = strdup(p);
  457. alt->next = *head;
  458. *head = alt;
  459. }
  460. }
  461. }
  /*
   * Clean up one pattern field in place and return its new length.
   *
   * Leading blanks and tabs are dropped, upper-case ASCII letters are
   * lower-cased, and an unquoted '#' ends the field.  Text inside
   * double quotes is kept as is apart from case (\" stands for a
   * literal quote).  Trailing blanks and tabs are trimmed, but never
   * back into a quoted string.
   */
  static int
  extract(char *cp)
  {
      char *src, *dst, *keep;
      int c;

      src = cp;
      dst = cp;
      keep = cp;      /* trimming may not back up past here */
      while(whitespace(*src))
          src++;
      for(;;){
          c = *src++;
          if(c == 0 || c == '#')
              break;
          if(c != '"'){
              if(c >= 'A' && c <= 'Z')
                  c += 'a'-'A';
              *dst++ = c;
              continue;
          }
          /* quoted string: copy up to the closing quote */
          for(; *src != 0 && *src != '"'; src++){
              if(src[0] == '\\' && src[1] == '"')
                  src++;
              c = *src;
              if(c >= 'A' && c <= 'Z')
                  c += 'a'-'A';
              *dst++ = c;
          }
          if(*src != 0)
              src++;
          keep = dst;     /* never back up over a quoted string */
      }
      while(dst > keep && whitespace(dst[-1]))
          dst--;
      *dst = 0;
      return dst-cp;
  }
  496. /*
  497. * The matching engine: compare canonical input to pattern structures
  498. */
  499. static Spat*
  500. isalt(char *message, Spat *alt)
  501. {
  502. while(alt) {
  503. if(*cmd)
  504. if(message != cmd && strstr(cmd, alt->string))
  505. break;
  506. if(message != header+1 && strstr(header+1, alt->string))
  507. break;
  508. if(strstr(message, alt->string))
  509. break;
  510. alt = alt->next;
  511. }
  512. return alt;
  513. }
  514. int
  515. matchpat(Pattern *p, char *message, Resub *m)
  516. {
  517. Spat *spat;
  518. char *s;
  519. int c, c1;
  520. if(p->type == string){
  521. c1 = *message;
  522. for(s=message; c=c1; s++){
  523. c1 = s[1];
  524. for(spat=p->spat[hash(c)]; spat; spat=spat->next){
  525. if(c1 == spat->c1)
  526. if(memcmp(s, spat->string, spat->len) == 0)
  527. if(!isalt(message, spat->alt)){
  528. m->sp = s;
  529. m->ep = s + spat->len;
  530. return 1;
  531. }
  532. }
  533. }
  534. return 0;
  535. }
  536. m->sp = m->ep = 0;
  537. if(regexec(p->pat, message, m, 1) == 0)
  538. return 0;
  539. if(isalt(message, p->alt))
  540. return 0;
  541. return 1;
  542. }
  543. void
  544. xprint(int fd, char *type, Resub *m)
  545. {
  546. char *p, *q;
  547. int i;
  548. if(m->sp == 0 || m->ep == 0)
  549. return;
  550. /* back up approx 30 characters to whitespace */
  551. for(p = m->sp, i = 0; *p && i < 30; i++, p--)
  552. ;
  553. while(*p && *p != ' ')
  554. p--;
  555. p++;
  556. /* grab about 30 more chars beyond the end of the match */
  557. for(q = m->ep, i = 0; *q && i < 30; i++, q++)
  558. ;
  559. while(*q && *q != ' ')
  560. q++;
  561. fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
  562. }
  /* Marker stored in t64d for bytes that are not base-64 digits. */
  enum {
      INVAL = 255,
  };
  566. static uchar t64d[256] = {
  567. /*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  568. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  569. /*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  570. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  571. /*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  572. INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
  573. /*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
  574. 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  575. /*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
  576. 7, 8, 9, 10, 11, 12, 13, 14,
  577. /*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
  578. 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
  579. /*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
  580. 33, 34, 35, 36, 37, 38, 39, 40,
  581. /*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
  582. 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
  583. /*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  584. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  585. /*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  586. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  587. /*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  588. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  589. /*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  590. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  591. /*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  592. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  593. /*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  594. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  595. /*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  596. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  597. /*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  598. INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
  599. };