123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <regexp.h>
- #include "spam.h"
/*
 * Sizes used while reading a message.
 */
enum {
	Quanta	= 8*1024,	/* bytes requested per Bread in readmsg */
	Minbody	= 6000,		/* readmsg tries to have at least this much body */
	HdrMax	= 15,		/* scratch buffer size for a header keyword in endofhdr */
};
/* A literal prefix and its length in bytes. */
typedef struct word {
	char	*string;
	int	n;
} Word;

/* A pattern-file action name and the action code it maps to. */
typedef struct keyword {
	char	*string;
	int	value;
} Keyword;
- Word htmlcmds[] =
- {
- "html", 4,
- "!doctype html", 13,
- 0,
- };
- Word hrefs[] =
- {
- "a href=", 7,
- "a title=", 8,
- "a target=", 9,
- "base href=", 10,
- "img src=", 8,
- "img border=", 11,
- "form action=", 12,
- "!--", 3,
- 0,
- };
- /*
- * RFC822 header keywords to look for for fractured header.
- * all lengths must be less than HdrMax defined above.
- */
- Word hdrwords[] =
- {
- "cc:", 3,
- "bcc:", 4,
- "to:", 3,
- 0, 0,
- };
- Keyword keywords[] =
- {
- "header", HoldHeader,
- "line", SaveLine,
- "hold", Hold,
- "dump", Dump,
- "loff", Lineoff,
- 0, Nactions,
- };
- Patterns patterns[] = {
- [Dump] { "DUMP:", 0, 0 },
- [HoldHeader] { "HEADER:", 0, 0 },
- [Hold] { "HOLD:", 0, 0 },
- [SaveLine] { "LINE:", 0, 0 },
- [Lineoff] { "LINEOFF:", 0, 0 },
- [Nactions] { 0, 0, 0 },
- };
- static char* endofhdr(char*, char*);
- static int escape(char**);
- static int extract(char*);
- static int findkey(char*);
- static int hash(int);
- static int isword(Word*, char*, int);
- static void parsealt(Biobuf*, char*, Spat**);
- /*
- * The canonicalizer: convert input to canonical representation
- */
- char*
- readmsg(Biobuf *bp, int *hsize, int *bufsize)
- {
- char *p, *buf;
- int n, offset, eoh, bsize, delta;
- buf = 0;
- offset = 0;
- if(bufsize)
- *bufsize = 0;
- if(hsize)
- *hsize = 0;
- for(;;) {
- buf = Realloc(buf, offset+Quanta+1);
- n = Bread(bp, buf+offset, Quanta);
- if(n < 0){
- free(buf);
- return 0;
- }
- p = buf+offset; /* start of this chunk */
- offset += n; /* end of this chunk */
- buf[offset] = 0;
- if(n == 0){
- if(offset == 0)
- return 0;
- break;
- }
- if(hsize == 0) /* don't process header */
- break;
- if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */
- p--;
- p = endofhdr(p, buf+offset);
- if(p)
- break;
- if(offset >= Maxread) /* gargantuan header - just punt*/
- {
- if(hsize)
- *hsize = offset;
- if(bufsize)
- *bufsize = offset;
- return buf;
- }
- }
- eoh = p-buf; /* End of header */
- bsize = offset - eoh; /* amount of body already read */
- /* Read at least Minbody bytes of the body */
- if (bsize < Minbody){
- delta = Minbody-bsize;
- buf = Realloc(buf, offset+delta+1);
- n = Bread(bp, buf+offset, delta);
- if(n > 0) {
- offset += n;
- buf[offset] = 0;
- }
- }
- if(hsize)
- *hsize = eoh;
- if(bufsize)
- *bufsize = offset;
- return buf;
- }
- static int
- isword(Word *wp, char *text, int len)
- {
- for(;wp->string; wp++)
- if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
- return 1;
- return 0;
- }
- static char*
- endofhdr(char *raw, char *end)
- {
- int i;
- char *p, *q;
- char buf[HdrMax];
- /*
- * can't use strchr to search for newlines because
- * there may be embedded NULL's.
- */
- for(p = raw; p < end; p++){
- if(*p != '\n' || p[1] != '\n')
- continue;
- p++;
- for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
- buf[i++] = tolower(*q);
- if(*q == ':' || *q == '\n')
- break;
- }
- if(!isword(hdrwords, buf, i))
- return p+1;
- }
- return 0;
- }
- static int
- htmlmatch(Word *wp, char *text, char *end, int *n)
- {
- char *cp;
- int i, c, lastc;
- char buf[MaxHtml];
- /*
- * extract a string up to '>'
- */
- i = lastc = 0;
- cp = text;
- while (cp < end && i < sizeof(buf)-1){
- c = *cp++;
- if(c == '=')
- c = escape(&cp);
- switch(c){
- case 0:
- case '\r':
- continue;
- case '>':
- goto out;
- case '\n':
- case ' ':
- case '\t':
- if(lastc == ' ')
- continue;
- c = ' ';
- break;
- default:
- c = tolower(c);
- break;
- }
- buf[i++] = lastc = c;
- }
- out:
- buf[i] = 0;
- if(n)
- *n = cp-text;
- return isword(wp, buf, i);
- }
/*
 * Decode the text that follows an '=' character.  *msg points just
 * past the '='; on return it points past whatever was consumed.
 *
 *	=<newline>c	returns c, consuming the newline and c
 *	=2e, =2f, =20	return '.', '/' and ' ' (hex letter in either case)
 *	=3d		returns '=', consuming the two hex digits
 *	anything else	returns '=' and consumes nothing
 */
static int
escape(char **msg)
{
	char *s;
	int ch;

	s = *msg;
	switch(*s){
	case '\n':
		ch = s[1];
		s += 2;
		break;
	case '2':
		switch(tolower(s[1])){
		case 'e':
			ch = '.';
			s += 2;
			break;
		case 'f':
			ch = '/';
			s += 2;
			break;
		case '0':
			ch = ' ';
			s += 2;
			break;
		default:
			ch = '=';
			break;
		}
		break;
	case '3':
		if(tolower(s[1]) == 'd')
			s += 2;
		ch = '=';
		break;
	default:
		ch = '=';
		break;
	}
	*msg = s;
	return ch;
}
- static int
- htmlchk(char **msg, char *end)
- {
- int n;
- char *p;
- static int ishtml;
- p = *msg;
- if(ishtml == 0){
- ishtml = htmlmatch(htmlcmds, p, end, &n);
-
- /* If not an HTML keyword, check if it's
- * an HTML comment (<!comment>). if so,
- * skip over it; otherwise copy it in.
- */
- if(ishtml == 0 && *p != '!') /* not comment */
- return '<'; /* copy it */
- } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */
- return '<'; /* copy it */
-
- /*
- * this is an uninteresting HTML command; skip over it.
- */
- p += n;
- *msg = p+1;
- return *p;
- }
- /*
- * decode a base 64 encode body
- */
- void
- conv64(char *msg, char *end, char *buf, int bufsize)
- {
- int len, i;
- char *cp;
- len = end - msg;
- i = (len*3)/4+1; // room for max chars + null
- cp = Malloc(i);
- len = dec64((uchar*)cp, i, msg, len);
- convert(cp, cp+len, buf, bufsize, 1);
- free(cp);
- }
/*
 * Copy the bytes [msg, end) into buf in canonical form, writing at
 * most bufsize characters followed by a terminating NUL (so buf
 * must have room for bufsize+1 bytes).
 *
 * NUL and '\r' are dropped, runs of blanks, tabs and newlines become
 * one space, and letters are lower-cased.  When isbody is set, '<'
 * is handed to htmlchk and '=' to escape first, repeatedly, for as
 * long as they keep consuming input.  When isbody is clear, the text
 * is also checked for "content-transfer-encoding: base64".
 *
 * Returns 1 if that header was seen, otherwise 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *mark;
	int ch, prev, sawb64;

	prev = 0;
	sawb64 = 0;
	while(msg < end && bufsize > 0){
		ch = *msg++;

		/*
		 * In the body only, try to strip most HTML and
		 * replace certain MIME escape sequences with the character
		 */
		if(isbody){
			do{
				mark = msg;
				if(ch == '<')
					ch = htmlchk(&msg, end);
				if(ch == '=')
					ch = escape(&msg);
			}while(mark != msg && mark < end);
		}

		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\t' || ch == ' ' || ch == '\n'){
			if(prev == ' ')
				continue;
			ch = ' ';
		}else if(ch == 'C' || ch == 'c'){
			/* check for MIME base 64 encoding in header */
			if(isbody == 0 && msg < end-32 && *msg == 'o' && msg[1] == 'n'
			&& cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				sawb64 = 1;
			ch = 'c';
		}else
			ch = tolower(ch);

		*buf++ = ch;
		prev = ch;
		bufsize--;
	}
	*buf = 0;
	return sawb64;
}
/*
 * The pattern parser: build data structures from the pattern file
 */

/*
 * Bucket index for a string pattern: the low seven bits of its
 * first character, so the result is always in 0..127.
 */
static int
hash(int c)
{
	int bucket;

	bucket = c & 0x7f;
	return bucket;
}
- static int
- findkey(char *val)
- {
- Keyword *kp;
- for(kp = keywords; kp->string; kp++)
- if(strcmp(val, kp->string) == 0)
- break;
- return kp->value;
- }
/* True for a blank or a tab.  The argument is evaluated twice. */
#define whitespace(ch)	(((ch) == ' ') || ((ch) == '\t'))
- void
- parsepats(Biobuf *bp)
- {
- Pattern *p, *new;
- char *cp, *qp;
- int type, action, n, h;
- Spat *spat;
- for(;;){
- cp = Brdline(bp, '\n');
- if(cp == 0)
- break;
- cp[Blinelen(bp)-1] = 0;
- while(*cp == ' ' || *cp == '\t')
- cp++;
- if(*cp == '#' || *cp == 0)
- continue;
- type = regexp;
- if(*cp == '*'){
- type = string;
- cp++;
- }
- qp = strchr(cp, ':');
- if(qp == 0)
- continue;
- *qp = 0;
- if(debug)
- fprint(2, "action = %s\n", cp);
- action = findkey(cp);
- if(action >= Nactions)
- continue;
- cp = qp+1;
- n = extract(cp);
- if(n <= 0 || *cp == 0)
- continue;
- qp = strstr(cp, "~~");
- if(qp){
- *qp = 0;
- n = strlen(cp);
- }
- if(debug)
- fprint(2, " Pattern: `%s'\n", cp);
- /* Hook regexps into a chain */
- if(type == regexp) {
- new = Malloc(sizeof(Pattern));
- new->action = action;
- new->pat = regcomp(cp);
- if(new->pat == 0){
- free(new);
- continue;
- }
- new->type = regexp;
- new->alt = 0;
- new->next = 0;
- if(qp)
- parsealt(bp, qp+2, &new->alt);
- new->next = patterns[action].regexps;
- patterns[action].regexps = new;
- continue;
- }
- /* not a Regexp - hook strings into Pattern hash chain */
- spat = Malloc(sizeof(*spat));
- spat->next = 0;
- spat->alt = 0;
- spat->len = n;
- spat->string = Malloc(n+1);
- spat->c1 = cp[1];
- strcpy(spat->string, cp);
- if(qp)
- parsealt(bp, qp+2, &spat->alt);
- p = patterns[action].strings;
- if(p == 0) {
- p = Malloc(sizeof(Pattern));
- memset(p, 0, sizeof(*p));
- p->action = action;
- p->type = string;
- patterns[action].strings = p;
- }
- h = hash(*spat->string);
- spat->next = p->spat[h];
- p->spat[h] = spat;
- }
- }
- static void
- parsealt(Biobuf *bp, char *cp, Spat** head)
- {
- char *p;
- Spat *alt;
- while(cp){
- if(*cp == 0){ /*escaped newline*/
- do{
- cp = Brdline(bp, '\n');
- if(cp == 0)
- return;
- cp[Blinelen(bp)-1] = 0;
- } while(extract(cp) <= 0 || *cp == 0);
- }
- p = cp;
- cp = strstr(p, "~~");
- if(cp){
- *cp = 0;
- cp += 2;
- }
- if(strlen(p)){
- alt = Malloc(sizeof(*alt));
- alt->string = strdup(p);
- alt->next = *head;
- *head = alt;
- }
- }
- }
/*
 * Clean up a pattern in place and return its new length.
 *
 * Leading blanks and tabs are dropped, upper-case ASCII letters
 * are lower-cased, and text from an unquoted '#' to the end is
 * discarded.  Double quotes are removed and the text between them
 * is kept as is apart from case folding; inside quotes \" stands
 * for a literal quote.  Trailing blanks and tabs are trimmed, but
 * never back into text that came from a quoted string.
 */
static int
extract(char *cp)
{
	char *src, *dst, *keep;
	int ch;

	src = dst = keep = cp;
	while(whitespace(*src))
		src++;

	for(;;){
		ch = *src++;
		if(ch == 0 || ch == '#')
			break;
		if(ch != '"'){
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
			continue;
		}

		/* quoted string: copy up to the closing quote */
		for(; *src && *src != '"'; src++){
			if(*src == '\\' && src[1] == '"')
				src++;
			if(*src >= 'A' && *src <= 'Z')
				*dst++ = *src + ('a'-'A');
			else
				*dst++ = *src;
		}
		if(*src)
			src++;
		keep = dst;	/* never back up over a quoted string */
	}

	while(dst > keep && whitespace(dst[-1]))
		dst--;
	*dst = 0;
	return dst-cp;
}
- /*
- * The matching engine: compare canonical input to pattern structures
- */
- static Spat*
- isalt(char *message, Spat *alt)
- {
- while(alt) {
- if(*cmd)
- if(message != cmd && strstr(cmd, alt->string))
- break;
- if(message != header+1 && strstr(header+1, alt->string))
- break;
- if(strstr(message, alt->string))
- break;
- alt = alt->next;
- }
- return alt;
- }
- int
- matchpat(Pattern *p, char *message, Resub *m)
- {
- Spat *spat;
- char *s;
- int c, c1;
- if(p->type == string){
- c1 = *message;
- for(s=message; c=c1; s++){
- c1 = s[1];
- for(spat=p->spat[hash(c)]; spat; spat=spat->next){
- if(c1 == spat->c1)
- if(memcmp(s, spat->string, spat->len) == 0)
- if(!isalt(message, spat->alt)){
- m->sp = s;
- m->ep = s + spat->len;
- return 1;
- }
- }
- }
- return 0;
- }
- m->sp = m->ep = 0;
- if(regexec(p->pat, message, m, 1) == 0)
- return 0;
- if(isalt(message, p->alt))
- return 0;
- return 1;
- }
- void
- xprint(int fd, char *type, Resub *m)
- {
- char *p, *q;
- int i;
- if(m->sp == 0 || m->ep == 0)
- return;
- /* back up approx 30 characters to whitespace */
- for(p = m->sp, i = 0; *p && i < 30; i++, p--)
- ;
- while(*p && *p != ' ')
- p--;
- p++;
- /* grab about 30 more chars beyond the end of the match */
- for(q = m->ep, i = 0; *q && i < 30; i++, q++)
- ;
- while(*q && *q != ' ')
- q++;
- fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
- }
/* Entry in t64d for a byte that is not a base-64 digit. */
enum {
	INVAL = 0xff
};
- static uchar t64d[256] = {
- /*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
- /*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
- 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 11, 12, 13, 14,
- /*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
- 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
- 33, 34, 35, 36, 37, 38, 39, 40,
- /*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
- 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- /*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
- };
|