123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include <draw.h>
- #include <regexp.h>
- #include <html.h>
- #include <ctype.h>
- #include "dat.h"
- char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
- Reprog *urlprog;
- int inword = 0;
- int col = 0;
- int wordi = 0;
- char*
- loadhtml(int fd)
- {
- URLwin *u;
- Bytes *b;
- int n;
- char buf[4096];
- u = emalloc(sizeof(URLwin));
- u->infd = fd;
- u->outfd = 1;
- u->url = estrdup(url);
- u->type = TextHtml;
- b = emalloc(sizeof(Bytes));
- while((n = read(fd, buf, sizeof buf)) > 0)
- growbytes(b, buf, n);
- if(b->b == nil)
- return nil; /* empty file */
- rendertext(u, b);
- freeurlwin(u);
- return nil;
- }
- char*
- runetobyte(Rune *r, int n)
- {
- char *s;
- if(n == 0)
- return emalloc(1);
- s = smprint("%.*S", n, r);
- if(s == nil)
- error("malloc failed");
- return s;
- }
/*
 * Report whether c is punctuation that closes or trails a word, so
 * that emitword does not put a space in front of it.
 * Returns 1 for one of  . , : ; ' " ) ] } > ! ?  and 0 otherwise.
 *
 * c is compared as a full integer.  A strchr lookup would convert it
 * to char first, so a rune such as U+012E (low byte '.') would be
 * taken for punctuation, and 0 would match the set's terminating NUL.
 */
int
closingpunct(int c)
{
	switch(c){
	case '.':
	case ',':
	case ':':
	case ';':
	case '\'':
	case '"':
	case ')':
	case ']':
	case '}':
	case '>':
	case '!':
	case '?':
		return 1;
	default:
		return 0;
	}
}
- void
- emitword(Bytes *b, Rune *r, int nr)
- {
- char *s;
- int space;
- if(nr == 0)
- return;
- s = smprint("%.*S", nr, r);
- space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
- if(col>0 && col+space+nr > width){
- growbytes(b, "\n", 1);
- space = 0;
- col = 0;
- }
- if(space && col>0){
- growbytes(b, " ", 1);
- col++;
- }
- growbytes(b, s, strlen(s));
- col += nr;
- free(s);
- inword = 0;
- }
- void
- renderrunes(Bytes *b, Rune *r)
- {
- int i, n;
- n = runestrlen(r);
- for(i=0; i<n; i++){
- switch(r[i]){
- case '\n':
- if(inword)
- emitword(b, r+wordi, i-wordi);
- col = 0;
- if(b->n == 0)
- break; /* don't start with blank lines */
- if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
- growbytes(b, "\n", 1);
- break;
- case ' ':
- if(inword)
- emitword(b, r+wordi, i-wordi);
- break;
- default:
- if(!inword)
- wordi = i;
- inword = 1;
- break;
- }
- }
- if(inword)
- emitword(b, r+wordi, i-wordi);
- }
- void
- renderbytes(Bytes *b, char *fmt, ...)
- {
- Rune *r;
- va_list arg;
- va_start(arg, fmt);
- r = runevsmprint(fmt, arg);
- va_end(arg);
- renderrunes(b, r);
- free(r);
- }
- char*
- baseurl(char *url)
- {
- char *base, *slash;
- Resub rs[10];
- if(url == nil)
- return nil;
- if(urlprog == nil){
- urlprog = regcomp(urlexpr);
- if(urlprog == nil)
- error("can't compile URL regexp");
- }
- memset(rs, 0, sizeof rs);
- if(regexec(urlprog, url, rs, nelem(rs)) == 0)
- return nil;
- base = estrdup(url);
- slash = strrchr(base, '/');
- if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
- *slash = '\0';
- else
- base[rs[0].ep-rs[0].sp] = '\0';
- return base;
- }
- char*
- fullurl(URLwin *u, Rune *rhref)
- {
- char *base, *href, *hrefbase;
- char *result;
- if(rhref == nil)
- return estrdup("NULL URL");
- href = runetobyte(rhref, runestrlen(rhref));
- hrefbase = baseurl(href);
- result = nil;
- if(hrefbase==nil && (base = baseurl(u->url))!=nil){
- result = estrdup(base);
- if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
- result = eappend(result, "/", "");
- free(base);
- }
- if(href){
- if(result)
- result = eappend(result, "", href);
- else
- result = estrdup(href);
- }
- free(hrefbase);
- if(result == nil)
- return estrdup("***unknown***");
- return result;
- }
- void
- render(URLwin *u, Bytes *t, Item *items, int curanchor)
- {
- Item *il;
- Itext *it;
- Ifloat *ifl;
- Ispacer *is;
- Itable *ita;
- Iimage *im;
- Anchor *a;
- Table *tab;
- Tablecell *cell;
- char *href;
- inword = 0;
- col = 0;
- wordi = 0;
- for(il=items; il!=nil; il=il->next){
- if(il->state & IFbrk)
- renderbytes(t, "\n");
- if(il->state & IFbrksp)
- renderbytes(t, "\n");
- switch(il->tag){
- case Itexttag:
- it = (Itext*)il;
- if(it->state & IFwrap)
- renderrunes(t, it->s);
- else
- emitword(t, it->s, runestrlen(it->s));
- break;
- case Iruletag:
- if(t->n>0 && t->b[t->n-1]!='\n')
- renderbytes(t, "\n");
- renderbytes(t, "=======\n");
- break;
- case Iimagetag:
- if(!aflag)
- break;
- im = (Iimage*)il;
- if(im->imsrc){
- href = fullurl(u, im->imsrc);
- renderbytes(t, "[image %s]", href);
- free(href);
- }
- break;
- case Iformfieldtag:
- if(aflag)
- renderbytes(t, "[formfield]");
- break;
- case Itabletag:
- ita = (Itable*)il;
- tab = ita->table;
- for(cell=tab->cells; cell!=nil; cell=cell->next){
- render(u, t, cell->content, curanchor);
- }
- if(t->n>0 && t->b[t->n-1]!='\n')
- renderbytes(t, "\n");
- break;
- case Ifloattag:
- ifl = (Ifloat*)il;
- render(u, t, ifl->item, curanchor);
- break;
- case Ispacertag:
- is = (Ispacer*)il;
- if(is->spkind != ISPnull)
- renderbytes(t, " ");
- break;
- default:
- error("unknown item tag %d\n", il->tag);
- }
- if(il->anchorid != 0 && il->anchorid!=curanchor){
- for(a=u->docinfo->anchors; a!=nil; a=a->next)
- if(aflag && a->index == il->anchorid){
- href = fullurl(u, a->href);
- renderbytes(t, "[%s]", href);
- free(href);
- break;
- }
- curanchor = il->anchorid;
- }
- }
- if(t->n>0 && t->b[t->n-1]!='\n')
- renderbytes(t, "\n");
- }
- void
- rerender(URLwin *u)
- {
- Bytes *t;
- t = emalloc(sizeof(Bytes));
- render(u, t, u->items, 0);
- if(t->n)
- write(u->outfd, (char*)t->b, t->n);
- free(t->b);
- free(t);
- }
- /*
- * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
- * of the document (cistrstr only looks at first somewhat bytes).
- */
- int
- charset(char *s)
- {
- char *meta, *emeta, *charset;
- if(defcharset == 0)
- defcharset = ISO_8859_1;
- meta = cistrstr(s, "<meta");
- if(meta == nil)
- return defcharset;
- for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
- ;
- charset = cistrstr(s, "charset=");
- if(charset == nil)
- return defcharset;
- charset += 8;
- if(*charset == '"')
- charset++;
- if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
- return UTF_8;
- return defcharset;
- }
- void
- rendertext(URLwin *u, Bytes *b)
- {
- Rune *rurl;
- rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
- u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
- // free(rurl);
- rerender(u);
- }
- void
- freeurlwin(URLwin *u)
- {
- freeitems(u->items);
- u->items = nil;
- freedocinfo(u->docinfo);
- u->docinfo = nil;
- free(u);
- }
|