html.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <draw.h>
  5. #include <regexp.h>
  6. #include <html.h>
  7. #include <ctype.h>
  8. #include "dat.h"
  9. char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
  10. Reprog *urlprog;
  11. int inword = 0;
  12. int col = 0;
  13. int wordi = 0;
  14. char*
  15. loadhtml(int fd)
  16. {
  17. URLwin *u;
  18. Bytes *b;
  19. int n;
  20. char buf[4096];
  21. u = emalloc(sizeof(URLwin));
  22. u->infd = fd;
  23. u->outfd = 1;
  24. u->url = estrdup(url);
  25. u->type = TextHtml;
  26. b = emalloc(sizeof(Bytes));
  27. while((n = read(fd, buf, sizeof buf)) > 0)
  28. growbytes(b, buf, n);
  29. if(b->b == nil)
  30. return nil; /* empty file */
  31. rendertext(u, b);
  32. freeurlwin(u);
  33. return nil;
  34. }
  35. char*
  36. runetobyte(Rune *r, int n)
  37. {
  38. char *s;
  39. if(n == 0)
  40. return emalloc(1);
  41. s = smprint("%.*S", n, r);
  42. if(s == nil)
  43. error("malloc failed");
  44. return s;
  45. }
  46. int
  47. closingpunct(int c)
  48. {
  49. return strchr(".,:;'\")]}>!?", c) != nil;
  50. }
  51. void
  52. emitword(Bytes *b, Rune *r, int nr)
  53. {
  54. char *s;
  55. int space;
  56. if(nr == 0)
  57. return;
  58. s = smprint("%.*S", nr, r);
  59. space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
  60. if(col>0 && col+space+nr > width){
  61. growbytes(b, "\n", 1);
  62. space = 0;
  63. col = 0;
  64. }
  65. if(space && col>0){
  66. growbytes(b, " ", 1);
  67. col++;
  68. }
  69. growbytes(b, s, strlen(s));
  70. col += nr;
  71. free(s);
  72. inword = 0;
  73. }
  74. void
  75. renderrunes(Bytes *b, Rune *r)
  76. {
  77. int i, n;
  78. n = runestrlen(r);
  79. for(i=0; i<n; i++){
  80. switch(r[i]){
  81. case '\n':
  82. if(inword)
  83. emitword(b, r+wordi, i-wordi);
  84. col = 0;
  85. if(b->n == 0)
  86. break; /* don't start with blank lines */
  87. if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
  88. growbytes(b, "\n", 1);
  89. break;
  90. case ' ':
  91. if(inword)
  92. emitword(b, r+wordi, i-wordi);
  93. break;
  94. default:
  95. if(!inword)
  96. wordi = i;
  97. inword = 1;
  98. break;
  99. }
  100. }
  101. if(inword)
  102. emitword(b, r+wordi, i-wordi);
  103. }
  104. void
  105. renderbytes(Bytes *b, char *fmt, ...)
  106. {
  107. Rune *r;
  108. va_list arg;
  109. va_start(arg, fmt);
  110. r = runevsmprint(fmt, arg);
  111. va_end(arg);
  112. renderrunes(b, r);
  113. free(r);
  114. }
  115. char*
  116. baseurl(char *url)
  117. {
  118. char *base, *slash;
  119. Resub rs[10];
  120. if(url == nil)
  121. return nil;
  122. if(urlprog == nil){
  123. urlprog = regcomp(urlexpr);
  124. if(urlprog == nil)
  125. error("can't compile URL regexp");
  126. }
  127. memset(rs, 0, sizeof rs);
  128. if(regexec(urlprog, url, rs, nelem(rs)) == 0)
  129. return nil;
  130. base = estrdup(url);
  131. slash = strrchr(base, '/');
  132. if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
  133. *slash = '\0';
  134. else
  135. base[rs[0].ep-rs[0].sp] = '\0';
  136. return base;
  137. }
  138. char*
  139. fullurl(URLwin *u, Rune *rhref)
  140. {
  141. char *base, *href, *hrefbase;
  142. char *result;
  143. if(rhref == nil)
  144. return estrdup("NULL URL");
  145. href = runetobyte(rhref, runestrlen(rhref));
  146. hrefbase = baseurl(href);
  147. result = nil;
  148. if(hrefbase==nil && (base = baseurl(u->url))!=nil){
  149. result = estrdup(base);
  150. if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
  151. result = eappend(result, "/", "");
  152. free(base);
  153. }
  154. if(href){
  155. if(result)
  156. result = eappend(result, "", href);
  157. else
  158. result = estrdup(href);
  159. }
  160. free(hrefbase);
  161. if(result == nil)
  162. return estrdup("***unknown***");
  163. return result;
  164. }
  165. void
  166. render(URLwin *u, Bytes *t, Item *items, int curanchor)
  167. {
  168. Item *il;
  169. Itext *it;
  170. Ifloat *ifl;
  171. Ispacer *is;
  172. Itable *ita;
  173. Iimage *im;
  174. Anchor *a;
  175. Table *tab;
  176. Tablecell *cell;
  177. char *href;
  178. inword = 0;
  179. col = 0;
  180. wordi = 0;
  181. for(il=items; il!=nil; il=il->next){
  182. if(il->state & IFbrk)
  183. renderbytes(t, "\n");
  184. if(il->state & IFbrksp)
  185. renderbytes(t, "\n");
  186. switch(il->tag){
  187. case Itexttag:
  188. it = (Itext*)il;
  189. if(it->state & IFwrap)
  190. renderrunes(t, it->s);
  191. else
  192. emitword(t, it->s, runestrlen(it->s));
  193. break;
  194. case Iruletag:
  195. if(t->n>0 && t->b[t->n-1]!='\n')
  196. renderbytes(t, "\n");
  197. renderbytes(t, "=======\n");
  198. break;
  199. case Iimagetag:
  200. if(!aflag)
  201. break;
  202. im = (Iimage*)il;
  203. if(im->imsrc){
  204. href = fullurl(u, im->imsrc);
  205. renderbytes(t, "[image %s]", href);
  206. free(href);
  207. }
  208. break;
  209. case Iformfieldtag:
  210. if(aflag)
  211. renderbytes(t, "[formfield]");
  212. break;
  213. case Itabletag:
  214. ita = (Itable*)il;
  215. tab = ita->table;
  216. for(cell=tab->cells; cell!=nil; cell=cell->next){
  217. render(u, t, cell->content, curanchor);
  218. }
  219. if(t->n>0 && t->b[t->n-1]!='\n')
  220. renderbytes(t, "\n");
  221. break;
  222. case Ifloattag:
  223. ifl = (Ifloat*)il;
  224. render(u, t, ifl->item, curanchor);
  225. break;
  226. case Ispacertag:
  227. is = (Ispacer*)il;
  228. if(is->spkind != ISPnull)
  229. renderbytes(t, " ");
  230. break;
  231. default:
  232. error("unknown item tag %d\n", il->tag);
  233. }
  234. if(il->anchorid != 0 && il->anchorid!=curanchor){
  235. for(a=u->docinfo->anchors; a!=nil; a=a->next)
  236. if(aflag && a->index == il->anchorid){
  237. href = fullurl(u, a->href);
  238. renderbytes(t, "[%s]", href);
  239. free(href);
  240. break;
  241. }
  242. curanchor = il->anchorid;
  243. }
  244. }
  245. if(t->n>0 && t->b[t->n-1]!='\n')
  246. renderbytes(t, "\n");
  247. }
  248. void
  249. rerender(URLwin *u)
  250. {
  251. Bytes *t;
  252. t = emalloc(sizeof(Bytes));
  253. render(u, t, u->items, 0);
  254. if(t->n)
  255. write(u->outfd, (char*)t->b, t->n);
  256. free(t->b);
  257. free(t);
  258. }
  259. /*
  260. * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
  261. * of the document (cistrstr only looks at first somewhat bytes).
  262. */
  263. int
  264. charset(char *s)
  265. {
  266. char *meta, *emeta, *charset;
  267. if(defcharset == 0)
  268. defcharset = ISO_8859_1;
  269. meta = cistrstr(s, "<meta");
  270. if(meta == nil)
  271. return defcharset;
  272. for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
  273. ;
  274. charset = cistrstr(s, "charset=");
  275. if(charset == nil)
  276. return defcharset;
  277. charset += 8;
  278. if(*charset == '"')
  279. charset++;
  280. if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
  281. return UTF_8;
  282. return defcharset;
  283. }
  284. void
  285. rendertext(URLwin *u, Bytes *b)
  286. {
  287. Rune *rurl;
  288. rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
  289. u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
  290. // free(rurl);
  291. rerender(u);
  292. }
  293. void
  294. freeurlwin(URLwin *u)
  295. {
  296. freeitems(u->items);
  297. u->items = nil;
  298. freedocinfo(u->docinfo);
  299. u->docinfo = nil;
  300. free(u);
  301. }