html.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <draw.h>
  5. #include <regexp.h>
  6. #include <html.h>
  7. #include <ctype.h>
  8. #include "dat.h"
/*
 * Regular expression for the leading "scheme://host" part of an
 * absolute URL.  baseurl compiles it into urlprog on first use.
 */
char urlexpr[] =
	"^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
	"://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
Reprog *urlprog;	/* compiled form of urlexpr; nil until baseurl needs it */

/*
 * Formatter state shared by render, renderrunes and emitword.
 * render resets all three each time it is entered.
 */
int inword = 0;	/* nonzero while renderrunes is inside a word; cleared by emitword */
int col = 0;	/* runes placed on the current output line so far */
int wordi = 0;	/* index into the rune string where the current word began */
  16. char*
  17. loadhtml(int fd)
  18. {
  19. URLwin *u;
  20. Bytes *b;
  21. int n;
  22. char buf[4096];
  23. u = emalloc(sizeof(URLwin));
  24. u->infd = fd;
  25. u->outfd = 1;
  26. u->url = estrdup(url);
  27. u->type = TextHtml;
  28. b = emalloc(sizeof(Bytes));
  29. while((n = read(fd, buf, sizeof buf)) > 0)
  30. growbytes(b, buf, n);
  31. if(b->b == nil)
  32. return nil; /* empty file */
  33. rendertext(u, b);
  34. freeurlwin(u);
  35. return nil;
  36. }
  37. char*
  38. runetobyte(Rune *r, int n)
  39. {
  40. char *s;
  41. if(n == 0)
  42. return emalloc(1);
  43. s = smprint("%.*S", n, r);
  44. if(s == nil)
  45. error("malloc failed");
  46. return s;
  47. }
  48. int
  49. closingpunct(char c)
  50. {
  51. return strchr(".,:;'\")]}>!?", c) != nil;
  52. }
  53. void
  54. emitword(Bytes *b, Rune *r, int nr)
  55. {
  56. char *s;
  57. int space;
  58. if(nr == 0)
  59. return;
  60. s = smprint("%.*S", nr, r);
  61. space = b->n > 0 && !isspace(b->b[b->n-1]) && !closingpunct(*s);
  62. if(col > 0 && col+space+nr > width){
  63. growbytes(b, "\n", 1);
  64. space = 0;
  65. col = 0;
  66. }
  67. if(space && col > 0){
  68. growbytes(b, " ", 1);
  69. col++;
  70. }
  71. growbytes(b, s, strlen(s));
  72. col += nr;
  73. free(s);
  74. inword = 0;
  75. }
  76. void
  77. renderrunes(Bytes *b, Rune *r)
  78. {
  79. int i, n;
  80. n = runestrlen(r);
  81. for(i=0; i<n; i++){
  82. switch(r[i]){
  83. case '\n':
  84. if(inword)
  85. emitword(b, r+wordi, i-wordi);
  86. col = 0;
  87. if(b->n == 0)
  88. break; /* don't start with blank lines */
  89. if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
  90. growbytes(b, "\n", 1);
  91. break;
  92. case ' ':
  93. if(inword)
  94. emitword(b, r+wordi, i-wordi);
  95. break;
  96. default:
  97. if(!inword)
  98. wordi = i;
  99. inword = 1;
  100. break;
  101. }
  102. }
  103. if(inword)
  104. emitword(b, r+wordi, i-wordi);
  105. }
  106. void
  107. renderbytes(Bytes *b, char *fmt, ...)
  108. {
  109. Rune *r;
  110. va_list arg;
  111. va_start(arg, fmt);
  112. r = runevsmprint(fmt, arg);
  113. va_end(arg);
  114. renderrunes(b, r);
  115. free(r);
  116. }
  117. char*
  118. baseurl(char *url)
  119. {
  120. char *base, *slash;
  121. Resub rs[10];
  122. if(url == nil)
  123. return nil;
  124. if(urlprog == nil){
  125. urlprog = regcomp(urlexpr);
  126. if(urlprog == nil)
  127. error("can't compile URL regexp");
  128. }
  129. memset(rs, 0, sizeof rs);
  130. if(regexec(urlprog, url, rs, nelem(rs)) == 0)
  131. return nil;
  132. base = estrdup(url);
  133. slash = strrchr(base, '/');
  134. if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
  135. *slash = '\0';
  136. else
  137. base[rs[0].ep-rs[0].sp] = '\0';
  138. return base;
  139. }
  140. char*
  141. fullurl(URLwin *u, Rune *rhref)
  142. {
  143. char *base, *href, *hrefbase;
  144. char *result;
  145. if(rhref == nil)
  146. return estrdup("NULL URL");
  147. href = runetobyte(rhref, runestrlen(rhref));
  148. hrefbase = baseurl(href);
  149. result = nil;
  150. if(hrefbase==nil && (base = baseurl(u->url))!=nil){
  151. result = estrdup(base);
  152. if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
  153. result = eappend(result, "/", "");
  154. free(base);
  155. }
  156. if(href){
  157. if(result)
  158. result = eappend(result, "", href);
  159. else
  160. result = estrdup(href);
  161. }
  162. free(hrefbase);
  163. if(result == nil)
  164. return estrdup("***unknown***");
  165. return result;
  166. }
  167. void
  168. render(URLwin *u, Bytes *t, Item *items, int curanchor)
  169. {
  170. Item *il;
  171. Itext *it;
  172. Ifloat *ifl;
  173. Ispacer *is;
  174. Itable *ita;
  175. Iimage *im;
  176. Anchor *a;
  177. Table *tab;
  178. Tablecell *cell;
  179. char *href;
  180. inword = 0;
  181. col = 0;
  182. wordi = 0;
  183. for(il=items; il!=nil; il=il->next){
  184. if(il->state & IFbrk)
  185. renderbytes(t, "\n");
  186. if(il->state & IFbrksp)
  187. renderbytes(t, "\n");
  188. switch(il->tag){
  189. case Itexttag:
  190. it = (Itext*)il;
  191. if(it->state & IFwrap)
  192. renderrunes(t, it->s);
  193. else
  194. emitword(t, it->s, runestrlen(it->s));
  195. break;
  196. case Iruletag:
  197. if(t->n>0 && t->b[t->n-1]!='\n')
  198. renderbytes(t, "\n");
  199. renderbytes(t, "=======\n");
  200. break;
  201. case Iimagetag:
  202. if(!aflag)
  203. break;
  204. im = (Iimage*)il;
  205. if(im->imsrc){
  206. href = fullurl(u, im->imsrc);
  207. renderbytes(t, "[image %s]", href);
  208. free(href);
  209. }
  210. break;
  211. case Iformfieldtag:
  212. if(aflag)
  213. renderbytes(t, "[formfield]");
  214. break;
  215. case Itabletag:
  216. ita = (Itable*)il;
  217. tab = ita->table;
  218. for(cell=tab->cells; cell!=nil; cell=cell->next){
  219. render(u, t, cell->content, curanchor);
  220. }
  221. if(t->n>0 && t->b[t->n-1]!='\n')
  222. renderbytes(t, "\n");
  223. break;
  224. case Ifloattag:
  225. ifl = (Ifloat*)il;
  226. render(u, t, ifl->item, curanchor);
  227. break;
  228. case Ispacertag:
  229. is = (Ispacer*)il;
  230. if(is->spkind != ISPnull)
  231. renderbytes(t, " ");
  232. break;
  233. default:
  234. error("unknown item tag %d\n", il->tag);
  235. }
  236. if(il->anchorid != 0 && il->anchorid!=curanchor){
  237. for(a=u->docinfo->anchors; a!=nil; a=a->next)
  238. if(aflag && a->index == il->anchorid){
  239. href = fullurl(u, a->href);
  240. renderbytes(t, "[%s]", href);
  241. free(href);
  242. break;
  243. }
  244. curanchor = il->anchorid;
  245. }
  246. }
  247. if(t->n>0 && t->b[t->n-1]!='\n')
  248. renderbytes(t, "\n");
  249. }
  250. void
  251. rerender(URLwin *u)
  252. {
  253. Bytes *t;
  254. t = emalloc(sizeof(Bytes));
  255. render(u, t, u->items, 0);
  256. if(t->n)
  257. write(u->outfd, (char*)t->b, t->n);
  258. free(t->b);
  259. free(t);
  260. }
  261. /*
  262. * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
  263. * of the document (cistrstr only looks at first somewhat bytes).
  264. */
  265. int
  266. charset(char *s)
  267. {
  268. char *meta, *emeta, *charset;
  269. if(defcharset == 0)
  270. defcharset = ISO_8859_1;
  271. meta = cistrstr(s, "<meta");
  272. if(meta == nil)
  273. return defcharset;
  274. for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
  275. ;
  276. charset = cistrstr(s, "charset=");
  277. if(charset == nil)
  278. return defcharset;
  279. charset += 8;
  280. if(*charset == '"')
  281. charset++;
  282. if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
  283. return UTF_8;
  284. return defcharset;
  285. }
  286. void
  287. rendertext(URLwin *u, Bytes *b)
  288. {
  289. Rune *rurl;
  290. rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
  291. u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
  292. // free(rurl);
  293. rerender(u);
  294. }
  295. void
  296. freeurlwin(URLwin *u)
  297. {
  298. freeitems(u->items);
  299. u->items = nil;
  300. freedocinfo(u->docinfo);
  301. u->docinfo = nil;
  302. free(u);
  303. }