html.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <draw.h>
  5. #include <regexp.h>
  6. #include <html.h>
  7. #include <ctype.h>
  8. #include "dat.h"
  9. char urlexpr[] =
  10. "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
  11. "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
  12. Reprog *urlprog;
  13. int newitextitem;
  14. int inword = 0;
  15. int col = 0;
  16. int wordi = 0;
  17. char*
  18. loadhtml(int fd)
  19. {
  20. URLwin *u;
  21. Bytes *b;
  22. int n;
  23. char buf[4096];
  24. u = emalloc(sizeof(URLwin));
  25. u->infd = fd;
  26. u->outfd = 1;
  27. u->url = estrdup(url);
  28. u->type = TextHtml;
  29. b = emalloc(sizeof(Bytes));
  30. while((n = read(fd, buf, sizeof buf)) > 0)
  31. growbytes(b, buf, n);
  32. if(b->b == nil)
  33. return nil; /* empty file */
  34. rendertext(u, b);
  35. freeurlwin(u);
  36. return nil;
  37. }
  38. char*
  39. runetobyte(Rune *r, int n)
  40. {
  41. char *s;
  42. if(n == 0)
  43. return emalloc(1);
  44. s = smprint("%.*S", n, r);
  45. if(s == nil)
  46. error("malloc failed");
  47. return s;
  48. }
  49. int
  50. closingpunct(char c)
  51. {
  52. return strchr(".,:;'\")]}>!?", c) != nil;
  53. }
  54. void
  55. emitword(Bytes *b, Rune *r, int nr)
  56. {
  57. char *s;
  58. int space;
  59. if(nr == 0)
  60. return;
  61. s = smprint("%.*S", nr, r);
  62. space = b->n > 0 && !isspace(b->b[b->n-1]) && (!newitextitem || !closingpunct(*s));
  63. if(col > 0 && col+space+nr > width){
  64. growbytes(b, "\n", 1);
  65. space = 0;
  66. col = 0;
  67. }
  68. if(space && col > 0){
  69. growbytes(b, " ", 1);
  70. col++;
  71. }
  72. growbytes(b, s, strlen(s));
  73. col += nr;
  74. free(s);
  75. inword = 0;
  76. newitextitem = 0;
  77. }
  78. void
  79. renderrunes(Bytes *b, Rune *r)
  80. {
  81. int i, n;
  82. newitextitem = 1;
  83. n = runestrlen(r);
  84. for(i=0; i<n; i++){
  85. switch(r[i]){
  86. case '\n':
  87. if(inword)
  88. emitword(b, r+wordi, i-wordi);
  89. col = 0;
  90. if(b->n == 0)
  91. break; /* don't start with blank lines */
  92. if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
  93. growbytes(b, "\n", 1);
  94. break;
  95. case ' ':
  96. if(inword)
  97. emitword(b, r+wordi, i-wordi);
  98. break;
  99. default:
  100. if(!inword)
  101. wordi = i;
  102. inword = 1;
  103. break;
  104. }
  105. }
  106. if(inword)
  107. emitword(b, r+wordi, i-wordi);
  108. }
  109. void
  110. renderbytes(Bytes *b, char *fmt, ...)
  111. {
  112. Rune *r;
  113. va_list arg;
  114. va_start(arg, fmt);
  115. r = runevsmprint(fmt, arg);
  116. va_end(arg);
  117. renderrunes(b, r);
  118. free(r);
  119. }
  120. char*
  121. baseurl(char *url)
  122. {
  123. char *base, *slash;
  124. Resub rs[10];
  125. if(url == nil)
  126. return nil;
  127. if(urlprog == nil){
  128. urlprog = regcomp(urlexpr);
  129. if(urlprog == nil)
  130. error("can't compile URL regexp");
  131. }
  132. memset(rs, 0, sizeof rs);
  133. if(regexec(urlprog, url, rs, nelem(rs)) == 0)
  134. return nil;
  135. base = estrdup(url);
  136. slash = strrchr(base, '/');
  137. if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
  138. *slash = '\0';
  139. else
  140. base[rs[0].ep-rs[0].sp] = '\0';
  141. return base;
  142. }
  143. char*
  144. fullurl(URLwin *u, Rune *rhref)
  145. {
  146. char *base, *href, *hrefbase;
  147. char *result;
  148. if(rhref == nil)
  149. return estrdup("NULL URL");
  150. href = runetobyte(rhref, runestrlen(rhref));
  151. hrefbase = baseurl(href);
  152. result = nil;
  153. if(hrefbase==nil && (base = baseurl(u->url))!=nil){
  154. result = estrdup(base);
  155. if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
  156. result = eappend(result, "/", "");
  157. free(base);
  158. }
  159. if(href){
  160. if(result)
  161. result = eappend(result, "", href);
  162. else
  163. result = estrdup(href);
  164. }
  165. free(hrefbase);
  166. if(result == nil)
  167. return estrdup("***unknown***");
  168. return result;
  169. }
  170. void
  171. render(URLwin *u, Bytes *t, Item *items, int curanchor)
  172. {
  173. Item *il;
  174. Itext *it;
  175. Ifloat *ifl;
  176. Ispacer *is;
  177. Itable *ita;
  178. Iimage *im;
  179. Anchor *a;
  180. Table *tab;
  181. Tablecell *cell;
  182. char *href;
  183. inword = 0;
  184. col = 0;
  185. wordi = 0;
  186. for(il=items; il!=nil; il=il->next){
  187. if(il->state & IFbrk)
  188. renderbytes(t, "\n");
  189. if(il->state & IFbrksp)
  190. renderbytes(t, "\n");
  191. switch(il->tag){
  192. case Itexttag:
  193. it = (Itext*)il;
  194. if(it->state & IFwrap)
  195. renderrunes(t, it->s);
  196. else {
  197. newitextitem = 1;
  198. emitword(t, it->s, runestrlen(it->s));
  199. }
  200. break;
  201. case Iruletag:
  202. if(t->n>0 && t->b[t->n-1]!='\n')
  203. renderbytes(t, "\n");
  204. renderbytes(t, "=======\n");
  205. break;
  206. case Iimagetag:
  207. if(!aflag)
  208. break;
  209. im = (Iimage*)il;
  210. if(im->imsrc){
  211. href = fullurl(u, im->imsrc);
  212. renderbytes(t, "[image %s]", href);
  213. free(href);
  214. }
  215. break;
  216. case Iformfieldtag:
  217. if(aflag)
  218. renderbytes(t, "[formfield]");
  219. break;
  220. case Itabletag:
  221. ita = (Itable*)il;
  222. tab = ita->table;
  223. for(cell=tab->cells; cell!=nil; cell=cell->next){
  224. render(u, t, cell->content, curanchor);
  225. }
  226. if(t->n>0 && t->b[t->n-1]!='\n')
  227. renderbytes(t, "\n");
  228. break;
  229. case Ifloattag:
  230. ifl = (Ifloat*)il;
  231. render(u, t, ifl->item, curanchor);
  232. break;
  233. case Ispacertag:
  234. is = (Ispacer*)il;
  235. if(is->spkind != ISPnull)
  236. renderbytes(t, " ");
  237. break;
  238. default:
  239. error("unknown item tag %d\n", il->tag);
  240. }
  241. if(il->anchorid != 0 && il->anchorid!=curanchor){
  242. for(a=u->docinfo->anchors; a!=nil; a=a->next)
  243. if(aflag && a->index == il->anchorid){
  244. href = fullurl(u, a->href);
  245. renderbytes(t, "[%s]", href);
  246. free(href);
  247. break;
  248. }
  249. curanchor = il->anchorid;
  250. }
  251. }
  252. if(t->n>0 && t->b[t->n-1]!='\n')
  253. renderbytes(t, "\n");
  254. }
  255. void
  256. rerender(URLwin *u)
  257. {
  258. Bytes *t;
  259. t = emalloc(sizeof(Bytes));
  260. render(u, t, u->items, 0);
  261. if(t->n)
  262. write(u->outfd, (char*)t->b, t->n);
  263. free(t->b);
  264. free(t);
  265. }
  266. /*
  267. * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
  268. * of the document (cistrstr only looks at first somewhat bytes).
  269. */
  270. int
  271. charset(char *s)
  272. {
  273. char *meta, *emeta, *charset;
  274. if(defcharset == 0)
  275. defcharset = ISO_8859_1;
  276. meta = cistrstr(s, "<meta");
  277. if(meta == nil)
  278. return defcharset;
  279. for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
  280. ;
  281. charset = cistrstr(s, "charset=");
  282. if(charset == nil)
  283. return defcharset;
  284. charset += 8;
  285. if(*charset == '"')
  286. charset++;
  287. if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
  288. return UTF_8;
  289. return defcharset;
  290. }
  291. void
  292. rendertext(URLwin *u, Bytes *b)
  293. {
  294. Rune *rurl;
  295. rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
  296. u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
  297. // free(rurl);
  298. rerender(u);
  299. }
  300. void
  301. freeurlwin(URLwin *u)
  302. {
  303. freeitems(u->items);
  304. u->items = nil;
  305. freedocinfo(u->docinfo);
  306. u->docinfo = nil;
  307. free(u);
  308. }