html.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "hdr.h"
  5. #include "conv.h"
  6. typedef struct Hchar Hchar;
  7. struct Hchar
  8. {
  9. char *s;
  10. Rune r;
  11. };
  12. /* &lt;, &gt;, &quot;, &amp; intentionally omitted */
  13. static Hchar byname[] =
  14. {
  15. {"AElig", 198},
  16. {"Aacute", 193},
  17. {"Acirc", 194},
  18. {"Agrave", 192},
  19. {"Aring", 197},
  20. {"Atilde", 195},
  21. {"Auml", 196},
  22. {"Ccedil", 199},
  23. {"ETH", 208},
  24. {"Eacute", 201},
  25. {"Ecirc", 202},
  26. {"Egrave", 200},
  27. {"Euml", 203},
  28. {"Iacute", 205},
  29. {"Icirc", 206},
  30. {"Igrave", 204},
  31. {"Iuml", 207},
  32. {"Ntilde", 209},
  33. {"Oacute", 211},
  34. {"Ocirc", 212},
  35. {"Ograve", 210},
  36. {"Oslash", 216},
  37. {"Otilde", 213},
  38. {"Ouml", 214},
  39. {"THORN", 222},
  40. {"Uacute", 218},
  41. {"Ucirc", 219},
  42. {"Ugrave", 217},
  43. {"Uuml", 220},
  44. {"Yacute", 221},
  45. {"aacute", 225},
  46. {"acirc", 226},
  47. {"acute", 180},
  48. {"aelig", 230},
  49. {"agrave", 224},
  50. {"alpha", 945},
  51. {"aring", 229},
  52. {"atilde", 227},
  53. {"auml", 228},
  54. {"beta", 946},
  55. {"brvbar", 166},
  56. {"ccedil", 231},
  57. {"cdots", 8943},
  58. {"cedil", 184},
  59. {"cent", 162},
  60. {"chi", 967},
  61. {"copy", 169},
  62. {"curren", 164},
  63. {"ddots", 8945},
  64. {"deg", 176},
  65. {"delta", 948},
  66. {"divide", 247},
  67. {"eacute", 233},
  68. {"ecirc", 234},
  69. {"egrave", 232},
  70. {"emdash", 8212}, /* non-standard but commonly used */
  71. {"emsp", 8195},
  72. {"endash", 8211}, /* non-standard but commonly used */
  73. {"ensp", 8194},
  74. {"epsilon", 949},
  75. {"eta", 951},
  76. {"eth", 240},
  77. {"euml", 235},
  78. {"frac12", 189},
  79. {"frac14", 188},
  80. {"frac34", 190},
  81. {"gamma", 947},
  82. {"iacute", 237},
  83. {"icirc", 238},
  84. {"iexcl", 161},
  85. {"igrave", 236},
  86. {"iota", 953},
  87. {"iquest", 191},
  88. {"iuml", 239},
  89. {"kappa", 954},
  90. {"lambda", 955},
  91. {"laquo", 171},
  92. {"ldquo", 8220},
  93. {"ldots", 8230},
  94. {"lsquo", 8216},
  95. {"macr", 175},
  96. {"mdash", 8212},
  97. {"micro", 181},
  98. {"middot", 183},
  99. {"mu", 956},
  100. {"nbsp", 160},
  101. {"ndash", 8211},
  102. {"not", 172},
  103. {"ntilde", 241},
  104. {"nu", 957},
  105. {"oacute", 243},
  106. {"ocirc", 244},
  107. {"ograve", 242},
  108. {"omega", 969},
  109. {"omicron", 959},
  110. {"ordf", 170},
  111. {"ordm", 186},
  112. {"oslash", 248},
  113. {"otilde", 245},
  114. {"ouml", 246},
  115. {"para", 182},
  116. {"phi", 966},
  117. {"pi", 960},
  118. {"plusmn", 177},
  119. {"pound", 163},
  120. {"psi", 968},
  121. {"quad", 8193},
  122. {"raquo", 187},
  123. {"rdquo", 8221},
  124. {"reg", 174},
  125. {"rho", 961},
  126. {"rsquo", 8217},
  127. {"sect", 167},
  128. {"shy", 173},
  129. {"sigma", 963},
  130. {"sp", 8194},
  131. {"sup1", 185},
  132. {"sup2", 178},
  133. {"sup3", 179},
  134. {"szlig", 223},
  135. {"tau", 964},
  136. {"theta", 952},
  137. {"thinsp", 8201},
  138. {"thorn", 254},
  139. {"times", 215},
  140. {"trade", 8482},
  141. {"uacute", 250},
  142. {"ucirc", 251},
  143. {"ugrave", 249},
  144. {"uml", 168},
  145. {"upsilon", 965},
  146. {"uuml", 252},
  147. {"varepsilon", 8712},
  148. {"varphi", 981},
  149. {"varpi", 982},
  150. {"varrho", 1009},
  151. {"vdots", 8942},
  152. {"vsigma", 962},
  153. {"vtheta", 977},
  154. {"xi", 958},
  155. {"yacute", 253},
  156. {"yen", 165},
  157. {"yuml", 255},
  158. {"zeta", 950}
  159. };
  160. static Hchar byrune[nelem(byname)];
  161. static int
  162. hnamecmp(const void *va, const void *vb)
  163. {
  164. Hchar *a, *b;
  165. a = (Hchar*)va;
  166. b = (Hchar*)vb;
  167. return strcmp(a->s, b->s);
  168. }
  169. static int
  170. hrunecmp(const void *va, const void *vb)
  171. {
  172. Hchar *a, *b;
  173. a = (Hchar*)va;
  174. b = (Hchar*)vb;
  175. return a->r - b->r;
  176. }
  177. static void
  178. html_init(void)
  179. {
  180. static int init;
  181. if(init)
  182. return;
  183. init = 1;
  184. memmove(byrune, byname, sizeof byrune);
  185. qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
  186. qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
  187. }
  188. static Rune
  189. findbyname(char *s)
  190. {
  191. Hchar *h;
  192. int n, m, x;
  193. h = byname;
  194. n = nelem(byname);
  195. while(n > 0){
  196. m = n/2;
  197. x = strcmp(h[m].s, s);
  198. if(x == 0)
  199. return h[m].r;
  200. if(x < 0){
  201. h += m+1;
  202. n -= m+1;
  203. }else
  204. n = m;
  205. }
  206. return Runeerror;
  207. }
  208. static char*
  209. findbyrune(Rune r)
  210. {
  211. Hchar *h;
  212. int n, m;
  213. h = byrune;
  214. n = nelem(byrune);
  215. while(n > 0){
  216. m = n/2;
  217. if(h[m].r == r)
  218. return h[m].s;
  219. if(h[m].r < r){
  220. h += m+1;
  221. n -= m+1;
  222. }else
  223. n = m;
  224. }
  225. return nil;
  226. }
  227. void
  228. html_in(int fd, long *x, struct convert *out)
  229. {
  230. char buf[100], *p;
  231. Biobuf b;
  232. Rune rbuf[N];
  233. Rune *r, *er;
  234. int c, i;
  235. USED(x);
  236. html_init();
  237. r = rbuf;
  238. er = rbuf+N;
  239. Binit(&b, fd, OREAD);
  240. while((c = Bgetrune(&b)) != Beof){
  241. if(r >= er){
  242. OUT(out, rbuf, r-rbuf);
  243. r = rbuf;
  244. }
  245. if(c == '&'){
  246. buf[0] = c;
  247. for(i=1; i<nelem(buf)-1;){
  248. c = Bgetc(&b);
  249. if(c == Beof)
  250. break;
  251. buf[i++] = c;
  252. if(strchr("; \t\r\n", c))
  253. break;
  254. }
  255. buf[i] = 0;
  256. if(buf[i-1] == ';'){
  257. buf[i-1] = 0;
  258. if((c = findbyname(buf+1)) != Runeerror){
  259. *r++ = c;
  260. continue;
  261. }
  262. buf[i-1] = ';';
  263. if(buf[1] == '#'){
  264. if(buf[2] == 'x')
  265. c = strtol(buf+3, &p, 16);
  266. else
  267. c = strtol(buf+2, &p, 10);
  268. if(*p != ';' || c >= NRUNE || c < 0)
  269. goto bad;
  270. *r++ = c;
  271. continue;
  272. }
  273. }
  274. bad:
  275. for(p=buf; p<buf+i; ){
  276. p += chartorune(r++, p);
  277. if(r >= er){
  278. OUT(out, rbuf, r-rbuf);
  279. r = rbuf;
  280. }
  281. }
  282. continue;
  283. }
  284. *r++ = c;
  285. }
  286. if(r > rbuf)
  287. OUT(out, rbuf, r-rbuf);
  288. }
  289. /*
  290. * use biobuf because can use more than UTFmax bytes per rune
  291. */
  292. void
  293. html_out(Rune *r, int n, long *x)
  294. {
  295. char *s;
  296. Biobuf b;
  297. Rune *er;
  298. USED(x);
  299. html_init();
  300. Binit(&b, 1, OWRITE);
  301. er = r+n;
  302. for(; r<er; r++){
  303. if(*r < Runeself)
  304. Bputrune(&b, *r);
  305. else if((s = findbyrune(*r)) != nil)
  306. Bprint(&b, "&%s;", s);
  307. else
  308. Bprint(&b, "&#%d;", *r);
  309. }
  310. Bflush(&b);
  311. }