html.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "hdr.h"
  5. #include "conv.h"
  6. typedef struct Hchar Hchar;
  7. struct Hchar
  8. {
  9. char *s;
  10. Rune r;
  11. };
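  /* &lt;, &gt;, &quot; and &amp; are listed so that html_in decodes them; html_out never emits them because it copies runes below Runeself through unescaped */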
  13. static Hchar byname[] =
  14. {
  15. {"AElig", 198},
  16. {"Aacute", 193},
  17. {"Acirc", 194},
  18. {"Agrave", 192},
  19. {"Alpha", 913},
  20. {"Aring", 197},
  21. {"Atilde", 195},
  22. {"Auml", 196},
  23. {"Beta", 914},
  24. {"Ccedil", 199},
  25. {"Chi", 935},
  26. {"Dagger", 8225},
  27. {"Delta", 916},
  28. {"ETH", 208},
  29. {"Eacute", 201},
  30. {"Ecirc", 202},
  31. {"Egrave", 200},
  32. {"Epsilon", 917},
  33. {"Eta", 919},
  34. {"Euml", 203},
  35. {"Gamma", 915},
  36. {"Iacute", 205},
  37. {"Icirc", 206},
  38. {"Igrave", 204},
  39. {"Iota", 921},
  40. {"Iuml", 207},
  41. {"Kappa", 922},
  42. {"Lambda", 923},
  43. {"Mu", 924},
  44. {"Ntilde", 209},
  45. {"Nu", 925},
  46. {"OElig", 338},
  47. {"Oacute", 211},
  48. {"Ocirc", 212},
  49. {"Ograve", 210},
  50. {"Omega", 937},
  51. {"Omicron", 927},
  52. {"Oslash", 216},
  53. {"Otilde", 213},
  54. {"Ouml", 214},
  55. {"Phi", 934},
  56. {"Pi", 928},
  57. {"Prime", 8243},
  58. {"Psi", 936},
  59. {"Rho", 929},
  60. {"Scaron", 352},
  61. {"Sigma", 931},
  62. {"THORN", 222},
  63. {"Tau", 932},
  64. {"Theta", 920},
  65. {"Uacute", 218},
  66. {"Ucirc", 219},
  67. {"Ugrave", 217},
  68. {"Upsilon", 933},
  69. {"Uuml", 220},
  70. {"Xi", 926},
  71. {"Yacute", 221},
  72. {"Yuml", 376},
  73. {"Zeta", 918},
  74. {"aacute", 225},
  75. {"acirc", 226},
  76. {"acute", 180},
  77. {"aelig", 230},
  78. {"agrave", 224},
  79. {"alefsym", 8501},
  80. {"alpha", 945},
  81. {"amp", 38},
  82. {"and", 8743},
  83. {"ang", 8736},
  84. {"aring", 229},
  85. {"asymp", 8776},
  86. {"atilde", 227},
  87. {"auml", 228},
  88. {"bdquo", 8222},
  89. {"beta", 946},
  90. {"brvbar", 166},
  91. {"bull", 8226},
  92. {"cap", 8745},
  93. {"ccedil", 231},
  94. {"cdots", 8943},
  95. {"cedil", 184},
  96. {"cent", 162},
  97. {"chi", 967},
  98. {"circ", 710},
  99. {"clubs", 9827},
  100. {"cong", 8773},
  101. {"copy", 169},
  102. {"crarr", 8629},
  103. {"cup", 8746},
  104. {"curren", 164},
  105. {"dArr", 8659},
  106. {"dagger", 8224},
  107. {"darr", 8595},
  108. {"ddots", 8945},
  109. {"deg", 176},
  110. {"delta", 948},
  111. {"diams", 9830},
  112. {"divide", 247},
  113. {"eacute", 233},
  114. {"ecirc", 234},
  115. {"egrave", 232},
  116. {"emdash", 8212}, /* non-standard but commonly used */
  117. {"empty", 8709},
  118. {"emsp", 8195},
  119. {"endash", 8211}, /* non-standard but commonly used */
  120. {"ensp", 8194},
  121. {"epsilon", 949},
  122. {"equiv", 8801},
  123. {"eta", 951},
  124. {"eth", 240},
  125. {"euml", 235},
  126. {"euro", 8364},
  127. {"exist", 8707},
  128. {"fnof", 402},
  129. {"forall", 8704},
  130. {"frac12", 189},
  131. {"frac14", 188},
  132. {"frac34", 190},
  133. {"frasl", 8260},
  134. {"gamma", 947},
  135. {"ge", 8805},
  136. {"gt", 62},
  137. {"hArr", 8660},
  138. {"harr", 8596},
  139. {"hearts", 9829},
  140. {"hellip", 8230},
  141. {"iacute", 237},
  142. {"icirc", 238},
  143. {"iexcl", 161},
  144. {"igrave", 236},
  145. {"image", 8465},
  146. {"infin", 8734},
  147. {"int", 8747},
  148. {"iota", 953},
  149. {"iquest", 191},
  150. {"isin", 8712},
  151. {"iuml", 239},
  152. {"kappa", 954},
  153. {"lArr", 8656},
  154. {"lambda", 955},
  155. {"lang", 9001},
  156. {"laquo", 171},
  157. {"larr", 8592},
  158. {"lceil", 8968},
  159. {"ldots", 8230},
  160. {"ldquo", 8220},
  161. {"le", 8804},
  162. {"lfloor", 8970},
  163. {"lowast", 8727},
  164. {"loz", 9674},
  165. {"lrm", 8206},
  166. {"lsaquo", 8249},
  167. {"lsquo", 8216},
  168. {"lt", 60},
  169. {"macr", 175},
  170. {"mdash", 8212},
  171. {"micro", 181},
  172. {"middot", 183},
  173. {"minus", 8722},
  174. {"mu", 956},
  175. {"nabla", 8711},
  176. {"nbsp", 160},
  177. {"ndash", 8211},
  178. {"ne", 8800},
  179. {"ni", 8715},
  180. {"not", 172},
  181. {"notin", 8713},
  182. {"nsub", 8836},
  183. {"ntilde", 241},
  184. {"nu", 957},
  185. {"oacute", 243},
  186. {"ocirc", 244},
  187. {"oelig", 339},
  188. {"ograve", 242},
  189. {"oline", 8254},
  190. {"omega", 969},
  191. {"omicron", 959},
  192. {"oplus", 8853},
  193. {"or", 8744},
  194. {"ordf", 170},
  195. {"ordm", 186},
  196. {"oslash", 248},
  197. {"otilde", 245},
  198. {"otimes", 8855},
  199. {"ouml", 246},
  200. {"para", 182},
  201. {"part", 8706},
  202. {"permil", 8240},
  203. {"perp", 8869},
  204. {"phi", 966},
  205. {"pi", 960},
  206. {"piv", 982},
  207. {"plusmn", 177},
  208. {"pound", 163},
  209. {"prime", 8242},
  210. {"prod", 8719},
  211. {"prop", 8733},
  212. {"psi", 968},
  213. {"quad", 8193},
  214. {"quot", 34},
  215. {"rArr", 8658},
  216. {"radic", 8730},
  217. {"rang", 9002},
  218. {"raquo", 187},
  219. {"rarr", 8594},
  220. {"rceil", 8969},
  221. {"rdquo", 8221},
  222. {"real", 8476},
  223. {"reg", 174},
  224. {"rfloor", 8971},
  225. {"rho", 961},
  226. {"rlm", 8207},
  227. {"rsaquo", 8250},
  228. {"rsquo", 8217},
  229. {"sbquo", 8218},
  230. {"scaron", 353},
  231. {"sdot", 8901},
  232. {"sect", 167},
  233. {"shy", 173},
  234. {"sigma", 963},
  235. {"sigmaf", 962},
  236. {"sim", 8764},
  237. {"sp", 8194},
  238. {"spades", 9824},
  239. {"sub", 8834},
  240. {"sube", 8838},
  241. {"sum", 8721},
  242. {"sup", 8835},
  243. {"sup1", 185},
  244. {"sup2", 178},
  245. {"sup3", 179},
  246. {"supe", 8839},
  247. {"szlig", 223},
  248. {"tau", 964},
  249. {"there4", 8756},
  250. {"theta", 952},
  251. {"thetasym", 977},
  252. {"thinsp", 8201},
  253. {"thorn", 254},
  254. {"tilde", 732},
  255. {"times", 215},
  256. {"trade", 8482},
  257. {"uArr", 8657},
  258. {"uacute", 250},
  259. {"uarr", 8593},
  260. {"ucirc", 251},
  261. {"ugrave", 249},
  262. {"uml", 168},
  263. {"upsih", 978},
  264. {"upsilon", 965},
  265. {"uuml", 252},
  266. {"varepsilon", 8712},
  267. {"varphi", 981},
  268. {"varpi", 982},
  269. {"varrho", 1009},
  270. {"vdots", 8942},
  271. {"vsigma", 962},
  272. {"vtheta", 977},
  273. {"weierp", 8472},
  274. {"xi", 958},
  275. {"yacute", 253},
  276. {"yen", 165},
  277. {"yuml", 255},
  278. {"zeta", 950},
  279. {"zwj", 8205},
  280. {"zwnj", 8204}
  281. };
  282. static Hchar byrune[nelem(byname)];
  283. static int
  284. hnamecmp(const void *va, const void *vb)
  285. {
  286. Hchar *a, *b;
  287. a = (Hchar*)va;
  288. b = (Hchar*)vb;
  289. return strcmp(a->s, b->s);
  290. }
  291. static int
  292. hrunecmp(const void *va, const void *vb)
  293. {
  294. Hchar *a, *b;
  295. a = (Hchar*)va;
  296. b = (Hchar*)vb;
  297. return a->r - b->r;
  298. }
  299. static void
  300. html_init(void)
  301. {
  302. static int init;
  303. if(init)
  304. return;
  305. init = 1;
  306. memmove(byrune, byname, sizeof byrune);
  307. qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
  308. qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
  309. }
  310. static Rune
  311. findbyname(char *s)
  312. {
  313. Hchar *h;
  314. int n, m, x;
  315. h = byname;
  316. n = nelem(byname);
  317. while(n > 0){
  318. m = n/2;
  319. x = strcmp(h[m].s, s);
  320. if(x == 0)
  321. return h[m].r;
  322. if(x < 0){
  323. h += m+1;
  324. n -= m+1;
  325. }else
  326. n = m;
  327. }
  328. return Runeerror;
  329. }
  330. static char*
  331. findbyrune(Rune r)
  332. {
  333. Hchar *h;
  334. int n, m;
  335. h = byrune;
  336. n = nelem(byrune);
  337. while(n > 0){
  338. m = n/2;
  339. if(h[m].r == r)
  340. return h[m].s;
  341. if(h[m].r < r){
  342. h += m+1;
  343. n -= m+1;
  344. }else
  345. n = m;
  346. }
  347. return nil;
  348. }
  349. void
  350. html_in(int fd, long *x, struct convert *out)
  351. {
  352. char buf[100], *p;
  353. Biobuf b;
  354. Rune rbuf[N];
  355. Rune *r, *er;
  356. int c, i;
  357. USED(x);
  358. html_init();
  359. r = rbuf;
  360. er = rbuf+N;
  361. Binit(&b, fd, OREAD);
  362. while((c = Bgetrune(&b)) != Beof){
  363. if(r >= er){
  364. OUT(out, rbuf, r-rbuf);
  365. r = rbuf;
  366. }
  367. if(c == '&'){
  368. buf[0] = c;
  369. for(i=1; i<nelem(buf)-1;){
  370. c = Bgetc(&b);
  371. if(c == Beof)
  372. break;
  373. buf[i++] = c;
  374. if(strchr("; \t\r\n", c))
  375. break;
  376. }
  377. buf[i] = 0;
  378. if(buf[i-1] == ';'){
  379. buf[i-1] = 0;
  380. if((c = findbyname(buf+1)) != Runeerror){
  381. *r++ = c;
  382. continue;
  383. }
  384. buf[i-1] = ';';
  385. if(buf[1] == '#'){
  386. if(buf[2] == 'x')
  387. c = strtol(buf+3, &p, 16);
  388. else
  389. c = strtol(buf+2, &p, 10);
  390. if(*p != ';' || c >= NRUNE || c < 0)
  391. goto bad;
  392. *r++ = c;
  393. continue;
  394. }
  395. }
  396. bad:
  397. for(p=buf; p<buf+i; ){
  398. p += chartorune(r++, p);
  399. if(r >= er){
  400. OUT(out, rbuf, r-rbuf);
  401. r = rbuf;
  402. }
  403. }
  404. continue;
  405. }
  406. *r++ = c;
  407. }
  408. if(r > rbuf)
  409. OUT(out, rbuf, r-rbuf);
  410. OUT(out, rbuf, 0);
  411. }
  412. /*
  413. * use biobuf because can use more than UTFmax bytes per rune
  414. */
  415. void
  416. html_out(Rune *r, int n, long *x)
  417. {
  418. char *s;
  419. Biobuf b;
  420. Rune *er;
  421. USED(x);
  422. html_init();
  423. Binit(&b, 1, OWRITE);
  424. er = r+n;
  425. for(; r<er; r++){
  426. if(*r < Runeself)
  427. Bputrune(&b, *r);
  428. else if((s = findbyrune(*r)) != nil)
  429. Bprint(&b, "&%s;", s);
  430. else
  431. Bprint(&b, "&#%d;", *r);
  432. }
  433. Bflush(&b);
  434. }