html.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include "hdr.h"
  5. #include "conv.h"
  6. typedef struct Hchar Hchar;
  7. struct Hchar
  8. {
  9. char *s;
  10. Rune r;
  11. };
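  /*
   * &lt;, &gt;, &quot; and &amp; are recognized on input but are
   * never generated on output: html_out writes every rune below
   * Runeself verbatim without consulting the table.
   */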
  13. /*
  14. * Names beginning with _ are names we recognize
  15. * (without the underscore) but will not generate,
  16. * because they are nonstandard.
  17. */
  18. static Hchar byname[] =
  19. {
  20. {"AElig", 198},
  21. {"Aacute", 193},
  22. {"Acirc", 194},
  23. {"Agrave", 192},
  24. {"Alpha", 913},
  25. {"Aring", 197},
  26. {"Atilde", 195},
  27. {"Auml", 196},
  28. {"Beta", 914},
  29. {"Ccedil", 199},
  30. {"Chi", 935},
  31. {"Dagger", 8225},
  32. {"Delta", 916},
  33. {"ETH", 208},
  34. {"Eacute", 201},
  35. {"Ecirc", 202},
  36. {"Egrave", 200},
  37. {"Epsilon", 917},
  38. {"Eta", 919},
  39. {"Euml", 203},
  40. {"Gamma", 915},
  41. {"Iacute", 205},
  42. {"Icirc", 206},
  43. {"Igrave", 204},
  44. {"Iota", 921},
  45. {"Iuml", 207},
  46. {"Kappa", 922},
  47. {"Lambda", 923},
  48. {"Mu", 924},
  49. {"Ntilde", 209},
  50. {"Nu", 925},
  51. {"OElig", 338},
  52. {"Oacute", 211},
  53. {"Ocirc", 212},
  54. {"Ograve", 210},
  55. {"Omega", 937},
  56. {"Omicron", 927},
  57. {"Oslash", 216},
  58. {"Otilde", 213},
  59. {"Ouml", 214},
  60. {"Phi", 934},
  61. {"Pi", 928},
  62. {"Prime", 8243},
  63. {"Psi", 936},
  64. {"Rho", 929},
  65. {"Scaron", 352},
  66. {"Sigma", 931},
  67. {"THORN", 222},
  68. {"Tau", 932},
  69. {"Theta", 920},
  70. {"Uacute", 218},
  71. {"Ucirc", 219},
  72. {"Ugrave", 217},
  73. {"Upsilon", 933},
  74. {"Uuml", 220},
  75. {"Xi", 926},
  76. {"Yacute", 221},
  77. {"Yuml", 376},
  78. {"Zeta", 918},
  79. {"aacute", 225},
  80. {"acirc", 226},
  81. {"acute", 180},
  82. {"aelig", 230},
  83. {"agrave", 224},
  84. {"alefsym", 8501},
  85. {"alpha", 945},
  86. {"amp", 38},
  87. {"and", 8743},
  88. {"ang", 8736},
  89. {"aring", 229},
  90. {"asymp", 8776},
  91. {"atilde", 227},
  92. {"auml", 228},
  93. {"bdquo", 8222},
  94. {"beta", 946},
  95. {"brvbar", 166},
  96. {"bull", 8226},
  97. {"cap", 8745},
  98. {"ccedil", 231},
  99. {"cdots", 8943},
  100. {"cedil", 184},
  101. {"cent", 162},
  102. {"chi", 967},
  103. {"circ", 710},
  104. {"clubs", 9827},
  105. {"cong", 8773},
  106. {"copy", 169},
  107. {"crarr", 8629},
  108. {"cup", 8746},
  109. {"curren", 164},
  110. {"dArr", 8659},
  111. {"dagger", 8224},
  112. {"darr", 8595},
  113. {"ddots", 8945},
  114. {"deg", 176},
  115. {"delta", 948},
  116. {"diams", 9830},
  117. {"divide", 247},
  118. {"eacute", 233},
  119. {"ecirc", 234},
  120. {"egrave", 232},
  121. {"_emdash", 8212}, /* non-standard but commonly used */
  122. {"empty", 8709},
  123. {"emsp", 8195},
  124. {"_endash", 8211}, /* non-standard but commonly used */
  125. {"ensp", 8194},
  126. {"epsilon", 949},
  127. {"equiv", 8801},
  128. {"eta", 951},
  129. {"eth", 240},
  130. {"euml", 235},
  131. {"euro", 8364},
  132. {"exist", 8707},
  133. {"fnof", 402},
  134. {"forall", 8704},
  135. {"frac12", 189},
  136. {"frac14", 188},
  137. {"frac34", 190},
  138. {"frasl", 8260},
  139. {"gamma", 947},
  140. {"ge", 8805},
  141. {"gt", 62},
  142. {"hArr", 8660},
  143. {"harr", 8596},
  144. {"hearts", 9829},
  145. {"hellip", 8230},
  146. {"iacute", 237},
  147. {"icirc", 238},
  148. {"iexcl", 161},
  149. {"igrave", 236},
  150. {"image", 8465},
  151. {"infin", 8734},
  152. {"int", 8747},
  153. {"iota", 953},
  154. {"iquest", 191},
  155. {"isin", 8712},
  156. {"iuml", 239},
  157. {"kappa", 954},
  158. {"lArr", 8656},
  159. {"lambda", 955},
  160. {"lang", 9001},
  161. {"laquo", 171},
  162. {"larr", 8592},
  163. {"lceil", 8968},
  164. {"_ldots", 8230},
  165. {"ldquo", 8220},
  166. {"le", 8804},
  167. {"lfloor", 8970},
  168. {"lowast", 8727},
  169. {"loz", 9674},
  170. {"lrm", 8206},
  171. {"lsaquo", 8249},
  172. {"lsquo", 8216},
  173. {"lt", 60},
  174. {"macr", 175},
  175. {"mdash", 8212},
  176. {"micro", 181},
  177. {"middot", 183},
  178. {"minus", 8722},
  179. {"mu", 956},
  180. {"nabla", 8711},
  181. {"nbsp", 160},
  182. {"ndash", 8211},
  183. {"ne", 8800},
  184. {"ni", 8715},
  185. {"not", 172},
  186. {"notin", 8713},
  187. {"nsub", 8836},
  188. {"ntilde", 241},
  189. {"nu", 957},
  190. {"oacute", 243},
  191. {"ocirc", 244},
  192. {"oelig", 339},
  193. {"ograve", 242},
  194. {"oline", 8254},
  195. {"omega", 969},
  196. {"omicron", 959},
  197. {"oplus", 8853},
  198. {"or", 8744},
  199. {"ordf", 170},
  200. {"ordm", 186},
  201. {"oslash", 248},
  202. {"otilde", 245},
  203. {"otimes", 8855},
  204. {"ouml", 246},
  205. {"para", 182},
  206. {"part", 8706},
  207. {"permil", 8240},
  208. {"perp", 8869},
  209. {"phi", 966},
  210. {"pi", 960},
  211. {"piv", 982},
  212. {"plusmn", 177},
  213. {"pound", 163},
  214. {"prime", 8242},
  215. {"prod", 8719},
  216. {"prop", 8733},
  217. {"psi", 968},
  218. {"quad", 8193},
  219. {"quot", 34},
  220. {"rArr", 8658},
  221. {"radic", 8730},
  222. {"rang", 9002},
  223. {"raquo", 187},
  224. {"rarr", 8594},
  225. {"rceil", 8969},
  226. {"rdquo", 8221},
  227. {"real", 8476},
  228. {"reg", 174},
  229. {"rfloor", 8971},
  230. {"rho", 961},
  231. {"rlm", 8207},
  232. {"rsaquo", 8250},
  233. {"rsquo", 8217},
  234. {"sbquo", 8218},
  235. {"scaron", 353},
  236. {"sdot", 8901},
  237. {"sect", 167},
  238. {"shy", 173},
  239. {"sigma", 963},
  240. {"sigmaf", 962},
  241. {"sim", 8764},
  242. {"_sp", 8194},
  243. {"spades", 9824},
  244. {"sub", 8834},
  245. {"sube", 8838},
  246. {"sum", 8721},
  247. {"sup", 8835},
  248. {"sup1", 185},
  249. {"sup2", 178},
  250. {"sup3", 179},
  251. {"supe", 8839},
  252. {"szlig", 223},
  253. {"tau", 964},
  254. {"there4", 8756},
  255. {"theta", 952},
  256. {"thetasym", 977},
  257. {"thinsp", 8201},
  258. {"thorn", 254},
  259. {"tilde", 732},
  260. {"times", 215},
  261. {"trade", 8482},
  262. {"uArr", 8657},
  263. {"uacute", 250},
  264. {"uarr", 8593},
  265. {"ucirc", 251},
  266. {"ugrave", 249},
  267. {"uml", 168},
  268. {"upsih", 978},
  269. {"upsilon", 965},
  270. {"uuml", 252},
  271. {"_varepsilon", 8712},
  272. {"varphi", 981},
  273. {"_varpi", 982},
  274. {"varrho", 1009},
  275. {"vdots", 8942},
  276. {"_vsigma", 962},
  277. {"_vtheta", 977},
  278. {"weierp", 8472},
  279. {"xi", 958},
  280. {"yacute", 253},
  281. {"yen", 165},
  282. {"yuml", 255},
  283. {"zeta", 950},
  284. {"zwj", 8205},
  285. {"zwnj", 8204}
  286. };
  287. static Hchar byrune[nelem(byname)];
  288. static int
  289. hnamecmp(const void *va, const void *vb)
  290. {
  291. Hchar *a, *b;
  292. a = (Hchar*)va;
  293. b = (Hchar*)vb;
  294. return strcmp(a->s, b->s);
  295. }
  296. static int
  297. hrunecmp(const void *va, const void *vb)
  298. {
  299. Hchar *a, *b;
  300. a = (Hchar*)va;
  301. b = (Hchar*)vb;
  302. return a->r - b->r;
  303. }
  304. static void
  305. html_init(void)
  306. {
  307. static int init;
  308. int i;
  309. if(init)
  310. return;
  311. init = 1;
  312. memmove(byrune, byname, sizeof byrune);
  313. /* Eliminate names we aren't allowed to generate. */
  314. for(i=0; i<nelem(byrune); i++){
  315. if(byrune[i].s[0] == '_'){
  316. byrune[i].r = Runeerror;
  317. byname[i].s++;
  318. }
  319. }
  320. qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
  321. qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
  322. }
  323. static Rune
  324. findbyname(char *s)
  325. {
  326. Hchar *h;
  327. int n, m, x;
  328. h = byname;
  329. n = nelem(byname);
  330. while(n > 0){
  331. m = n/2;
  332. x = strcmp(h[m].s, s);
  333. if(x == 0)
  334. return h[m].r;
  335. if(x < 0){
  336. h += m+1;
  337. n -= m+1;
  338. }else
  339. n = m;
  340. }
  341. return Runeerror;
  342. }
  343. static char*
  344. findbyrune(Rune r)
  345. {
  346. Hchar *h;
  347. int n, m;
  348. if(r == Runeerror)
  349. return nil;
  350. h = byrune;
  351. n = nelem(byrune);
  352. while(n > 0){
  353. m = n/2;
  354. if(h[m].r == r)
  355. return h[m].s;
  356. if(h[m].r < r){
  357. h += m+1;
  358. n -= m+1;
  359. }else
  360. n = m;
  361. }
  362. return nil;
  363. }
  364. void
  365. html_in(int fd, long *x, struct convert *out)
  366. {
  367. char buf[100], *p;
  368. Biobuf b;
  369. Rune rbuf[N];
  370. Rune *r, *er;
  371. int c, i;
  372. USED(x);
  373. html_init();
  374. r = rbuf;
  375. er = rbuf+N;
  376. Binit(&b, fd, OREAD);
  377. while((c = Bgetrune(&b)) != Beof){
  378. if(r >= er){
  379. OUT(out, rbuf, r-rbuf);
  380. r = rbuf;
  381. }
  382. if(c == '&'){
  383. buf[0] = c;
  384. for(i=1; i<nelem(buf)-1;){
  385. c = Bgetc(&b);
  386. if(c == Beof)
  387. break;
  388. buf[i++] = c;
  389. if(strchr("; \t\r\n", c))
  390. break;
  391. }
  392. buf[i] = 0;
  393. if(buf[i-1] == ';'){
  394. buf[i-1] = 0;
  395. if((c = findbyname(buf+1)) != Runeerror){
  396. *r++ = c;
  397. continue;
  398. }
  399. buf[i-1] = ';';
  400. if(buf[1] == '#'){
  401. if(buf[2] == 'x')
  402. c = strtol(buf+3, &p, 16);
  403. else
  404. c = strtol(buf+2, &p, 10);
  405. if(*p != ';' || c >= NRUNE || c < 0)
  406. goto bad;
  407. *r++ = c;
  408. continue;
  409. }
  410. }
  411. bad:
  412. for(p=buf; p<buf+i; ){
  413. p += chartorune(r++, p);
  414. if(r >= er){
  415. OUT(out, rbuf, r-rbuf);
  416. r = rbuf;
  417. }
  418. }
  419. continue;
  420. }
  421. *r++ = c;
  422. }
  423. if(r > rbuf)
  424. OUT(out, rbuf, r-rbuf);
  425. OUT(out, rbuf, 0);
  426. }
  427. /*
  428. * use biobuf because can use more than UTFmax bytes per rune
  429. */
  430. void
  431. html_out(Rune *r, int n, long *x)
  432. {
  433. char *s;
  434. Biobuf b;
  435. Rune *er;
  436. USED(x);
  437. html_init();
  438. Binit(&b, 1, OWRITE);
  439. er = r+n;
  440. for(; r<er; r++){
  441. if(*r < Runeself)
  442. Bputrune(&b, *r);
  443. else if((s = findbyrune(*r)) != nil)
  444. Bprint(&b, "&%s;", s);
  445. else
  446. Bprint(&b, "&#%d;", *r);
  447. }
  448. Bflush(&b);
  449. }