123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include "hdr.h"
- #include "conv.h"
- typedef struct Hchar Hchar;
- struct Hchar
- {
- char *s;
- Rune r;
- };
- /* <, >, ", & intentionally omitted */
- /*
- * Names beginning with _ are names we recognize
- * (without the underscore) but will not generate,
- * because they are nonstandard.
- */
- static Hchar byname[] =
- {
- {"AElig", 198},
- {"Aacute", 193},
- {"Acirc", 194},
- {"Agrave", 192},
- {"Alpha", 913},
- {"Aring", 197},
- {"Atilde", 195},
- {"Auml", 196},
- {"Beta", 914},
- {"Ccedil", 199},
- {"Chi", 935},
- {"Dagger", 8225},
- {"Delta", 916},
- {"ETH", 208},
- {"Eacute", 201},
- {"Ecirc", 202},
- {"Egrave", 200},
- {"Epsilon", 917},
- {"Eta", 919},
- {"Euml", 203},
- {"Gamma", 915},
- {"Iacute", 205},
- {"Icirc", 206},
- {"Igrave", 204},
- {"Iota", 921},
- {"Iuml", 207},
- {"Kappa", 922},
- {"Lambda", 923},
- {"Mu", 924},
- {"Ntilde", 209},
- {"Nu", 925},
- {"OElig", 338},
- {"Oacute", 211},
- {"Ocirc", 212},
- {"Ograve", 210},
- {"Omega", 937},
- {"Omicron", 927},
- {"Oslash", 216},
- {"Otilde", 213},
- {"Ouml", 214},
- {"Phi", 934},
- {"Pi", 928},
- {"Prime", 8243},
- {"Psi", 936},
- {"Rho", 929},
- {"Scaron", 352},
- {"Sigma", 931},
- {"THORN", 222},
- {"Tau", 932},
- {"Theta", 920},
- {"Uacute", 218},
- {"Ucirc", 219},
- {"Ugrave", 217},
- {"Upsilon", 933},
- {"Uuml", 220},
- {"Xi", 926},
- {"Yacute", 221},
- {"Yuml", 376},
- {"Zeta", 918},
- {"aacute", 225},
- {"acirc", 226},
- {"acute", 180},
- {"aelig", 230},
- {"agrave", 224},
- {"alefsym", 8501},
- {"alpha", 945},
- {"amp", 38},
- {"and", 8743},
- {"ang", 8736},
- {"aring", 229},
- {"asymp", 8776},
- {"atilde", 227},
- {"auml", 228},
- {"bdquo", 8222},
- {"beta", 946},
- {"brvbar", 166},
- {"bull", 8226},
- {"cap", 8745},
- {"ccedil", 231},
- {"cdots", 8943},
- {"cedil", 184},
- {"cent", 162},
- {"chi", 967},
- {"circ", 710},
- {"clubs", 9827},
- {"cong", 8773},
- {"copy", 169},
- {"crarr", 8629},
- {"cup", 8746},
- {"curren", 164},
- {"dArr", 8659},
- {"dagger", 8224},
- {"darr", 8595},
- {"ddots", 8945},
- {"deg", 176},
- {"delta", 948},
- {"diams", 9830},
- {"divide", 247},
- {"eacute", 233},
- {"ecirc", 234},
- {"egrave", 232},
- {"_emdash", 8212}, /* non-standard but commonly used */
- {"empty", 8709},
- {"emsp", 8195},
- {"_endash", 8211}, /* non-standard but commonly used */
- {"ensp", 8194},
- {"epsilon", 949},
- {"equiv", 8801},
- {"eta", 951},
- {"eth", 240},
- {"euml", 235},
- {"euro", 8364},
- {"exist", 8707},
- {"fnof", 402},
- {"forall", 8704},
- {"frac12", 189},
- {"frac14", 188},
- {"frac34", 190},
- {"frasl", 8260},
- {"gamma", 947},
- {"ge", 8805},
- {"gt", 62},
- {"hArr", 8660},
- {"harr", 8596},
- {"hearts", 9829},
- {"hellip", 8230},
- {"iacute", 237},
- {"icirc", 238},
- {"iexcl", 161},
- {"igrave", 236},
- {"image", 8465},
- {"infin", 8734},
- {"int", 8747},
- {"iota", 953},
- {"iquest", 191},
- {"isin", 8712},
- {"iuml", 239},
- {"kappa", 954},
- {"lArr", 8656},
- {"lambda", 955},
- {"lang", 9001},
- {"laquo", 171},
- {"larr", 8592},
- {"lceil", 8968},
- {"_ldots", 8230},
- {"ldquo", 8220},
- {"le", 8804},
- {"lfloor", 8970},
- {"lowast", 8727},
- {"loz", 9674},
- {"lrm", 8206},
- {"lsaquo", 8249},
- {"lsquo", 8216},
- {"lt", 60},
- {"macr", 175},
- {"mdash", 8212},
- {"micro", 181},
- {"middot", 183},
- {"minus", 8722},
- {"mu", 956},
- {"nabla", 8711},
- {"nbsp", 160},
- {"ndash", 8211},
- {"ne", 8800},
- {"ni", 8715},
- {"not", 172},
- {"notin", 8713},
- {"nsub", 8836},
- {"ntilde", 241},
- {"nu", 957},
- {"oacute", 243},
- {"ocirc", 244},
- {"oelig", 339},
- {"ograve", 242},
- {"oline", 8254},
- {"omega", 969},
- {"omicron", 959},
- {"oplus", 8853},
- {"or", 8744},
- {"ordf", 170},
- {"ordm", 186},
- {"oslash", 248},
- {"otilde", 245},
- {"otimes", 8855},
- {"ouml", 246},
- {"para", 182},
- {"part", 8706},
- {"permil", 8240},
- {"perp", 8869},
- {"phi", 966},
- {"pi", 960},
- {"piv", 982},
- {"plusmn", 177},
- {"pound", 163},
- {"prime", 8242},
- {"prod", 8719},
- {"prop", 8733},
- {"psi", 968},
- {"quad", 8193},
- {"quot", 34},
- {"rArr", 8658},
- {"radic", 8730},
- {"rang", 9002},
- {"raquo", 187},
- {"rarr", 8594},
- {"rceil", 8969},
- {"rdquo", 8221},
- {"real", 8476},
- {"reg", 174},
- {"rfloor", 8971},
- {"rho", 961},
- {"rlm", 8207},
- {"rsaquo", 8250},
- {"rsquo", 8217},
- {"sbquo", 8218},
- {"scaron", 353},
- {"sdot", 8901},
- {"sect", 167},
- {"shy", 173},
- {"sigma", 963},
- {"sigmaf", 962},
- {"sim", 8764},
- {"_sp", 8194},
- {"spades", 9824},
- {"sub", 8834},
- {"sube", 8838},
- {"sum", 8721},
- {"sup", 8835},
- {"sup1", 185},
- {"sup2", 178},
- {"sup3", 179},
- {"supe", 8839},
- {"szlig", 223},
- {"tau", 964},
- {"there4", 8756},
- {"theta", 952},
- {"thetasym", 977},
- {"thinsp", 8201},
- {"thorn", 254},
- {"tilde", 732},
- {"times", 215},
- {"trade", 8482},
- {"uArr", 8657},
- {"uacute", 250},
- {"uarr", 8593},
- {"ucirc", 251},
- {"ugrave", 249},
- {"uml", 168},
- {"upsih", 978},
- {"upsilon", 965},
- {"uuml", 252},
- {"_varepsilon", 8712},
- {"varphi", 981},
- {"_varpi", 982},
- {"varrho", 1009},
- {"vdots", 8942},
- {"_vsigma", 962},
- {"_vtheta", 977},
- {"weierp", 8472},
- {"xi", 958},
- {"yacute", 253},
- {"yen", 165},
- {"yuml", 255},
- {"zeta", 950},
- {"zwj", 8205},
- {"zwnj", 8204}
- };
- static Hchar byrune[nelem(byname)];
- static int
- hnamecmp(const void *va, const void *vb)
- {
- Hchar *a, *b;
-
- a = (Hchar*)va;
- b = (Hchar*)vb;
- return strcmp(a->s, b->s);
- }
- static int
- hrunecmp(const void *va, const void *vb)
- {
- Hchar *a, *b;
-
- a = (Hchar*)va;
- b = (Hchar*)vb;
- return a->r - b->r;
- }
- static void
- html_init(void)
- {
- static int init;
- int i;
-
- if(init)
- return;
- init = 1;
- memmove(byrune, byname, sizeof byrune);
-
- /* Eliminate names we aren't allowed to generate. */
- for(i=0; i<nelem(byrune); i++){
- if(byrune[i].s[0] == '_'){
- byrune[i].r = Runeerror;
- byname[i].s++;
- }
- }
-
- qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
- qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
- }
- static Rune
- findbyname(char *s)
- {
- Hchar *h;
- int n, m, x;
-
- h = byname;
- n = nelem(byname);
- while(n > 0){
- m = n/2;
- x = strcmp(h[m].s, s);
- if(x == 0)
- return h[m].r;
- if(x < 0){
- h += m+1;
- n -= m+1;
- }else
- n = m;
- }
- return Runeerror;
- }
- static char*
- findbyrune(Rune r)
- {
- Hchar *h;
- int n, m;
- if(r == Runeerror)
- return nil;
- h = byrune;
- n = nelem(byrune);
- while(n > 0){
- m = n/2;
- if(h[m].r == r)
- return h[m].s;
- if(h[m].r < r){
- h += m+1;
- n -= m+1;
- }else
- n = m;
- }
- return nil;
- }
- void
- html_in(int fd, long *x, struct convert *out)
- {
- char buf[100], *p;
- Biobuf b;
- Rune rbuf[N];
- Rune *r, *er;
- int c, i;
-
- USED(x);
-
- html_init();
- r = rbuf;
- er = rbuf+N;
- Binit(&b, fd, OREAD);
- while((c = Bgetrune(&b)) != Beof){
- if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
- }
- if(c == '&'){
- buf[0] = c;
- for(i=1; i<nelem(buf)-1;){
- c = Bgetc(&b);
- if(c == Beof)
- break;
- buf[i++] = c;
- if(strchr("; \t\r\n", c))
- break;
- }
- buf[i] = 0;
- if(buf[i-1] == ';'){
- buf[i-1] = 0;
- if((c = findbyname(buf+1)) != Runeerror){
- *r++ = c;
- continue;
- }
- buf[i-1] = ';';
- if(buf[1] == '#'){
- if(buf[2] == 'x')
- c = strtol(buf+3, &p, 16);
- else
- c = strtol(buf+2, &p, 10);
- if(*p != ';' || c >= NRUNE || c < 0)
- goto bad;
- *r++ = c;
- continue;
- }
- }
- bad:
- for(p=buf; p<buf+i; ){
- p += chartorune(r++, p);
- if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
- }
- }
- continue;
- }
- *r++ = c;
- }
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- OUT(out, rbuf, 0);
- }
- /*
- * use biobuf because can use more than UTFmax bytes per rune
- */
- void
- html_out(Rune *r, int n, long *x)
- {
- char *s;
- Biobuf b;
- Rune *er;
-
- USED(x);
- html_init();
- Binit(&b, 1, OWRITE);
- er = r+n;
- for(; r<er; r++){
- if(*r < Runeself)
- Bputrune(&b, *r);
- else if((s = findbyrune(*r)) != nil)
- Bprint(&b, "&%s;", s);
- else
- Bprint(&b, "&#%d;", *r);
- }
- Bflush(&b);
- }
|