123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331 |
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #include "hdr.h"
- #include "conv.h"
- typedef struct Hchar Hchar;
- struct Hchar
- {
- char *s;
- Rune r;
- };
- /* <, >, ", & intentionally omitted */
- static Hchar byname[] =
- {
- {"AElig", 198},
- {"Aacute", 193},
- {"Acirc", 194},
- {"Agrave", 192},
- {"Aring", 197},
- {"Atilde", 195},
- {"Auml", 196},
- {"Ccedil", 199},
- {"ETH", 208},
- {"Eacute", 201},
- {"Ecirc", 202},
- {"Egrave", 200},
- {"Euml", 203},
- {"Iacute", 205},
- {"Icirc", 206},
- {"Igrave", 204},
- {"Iuml", 207},
- {"Ntilde", 209},
- {"Oacute", 211},
- {"Ocirc", 212},
- {"Ograve", 210},
- {"Oslash", 216},
- {"Otilde", 213},
- {"Ouml", 214},
- {"THORN", 222},
- {"Uacute", 218},
- {"Ucirc", 219},
- {"Ugrave", 217},
- {"Uuml", 220},
- {"Yacute", 221},
- {"aacute", 225},
- {"acirc", 226},
- {"acute", 180},
- {"aelig", 230},
- {"agrave", 224},
- {"alpha", 945},
- {"aring", 229},
- {"atilde", 227},
- {"auml", 228},
- {"beta", 946},
- {"brvbar", 166},
- {"ccedil", 231},
- {"cdots", 8943},
- {"cedil", 184},
- {"cent", 162},
- {"chi", 967},
- {"copy", 169},
- {"curren", 164},
- {"ddots", 8945},
- {"deg", 176},
- {"delta", 948},
- {"divide", 247},
- {"eacute", 233},
- {"ecirc", 234},
- {"egrave", 232},
- {"emdash", 8212}, /* non-standard but commonly used */
- {"emsp", 8195},
- {"endash", 8211}, /* non-standard but commonly used */
- {"ensp", 8194},
- {"epsilon", 949},
- {"eta", 951},
- {"eth", 240},
- {"euml", 235},
- {"frac12", 189},
- {"frac14", 188},
- {"frac34", 190},
- {"gamma", 947},
- {"iacute", 237},
- {"icirc", 238},
- {"iexcl", 161},
- {"igrave", 236},
- {"iota", 953},
- {"iquest", 191},
- {"iuml", 239},
- {"kappa", 954},
- {"lambda", 955},
- {"laquo", 171},
- {"ldquo", 8220},
- {"ldots", 8230},
- {"lsquo", 8216},
- {"macr", 175},
- {"mdash", 8212},
- {"micro", 181},
- {"middot", 183},
- {"mu", 956},
- {"nbsp", 160},
- {"ndash", 8211},
- {"not", 172},
- {"ntilde", 241},
- {"nu", 957},
- {"oacute", 243},
- {"ocirc", 244},
- {"ograve", 242},
- {"omega", 969},
- {"omicron", 959},
- {"ordf", 170},
- {"ordm", 186},
- {"oslash", 248},
- {"otilde", 245},
- {"ouml", 246},
- {"para", 182},
- {"phi", 966},
- {"pi", 960},
- {"plusmn", 177},
- {"pound", 163},
- {"psi", 968},
- {"quad", 8193},
- {"raquo", 187},
- {"rdquo", 8221},
- {"reg", 174},
- {"rho", 961},
- {"rsquo", 8217},
- {"sect", 167},
- {"shy", 173},
- {"sigma", 963},
- {"sp", 8194},
- {"sup1", 185},
- {"sup2", 178},
- {"sup3", 179},
- {"szlig", 223},
- {"tau", 964},
- {"theta", 952},
- {"thinsp", 8201},
- {"thorn", 254},
- {"times", 215},
- {"trade", 8482},
- {"uacute", 250},
- {"ucirc", 251},
- {"ugrave", 249},
- {"uml", 168},
- {"upsilon", 965},
- {"uuml", 252},
- {"varepsilon", 8712},
- {"varphi", 981},
- {"varpi", 982},
- {"varrho", 1009},
- {"vdots", 8942},
- {"vsigma", 962},
- {"vtheta", 977},
- {"xi", 958},
- {"yacute", 253},
- {"yen", 165},
- {"yuml", 255},
- {"zeta", 950}
- };
- static Hchar byrune[nelem(byname)];
- static int
- hnamecmp(const void *va, const void *vb)
- {
- Hchar *a, *b;
-
- a = (Hchar*)va;
- b = (Hchar*)vb;
- return strcmp(a->s, b->s);
- }
- static int
- hrunecmp(const void *va, const void *vb)
- {
- Hchar *a, *b;
-
- a = (Hchar*)va;
- b = (Hchar*)vb;
- return a->r - b->r;
- }
- static void
- html_init(void)
- {
- static int init;
-
- if(init)
- return;
- init = 1;
- memmove(byrune, byname, sizeof byrune);
- qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
- qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
- }
- static Rune
- findbyname(char *s)
- {
- Hchar *h;
- int n, m, x;
-
- h = byname;
- n = nelem(byname);
- while(n > 0){
- m = n/2;
- x = strcmp(h[m].s, s);
- if(x == 0)
- return h[m].r;
- if(x < 0){
- h += m+1;
- n -= m+1;
- }else
- n = m;
- }
- return Runeerror;
- }
- static char*
- findbyrune(Rune r)
- {
- Hchar *h;
- int n, m;
- h = byrune;
- n = nelem(byrune);
- while(n > 0){
- m = n/2;
- if(h[m].r == r)
- return h[m].s;
- if(h[m].r < r){
- h += m+1;
- n -= m+1;
- }else
- n = m;
- }
- return nil;
- }
- void
- html_in(int fd, long *x, struct convert *out)
- {
- char buf[100], *p;
- Biobuf b;
- Rune rbuf[N];
- Rune *r, *er;
- int c, i;
-
- USED(x);
-
- html_init();
- r = rbuf;
- er = rbuf+N;
- Binit(&b, fd, OREAD);
- while((c = Bgetrune(&b)) != Beof){
- if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
- }
- if(c == '&'){
- buf[0] = c;
- for(i=1; i<nelem(buf)-1;){
- c = Bgetc(&b);
- if(c == Beof)
- break;
- buf[i++] = c;
- if(strchr("; \t\r\n", c))
- break;
- }
- buf[i] = 0;
- if(buf[i-1] == ';'){
- buf[i-1] = 0;
- if((c = findbyname(buf+1)) != Runeerror){
- *r++ = c;
- continue;
- }
- buf[i-1] = ';';
- if(buf[1] == '#'){
- if(buf[2] == 'x')
- c = strtol(buf+3, &p, 16);
- else
- c = strtol(buf+2, &p, 10);
- if(*p != ';' || c >= NRUNE || c < 0)
- goto bad;
- *r++ = c;
- continue;
- }
- }
- bad:
- for(p=buf; p<buf+i; ){
- p += chartorune(r++, p);
- if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
- }
- }
- continue;
- }
- *r++ = c;
- }
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- }
- /*
- * use biobuf because can use more than UTFmax bytes per rune
- */
- void
- html_out(Rune *r, int n, long *x)
- {
- char *s;
- Biobuf b;
- Rune *er;
-
- USED(x);
- html_init();
- Binit(&b, 1, OWRITE);
- er = r+n;
- for(; r<er; r++){
- if(*r < Runeself)
- Bputrune(&b, *r);
- else if((s = findbyrune(*r)) != nil)
- Bprint(&b, "&%s;", s);
- else
- Bprint(&b, "&#%d;", *r);
- }
- Bflush(&b);
- }
|