12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502 |
- #include <u.h>
- #include <libc.h>
- #include <draw.h>
- #include <ctype.h>
- #include <html.h>
- #include "impl.h"
- typedef struct TokenSource TokenSource;
- struct TokenSource
- {
- int i; // index of next byte to use
- uchar* data; // all the data
- int edata; // data[0:edata] is valid
- int chset; // one of US_Ascii, etc.
- int mtype; // TextHtml or TextPlain
- };
// Negative sentinel codes.  Nothing in the visible part of this file
// refers to them (presumably end-of-buffer and end-of-file markers).
enum {
	EOB = -1,
	EOF = -2
};
// True when c may appear inside a tag or attribute name: a letter,
// a digit, '-' or '.', and below 256.  c must be non-negative; the
// range test comes first so the ctype call never sees a wide value.
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Sizes (in elements) of the on-stack scratch buffers used while lexing.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
- // HTML 4.0 tag names.
- // Keep sorted, and in correspondence with enum in iparse.h.
- Rune* tagnames[] = {
- L" ",
- L"!",
- L"a",
- L"abbr",
- L"acronym",
- L"address",
- L"applet",
- L"area",
- L"b",
- L"base",
- L"basefont",
- L"bdo",
- L"big",
- L"blink",
- L"blockquote",
- L"body",
- L"bq",
- L"br",
- L"button",
- L"caption",
- L"center",
- L"cite",
- L"code",
- L"col",
- L"colgroup",
- L"dd",
- L"del",
- L"dfn",
- L"dir",
- L"div",
- L"dl",
- L"dt",
- L"em",
- L"fieldset",
- L"font",
- L"form",
- L"frame",
- L"frameset",
- L"h1",
- L"h2",
- L"h3",
- L"h4",
- L"h5",
- L"h6",
- L"head",
- L"hr",
- L"html",
- L"i",
- L"iframe",
- L"img",
- L"input",
- L"ins",
- L"isindex",
- L"kbd",
- L"label",
- L"legend",
- L"li",
- L"link",
- L"map",
- L"menu",
- L"meta",
- L"nobr",
- L"noframes",
- L"noscript",
- L"object",
- L"ol",
- L"optgroup",
- L"option",
- L"p",
- L"param",
- L"pre",
- L"q",
- L"s",
- L"samp",
- L"script",
- L"select",
- L"small",
- L"span",
- L"strike",
- L"strong",
- L"style",
- L"sub",
- L"sup",
- L"table",
- L"tbody",
- L"td",
- L"textarea",
- L"tfoot",
- L"th",
- L"thead",
- L"title",
- L"tr",
- L"tt",
- L"u",
- L"ul",
- L"var"
- };
- // HTML 4.0 attribute names.
- // Keep sorted, and in correspondence with enum in impl.h.
- Rune* attrnames[] = {
- L"abbr",
- L"accept-charset",
- L"access-key",
- L"action",
- L"align",
- L"alink",
- L"alt",
- L"archive",
- L"axis",
- L"background",
- L"bgcolor",
- L"border",
- L"cellpadding",
- L"cellspacing",
- L"char",
- L"charoff",
- L"charset",
- L"checked",
- L"cite",
- L"class",
- L"classid",
- L"clear",
- L"code",
- L"codebase",
- L"codetype",
- L"color",
- L"cols",
- L"colspan",
- L"compact",
- L"content",
- L"coords",
- L"data",
- L"datetime",
- L"declare",
- L"defer",
- L"dir",
- L"disabled",
- L"enctype",
- L"face",
- L"for",
- L"frame",
- L"frameborder",
- L"headers",
- L"height",
- L"href",
- L"hreflang",
- L"hspace",
- L"http-equiv",
- L"id",
- L"ismap",
- L"label",
- L"lang",
- L"link",
- L"longdesc",
- L"marginheight",
- L"marginwidth",
- L"maxlength",
- L"media",
- L"method",
- L"multiple",
- L"name",
- L"nohref",
- L"noresize",
- L"noshade",
- L"nowrap",
- L"object",
- L"onblur",
- L"onchange",
- L"onclick",
- L"ondblclick",
- L"onfocus",
- L"onkeypress",
- L"onkeyup",
- L"onload",
- L"onmousedown",
- L"onmousemove",
- L"onmouseout",
- L"onmouseover",
- L"onmouseup",
- L"onreset",
- L"onselect",
- L"onsubmit",
- L"onunload",
- L"profile",
- L"prompt",
- L"readonly",
- L"rel",
- L"rev",
- L"rows",
- L"rowspan",
- L"rules",
- L"scheme",
- L"scope",
- L"scrolling",
- L"selected",
- L"shape",
- L"size",
- L"span",
- L"src",
- L"standby",
- L"start",
- L"style",
- L"summary",
- L"tabindex",
- L"target",
- L"text",
- L"title",
- L"type",
- L"usemap",
- L"valign",
- L"value",
- L"valuetype",
- L"version",
- L"vlink",
- L"vspace",
- L"width"
- };
- // Character entity to unicode character number map.
- // Keep sorted by name.
- StringInt chartab[]= {
- {L"AElig", 198},
- {L"Aacute", 193},
- {L"Acirc", 194},
- {L"Agrave", 192},
- {L"Alpha", 913},
- {L"Aring", 197},
- {L"Atilde", 195},
- {L"Auml", 196},
- {L"Beta", 914},
- {L"Ccedil", 199},
- {L"Chi", 935},
- {L"Dagger", 8225},
- {L"Delta", 916},
- {L"ETH", 208},
- {L"Eacute", 201},
- {L"Ecirc", 202},
- {L"Egrave", 200},
- {L"Epsilon", 917},
- {L"Eta", 919},
- {L"Euml", 203},
- {L"Gamma", 915},
- {L"Iacute", 205},
- {L"Icirc", 206},
- {L"Igrave", 204},
- {L"Iota", 921},
- {L"Iuml", 207},
- {L"Kappa", 922},
- {L"Lambda", 923},
- {L"Mu", 924},
- {L"Ntilde", 209},
- {L"Nu", 925},
- {L"OElig", 338},
- {L"Oacute", 211},
- {L"Ocirc", 212},
- {L"Ograve", 210},
- {L"Omega", 937},
- {L"Omicron", 927},
- {L"Oslash", 216},
- {L"Otilde", 213},
- {L"Ouml", 214},
- {L"Phi", 934},
- {L"Pi", 928},
- {L"Prime", 8243},
- {L"Psi", 936},
- {L"Rho", 929},
- {L"Scaron", 352},
- {L"Sigma", 931},
- {L"THORN", 222},
- {L"Tau", 932},
- {L"Theta", 920},
- {L"Uacute", 218},
- {L"Ucirc", 219},
- {L"Ugrave", 217},
- {L"Upsilon", 933},
- {L"Uuml", 220},
- {L"Xi", 926},
- {L"Yacute", 221},
- {L"Yuml", 376},
- {L"Zeta", 918},
- {L"aacute", 225},
- {L"acirc", 226},
- {L"acute", 180},
- {L"aelig", 230},
- {L"agrave", 224},
- {L"alefsym", 8501},
- {L"alpha", 945},
- {L"amp", 38},
- {L"and", 8743},
- {L"ang", 8736},
- {L"aring", 229},
- {L"asymp", 8776},
- {L"atilde", 227},
- {L"auml", 228},
- {L"bdquo", 8222},
- {L"beta", 946},
- {L"brvbar", 166},
- {L"bull", 8226},
- {L"cap", 8745},
- {L"ccedil", 231},
- {L"cdots", 8943},
- {L"cedil", 184},
- {L"cent", 162},
- {L"chi", 967},
- {L"circ", 710},
- {L"clubs", 9827},
- {L"cong", 8773},
- {L"copy", 169},
- {L"crarr", 8629},
- {L"cup", 8746},
- {L"curren", 164},
- {L"dArr", 8659},
- {L"dagger", 8224},
- {L"darr", 8595},
- {L"ddots", 8945},
- {L"deg", 176},
- {L"delta", 948},
- {L"diams", 9830},
- {L"divide", 247},
- {L"eacute", 233},
- {L"ecirc", 234},
- {L"egrave", 232},
- {L"emdash", 8212}, /* non-standard but commonly used */
- {L"empty", 8709},
- {L"emsp", 8195},
- {L"endash", 8211}, /* non-standard but commonly used */
- {L"ensp", 8194},
- {L"epsilon", 949},
- {L"equiv", 8801},
- {L"eta", 951},
- {L"eth", 240},
- {L"euml", 235},
- {L"euro", 8364},
- {L"exist", 8707},
- {L"fnof", 402},
- {L"forall", 8704},
- {L"frac12", 189},
- {L"frac14", 188},
- {L"frac34", 190},
- {L"frasl", 8260},
- {L"gamma", 947},
- {L"ge", 8805},
- {L"gt", 62},
- {L"hArr", 8660},
- {L"harr", 8596},
- {L"hearts", 9829},
- {L"hellip", 8230},
- {L"iacute", 237},
- {L"icirc", 238},
- {L"iexcl", 161},
- {L"igrave", 236},
- {L"image", 8465},
- {L"infin", 8734},
- {L"int", 8747},
- {L"iota", 953},
- {L"iquest", 191},
- {L"isin", 8712},
- {L"iuml", 239},
- {L"kappa", 954},
- {L"lArr", 8656},
- {L"lambda", 955},
- {L"lang", 9001},
- {L"laquo", 171},
- {L"larr", 8592},
- {L"lceil", 8968},
- {L"ldots", 8230},
- {L"ldquo", 8220},
- {L"le", 8804},
- {L"lfloor", 8970},
- {L"lowast", 8727},
- {L"loz", 9674},
- {L"lrm", 8206},
- {L"lsaquo", 8249},
- {L"lsquo", 8216},
- {L"lt", 60},
- {L"macr", 175},
- {L"mdash", 8212},
- {L"micro", 181},
- {L"middot", 183},
- {L"minus", 8722},
- {L"mu", 956},
- {L"nabla", 8711},
- {L"nbsp", 160},
- {L"ndash", 8211},
- {L"ne", 8800},
- {L"ni", 8715},
- {L"not", 172},
- {L"notin", 8713},
- {L"nsub", 8836},
- {L"ntilde", 241},
- {L"nu", 957},
- {L"oacute", 243},
- {L"ocirc", 244},
- {L"oelig", 339},
- {L"ograve", 242},
- {L"oline", 8254},
- {L"omega", 969},
- {L"omicron", 959},
- {L"oplus", 8853},
- {L"or", 8744},
- {L"ordf", 170},
- {L"ordm", 186},
- {L"oslash", 248},
- {L"otilde", 245},
- {L"otimes", 8855},
- {L"ouml", 246},
- {L"para", 182},
- {L"part", 8706},
- {L"permil", 8240},
- {L"perp", 8869},
- {L"phi", 966},
- {L"pi", 960},
- {L"piv", 982},
- {L"plusmn", 177},
- {L"pound", 163},
- {L"prime", 8242},
- {L"prod", 8719},
- {L"prop", 8733},
- {L"psi", 968},
- {L"quad", 8193},
- {L"quot", 34},
- {L"rArr", 8658},
- {L"radic", 8730},
- {L"rang", 9002},
- {L"raquo", 187},
- {L"rarr", 8594},
- {L"rceil", 8969},
- {L"rdquo", 8221},
- {L"real", 8476},
- {L"reg", 174},
- {L"rfloor", 8971},
- {L"rho", 961},
- {L"rlm", 8207},
- {L"rsaquo", 8250},
- {L"rsquo", 8217},
- {L"sbquo", 8218},
- {L"scaron", 353},
- {L"sdot", 8901},
- {L"sect", 167},
- {L"shy", 173},
- {L"sigma", 963},
- {L"sigmaf", 962},
- {L"sim", 8764},
- {L"sp", 8194},
- {L"spades", 9824},
- {L"sub", 8834},
- {L"sube", 8838},
- {L"sum", 8721},
- {L"sup", 8835},
- {L"sup1", 185},
- {L"sup2", 178},
- {L"sup3", 179},
- {L"supe", 8839},
- {L"szlig", 223},
- {L"tau", 964},
- {L"there4", 8756},
- {L"theta", 952},
- {L"thetasym", 977},
- {L"thinsp", 8201},
- {L"thorn", 254},
- {L"tilde", 732},
- {L"times", 215},
- {L"trade", 8482},
- {L"uArr", 8657},
- {L"uacute", 250},
- {L"uarr", 8593},
- {L"ucirc", 251},
- {L"ugrave", 249},
- {L"uml", 168},
- {L"upsih", 978},
- {L"upsilon", 965},
- {L"uuml", 252},
- {L"varepsilon", 8712},
- {L"varphi", 981},
- {L"varpi", 982},
- {L"varrho", 1009},
- {L"vdots", 8942},
- {L"vsigma", 962},
- {L"vtheta", 977},
- {L"weierp", 8472},
- {L"xi", 958},
- {L"yacute", 253},
- {L"yen", 165},
- {L"yuml", 255},
- {L"zeta", 950},
- {L"zwj", 8205},
- {L"zwnj", 8204}
- };
- #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
// Codes Winstart..Winend are the ones Windows interpolates into the
// Latin-1 set.  They are not supposed to appear in HTML, but they
// do, so each one is mapped to a Unicode replacement.
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the replacement for code c.
// 8226 (a bullet) fills every position that has no better mapping.
static int	winchars[] = {
	8226,						// 127
	8226, 8226, 8218,  402, 8222, 8230, 8224, 8225,	// 128-135
	 710, 8240,  352, 8249,  338, 8226, 8226, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	 732, 8482,  353, 8250,  339, 8226, 8226,  376	// 152-159
};
- static StringInt* tagtable; // initialized from tagnames
- static StringInt* attrtable; // initialized from attrnames
- static void lexinit(void);
- static int getplaindata(TokenSource* ts, Token* a, int* pai);
- static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
- static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
- static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
- static Rune* buftostr(Rune* s, Rune* buf, int j);
- static int comment(TokenSource* ts);
- static int findstr(TokenSource* ts, Rune* s);
- static int ampersand(TokenSource* ts);
- static int lowerc(int c);
- static int getchar(TokenSource* ts);
- static void ungetchar(TokenSource* ts, int c);
- static void backup(TokenSource* ts, int savei);
- static void freeinsidetoken(Token* t);
- static void freeattrs(Attr* ahead);
- static Attr* newattr(int attid, Rune* value, Attr* link);
- static int Tconv(Fmt* f);
- int dbglex = 0;
- static int lexinited = 0;
- static void
- lexinit(void)
- {
- tagtable = _makestrinttab(tagnames, Numtags);
- attrtable = _makestrinttab(attrnames, Numattrs);
- fmtinstall('T', Tconv);
- lexinited = 1;
- }
- static TokenSource*
- newtokensource(uchar* data, int edata, int chset, int mtype)
- {
- TokenSource* ans;
- assert(chset == US_Ascii || chset == ISO_8859_1 ||
- chset == UTF_8 || chset == Unicode);
- ans = (TokenSource*)emalloc(sizeof(TokenSource));
- ans->i = 0;
- ans->data = data;
- ans->edata = edata;
- ans->chset = chset;
- ans->mtype = mtype;
- return ans;
- }
- enum {
- ToksChunk = 500,
- };
- // Call this to get the tokens.
- // The number of returned tokens is returned in *plen.
- Token*
- _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
- {
- TokenSource* ts;
- Token* a;
- int alen;
- int ai;
- int starti;
- int c;
- int tag;
- if(!lexinited)
- lexinit();
- ts = newtokensource(data, datalen, chset, mtype);
- if(dbglex)
- fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
- alen = 0;
- ai = 0;
- a = 0;
- if(ts->mtype == TextHtml) {
- for(;;) {
- if(alen - ai < ToksChunk/32) {
- alen += ToksChunk;
- a = erealloc(a, alen*sizeof *a);
- }
- starti = ts->i;
- c = getchar(ts);
- if(c < 0)
- break;
- if(c == '<') {
- tag = gettag(ts, starti, a, &ai);
- if(tag == Tscript || tag == Tstyle) {
- // special rules for getting Data after....
- starti = ts->i;
- c = getchar(ts);
- tag = getscriptdata(ts, c, starti, a, &ai, tag);
- }
- }
- else
- tag = getdata(ts, c, starti, a, &ai);
- if(tag == -1)
- break;
- else if(dbglex > 1 && tag != Comment)
- fprint(2, "lex: got token %T\n", &a[ai-1]);
- }
- }
- else {
- // plain text (non-html) tokens
- for(;;) {
- if(alen - ai < ToksChunk/32) {
- alen += ToksChunk;
- a = erealloc(a, alen*sizeof *a);
- }
- tag = getplaindata(ts, a, &ai);
- if(tag == -1)
- break;
- if(dbglex > 1)
- fprint(2, "lex: got token %T\n", &a[ai]);
- }
- }
- free(ts);
- if(dbglex)
- fprint(2, "lex: returning %d tokens\n", ai);
- *plen = ai;
- if(ai == 0){
- free(a);
- a = 0;
- }
- return a;
- }
- // For case where source isn't HTML.
- // Just make data tokens, one per line (or partial line,
- // at end of buffer), ignoring non-whitespace control
- // characters and dumping \r's.
- // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
- // Otherwise return -1;
- static int
- getplaindata(TokenSource* ts, Token* a, int* pai)
- {
- Rune* s;
- int j;
- int starti;
- int c;
- Token* tok;
- Rune buf[BIGBUFSIZE];
- s = nil;
- j = 0;
- starti = ts->i;
- for(c = getchar(ts); c >= 0; c = getchar(ts)) {
- if(c < ' ') {
- if(isspace(c)) {
- if(c == '\r') {
- // ignore it unless no following '\n',
- // in which case treat it like '\n'
- c = getchar(ts);
- if(c != '\n') {
- if(c >= 0)
- ungetchar(ts, c);
- c = '\n';
- }
- }
- }
- else
- c = 0;
- }
- if(c != 0) {
- buf[j++] = c;
- if(j == nelem(buf)-1) {
- s = buftostr(s, buf, j);
- j = 0;
- }
- }
- if(c == '\n')
- break;
- }
- s = buftostr(s, buf, j);
- if(s == nil)
- return -1;
- tok = &a[(*pai)++];
- tok->tag = Data;
- tok->text = s;
- tok->attr = nil;
- tok->starti = starti;
- return Data;
- }
- // Return concatenation of s and buf[0:j]
- static Rune*
- buftostr(Rune* s, Rune* buf, int j)
- {
- int i;
- if(s == nil)
- s = _Strndup(buf, j);
- else {
- i = _Strlen(s);
- s = realloc(s, ( i+j+1)*sizeof *s);
- memcpy(&s[i], buf, j*sizeof *s);
- s[i+j] = 0;
- }
- return s;
- }
- // Gather data up to next start-of-tag or end-of-buffer.
- // Translate entity references (&).
- // Ignore non-whitespace control characters and get rid of \r's.
- // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
- // Otherwise return -1;
- static int
- getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
- {
- Rune* s;
- int j;
- int c;
- Token* tok;
- Rune buf[SMALLBUFSIZE];
- s = nil;
- j = 0;
- for(c = firstc; c >= 0; c = getchar(ts)){
- if(c == '&') {
- c = ampersand(ts);
- if(c < 0)
- break;
- }
- else if(c < ' ') {
- if(isspace(c)) {
- if(c == '\r') {
- // ignore it unless no following '\n',
- // in which case treat it like '\n'
- c = getchar(ts);
- if(c != '\n') {
- if(c >= 0)
- ungetchar(ts, c);
- c = '\n';
- }
- }
- }
- else {
- if(warn)
- fprint(2, "warning: non-whitespace control character %d ignored\n", c);
- c = 0;
- }
- }
- else if(c == '<') {
- ungetchar(ts, c);
- break;
- }
- if(c != 0) {
- buf[j++] = c;
- if(j == nelem(buf)-1) {
- s = buftostr(s, buf, j);
- j = 0;
- }
- }
- }
- s = buftostr(s, buf, j);
- if(s == nil)
- return -1;
- tok = &a[(*pai)++];
- tok->tag = Data;
- tok->text = s;
- tok->attr = nil;
- tok->starti = starti;
- return Data;
- }
- // The rules for lexing scripts are different (ugh).
- // Gather up everything until see an "</" tagnames[tok] ">"
- static int
- getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
- {
- Rune* s;
- int j;
- int tstarti;
- int savei;
- int c;
- int tag;
- int done;
- Token* tok;
- Rune buf[BIGBUFSIZE];
- s = nil;
- j = 0;
- tstarti = starti;
- c = firstc;
- done = 0;
- while(c >= 0) {
- if(c == '<') {
- // other browsers ignore stuff to end of line after <!
- savei = ts->i;
- c = getchar(ts);
- if(c == '!') {
- if(comment(ts) == -1)
- break;
- if(c == '\r')
- c = getchar(ts);
- if(c == '\n')
- c = getchar(ts);
- }
- else if(c >= 0) {
- backup(ts, savei);
- tag = gettag(ts, tstarti, a, pai);
- if(tag == -1)
- break;
- if(tag != Comment)
- (*pai)--;
- backup(ts, tstarti);
- if(tag == findtag + RBRA) {
- done = 1;
- break;
- }
- // here tag was not the one we were looking for, so take as regular data
- c = getchar(ts);
- }
- }
- if(c < 0)
- break;
- if(c != 0) {
- buf[j++] = c;
- if(j == nelem(buf)-1) {
- s = buftostr(s, buf, j);
- j = 0;
- }
- }
- tstarti = ts->i;
- c = getchar(ts);
- }
- if(done || ts->i == ts->edata) {
- s = buftostr(s, buf, j);
- tok = &a[(*pai)++];
- tok->tag = Data;
- tok->text = s;
- tok->attr = nil;
- tok->starti = starti;
- return Data;
- }
- free(s);
- backup(ts, starti);
- return -1;
- }
- // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
- // ends before then, return -1).
- // If it's a tag, look up the name, gather the attributes, and return
- // the appropriate token.
- // Else it's either just plain data or some kind of ignorable stuff:
- // return Data or Comment as appropriate.
- // If it's not a Comment, put it in a[*pai] and bump *pai.
- static int
- gettag(TokenSource* ts, int starti, Token* a, int* pai)
- {
- int rbra;
- int ans;
- Attr* al;
- int nexti;
- int c;
- int ti;
- int afnd;
- int attid;
- int quote;
- Rune* val;
- int nv;
- int i;
- int tag;
- Token* tok;
- Rune buf[BIGBUFSIZE];
- rbra = 0;
- nexti = ts->i;
- tok = &a[*pai];
- tok->tag = Notfound;
- tok->text = nil;
- tok->attr = nil;
- tok->starti = starti;
- c = getchar(ts);
- if(c == '/') {
- rbra = RBRA;
- c = getchar(ts);
- }
- if(c < 0)
- goto eob_done;
- if(c >= 256 || !isalpha(c)) {
- // not a tag
- if(c == '!') {
- ans = comment(ts);
- if(ans != -1)
- return ans;
- goto eob_done;
- }
- else {
- backup(ts, nexti);
- tok->tag = Data;
- tok->text = _Strdup(L"<");
- (*pai)++;
- return Data;
- }
- }
- // c starts a tagname
- buf[0] = c;
- i = 1;
- while(1) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- if(!ISNAMCHAR(c))
- break;
- // if name is bigger than buf it won't be found anyway...
- if(i < BIGBUFSIZE)
- buf[i++] = c;
- }
- if(_lookup(tagtable, Numtags, buf, i, &tag))
- tok->tag = tag + rbra;
- else
- tok->text = _Strndup(buf, i); // for warning print, in build
- // attribute gathering loop
- al = nil;
- while(1) {
- // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
- // skip whitespace
- attrloop_continue:
- while(c < 256 && isspace(c)) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- }
- if(c == '>')
- goto attrloop_done;
- if(c == '<') {
- if(warn)
- fprint(2, "warning: unclosed tag\n");
- ungetchar(ts, c);
- goto attrloop_done;
- }
- if(c >= 256 || !isalpha(c)) {
- if(warn)
- fprint(2, "warning: expected attribute name\n");
- // skipt to next attribute name
- while(1) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- if(c < 256 && isalpha(c))
- goto attrloop_continue;
- if(c == '<') {
- if(warn)
- fprint(2, "warning: unclosed tag\n");
- ungetchar(ts, 60);
- goto attrloop_done;
- }
- if(c == '>')
- goto attrloop_done;
- }
- }
- // gather attribute name
- buf[0] = c;
- i = 1;
- while(1) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- if(!ISNAMCHAR(c))
- break;
- if(i < BIGBUFSIZE-1)
- buf[i++] = c;
- }
- afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
- if(warn && !afnd) {
- buf[i] = 0;
- fprint(2, "warning: unknown attribute name %S\n", buf);
- }
- // skip whitespace
- while(c < 256 && isspace(c)) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- }
- if(c != '=') {
- if(afnd)
- al = newattr(attid, nil, al);
- goto attrloop_continue;
- }
- //# c is '=' here; skip whitespace
- while(1) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- if(c >= 256 || !isspace(c))
- break;
- }
- quote = 0;
- if(c == '\'' || c == '"') {
- quote = c;
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- }
- val = nil;
- nv = 0;
- while(1) {
- valloop_continue:
- if(c < 0)
- goto eob_done;
- if(c == '>') {
- if(quote) {
- // c might be part of string (though not good style)
- // but if line ends before close quote, assume
- // there was an unmatched quote
- ti = ts->i;
- while(1) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- if(c == quote) {
- backup(ts, ti);
- buf[nv++] = '>';
- if(nv == BIGBUFSIZE-1) {
- val = buftostr(val, buf, nv);
- nv = 0;
- }
- c = getchar(ts);
- goto valloop_continue;
- }
- if(c == '\n') {
- if(warn)
- fprint(2, "warning: apparent unmatched quote\n");
- backup(ts, ti);
- c = '>';
- goto valloop_done;
- }
- }
- }
- else
- goto valloop_done;
- }
- if(quote) {
- if(c == quote) {
- c = getchar(ts);
- if(c < 0)
- goto eob_done;
- goto valloop_done;
- }
- if(c == '\r') {
- c = getchar(ts);
- goto valloop_continue;
- }
- if(c == '\t' || c == '\n')
- c = ' ';
- }
- else {
- if(c < 256 && isspace(c))
- goto valloop_done;
- }
- if(c == '&') {
- c = ampersand(ts);
- if(c == -1)
- goto eob_done;
- }
- buf[nv++] = c;
- if(nv == BIGBUFSIZE-1) {
- val = buftostr(val, buf, nv);
- nv = 0;
- }
- c = getchar(ts);
- }
- valloop_done:
- if(afnd) {
- val = buftostr(val, buf, nv);
- al = newattr(attid, val, al);
- }
- }
- attrloop_done:
- tok->attr = al;
- (*pai)++;
- return tok->tag;
- eob_done:
- if(warn)
- fprint(2, "warning: incomplete tag at end of page\n");
- backup(ts, nexti);
- tok->tag = Data;
- tok->text = _Strdup(L"<");
- return Data;
- }
- // We've just read a '<!' at position starti,
- // so this may be a comment or other ignored section, or it may
- // be just a literal string if there is no close before end of file
- // (other browsers do that).
- // The accepted practice seems to be (note: contrary to SGML spec!):
- // If see <!--, look for --> to close, or if none, > to close.
- // If see <!(not --), look for > to close.
- // If no close before end of file, leave original characters in as literal data.
- //
- // If we see ignorable stuff, return Comment.
- // Else return nil (caller should back up and try again when more data arrives,
- // unless at end of file, in which case caller should just make '<' a data token).
- static int
- comment(TokenSource* ts)
- {
- int nexti;
- int havecomment;
- int c;
- nexti = ts->i;
- havecomment = 0;
- c = getchar(ts);
- if(c == '-') {
- c = getchar(ts);
- if(c == '-') {
- if(findstr(ts, L"-->"))
- havecomment = 1;
- else
- backup(ts, nexti);
- }
- }
- if(!havecomment) {
- if(c == '>')
- havecomment = 1;
- else if(c >= 0) {
- if(findstr(ts, L">"))
- havecomment = 1;
- }
- }
- if(havecomment)
- return Comment;
- return -1;
- }
- // Look for string s in token source.
- // If found, return 1, with buffer at next char after s,
- // else return 0 (caller should back up).
- static int
- findstr(TokenSource* ts, Rune* s)
- {
- int c0;
- int n;
- int nexti;
- int i;
- int c;
- c0 = s[0];
- n = runestrlen(s);
- while(1) {
- c = getchar(ts);
- if(c < 0)
- break;
- if(c == c0) {
- if(n == 1)
- return 1;
- nexti = ts->i;
- for(i = 1; i < n; i++) {
- c = getchar(ts);
- if(c < 0)
- goto mainloop_done;
- if(c != s[i])
- break;
- }
- if(i == n)
- return 1;
- backup(ts, nexti);
- }
- }
- mainloop_done:
- return 0;
- }
- // We've just read an '&'; look for an entity reference
- // name, and if found, return translated char.
- // if there is a complete entity name but it isn't known,
- // back up to just past the '&' and return '&'.
- // If the entity can't be completed in the current buffer, back up
- // to the '&' and return -1.
- static int
- ampersand(TokenSource* ts)
- {
- int savei;
- int c;
- int fnd;
- int ans;
- int v;
- int k;
- Rune buf[25];
- savei = ts->i;
- c = getchar(ts);
- fnd = 0;
- ans = -1;
- if(c == '#') {
- c = getchar(ts);
- v = 0;
- if(c == 'X' || c == 'x')
- for(c = getchar(ts); c < 256; c = getchar(ts))
- if(c >= '0' && c <= '9')
- v = v*16+c-'0';
- else if(c >= 'A' && c<= 'F')
- v = v*16+c-'A'+10;
- else if(c >= 'a' && c <= 'f')
- v = v*16+c-'a'+10;
- else
- break;
- else
- while(c >= 0) {
- if(!(c < 256 && isdigit(c)))
- break;
- v = v*10 + c - 48;
- c = getchar(ts);
- }
- if(c >= 0) {
- if(!(c == ';' || c == '\n' || c == '\r'))
- ungetchar(ts, c);
- c = v;
- if(c == 160)
- c = 160;
- if(c >= Winstart && c <= Winend) {
- c = winchars[c - Winstart];
- }
- ans = c;
- fnd = 1;
- }
- }
- else if(c < 256 && isalpha(c)) {
- buf[0] = c;
- k = 1;
- while(1) {
- c = getchar(ts);
- if(c < 0)
- break;
- if(c < 256 && (isalpha(c) || isdigit(c))) {
- if(k < nelem(buf)-1)
- buf[k++] = c;
- }
- else {
- if(!(c == ';' || c == '\n' || c == '\r'))
- ungetchar(ts, c);
- break;
- }
- }
- if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
- fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
- }
- if(!fnd) {
- backup(ts, savei);
- ans = '&';
- }
- return ans;
- }
- // Get next char, obeying ts.chset.
- // Returns -1 if no complete character left before current end of data.
- static int
- getchar(TokenSource* ts)
- {
- uchar* buf;
- int c;
- int n;
- int ok;
- Rune r;
- if(ts->i >= ts->edata)
- return -1;
- buf = ts->data;
- c = buf[ts->i];
- switch(ts->chset) {
- case ISO_8859_1:
- if(c >= Winstart && c <= Winend)
- c = winchars[c - Winstart];
- ts->i++;
- break;
- case US_Ascii:
- if(c > 127) {
- if(warn)
- fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
- }
- ts->i++;
- break;
- case UTF_8:
- ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
- n = chartorune(&r, (char*)(buf+ts->i));
- if(ok) {
- if(warn && c == 0x80)
- fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
- ts->i += n;
- c = r;
- }
- else {
- // not enough bytes in buf to complete utf-8 char
- ts->i = ts->edata; // mark "all used"
- c = -1;
- }
- break;
- case Unicode:
- if(ts->i < ts->edata - 1) {
- //standards say most-significant byte first
- c = (c << 8)|(buf[ts->i + 1]);
- ts->i += 2;
- }
- else {
- ts->i = ts->edata; // mark "all used"
- c = -1;
- }
- break;
- default:
- return -1;
- }
- return c;
- }
- // Assuming c was the last character returned by getchar, set
- // things up so that next getchar will get that same character
- // followed by the current 'next character', etc.
- static void
- ungetchar(TokenSource* ts, int c)
- {
- int n;
- Rune r;
- char a[UTFmax];
- n = 1;
- switch(ts->chset) {
- case UTF_8:
- if(c >= 128) {
- r = c;
- n = runetochar(a, &r);
- }
- break;
- case Unicode:
- n = 2;
- break;
- }
- ts->i -= n;
- }
- // Restore ts so that it is at the state where the index was savei.
- static void
- backup(TokenSource* ts, int savei)
- {
- if(dbglex)
- fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
- ts->i = savei;
- }
- // Look for value associated with attribute attid in token t.
- // If there is one, return 1 and put the value in *pans,
- // else return 0.
- // If xfer is true, transfer ownership of the string to the caller
- // (nil it out here); otherwise, caller must duplicate the answer
- // if it needs to save it.
- // OK to have pans==0, in which case this is just looking
- // to see if token is present.
- int
- _tokaval(Token* t, int attid, Rune** pans, int xfer)
- {
- Attr* attr;
- attr = t->attr;
- while(attr != nil) {
- if(attr->attid == attid) {
- if(pans != nil)
- *pans = attr->value;
- if(xfer)
- attr->value = nil;
- return 1;
- }
- attr = attr->next;
- }
- if(pans != nil)
- *pans = nil;
- return 0;
- }
- static int
- Tconv(Fmt *f)
- {
- Token* t;
- int i;
- int tag;
- char* srbra;
- Rune* aname;
- Rune* tname;
- Attr* a;
- char buf[BIGBUFSIZE];
- t = va_arg(f->args, Token*);
- if(t == nil)
- sprint(buf, "<null>");
- else {
- i = 0;
- if(dbglex > 1)
- i = snprint(buf, sizeof(buf), "[%d]", t->starti);
- tag = t->tag;
- if(tag == Data) {
- i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
- }
- else {
- srbra = "";
- if(tag >= RBRA) {
- tag -= RBRA;
- srbra = "/";
- }
- tname = tagnames[tag];
- if(tag == Notfound)
- tname = L"?";
- i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
- for(a = t->attr; a != nil; a = a->next) {
- aname = attrnames[a->attid];
- i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
- if(a->value != nil)
- i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
- }
- i += snprint(buf+i, sizeof(buf)-i-1, ">");
- }
- buf[i] = 0;
- }
- return fmtstrcpy(f, buf);
- }
- // Attrs own their constituent strings, but build may eventually
- // transfer some values to its items and nil them out in the Attr.
- static Attr*
- newattr(int attid, Rune* value, Attr* link)
- {
- Attr* ans;
- ans = (Attr*)emalloc(sizeof(Attr));
- ans->attid = attid;
- ans->value = value;
- ans->next = link;
- return ans;
- }
- // Free list of Attrs linked through next field
- static void
- freeattrs(Attr* ahead)
- {
- Attr* a;
- Attr* nexta;
- a = ahead;
- while(a != nil) {
- nexta = a->next;
- free(a->value);
- free(a);
- a = nexta;
- }
- }
- // Free array of Tokens.
- // Allocated space might have room for more than n tokens,
- // but only n of them are initialized.
- // If caller has transferred ownership of constitutent strings
- // or attributes, it must have nil'd out the pointers in the Tokens.
- void
- _freetokens(Token* tarray, int n)
- {
- int i;
- Token* t;
- if(tarray == nil)
- return;
- for(i = 0; i < n; i++) {
- t = &tarray[i];
- free(t->text);
- freeattrs(t->attr);
- }
- free(tarray);
- }
|