html 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420
  1. .TH HTML 2
  2. .SH NAME
  3. parsehtml,
  4. printitems,
  5. validitems,
  6. freeitems,
  7. freedocinfo,
  8. dimenkind,
  9. dimenspec,
  10. targetid,
  11. targetname,
  12. fromStr,
  13. toStr
  14. \- HTML parser
  15. .SH SYNOPSIS
  16. .nf
  17. .PP
  18. .ft L
  19. #include <u.h>
  20. #include <libc.h>
  21. #include <html.h>
  22. .ft P
  23. .PP
  24. .ta \w'\fLToken* 'u
  25. .B
  26. Item* parsehtml(uchar* data, int datalen, Rune* src, int mtype,
  27. .B
  28. int chset, Docinfo** pdi)
  29. .PP
  30. .B
  31. void printitems(Item* items, char* msg)
  32. .PP
  33. .B
  34. int validitems(Item* items)
  35. .PP
  36. .B
  37. void freeitems(Item* items)
  38. .PP
  39. .B
  40. void freedocinfo(Docinfo* d)
  41. .PP
  42. .B
  43. int dimenkind(Dimen d)
  44. .PP
  45. .B
  46. int dimenspec(Dimen d)
  47. .PP
  48. .B
  49. int targetid(Rune* s)
  50. .PP
  51. .B
  52. Rune* targetname(int targid)
  53. .PP
  54. .B
  55. uchar* fromStr(Rune* buf, int n, int chset)
  56. .PP
  57. .B
  58. Rune* toStr(uchar* buf, int n, int chset)
  59. .SH DESCRIPTION
  60. .PP
  61. This library implements a parser for HTML 4.0 documents.
  62. The parsed HTML is converted into an intermediate representation that
  63. describes how the formatted HTML should be laid out.
  64. .PP
  65. .I Parsehtml
  66. parses an entire HTML document contained in the buffer
  67. .I data
  68. and having length
  69. .IR datalen .
  70. The URL of the document should be passed in as
  71. .IR src .
  72. .I Mtype
  73. is the media type of the document, which should be either
  74. .B TextHtml
  75. or
  76. .BR TextPlain .
  77. The character set of the document is described in
  78. .IR chset ,
  79. which can be one of
  80. .BR US_Ascii ,
  81. .BR ISO_8859_1 ,
  82. .B UTF_8
  83. or
  84. .BR Unicode .
  85. The return value is a linked list of
  86. .B Item
  87. structures, described in detail below.
  88. As a side effect,
  89. .BI * pdi
  90. is set to point to a newly created
  91. .B Docinfo
  92. structure, containing information pertaining to the entire document.
  93. .PP
  94. The library expects two allocation routines to be provided by the
  95. caller,
  96. .B emalloc
  97. and
  98. .BR erealloc .
  99. These routines are analogous to the standard malloc and realloc routines,
  100. except that they should not return if the memory allocation fails.
  101. In addition,
  102. .B emalloc
  103. is required to zero the memory.
  104. .PP
  105. For debugging purposes,
  106. .I printitems
  107. may be called to display the contents of an item list; individual items may
  108. be printed using the
  109. .B %I
  110. print verb, installed on the first call to
  111. .IR parsehtml .
  112. .I validitems
  113. traverses the item list, checking that all of the pointers are valid.
  114. It returns
  115. .B 1
  116. if everything is ok, and
  117. .B 0
  118. if an error was found.
  119. Normally, one would not call these routines directly.
  120. Instead, one sets the global variable
  121. .I dbgbuild
  122. and the library calls them automatically.
  123. One can also set
  124. .IR warn ,
  125. to cause the library to print a warning whenever it finds a problem with the
  126. input document, and
  127. .IR dbglex ,
  128. to print debugging information in the lexer.
  129. .PP
  130. When an item list is finished with, it should be freed with
  131. .IR freeitems .
  132. Then,
  133. .I freedocinfo
  134. should be called on the pointer returned in
  135. .BI * pdi\f1.
  136. .PP
  137. .I Dimenkind
  138. and
  139. .I dimenspec
  140. are provided to interpret the
  141. .B Dimen
  142. type, as described in the section
  143. .IR "Dimension Specifications" .
  144. .PP
  145. Frame target names are mapped to integer ids via a global, permanent mapping.
  146. To find the value for a given name, call
  147. .IR targetid ,
  148. which allocates a new id if the name hasn't been seen before.
  149. The name of a given, known id may be retrieved using
  150. .IR targetname .
  151. The library predefines
  152. .BR FTtop ,
  153. .BR FTself ,
  154. .B FTparent
  155. and
  156. .BR FTblank .
  157. .PP
  158. The library handles all text as Unicode strings (type
  159. .BR Rune* ).
  160. Character set conversion is provided by
  161. .I fromStr
  162. and
  163. .IR toStr .
  164. .I FromStr
  165. takes
  166. .I n
  167. Unicode characters from
  168. .I buf
  169. and converts them to the character set described by
  170. .IR chset .
  171. .I ToStr
  172. takes
  173. .I n
  174. bytes from
  175. .IR buf ,
  176. interpreted as belonging to character set
  177. .IR chset ,
  178. and converts them to a Unicode string.
  179. Both routines null-terminate the result, and use
  180. .B emalloc
  181. to allocate space for it.
  182. .SS Items
  183. The return value of
  184. .I parsehtml
  185. is a linked list of variant structures,
  186. with the generic portion described by the following definition:
  187. .PP
  188. .EX
  189. .ta 6n +\w'Genattr* 'u
  190. typedef struct Item Item;
  191. struct Item
  192. {
  193. Item* next;
  194. int width;
  195. int height;
  196. int ascent;
  197. int anchorid;
  198. int state;
  199. Genattr* genattr;
  200. int tag;
  201. };
  202. .EE
  203. .PP
  204. The field
  205. .B next
  206. points to the successor in the linked list of items, while
  207. .BR width ,
  208. .BR height ,
  209. and
  210. .B ascent
  211. are intended for use by the caller as part of the layout process.
  212. .BR Anchorid ,
  213. if non-zero, gives the integer id assigned by the parser to the anchor that
  214. this item is in (see section
  215. .IR Anchors ).
  216. .B State
  217. is a collection of flags and values described as follows:
  218. .PP
  219. .EX
  220. .ta 6n +\w'IFindentshift = 'u
  221. enum
  222. {
  223. IFbrk = 0x80000000,
  224. IFbrksp = 0x40000000,
  225. IFnobrk = 0x20000000,
  226. IFcleft = 0x10000000,
  227. IFcright = 0x08000000,
  228. IFwrap = 0x04000000,
  229. IFhang = 0x02000000,
  230. IFrjust = 0x01000000,
  231. IFcjust = 0x00800000,
  232. IFsmap = 0x00400000,
  233. IFindentshift = 8,
  234. IFindentmask = (255<<IFindentshift),
  235. IFhangmask = 255
  236. };
  237. .EE
  238. .PP
  239. .B IFbrk
  240. is set if a break is to be forced before placing this item.
  241. .B IFbrksp
  242. is set if a 1 line space should be added to the break (in which case
  243. .B IFbrk
  244. is also set).
  245. .B IFnobrk
  246. is set if a break is not permitted before the item.
  247. .B IFcleft
  248. is set if left floats should be cleared (that is, if the list of pending left floats should be placed)
  249. before this item is placed, and
  250. .B IFcright
  251. is set for right floats.
  252. In both cases, IFbrk is also set.
  253. .B IFwrap
  254. is set if the line containing this item is allowed to wrap.
  255. .B IFhang
  256. is set if this item hangs into the left indent.
  257. .B IFrjust
  258. is set if the line containing this item should be right justified,
  259. and
  260. .B IFcjust
  261. is set for center justified lines.
  262. .B IFsmap
  263. is used to indicate that an image is a server-side map.
  264. The low 8 bits, represented by
  265. .BR IFhangmask ,
  266. indicate the current hang into left indent, in tenths of a tab stop.
  267. The next 8 bits, represented by
  268. .B IFindentmask
  269. and
  270. .BR IFindentshift ,
  271. indicate the current indent in tab stops.
  272. .PP
  273. The field
  274. .B genattr
  275. is an optional pointer to an auxiliary structure, described in the section
  276. .IR "Generic Attributes" .
  277. .PP
  278. Finally,
  279. .B tag
  280. describes which variant type this item has.
  281. It can have one of the values
  282. .BR Itexttag ,
  283. .BR Iruletag ,
  284. .BR Iimagetag ,
  285. .BR Iformfieldtag ,
  286. .BR Itabletag ,
  287. .B Ifloattag
  288. or
  289. .BR Ispacertag .
  290. For each of these values, there is an additional structure defined, which
  291. includes Item as an unnamed initial substructure, and then defines additional
  292. fields.
  293. .PP
  294. Items of type
  295. .B Itexttag
  296. represent a piece of text, using the following structure:
  297. .PP
  298. .EX
  299. .ta 6n +\w'Rune* 'u
  300. struct Itext
  301. {
  302. Item;
  303. Rune* s;
  304. int fnt;
  305. int fg;
  306. uchar voff;
  307. uchar ul;
  308. };
  309. .EE
  310. .PP
  311. Here
  312. .B s
  313. is a null-terminated Unicode string of the actual characters making up this text item,
  314. .B fnt
  315. is the font number (described in the section
  316. .IR "Font Numbers" ),
  317. and
  318. .B fg
  319. is the RGB encoded color for the text.
  320. .B Voff
  321. measures the vertical offset from the baseline; subtract
  322. .B Voffbias
  323. to get the actual value (negative values represent a displacement down the page).
  324. The field
  325. .B ul
  326. is the underline style:
  327. .B ULnone
  328. if no underline,
  329. .B ULunder
  330. for conventional underline, and
  331. .B ULmid
  332. for strike-through.
  333. .PP
  334. Items of type
  335. .B Iruletag
  336. represent a horizontal rule, as follows:
  337. .PP
  338. .EX
  339. .ta 6n +\w'Dimen 'u
  340. struct Irule
  341. {
  342. Item;
  343. uchar align;
  344. uchar noshade;
  345. int size;
  346. Dimen wspec;
  347. };
  348. .EE
  349. .PP
  350. Here
  351. .B align
  352. is the alignment specification (described in the corresponding section),
  353. .B noshade
  354. is set if the rule should not be shaded,
  355. .B size
  356. is the height of the rule (as set by the size attribute),
  357. and
  358. .B wspec
  359. is the desired width (see section
  360. .IR "Dimension Specifications" ).
  361. .PP
  362. Items of type
  363. .B Iimagetag
  364. describe embedded images, for which the following structure is defined:
  365. .PP
  366. .EX
  367. .ta 6n +\w'Iimage* 'u
  368. struct Iimage
  369. {
  370. Item;
  371. Rune* imsrc;
  372. int imwidth;
  373. int imheight;
  374. Rune* altrep;
  375. Map* map;
  376. int ctlid;
  377. uchar align;
  378. uchar hspace;
  379. uchar vspace;
  380. uchar border;
  381. Iimage* nextimage;
  382. };
  383. .EE
  384. .PP
  385. Here
  386. .B imsrc
  387. is the URL of the image source,
  388. .B imwidth
  389. and
  390. .BR imheight ,
  391. if non-zero, contain the specified width and height for the image,
  392. and
  393. .B altrep
  394. is the text to use as an alternative to the image, if the image is not displayed.
  395. .BR Map ,
  396. if set, points to a structure describing an associated client-side image map.
  397. .B Ctlid
  398. is reserved for use by the application, for handling animated images.
  399. .B Align
  400. encodes the alignment specification of the image.
  401. .B Hspace
  402. contains the number of pixels to pad the image with on either side, and
  403. .B Vspace
  404. the padding above and below.
  405. .B Border
  406. is the width of the border to draw around the image.
  407. .B Nextimage
  408. points to the next image in the document (the head of this list is
  409. .BR Docinfo.images ).
  410. .PP
  411. For items of type
  412. .BR Iformfieldtag ,
  413. the following structure is defined:
  414. .PP
  415. .EX
  416. .ta 6n +\w'Formfield* 'u
  417. struct Iformfield
  418. {
  419. Item;
  420. Formfield* formfield;
  421. };
  422. .EE
  423. .PP
  424. This adds a single field,
  425. .BR formfield ,
  426. which points to a structure describing a field in a form, described in section
  427. .IR Forms .
  428. .PP
  429. For items of type
  430. .BR Itabletag ,
  431. the following structure is defined:
  432. .PP
  433. .EX
  434. .ta 6n +\w'Table* 'u
  435. struct Itable
  436. {
  437. Item;
  438. Table* table;
  439. };
  440. .EE
  441. .PP
  442. .B Table
  443. points to a structure describing the table, described in the section
  444. .IR Tables .
  445. .PP
  446. For items of type
  447. .BR Ifloattag ,
  448. the following structure is defined:
  449. .PP
  450. .EX
  451. .ta 6n +\w'Ifloat* 'u
  452. struct Ifloat
  453. {
  454. Item;
  455. Item* item;
  456. int x;
  457. int y;
  458. uchar side;
  459. uchar infloats;
  460. Ifloat* nextfloat;
  461. };
  462. .EE
  463. .PP
  464. The
  465. .B item
  466. points to a single item (either a table or an image) that floats (the text of the
  467. document flows around it), and
  468. .B side
  469. indicates the margin that this float sticks to; it is either
  470. .B ALleft
  471. or
  472. .BR ALright .
  473. .B X
  474. and
  475. .B y
  476. are reserved for use by the caller; these are typically used for the coordinates
  477. of the top of the float.
  478. .B Infloats
  479. is used by the caller to keep track of whether it has placed the float.
  480. .B Nextfloat
  481. is used by the caller to link together all of the floats that it has placed.
  482. .PP
  483. For items of type
  484. .BR Ispacertag ,
  485. the following structure is defined:
  486. .PP
  487. .EX
  488. .ta 6n +\w'Item; 'u
  489. struct Ispacer
  490. {
  491. Item;
  492. int spkind;
  493. };
  494. .EE
  495. .PP
  496. .B Spkind
  497. encodes the kind of spacer, and may be one of
  498. .B ISPnull
  499. (zero height and width),
  500. .B ISPvline
  501. (takes on height and ascent of the current font),
  502. .B ISPhspace
  503. (has the width of a space in the current font) and
  504. .B ISPgeneral
  505. (for all other purposes, such as between markers and lists).
  506. .SS Generic Attributes
  507. .PP
  508. The genattr field of an item, if non-nil, points to a structure that holds
  509. the values of attributes not specific to any particular
  510. item type, as they occur on a wide variety of underlying HTML tags.
  511. The structure is as follows:
  512. .PP
  513. .EX
  514. .ta 6n +\w'SEvent* 'u
  515. typedef struct Genattr Genattr;
  516. struct Genattr
  517. {
  518. Rune* id;
  519. Rune* class;
  520. Rune* style;
  521. Rune* title;
  522. SEvent* events;
  523. };
  524. .EE
  525. .PP
  526. Fields
  527. .BR id ,
  528. .BR class ,
  529. .B style
  530. and
  531. .BR title ,
  532. when non-nil, contain values of correspondingly named attributes of the HTML tag
  533. associated with this item.
  534. .B Events
  535. is a linked list of events (with corresponding scripted actions) associated with the item:
  536. .PP
  537. .EX
  538. .ta 6n +\w'SEvent* 'u
  539. typedef struct SEvent SEvent;
  540. struct SEvent
  541. {
  542. SEvent* next;
  543. int type;
  544. Rune* script;
  545. };
  546. .EE
  547. .PP
  548. Here,
  549. .B next
  550. points to the next event in the list,
  551. .B type
  552. is one of
  553. .BR SEonblur ,
  554. .BR SEonchange ,
  555. .BR SEonclick ,
  556. .BR SEondblclick ,
  557. .BR SEonfocus ,
  558. .BR SEonkeypress ,
  559. .BR SEonkeyup ,
  560. .BR SEonload ,
  561. .BR SEonmousedown ,
  562. .BR SEonmousemove ,
  563. .BR SEonmouseout ,
  564. .BR SEonmouseover ,
  565. .BR SEonmouseup ,
  566. .BR SEonreset ,
  567. .BR SEonselect ,
  568. .B SEonsubmit
  569. or
  570. .BR SEonunload ,
  571. and
  572. .B script
  573. is the text of the associated script.
  574. .SS Dimension Specifications
  575. .PP
  576. Some structures include a dimension specification, used where
  577. a number can be followed by a
  578. .B %
  579. or a
  580. .B *
  581. to indicate
  582. percentage of total or relative weight.
  583. This is encoded using the following structure:
  584. .PP
  585. .EX
  586. .ta 6n +\w'int 'u
  587. typedef struct Dimen Dimen;
  588. struct Dimen
  589. {
  590. int kindspec;
  591. };
  592. .EE
  593. .PP
  594. Separate kind and spec values are extracted using
  595. .I dimenkind
  596. and
  597. .IR dimenspec .
  598. .I Dimenkind
  599. returns one of
  600. .BR Dnone ,
  601. .BR Dpixels ,
  602. .B Dpercent
  603. or
  604. .BR Drelative .
  605. .B Dnone
  606. means that no dimension was specified.
  607. In all other cases,
  608. .I dimenspec
  609. should be called to find the absolute number of pixels, the percentage of total,
  610. or the relative weight.
  611. .SS Background Specifications
  612. .PP
  613. It is possible to set the background of the entire document, and also
  614. for some parts of the document (such as tables).
  615. This is encoded as follows:
  616. .PP
  617. .EX
  618. .ta 6n +\w'Rune* 'u
  619. typedef struct Background Background;
  620. struct Background
  621. {
  622. Rune* image;
  623. int color;
  624. };
  625. .EE
  626. .PP
  627. .BR Image ,
  628. if non-nil, is the URL of an image to use as the background.
  629. If this is nil,
  630. .B color
  631. is used instead, as the RGB value for a solid fill color.
  632. .SS Alignment Specifications
  633. .PP
  634. Certain items have alignment specifiers taken from the following
  635. enumerated type:
  636. .PP
  637. .EX
  638. .ta 6n
  639. enum
  640. {
  641. ALnone = 0, ALleft, ALcenter, ALright, ALjustify,
  642. ALchar, ALtop, ALmiddle, ALbottom, ALbaseline
  643. };
  644. .EE
  645. .PP
  646. These values correspond to the various alignment types named in the HTML 4.0
  647. standard.
  648. If an item has an alignment of
  649. .B ALleft
  650. or
  651. .BR ALright ,
  652. the library automatically encapsulates it inside a float item.
  653. .PP
  654. Tables, and the various rows, columns and cells within them, have a more
  655. complex alignment specification, composed of separate vertical and
  656. horizontal alignments:
  657. .PP
  658. .EX
  659. .ta 6n +\w'uchar 'u
  660. typedef struct Align Align;
  661. struct Align
  662. {
  663. uchar halign;
  664. uchar valign;
  665. };
  666. .EE
  667. .PP
  668. .B Halign
  669. can be one of
  670. .BR ALnone ,
  671. .BR ALleft ,
  672. .BR ALcenter ,
  673. .BR ALright ,
  674. .B ALjustify
  675. or
  676. .BR ALchar .
  677. .B Valign
  678. can be one of
  679. .BR ALnone ,
  680. .BR ALmiddle ,
  681. .BR ALbottom ,
  682. .BR ALtop
  683. or
  684. .BR ALbaseline .
  685. .SS Font Numbers
  686. .PP
  687. Text items have an associated font number (the
  688. .B fnt
  689. field), which is encoded as
  690. .BR style*NumSize+size .
  691. Here,
  692. .B style
  693. is one of
  694. .BR FntR ,
  695. .BR FntI ,
  696. .B FntB
  697. or
  698. .BR FntT ,
  699. for roman, italic, bold and typewriter font styles, respectively, and size is
  700. .BR Tiny ,
  701. .BR Small ,
  702. .BR Normal ,
  703. .B Large
  704. or
  705. .BR Verylarge .
  706. The total number of possible font numbers is
  707. .BR NumFnt ,
  708. and the default font number is
  709. .B DefFnt
  710. (which is roman style, normal size).
  711. .SS Document Info
  712. .PP
  713. Global information about an HTML page is stored in the following structure:
  714. .PP
  715. .EX
  716. .ta 6n +\w'DestAnchor* 'u
  717. typedef struct Docinfo Docinfo;
  718. struct Docinfo
  719. {
  720. // stuff from HTTP headers, doc head, and body tag
  721. Rune* src;
  722. Rune* base;
  723. Rune* doctitle;
  724. Background background;
  725. Iimage* backgrounditem;
  726. int text;
  727. int link;
  728. int vlink;
  729. int alink;
  730. int target;
  731. int chset;
  732. int mediatype;
  733. int scripttype;
  734. int hasscripts;
  735. Rune* refresh;
  736. Kidinfo* kidinfo;
  737. int frameid;
  738. // info needed to respond to user actions
  739. Anchor* anchors;
  740. DestAnchor* dests;
  741. Form* forms;
  742. Table* tables;
  743. Map* maps;
  744. Iimage* images;
  745. };
  746. .EE
  747. .PP
  748. .B Src
  749. gives the URL of the original source of the document,
  750. and
  751. .B base
  752. is the base URL.
  753. .B Doctitle
  754. is the document's title, as set by a
  755. .B <title>
  756. element.
  757. .B Background
  758. is as described in the section
  759. .IR "Background Specifications" ,
  760. and
  761. .B backgrounditem
  762. is set to be an image item for the document's background image (if given as a URL),
  763. or else nil.
  764. .B Text
  765. gives the default foreground text color of the document,
  766. .B link
  767. the unvisited hyperlink color,
  768. .B vlink
  769. the visited hyperlink color, and
  770. .B alink
  771. the color for highlighting hyperlinks (all in 24-bit RGB format).
  772. .B Target
  773. is the default target frame id.
  774. .B Chset
  775. and
  776. .B mediatype
  777. are as for the
  778. .I chset
  779. and
  780. .I mtype
  781. parameters to
  782. .IR parsehtml .
  783. .B Scripttype
  784. is the type of any scripts contained in the document, and is always
  785. .BR TextJavascript .
  786. .B Hasscripts
  787. is set if the document contains any scripts.
  788. Scripting is currently unsupported.
  789. .B Refresh
  790. is the contents of a
  791. .B "<meta http-equiv=Refresh ...>"
  792. tag, if any.
  793. .B Kidinfo
  794. is set if this document is a frameset (see section
  795. .IR Frames ).
  796. .B Frameid
  797. is this document's frame id.
  798. .PP
  799. .B Anchors
  800. is a list of hyperlinks contained in the document,
  801. and
  802. .B dests
  803. is a list of hyperlink destinations within the page (see the following section for details).
  804. .BR Forms ,
  805. .B tables
  806. and
  807. .B maps
  808. are lists of the various forms, tables and client-side maps contained
  809. in the document, as described in subsequent sections.
  810. .B Images
  811. is a list of all the image items in the document.
  812. .SS Anchors
  813. .PP
  814. The library builds two lists for all of the
  815. .B <a>
  816. elements (anchors) in a document.
  817. Each anchor is assigned a unique anchor id within the document.
  818. For anchors which are hyperlinks (the
  819. .B href
  820. attribute was supplied), the following structure is defined:
  821. .PP
  822. .EX
  823. .ta 6n +\w'Anchor* 'u
  824. typedef struct Anchor Anchor;
  825. struct Anchor
  826. {
  827. Anchor* next;
  828. int index;
  829. Rune* name;
  830. Rune* href;
  831. int target;
  832. };
  833. .EE
  834. .PP
  835. .B Next
  836. points to the next anchor in the list (the head of this list is
  837. .BR Docinfo.anchors ).
  838. .B Index
  839. is the anchor id; each item within this hyperlink is tagged with this value
  840. in its
  841. .B anchorid
  842. field.
  843. .B Name
  844. and
  845. .B href
  846. are the values of the correspondingly named attributes of the anchor
  847. (in particular, href is the URL to go to).
  848. .B Target
  849. is the value of the target attribute (if provided) converted to a frame id.
  850. .PP
  851. Destinations within the document (anchors with the name attribute set)
  852. are held in the
  853. .B Docinfo.dests
  854. list, using the following structure:
  855. .PP
  856. .EX
  857. .ta 6n +\w'DestAnchor* 'u
  858. typedef struct DestAnchor DestAnchor;
  859. struct DestAnchor
  860. {
  861. DestAnchor* next;
  862. int index;
  863. Rune* name;
  864. Item* item;
  865. };
  866. .EE
  867. .PP
  868. .B Next
  869. is the next element of the list,
  870. .B index
  871. is the anchor id,
  872. .B name
  873. is the value of the name attribute, and
  874. .B item
  875. points to the item within the parsed document that should be considered
  876. to be the destination.
  877. .SS Forms
  878. .PP
  879. Any forms within a document are kept in a list, headed by
  880. .BR Docinfo.forms .
  881. The elements of this list are as follows:
  882. .PP
  883. .EX
  884. .ta 6n +\w'Formfield* 'u
  885. typedef struct Form Form;
  886. struct Form
  887. {
  888. Form* next;
  889. int formid;
  890. Rune* name;
  891. Rune* action;
  892. int target;
  893. int method;
  894. int nfields;
  895. Formfield* fields;
  896. };
  897. .EE
  898. .PP
  899. .B Next
  900. points to the next form in the list.
  901. .B Formid
  902. is a serial number for the form within the document.
  903. .B Name
  904. is the value of the form's name or id attribute.
  905. .B Action
  906. is the value of any action attribute.
  907. .B Target
  908. is the value of the target attribute (if any) converted to a frame target id.
  909. .B Method
  910. is one of
  911. .B HGet
  912. or
  913. .BR HPost .
  914. .B Nfields
  915. is the number of fields in the form, and
  916. .B fields
  917. is a linked list of the actual fields.
  918. .PP
  919. The individual fields in a form are described by the following structure:
  920. .PP
  921. .EX
  922. .ta 6n +\w'Formfield* 'u
  923. typedef struct Formfield Formfield;
  924. struct Formfield
  925. {
  926. Formfield* next;
  927. int ftype;
  928. int fieldid;
  929. Form* form;
  930. Rune* name;
  931. Rune* value;
  932. int size;
  933. int maxlength;
  934. int rows;
  935. int cols;
  936. uchar flags;
  937. Option* options;
  938. Item* image;
  939. int ctlid;
  940. SEvent* events;
  941. };
  942. .EE
  943. .PP
  944. Here,
  945. .B next
  946. points to the next field in the list.
  947. .B Ftype
  948. is the type of the field, which can be one of
  949. .BR Ftext ,
  950. .BR Fpassword ,
  951. .BR Fcheckbox ,
  952. .BR Fradio ,
  953. .BR Fsubmit ,
  954. .BR Fhidden ,
  955. .BR Fimage ,
  956. .BR Freset ,
  957. .BR Ffile ,
  958. .BR Fbutton ,
  959. .B Fselect
  960. or
  961. .BR Ftextarea .
  962. .B Fieldid
  963. is a serial number for the field within the form.
  964. .B Form
  965. points back to the form containing this field.
  966. .BR Name ,
  967. .BR value ,
  968. .BR size ,
  969. .BR maxlength ,
  970. .B rows
  971. and
  972. .B cols
  973. each contain the values of corresponding attributes of the field, if present.
  974. .B Flags
  975. contains per-field flags, of which
  976. .B FFchecked
  977. and
  978. .B FFmultiple
  979. are defined.
  980. .B Image
  981. is only used for fields of type
  982. .BR Fimage ;
  983. it points to an image item containing the image to be displayed.
  984. .B Ctlid
  985. is reserved for use by the caller, typically to store a unique id
  986. of an associated control used to implement the field.
  987. .B Events
  988. is the same as the corresponding field of the generic attributes
  989. associated with the item containing this field.
  990. .B Options
  991. is only used by fields of type
  992. .BR Fselect ;
  993. it consists of a list of possible options that may be selected for that
  994. field, using the following structure:
  995. .PP
  996. .EX
  997. .ta 6n +\w'Option* 'u
  998. typedef struct Option Option;
  999. struct Option
  1000. {
  1001. Option* next;
  1002. int selected;
  1003. Rune* value;
  1004. Rune* display;
  1005. };
  1006. .EE
  1007. .PP
  1008. .B Next
  1009. points to the next element of the list.
  1010. .B Selected
  1011. is set if this option is to be displayed initially.
  1012. .B Value
  1013. is the value to send when the form is submitted if this option is selected.
  1014. .B Display
  1015. is the string to display on the screen for this option.
  1016. .SS Tables
  1017. .PP
  1018. The library builds a list of all the tables in the document,
  1019. headed by
  1020. .BR Docinfo.tables .
  1021. Each element of this list has the following format:
  1022. .PP
  1023. .EX
  1024. .ta 6n +\w'Tablecell*** 'u
  1025. typedef struct Table Table;
  1026. struct Table
  1027. {
  1028. Table* next;
  1029. int tableid;
  1030. Tablerow* rows;
  1031. int nrow;
  1032. Tablecol* cols;
  1033. int ncol;
  1034. Tablecell* cells;
  1035. int ncell;
  1036. Tablecell*** grid;
  1037. Align align;
  1038. Dimen width;
  1039. int border;
  1040. int cellspacing;
  1041. int cellpadding;
  1042. Background background;
  1043. Item* caption;
  1044. uchar caption_place;
  1045. Lay* caption_lay;
  1046. int totw;
  1047. int toth;
  1048. int caph;
  1049. int availw;
  1050. Token* tabletok;
  1051. uchar flags;
  1052. };
  1053. .EE
  1054. .PP
  1055. .B Next
  1056. points to the next element in the list of tables.
  1057. .B Tableid
  1058. is a serial number for the table within the document.
  1059. .B Rows
  1060. is an array of row specifications (described below) and
  1061. .B nrow
  1062. is the number of elements in this array.
  1063. Similarly,
  1064. .B cols
  1065. is an array of column specifications, and
  1066. .B ncol
  1067. the size of this array.
  1068. .B Cells
  1069. is a list of all cells within the table (structure described below)
  1070. and
  1071. .B ncell
  1072. is the number of elements in this list.
  1073. Note that a cell may span multiple rows and/or columns, thus
  1074. .B ncell
  1075. may be smaller than
  1076. .BR nrow*ncol .
  1077. .B Grid
  1078. is a two-dimensional array of cells within the table; the cell
  1079. at row
  1080. .B i
  1081. and column
  1082. .B j
  1083. is
  1084. .BR Table.grid[i][j] .
  1085. A cell that spans multiple rows and/or columns will
  1086. be referenced by
  1087. .B grid
  1088. multiple times; however, it will only occur once in
  1089. .BR cells .
  1090. .B Align
  1091. gives the alignment specification for the entire table,
  1092. and
  1093. .B width
  1094. gives the requested width as a dimension specification.
  1095. .BR Border ,
  1096. .B cellspacing
  1097. and
  1098. .B cellpadding
  1099. give the values of the corresponding attributes for the table,
  1100. and
  1101. .B background
  1102. gives the requested background for the table.
  1103. .B Caption
  1104. is a linked list of items to be displayed as the caption of the
  1105. table, either above or below depending on whether
  1106. .B caption_place
  1107. is
  1108. .B ALtop
  1109. or
  1110. .BR ALbottom .
  1111. Most of the remaining fields are reserved for use by the caller,
  1112. except
  1113. .BR tabletok ,
  1114. which is reserved for internal use.
  1115. The type
  1116. .B Lay
  1117. is not defined by the library; the caller can provide its
  1118. own definition.
  1119. .PP
  1120. The
  1121. .B Tablecol
  1122. structure is defined for use by the caller.
  1123. The library ensures that the correct number of these
  1124. is allocated, but leaves them blank.
  1125. The fields are as follows:
  1126. .PP
  1127. .EX
  1128. .ta 6n +\w'Point 'u
  1129. typedef struct Tablecol Tablecol;
  1130. struct Tablecol
  1131. {
  1132. int width;
  1133. Align align;
  1134. Point pos;
  1135. };
  1136. .EE
  1137. .PP
  1138. The rows in the table are specified as follows:
  1139. .PP
  1140. .EX
  1141. .ta 6n +\w'Background 'u
  1142. typedef struct Tablerow Tablerow;
  1143. struct Tablerow
  1144. {
  1145. Tablerow* next;
  1146. Tablecell* cells;
  1147. int height;
  1148. int ascent;
  1149. Align align;
  1150. Background background;
  1151. Point pos;
  1152. uchar flags;
  1153. };
  1154. .EE
  1155. .PP
  1156. .B Next
  1157. is only used during parsing; it should be ignored by the caller.
  1158. .B Cells
  1159. provides a list of all the cells in a row, linked through their
  1160. .B nextinrow
  1161. fields (see below).
  1162. .BR Height ,
  1163. .B ascent
  1164. and
  1165. .B pos
  1166. are reserved for use by the caller.
  1167. .B Align
  1168. is the alignment specification for the row, and
  1169. .B background
  1170. is the background to use, if specified.
  1171. .B Flags
  1172. is used by the parser; ignore this field.
  1173. .PP
  1174. The individual cells of the table are described as follows:
  1175. .PP
  1176. .EX
  1177. .ta 6n +\w'Background 'u
  1178. typedef struct Tablecell Tablecell;
  1179. struct Tablecell
  1180. {
  1181. Tablecell* next;
  1182. Tablecell* nextinrow;
  1183. int cellid;
  1184. Item* content;
  1185. Lay* lay;
  1186. int rowspan;
  1187. int colspan;
  1188. Align align;
  1189. uchar flags;
  1190. Dimen wspec;
  1191. int hspec;
  1192. Background background;
  1193. int minw;
  1194. int maxw;
  1195. int ascent;
  1196. int row;
  1197. int col;
  1198. Point pos;
  1199. };
  1200. .EE
  1201. .PP
  1202. .B Next
  1203. is used to link together the list of all cells within a table
  1204. .RB ( Table.cells ),
  1205. whereas
  1206. .B nextinrow
  1207. is used to link together all the cells within a single row
  1208. .RB ( Tablerow.cells ).
  1209. .B Cellid
  1210. provides a serial number for the cell within the table.
  1211. .B Content
  1212. is a linked list of the items to be laid out within the cell.
  1213. .B Lay
  1214. is reserved for the user to describe how these items have
  1215. been laid out.
  1216. .B Rowspan
  1217. and
  1218. .B colspan
  1219. are the number of rows and columns spanned by this cell,
  1220. respectively.
  1221. .B Align
  1222. is the alignment specification for the cell.
  1223. .B Flags
  1224. is some combination of
  1225. .BR TFparsing ,
  1226. .B TFnowrap
  1227. and
  1228. .B TFisth
  1229. or'd together.
  1230. Here
  1231. .B TFparsing
  1232. is used internally by the parser, and should be ignored.
  1233. .B TFnowrap
  1234. means that the contents of the cell should not be
  1235. wrapped if they don't fit the available width;
  1236. rather, the table should be expanded if need be
  1237. (this is set when the nowrap attribute is supplied).
  1238. .B TFisth
  1239. means that the cell was created by the
  1240. .B <th>
  1241. element (rather than the
  1242. .B <td>
  1243. element),
  1244. indicating that it is a header cell rather than a data cell.
  1245. .B Wspec
  1246. provides a suggested width as a dimension specification,
  1247. and
  1248. .B hspec
  1249. provides a suggested height in pixels.
  1250. .B Background
  1251. gives a background specification for the individual cell.
  1252. .BR Minw ,
  1253. .BR maxw ,
  1254. .B ascent
  1255. and
  1256. .B pos
  1257. are reserved for use by the caller during layout.
  1258. .B Row
  1259. and
  1260. .B col
  1261. give the indices of the row and column of the top left-hand
  1262. corner of the cell within the table grid.
  1263. .SS Client-side Maps
  1264. .PP
  1265. The library builds a list of client-side maps, headed by
  1266. .BR Docinfo.maps ,
  1267. and having the following structure:
  1268. .PP
  1269. .EX
  1270. .ta 6n +\w'Rune* 'u
  1271. typedef struct Map Map;
  1272. struct Map
  1273. {
  1274. Map* next;
  1275. Rune* name;
  1276. Area* areas;
  1277. };
  1278. .EE
  1279. .PP
  1280. .B Next
  1281. points to the next element in the list,
  1282. .B name
  1283. is the name of the map (used to bind it to an image), and
  1284. .B areas
  1285. is a list of the areas within the image that comprise the map,
  1286. using the following structure:
  1287. .PP
  1288. .EX
  1289. .ta 6n +\w'Dimen* 'u
  1290. typedef struct Area Area;
  1291. struct Area
  1292. {
  1293. Area* next;
  1294. int shape;
  1295. Rune* href;
  1296. int target;
  1297. Dimen* coords;
  1298. int ncoords;
  1299. };
  1300. .EE
  1301. .PP
  1302. .B Next
  1303. points to the next element in the map's list of areas.
  1304. .B Shape
  1305. describes the shape of the area, and is one of
  1306. .BR SHrect ,
  1307. .B SHcircle
  1308. or
  1309. .BR SHpoly .
  1310. .B Href
  1311. is the URL associated with this area in its role as
  1312. a hypertext link, and
  1313. .B target
  1314. is the target frame it should be loaded in.
  1315. .B Coords
  1316. is an array of coordinates for the shape, and
  1317. .B ncoords
  1318. is the size of this array (number of elements).
  1319. .SS Frames
  1320. .PP
  1321. If the
  1322. .B Docinfo.kidinfo
  1323. field is set, the document is a frameset.
  1324. In this case, it is typical for
  1325. .I parsehtml
  1326. to return nil, as a document which is a frameset should have no actual
  1327. items that need to be laid out (such will appear only in subsidiary documents).
  1328. It is possible that items will be returned by a malformed document; the caller
  1329. should check for this and free any such items.
  1330. .PP
  1331. The
  1332. .B Kidinfo
  1333. structure itself reflects the fact that framesets can be nested within a document.
  1334. It is defined as follows:
  1335. .PP
  1336. .EX
  1337. .ta 6n +\w'Kidinfo* 'u
  1338. typedef struct Kidinfo Kidinfo;
  1339. struct Kidinfo
  1340. {
  1341. Kidinfo* next;
  1342. int isframeset;
  1343. // fields for "frame"
  1344. Rune* src;
  1345. Rune* name;
  1346. int marginw;
  1347. int marginh;
  1348. int framebd;
  1349. int flags;
  1350. // fields for "frameset"
  1351. Dimen* rows;
  1352. int nrows;
  1353. Dimen* cols;
  1354. int ncols;
  1355. Kidinfo* kidinfos;
  1356. Kidinfo* nextframeset;
  1357. };
  1358. .EE
  1359. .PP
  1360. .B Next
  1361. is only used if this structure is part of a containing frameset; it points to the next
  1362. element in the list of children of that frameset.
  1363. .B Isframeset
  1364. is set when this structure represents a frameset; if clear, it is an individual frame.
  1365. .PP
  1366. Some fields are used only for framesets.
  1367. .B Rows
  1368. is an array of dimension specifications for rows in the frameset, and
  1369. .B nrows
  1370. is the length of this array.
  1371. .B Cols
  1372. is the corresponding array for columns, of length
  1373. .BR ncols .
  1374. .B Kidinfos
  1375. points to a list of components contained within this frameset, each
  1376. of which may be a frameset or a frame.
  1377. .B Nextframeset
  1378. is only used during parsing, and should be ignored.
  1379. .PP
  1380. The remaining fields are used if the structure describes a frame, not a frameset.
  1381. .B Src
  1382. provides the URL for the document that should be initially loaded into this frame.
  1383. Note that this may be a relative URL, in which case it should be interpreted
  1384. using the containing document's URL as the base.
  1385. .B Name
  1386. gives the name of the frame, typically supplied via a name attribute in the HTML.
  1387. If no name was given, the library allocates one.
  1388. .BR Marginw ,
  1389. .B marginh
  1390. and
  1391. .B framebd
  1392. are the values of the marginwidth, marginheight and frameborder attributes, respectively.
  1393. .B Flags
  1394. can contain some combination of the following:
  1395. .B FRnoresize
  1396. (the frame had the noresize attribute set, and the user should not be allowed to resize it),
  1397. .B FRnoscroll
  1398. (the frame should not have any scroll bars),
  1399. .B FRhscroll
  1400. (the frame should have a horizontal scroll bar),
  1401. .B FRvscroll
  1402. (the frame should have a vertical scroll bar),
  1403. .B FRhscrollauto
  1404. (the frame should be automatically given a horizontal scroll bar if its contents
  1405. would not otherwise fit), and
  1406. .B FRvscrollauto
  1407. (the frame gets a vertical scroll bar only if required).
  1408. .SH SOURCE
  1409. .B /sys/src/libhtml
  1410. .SH SEE ALSO
  1411. .IR fmt (1)
  1412. .PP
  1413. W3C World Wide Web Consortium,
  1414. ``HTML 4.01 Specification''.
  1415. .SH BUGS
  1416. The entire HTML document must be loaded into memory before
  1417. any of it can be parsed.