/* html2ms.c (10 KB) */
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <ctype.h>
  4. #include <bio.h>
  5. enum
  6. {
  7. SSIZE = 10,
  8. /* list types */
  9. Lordered = 0,
  10. Lunordered,
  11. Lmenu,
  12. Ldir,
  13. };
  14. Biobuf in, out;
  15. int lastc = '\n';
  16. int inpre = 0;
  17. /* stack for fonts */
  18. char *fontstack[SSIZE];
  19. char *font = "R";
  20. int fsp;
  21. /* stack for lists */
  22. struct
  23. {
  24. int type;
  25. int ord;
  26. } liststack[SSIZE];
  27. int lsp;
  28. int quoting;
  29. typedef struct Goobie Goobie;
  30. struct Goobie
  31. {
  32. char *name;
  33. void (*f)(Goobie*, char*);
  34. void (*ef)(Goobie*, char*);
  35. };
  36. void eatwhite(void);
  37. void escape(void);
  38. typedef void Action(Goobie*, char*);
  39. Action g_ignore;
  40. Action g_unexpected;
  41. Action g_title;
  42. Action g_p;
  43. Action g_h;
  44. Action g_li;
  45. Action g_list, g_listend;
  46. Action g_pre;
  47. Action g_fpush, g_fpop;
  48. Action g_indent, g_exdent;
  49. Action g_dt;
  50. Action g_display;
  51. Action g_displayend;
  52. Action g_table, g_tableend, g_caption, g_captionend;
  53. Action g_br, g_hr;
  54. Goobie gtab[] =
  55. {
  56. "!--", g_ignore, g_unexpected,
  57. "!doctype", g_ignore, g_unexpected,
  58. "a", g_ignore, g_ignore,
  59. "address", g_display, g_displayend,
  60. "b", g_fpush, g_fpop,
  61. "base", g_ignore, g_unexpected,
  62. "blink", g_ignore, g_ignore,
  63. "blockquote", g_ignore, g_ignore,
  64. "body", g_ignore, g_ignore,
  65. "br", g_br, g_unexpected,
  66. "caption", g_caption, g_captionend,
  67. "center", g_ignore, g_ignore,
  68. "cite", g_ignore, g_ignore,
  69. "code", g_ignore, g_ignore,
  70. "dd", g_ignore, g_unexpected,
  71. "dfn", g_ignore, g_ignore,
  72. "dir", g_list, g_listend,
  73. "dl", g_indent, g_exdent,
  74. "dt", g_dt, g_unexpected,
  75. "em", g_ignore, g_ignore,
  76. "font", g_ignore, g_ignore,
  77. "form", g_ignore, g_ignore,
  78. "h1", g_h, g_p,
  79. "h2", g_h, g_p,
  80. "h3", g_h, g_p,
  81. "h4", g_h, g_p,
  82. "h5", g_h, g_p,
  83. "h6", g_h, g_p,
  84. "head", g_ignore, g_ignore,
  85. "hr", g_hr, g_unexpected,
  86. "html", g_ignore, g_ignore,
  87. "i", g_fpush, g_fpop,
  88. "input", g_ignore, g_unexpected,
  89. "img", g_ignore, g_unexpected,
  90. "isindex", g_ignore, g_unexpected,
  91. "kbd", g_fpush, g_fpop,
  92. "key", g_ignore, g_ignore,
  93. "li", g_li, g_unexpected,
  94. "link", g_ignore, g_unexpected,
  95. "listing", g_ignore, g_ignore,
  96. "menu", g_list, g_listend,
  97. "meta", g_ignore, g_unexpected,
  98. "nextid", g_ignore, g_unexpected,
  99. "ol", g_list, g_listend,
  100. "option", g_ignore, g_unexpected,
  101. "p", g_p, g_ignore,
  102. "plaintext", g_ignore, g_unexpected,
  103. "pre", g_pre, g_displayend,
  104. "samp", g_ignore, g_ignore,
  105. "select", g_ignore, g_ignore,
  106. "strong", g_ignore, g_ignore,
  107. "table", g_table, g_tableend,
  108. "textarea", g_ignore, g_ignore,
  109. "title", g_title, g_ignore,
  110. "tt", g_fpush, g_fpop,
  111. "u", g_ignore, g_ignore,
  112. "ul", g_list, g_listend,
  113. "var", g_ignore, g_ignore,
  114. "xmp", g_ignore, g_ignore,
  115. 0, 0, 0,
  116. };
  117. typedef struct Entity Entity;
  118. struct Entity
  119. {
  120. char *name;
  121. Rune value;
  122. };
  123. Entity pl_entity[]=
  124. {
  125. "#SPACE", L' ', "#RS", L'\n', "#RE", L'\r', "quot", L'"',
  126. "AElig", L'Æ', "Aacute", L'Á', "Acirc", L'Â', "Agrave", L'À', "Aring", L'Å',
  127. "Atilde", L'Ã', "Auml", L'Ä', "Ccedil", L'Ç', "ETH", L'Ð', "Eacute", L'É',
  128. "Ecirc", L'Ê', "Egrave", L'È', "Euml", L'Ë', "Iacute", L'Í', "Icirc", L'Î',
  129. "Igrave", L'Ì', "Iuml", L'Ï', "Ntilde", L'Ñ', "Oacute", L'Ó', "Ocirc", L'Ô',
  130. "Ograve", L'Ò', "Oslash", L'Ø', "Otilde", L'Õ', "Ouml", L'Ö', "THORN", L'Þ',
  131. "Uacute", L'Ú', "Ucirc", L'Û', "Ugrave", L'Ù', "Uuml", L'Ü', "Yacute", L'Ý',
  132. "aacute", L'á', "acirc", L'â', "aelig", L'æ', "agrave", L'à', "amp", L'&',
  133. "aring", L'å', "atilde", L'ã', "auml", L'ä', "ccedil", L'ç', "eacute", L'é',
  134. "ecirc", L'ê', "egrave", L'è', "eth", L'ð', "euml", L'ë', "gt", L'>',
  135. "iacute", L'í', "icirc", L'î', "igrave", L'ì', "iuml", L'ï', "lt", L'<',
  136. "ntilde", L'ñ', "oacute", L'ó', "ocirc", L'ô', "ograve", L'ò', "oslash", L'ø',
  137. "otilde", L'õ', "ouml", L'ö', "szlig", L'ß', "thorn", L'þ', "uacute", L'ú',
  138. "ucirc", L'û', "ugrave", L'ù', "uuml", L'ü', "yacute", L'ý', "yuml", L'ÿ',
  139. 0
  140. };
  141. int
  142. cistrcmp(char *a, char *b)
  143. {
  144. int c, d;
  145. for(;; a++, b++){
  146. d = tolower(*a);
  147. c = d - tolower(*b);
  148. if(c)
  149. break;
  150. if(d == 0)
  151. break;
  152. }
  153. return c;
  154. }
  155. int
  156. readupto(char *buf, int n, char d, char notme)
  157. {
  158. char *p;
  159. int c;
  160. buf[0] = 0;
  161. for(p = buf;; p++){
  162. c = Bgetc(&in);
  163. if(c < 0){
  164. *p = 0;
  165. return -1;
  166. }
  167. if(c == notme){
  168. Bungetc(&in);
  169. return -1;
  170. }
  171. if(c == d){
  172. *p = 0;
  173. return 0;
  174. }
  175. *p = c;
  176. if(p == buf + n){
  177. *p = 0;
  178. Bprint(&out, "<%s", buf);
  179. return -1;
  180. }
  181. }
  182. }
  183. void
  184. dogoobie(void)
  185. {
  186. char *arg, *type;
  187. Goobie *g;
  188. char buf[1024];
  189. int closing;
  190. if(readupto(buf, sizeof(buf), '>', '<') < 0){
  191. Bprint(&out, "<%s", buf);
  192. return;
  193. }
  194. type = buf;
  195. if(*type == '/'){
  196. type++;
  197. closing = 1;
  198. } else
  199. closing = 0;
  200. arg = strchr(type, ' ');
  201. if(arg == 0)
  202. arg = strchr(type, '\r');
  203. if(arg == 0)
  204. arg = strchr(type, '\n');
  205. if(arg)
  206. *arg++ = 0;
  207. for(g = gtab; g->name; g++)
  208. if(cistrcmp(type, g->name) == 0){
  209. if(closing){
  210. if(g->ef){
  211. (*g->ef)(g, arg);
  212. return;
  213. }
  214. } else {
  215. if(g->f){
  216. (*g->f)(g, arg);
  217. return;
  218. }
  219. }
  220. }
  221. if(closing)
  222. type--;
  223. if(arg)
  224. Bprint(&out, "<%s %s>\n", type, arg);
  225. else
  226. Bprint(&out, "<%s>\n", type);
  227. }
  228. void
  229. main(void)
  230. {
  231. int c, pos;
  232. Binit(&in, 0, OREAD);
  233. Binit(&out, 1, OWRITE);
  234. pos = 0;
  235. for(;;){
  236. c = Bgetc(&in);
  237. if(c < 0)
  238. return;
  239. switch(c){
  240. case '<':
  241. dogoobie();
  242. break;
  243. case '&':
  244. escape();
  245. break;
  246. case '\r':
  247. pos = 0;
  248. break;
  249. case '\n':
  250. if(quoting){
  251. Bputc(&out, '"');
  252. quoting = 0;
  253. }
  254. if(lastc != '\n')
  255. Bputc(&out, '\n');
  256. /* can't emit leading spaces in filled troff docs */
  257. if (!inpre)
  258. eatwhite();
  259. lastc = c;
  260. break;
  261. default:
  262. ++pos;
  263. if(!inpre && isascii(c) && isspace(c) && pos > 80){
  264. Bputc(&out, '\n');
  265. eatwhite();
  266. pos = 0;
  267. }else
  268. Bputc(&out, c);
  269. lastc = c;
  270. break;
  271. }
  272. }
  273. }
  274. void
  275. escape(void)
  276. {
  277. int c;
  278. Entity *e;
  279. char buf[8];
  280. if(readupto(buf, sizeof(buf), ';', '\n') < 0){
  281. Bprint(&out, "&%s", buf);
  282. return;
  283. }
  284. for(e = pl_entity; e->name; e++)
  285. if(strcmp(buf, e->name) == 0){
  286. Bprint(&out, "%C", e->value);
  287. return;
  288. }
  289. if(*buf == '#'){
  290. c = atoi(buf+1);
  291. if(isascii(c) && isprint(c)){
  292. Bputc(&out, c);
  293. return;
  294. }
  295. }
  296. Bprint(&out, "&%s;", buf);
  297. }
  298. /*
  299. * whitespace is not significant to HTML, but newlines
  300. * and leading spaces are significant to troff.
  301. */
  302. void
  303. eatwhite(void)
  304. {
  305. int c;
  306. for(;;){
  307. c = Bgetc(&in);
  308. if(c < 0)
  309. break;
  310. if(!isspace(c)){
  311. Bungetc(&in);
  312. break;
  313. }
  314. }
  315. }
  316. /*
  317. * print at start of line
  318. */
  319. void
  320. printsol(char *fmt, ...)
  321. {
  322. va_list arg;
  323. if(quoting){
  324. Bputc(&out, '"');
  325. quoting = 0;
  326. }
  327. if(lastc != '\n')
  328. Bputc(&out, '\n');
  329. va_start(arg, fmt);
  330. Bvprint(&out, fmt, arg);
  331. va_end(arg);
  332. lastc = '\n';
  333. }
  334. void
  335. g_ignore(Goobie *g, char *arg)
  336. {
  337. USED(g, arg);
  338. }
  339. void
  340. g_unexpected(Goobie *g, char *arg)
  341. {
  342. USED(arg);
  343. fprint(2, "unexpected %s ending\n", g->name);
  344. }
  345. void
  346. g_title(Goobie *g, char *arg)
  347. {
  348. USED(arg);
  349. printsol(".TL\n", g->name);
  350. }
  351. void
  352. g_p(Goobie *g, char *arg)
  353. {
  354. USED(arg);
  355. printsol(".LP\n", g->name);
  356. }
  357. void
  358. g_h(Goobie *g, char *arg)
  359. {
  360. USED(arg);
  361. printsol(".SH %c\n", g->name[1]);
  362. }
  363. void
  364. g_list(Goobie *g, char *arg)
  365. {
  366. USED(arg);
  367. if(lsp != SSIZE){
  368. switch(g->name[0]){
  369. case 'o':
  370. liststack[lsp].type = Lordered;
  371. liststack[lsp].ord = 0;
  372. break;
  373. default:
  374. liststack[lsp].type = Lunordered;
  375. break;
  376. }
  377. }
  378. lsp++;
  379. }
  380. void
  381. g_br(Goobie *g, char *arg)
  382. {
  383. USED(g, arg);
  384. printsol(".br\n");
  385. }
  386. void
  387. g_li(Goobie *g, char *arg)
  388. {
  389. USED(g, arg);
  390. if(lsp <= 0 || lsp > SSIZE){
  391. printsol(".IP \\(bu\n");
  392. return;
  393. }
  394. switch(liststack[lsp-1].type){
  395. case Lunordered:
  396. printsol(".IP \\(bu\n");
  397. break;
  398. case Lordered:
  399. printsol(".IP %d\n", ++liststack[lsp-1].ord);
  400. break;
  401. }
  402. }
  403. void
  404. g_listend(Goobie *g, char *arg)
  405. {
  406. USED(g, arg);
  407. if(--lsp < 0)
  408. lsp = 0;
  409. printsol(".LP\n");
  410. }
  411. void
  412. g_display(Goobie *g, char *arg)
  413. {
  414. USED(g, arg);
  415. printsol(".DS\n");
  416. }
  417. void
  418. g_pre(Goobie *g, char *arg)
  419. {
  420. USED(g, arg);
  421. printsol(".DS L\n");
  422. inpre = 1;
  423. }
  424. void
  425. g_displayend(Goobie *g, char *arg)
  426. {
  427. USED(g, arg);
  428. printsol(".DE\n");
  429. inpre = 0;
  430. }
  431. void
  432. g_fpush(Goobie *g, char *arg)
  433. {
  434. USED(arg);
  435. if(fsp < SSIZE)
  436. fontstack[fsp] = font;
  437. fsp++;
  438. switch(g->name[0]){
  439. case 'b':
  440. font = "B";
  441. break;
  442. case 'i':
  443. font = "I";
  444. break;
  445. case 'k': /* kbd */
  446. case 't': /* tt */
  447. font = "(CW";
  448. break;
  449. }
  450. Bprint(&out, "\\f%s", font);
  451. }
  452. void
  453. g_fpop(Goobie *g, char *arg)
  454. {
  455. USED(g, arg);
  456. fsp--;
  457. if(fsp < SSIZE)
  458. font = fontstack[fsp];
  459. else
  460. font = "R";
  461. Bprint(&out, "\\f%s", font);
  462. }
  463. void
  464. g_indent(Goobie *g, char *arg)
  465. {
  466. USED(g, arg);
  467. printsol(".RS\n");
  468. }
  469. void
  470. g_exdent(Goobie *g, char *arg)
  471. {
  472. USED(g, arg);
  473. printsol(".RE\n");
  474. }
  475. void
  476. g_dt(Goobie *g, char *arg)
  477. {
  478. USED(g, arg);
  479. printsol(".IP \"");
  480. quoting = 1;
  481. }
  482. void
  483. g_hr(Goobie *g, char *arg)
  484. {
  485. USED(g, arg);
  486. printsol(".br\n");
  487. printsol("\\l'5i'\n");
  488. }
  489. /*
  490. <table border>
  491. <caption><font size="+1"><b>Cumulative Class Data</b></font></caption>
  492. <tr><th rowspan=2>DOSE<br>mg/kg</th><th colspan=2>PARALYSIS</th><th colspan=2>DEATH</th>
  493. </tr>
  494. <tr><th width=80>Number</th><th width=80>Percent</th><th width=80>Number</th><th width=80>Percent</th>
  495. </tr>
  496. <tr align=center>
  497. <td>0.1</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  498. </tr>
  499. <tr align=center>
  500. <td>0.2</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  501. </tr>
  502. <tr align=center>
  503. <td>0.3</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  504. </tr>
  505. <tr align=center>
  506. <td>0.4</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  507. </tr>
  508. <tr align=center>
  509. <td>0.5</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  510. </tr>
  511. <tr align=center>
  512. <td>0.6</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  513. </tr>
  514. <tr align=center>
  515. <td>0.7</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  516. </tr>
  517. <tr align=center>
  518. <td>0.8</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  519. </tr>
  520. <tr align=center>
  521. <td>0.8 oral</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
  522. </tr>
  523. </table>
  524. */
  525. void
  526. g_table(Goobie *g, char *arg)
  527. {
  528. USED(g, arg);
  529. printsol(".TS\ncenter ;\n");
  530. }
  531. void
  532. g_tableend(Goobie *g, char *arg)
  533. {
  534. USED(g, arg);
  535. printsol(".TE\n");
  536. }
  537. void
  538. g_caption(Goobie *g, char *arg)
  539. {
  540. USED(g, arg);
  541. }
  542. void
  543. g_captionend(Goobie *g, char *arg)
  544. {
  545. USED(g, arg);
  546. }