/* troff2html.c (12 KB) */
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. enum{
  5. Nfont = 11,
  6. Wid = 20, /* tmac.anhtml sets page width to 20" so we can recognize .nf text */
  7. };
  8. typedef ulong Char;
  9. typedef struct Troffchar Troffchar;
  10. typedef struct Htmlchar Htmlchar;
  11. typedef struct Font Font;
  12. typedef struct HTMLfont HTMLfont;
  13. /* a Char is 32 bits. low 16 bits are the rune. higher are attributes */
  14. enum
  15. {
  16. Italic = 16,
  17. Bold = 17,
  18. CW = 18,
  19. Indent1 = 19,
  20. Indent2 = 20,
  21. Indent3 = 21,
  22. Heading = 25,
  23. Anchor = 26, /* must be last */
  24. };
  25. struct Troffchar
  26. {
  27. char *name;
  28. char *value;
  29. };
  30. struct Htmlchar
  31. {
  32. int value;
  33. char *name;
  34. };
  35. #include "chars.h"
  36. struct Font{
  37. char *name;
  38. HTMLfont *htmlfont;
  39. };
  40. struct HTMLfont{
  41. char *name;
  42. char *htmlname;
  43. int bit;
  44. };
  45. /* R must be first; it's the default representation for fonts we don't recognize */
  46. HTMLfont htmlfonts[] =
  47. {
  48. "R", nil, 0,
  49. "LucidaSans", nil, 0,
  50. "I", "EM", Italic,
  51. "LucidaSansI", "EM", Italic,
  52. "CW", "TT", CW,
  53. "LucidaCW", "TT", CW,
  54. nil, nil,
  55. };
  56. char*
  57. onattr[8*sizeof(ulong)] = {
  58. [Italic] = "<EM>",
  59. [Bold] = "<B>",
  60. [CW] = "<TT>",
  61. [Indent1] = "<DL><DT><DD>",
  62. [Indent2] = "<DL><DT><DD>",
  63. [Indent3] = "<DL><DT><DD>",
  64. [Heading] = "<H4>",
  65. [Anchor] = "<UNUSED>",
  66. };
  67. char*
  68. offattr[8*sizeof(ulong)] = {
  69. [Italic] = "</EM>",
  70. [Bold] = "</B>",
  71. [CW] = "</TT>",
  72. [Indent1] = "</DL>",
  73. [Indent2] = "</DL>",
  74. [Indent3] = "</DL>",
  75. [Heading] = "</H4>",
  76. [Anchor] = "</A>",
  77. };
  78. Font *font[Nfont];
  79. Biobuf bout;
  80. int debug = 0;
  81. /* troff state */
  82. int page = 1;
  83. int ft = 1;
  84. int vp = 0;
  85. int hp = 0;
  86. int ps = 1;
  87. int res = 720;
  88. int didP = 0;
  89. int atnewline = 1;
  90. int prevlineH = 0;
  91. ulong attr = 0; /* or'ed into each Char */
  92. Char *chars;
  93. int nchars;
  94. int nalloc;
  95. char** anchors; /* allocated in order */
  96. int nanchors;
  97. char *filename;
  98. int cno;
  99. char buf[8192];
  100. char *title = "Plan 9 man page";
  101. void process(Biobuf*, char*);
  102. void mountfont(int, char*);
  103. void switchfont(int);
  104. void header(char*);
  105. void flush(void);
  106. void trailer(void);
  107. void*
  108. emalloc(ulong n)
  109. {
  110. void *p;
  111. p = malloc(n);
  112. if(p == nil)
  113. sysfatal("malloc failed: %r");
  114. return p;
  115. }
  116. void*
  117. erealloc(void *p, ulong n)
  118. {
  119. p = realloc(p, n);
  120. if(p == nil)
  121. sysfatal("realloc failed: %r");
  122. return p;
  123. }
  124. char*
  125. estrdup(char *s)
  126. {
  127. char *t;
  128. t = strdup(s);
  129. if(t == nil)
  130. sysfatal("strdup failed: %r");
  131. return t;
  132. }
  133. void
  134. usage(void)
  135. {
  136. fprint(2, "usage: dhtml [-d] [-t title] [file ...]\n");
  137. exits("usage");
  138. }
  139. void
  140. main(int argc, char *argv[])
  141. {
  142. int i;
  143. Biobuf in, *inp;
  144. ARGBEGIN{
  145. case 't':
  146. title = ARGF();
  147. if(title == nil)
  148. usage();
  149. break;
  150. case 'd':
  151. debug++;
  152. break;
  153. default:
  154. usage();
  155. }ARGEND
  156. Binit(&bout, 1, OWRITE);
  157. if(argc == 0){
  158. header(title);
  159. Binit(&in, 0, OREAD);
  160. process(&in, "<stdin>");
  161. }else{
  162. header(title);
  163. for(i=0; i<argc; i++){
  164. inp = Bopen(argv[i], OREAD);
  165. if(inp == nil)
  166. sysfatal("can't open %s: %r", argv[i]);
  167. process(inp, argv[i]);
  168. Bterm(inp);
  169. }
  170. }
  171. flush();
  172. trailer();
  173. exits(nil);
  174. }
  175. void
  176. emitul(ulong ul)
  177. {
  178. if(nalloc == nchars){
  179. nalloc += 10000;
  180. chars = realloc(chars, nalloc*sizeof(chars[0]));
  181. if(chars == nil)
  182. sysfatal("malloc failed: %r");
  183. }
  184. chars[nchars++] = ul;
  185. }
  186. void
  187. emit(Rune r)
  188. {
  189. emitul(r | attr);
  190. }
  191. void
  192. emitstr(char *s)
  193. {
  194. emitul(0);
  195. emitul((ulong)s);
  196. }
  197. void
  198. flush(void)
  199. {
  200. int i, anchor;
  201. ulong c, oattr, off, on, a, top;
  202. anchor = 0;
  203. oattr = 0;
  204. for(i=0; i<nchars; i++){
  205. c = chars[i];
  206. if(c == 0){
  207. /* next word is string to print */
  208. Bprint(&bout, "%s", (char*)chars[++i]);
  209. continue;
  210. }
  211. attr = c & ~0xFFFF;
  212. /* clear old attributes */
  213. off = (oattr^attr) & oattr;
  214. if(off){
  215. /* do fonts first, since they tend to nest best */
  216. for(a=16; a<=Anchor; a++)
  217. if(off & (1<<a))
  218. Bprint(&bout, "%s", offattr[a]);
  219. }
  220. /* set new attributes */
  221. on = (oattr^attr) & attr;
  222. if(on){
  223. /* before we turn on an attribute, we need to hold off all lower ones to maintain nesting */
  224. for(top=Anchor-1; top>=16; top--)
  225. if(on & (1<<top))
  226. break;
  227. for(a=16; a<=top; a++)
  228. if((oattr^off) & (1<<a))
  229. Bprint(&bout, "%s", offattr[a]);
  230. a = Anchor;
  231. if(on & (1<<a)) /* anchors are special */
  232. Bprint(&bout, "%s", anchors[anchor++]);
  233. while(--a >= 16)
  234. if(on & (1<<a))
  235. Bprint(&bout, "%s", onattr[a]);
  236. /* turn the 'held' ones back on */
  237. for(a=top; a>=16; --a)
  238. if((oattr^off) & (1<<a))
  239. Bprint(&bout, "%s", onattr[a]);
  240. }
  241. oattr = attr;
  242. Bputrune(&bout, c & 0xFFFF);
  243. }
  244. }
  245. void
  246. header(char *s)
  247. {
  248. Bprint(&bout, "<HEAD>\n");
  249. Bprint(&bout, "<TITLE>%s</TITLE>\n", s);
  250. Bprint(&bout, "<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
  251. Bprint(&bout, "</HEAD>\n");
  252. Bprint(&bout, "<BODY BGCOLOR=WHITE>\n");
  253. }
  254. void
  255. trailer(void)
  256. {
  257. Tm *t;
  258. t = localtime(time(nil));
  259. Bprint(&bout, "<BR><FONT size=1><A HREF=\"http://www.lucent.com/copyright.html\">\n");
  260. Bprint(&bout, "Copyright</A> &#169; %d Lucent Technologies. All rights reserved.</FONT>\n", t->year+1900);
  261. Bprint(&bout, "\n</BODY></HTML>\n");
  262. }
  263. int
  264. getc(Biobuf *b)
  265. {
  266. cno++;
  267. return Bgetrune(b);
  268. }
  269. void
  270. ungetc(Biobuf *b)
  271. {
  272. cno--;
  273. Bungetrune(b);
  274. }
  275. char*
  276. getline(Biobuf *b)
  277. {
  278. int i, c;
  279. for(i=0; i<sizeof buf; i++){
  280. c = getc(b);
  281. if(c == Beof)
  282. return nil;
  283. buf[i] = c;
  284. if(c == '\n'){
  285. buf[i] = '\0';
  286. break;
  287. }
  288. }
  289. return buf;
  290. }
  291. int
  292. getnum(Biobuf *b)
  293. {
  294. int i, c;
  295. i = 0;
  296. for(;;){
  297. c = getc(b);
  298. if(c<'0' || '9'<c){
  299. ungetc(b);
  300. break;
  301. }
  302. i = i*10 + (c-'0');
  303. }
  304. return i;
  305. }
  306. char*
  307. getstr(Biobuf *b)
  308. {
  309. int i, c;
  310. for(i=0; i<sizeof buf; i++){
  311. /* must get bytes not runes */
  312. cno++;
  313. c = Bgetc(b);
  314. if(c == Beof)
  315. return nil;
  316. buf[i] = c;
  317. if(c == '\n' || c==' ' || c=='\t'){
  318. ungetc(b);
  319. buf[i] = '\0';
  320. break;
  321. }
  322. }
  323. return buf;
  324. }
  325. int
  326. setnum(Biobuf *b, char *name, int min, int max)
  327. {
  328. int i;
  329. i = getnum(b);
  330. if(debug > 2)
  331. fprint(2, "set %s = %d\n", name, i);
  332. if(min<=i && i<max)
  333. return i;
  334. sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
  335. return i;
  336. }
  337. void
  338. xcmd(Biobuf *b)
  339. {
  340. char *p, *fld[16], buf[256];
  341. int i, nfld;
  342. p = getline(b);
  343. if(p == nil)
  344. sysfatal("xcmd error: %r");
  345. if(debug)
  346. fprint(2, "x command '%s'\n", p);
  347. nfld = tokenize(p, fld, nelem(fld));
  348. if(nfld == 0)
  349. return;
  350. switch(fld[0][0]){
  351. case 'f':
  352. /* mount font */
  353. if(nfld != 3)
  354. break;
  355. i = atoi(fld[1]);
  356. if(i<0 || Nfont<=i)
  357. sysfatal("font %d out of range at %s:#%d", i, filename, cno);
  358. mountfont(i, fld[2]);
  359. return;
  360. case 'i':
  361. /* init */
  362. return;
  363. case 'r':
  364. if(nfld<2 || atoi(fld[1])!=res)
  365. sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
  366. return;
  367. case 's':
  368. /* stop */
  369. return;
  370. case 't':
  371. /* trailer */
  372. return;
  373. case 'T':
  374. if(nfld!=2 || strcmp(fld[1], "utf")!=0)
  375. sysfatal("output for unknown typesetter type %s", fld[1]);
  376. return;
  377. case 'X':
  378. if(nfld<3 || strcmp(fld[1], "html")!=0)
  379. break;
  380. /* is it a man reference of the form cp(1)? */
  381. /* X manref start/end cp (1) */
  382. if(nfld==6 && strcmp(fld[2], "manref")==0){
  383. /* was the right macro; is it the right form? */
  384. if(strlen(fld[5])>=3 &&
  385. fld[5][0]=='(' && fld[5][2]==')' &&
  386. '0'<=fld[5][1] && fld[5][1]<='9'){
  387. if(strcmp(fld[3], "start") == 0){
  388. /* set anchor attribute and remember string */
  389. attr |= (1<<Anchor);
  390. snprint(buf, sizeof buf,
  391. "<A HREF=\"/magic/man2html/%c/%s\">",
  392. fld[5][1], fld[4]);
  393. nanchors++;
  394. anchors = erealloc(anchors, nanchors*sizeof(char*));
  395. anchors[nanchors-1] = estrdup(buf);
  396. }else if(strcmp(fld[3], "end") == 0)
  397. attr &= ~(1<<Anchor);
  398. }
  399. }else if(nfld<4 || strcmp(fld[2], "manref")!=0){
  400. if(nfld>2 && strcmp(fld[2], "<P>")==0){ /* avoid triggering extra <br> */
  401. didP = 1;
  402. /* clear all font attributes before paragraph */
  403. emitul(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))));
  404. emitstr("<P>");
  405. /* next emittec char will turn font attributes back on */
  406. }else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
  407. attr |= (1<<Heading);
  408. else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
  409. attr &= ~(1<<Heading);
  410. else if(debug)
  411. fprint(2, "unknown in-line html %s... at %s:%#d\n",
  412. fld[2], filename, cno);
  413. }
  414. return;
  415. }
  416. if(debug)
  417. fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
  418. }
  419. int
  420. lookup(int c, Htmlchar tab[], int ntab)
  421. {
  422. int low, high, mid;
  423. low = 0;
  424. high = ntab - 1;
  425. while(low <= high){
  426. mid = (low+high)/2;
  427. if(c < tab[mid].value)
  428. high = mid - 1;
  429. else if(c > tab[mid].value)
  430. low = mid + 1;
  431. else
  432. return mid;
  433. }
  434. return -1; /* no match */
  435. }
  436. void
  437. emithtmlchar(int r)
  438. {
  439. static char buf[10];
  440. int i;
  441. i = lookup(r, htmlchars, nelem(htmlchars));
  442. if(i >= 0)
  443. emitstr(htmlchars[i].name);
  444. else
  445. emit(r);
  446. }
  447. char*
  448. troffchar(char *s)
  449. {
  450. int i;
  451. for(i=0; troffchars[i].name!=nil; i++)
  452. if(strcmp(s, troffchars[i].name) == 0)
  453. return troffchars[i].value;
  454. return "??";
  455. }
  456. void
  457. indent(void)
  458. {
  459. int nind;
  460. didP = 0;
  461. if(atnewline){
  462. if(hp != prevlineH){
  463. prevlineH = hp;
  464. /* these most peculiar numbers appear in the troff -man output */
  465. nind = ((prevlineH-1*res)+323)/324;
  466. attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
  467. if(nind >= 1)
  468. attr |= (1<<Indent1);
  469. if(nind >= 2)
  470. attr |= (1<<Indent2);
  471. if(nind >= 2)
  472. attr |= (1<<Indent3);
  473. }
  474. atnewline = 0;
  475. }
  476. }
  477. void
  478. process(Biobuf *b, char *name)
  479. {
  480. int c, r, v, i;
  481. char *p;
  482. cno = 0;
  483. prevlineH = res;
  484. filename = name;
  485. for(;;){
  486. c = getc(b);
  487. switch(c){
  488. case Beof:
  489. /* go to ground state */
  490. attr = 0;
  491. emit('\n');
  492. return;
  493. case '\n':
  494. break;
  495. case '0': case '1': case '2': case '3': case '4':
  496. case '5': case '6': case '7': case '8': case '9':
  497. v = c-'0';
  498. c = getc(b);
  499. if(c<'0' || '9'<c)
  500. sysfatal("illegal character motion at %s:#%d", filename, cno);
  501. v = v*10 + (c-'0');
  502. hp += v;
  503. /* fall through to character case */
  504. case 'c':
  505. indent();
  506. r = getc(b);
  507. emithtmlchar(r);
  508. break;
  509. case 'D':
  510. /* draw line; ignore */
  511. do
  512. c = getc(b);
  513. while(c!='\n' && c!= Beof);
  514. break;
  515. case 'f':
  516. v = setnum(b, "font", 0, Nfont);
  517. switchfont(v);
  518. break;
  519. case 'h':
  520. v = setnum(b, "hpos", -20000, 20000);
  521. /* generate spaces if motion is large and within a line */
  522. if(!atnewline && v>2*72)
  523. for(i=0; i<v; i+=72)
  524. emitstr("&nbsp;");
  525. hp += v;
  526. break;
  527. case 'n':
  528. setnum(b, "n1", -10000, 10000);
  529. //Bprint(&bout, " N1=%d", v);
  530. getc(b); /* space separates */
  531. setnum(b, "n2", -10000, 10000);
  532. atnewline = 1;
  533. if(!didP && hp < (Wid-1)*res) /* if line is less than 19" long, probably need a line break */
  534. emitstr("<br>");
  535. emit('\n');
  536. break;
  537. case 'p':
  538. page = setnum(b, "ps", -10000, 10000);
  539. break;
  540. case 's':
  541. ps = setnum(b, "ps", 1, 1000);
  542. break;
  543. case 'v':
  544. vp += setnum(b, "vpos", -10000, 10000);
  545. /* BUG: ignore motion */
  546. break;
  547. case 'x':
  548. xcmd(b);
  549. break;
  550. case 'w':
  551. emit(' ');
  552. break;
  553. case 'C':
  554. indent();
  555. p = getstr(b);
  556. emitstr(troffchar(p));
  557. break;
  558. case 'H':
  559. hp = setnum(b, "hpos", 0, 20000);
  560. //Bprint(&bout, " H=%d ", hp);
  561. break;
  562. case 'V':
  563. vp = setnum(b, "vpos", 0, 10000);
  564. break;
  565. default:
  566. fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
  567. return;
  568. }
  569. }
  570. }
  571. HTMLfont*
  572. htmlfont(char *name)
  573. {
  574. int i;
  575. for(i=0; htmlfonts[i].name!=nil; i++)
  576. if(strcmp(name, htmlfonts[i].name) == 0)
  577. return &htmlfonts[i];
  578. return &htmlfonts[0];
  579. }
  580. void
  581. mountfont(int pos, char *name)
  582. {
  583. if(debug)
  584. fprint(2, "mount font %s on %d\n", name, pos);
  585. if(font[pos] != nil){
  586. free(font[pos]->name);
  587. free(font[pos]);
  588. }
  589. font[pos] = emalloc(sizeof(Font));
  590. font[pos]->name = estrdup(name);
  591. font[pos]->htmlfont = htmlfont(name);
  592. }
  593. void
  594. switchfont(int pos)
  595. {
  596. HTMLfont *hf;
  597. if(debug)
  598. fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
  599. if(pos == ft)
  600. return;
  601. hf = font[ft]->htmlfont;
  602. if(hf->bit != 0)
  603. attr &= ~(1<<hf->bit);
  604. ft = pos;
  605. hf = font[ft]->htmlfont;
  606. if(hf->bit != 0)
  607. attr |= (1<<hf->bit);
  608. }