/* troff2html.c */
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. enum{
  5. Nfont = 11,
  6. Wid = 20, /* tmac.anhtml sets page width to 20" so we can recognize .nf text */
  7. };
  8. typedef ulong Char;
  9. typedef struct Troffchar Troffchar;
  10. typedef struct Htmlchar Htmlchar;
  11. typedef struct Font Font;
  12. typedef struct HTMLfont HTMLfont;
  13. /* a Char is 32 bits. low 16 bits are the rune. higher are attributes */
  14. enum
  15. {
  16. Italic = 16,
  17. Bold,
  18. CW,
  19. Indent1,
  20. Indent2,
  21. Indent3,
  22. Heading = 25,
  23. Anchor = 26, /* must be last */
  24. };
  25. enum /* magic emissions */
  26. {
  27. Estring = 0,
  28. Epp = 1<<16,
  29. };
  30. int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW };
  31. int nest[10];
  32. int nnest;
  33. struct Troffchar
  34. {
  35. char *name;
  36. char *value;
  37. };
  38. struct Htmlchar
  39. {
  40. char *utf;
  41. char *name;
  42. int value;
  43. };
  44. #include "chars.h"
  45. struct Font{
  46. char *name;
  47. HTMLfont *htmlfont;
  48. };
  49. struct HTMLfont{
  50. char *name;
  51. char *htmlname;
  52. int bit;
  53. };
  54. /* R must be first; it's the default representation for fonts we don't recognize */
  55. HTMLfont htmlfonts[] =
  56. {
  57. "R", nil, 0,
  58. "LucidaSans", nil, 0,
  59. "I", "i", Italic,
  60. "LucidaSansI", "i", Italic,
  61. "CW", "tt", CW,
  62. "LucidaCW", "tt", CW,
  63. nil, nil,
  64. };
  65. #define TABLE "<table border=0 cellpadding=0 cellspacing=0>"
  66. char*
  67. onattr[8*sizeof(ulong)] =
  68. {
  69. 0, 0, 0, 0, 0, 0, 0, 0,
  70. 0, 0, 0, 0, 0, 0, 0, 0,
  71. "<i>", /* italic */
  72. "<b>", /* bold */
  73. "<tt><font size=+1>", /* cw */
  74. "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent1 */
  75. "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent2 */
  76. "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent3 */
  77. 0,
  78. 0,
  79. 0,
  80. "<p><font size=+1><b>", /* heading 25 */
  81. "<unused>", /* anchor 26 */
  82. };
  83. char*
  84. offattr[8*sizeof(ulong)] =
  85. {
  86. 0, 0, 0, 0, 0, 0, 0, 0,
  87. 0, 0, 0, 0, 0, 0, 0, 0,
  88. "</i>", /* italic */
  89. "</b>", /* bold */
  90. "</font></tt>", /* cw */
  91. "<-/table>", /* indent1 */
  92. "<-/table>", /* indent2 */
  93. "<-/table>", /* indent3 */
  94. 0,
  95. 0,
  96. 0,
  97. "</b></font>", /* heading 25 */
  98. "</a>", /* anchor 26 */
  99. };
  100. Font *font[Nfont];
  101. Biobuf bout;
  102. int debug = 0;
  103. /* troff state */
  104. int page = 1;
  105. int ft = 1;
  106. int vp = 0;
  107. int hp = 0;
  108. int ps = 1;
  109. int res = 720;
  110. int didP = 0;
  111. int atnewline = 1;
  112. int prevlineH = 0;
  113. ulong attr = 0; /* or'ed into each Char */
  114. Char *chars;
  115. int nchars;
  116. int nalloc;
  117. char** anchors; /* allocated in order */
  118. int nanchors;
  119. char *filename;
  120. int cno;
  121. char buf[8192];
  122. char *title = "Plan 9 man page";
  123. void process(Biobuf*, char*);
  124. void mountfont(int, char*);
  125. void switchfont(int);
  126. void header(char*);
  127. void flush(void);
  128. void trailer(void);
  129. void*
  130. emalloc(ulong n)
  131. {
  132. void *p;
  133. p = malloc(n);
  134. if(p == nil)
  135. sysfatal("malloc failed: %r");
  136. return p;
  137. }
  138. void*
  139. erealloc(void *p, ulong n)
  140. {
  141. p = realloc(p, n);
  142. if(p == nil)
  143. sysfatal("realloc failed: %r");
  144. return p;
  145. }
  146. char*
  147. estrdup(char *s)
  148. {
  149. char *t;
  150. t = strdup(s);
  151. if(t == nil)
  152. sysfatal("strdup failed: %r");
  153. return t;
  154. }
  155. void
  156. usage(void)
  157. {
  158. fprint(2, "usage: troff2html [-d] [-t title] [file ...]\n");
  159. exits("usage");
  160. }
  161. int
  162. hccmp(const void *va, const void *vb)
  163. {
  164. Htmlchar *a, *b;
  165. a = (Htmlchar*)va;
  166. b = (Htmlchar*)vb;
  167. return a->value - b->value;
  168. }
  169. void
  170. main(int argc, char *argv[])
  171. {
  172. int i;
  173. Biobuf in, *inp;
  174. Rune r;
  175. for(i=0; i<nelem(htmlchars); i++){
  176. chartorune(&r, htmlchars[i].utf);
  177. htmlchars[i].value = r;
  178. }
  179. qsort(htmlchars, nelem(htmlchars), sizeof(htmlchars[0]), hccmp);
  180. ARGBEGIN{
  181. case 't':
  182. title = ARGF();
  183. if(title == nil)
  184. usage();
  185. break;
  186. case 'd':
  187. debug++;
  188. break;
  189. default:
  190. usage();
  191. }ARGEND
  192. Binit(&bout, 1, OWRITE);
  193. if(argc == 0){
  194. header(title);
  195. Binit(&in, 0, OREAD);
  196. process(&in, "<stdin>");
  197. }else{
  198. header(title);
  199. for(i=0; i<argc; i++){
  200. inp = Bopen(argv[i], OREAD);
  201. if(inp == nil)
  202. sysfatal("can't open %s: %r", argv[i]);
  203. process(inp, argv[i]);
  204. Bterm(inp);
  205. }
  206. }
  207. flush();
  208. trailer();
  209. exits(nil);
  210. }
  211. void
  212. emitul(ulong ul)
  213. {
  214. if(nalloc == nchars){
  215. nalloc += 10000;
  216. chars = realloc(chars, nalloc*sizeof(chars[0]));
  217. if(chars == nil)
  218. sysfatal("malloc failed: %r");
  219. }
  220. chars[nchars++] = ul;
  221. }
  222. void
  223. emit(Rune r)
  224. {
  225. emitul(r | attr);
  226. /*
  227. * Close man page references early, so that
  228. * .IR proof (1),
  229. * doesn't make the comma part of the link.
  230. */
  231. if(r == ')')
  232. attr &= ~(1<<Anchor);
  233. }
  234. void
  235. emitstr(char *s)
  236. {
  237. emitul(Estring);
  238. emitul((ulong)s);
  239. }
  240. int indentlevel;
  241. int linelen;
  242. void
  243. iputrune(Biobuf *b, Rune r)
  244. {
  245. int i;
  246. if(linelen++ > 60 && r == ' ')
  247. r = '\n';
  248. Bputrune(b, r);
  249. if(r == '\n'){
  250. for(i=0; i<indentlevel; i++)
  251. Bprint(b, " ");
  252. linelen = 0;
  253. }
  254. }
  255. void
  256. iputs(Biobuf *b, char *s)
  257. {
  258. if(s[0]=='<' && s[1]=='+'){
  259. iputrune(b, '\n');
  260. Bprint(b, "<%s", s+2);
  261. indentlevel++;
  262. iputrune(b, '\n');
  263. }else if(s[0]=='<' && s[1]=='-'){
  264. indentlevel--;
  265. iputrune(b, '\n');
  266. Bprint(b, "<%s", s+2);
  267. iputrune(b, '\n');
  268. }else
  269. Bprint(b, "%s", s);
  270. }
  271. void
  272. setattr(ulong a)
  273. {
  274. int on, off, i, j;
  275. on = a & ~attr;
  276. off = attr & ~a;
  277. /* walk up the nest stack until we reach something we need to turn off. */
  278. for(i=0; i<nnest; i++)
  279. if(off&(1<<nest[i]))
  280. break;
  281. /* turn off everything above that */
  282. for(j=nnest-1; j>=i; j--)
  283. iputs(&bout, offattr[nest[j]]);
  284. /* turn on everything we just turned off but didn't want to */
  285. for(j=i; j<nnest; j++)
  286. if(a&(1<<nest[j]))
  287. iputs(&bout, onattr[nest[j]]);
  288. else
  289. nest[j] = 0;
  290. /* shift the zeros (turned off things) up */
  291. for(i=j=0; i<nnest; i++)
  292. if(nest[i] != 0)
  293. nest[j++] = nest[i];
  294. nnest = j;
  295. /* now turn on the new attributes */
  296. for(i=0; i<nelem(attrorder); i++){
  297. j = attrorder[i];
  298. if(on&(1<<j)){
  299. if(j == Anchor)
  300. onattr[j] = anchors[nanchors++];
  301. iputs(&bout, onattr[j]);
  302. nest[nnest++] = j;
  303. }
  304. }
  305. attr = a;
  306. }
  307. void
  308. flush(void)
  309. {
  310. int i;
  311. ulong c, a;
  312. nanchors = 0;
  313. for(i=0; i<nchars; i++){
  314. c = chars[i];
  315. if(c == Estring){
  316. /* next word is string to print */
  317. iputs(&bout, (char*)chars[++i]);
  318. continue;
  319. }
  320. if(c == Epp){
  321. iputrune(&bout, '\n');
  322. iputs(&bout, TABLE "<tr height=5><td></table>");
  323. iputrune(&bout, '\n');
  324. continue;
  325. }
  326. a = c & ~0xFFFF;
  327. c &= 0xFFFF;
  328. /*
  329. * If we're going to something off after a space,
  330. * let's just turn it off before.
  331. */
  332. if(c == ' ' && i<nchars-1 && (chars[i+1]&0xFFFF) >= 32)
  333. a ^= a & ~chars[i+1];
  334. setattr(a);
  335. iputrune(&bout, c & 0xFFFF);
  336. }
  337. }
  338. void
  339. header(char *s)
  340. {
  341. Bprint(&bout, "<head>\n");
  342. Bprint(&bout, "<title>%s</title>\n", s);
  343. Bprint(&bout, "<meta content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
  344. Bprint(&bout, "</head>\n");
  345. Bprint(&bout, "<body bgcolor=#ffffff>\n");
  346. }
  347. void
  348. trailer(void)
  349. {
  350. #ifdef LUCENT
  351. Tm *t;
  352. t = localtime(time(nil));
  353. Bprint(&bout, TABLE "<tr height=20><td></table>\n");
  354. Bprint(&bout, "<font size=-1><a href=\"http://www.lucent.com/copyright.html\">\n");
  355. Bprint(&bout, "Copyright</A> &#169; %d Lucent Technologies. All rights reserved.</font>\n", t->year+1900);
  356. #endif
  357. Bprint(&bout, "</body></html>\n");
  358. }
  359. int
  360. getc(Biobuf *b)
  361. {
  362. cno++;
  363. return Bgetrune(b);
  364. }
  365. void
  366. ungetc(Biobuf *b)
  367. {
  368. cno--;
  369. Bungetrune(b);
  370. }
  371. char*
  372. getline(Biobuf *b)
  373. {
  374. int i, c;
  375. for(i=0; i<sizeof buf; i++){
  376. c = getc(b);
  377. if(c == Beof)
  378. return nil;
  379. buf[i] = c;
  380. if(c == '\n'){
  381. buf[i] = '\0';
  382. break;
  383. }
  384. }
  385. return buf;
  386. }
  387. int
  388. getnum(Biobuf *b)
  389. {
  390. int i, c;
  391. i = 0;
  392. for(;;){
  393. c = getc(b);
  394. if(c<'0' || '9'<c){
  395. ungetc(b);
  396. break;
  397. }
  398. i = i*10 + (c-'0');
  399. }
  400. return i;
  401. }
  402. char*
  403. getstr(Biobuf *b)
  404. {
  405. int i, c;
  406. for(i=0; i<sizeof buf; i++){
  407. /* must get bytes not runes */
  408. cno++;
  409. c = Bgetc(b);
  410. if(c == Beof)
  411. return nil;
  412. buf[i] = c;
  413. if(c == '\n' || c==' ' || c=='\t'){
  414. ungetc(b);
  415. buf[i] = '\0';
  416. break;
  417. }
  418. }
  419. return buf;
  420. }
  421. int
  422. setnum(Biobuf *b, char *name, int min, int max)
  423. {
  424. int i;
  425. i = getnum(b);
  426. if(debug > 2)
  427. fprint(2, "set %s = %d\n", name, i);
  428. if(min<=i && i<max)
  429. return i;
  430. sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
  431. return i;
  432. }
  433. void
  434. xcmd(Biobuf *b)
  435. {
  436. char *p, *fld[16], buf[1024];
  437. int i, nfld;
  438. p = getline(b);
  439. if(p == nil)
  440. sysfatal("xcmd error: %r");
  441. if(debug)
  442. fprint(2, "x command '%s'\n", p);
  443. nfld = tokenize(p, fld, nelem(fld));
  444. if(nfld == 0)
  445. return;
  446. switch(fld[0][0]){
  447. case 'f':
  448. /* mount font */
  449. if(nfld != 3)
  450. break;
  451. i = atoi(fld[1]);
  452. if(i<0 || Nfont<=i)
  453. sysfatal("font %d out of range at %s:#%d", i, filename, cno);
  454. mountfont(i, fld[2]);
  455. return;
  456. case 'i':
  457. /* init */
  458. return;
  459. case 'r':
  460. if(nfld<2 || atoi(fld[1])!=res)
  461. sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
  462. return;
  463. case 's':
  464. /* stop */
  465. return;
  466. case 't':
  467. /* trailer */
  468. return;
  469. case 'T':
  470. if(nfld!=2 || strcmp(fld[1], "utf")!=0)
  471. sysfatal("output for unknown typesetter type %s", fld[1]);
  472. return;
  473. case 'X':
  474. if(nfld<3 || strcmp(fld[1], "html")!=0)
  475. break;
  476. /* is it a man reference of the form cp(1)? */
  477. /* X manref start/end cp (1) */
  478. if(nfld==6 && strcmp(fld[2], "manref")==0){
  479. /* was the right macro; is it the right form? */
  480. if(strlen(fld[5])>=3 &&
  481. fld[5][0]=='(' && fld[5][2]==')' &&
  482. '0'<=fld[5][1] && fld[5][1]<='9'){
  483. if(strcmp(fld[3], "start") == 0){
  484. /* set anchor attribute and remember string */
  485. attr |= (1<<Anchor);
  486. snprint(buf, sizeof buf,
  487. "<a href=\"/magic/man2html/%c/%s\">",
  488. fld[5][1], fld[4]);
  489. nanchors++;
  490. anchors = erealloc(anchors, nanchors*sizeof(char*));
  491. anchors[nanchors-1] = estrdup(buf);
  492. }else if(strcmp(fld[3], "end") == 0)
  493. attr &= ~(1<<Anchor);
  494. }
  495. }else if(strcmp(fld[2], "manPP") == 0){
  496. didP = 1;
  497. emitul(Epp);
  498. }else if(nfld<4 || strcmp(fld[2], "manref")!=0){
  499. if(nfld>2 && strcmp(fld[2], "<P>")==0){ /* avoid triggering extra <br> */
  500. didP = 1;
  501. /* clear all font attributes before paragraph */
  502. emitul(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))));
  503. emitstr("<P>");
  504. /* next emittec char will turn font attributes back on */
  505. }else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
  506. attr |= (1<<Heading);
  507. else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
  508. attr &= ~(1<<Heading);
  509. else if(debug)
  510. fprint(2, "unknown in-line html %s... at %s:%#d\n",
  511. fld[2], filename, cno);
  512. }
  513. return;
  514. }
  515. if(debug)
  516. fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
  517. }
  518. int
  519. lookup(int c, Htmlchar tab[], int ntab)
  520. {
  521. int low, high, mid;
  522. low = 0;
  523. high = ntab - 1;
  524. while(low <= high){
  525. mid = (low+high)/2;
  526. if(c < tab[mid].value)
  527. high = mid - 1;
  528. else if(c > tab[mid].value)
  529. low = mid + 1;
  530. else
  531. return mid;
  532. }
  533. return -1; /* no match */
  534. }
  535. void
  536. emithtmlchar(int r)
  537. {
  538. static char buf[10];
  539. int i;
  540. i = lookup(r, htmlchars, nelem(htmlchars));
  541. if(i >= 0)
  542. emitstr(htmlchars[i].name);
  543. else
  544. emit(r);
  545. }
  546. char*
  547. troffchar(char *s)
  548. {
  549. int i;
  550. for(i=0; troffchars[i].name!=nil; i++)
  551. if(strcmp(s, troffchars[i].name) == 0)
  552. return troffchars[i].value;
  553. return "??";
  554. }
  555. void
  556. indent(void)
  557. {
  558. int nind;
  559. didP = 0;
  560. if(atnewline){
  561. if(hp != prevlineH){
  562. prevlineH = hp;
  563. /* these most peculiar numbers appear in the troff -man output */
  564. nind = ((prevlineH-1*res)+323)/324;
  565. attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
  566. if(nind >= 1)
  567. attr |= (1<<Indent1);
  568. if(nind >= 2)
  569. attr |= (1<<Indent2);
  570. if(nind >= 3)
  571. attr |= (1<<Indent3);
  572. }
  573. atnewline = 0;
  574. }
  575. }
  576. void
  577. process(Biobuf *b, char *name)
  578. {
  579. int c, r, v, i;
  580. char *p;
  581. cno = 0;
  582. prevlineH = res;
  583. filename = name;
  584. for(;;){
  585. c = getc(b);
  586. switch(c){
  587. case Beof:
  588. /* go to ground state */
  589. attr = 0;
  590. emit('\n');
  591. return;
  592. case '\n':
  593. break;
  594. case '0': case '1': case '2': case '3': case '4':
  595. case '5': case '6': case '7': case '8': case '9':
  596. v = c-'0';
  597. c = getc(b);
  598. if(c<'0' || '9'<c)
  599. sysfatal("illegal character motion at %s:#%d", filename, cno);
  600. v = v*10 + (c-'0');
  601. hp += v;
  602. /* fall through to character case */
  603. case 'c':
  604. indent();
  605. r = getc(b);
  606. emithtmlchar(r);
  607. break;
  608. case 'D':
  609. /* draw line; ignore */
  610. do
  611. c = getc(b);
  612. while(c!='\n' && c!= Beof);
  613. break;
  614. case 'f':
  615. v = setnum(b, "font", 0, Nfont);
  616. switchfont(v);
  617. break;
  618. case 'h':
  619. v = setnum(b, "hpos", -20000, 20000);
  620. /* generate spaces if motion is large and within a line */
  621. if(!atnewline && v>2*72)
  622. for(i=0; i<v; i+=72)
  623. emitstr("&nbsp;");
  624. hp += v;
  625. break;
  626. case 'n':
  627. setnum(b, "n1", -10000, 10000);
  628. //Bprint(&bout, " N1=%d", v);
  629. getc(b); /* space separates */
  630. setnum(b, "n2", -10000, 10000);
  631. atnewline = 1;
  632. if(!didP && hp < (Wid-1)*res) /* if line is less than 19" long, probably need a line break */
  633. emitstr("<br>");
  634. emit('\n');
  635. break;
  636. case 'p':
  637. page = setnum(b, "ps", -10000, 10000);
  638. break;
  639. case 's':
  640. ps = setnum(b, "ps", 1, 1000);
  641. break;
  642. case 'v':
  643. vp += setnum(b, "vpos", -10000, 10000);
  644. /* BUG: ignore motion */
  645. break;
  646. case 'x':
  647. xcmd(b);
  648. break;
  649. case 'w':
  650. emit(' ');
  651. break;
  652. case 'C':
  653. indent();
  654. p = getstr(b);
  655. emitstr(troffchar(p));
  656. break;
  657. case 'H':
  658. hp = setnum(b, "hpos", 0, 20000);
  659. //Bprint(&bout, " H=%d ", hp);
  660. break;
  661. case 'V':
  662. vp = setnum(b, "vpos", 0, 10000);
  663. break;
  664. default:
  665. fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
  666. return;
  667. }
  668. }
  669. }
  670. HTMLfont*
  671. htmlfont(char *name)
  672. {
  673. int i;
  674. for(i=0; htmlfonts[i].name!=nil; i++)
  675. if(strcmp(name, htmlfonts[i].name) == 0)
  676. return &htmlfonts[i];
  677. return &htmlfonts[0];
  678. }
  679. void
  680. mountfont(int pos, char *name)
  681. {
  682. if(debug)
  683. fprint(2, "mount font %s on %d\n", name, pos);
  684. if(font[pos] != nil){
  685. free(font[pos]->name);
  686. free(font[pos]);
  687. }
  688. font[pos] = emalloc(sizeof(Font));
  689. font[pos]->name = estrdup(name);
  690. font[pos]->htmlfont = htmlfont(name);
  691. }
  692. void
  693. switchfont(int pos)
  694. {
  695. HTMLfont *hf;
  696. if(debug)
  697. fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
  698. if(pos == ft)
  699. return;
  700. hf = font[ft]->htmlfont;
  701. if(hf->bit != 0)
  702. attr &= ~(1<<hf->bit);
  703. ft = pos;
  704. hf = font[ft]->htmlfont;
  705. if(hf->bit != 0)
  706. attr |= (1<<hf->bit);
  707. }