utf.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. #ifdef PLAN9
  2. #include <u.h>
  3. #include <libc.h>
  4. #include <bio.h>
  5. #else
  6. #include <sys/types.h>
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9. #include <string.h>
  10. #include <unistd.h>
  11. #include <errno.h>
  12. #include "plan9.h"
  13. #endif
  14. #include "hdr.h"
  15. /*
  16. the our_* routines are implementations for the corresponding library
  17. routines. for a while, i tried to actually name them wctomb etc
  18. but stopped that after i found a system which made wchar_t an
  19. unsigned char.
  20. */
  21. #ifdef PLAN9
  22. long getrune(Biobuf *);
  23. long getisorune(Biobuf *);
  24. #else
  25. long getrune(FILE *);
  26. long getisorune(FILE *);
  27. #endif
  28. int our_wctomb(char *s, unsigned long wc);
  29. int our_mbtowc(unsigned long *p, char *s, unsigned n);
  30. int runetoisoutf(char *str, Rune *rune);
  31. int fullisorune(char *str, int n);
  32. int isochartorune(Rune *rune, char *str);
  33. void
  34. utf_in(int fd, long *notused, struct convert *out)
  35. {
  36. #ifndef PLAN9
  37. FILE *fp;
  38. #else /* PLAN9 */
  39. Biobuf b;
  40. #endif /* PLAN9 */
  41. Rune *r;
  42. long l;
  43. USED(notused);
  44. #ifndef PLAN9
  45. if((fp = fdopen(fd, "r")) == NULL){
  46. EPR "%s: input setup error: %s\n", argv0, strerror(errno));
  47. #else /* PLAN9 */
  48. if(Binit(&b, fd, OREAD) < 0){
  49. EPR "%s: input setup error: %r\n", argv0);
  50. #endif /* PLAN9 */
  51. EXIT(1, "input error");
  52. }
  53. r = runes;
  54. for(;;)
  55. #ifndef PLAN9
  56. switch(l = getrune(fp))
  57. #else /* PLAN9 */
  58. switch(l = getrune(&b))
  59. #endif /* PLAN9 */
  60. {
  61. case -1:
  62. goto done;
  63. case -2:
  64. if(squawk)
  65. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
  66. if(clean)
  67. continue;
  68. nerrors++;
  69. l = Runeerror;
  70. default:
  71. *r++ = l;
  72. if(r >= &runes[N]){
  73. OUT(out, runes, r-runes);
  74. r = runes;
  75. }
  76. }
  77. done:
  78. if(r > runes)
  79. OUT(out, runes, r-runes);
  80. }
  81. void
  82. utf_out(Rune *base, int n, long *notused)
  83. {
  84. char *p;
  85. Rune *r;
  86. USED(notused);
  87. nrunes += n;
  88. for(r = base, p = obuf; n-- > 0; r++){
  89. p += our_wctomb(p, *r);
  90. }
  91. noutput += p-obuf;
  92. write(1, obuf, p-obuf);
  93. }
  94. void
  95. isoutf_in(int fd, long *notused, struct convert *out)
  96. {
  97. #ifndef PLAN9
  98. FILE *fp;
  99. #else /* PLAN9 */
  100. Biobuf b;
  101. #endif /* PLAN9 */
  102. Rune *r;
  103. long l;
  104. USED(notused);
  105. #ifndef PLAN9
  106. if((fp = fdopen(fd, "r")) == 0){
  107. EPR "%s: input setup error: %s\n", argv0, strerror(errno));
  108. #else /* PLAN9 */
  109. if(Binit(&b, fd, OREAD) < 0){
  110. EPR "%s: input setup error: %r\n", argv0);
  111. #endif /* PLAN9 */
  112. EXIT(1, "input error");
  113. }
  114. r = runes;
  115. for(;;)
  116. #ifndef PLAN9
  117. switch(l = getisorune(fp))
  118. #else /* PLAN9 */
  119. switch(l = getisorune(&b))
  120. #endif /* PLAN9 */
  121. {
  122. case -1:
  123. goto done;
  124. case -2:
  125. if(squawk)
  126. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
  127. if(clean)
  128. continue;
  129. nerrors++;
  130. l = Runeerror;
  131. default:
  132. *r++ = l;
  133. if(r >= &runes[N]){
  134. OUT(out, runes, r-runes);
  135. r = runes;
  136. }
  137. }
  138. done:
  139. if(r > runes)
  140. OUT(out, runes, r-runes);
  141. }
  142. void
  143. isoutf_out(Rune *base, int n, long *notused)
  144. {
  145. char *p;
  146. Rune *r;
  147. USED(notused);
  148. nrunes += n;
  149. for(r = base, p = obuf; n-- > 0; r++)
  150. p += runetoisoutf(p, r);
  151. noutput += p-obuf;
  152. write(1, obuf, p-obuf);
  153. }
  154. long
  155. #ifndef PLAN9
  156. getrune(FILE *fp)
  157. #else /* PLAN9 */
  158. getrune(Biobuf *bp)
  159. #endif /* PLAN9 */
  160. {
  161. int c, i;
  162. char str[UTFmax]; /* MB_LEN_MAX really */
  163. unsigned long l;
  164. int n;
  165. for(i = 0;;){
  166. #ifndef PLAN9
  167. c = getc(fp);
  168. #else /* PLAN9 */
  169. c = Bgetc(bp);
  170. #endif /* PLAN9 */
  171. if(c < 0)
  172. return(c);
  173. ninput++;
  174. str[i++] = c;
  175. n = our_mbtowc(&l, str, i);
  176. if(n == -1)
  177. return(-2);
  178. if(n > 0)
  179. return(l);
  180. }
  181. }
  182. long
  183. #ifndef PLAN9
  184. getisorune(FILE *fp)
  185. #else /* PLAN9 */
  186. getisorune(Biobuf *bp)
  187. #endif /* PLAN9 */
  188. {
  189. int c, i;
  190. Rune rune;
  191. char str[UTFmax]; /* MB_LEN_MAX really */
  192. for(i = 0;;){
  193. #ifndef PLAN9
  194. c = getc(fp);
  195. #else /* PLAN9 */
  196. c = Bgetc(bp);
  197. #endif /* PLAN9 */
  198. if(c < 0)
  199. return(c);
  200. ninput++;
  201. str[i++] = c;
  202. if(fullisorune(str, i))
  203. break;
  204. }
  205. isochartorune(&rune, str);
  206. if(rune == Runeerror)
  207. return -2;
  208. return(rune);
  209. }
  210. enum
  211. {
  212. Char1 = Runeself, Rune1 = Runeself,
  213. Char21 = 0xA1, Rune21 = 0x0100,
  214. Char22 = 0xF6, Rune22 = 0x4016,
  215. Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
  216. Esc = 0xBE, Bad = Runeerror
  217. };
  218. static uchar U[256];
  219. static uchar T[256];
  220. static
  221. void
  222. mktable(void)
  223. {
  224. int i, u;
  225. for(i=0; i<256; i++) {
  226. u = i + (0x5E - 0xA0);
  227. if(i < 0xA0)
  228. u = i + (0xDF - 0x7F);
  229. if(i < 0x7F)
  230. u = i + (0x00 - 0x21);
  231. if(i < 0x21)
  232. u = i + (0xBE - 0x00);
  233. U[i] = u;
  234. T[u] = i;
  235. }
  236. }
  237. int
  238. isochartorune(Rune *rune, char *str)
  239. {
  240. int c, c1, c2;
  241. long l;
  242. if(U[0] == 0)
  243. mktable();
  244. /*
  245. * one character sequence
  246. * 00000-0009F => 00-9F
  247. */
  248. c = *(uchar*)str;
  249. if(c < Char1) {
  250. *rune = c;
  251. return 1;
  252. }
  253. /*
  254. * two character sequence
  255. * 000A0-000FF => A0; A0-FF
  256. */
  257. c1 = *(uchar*)(str+1);
  258. if(c < Char21) {
  259. if(c1 >= Rune1 && c1 < Rune21) {
  260. *rune = c1;
  261. return 2;
  262. }
  263. goto bad;
  264. }
  265. /*
  266. * two character sequence
  267. * 00100-04015 => A1-F5; 21-7E/A0-FF
  268. */
  269. c1 = U[c1];
  270. if(c1 >= Esc)
  271. goto bad;
  272. if(c < Char22) {
  273. *rune = (c-Char21)*Esc + c1 + Rune21;
  274. return 2;
  275. }
  276. /*
  277. * three character sequence
  278. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  279. */
  280. c2 = U[*(uchar*)(str+2)];
  281. if(c2 >= Esc)
  282. goto bad;
  283. if(c < Char3) {
  284. l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
  285. if(l >= Rune3)
  286. goto bad;
  287. *rune = l;
  288. return 3;
  289. }
  290. /*
  291. * bad decoding
  292. */
  293. bad:
  294. *rune = Bad;
  295. return 1;
  296. }
  297. int
  298. runetoisoutf(char *str, Rune *rune)
  299. {
  300. long c;
  301. if(T[0] == 0)
  302. mktable();
  303. /*
  304. * one character sequence
  305. * 00000-0009F => 00-9F
  306. */
  307. c = *rune;
  308. if(c < Rune1) {
  309. str[0] = c;
  310. return 1;
  311. }
  312. /*
  313. * two character sequence
  314. * 000A0-000FF => A0; A0-FF
  315. */
  316. if(c < Rune21) {
  317. str[0] = Char1;
  318. str[1] = c;
  319. return 2;
  320. }
  321. /*
  322. * two character sequence
  323. * 00100-04015 => A1-F5; 21-7E/A0-FF
  324. */
  325. if(c < Rune22) {
  326. c -= Rune21;
  327. str[0] = c/Esc + Char21;
  328. str[1] = T[c%Esc];
  329. return 2;
  330. }
  331. /*
  332. * three character sequence
  333. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  334. */
  335. c -= Rune22;
  336. str[0] = c/(Esc*Esc) + Char22;
  337. str[1] = T[c/Esc%Esc];
  338. str[2] = T[c%Esc];
  339. return 3;
  340. }
  341. int
  342. fullisorune(char *str, int n)
  343. {
  344. int c;
  345. if(n > 0) {
  346. c = *(uchar*)str;
  347. if(c < Char1)
  348. return 1;
  349. if(n > 1)
  350. if(c < Char22 || n > 2)
  351. return 1;
  352. }
  353. return 0;
  354. }
  /* the PLAN9 build supplies its own errno for our_mbtowc() to set */
  #ifdef PLAN9
  int errno;
  #endif /* PLAN9 */

  /*
   * Constants for the UTF-8 style codec in our_wctomb()/our_mbtowc().
   *
   *   TN     tag bits of the lead byte of an N-byte sequence
   *   BitN   number of payload bits in that lead byte
   *   MaskN  mask selecting those payload bits
   *   WcharN largest value an N-byte sequence can hold
   *
   * The x variants describe continuation bytes.
   */
  enum
  {
      /* continuation byte: 10xxxxxx */
      Tx = 0x80,
      Bitx = 6,
      Maskx = (1<<Bitx)-1,

      /* one byte: 0xxxxxxx */
      T1 = 0x00,
      Bit1 = 7,
      Mask1 = (1<<Bit1)-1,
      Wchar1 = (1UL<<Bit1)-1,

      /* two bytes: 110xxxxx */
      T2 = 0xC0,
      Bit2 = 5,
      Mask2 = (1<<Bit2)-1,
      Wchar2 = (1UL<<(Bit2+Bitx))-1,

      /* three bytes: 1110xxxx */
      T3 = 0xE0,
      Bit3 = 4,
      Mask3 = (1<<Bit3)-1,
      Wchar3 = (1UL<<(Bit3+2*Bitx))-1,

      /* four bytes: 11110xxx */
      T4 = 0xF0,
      Bit4 = 3,
      Mask4 = (1<<Bit4)-1,
      Wchar4 = (1UL<<(Bit4+3*Bitx))-1,

      /* five bytes: 111110xx */
      T5 = 0xF8,
      Bit5 = 2,
      Mask5 = (1<<Bit5)-1,
      Wchar5 = (1UL<<(Bit5+4*Bitx))-1,

      /* six bytes: the lead byte carries two payload bits here */
      T6 = 0xFC,
      Bit6 = 2,
      Mask6 = (1<<Bit6)-1
  #ifndef EILSEQ
      ,   /* we hate ansi c's comma rules */
      EILSEQ = 123
  #endif /* EILSEQ */
  };
  391. int
  392. our_wctomb(char *s, unsigned long wc)
  393. {
  394. if(s == 0)
  395. return 0; /* no shift states */
  396. if(wc & ~Wchar2) {
  397. if(wc & ~Wchar4) {
  398. if(wc & ~Wchar5) {
  399. /* 6 bytes */
  400. s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
  401. s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
  402. s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
  403. s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
  404. s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
  405. s[5] = Tx | (wc & Maskx);
  406. return 6;
  407. }
  408. /* 5 bytes */
  409. s[0] = T5 | (wc >> 4*Bitx);
  410. s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
  411. s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
  412. s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
  413. s[4] = Tx | (wc & Maskx);
  414. return 5;
  415. }
  416. if(wc & ~Wchar3) {
  417. /* 4 bytes */
  418. s[0] = T4 | (wc >> 3*Bitx);
  419. s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
  420. s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
  421. s[3] = Tx | (wc & Maskx);
  422. return 4;
  423. }
  424. /* 3 bytes */
  425. s[0] = T3 | (wc >> 2*Bitx);
  426. s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
  427. s[2] = Tx | (wc & Maskx);
  428. return 3;
  429. }
  430. if(wc & ~Wchar1) {
  431. /* 2 bytes */
  432. s[0] = T2 | (wc >> 1*Bitx);
  433. s[1] = Tx | (wc & Maskx);
  434. return 2;
  435. }
  436. /* 1 byte */
  437. s[0] = T1 | wc;
  438. return 1;
  439. }
  440. int
  441. our_mbtowc(unsigned long *p, char *s, unsigned n)
  442. {
  443. uchar *us;
  444. int c0, c1, c2, c3, c4, c5;
  445. unsigned long wc;
  446. if(s == 0)
  447. return 0; /* no shift states */
  448. if(n < 1)
  449. goto badlen;
  450. us = (uchar*)s;
  451. c0 = us[0];
  452. if(c0 >= T3) {
  453. if(n < 3)
  454. goto badlen;
  455. c1 = us[1] ^ Tx;
  456. c2 = us[2] ^ Tx;
  457. if((c1|c2) & T2)
  458. goto bad;
  459. if(c0 >= T5) {
  460. if(n < 5)
  461. goto badlen;
  462. c3 = us[3] ^ Tx;
  463. c4 = us[4] ^ Tx;
  464. if((c3|c4) & T2)
  465. goto bad;
  466. if(c0 >= T6) {
  467. /* 6 bytes */
  468. if(n < 6)
  469. goto badlen;
  470. c5 = us[5] ^ Tx;
  471. if(c5 & T2)
  472. goto bad;
  473. wc = ((((((((((c0 & Mask6) << Bitx) |
  474. c1) << Bitx) | c2) << Bitx) |
  475. c3) << Bitx) | c4) << Bitx) | c5;
  476. if(wc <= Wchar5)
  477. goto bad;
  478. *p = wc;
  479. return 6;
  480. }
  481. /* 5 bytes */
  482. wc = ((((((((c0 & Mask5) << Bitx) |
  483. c1) << Bitx) | c2) << Bitx) |
  484. c3) << Bitx) | c4;
  485. if(wc <= Wchar4)
  486. goto bad;
  487. *p = wc;
  488. return 5;
  489. }
  490. if(c0 >= T4) {
  491. /* 4 bytes */
  492. if(n < 4)
  493. goto badlen;
  494. c3 = us[3] ^ Tx;
  495. if(c3 & T2)
  496. goto bad;
  497. wc = ((((((c0 & Mask4) << Bitx) |
  498. c1) << Bitx) | c2) << Bitx) |
  499. c3;
  500. if(wc <= Wchar3)
  501. goto bad;
  502. *p = wc;
  503. return 4;
  504. }
  505. /* 3 bytes */
  506. wc = ((((c0 & Mask3) << Bitx) |
  507. c1) << Bitx) | c2;
  508. if(wc <= Wchar2)
  509. goto bad;
  510. *p = wc;
  511. return 3;
  512. }
  513. if(c0 >= T2) {
  514. /* 2 bytes */
  515. if(n < 2)
  516. goto badlen;
  517. c1 = us[1] ^ Tx;
  518. if(c1 & T2)
  519. goto bad;
  520. wc = ((c0 & Mask2) << Bitx) |
  521. c1;
  522. if(wc <= Wchar1)
  523. goto bad;
  524. *p = wc;
  525. return 2;
  526. }
  527. /* 1 byte */
  528. if(c0 >= Tx)
  529. goto bad;
  530. *p = c0;
  531. return 1;
  532. bad:
  533. errno = EILSEQ;
  534. return -1;
  535. badlen:
  536. return -2;
  537. }