utf.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. #ifdef PLAN9
  2. #include <u.h>
  3. #include <libc.h>
  4. #include <bio.h>
  5. #else
  6. #include <sys/types.h>
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9. #include <string.h>
  10. #include <unistd.h>
  11. #include <errno.h>
  12. #include "plan9.h"
  13. #endif
  14. #include "hdr.h"
  15. /*
  16. the our_* routines are implementations for the corresponding library
  17. routines. for a while, i tried to actually name them wctomb etc
  18. but stopped that after i found a system which made wchar_t an
  19. unsigned char.
  20. */
  21. int our_wctomb(char *s, unsigned long wc);
  22. int our_mbtowc(unsigned long *p, char *s, unsigned n);
  23. int runetoisoutf(char *str, Rune *rune);
  24. int fullisorune(char *str, int n);
  25. int isochartorune(Rune *rune, char *str);
  26. void
  27. utf_in(int fd, long *notused, struct convert *out)
  28. {
  29. char buf[N];
  30. int i, j, c, n, tot;
  31. ulong l;
  32. USED(notused);
  33. tot = 0;
  34. while((n = read(fd, buf+tot, N-tot)) >= 0){
  35. tot += n;
  36. for(i=j=0; i<tot; ){
  37. c = our_mbtowc(&l, buf+i, tot-i);
  38. if(c == -1)
  39. break;
  40. if(c == -2){
  41. if(squawk)
  42. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  43. if(clean)
  44. continue;
  45. nerrors++;
  46. l = Runeerror;
  47. }
  48. runes[j++] = l;
  49. i += c;
  50. }
  51. OUT(out, runes, j);
  52. tot -= i;
  53. ninput += i;
  54. if(tot)
  55. memmove(buf, buf+i, tot);
  56. if(n == 0)
  57. break;
  58. }
  59. }
  60. void
  61. utf_out(Rune *base, int n, long *notused)
  62. {
  63. char *p;
  64. Rune *r;
  65. USED(notused);
  66. nrunes += n;
  67. for(r = base, p = obuf; n-- > 0; r++){
  68. p += our_wctomb(p, *r);
  69. }
  70. noutput += p-obuf;
  71. write(1, obuf, p-obuf);
  72. }
  73. void
  74. isoutf_in(int fd, long *notused, struct convert *out)
  75. {
  76. char buf[N];
  77. int i, j, c, n, tot;
  78. USED(notused);
  79. tot = 0;
  80. while((n = read(fd, buf+tot, N-tot)) >= 0){
  81. tot += n;
  82. for(i=j=0; i<tot; ){
  83. if(!fullisorune(buf+i, tot-i))
  84. break;
  85. c = isochartorune(&runes[j], buf+i);
  86. if(runes[j] == Runeerror){
  87. if(squawk)
  88. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  89. if(clean)
  90. continue;
  91. nerrors++;
  92. }
  93. j++;
  94. i += c;
  95. }
  96. OUT(out, runes, j);
  97. tot -= i;
  98. ninput += i;
  99. if(tot)
  100. memmove(buf, buf+i, tot);
  101. if(n == 0)
  102. break;
  103. }
  104. }
  105. void
  106. isoutf_out(Rune *base, int n, long *notused)
  107. {
  108. char *p;
  109. Rune *r;
  110. USED(notused);
  111. nrunes += n;
  112. for(r = base, p = obuf; n-- > 0; r++)
  113. p += runetoisoutf(p, r);
  114. noutput += p-obuf;
  115. write(1, obuf, p-obuf);
  116. }
  117. enum
  118. {
  119. Char1 = Runeself, Rune1 = Runeself,
  120. Char21 = 0xA1, Rune21 = 0x0100,
  121. Char22 = 0xF6, Rune22 = 0x4016,
  122. Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
  123. Esc = 0xBE, Bad = Runeerror
  124. };
  125. static uchar U[256];
  126. static uchar T[256];
  127. static
  128. void
  129. mktable(void)
  130. {
  131. int i, u;
  132. for(i=0; i<256; i++) {
  133. u = i + (0x5E - 0xA0);
  134. if(i < 0xA0)
  135. u = i + (0xDF - 0x7F);
  136. if(i < 0x7F)
  137. u = i + (0x00 - 0x21);
  138. if(i < 0x21)
  139. u = i + (0xBE - 0x00);
  140. U[i] = u;
  141. T[u] = i;
  142. }
  143. }
  144. int
  145. isochartorune(Rune *rune, char *str)
  146. {
  147. int c, c1, c2;
  148. long l;
  149. if(U[0] == 0)
  150. mktable();
  151. /*
  152. * one character sequence
  153. * 00000-0009F => 00-9F
  154. */
  155. c = *(uchar*)str;
  156. if(c < Char1) {
  157. *rune = c;
  158. return 1;
  159. }
  160. /*
  161. * two character sequence
  162. * 000A0-000FF => A0; A0-FF
  163. */
  164. c1 = *(uchar*)(str+1);
  165. if(c < Char21) {
  166. if(c1 >= Rune1 && c1 < Rune21) {
  167. *rune = c1;
  168. return 2;
  169. }
  170. goto bad;
  171. }
  172. /*
  173. * two character sequence
  174. * 00100-04015 => A1-F5; 21-7E/A0-FF
  175. */
  176. c1 = U[c1];
  177. if(c1 >= Esc)
  178. goto bad;
  179. if(c < Char22) {
  180. *rune = (c-Char21)*Esc + c1 + Rune21;
  181. return 2;
  182. }
  183. /*
  184. * three character sequence
  185. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  186. */
  187. c2 = U[*(uchar*)(str+2)];
  188. if(c2 >= Esc)
  189. goto bad;
  190. if(c < Char3) {
  191. l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
  192. if(l >= Rune3)
  193. goto bad;
  194. *rune = l;
  195. return 3;
  196. }
  197. /*
  198. * bad decoding
  199. */
  200. bad:
  201. *rune = Bad;
  202. return 1;
  203. }
  204. int
  205. runetoisoutf(char *str, Rune *rune)
  206. {
  207. long c;
  208. if(T[0] == 0)
  209. mktable();
  210. /*
  211. * one character sequence
  212. * 00000-0009F => 00-9F
  213. */
  214. c = *rune;
  215. if(c < Rune1) {
  216. str[0] = c;
  217. return 1;
  218. }
  219. /*
  220. * two character sequence
  221. * 000A0-000FF => A0; A0-FF
  222. */
  223. if(c < Rune21) {
  224. str[0] = Char1;
  225. str[1] = c;
  226. return 2;
  227. }
  228. /*
  229. * two character sequence
  230. * 00100-04015 => A1-F5; 21-7E/A0-FF
  231. */
  232. if(c < Rune22) {
  233. c -= Rune21;
  234. str[0] = c/Esc + Char21;
  235. str[1] = T[c%Esc];
  236. return 2;
  237. }
  238. /*
  239. * three character sequence
  240. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  241. */
  242. c -= Rune22;
  243. str[0] = c/(Esc*Esc) + Char22;
  244. str[1] = T[c/Esc%Esc];
  245. str[2] = T[c%Esc];
  246. return 3;
  247. }
  248. int
  249. fullisorune(char *str, int n)
  250. {
  251. int c;
  252. if(n > 0) {
  253. c = *(uchar*)str;
  254. if(c < Char1)
  255. return 1;
  256. if(n > 1)
  257. if(c < Char22 || n > 2)
  258. return 1;
  259. }
  260. return 0;
  261. }
  /*
   * Stand-alone UTF coder: our_wctomb and our_mbtowc together with the
   * constants they share.
   */
#ifdef PLAN9
int errno;
#endif

enum
{
	/* marker bits of each byte class */
	T1	= 0x00,		/* 0xxxxxxx: one byte sequence */
	Tx	= 0x80,		/* 10xxxxxx: trailing byte */
	T2	= 0xC0,		/* 110xxxxx: lead of 2 bytes */
	T3	= 0xE0,		/* 1110xxxx: lead of 3 bytes */
	T4	= 0xF0,		/* 11110xxx: lead of 4 bytes */
	T5	= 0xF8,		/* 111110xx: lead of 5 bytes */
	T6	= 0xFC,		/* 111111xx: lead of 6 bytes */

	/* payload bits carried by each byte class */
	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,		/* the 6 byte lead keeps two bits: FC-FF */

	/* masks selecting those payload bits */
	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	/* largest value representable in an N byte sequence */
	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1

#ifndef EILSEQ
	, /* we hate ansi c's comma rules */
	EILSEQ	= 123
#endif /* EILSEQ */
};

/*
 * our_wctomb: encode wc as UTF into s.
 *
 * Returns the number of bytes stored (1 to 6), or 0 if s is nil
 * (there are no shift states).  s must have room for six bytes.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	if(s == 0)
		return 0;		/* no shift states */

	/* 1 byte */
	if((wc & ~Wchar1) == 0){
		s[0] = T1 | wc;
		return 1;
	}
	/* 2 bytes */
	if((wc & ~Wchar2) == 0){
		s[0] = T2 | (wc >> 1*Bitx);
		s[1] = Tx | (wc & Maskx);
		return 2;
	}
	/* 3 bytes */
	if((wc & ~Wchar3) == 0){
		s[0] = T3 | (wc >> 2*Bitx);
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[2] = Tx | (wc & Maskx);
		return 3;
	}
	/* 4 bytes */
	if((wc & ~Wchar4) == 0){
		s[0] = T4 | (wc >> 3*Bitx);
		s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[3] = Tx | (wc & Maskx);
		return 4;
	}
	/* 5 bytes */
	if((wc & ~Wchar5) == 0){
		s[0] = T5 | (wc >> 4*Bitx);
		s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[4] = Tx | (wc & Maskx);
		return 5;
	}
	/* 6 bytes */
	s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
	s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
	s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
	s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
	s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
	s[5] = Tx | (wc & Maskx);
	return 6;
}

/*
 * our_mbtowc: decode one UTF sequence from the n bytes at s.
 *
 * Returns
 *	1-6	length of the sequence; its value is stored in *p
 *	0	s is nil (there are no shift states)
 *	-1	illegal sequence: bad lead or trailing byte, or a value
 *		that would fit in a shorter form; errno is set to EILSEQ
 *	-2	n is too small to hold the sequence
 * *p is written only on success.
 *
 * The bytes are accumulated as unsigned long: a six byte sequence can
 * carry 32 bits, and shifting that far in a signed int is undefined.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	unsigned char *us;
	unsigned long c0, c1, c2, c3, c4, c5, wc;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto badlen;
	us = (unsigned char*)s;
	c0 = us[0];
	if(c0 >= T3) {
		if(n < 3)
			goto badlen;
		/* x^Tx has its top two bits clear only if x is 10xxxxxx */
		c1 = us[1] ^ Tx;
		c2 = us[2] ^ Tx;
		if((c1|c2) & T2)
			goto bad;
		if(c0 >= T5) {
			if(n < 5)
				goto badlen;
			c3 = us[3] ^ Tx;
			c4 = us[4] ^ Tx;
			if((c3|c4) & T2)
				goto bad;
			if(c0 >= T6) {
				/* 6 bytes */
				if(n < 6)
					goto badlen;
				c5 = us[5] ^ Tx;
				if(c5 & T2)
					goto bad;
				wc = ((((((((((c0 & Mask6) << Bitx) |
					c1) << Bitx) | c2) << Bitx) |
					c3) << Bitx) | c4) << Bitx) | c5;
				if(wc <= Wchar5)
					goto bad;
				*p = wc;
				return 6;
			}
			/* 5 bytes */
			wc = ((((((((c0 & Mask5) << Bitx) |
				c1) << Bitx) | c2) << Bitx) |
				c3) << Bitx) | c4;
			if(wc <= Wchar4)
				goto bad;
			*p = wc;
			return 5;
		}
		if(c0 >= T4) {
			/* 4 bytes */
			if(n < 4)
				goto badlen;
			c3 = us[3] ^ Tx;
			if(c3 & T2)
				goto bad;
			wc = ((((((c0 & Mask4) << Bitx) |
				c1) << Bitx) | c2) << Bitx) |
				c3;
			if(wc <= Wchar3)
				goto bad;
			*p = wc;
			return 4;
		}
		/* 3 bytes */
		wc = ((((c0 & Mask3) << Bitx) |
			c1) << Bitx) | c2;
		if(wc <= Wchar2)
			goto bad;
		*p = wc;
		return 3;
	}
	if(c0 >= T2) {
		/* 2 bytes */
		if(n < 2)
			goto badlen;
		c1 = us[1] ^ Tx;
		if(c1 & T2)
			goto bad;
		wc = ((c0 & Mask2) << Bitx) |
			c1;
		if(wc <= Wchar1)
			goto bad;
		*p = wc;
		return 2;
	}
	/* 1 byte; a trailing byte cannot start a sequence */
	if(c0 >= Tx)
		goto bad;
	*p = c0;
	return 1;

bad:
	errno = EILSEQ;
	return -1;
badlen:
	return -2;
}