utf.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. #ifdef PLAN9
  2. #include <u.h>
  3. #include <libc.h>
  4. #include <bio.h>
  5. #else
  6. #include <sys/types.h>
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9. #include <string.h>
  10. #include <unistd.h>
  11. #include <errno.h>
  12. #include "plan9.h"
  13. #endif
  14. #include "hdr.h"
  /*
   * The our_* routines are implementations of the corresponding library
   * routines.  For a while, I tried to actually name them wctomb etc.,
   * but stopped that after I found a system which made wchar_t an
   * unsigned char.
   */
  21. int our_wctomb(char *s, unsigned long wc);
  22. int our_mbtowc(unsigned long *p, char *s, unsigned n);
  23. int runetoisoutf(char *str, Rune *rune);
  24. int fullisorune(char *str, int n);
  25. int isochartorune(Rune *rune, char *str);
  26. void
  27. utf_in(int fd, long *notused, struct convert *out)
  28. {
  29. char buf[N];
  30. int i, j, c, n, tot;
  31. ulong l;
  32. USED(notused);
  33. tot = 0;
  34. while((n = read(fd, buf+tot, N-tot)) >= 0){
  35. tot += n;
  36. for(i=j=0; i<tot-UTFmax || (n==0 && i<tot); ){
  37. c = our_mbtowc(&l, buf+i, tot-i);
  38. if(c == -1){
  39. if(squawk)
  40. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  41. if(clean){
  42. i++;
  43. continue;
  44. }
  45. nerrors++;
  46. l = Runeerror;
  47. c = 1;
  48. }
  49. runes[j++] = l;
  50. i += c;
  51. }
  52. OUT(out, runes, j);
  53. tot -= i;
  54. ninput += i;
  55. if(tot)
  56. memmove(buf, buf+i, tot);
  57. if(n == 0)
  58. break;
  59. }
  60. OUT(out, runes, 0);
  61. }
  62. void
  63. utf_out(Rune *base, int n, long *notused)
  64. {
  65. char *p;
  66. Rune *r;
  67. USED(notused);
  68. nrunes += n;
  69. for(r = base, p = obuf; n-- > 0; r++){
  70. p += our_wctomb(p, *r);
  71. }
  72. noutput += p-obuf;
  73. write(1, obuf, p-obuf);
  74. }
  75. void
  76. isoutf_in(int fd, long *notused, struct convert *out)
  77. {
  78. char buf[N];
  79. int i, j, c, n, tot;
  80. USED(notused);
  81. tot = 0;
  82. while((n = read(fd, buf+tot, N-tot)) >= 0){
  83. tot += n;
  84. for(i=j=0; i<tot; ){
  85. if(!fullisorune(buf+i, tot-i))
  86. break;
  87. c = isochartorune(&runes[j], buf+i);
  88. if(runes[j] == Runeerror && c == 1){
  89. if(squawk)
  90. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  91. if(clean){
  92. i++;
  93. continue;
  94. }
  95. nerrors++;
  96. }
  97. j++;
  98. i += c;
  99. }
  100. OUT(out, runes, j);
  101. tot -= i;
  102. ninput += i;
  103. if(tot)
  104. memmove(buf, buf+i, tot);
  105. if(n == 0)
  106. break;
  107. }
  108. OUT(out, runes, 0);
  109. }
  110. void
  111. isoutf_out(Rune *base, int n, long *notused)
  112. {
  113. char *p;
  114. Rune *r;
  115. USED(notused);
  116. nrunes += n;
  117. for(r = base, p = obuf; n-- > 0; r++)
  118. p += runetoisoutf(p, r);
  119. noutput += p-obuf;
  120. write(1, obuf, p-obuf);
  121. }
  122. enum
  123. {
  124. Char1 = Runeself, Rune1 = Runeself,
  125. Char21 = 0xA1, Rune21 = 0x0100,
  126. Char22 = 0xF6, Rune22 = 0x4016,
  127. Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
  128. Esc = 0xBE, Bad = Runeerror
  129. };
  130. static uchar U[256];
  131. static uchar T[256];
  132. static
  133. void
  134. mktable(void)
  135. {
  136. int i, u;
  137. for(i=0; i<256; i++) {
  138. u = i + (0x5E - 0xA0);
  139. if(i < 0xA0)
  140. u = i + (0xDF - 0x7F);
  141. if(i < 0x7F)
  142. u = i + (0x00 - 0x21);
  143. if(i < 0x21)
  144. u = i + (0xBE - 0x00);
  145. U[i] = u;
  146. T[u] = i;
  147. }
  148. }
  149. int
  150. isochartorune(Rune *rune, char *str)
  151. {
  152. int c, c1, c2;
  153. long l;
  154. if(U[0] == 0)
  155. mktable();
  156. /*
  157. * one character sequence
  158. * 00000-0009F => 00-9F
  159. */
  160. c = *(uchar*)str;
  161. if(c < Char1) {
  162. *rune = c;
  163. return 1;
  164. }
  165. /*
  166. * two character sequence
  167. * 000A0-000FF => A0; A0-FF
  168. */
  169. c1 = *(uchar*)(str+1);
  170. if(c < Char21) {
  171. if(c1 >= Rune1 && c1 < Rune21) {
  172. *rune = c1;
  173. return 2;
  174. }
  175. goto bad;
  176. }
  177. /*
  178. * two character sequence
  179. * 00100-04015 => A1-F5; 21-7E/A0-FF
  180. */
  181. c1 = U[c1];
  182. if(c1 >= Esc)
  183. goto bad;
  184. if(c < Char22) {
  185. *rune = (c-Char21)*Esc + c1 + Rune21;
  186. return 2;
  187. }
  188. /*
  189. * three character sequence
  190. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  191. */
  192. c2 = U[*(uchar*)(str+2)];
  193. if(c2 >= Esc)
  194. goto bad;
  195. if(c < Char3) {
  196. l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
  197. if(l >= Rune3)
  198. goto bad;
  199. *rune = l;
  200. return 3;
  201. }
  202. /*
  203. * bad decoding
  204. */
  205. bad:
  206. *rune = Bad;
  207. return 1;
  208. }
  209. int
  210. runetoisoutf(char *str, Rune *rune)
  211. {
  212. long c;
  213. if(T[0] == 0)
  214. mktable();
  215. /*
  216. * one character sequence
  217. * 00000-0009F => 00-9F
  218. */
  219. c = *rune;
  220. if(c < Rune1) {
  221. str[0] = c;
  222. return 1;
  223. }
  224. /*
  225. * two character sequence
  226. * 000A0-000FF => A0; A0-FF
  227. */
  228. if(c < Rune21) {
  229. str[0] = Char1;
  230. str[1] = c;
  231. return 2;
  232. }
  233. /*
  234. * two character sequence
  235. * 00100-04015 => A1-F5; 21-7E/A0-FF
  236. */
  237. if(c < Rune22) {
  238. c -= Rune21;
  239. str[0] = c/Esc + Char21;
  240. str[1] = T[c%Esc];
  241. return 2;
  242. }
  243. /*
  244. * three character sequence
  245. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  246. */
  247. c -= Rune22;
  248. str[0] = c/(Esc*Esc) + Char22;
  249. str[1] = T[c/Esc%Esc];
  250. str[2] = T[c%Esc];
  251. return 3;
  252. }
  253. int
  254. fullisorune(char *str, int n)
  255. {
  256. int c;
  257. if(n > 0) {
  258. c = *(uchar*)str;
  259. if(c < Char1)
  260. return 1;
  261. if(n > 1)
  262. if(c < Char22 || n > 2)
  263. return 1;
  264. }
  265. return 0;
  266. }
  #ifdef PLAN9
  int errno;      /* written by our_mbtowc on a bad sequence */
  #endif

  /*
   * Constants for the UTF-8 coder (our_wctomb / our_mbtowc).
   *
   * BitN   number of value bits carried by the lead byte of an N-byte form
   *        (Bitx: by each continuation byte)
   * TN     tag bits of the lead byte of an N-byte form (Tx: continuation)
   * MaskN  mask selecting the BitN value bits
   * WcharN largest value that fits in an N-byte form
   *
   * Bit6 is 2, so the six-byte form carries 2 + 5*6 = 32 value bits.
   */
  enum
  {
      Bit1    = 7,
      Bitx    = 6,
      Bit2    = 5,
      Bit3    = 4,
      Bit4    = 3,
      Bit5    = 2,
      Bit6    = 2,

      T1      = 0x00,
      Tx      = 0x80,
      T2      = 0xC0,
      T3      = 0xE0,
      T4      = 0xF0,
      T5      = 0xF8,
      T6      = 0xFC,

      Mask1   = (1<<Bit1)-1,
      Maskx   = (1<<Bitx)-1,
      Mask2   = (1<<Bit2)-1,
      Mask3   = (1<<Bit3)-1,
      Mask4   = (1<<Bit4)-1,
      Mask5   = (1<<Bit5)-1,
      Mask6   = (1<<Bit6)-1,

      Wchar1  = (1UL<<Bit1)-1,            /* 0x7F */
      Wchar2  = (1UL<<(Bit2+Bitx))-1,     /* 0x7FF */
      Wchar3  = (1UL<<(Bit3+2*Bitx))-1,   /* 0xFFFF */
      Wchar4  = (1UL<<(Bit4+3*Bitx))-1,   /* 0x1FFFFF */
      Wchar5  = (1UL<<(Bit5+4*Bitx))-1,   /* 0x3FFFFFF */

  #ifndef EILSEQ
      EILSEQ  = 123,
  #endif /* EILSEQ */
  };
  302. int
  303. our_wctomb(char *s, unsigned long wc)
  304. {
  305. if(s == 0)
  306. return 0; /* no shift states */
  307. if(wc & ~Wchar2) {
  308. if(wc & ~Wchar4) {
  309. if(wc & ~Wchar5) {
  310. /* 6 bytes */
  311. s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
  312. s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
  313. s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
  314. s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
  315. s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
  316. s[5] = Tx | (wc & Maskx);
  317. return 6;
  318. }
  319. /* 5 bytes */
  320. s[0] = T5 | (wc >> 4*Bitx);
  321. s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
  322. s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
  323. s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
  324. s[4] = Tx | (wc & Maskx);
  325. return 5;
  326. }
  327. if(wc & ~Wchar3) {
  328. /* 4 bytes */
  329. s[0] = T4 | (wc >> 3*Bitx);
  330. s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
  331. s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
  332. s[3] = Tx | (wc & Maskx);
  333. return 4;
  334. }
  335. /* 3 bytes */
  336. s[0] = T3 | (wc >> 2*Bitx);
  337. s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
  338. s[2] = Tx | (wc & Maskx);
  339. return 3;
  340. }
  341. if(wc & ~Wchar1) {
  342. /* 2 bytes */
  343. s[0] = T2 | (wc >> 1*Bitx);
  344. s[1] = Tx | (wc & Maskx);
  345. return 2;
  346. }
  347. /* 1 byte */
  348. s[0] = T1 | wc;
  349. return 1;
  350. }
  351. int
  352. our_mbtowc(unsigned long *p, char *s, unsigned n)
  353. {
  354. uchar *us;
  355. int c0, c1, c2, c3, c4, c5;
  356. unsigned long wc;
  357. if(s == 0)
  358. return 0; /* no shift states */
  359. if(n < 1)
  360. goto bad;
  361. us = (uchar*)s;
  362. c0 = us[0];
  363. if(c0 >= T3) {
  364. if(n < 3)
  365. goto bad;
  366. c1 = us[1] ^ Tx;
  367. c2 = us[2] ^ Tx;
  368. if((c1|c2) & T2)
  369. goto bad;
  370. if(c0 >= T5) {
  371. if(n < 5)
  372. goto bad;
  373. c3 = us[3] ^ Tx;
  374. c4 = us[4] ^ Tx;
  375. if((c3|c4) & T2)
  376. goto bad;
  377. if(c0 >= T6) {
  378. /* 6 bytes */
  379. if(n < 6)
  380. goto bad;
  381. c5 = us[5] ^ Tx;
  382. if(c5 & T2)
  383. goto bad;
  384. wc = ((((((((((c0 & Mask6) << Bitx) |
  385. c1) << Bitx) | c2) << Bitx) |
  386. c3) << Bitx) | c4) << Bitx) | c5;
  387. if(wc <= Wchar5)
  388. goto bad;
  389. *p = wc;
  390. return 6;
  391. }
  392. /* 5 bytes */
  393. wc = ((((((((c0 & Mask5) << Bitx) |
  394. c1) << Bitx) | c2) << Bitx) |
  395. c3) << Bitx) | c4;
  396. if(wc <= Wchar4)
  397. goto bad;
  398. *p = wc;
  399. return 5;
  400. }
  401. if(c0 >= T4) {
  402. /* 4 bytes */
  403. if(n < 4)
  404. goto bad;
  405. c3 = us[3] ^ Tx;
  406. if(c3 & T2)
  407. goto bad;
  408. wc = ((((((c0 & Mask4) << Bitx) |
  409. c1) << Bitx) | c2) << Bitx) |
  410. c3;
  411. if(wc <= Wchar3)
  412. goto bad;
  413. *p = wc;
  414. return 4;
  415. }
  416. /* 3 bytes */
  417. wc = ((((c0 & Mask3) << Bitx) |
  418. c1) << Bitx) | c2;
  419. if(wc <= Wchar2)
  420. goto bad;
  421. *p = wc;
  422. return 3;
  423. }
  424. if(c0 >= T2) {
  425. /* 2 bytes */
  426. if(n < 2)
  427. goto bad;
  428. c1 = us[1] ^ Tx;
  429. if(c1 & T2)
  430. goto bad;
  431. wc = ((c0 & Mask2) << Bitx) |
  432. c1;
  433. if(wc <= Wchar1)
  434. goto bad;
  435. *p = wc;
  436. return 2;
  437. }
  438. /* 1 byte */
  439. if(c0 >= Tx)
  440. goto bad;
  441. *p = c0;
  442. return 1;
  443. bad:
  444. errno = EILSEQ;
  445. return -1;
  446. }