/* utf.c */
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #ifdef PLAN9
  10. #include <u.h>
  11. #include <libc.h>
  12. #include <bio.h>
  13. #else
  14. #include <sys/types.h>
  15. #include <stdio.h>
  16. #include <stdlib.h>
  17. #include <string.h>
  18. #include <unistd.h>
  19. #include <errno.h>
  20. #include "plan9.h"
  21. #endif
  22. #include "hdr.h"
  23. /*
  24. the our_* routines are implementations for the corresponding library
  25. routines. for a while, i tried to actually name them wctomb etc
  26. but stopped that after i found a system which made wchar_t an
  27. unsigned char.
  28. */
  29. int our_wctomb(char *s, unsigned long wc);
  30. int our_mbtowc(unsigned long *p, char *s, unsigned n);
  31. int runetoisoutf(char *str, Rune *rune);
  32. int fullisorune(char *str, int n);
  33. int isochartorune(Rune *rune, char *str);
  34. void
  35. utf_in(int fd, long *notused, struct convert *out)
  36. {
  37. char buf[N];
  38. int i, j, c, n, tot;
  39. ulong l;
  40. USED(notused);
  41. tot = 0;
  42. while((n = read(fd, buf+tot, N-tot)) >= 0){
  43. tot += n;
  44. for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
  45. c = our_mbtowc(&l, buf+i, tot-i);
  46. if(c == -1){
  47. if(squawk)
  48. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  49. if(clean){
  50. i++;
  51. continue;
  52. }
  53. nerrors++;
  54. l = Runeerror;
  55. c = 1;
  56. }
  57. runes[j++] = l;
  58. i += c;
  59. }
  60. OUT(out, runes, j);
  61. tot -= i;
  62. ninput += i;
  63. if(tot)
  64. memmove(buf, buf+i, tot);
  65. if(n == 0)
  66. break;
  67. }
  68. OUT(out, runes, 0);
  69. }
  70. void
  71. utf_out(Rune *base, int n, int32_t *notused)
  72. {
  73. char *p;
  74. Rune *r;
  75. USED(notused);
  76. nrunes += n;
  77. for(r = base, p = obuf; n-- > 0; r++){
  78. p += our_wctomb(p, *r);
  79. }
  80. noutput += p-obuf;
  81. write(1, obuf, p-obuf);
  82. }
  83. void
  84. isoutf_in(int fd, long *notused, struct convert *out)
  85. {
  86. char buf[N];
  87. int i, j, c, n, tot;
  88. USED(notused);
  89. tot = 0;
  90. while((n = read(fd, buf+tot, N-tot)) >= 0){
  91. tot += n;
  92. for(i=j=0; i<tot; ){
  93. if(!fullisorune(buf+i, tot-i))
  94. break;
  95. c = isochartorune(&runes[j], buf+i);
  96. if(runes[j] == Runeerror && c == 1){
  97. if(squawk)
  98. EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
  99. if(clean){
  100. i++;
  101. continue;
  102. }
  103. nerrors++;
  104. }
  105. j++;
  106. i += c;
  107. }
  108. OUT(out, runes, j);
  109. tot -= i;
  110. ninput += i;
  111. if(tot)
  112. memmove(buf, buf+i, tot);
  113. if(n == 0)
  114. break;
  115. }
  116. OUT(out, runes, 0);
  117. }
  118. void
  119. isoutf_out(Rune *base, int n, int32_t *notused)
  120. {
  121. char *p;
  122. Rune *r;
  123. USED(notused);
  124. nrunes += n;
  125. for(r = base, p = obuf; n-- > 0; r++)
  126. p += runetoisoutf(p, r);
  127. noutput += p-obuf;
  128. write(1, obuf, p-obuf);
  129. }
  130. enum
  131. {
  132. Char1 = Runeself, Rune1 = Runeself,
  133. Char21 = 0xA1, Rune21 = 0x0100,
  134. Char22 = 0xF6, Rune22 = 0x4016,
  135. Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
  136. Esc = 0xBE, Bad = Runeerror
  137. };
  138. static uint8_t U[256];
  139. static uint8_t T[256];
  140. static
  141. void
  142. mktable(void)
  143. {
  144. int i, u;
  145. for(i=0; i<256; i++) {
  146. u = i + (0x5E - 0xA0);
  147. if(i < 0xA0)
  148. u = i + (0xDF - 0x7F);
  149. if(i < 0x7F)
  150. u = i + (0x00 - 0x21);
  151. if(i < 0x21)
  152. u = i + (0xBE - 0x00);
  153. U[i] = u;
  154. T[u] = i;
  155. }
  156. }
  157. int
  158. isochartorune(Rune *rune, char *str)
  159. {
  160. int c, c1, c2;
  161. int32_t l;
  162. if(U[0] == 0)
  163. mktable();
  164. /*
  165. * one character sequence
  166. * 00000-0009F => 00-9F
  167. */
  168. c = *(uint8_t*)str;
  169. if(c < Char1) {
  170. *rune = c;
  171. return 1;
  172. }
  173. /*
  174. * two character sequence
  175. * 000A0-000FF => A0; A0-FF
  176. */
  177. c1 = *(uint8_t*)(str+1);
  178. if(c < Char21) {
  179. if(c1 >= Rune1 && c1 < Rune21) {
  180. *rune = c1;
  181. return 2;
  182. }
  183. goto bad;
  184. }
  185. /*
  186. * two character sequence
  187. * 00100-04015 => A1-F5; 21-7E/A0-FF
  188. */
  189. c1 = U[c1];
  190. if(c1 >= Esc)
  191. goto bad;
  192. if(c < Char22) {
  193. *rune = (c-Char21)*Esc + c1 + Rune21;
  194. return 2;
  195. }
  196. /*
  197. * three character sequence
  198. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  199. */
  200. c2 = U[*(uint8_t*)(str+2)];
  201. if(c2 >= Esc)
  202. goto bad;
  203. if(c < Char3) {
  204. l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
  205. if(l >= Rune3)
  206. goto bad;
  207. *rune = l;
  208. return 3;
  209. }
  210. /*
  211. * bad decoding
  212. */
  213. bad:
  214. *rune = Bad;
  215. return 1;
  216. }
  217. int
  218. runetoisoutf(char *str, Rune *rune)
  219. {
  220. int32_t c;
  221. if(T[0] == 0)
  222. mktable();
  223. /*
  224. * one character sequence
  225. * 00000-0009F => 00-9F
  226. */
  227. c = *rune;
  228. if(c < Rune1) {
  229. str[0] = c;
  230. return 1;
  231. }
  232. /*
  233. * two character sequence
  234. * 000A0-000FF => A0; A0-FF
  235. */
  236. if(c < Rune21) {
  237. str[0] = Char1;
  238. str[1] = c;
  239. return 2;
  240. }
  241. /*
  242. * two character sequence
  243. * 00100-04015 => A1-F5; 21-7E/A0-FF
  244. */
  245. if(c < Rune22) {
  246. c -= Rune21;
  247. str[0] = c/Esc + Char21;
  248. str[1] = T[c%Esc];
  249. return 2;
  250. }
  251. /*
  252. * three character sequence
  253. * 04016-38E2D => A6-FB; 21-7E/A0-FF
  254. */
  255. c -= Rune22;
  256. str[0] = c/(Esc*Esc) + Char22;
  257. str[1] = T[c/Esc%Esc];
  258. str[2] = T[c%Esc];
  259. return 3;
  260. }
  261. int
  262. fullisorune(char *str, int n)
  263. {
  264. int c;
  265. if(n > 0) {
  266. c = *(uint8_t*)str;
  267. if(c < Char1)
  268. return 1;
  269. if(n > 1)
  270. if(c < Char22 || n > 2)
  271. return 1;
  272. }
  273. return 0;
  274. }
  275. #ifdef PLAN9
  276. int errno;
  277. #endif
  278. enum
  279. {
  280. T1 = 0x00,
  281. Tx = 0x80,
  282. T2 = 0xC0,
  283. T3 = 0xE0,
  284. T4 = 0xF0,
  285. T5 = 0xF8,
  286. T6 = 0xFC,
  287. Bit1 = 7,
  288. Bitx = 6,
  289. Bit2 = 5,
  290. Bit3 = 4,
  291. Bit4 = 3,
  292. Bit5 = 2,
  293. Bit6 = 2,
  294. Mask1 = (1<<Bit1)-1,
  295. Maskx = (1<<Bitx)-1,
  296. Mask2 = (1<<Bit2)-1,
  297. Mask3 = (1<<Bit3)-1,
  298. Mask4 = (1<<Bit4)-1,
  299. Mask5 = (1<<Bit5)-1,
  300. Mask6 = (1<<Bit6)-1,
  301. Wchar1 = (1UL<<Bit1)-1,
  302. Wchar2 = (1UL<<(Bit2+Bitx))-1,
  303. Wchar3 = (1UL<<(Bit3+2*Bitx))-1,
  304. Wchar4 = (1UL<<(Bit4+3*Bitx))-1,
  305. Wchar5 = (1UL<<(Bit5+4*Bitx))-1,
  306. #ifndef EILSEQ
  307. EILSEQ = 123,
  308. #endif /* EILSEQ */
  309. };
  310. int
  311. our_wctomb(char *s, unsigned long wc)
  312. {
  313. if(s == 0)
  314. return 0; /* no shift states */
  315. if(wc & ~Wchar2) {
  316. if(wc & ~Wchar4) {
  317. if(wc & ~Wchar5) {
  318. /* 6 bytes */
  319. s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
  320. s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
  321. s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
  322. s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
  323. s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
  324. s[5] = Tx | (wc & Maskx);
  325. return 6;
  326. }
  327. /* 5 bytes */
  328. s[0] = T5 | (wc >> 4*Bitx);
  329. s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
  330. s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
  331. s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
  332. s[4] = Tx | (wc & Maskx);
  333. return 5;
  334. }
  335. if(wc & ~Wchar3) {
  336. /* 4 bytes */
  337. s[0] = T4 | (wc >> 3*Bitx);
  338. s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
  339. s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
  340. s[3] = Tx | (wc & Maskx);
  341. return 4;
  342. }
  343. /* 3 bytes */
  344. s[0] = T3 | (wc >> 2*Bitx);
  345. s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
  346. s[2] = Tx | (wc & Maskx);
  347. return 3;
  348. }
  349. if(wc & ~Wchar1) {
  350. /* 2 bytes */
  351. s[0] = T2 | (wc >> 1*Bitx);
  352. s[1] = Tx | (wc & Maskx);
  353. return 2;
  354. }
  355. /* 1 byte */
  356. s[0] = T1 | wc;
  357. return 1;
  358. }
  359. int
  360. our_mbtowc(unsigned long *p, char *s, unsigned n)
  361. {
  362. uint8_t *us;
  363. int c0, c1, c2, c3, c4, c5;
  364. unsigned long wc;
  365. if(s == 0)
  366. return 0; /* no shift states */
  367. if(n < 1)
  368. goto bad;
  369. us = (uint8_t*)s;
  370. c0 = us[0];
  371. if(c0 >= T3) {
  372. if(n < 3)
  373. goto bad;
  374. c1 = us[1] ^ Tx;
  375. c2 = us[2] ^ Tx;
  376. if((c1|c2) & T2)
  377. goto bad;
  378. if(c0 >= T5) {
  379. if(n < 5)
  380. goto bad;
  381. c3 = us[3] ^ Tx;
  382. c4 = us[4] ^ Tx;
  383. if((c3|c4) & T2)
  384. goto bad;
  385. if(c0 >= T6) {
  386. /* 6 bytes */
  387. if(n < 6)
  388. goto bad;
  389. c5 = us[5] ^ Tx;
  390. if(c5 & T2)
  391. goto bad;
  392. wc = ((((((((((c0 & Mask6) << Bitx) |
  393. c1) << Bitx) | c2) << Bitx) |
  394. c3) << Bitx) | c4) << Bitx) | c5;
  395. if(wc <= Wchar5)
  396. goto bad;
  397. *p = wc;
  398. return 6;
  399. }
  400. /* 5 bytes */
  401. wc = ((((((((c0 & Mask5) << Bitx) |
  402. c1) << Bitx) | c2) << Bitx) |
  403. c3) << Bitx) | c4;
  404. if(wc <= Wchar4)
  405. goto bad;
  406. *p = wc;
  407. return 5;
  408. }
  409. if(c0 >= T4) {
  410. /* 4 bytes */
  411. if(n < 4)
  412. goto bad;
  413. c3 = us[3] ^ Tx;
  414. if(c3 & T2)
  415. goto bad;
  416. wc = ((((((c0 & Mask4) << Bitx) |
  417. c1) << Bitx) | c2) << Bitx) |
  418. c3;
  419. if(wc <= Wchar3)
  420. goto bad;
  421. *p = wc;
  422. return 4;
  423. }
  424. /* 3 bytes */
  425. wc = ((((c0 & Mask3) << Bitx) |
  426. c1) << Bitx) | c2;
  427. if(wc <= Wchar2)
  428. goto bad;
  429. *p = wc;
  430. return 3;
  431. }
  432. if(c0 >= T2) {
  433. /* 2 bytes */
  434. if(n < 2)
  435. goto bad;
  436. c1 = us[1] ^ Tx;
  437. if(c1 & T2)
  438. goto bad;
  439. wc = ((c0 & Mask2) << Bitx) |
  440. c1;
  441. if(wc <= Wchar1)
  442. goto bad;
  443. *p = wc;
  444. return 2;
  445. }
  446. /* 1 byte */
  447. if(c0 >= Tx)
  448. goto bad;
  449. *p = c0;
  450. return 1;
  451. bad:
  452. errno = EILSEQ;
  453. return -1;
  454. }