mbwc.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #include <stdlib.h>
  2. #include <limits.h>
/*
 * Use the FSS-UTF transformation proposed by POSIX.
 * We define 7 byte types:
 *	T0	0xxxxxxx	7 free bits
 *	Tx	10xxxxxx	6 free bits
 *	T1	110xxxxx	5 free bits
 *	T2	1110xxxx	4 free bits
 *	T3	11110xxx	3 free bits
 *	T4	111110xx	2 free bits
 *	T5	1111110x	1 free bit
 *
 * Encoding is as follows.
 *	From hex	Thru hex	Sequence		Bits
 *	00000000	0000007F	T0			7
 *	00000080	000007FF	T1 Tx			11
 *	00000800	0000FFFF	T2 Tx Tx		16
 *	00010000	001FFFFF	T3 Tx Tx Tx		21
 *	00200000	03FFFFFF	T4 Tx Tx Tx Tx		26
 *	04000000	7FFFFFFF	T5 Tx Tx Tx Tx Tx	31
 */
  23. int
  24. mbtowc(wchar_t *pwc, const char *s, size_t n);
  25. int
  26. mblen(const char *s, size_t n)
  27. {
  28. return mbtowc(0, s, n);
  29. }
  30. enum {
  31. C0MSK = 0x7F,
  32. C1MSK = 0x7FF,
  33. T1 = 0xC0,
  34. T2 = 0xE0,
  35. NT1BITS = 11,
  36. NSHFT = 5,
  37. NCSHFT = NSHFT + 1,
  38. WCHARMSK = (1<< (8*MB_LEN_MAX - 1)) - 1,
  39. };
  40. int
  41. mbtowc(wchar_t *pwc, const char *s, size_t n)
  42. {
  43. unsigned long long c[MB_LEN_MAX];
  44. unsigned long long l, m, wm, b;
  45. int i;
  46. if(!s)
  47. return 0;
  48. if(n < 1)
  49. goto bad;
  50. c[0] = s[0] & 0xff; /* first one is special */
  51. if((c[0] & 0x80) == 0x00) {
  52. if(pwc)
  53. *pwc = c[0];
  54. if(c[0] == 0)
  55. return 0;
  56. return 1;
  57. }
  58. m = T2;
  59. b = m^0x20;
  60. l = c[0];
  61. wm = C1MSK;
  62. for(i = 1; i < MB_LEN_MAX + 1; i++){
  63. if(n < i+1)
  64. goto bad;
  65. c[i] = (s[i] ^ 0x80) & 0xff;
  66. l = (l << NCSHFT) | c[i];
  67. if((c[i] & 0xC0) != 0x00)
  68. goto bad;
  69. if((c[0] & m) == b) {
  70. if(pwc)
  71. *pwc = l & wm;
  72. return i + 1;
  73. }
  74. b = m;
  75. m = (m >> 1) | 0x80;
  76. wm = (wm << NSHFT) | wm;
  77. }
  78. /*
  79. * bad decoding
  80. */
  81. bad:
  82. return -1;
  83. }
  84. int
  85. wctomb(char *s, wchar_t wchar)
  86. {
  87. unsigned long long c, maxc, m;
  88. int i, j;
  89. if(!s)
  90. return 0;
  91. maxc = 0x80;
  92. c = wchar & WCHARMSK;
  93. if(c < maxc) {
  94. s[0] = c;
  95. return 1;
  96. }
  97. m = T1;
  98. for(i = 2; i < MB_LEN_MAX + 1; i++){
  99. maxc <<= 4;
  100. if(c < maxc || i == MB_LEN_MAX){
  101. s[0] = m | (c >> ((i - 1) * NCSHFT));
  102. for(j = i - 1; j >= 1; j--){
  103. s[i - j] = 0x80|((c>>(6 * (j - 1)))&0x3f);
  104. }
  105. return i;
  106. }
  107. m = (m >> 1) | 0x80;
  108. }
  109. return MB_LEN_MAX;
  110. }
  111. size_t
  112. mbstowcs(wchar_t *pwcs, const char *s, size_t n)
  113. {
  114. int i, d, c;
  115. for(i=0; i < n; i++) {
  116. c = *s & 0xff;
  117. if(c < 0x80) {
  118. *pwcs = c;
  119. if(c == 0)
  120. break;
  121. s++;
  122. } else {
  123. d = mbtowc(pwcs, s, MB_LEN_MAX);
  124. if(d <= 0)
  125. return (size_t)((d<0) ? -1 : i);
  126. s += d;
  127. }
  128. pwcs++;
  129. }
  130. return i;
  131. }
  132. size_t
  133. wcstombs(char *s, const wchar_t *pwcs, size_t n)
  134. {
  135. int i, d;
  136. long c;
  137. char *p, *pe;
  138. char buf[MB_LEN_MAX];
  139. p = s;
  140. pe = p+n-MB_LEN_MAX;
  141. while(p < pe) {
  142. c = *pwcs++;
  143. if(c < 0x80)
  144. *p++ = c;
  145. else
  146. p += wctomb(p, c);
  147. if(c == 0)
  148. return p-s;
  149. }
  150. while(p < pe+MB_LEN_MAX) {
  151. c = *pwcs++;
  152. d = wctomb(buf, c);
  153. if(p+d <= pe+MB_LEN_MAX) {
  154. *p++ = buf[0]; /* first one is special */
  155. for(i = 2; i < MB_LEN_MAX + 1; i++){
  156. if(d <= i -1)
  157. break;
  158. *p++ = buf[i];
  159. }
  160. }
  161. if(c == 0)
  162. break;
  163. }
  164. return p-s;
  165. }