unicode.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /* vi: set sw=4 ts=4: */
  2. /*
  3. * Unicode support routines.
  4. *
  5. * Copyright (C) 2009 Denys Vlasenko
  6. *
  7. * Licensed under GPL version 2, see file LICENSE in this tarball for details.
  8. */
  9. #include "libbb.h"
  10. # include "unicode.h"
  11. size_t FAST_FUNC bb_mbstrlen(const char *string)
  12. {
  13. size_t width = mbstowcs(NULL, string, INT_MAX);
  14. if (width == (size_t)-1L)
  15. return strlen(string);
  16. return width;
  17. }
  18. #if !ENABLE_LOCALE_SUPPORT
  19. /* Crude "locale support" which knows only C and Unicode locales */
  20. /* unicode_is_enabled:
  21. * 0: not known yet,
  22. * 1: not unicode (IOW: assuming one char == one byte)
  23. * 2: unicode
  24. */
  25. # if !ENABLE_FEATURE_CHECK_UNICODE_IN_ENV
  26. # define unicode_is_enabled 2
  27. # else
  28. static smallint unicode_is_enabled;
  29. void FAST_FUNC check_unicode_in_env(void)
  30. {
  31. char *lang;
  32. if (unicode_is_enabled)
  33. return;
  34. unicode_is_enabled = 1;
  35. lang = getenv("LANG");
  36. if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF")))
  37. return;
  38. unicode_is_enabled = 2;
  39. }
  40. # endif
  41. static size_t wcrtomb_internal(char *s, wchar_t wc)
  42. {
  43. int n, i;
  44. uint32_t v = wc;
  45. if (v <= 0x7f) {
  46. *s = v;
  47. return 1;
  48. }
  49. /* RFC 3629 says that Unicode ends at 10FFFF,
  50. * but we cover entire 32 bits */
  51. /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
  52. /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
  53. /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
  54. /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
  55. /* 80-7FF -> 110yyyxx 10xxxxxx */
  56. /* How many bytes do we need? */
  57. n = 2;
  58. /* (0x80000000+ would result in n = 7, limiting n to 6) */
  59. while (v >= 0x800 && n < 6) {
  60. v >>= 5;
  61. n++;
  62. }
  63. /* Fill bytes n-1..1 */
  64. i = n;
  65. while (--i) {
  66. s[i] = (wc & 0x3f) | 0x80;
  67. wc >>= 6;
  68. }
  69. /* Fill byte 0 */
  70. s[0] = wc | (uint8_t)(0x3f00 >> n);
  71. return n;
  72. }
  73. size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM)
  74. {
  75. if (unicode_is_enabled != 2) {
  76. *s = wc;
  77. return 1;
  78. }
  79. return wcrtomb_internal(s, wc);
  80. }
  81. size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
  82. {
  83. size_t org_n = n;
  84. if (unicode_is_enabled != 2) {
  85. while (n) {
  86. wchar_t c = *src++;
  87. *dest++ = c;
  88. if (c == 0)
  89. break;
  90. n--;
  91. }
  92. return org_n - n;
  93. }
  94. while (n >= MB_CUR_MAX) {
  95. wchar_t wc = *src++;
  96. size_t len = wcrtomb_internal(dest, wc);
  97. if (wc == L'\0')
  98. return org_n - n;
  99. dest += len;
  100. n -= len;
  101. }
  102. while (n) {
  103. char tbuf[MB_CUR_MAX];
  104. wchar_t wc = *src++;
  105. size_t len = wcrtomb_internal(tbuf, wc);
  106. if (len > n)
  107. len = n;
  108. memcpy(dest, tbuf, len);
  109. if (wc == L'\0')
  110. return org_n - n;
  111. dest += len;
  112. n -= len;
  113. }
  114. return org_n - n;
  115. }
  116. size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
  117. {
  118. size_t org_n = n;
  119. if (unicode_is_enabled != 2) {
  120. while (n) {
  121. unsigned char c = *src++;
  122. if (dest)
  123. *dest++ = c;
  124. if (c == 0)
  125. break;
  126. n--;
  127. }
  128. return org_n - n;
  129. }
  130. while (n) {
  131. int bytes;
  132. unsigned c = (unsigned char) *src++;
  133. if (c <= 0x7f) {
  134. if (dest)
  135. *dest++ = c;
  136. if (c == '\0')
  137. break;
  138. n--;
  139. continue;
  140. }
  141. /* 80-7FF -> 110yyyxx 10xxxxxx */
  142. /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
  143. /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
  144. /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
  145. /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
  146. bytes = 0;
  147. do {
  148. c <<= 1;
  149. bytes++;
  150. } while ((c & 0x80) && bytes < 6);
  151. if (bytes == 1)
  152. return (size_t) -1L;
  153. c = (uint8_t)(c) >> bytes;
  154. while (--bytes) {
  155. unsigned ch = (unsigned char) *src++;
  156. if ((ch & 0xc0) != 0x80) {
  157. return (size_t) -1L;
  158. }
  159. c = (c << 6) + (ch & 0x3f);
  160. }
  161. /* TODO */
  162. /* Need to check that c isn't produced by overlong encoding */
  163. /* Example: 11000000 10000000 converts to NUL */
  164. /* 11110000 10000000 10000100 10000000 converts to 0x100 */
  165. /* correct encoding: 11000100 10000000 */
  166. if (c <= 0x7f) { /* crude check */
  167. return (size_t) -1L;
  168. //or maybe: c = 0xfffd; /* replacement character */
  169. }
  170. if (dest)
  171. *dest++ = c;
  172. n--;
  173. }
  174. return org_n - n;
  175. }
  176. int FAST_FUNC iswspace(wint_t wc)
  177. {
  178. return (unsigned)wc <= 0x7f && isspace(wc);
  179. }
  180. int FAST_FUNC iswalnum(wint_t wc)
  181. {
  182. return (unsigned)wc <= 0x7f && isalnum(wc);
  183. }
  184. int FAST_FUNC iswpunct(wint_t wc)
  185. {
  186. return (unsigned)wc <= 0x7f && ispunct(wc);
  187. }
  188. #endif