utf8.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. // utf8
  2. unsigned int utf8_rune_len(uint8_t b) {
  3. if ((b & 0x80)==0) { // ascii
  4. return 1;
  5. } else if ((b & 0xe0) == 0xc0) {
  6. return 2;
  7. } else if ((b & 0xf0) == 0xe0) {
  8. return 3;
  9. } else if ((b & 0xf8) == 0xf0) {
  10. return 4;
  11. }
  12. return 1;
  13. }
  14. int utf8_strlen(char *s, int len) {
  15. int i = 0, j = 0;
  16. while (s[i] && i<len) {
  17. if ((s[i] & 0xc0) != 0x80) j++;
  18. i++;
  19. }
  20. return j;
  21. }
  22. unsigned int utf8_rune_at(char* s, int idx) {
  23. int i = 0, j = 0;
  24. unsigned int rune = 0;
  25. int state = 0;
  26. while (s[i]) {
  27. unsigned char b1 = s[i];
  28. if ((b1 & 0x80)==0) { // ascii
  29. rune = b1;
  30. state = 0;
  31. } else if (state>0) {
  32. rune=(rune<<6) | (b1 & 0x3fu);
  33. state--;
  34. } else if ((b1 & 0xe0) == 0xc0) {
  35. // 16 bit
  36. rune = b1 & 0x1f;
  37. state = 1;
  38. } else if ((b1 & 0xf0) == 0xe0) {
  39. // 24 bit
  40. rune = b1 & 0x0f;
  41. state = 2;
  42. } else if ((b1 & 0xf8) == 0xf0) {
  43. // 32 bit
  44. rune = b1 & 0x07;
  45. state = 3;
  46. }
  47. // next char
  48. if (state == 0) {
  49. if (idx == j) {
  50. return rune;
  51. }
  52. j++;
  53. }
  54. i++;
  55. }
  56. return 0;
  57. }
  58. // adapted from TidyLib (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
  59. int rune_to_utf8(jit_word_t c, void* tempbuf, int* count)
  60. {
  61. uint8_t* buf = (uint8_t*)tempbuf;
  62. int bytes = 0;
  63. int has_error = 0;
  64. if (c <= 0x7F) /* 0XXX XXXX one uint8_t */
  65. {
  66. buf[0] = (uint8_t) c;
  67. bytes = 1;
  68. }
  69. else if (c <= 0x7FF) /* 110X XXXX two bytes */
  70. {
  71. buf[0] = (uint8_t) (0xC0 | (c >> 6));
  72. buf[1] = (uint8_t) (0x80 | (c & 0x3F));
  73. bytes = 2;
  74. }
  75. else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
  76. {
  77. buf[0] = (uint8_t) (0xE0 | (c >> 12));
  78. buf[1] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
  79. buf[2] = (uint8_t) (0x80 | (c & 0x3F));
  80. bytes = 3;
  81. }
  82. else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
  83. {
  84. buf[0] = (uint8_t) (0xF0 | (c >> 18));
  85. buf[1] = (uint8_t) (0x80 | ((c >> 12) & 0x3F));
  86. buf[2] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
  87. buf[3] = (uint8_t) (0x80 | (c & 0x3F));
  88. bytes = 4;
  89. }
  90. else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
  91. {
  92. buf[0] = (uint8_t) (0xF8 | (c >> 24));
  93. buf[1] = (uint8_t) (0x80 | (c >> 18));
  94. buf[2] = (uint8_t) (0x80 | ((c >> 12) & 0x3F));
  95. buf[3] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
  96. buf[4] = (uint8_t) (0x80 | (c & 0x3F));
  97. bytes = 5;
  98. has_error = 1;
  99. }
  100. else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
  101. {
  102. buf[0] = (uint8_t) (0xFC | (c >> 30));
  103. buf[1] = (uint8_t) (0x80 | ((c >> 24) & 0x3F));
  104. buf[2] = (uint8_t) (0x80 | ((c >> 18) & 0x3F));
  105. buf[3] = (uint8_t) (0x80 | ((c >> 12) & 0x3F));
  106. buf[4] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
  107. buf[5] = (uint8_t) (0x80 | (c & 0x3F));
  108. bytes = 6;
  109. has_error = 1;
  110. }
  111. else {
  112. has_error = 1;
  113. }
  114. *count = bytes;
  115. if (has_error) return -1;
  116. return 0;
  117. }
  118. int utf8_str_to_runestr(char* ustr, int len_bytes, uint32_t* dest) {
  119. uint32_t desti = 0;
  120. uint32_t rune = 0;
  121. int state = 0;
  122. for (int i=0; i<len_bytes; i++) {
  123. uint8_t b1 = ustr[i];
  124. if ((b1 & 0x80)==0) { // ascii
  125. rune = b1;
  126. state = 0;
  127. } else if (state>0) {
  128. rune=(rune<<6) | (b1 & 0x3fu);
  129. state--;
  130. } else if ((b1 & 0xe0) == 0xc0) {
  131. // 16 bit
  132. rune = b1 & 0x1f;
  133. state = 1;
  134. } else if ((b1 & 0xf0) == 0xe0) {
  135. // 24 bit
  136. rune = b1 & 0x0f;
  137. state = 2;
  138. } else if ((b1 & 0xf8) == 0xf0) {
  139. // 32 bit
  140. rune = b1 & 0x07;
  141. state = 3;
  142. }
  143. // next char
  144. if (state == 0) {
  145. dest[desti++] = rune;
  146. }
  147. }
  148. return desti;
  149. }
  150. /*
  151. jit_word_t utf8_strlen_cell(Cell* cell) {
  152. if (!cell || (cell->tag!=TAG_STR && cell->tag!=TAG_BYTES) || !cell->addr) return 0;
  153. return utf8_strlen(cell->addr, cell->size);
  154. }
  155. jit_word_t utf8_rune_at_cell(Cell* cell, Cell* c_idx) {
  156. if (!cell || (cell->tag!=TAG_STR && cell->tag!=TAG_BYTES)) return 0;
  157. if (!c_idx || c_idx->tag!=TAG_INT) return 0;
  158. if (c_idx->value >= cell->size) return 0;
  159. if (c_idx->value < 0) return 0;
  160. if (!cell->addr) {
  161. printf("error: string with NULL addr at %p!\n",cell);
  162. return 0;
  163. }
  164. unsigned int result = utf8_rune_at(cell->addr, c_idx->value);
  165. return result;
  166. }
  167. jit_word_t utf8_put_rune_at(Cell* cell, Cell* c_idx, Cell* c_rune) {
  168. if (!cell || (cell->tag!=TAG_STR && cell->tag!=TAG_BYTES)) return 0;
  169. if (!c_idx || c_idx->tag!=TAG_INT) return 0;
  170. if (!c_rune || c_rune->tag!=TAG_INT) return 0;
  171. char* s = cell->addr;
  172. int idx = c_idx->value;
  173. int rune = c_rune->value;
  174. if (idx<0 || idx>=cell->size) return 0;
  175. // fast forward to the right place
  176. unsigned int i = 0, j = 0;
  177. while (i<cell->size && s[i]) {
  178. if (j==idx) break;
  179. i+=utf8_rune_len(s[i]);
  180. j++;
  181. }
  182. // how long is the existing rune at target spot?
  183. int existing_len = utf8_rune_len(s[i]);
  184. int rune_len = 0;
  185. char tmp[10];
  186. rune_to_utf8(rune, tmp, &rune_len);
  187. if ((i+rune_len)>=cell->size) return 0;
  188. //printf("-- existing rune length at %d: %d new rune length: %d\n",idx,j,rune_len);
  189. if (existing_len>rune_len) {
  190. // new rune is smaller
  191. int movelen = cell->size - (i+existing_len);
  192. if (movelen<rune_len) {
  193. //printf("-- utf8_put_rune_at error: rune %d doesn't fit into string at %d\n",rune,idx);
  194. return 0;
  195. }
  196. printf("move a: %d -> %d len %d / size %d\r\n",i+existing_len,i+rune_len,movelen,cell->size);
  197. memmove(cell->addr+i+rune_len, cell->addr+i+existing_len, movelen);
  198. } else if (j<rune_len) {
  199. // new rune is bigger
  200. int movelen = cell->size - (i+rune_len);
  201. if (movelen<rune_len) {
  202. //printf("-- utf8_put_rune_at error: rune %d doesn't fit into string at %d\n",rune,idx);
  203. return 0;
  204. }
  205. printf("move b: %d -> %d len %d / size %d\r\n",i+existing_len,i+rune_len,movelen,cell->size);
  206. memmove(cell->addr+i+rune_len, cell->addr+i+existing_len, movelen);
  207. }
  208. // write the new rune
  209. for (int m=0; m<rune_len; m++) {
  210. ((uint8_t*)cell->addr)[i+m] = tmp[m];
  211. }
  212. return i;
  213. }
  214. */