rune.c 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #include <plan9.h>
  2. char *argv0;
  3. enum
  4. {
  5. Bit1 = 7,
  6. Bitx = 6,
  7. Bit2 = 5,
  8. Bit3 = 4,
  9. Bit4 = 3,
  10. T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  11. Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  12. T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  13. T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  14. T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  15. T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
  16. Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
  17. Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
  18. Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
  19. Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
  20. Maskx = (1<<Bitx)-1, /* 0011 1111 */
  21. Testx = Maskx ^ 0xFF, /* 1100 0000 */
  22. SurrogateMin = 0xD800,
  23. SurrogateMax = 0xDFFF,
  24. Bad = Runeerror
  25. };
  26. int
  27. chartorune(Rune *rune, char *str)
  28. {
  29. int c, c1, c2, c3;
  30. long l;
  31. /*
  32. * one character sequence
  33. * 00000-0007F => T1
  34. */
  35. c = *(uchar*)str;
  36. if(c < Tx) {
  37. *rune = c;
  38. return 1;
  39. }
  40. /*
  41. * two character sequence
  42. * 00080-007FF => T2 Tx
  43. */
  44. c1 = *(uchar*)(str+1) ^ Tx;
  45. if(c1 & Testx)
  46. goto bad;
  47. if(c < T3) {
  48. if(c < T2)
  49. goto bad;
  50. l = ((c << Bitx) | c1) & Rune2;
  51. if(l <= Rune1)
  52. goto bad;
  53. *rune = l;
  54. return 2;
  55. }
  56. /*
  57. * three character sequence
  58. * 00800-0FFFF => T3 Tx Tx
  59. */
  60. c2 = *(uchar*)(str+2) ^ Tx;
  61. if(c2 & Testx)
  62. goto bad;
  63. if(c < T4) {
  64. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  65. if(l <= Rune2)
  66. goto bad;
  67. if (SurrogateMin <= l && l <= SurrogateMax)
  68. goto bad;
  69. *rune = l;
  70. return 3;
  71. }
  72. /*
  73. * four character sequence
  74. * 10000-10FFFF => T4 Tx Tx Tx
  75. */
  76. if(UTFmax >= 4) {
  77. c3 = *(uchar*)(str+3) ^ Tx;
  78. if(c3 & Testx)
  79. goto bad;
  80. if(c < T5) {
  81. l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
  82. if(l <= Rune3)
  83. goto bad;
  84. if(l > Runemax)
  85. goto bad;
  86. *rune = l;
  87. return 4;
  88. }
  89. }
  90. /*
  91. * bad decoding
  92. */
  93. bad:
  94. *rune = Bad;
  95. return 1;
  96. }
  97. int
  98. runetochar(char *str, Rune *rune)
  99. {
  100. long c;
  101. /*
  102. * one character sequence
  103. * 00000-0007F => 00-7F
  104. */
  105. c = *rune;
  106. if(c <= Rune1) {
  107. str[0] = c;
  108. return 1;
  109. }
  110. /*
  111. * two character sequence
  112. * 0080-07FF => T2 Tx
  113. */
  114. if(c <= Rune2) {
  115. str[0] = T2 | (c >> 1*Bitx);
  116. str[1] = Tx | (c & Maskx);
  117. return 2;
  118. }
  119. /*
  120. * If the Rune is out of range or a surrogate half, convert it to the error rune.
  121. * Do this test here because the error rune encodes to three bytes.
  122. * Doing it earlier would duplicate work, since an out of range
  123. * Rune wouldn't have fit in one or two bytes.
  124. */
  125. if (c > Runemax)
  126. c = Runeerror;
  127. if (SurrogateMin <= c && c <= SurrogateMax)
  128. c = Runeerror;
  129. /*
  130. * three character sequence
  131. * 0800-FFFF => T3 Tx Tx
  132. */
  133. if (c <= Rune3) {
  134. str[0] = T3 | (c >> 2*Bitx);
  135. str[1] = Tx | ((c >> 1*Bitx) & Maskx);
  136. str[2] = Tx | (c & Maskx);
  137. return 3;
  138. }
  139. /*
  140. * four character sequence (21-bit value)
  141. * 10000-1FFFFF => T4 Tx Tx Tx
  142. */
  143. str[0] = T4 | (c >> 3*Bitx);
  144. str[1] = Tx | ((c >> 2*Bitx) & Maskx);
  145. str[2] = Tx | ((c >> 1*Bitx) & Maskx);
  146. str[3] = Tx | (c & Maskx);
  147. return 4;
  148. }
  149. int
  150. runelen(long c)
  151. {
  152. Rune rune;
  153. char str[10];
  154. rune = c;
  155. return runetochar(str, &rune);
  156. }
  157. int
  158. utflen(char *s)
  159. {
  160. int c;
  161. long n;
  162. Rune rune;
  163. n = 0;
  164. for(;;) {
  165. c = *(uchar*)s;
  166. if(c < Runeself) {
  167. if(c == 0)
  168. return n;
  169. s++;
  170. } else
  171. s += chartorune(&rune, s);
  172. n++;
  173. }
  174. return 0;
  175. }