rune.c 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. #include <u.h>
  2. #include <libc.h>
/*
 * Bit(i) is the number of payload bits left in a byte whose top
 * i bits are 1 followed by a 0 bit: Bit(0) is 7 (plain ASCII),
 * Bit(1) is 6 (continuation byte), Bit(2) is 5 (two-byte lead), ...
 */
#define Bit(i) (7-(i))
/* byte with the top i bits set and the rest clear; T(2) is 1100 0000 */
#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
/*
 * mask covering every payload bit of an i-byte sequence, i.e. the
 * largest value i bytes can carry.  Only meaningful for i >= 2
 * (one-byte sequences use Rune1 instead).
 * RuneX(2) is 0000 0000 0000 0111 1111 1111
 */
#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
enum
{
	Bitx = Bit(1),	/* payload bits in one continuation byte (6) */
	Tx = T(1), /* 1000 0000, continuation byte marker */
	Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111, largest one-byte rune */
	Maskx = (1<<Bitx)-1, /* 0011 1111, payload bits of a continuation byte */
	Testx = Maskx ^ 0xFF, /* 1100 0000, marker bits of a continuation byte */
	SurrogateMin = 0xD800,	/* first UTF-16 surrogate; surrogates are never valid runes */
	SurrogateMax = 0xDFFF,	/* last UTF-16 surrogate */
	Bad = Runeerror,	/* rune stored by chartorune when decoding fails */
};
  19. int
  20. chartorune(Rune *rune, char *str)
  21. {
  22. int c[UTFmax], i;
  23. Rune l;
  24. /*
  25. * N character sequence
  26. * 00000-0007F => T1
  27. * 00080-007FF => T2 Tx
  28. * 00800-0FFFF => T3 Tx Tx
  29. * 10000-10FFFF => T4 Tx Tx Tx
  30. */
  31. c[0] = *(uchar*)(str);
  32. if(c[0] < Tx){
  33. *rune = c[0];
  34. return 1;
  35. }
  36. l = c[0];
  37. for(i = 1; i < UTFmax; i++) {
  38. c[i] = *(uchar*)(str+i);
  39. c[i] ^= Tx;
  40. if(c[i] & Testx)
  41. goto bad;
  42. l = (l << Bitx) | c[i];
  43. if(c[0] < T(i + 2)) {
  44. l &= RuneX(i + 1);
  45. if(i == 1) {
  46. if(c[0] < T(2) || l <= Rune1)
  47. goto bad;
  48. } else if(l <= RuneX(i) || l > Runemax)
  49. goto bad;
  50. if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
  51. goto bad;
  52. *rune = l;
  53. return i + 1;
  54. }
  55. }
  56. /*
  57. * bad decoding
  58. */
  59. bad:
  60. *rune = Bad;
  61. return 1;
  62. }
  63. int
  64. runetochar(char *str, Rune *rune)
  65. {
  66. int i, j;
  67. Rune c;
  68. c = *rune;
  69. if(c <= Rune1) {
  70. str[0] = c;
  71. return 1;
  72. }
  73. /*
  74. * one character sequence
  75. * 00000-0007F => 00-7F
  76. * two character sequence
  77. * 0080-07FF => T2 Tx
  78. * three character sequence
  79. * 0800-FFFF => T3 Tx Tx
  80. * four character sequence (21-bit value)
  81. * 10000-1FFFFF => T4 Tx Tx Tx
  82. * If the Rune is out of range or a surrogate half,
  83. * convert it to the error rune.
  84. * Do this test when i==3 because the error rune encodes to three bytes.
  85. * Doing it earlier would duplicate work, since an out of range
  86. * Rune wouldn't have fit in one or two bytes.
  87. */
  88. for(i = 2; i < UTFmax + 1; i++){
  89. if(i == 3){
  90. if(c > Runemax)
  91. c = Runeerror;
  92. if(SurrogateMin <= c && c <= SurrogateMax)
  93. c = Runeerror;
  94. }
  95. if (c <= RuneX(i) || i == UTFmax ) {
  96. str[0] = T(i) | (c >> (i - 1)*Bitx);
  97. for(j = 1; j < i; j++)
  98. str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
  99. return i;
  100. }
  101. }
  102. return UTFmax;
  103. }
  104. int
  105. runelen(long c)
  106. {
  107. Rune rune;
  108. char str[10];
  109. rune = c;
  110. return runetochar(str, &rune);
  111. }
  112. int
  113. runenlen(Rune *r, int nrune)
  114. {
  115. int nb, i;
  116. Rune c;
  117. nb = 0;
  118. while(nrune--) {
  119. c = *r++;
  120. if(c <= Rune1){
  121. nb++;
  122. } else {
  123. for(i = 2; i < UTFmax + 1; i++)
  124. if(c <= RuneX(i) || i == UTFmax){
  125. nb += i;
  126. break;
  127. }
  128. }
  129. }
  130. return nb;
  131. }
  132. int
  133. fullrune(char *str, int n)
  134. {
  135. int i;
  136. Rune c;
  137. if(n <= 0)
  138. return 0;
  139. c = *(uchar*)str;
  140. if(c < Tx)
  141. return 1;
  142. for(i = 3; i < UTFmax + 1; i++)
  143. if(c < T(i))
  144. return n >= i - 1;
  145. return n >= UTFmax;
  146. }