rune.c 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. #include "lib9.h"
  2. #define Bit(i) (7-(i))
  3. /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */
  4. #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
  5. /* 0000 0000 0000 0111 1111 1111 */
  6. #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
  7. enum
  8. {
  9. Bitx = Bit(1),
  10. Tx = T(1), /* 1000 0000 */
  11. Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
  12. Maskx = (1<<Bitx)-1, /* 0011 1111 */
  13. Testx = Maskx ^ 0xFF, /* 1100 0000 */
  14. SurrogateMin = 0xD800,
  15. SurrogateMax = 0xDFFF,
  16. Bad = Runeerror,
  17. };
  18. int
  19. chartorune(Rune *rune, char *str)
  20. {
  21. int c[UTFmax], i;
  22. Rune l;
  23. /*
  24. * N character sequence
  25. * 00000-0007F => T1
  26. * 00080-007FF => T2 Tx
  27. * 00800-0FFFF => T3 Tx Tx
  28. * 10000-10FFFF => T4 Tx Tx Tx
  29. */
  30. c[0] = *(uchar*)(str);
  31. if(c[0] < Tx){
  32. *rune = c[0];
  33. return 1;
  34. }
  35. l = c[0];
  36. for(i = 1; i < UTFmax; i++) {
  37. c[i] = *(uchar*)(str+i);
  38. c[i] ^= Tx;
  39. if(c[i] & Testx)
  40. goto bad;
  41. l = (l << Bitx) | c[i];
  42. if(c[0] < T(i + 2)) {
  43. l &= RuneX(i + 1);
  44. if(i == 1) {
  45. if(c[0] < T(2) || l <= Rune1)
  46. goto bad;
  47. } else if(l <= RuneX(i) || l > Runemax)
  48. goto bad;
  49. if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
  50. goto bad;
  51. *rune = l;
  52. return i + 1;
  53. }
  54. }
  55. /*
  56. * bad decoding
  57. */
  58. bad:
  59. *rune = Bad;
  60. return 1;
  61. }
  62. int
  63. runetochar(char *str, Rune *rune)
  64. {
  65. int i, j;
  66. Rune c;
  67. c = *rune;
  68. if(c <= Rune1) {
  69. str[0] = c;
  70. return 1;
  71. }
  72. /*
  73. * one character sequence
  74. * 00000-0007F => 00-7F
  75. * two character sequence
  76. * 0080-07FF => T2 Tx
  77. * three character sequence
  78. * 0800-FFFF => T3 Tx Tx
  79. * four character sequence (21-bit value)
  80. * 10000-1FFFFF => T4 Tx Tx Tx
  81. * If the Rune is out of range or a surrogate half,
  82. * convert it to the error rune.
  83. * Do this test when i==3 because the error rune encodes to three bytes.
  84. * Doing it earlier would duplicate work, since an out of range
  85. * Rune wouldn't have fit in one or two bytes.
  86. */
  87. for(i = 2; i < UTFmax + 1; i++){
  88. if(i == 3){
  89. if(c > Runemax)
  90. c = Runeerror;
  91. if(SurrogateMin <= c && c <= SurrogateMax)
  92. c = Runeerror;
  93. }
  94. if (c <= RuneX(i) || i == UTFmax ) {
  95. str[0] = T(i) | (c >> (i - 1)*Bitx);
  96. for(j = 1; j < i; j++)
  97. str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
  98. return i;
  99. }
  100. }
  101. return UTFmax;
  102. }
  103. int
  104. runelen(long c)
  105. {
  106. Rune rune;
  107. char str[10];
  108. rune = c;
  109. return runetochar(str, &rune);
  110. }
  111. int
  112. runenlen(Rune *r, int nrune)
  113. {
  114. int nb, i;
  115. Rune c;
  116. nb = 0;
  117. while(nrune--) {
  118. c = *r++;
  119. if(c <= Rune1){
  120. nb++;
  121. } else {
  122. for(i = 2; i < UTFmax + 1; i++)
  123. if(c <= RuneX(i) || i == UTFmax){
  124. nb += i;
  125. break;
  126. }
  127. }
  128. }
  129. return nb;
  130. }
  131. int
  132. fullrune(char *str, int n)
  133. {
  134. int i;
  135. Rune c;
  136. if(n <= 0)
  137. return 0;
  138. c = *(uchar*)str;
  139. if(c < Tx)
  140. return 1;
  141. for(i = 3; i < UTFmax + 1; i++)
  142. if(c < T(i))
  143. return n >= i - 1;
  144. return n >= UTFmax;
  145. }