/* rune.c */
  1. #include <u.h>
  2. #include <libc.h>
  3. enum
  4. {
  5. Bit1 = 7,
  6. Bitx = 6,
  7. Bit2 = 5,
  8. Bit3 = 4,
  9. Bit4 = 3,
  10. T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  11. Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  12. T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  13. T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  14. T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  15. Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
  16. Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
  17. Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
  18. Maskx = (1<<Bitx)-1, /* 0011 1111 */
  19. Testx = Maskx ^ 0xFF, /* 1100 0000 */
  20. Bad = Runeerror,
  21. };
  22. int
  23. chartorune(Rune *rune, char *str)
  24. {
  25. int c, c1, c2;
  26. long l;
  27. /*
  28. * one character sequence
  29. * 00000-0007F => T1
  30. */
  31. c = *(uchar*)str;
  32. if(c < Tx) {
  33. *rune = c;
  34. return 1;
  35. }
  36. /*
  37. * two character sequence
  38. * 0080-07FF => T2 Tx
  39. */
  40. c1 = *(uchar*)(str+1) ^ Tx;
  41. if(c1 & Testx)
  42. goto bad;
  43. if(c < T3) {
  44. if(c < T2)
  45. goto bad;
  46. l = ((c << Bitx) | c1) & Rune2;
  47. if(l <= Rune1)
  48. goto bad;
  49. *rune = l;
  50. return 2;
  51. }
  52. /*
  53. * three character sequence
  54. * 0800-FFFF => T3 Tx Tx
  55. */
  56. c2 = *(uchar*)(str+2) ^ Tx;
  57. if(c2 & Testx)
  58. goto bad;
  59. if(c < T4) {
  60. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  61. if(l <= Rune2)
  62. goto bad;
  63. *rune = l;
  64. return 3;
  65. }
  66. /*
  67. * bad decoding
  68. */
  69. bad:
  70. *rune = Bad;
  71. return 1;
  72. }
  73. int
  74. runetochar(char *str, Rune *rune)
  75. {
  76. long c;
  77. /*
  78. * one character sequence
  79. * 00000-0007F => 00-7F
  80. */
  81. c = *rune;
  82. if(c <= Rune1) {
  83. str[0] = c;
  84. return 1;
  85. }
  86. /*
  87. * two character sequence
  88. * 0080-07FF => T2 Tx
  89. */
  90. if(c <= Rune2) {
  91. str[0] = T2 | (c >> 1*Bitx);
  92. str[1] = Tx | (c & Maskx);
  93. return 2;
  94. }
  95. /*
  96. * three character sequence
  97. * 0800-FFFF => T3 Tx Tx
  98. */
  99. str[0] = T3 | (c >> 2*Bitx);
  100. str[1] = Tx | ((c >> 1*Bitx) & Maskx);
  101. str[2] = Tx | (c & Maskx);
  102. return 3;
  103. }
  104. int
  105. runelen(long c)
  106. {
  107. Rune rune;
  108. char str[10];
  109. rune = c;
  110. return runetochar(str, &rune);
  111. }
  112. int
  113. runenlen(Rune *r, int nrune)
  114. {
  115. int nb, c;
  116. nb = 0;
  117. while(nrune--) {
  118. c = *r++;
  119. if(c <= Rune1)
  120. nb++;
  121. else
  122. if(c <= Rune2)
  123. nb += 2;
  124. else
  125. nb += 3;
  126. }
  127. return nb;
  128. }
  129. int
  130. fullrune(char *str, int n)
  131. {
  132. int c;
  133. if(n > 0) {
  134. c = *(uchar*)str;
  135. if(c < Tx)
  136. return 1;
  137. if(n > 1)
  138. if(c < T3 || n > 2)
  139. return 1;
  140. }
  141. return 0;
  142. }