/* Input/output converters for UTF (utf_in, utf_out) and ISO UTF (isoutf_in, isoutf_out). */
- #ifdef PLAN9
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #else
- #include <sys/types.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <unistd.h>
- #include <errno.h>
- #include "plan9.h"
- #endif
- #include "hdr.h"
/*
 * The our_* routines are implementations of the corresponding library
 * routines.  For a while, I tried to actually name them wctomb etc.,
 * but stopped after I found a system which made wchar_t an
 * unsigned char.
 */
- int our_wctomb(char *s, unsigned long wc);
- int our_mbtowc(unsigned long *p, char *s, unsigned n);
- int runetoisoutf(char *str, Rune *rune);
- int fullisorune(char *str, int n);
- int isochartorune(Rune *rune, char *str);
- void
- utf_in(int fd, long *notused, struct convert *out)
- {
- char buf[N];
- int i, j, c, n, tot;
- ulong l;
- USED(notused);
- tot = 0;
- while((n = read(fd, buf+tot, N-tot)) >= 0){
- tot += n;
- for(i=j=0; i<tot; ){
- c = our_mbtowc(&l, buf+i, tot-i);
- if(c == -1)
- break;
- if(c == -2){
- if(squawk)
- EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
- if(clean)
- continue;
- nerrors++;
- l = Runeerror;
- }
- runes[j++] = l;
- i += c;
- }
- OUT(out, runes, j);
- tot -= i;
- ninput += i;
- if(tot)
- memmove(buf, buf+i, tot);
- if(n == 0)
- break;
- }
- }
- void
- utf_out(Rune *base, int n, long *notused)
- {
- char *p;
- Rune *r;
- USED(notused);
- nrunes += n;
- for(r = base, p = obuf; n-- > 0; r++){
- p += our_wctomb(p, *r);
- }
- noutput += p-obuf;
- write(1, obuf, p-obuf);
- }
- void
- isoutf_in(int fd, long *notused, struct convert *out)
- {
- char buf[N];
- int i, j, c, n, tot;
- USED(notused);
- tot = 0;
- while((n = read(fd, buf+tot, N-tot)) >= 0){
- tot += n;
- for(i=j=0; i<tot; ){
- if(!fullisorune(buf+i, tot-i))
- break;
- c = isochartorune(&runes[j], buf+i);
- if(runes[j] == Runeerror){
- if(squawk)
- EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
- if(clean)
- continue;
- nerrors++;
- }
- j++;
- i += c;
- }
- OUT(out, runes, j);
- tot -= i;
- ninput += i;
- if(tot)
- memmove(buf, buf+i, tot);
- if(n == 0)
- break;
- }
- }
- void
- isoutf_out(Rune *base, int n, long *notused)
- {
- char *p;
- Rune *r;
- USED(notused);
- nrunes += n;
- for(r = base, p = obuf; n-- > 0; r++)
- p += runetoisoutf(p, r);
- noutput += p-obuf;
- write(1, obuf, p-obuf);
- }
- enum
- {
- Char1 = Runeself, Rune1 = Runeself,
- Char21 = 0xA1, Rune21 = 0x0100,
- Char22 = 0xF6, Rune22 = 0x4016,
- Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
- Esc = 0xBE, Bad = Runeerror
- };
- static uchar U[256];
- static uchar T[256];
- static
- void
- mktable(void)
- {
- int i, u;
- for(i=0; i<256; i++) {
- u = i + (0x5E - 0xA0);
- if(i < 0xA0)
- u = i + (0xDF - 0x7F);
- if(i < 0x7F)
- u = i + (0x00 - 0x21);
- if(i < 0x21)
- u = i + (0xBE - 0x00);
- U[i] = u;
- T[u] = i;
- }
- }
- int
- isochartorune(Rune *rune, char *str)
- {
- int c, c1, c2;
- long l;
- if(U[0] == 0)
- mktable();
- /*
- * one character sequence
- * 00000-0009F => 00-9F
- */
- c = *(uchar*)str;
- if(c < Char1) {
- *rune = c;
- return 1;
- }
- /*
- * two character sequence
- * 000A0-000FF => A0; A0-FF
- */
- c1 = *(uchar*)(str+1);
- if(c < Char21) {
- if(c1 >= Rune1 && c1 < Rune21) {
- *rune = c1;
- return 2;
- }
- goto bad;
- }
- /*
- * two character sequence
- * 00100-04015 => A1-F5; 21-7E/A0-FF
- */
- c1 = U[c1];
- if(c1 >= Esc)
- goto bad;
- if(c < Char22) {
- *rune = (c-Char21)*Esc + c1 + Rune21;
- return 2;
- }
- /*
- * three character sequence
- * 04016-38E2D => A6-FB; 21-7E/A0-FF
- */
- c2 = U[*(uchar*)(str+2)];
- if(c2 >= Esc)
- goto bad;
- if(c < Char3) {
- l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
- if(l >= Rune3)
- goto bad;
- *rune = l;
- return 3;
- }
- /*
- * bad decoding
- */
- bad:
- *rune = Bad;
- return 1;
- }
- int
- runetoisoutf(char *str, Rune *rune)
- {
- long c;
- if(T[0] == 0)
- mktable();
- /*
- * one character sequence
- * 00000-0009F => 00-9F
- */
- c = *rune;
- if(c < Rune1) {
- str[0] = c;
- return 1;
- }
- /*
- * two character sequence
- * 000A0-000FF => A0; A0-FF
- */
- if(c < Rune21) {
- str[0] = Char1;
- str[1] = c;
- return 2;
- }
- /*
- * two character sequence
- * 00100-04015 => A1-F5; 21-7E/A0-FF
- */
- if(c < Rune22) {
- c -= Rune21;
- str[0] = c/Esc + Char21;
- str[1] = T[c%Esc];
- return 2;
- }
- /*
- * three character sequence
- * 04016-38E2D => A6-FB; 21-7E/A0-FF
- */
- c -= Rune22;
- str[0] = c/(Esc*Esc) + Char22;
- str[1] = T[c/Esc%Esc];
- str[2] = T[c%Esc];
- return 3;
- }
- int
- fullisorune(char *str, int n)
- {
- int c;
- if(n > 0) {
- c = *(uchar*)str;
- if(c < Char1)
- return 1;
- if(n > 1)
- if(c < Char22 || n > 2)
- return 1;
- }
- return 0;
- }
- #ifdef PLAN9
- int errno;
- #endif
/*
 * Bit-layout constants for the UTF-8 codec in our_wctomb() and
 * our_mbtowc().
 */
enum
{
	/* tag bits of each kind of byte */
	T1	= 0x00,		/* 0xxxxxxx: single byte */
	Tx	= 0x80,		/* 10xxxxxx: continuation byte */
	T2	= 0xC0,		/* 110xxxxx: lead of 2 bytes */
	T3	= 0xE0,		/* 1110xxxx: lead of 3 bytes */
	T4	= 0xF0,		/* 11110xxx: lead of 4 bytes */
	T5	= 0xF8,		/* 111110xx: lead of 5 bytes */
	T6	= 0xFC,		/* 111111xx: lead of 6 bytes */

	/* payload bits carried by each kind of byte */
	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,		/* two bits, so 6-byte leads range over FC-FF */

	/* masks selecting those payload bits */
	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	/* largest value representable in 1..5 bytes */
	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1

#ifndef EILSEQ
	,	/* we hate ansi c's comma rules */
	EILSEQ	= 123
#endif /* EILSEQ */
};
- int
- our_wctomb(char *s, unsigned long wc)
- {
- if(s == 0)
- return 0; /* no shift states */
- if(wc & ~Wchar2) {
- if(wc & ~Wchar4) {
- if(wc & ~Wchar5) {
- /* 6 bytes */
- s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
- s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
- s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
- s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
- s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
- s[5] = Tx | (wc & Maskx);
- return 6;
- }
- /* 5 bytes */
- s[0] = T5 | (wc >> 4*Bitx);
- s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
- s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
- s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
- s[4] = Tx | (wc & Maskx);
- return 5;
- }
- if(wc & ~Wchar3) {
- /* 4 bytes */
- s[0] = T4 | (wc >> 3*Bitx);
- s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
- s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
- s[3] = Tx | (wc & Maskx);
- return 4;
- }
- /* 3 bytes */
- s[0] = T3 | (wc >> 2*Bitx);
- s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
- s[2] = Tx | (wc & Maskx);
- return 3;
- }
- if(wc & ~Wchar1) {
- /* 2 bytes */
- s[0] = T2 | (wc >> 1*Bitx);
- s[1] = Tx | (wc & Maskx);
- return 2;
- }
- /* 1 byte */
- s[0] = T1 | wc;
- return 1;
- }
- int
- our_mbtowc(unsigned long *p, char *s, unsigned n)
- {
- uchar *us;
- int c0, c1, c2, c3, c4, c5;
- unsigned long wc;
- if(s == 0)
- return 0; /* no shift states */
- if(n < 1)
- goto badlen;
- us = (uchar*)s;
- c0 = us[0];
- if(c0 >= T3) {
- if(n < 3)
- goto badlen;
- c1 = us[1] ^ Tx;
- c2 = us[2] ^ Tx;
- if((c1|c2) & T2)
- goto bad;
- if(c0 >= T5) {
- if(n < 5)
- goto badlen;
- c3 = us[3] ^ Tx;
- c4 = us[4] ^ Tx;
- if((c3|c4) & T2)
- goto bad;
- if(c0 >= T6) {
- /* 6 bytes */
- if(n < 6)
- goto badlen;
- c5 = us[5] ^ Tx;
- if(c5 & T2)
- goto bad;
- wc = ((((((((((c0 & Mask6) << Bitx) |
- c1) << Bitx) | c2) << Bitx) |
- c3) << Bitx) | c4) << Bitx) | c5;
- if(wc <= Wchar5)
- goto bad;
- *p = wc;
- return 6;
- }
- /* 5 bytes */
- wc = ((((((((c0 & Mask5) << Bitx) |
- c1) << Bitx) | c2) << Bitx) |
- c3) << Bitx) | c4;
- if(wc <= Wchar4)
- goto bad;
- *p = wc;
- return 5;
- }
- if(c0 >= T4) {
- /* 4 bytes */
- if(n < 4)
- goto badlen;
- c3 = us[3] ^ Tx;
- if(c3 & T2)
- goto bad;
- wc = ((((((c0 & Mask4) << Bitx) |
- c1) << Bitx) | c2) << Bitx) |
- c3;
- if(wc <= Wchar3)
- goto bad;
- *p = wc;
- return 4;
- }
- /* 3 bytes */
- wc = ((((c0 & Mask3) << Bitx) |
- c1) << Bitx) | c2;
- if(wc <= Wchar2)
- goto bad;
- *p = wc;
- return 3;
- }
- if(c0 >= T2) {
- /* 2 bytes */
- if(n < 2)
- goto badlen;
- c1 = us[1] ^ Tx;
- if(c1 & T2)
- goto bad;
- wc = ((c0 & Mask2) << Bitx) |
- c1;
- if(wc <= Wchar1)
- goto bad;
- *p = wc;
- return 2;
- }
- /* 1 byte */
- if(c0 >= Tx)
- goto bad;
- *p = c0;
- return 1;
- bad:
- errno = EILSEQ;
- return -1;
- badlen:
- return -2;
- }
|