123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535 |
- #ifdef PLAN9
- #include <u.h>
- #include <libc.h>
- #include <bio.h>
- #else
- #include <stdio.h>
- #include <unistd.h>
- #include "plan9.h"
- #endif
- #include "hdr.h"
- #include "conv.h"
- #include "kuten208.h"
- #include "jis.h"
- /*
- a state machine for interpreting all sorts of encodings
- */
- static void
- alljis(int c, Rune **r, long input_loc)
- {
- static enum { state0, state1, state2, state3, state4 } state = state0;
- static int set8 = 0;
- static int japan646 = 0;
- static int lastc;
- int n;
- long l;
- again:
- switch(state)
- {
- case state0: /* idle state */
- if(c == ESC){ state = state1; return; }
- if(c < 0) return;
- if(!set8 && (c < 128)){
- if(japan646){
- switch(c)
- {
- case '\\': emit(0xA5); return; /* yen */
- case '~': emit(0xAF); return; /* spacing macron */
- default: emit(c); return;
- }
- } else {
- emit(c);
- return;
- }
- }
- if(c < 0x21){ /* guard against bogus characters in JIS mode */
- if(squawk)
- EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
- emit(c);
- return;
- }
- lastc = c; state = state4; return;
- case state1: /* seen an escape */
- if(c == '$'){ state = state2; return; }
- if(c == '('){ state = state3; return; }
- emit(ESC); state = state0; goto again;
- case state2: /* may be shifting into JIS */
- if((c == '@') || (c == 'B')){
- set8 = 1; state = state0; return;
- }
- emit(ESC); emit('$'); state = state0; goto again;
- case state3: /* may be shifting out of JIS */
- if((c == 'J') || (c == 'H') || (c == 'B')){
- japan646 = (c == 'J');
- set8 = 0; state = state0; return;
- }
- emit(ESC); emit('('); state = state0; goto again;
- case state4: /* two part char */
- if(c < 0){
- if(squawk)
- EPR "%s: unexpected EOF in %s\n", argv0, file);
- c = 0x21 | (lastc&0x80);
- }
- if(CANS2J(lastc, c)){ /* ms dos sjis */
- int hi = lastc, lo = c;
- S2J(hi, lo); /* convert to 208 */
- n = hi*100 + lo - 3232; /* convert to kuten208 */
- } else
- n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
- if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
- nerrors++;
- if(squawk)
- EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
- if(!clean)
- emit(BADMAP);
- } else {
- if(l < 0){
- l = -l;
- if(squawk)
- EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
- }
- emit(l);
- }
- state = state0;
- }
- }
- /*
- a state machine for interpreting ms-kanji == shift-jis.
- */
- static void
- ms(int c, Rune **r, long input_loc)
- {
- static enum { state0, state1, state2, state3, state4 } state = state0;
- static int set8 = 0;
- static int japan646 = 0;
- static int lastc;
- int n;
- long l;
- again:
- switch(state)
- {
- case state0: /* idle state */
- if(c == ESC){ state = state1; return; }
- if(c < 0) return;
- if(!set8 && (c < 128)){
- if(japan646){
- switch(c)
- {
- case '\\': emit(0xA5); return; /* yen */
- case '~': emit(0xAF); return; /* spacing macron */
- default: emit(c); return;
- }
- } else {
- emit(c);
- return;
- }
- }
- lastc = c; state = state4; return;
- case state1: /* seen an escape */
- if(c == '$'){ state = state2; return; }
- if(c == '('){ state = state3; return; }
- emit(ESC); state = state0; goto again;
- case state2: /* may be shifting into JIS */
- if((c == '@') || (c == 'B')){
- set8 = 1; state = state0; return;
- }
- emit(ESC); emit('$'); state = state0; goto again;
- case state3: /* may be shifting out of JIS */
- if((c == 'J') || (c == 'H') || (c == 'B')){
- japan646 = (c == 'J');
- set8 = 0; state = state0; return;
- }
- emit(ESC); emit('('); state = state0; goto again;
- case state4: /* two part char */
- if(c < 0){
- if(squawk)
- EPR "%s: unexpected EOF in %s\n", argv0, file);
- c = 0x21 | (lastc&0x80);
- }
- if(CANS2J(lastc, c)){ /* ms dos sjis */
- int hi = lastc, lo = c;
- S2J(hi, lo); /* convert to 208 */
- n = hi*100 + lo - 3232; /* convert to kuten208 */
- } else {
- nerrors++;
- if(squawk)
- EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
- if(!clean)
- emit(BADMAP);
- state = state0;
- goto again;
- }
- if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
- nerrors++;
- if(squawk)
- EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
- if(!clean)
- emit(BADMAP);
- } else {
- if(l < 0){
- l = -l;
- if(squawk)
- EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
- }
- emit(l);
- }
- state = state0;
- }
- }
- /*
- a state machine for interpreting ujis == EUC
- */
- static void
- ujis(int c, Rune **r, long input_loc)
- {
- static enum { state0, state1 } state = state0;
- static int lastc;
- int n;
- long l;
- switch(state)
- {
- case state0: /* idle state */
- if(c < 0) return;
- if(c < 128){
- emit(c);
- return;
- }
- if(c == 0x8e){ /* codeset 2 */
- nerrors++;
- if(squawk)
- EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
- if(!clean)
- emit(BADMAP);
- return;
- }
- if(c == 0x8f){ /* codeset 3 */
- nerrors++;
- if(squawk)
- EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
- if(!clean)
- emit(BADMAP);
- return;
- }
- lastc = c;
- state = state1;
- return;
- case state1: /* two part char */
- if(c < 0){
- if(squawk)
- EPR "%s: unexpected EOF in %s\n", argv0, file);
- c = 0xA1;
- }
- n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
- if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
- nerrors++;
- if(squawk)
- EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
- if(!clean)
- emit(BADMAP);
- } else {
- if(l < 0){
- l = -l;
- if(squawk)
- EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
- }
- emit(l);
- }
- state = state0;
- }
- }
- /*
- a state machine for interpreting jis-kanji == 2022-JP
- */
- static void
- jis(int c, Rune **r, long input_loc)
- {
- static enum { state0, state1, state2, state3, state4 } state = state0;
- static int set8 = 0;
- static int japan646 = 0;
- static int lastc;
- int n;
- long l;
- again:
- switch(state)
- {
- case state0: /* idle state */
- if(c == ESC){ state = state1; return; }
- if(c < 0) return;
- if(!set8 && (c < 128)){
- if(japan646){
- switch(c)
- {
- case '\\': emit(0xA5); return; /* yen */
- case '~': emit(0xAF); return; /* spacing macron */
- default: emit(c); return;
- }
- } else {
- emit(c);
- return;
- }
- }
- lastc = c; state = state4; return;
- case state1: /* seen an escape */
- if(c == '$'){ state = state2; return; }
- if(c == '('){ state = state3; return; }
- emit(ESC); state = state0; goto again;
- case state2: /* may be shifting into JIS */
- if((c == '@') || (c == 'B')){
- set8 = 1; state = state0; return;
- }
- emit(ESC); emit('$'); state = state0; goto again;
- case state3: /* may be shifting out of JIS */
- if((c == 'J') || (c == 'H') || (c == 'B')){
- japan646 = (c == 'J');
- set8 = 0; state = state0; return;
- }
- emit(ESC); emit('('); state = state0; goto again;
- case state4: /* two part char */
- if(c < 0){
- if(squawk)
- EPR "%s: unexpected EOF in %s\n", argv0, file);
- c = 0x21 | (lastc&0x80);
- }
- if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
- emit(lastc);
- state = state0;
- goto again;
- }
- n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
- if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
- nerrors++;
- if(squawk)
- EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
- if(!clean)
- emit(BADMAP);
- } else {
- if(l < 0){
- l = -l;
- if(squawk)
- EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
- }
- emit(l);
- }
- state = state0;
- }
- }
- static void
- do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
- {
- Rune ob[N];
- Rune *r, *re;
- uchar ibuf[N];
- int n, i;
- long nin;
- r = ob;
- re = ob+N-3;
- nin = 0;
- while((n = read(fd, ibuf, sizeof ibuf)) > 0){
- for(i = 0; i < n; i++){
- (*procfn)(ibuf[i], &r, nin++);
- if(r >= re){
- OUT(out, ob, r-ob);
- r = ob;
- }
- }
- if(r > ob){
- OUT(out, ob, r-ob);
- r = ob;
- }
- }
- (*procfn)(-1, &r, nin);
- if(r > ob)
- OUT(out, ob, r-ob);
- }
- void
- jis_in(int fd, long *notused, struct convert *out)
- {
- USED(notused);
- do_in(fd, alljis, out);
- }
- void
- ujis_in(int fd, long *notused, struct convert *out)
- {
- USED(notused);
- do_in(fd, ujis, out);
- }
- void
- msjis_in(int fd, long *notused, struct convert *out)
- {
- USED(notused);
- do_in(fd, ms, out);
- }
- void
- jisjis_in(int fd, long *notused, struct convert *out)
- {
- USED(notused);
- do_in(fd, jis, out);
- }
- static int first = 1;
- static void
- tab_init(void)
- {
- int i;
- long l;
- first = 0;
- for(i = 0; i < NRUNE; i++)
- tab[i] = -1;
- for(i = 0; i < KUTEN208MAX; i++)
- if((l = tabkuten208[i]) != -1){
- if(l < 0)
- tab[-l] = i;
- else
- tab[l] = i;
- }
- }
- /* jis-kanji, or ISO 2022-JP */
- void
- jisjis_out(Rune *base, int n, long *notused)
- {
- char *p;
- int i;
- Rune r;
- static enum { ascii, japan646, jp2022 } state = ascii;
- USED(notused);
- if(first)
- tab_init();
- nrunes += n;
- p = obuf;
- for(i = 0; i < n; i++){
- r = base[i];
- if(r < 128){
- if(state == jp2022){
- *p++ = ESC; *p++ = '('; *p++ = 'B';
- state = ascii;
- }
- *p++ = r;
- } else {
- if(tab[r] != -1){
- if(state != jp2022){
- *p++ = ESC; *p++ = '$'; *p++ = 'B';
- state = jp2022;
- }
- *p++ = tab[r]/100 + ' ';
- *p++ = tab[r]%100 + ' ';
- continue;
- }
- if(squawk)
- EPR "%s: rune 0x%x not in output cs\n", argv0, r);
- nerrors++;
- if(clean)
- continue;
- *p++ = BYTEBADMAP;
- }
- }
- noutput += p-obuf;
- if(p > obuf)
- write(1, obuf, p-obuf);
- }
- /* ms-kanji, or Shift-JIS */
- void
- msjis_out(Rune *base, int n, long *notused)
- {
- char *p;
- int i, hi, lo;
- Rune r;
- USED(notused);
- if(first)
- tab_init();
- nrunes += n;
- p = obuf;
- for(i = 0; i < n; i++){
- r = base[i];
- if(r < 128)
- *p++ = r;
- else {
- if(tab[r] != -1){
- hi = tab[r]/100 + ' ';
- lo = tab[r]%100 + ' ';
- J2S(hi, lo);
- *p++ = hi;
- *p++ = lo;
- continue;
- }
- if(squawk)
- EPR "%s: rune 0x%x not in output cs\n", argv0, r);
- nerrors++;
- if(clean)
- continue;
- *p++ = BYTEBADMAP;
- }
- }
- noutput += p-obuf;
- if(p > obuf)
- write(1, obuf, p-obuf);
- }
- /* ujis, or EUC */
- void
- ujis_out(Rune *base, int n, long *notused)
- {
- char *p;
- int i;
- Rune r;
- USED(notused);
- if(first)
- tab_init();
- nrunes += n;
- p = obuf;
- for(i = 0; i < n; i++){
- r = base[i];
- if(r < 128)
- *p++ = r;
- else {
- if(tab[r] != -1){
- *p++ = 0x80 | (tab[r]/100 + ' ');
- *p++ = 0x80 | (tab[r]%100 + ' ');
- continue;
- }
- if(squawk)
- EPR "%s: rune 0x%x not in output cs\n", argv0, r);
- nerrors++;
- if(clean)
- continue;
- *p++ = BYTEBADMAP;
- }
- }
- noutput += p-obuf;
- if(p > obuf)
- write(1, obuf, p-obuf);
- }
|