Browse Source

Plan 9 from Bell Labs 2013-04-24

David du Colombier 11 years ago
parent
commit
3a1fac6bb6

+ 0 - 7
sys/man/1/wc

@@ -37,13 +37,6 @@ are reported.
 .SH SOURCE
 .B /sys/src/cmd/wc.c
 .SH BUGS
-The Unicode Standard has many blank characters scattered through it,
-but
-.I wc
-looks for only
-.SM ASCII
-space, tab and newline.
-.PP
 .I Wc
 should have options to count suboptimal
 .SM UTF

+ 2 - 1
sys/src/9/pc/sdiahci.c

@@ -1938,7 +1938,8 @@ didtype(Pcidev *p)
 		/*
 		 * 0x27c4 is the intel 82801 in compatibility (not sata) mode.
 		 */
-		if (p->did == 0x24d1 ||			/* 82801eb/er */
+		if (p->did == 0x1e02 ||			/* c210 */
+		    p->did == 0x24d1 ||			/* 82801eb/er */
 		    (p->did & 0xfffb) == 0x27c1 ||	/* 82801g[bh]m ich7 */
 		    p->did == 0x2821 ||			/* 82801h[roh] */
 		    (p->did & 0xfffe) == 0x2824 ||	/* 82801h[b] */

+ 2 - 0
sys/src/9/port/fpi.c

@@ -291,6 +291,8 @@ fpicmp(Internal *x, Internal *y)
 		return 0;
 	if(IsInfinity(x) && IsInfinity(y))
 		return y->s - x->s;
+	if(IsZero(x) && IsZero(y))
+		return 0;
 	if(x->e == y->e && x->h == y->h && x->l == y->l)
 		return y->s - x->s;
 	if(x->e < y->e

+ 26 - 23
sys/src/cmd/acme/regx.c

@@ -20,7 +20,7 @@ Rune		*lastregexp;
 typedef struct Inst Inst;
 struct Inst
 {
-	uint	type;	/* < 0x10000 ==> literal, otherwise action */
+	uint	type;	/* <= Runemax ==> literal, otherwise action */
 	union {
 		int sid;
 		int subid;
@@ -61,25 +61,28 @@ static	Rangeset sempty;
  *	0x100xx are operators, value == precedence
  *	0x200xx are tokens, i.e. operands for operators
  */
-#define	OPERATOR	0x10000	/* Bitmask of all operators */
-#define	START		0x10000	/* Start, used for marker on stack */
-#define	RBRA		0x10001	/* Right bracket, ) */
-#define	LBRA		0x10002	/* Left bracket, ( */
-#define	OR		0x10003	/* Alternation, | */
-#define	CAT		0x10004	/* Concatentation, implicit operator */
-#define	STAR		0x10005	/* Closure, * */
-#define	PLUS		0x10006	/* a+ == aa* */
-#define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
-#define	ANY		0x20000	/* Any character but newline, . */
-#define	NOP		0x20001	/* No operation, internal use only */
-#define	BOL		0x20002	/* Beginning of line, ^ */
-#define	EOL		0x20003	/* End of line, $ */
-#define	CCLASS		0x20004	/* Character class, [] */
-#define	NCCLASS		0x20005	/* Negated character class, [^] */
-#define	END		0x20077	/* Terminate: match found */
-
-#define	ISATOR		0x10000
-#define	ISAND		0x20000
+enum {
+	OPERATOR = Runemask+1,	/* Bitmask of all operators */
+	START	= OPERATOR,	/* Start, used for marker on stack */
+	RBRA,			/* Right bracket, ) */
+	LBRA,			/* Left bracket, ( */
+	OR,			/* Alternation, | */
+	CAT,			/* Concatenation, implicit operator */
+	STAR,			/* Closure, * */
+	PLUS,			/* a+ == aa* */
+	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
+
+	ANY	= OPERATOR<<1,	/* Any character but newline, . */
+	NOP,			/* No operation, internal use only */
+	BOL,			/* Beginning of line, ^ */
+	EOL,			/* End of line, $ */
+	CCLASS,			/* Character class, [] */
+	NCCLASS,		/* Negated character class, [^] */
+	END,			/* Terminate: match found */
+
+	ISATOR	= OPERATOR,
+	ISAND	= OPERATOR<<1,
+};
 
 /*
  * Parser Information
@@ -452,7 +455,7 @@ nextrec(void)
 			exprp++;
 			return '\n';
 		}
-		return *exprp++|0x10000;
+		return *exprp++|(Runemax+1);
 	}
 	return *exprp++;
 }
@@ -487,7 +490,7 @@ bldcclass(void)
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -509,7 +512,7 @@ classmatch(int classno, int c, int negate)
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;

+ 7 - 2
sys/src/cmd/cc/cc.h

@@ -20,6 +20,8 @@ typedef	struct	Term	Term;
 typedef	struct	Init	Init;
 typedef	struct	Bits	Bits;
 
+typedef	Rune	TRune;	/* target system type */
+
 #define	NHUNK		50000L
 #define	BUFSIZ		8192
 #define	NSYMB		1500
@@ -51,7 +53,7 @@ struct	Node
 	double	fconst;		/* fp constant */
 	vlong	vconst;		/* non fp const */
 	char*	cstring;	/* character string */
-	ushort*	rstring;	/* rune string */
+	TRune*	rstring;	/* rune string */
 
 	Sym*	sym;
 	Type*	type;
@@ -336,6 +338,9 @@ enum
 	TFILE,
 	TOLD,
 	NALLTYPES,
+
+	/* adapt size of Rune to target system's size */
+	TRUNE = sizeof(TRune)==4? TUINT: TUSHORT,
 };
 enum
 {
@@ -740,7 +745,7 @@ void	gclean(void);
 void	gextern(Sym*, Node*, long, long);
 void	ginit(void);
 long	outstring(char*, long);
-long	outlstring(ushort*, long);
+long	outlstring(TRune*, long);
 void	xcom(Node*);
 long	exreg(Type*);
 long	align(long, Type*, int);

+ 6 - 6
sys/src/cmd/cc/cc.y

@@ -855,9 +855,9 @@ lstring:
 	LLSTRING
 	{
 		$$ = new(OLSTRING, Z, Z);
-		$$->type = typ(TARRAY, types[TUSHORT]);
-		$$->type->width = $1.l + sizeof(ushort);
-		$$->rstring = (ushort*)$1.s;
+		$$->type = typ(TARRAY, types[TRUNE]);
+		$$->type->width = $1.l + sizeof(TRune);
+		$$->rstring = (TRune*)$1.s;
 		$$->sym = symstring;
 		$$->etype = TARRAY;
 		$$->class = CSTATIC;
@@ -867,16 +867,16 @@ lstring:
 		char *s;
 		int n;
 
-		n = $1->type->width - sizeof(ushort);
+		n = $1->type->width - sizeof(TRune);
 		s = alloc(n+$2.l+MAXALIGN);
 
 		memcpy(s, $1->rstring, n);
 		memcpy(s+n, $2.s, $2.l);
-		*(ushort*)(s+n+$2.l) = 0;
+		*(TRune*)(s+n+$2.l) = 0;
 
 		$$ = $1;
 		$$->type->width += $2.l;
-		$$->rstring = (ushort*)s;
+		$$->rstring = (TRune*)s;
 	}
 
 zelist:

+ 3 - 2
sys/src/cmd/cc/com.c

@@ -67,6 +67,7 @@ tcomo(Node *n, int f)
 	Node *l, *r;
 	Type *t;
 	int o;
+	static TRune zer;
 
 	if(n == Z) {
 		diag(Z, "Z in tcom");
@@ -633,10 +634,10 @@ tcomo(Node *n, int f)
 		break;
 
 	case OLSTRING:
-		if(n->type->link != types[TUSHORT]) {
+		if(n->type->link != types[TRUNE]) {
 			o = outstring(0, 0);
 			while(o & 3) {
-				outlstring(L"", sizeof(ushort));
+				outlstring(&zer, sizeof(TRune));
 				o = outlstring(0, 0);
 			}
 		}

+ 1 - 1
sys/src/cmd/cc/dcl.c

@@ -232,7 +232,7 @@ nextinit(void)
 			a->cstring++;
 		}
 		if(a->op == OLSTRING) {
-			b->vconst = convvtox(*a->rstring, TUSHORT);
+			b->vconst = convvtox(*a->rstring, TRUNE);
 			a->rstring++;
 		}
 		a->type->width -= b->type->width;

+ 10 - 9
sys/src/cmd/cc/lex.c

@@ -80,7 +80,8 @@ main(int argc, char *argv[])
 
 	case 'I':
 		p = ARGF();
-		setinclude(p);
+		if(p)
+			setinclude(p);
 		break;
 	} ARGEND
 	if(argc < 1 && outfile == 0) {
@@ -465,7 +466,7 @@ l1:
 				yyerror("missing '");
 				peekc = c1;
 			}
-			yylval.vval = convvtox(c, TUSHORT);
+			yylval.vval = convvtox(c, TRUNE);
 			return LUCONST;
 		}
 		if(c == '"') {
@@ -539,15 +540,15 @@ l1:
 			c = escchar('"', 1, 0);
 			if(c == EOF)
 				break;
-			cp = allocn(cp, c1, sizeof(ushort));
-			*(ushort*)(cp + c1) = c;
-			c1 += sizeof(ushort);
+			cp = allocn(cp, c1, sizeof(TRune));
+			*(TRune*)(cp + c1) = c;
+			c1 += sizeof(TRune);
 		}
 		yylval.sval.l = c1;
 		do {
-			cp = allocn(cp, c1, sizeof(ushort));
-			*(ushort*)(cp + c1) = 0;
-			c1 += sizeof(ushort);
+			cp = allocn(cp, c1, sizeof(TRune));
+			*(TRune*)(cp + c1) = 0;
+			c1 += sizeof(TRune);
 		} while(c1 & MAXALIGN);
 		yylval.sval.s = cp;
 		return LLSTRING;
@@ -1025,7 +1026,7 @@ getnsc(void)
 	} else
 		c = GETC();
 	for(;;) {
-		if(!isspace(c))
+		if(c >= Runeself || !isspace(c))
 			return c;
 		if(c == '\n') {
 			lineno++;

+ 11 - 10
sys/src/cmd/cc/pswt.c

@@ -132,28 +132,29 @@ casf(void)
 }
 
 long
-outlstring(ushort *s, long n)
+outlstring(TRune *s, long n)
 {
-	char buf[2];
-	int c;
+	char buf[sizeof(TRune)];
+	uint c;
+	int i;
 	long r;
 
 	if(suppress)
 		return nstring;
-	while(nstring & 1)
+	while(nstring & (sizeof(TRune)-1))
 		outstring("", 1);
 	r = nstring;
 	while(n > 0) {
 		c = *s++;
 		if(align(0, types[TCHAR], Aarg1)) {
-			buf[0] = c>>8;
-			buf[1] = c;
+			for(i = 0; i < sizeof(TRune); i++)
+				buf[i] = c>>(8*(sizeof(TRune) - i - 1));
 		} else {
-			buf[0] = c;
-			buf[1] = c>>8;
+			for(i = 0; i < sizeof(TRune); i++)
+				buf[i] = c>>(8*i);
 		}
-		outstring(buf, 2);
-		n -= sizeof(ushort);
+		outstring(buf, sizeof(TRune));
+		n -= sizeof(TRune);
 	}
 	return r;
 }

+ 4 - 1
sys/src/cmd/cc/sub.c

@@ -85,7 +85,10 @@ prtree1(Node *n, int d, int f)
 		break;
 
 	case OLSTRING:
-		print(" \"%S\"", n->rstring);
+		if(sizeof(TRune) == sizeof(Rune))
+			print(" \"%S\"", (Rune*)n->rstring);
+		else
+			print(" \"...\"");
 		i = 0;
 		break;
 

+ 14 - 12
sys/src/cmd/ed.c

@@ -15,7 +15,7 @@ enum
 	ESIZE	= 256,		/* max size of reg exp */
 	GBSIZE	= 256,		/* max size of global command */
 	MAXSUB	= 9,		/* max number of sub reg exp */
-	ESCFLG	= 0xFFFF,	/* escape Rune - user defined code */
+	ESCFLG	= Runemax,	/* escape Rune - user defined code */
 	EOF	= -1,
 };
 
@@ -54,7 +54,7 @@ Reprog	*pattern;
 int	peekc;
 int	pflag;
 int	rescuing;
-Rune	rhsbuf[LBSIZE/2];
+Rune	rhsbuf[LBSIZE/sizeof(Rune)];
 char	savedfile[FNSIZE];
 jmp_buf	savej;
 int	subnewa;
@@ -735,7 +735,7 @@ gety(void)
 		if(c == 0)
 			continue;
 		*p++ = c;
-		if(p >= &linebuf[LBSIZE-2])
+		if(p >= &linebuf[LBSIZE-sizeof(Rune)])
 			error(Q);
 	}
 }
@@ -988,11 +988,12 @@ getline(int tl)
 	lp = linebuf;
 	bp = getblock(tl, OREAD);
 	nl = nleft;
-	tl &= ~((BLKSIZE/2) - 1);
+	tl &= ~((BLKSIZE/sizeof(Rune)) - 1);
 	while(*lp++ = *bp++) {
 		nl -= sizeof(Rune);
 		if(nl == 0) {
-			bp = getblock(tl += BLKSIZE/2, OREAD);
+			tl += BLKSIZE/sizeof(Rune);
+			bp = getblock(tl, OREAD);
 			nl = nleft;
 		}
 	}
@@ -1010,7 +1011,7 @@ putline(void)
 	tl = tline;
 	bp = getblock(tl, OWRITE);
 	nl = nleft;
-	tl &= ~((BLKSIZE/2)-1);
+	tl &= ~((BLKSIZE/sizeof(Rune))-1);
 	while(*bp = *lp++) {
 		if(*bp++ == '\n') {
 			bp[-1] = 0;
@@ -1019,7 +1020,7 @@ putline(void)
 		}
 		nl -= sizeof(Rune);
 		if(nl == 0) {
-			tl += BLKSIZE/2;
+			tl += BLKSIZE/sizeof(Rune);
 			bp = getblock(tl, OWRITE);
 			nl = nleft;
 		}
@@ -1046,8 +1047,9 @@ getblock(int atl, int iof)
 	static uchar ibuff[BLKSIZE];
 	static uchar obuff[BLKSIZE];
 
-	bno = atl / (BLKSIZE/2);
-	off = (atl<<1) & (BLKSIZE-1) & ~03;
+	bno = atl / (BLKSIZE/sizeof(Rune));
+	/* &~3 so the ptr is aligned to 4 (?) */
+	off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~3;
 	if(bno >= NBLK) {
 		lastc = '\n';
 		error(T);
@@ -1160,7 +1162,7 @@ join(void)
 	for(a1=addr1; a1<=addr2; a1++) {
 		lp = getline(*a1);
 		while(*gp = *lp++)
-			if(gp++ >= &genbuf[LBSIZE-2])
+			if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)])
 				error(Q);
 	}
 	lp = linebuf;
@@ -1238,7 +1240,7 @@ compsub(void)
 		if(c == '\\') {
 			c = getchr();
 			*p++ = ESCFLG;
-			if(p >= &rhsbuf[LBSIZE/2])
+			if(p >= &rhsbuf[LBSIZE/sizeof(Rune)])
 				error(Q);
 		} else
 		if(c == '\n' && (!globp || !globp[0])) {
@@ -1249,7 +1251,7 @@ compsub(void)
 		if(c == seof)
 			break;
 		*p++ = c;
-		if(p >= &rhsbuf[LBSIZE/2])
+		if(p >= &rhsbuf[LBSIZE/sizeof(Rune)])
 			error(Q);
 	}
 	*p = 0;

+ 3 - 57
sys/src/cmd/file.c

@@ -267,64 +267,10 @@ type(char *file, int nlen)
 	close(fd);
 }
 
-/*
- * Unicode 4.0 4-byte runes.
- */
-typedef int Rune1;
-
-enum {
-	UTFmax1 = 4,
-};
-
-int
-fullrune1(char *p, int n)
-{
-	int c;
-
-	if(n >= 1) {
-		c = *(uchar*)p;
-		if(c < 0x80)
-			return 1;
-		if(n >= 2 && c < 0xE0)
-			return 1;
-		if(n >= 3 && c < 0xF0)
-			return 1;
-		if(n >= 4)
-			return 1;
-	}
-	return 0;
-}
-
-int
-chartorune1(Rune1 *rune, char *str)
-{
-	int c, c1, c2, c3, n;
-	Rune r;
-
-	c = *(uchar*)str;
-	if(c < 0xF0){
-		r = 0;
-		n = chartorune(&r, str);
-		*rune = r;
-		return n;
-	}
-	c &= ~0xF0;
-	c1 = *(uchar*)(str+1) & ~0x80;
-	c2 = *(uchar*)(str+2) & ~0x80;
-	c3 = *(uchar*)(str+3) & ~0x80;
-	n = (c<<18) | (c1<<12) | (c2<<6) | c3;
-	if(n < 0x10000 || n > 0x10FFFF){
-		*rune = Runeerror;
-		return 1;
-	}
-	*rune = n;
-	return 4;
-}
-
 void
 filetype(int fd)
 {
-	Rune1 r;
+	Rune r;
 	int i, f, n;
 	char *p, *eob;
 
@@ -363,9 +309,9 @@ filetype(int fd)
 		language[i].count = 0;
 	eob = (char *)buf+nbuf;
 	for(n = 0, p = (char *)buf; p < eob; n++) {
-		if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
+		if (!fullrune(p, eob-p) && eob-p < UTFmax)
 			break;
-		p += chartorune1(&r, p);
+		p += chartorune(&r, p);
 		if (r == 0)
 			f = Cnull;
 		else if (r <= 0x7f) {

+ 1 - 1
sys/src/cmd/freq.c

@@ -2,7 +2,7 @@
 #include <libc.h>
 #include <bio.h>
 
-uvlong	count[1<<16];
+uvlong	count[Runemax+1];
 Biobuf	bout;
 
 void	usage(void);

+ 1 - 1
sys/src/cmd/grep/comp.c

@@ -275,7 +275,7 @@ re2class(char *s)
 			x = re2or(x, rclass(ov, p[0]-1));
 			ov = p[1]+1;
 		}
-		x = re2or(x, rclass(ov, 0xffff));
+		x = re2or(x, rclass(ov, Runemask));
 	} else {
 		x = rclass(p[0], p[1]);
 		for(p+=2; *p; p+=2)

+ 1 - 1
sys/src/cmd/grep/grep.h

@@ -53,7 +53,7 @@ enum
 
 	Caselim		= 7,
 	Nhunk		= 1<<16,
-	Cbegin		= 0x10000,
+	Cbegin		= Runemax+1,
 	Flshcnt		= (1<<9)-1,
 
 	Cflag		= 1<<0,

+ 7 - 1
sys/src/cmd/htmlroff/char.c

@@ -16,6 +16,12 @@ rune2html(Rune r)
 	if(r == '\n')
 		return L("\n");
 
+	if(((uint)r&~0xFFFF) != 0){
+		/* The cache must grow a lot to handle them */
+		fprint(2, "%s: can't handle rune '%C'\n", argv0, r);
+		return L("?");
+	}
+
 	if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
 		return tcscache[r>>8][r&0xFF];
 
@@ -59,7 +65,7 @@ rune2html(Rune r)
 typedef struct Trtab Trtab;
 struct Trtab
 {
-	char t[3];
+	char t[UTFmax];
 	Rune r;
 };
 

+ 14 - 19
sys/src/cmd/rc/glob.c

@@ -111,25 +111,22 @@ glob(void *ap)
 	else
 		globsort(globv, svglobv);
 }
+
 /*
  * Do p and q point at equal utf codes
  */
-
 int
 equtf(uchar *p, uchar *q)
 {
+	Rune pr, qr;
 	if(*p!=*q)
 		return 0;
-	if(twobyte(*p)) return p[1]==q[1];
-	if(threebyte(*p)){
-		if(p[1]!=q[1])
-			return 0;
-		if(p[1]=='\0')
-			return 1;	/* broken code at end of string! */
-		return p[2]==q[2];
-	}
-	return 1;
+	
+	chartorune(&pr, (char*)p);
+	chartorune(&qr, (char*)q);
+	return pr == qr;
 }
+
 /*
  * Return a pointer to the next utf code in the string,
  * not jumping past nuls in broken utf codes!
@@ -138,10 +135,10 @@ equtf(uchar *p, uchar *q)
 uchar*
 nextutf(uchar *p)
 {
-	if(twobyte(*p)) return p[1]=='\0'?p+1:p+2;
-	if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3;
-	return p+1;
+	Rune dummy;
+	return p + chartorune(&dummy, (char*)p);
 }
+
 /*
  * Convert the utf code at *p to a unicode value
  */
@@ -149,14 +146,12 @@ nextutf(uchar *p)
 int
 unicode(uchar *p)
 {
-	int u = *p;
+	Rune r;
 
-	if(twobyte(u))
-		return ((u&0x1f)<<6)|(p[1]&0x3f);
-	if(threebyte(u))
-		return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f);
-	return u;
+	chartorune(&r, (char*)p);
+	return r;
 }
+
 /*
  * Does the string s match the pattern p
  * . and .. are only matched by patterns starting with .

+ 15 - 5
sys/src/cmd/rc/lex.c

@@ -166,15 +166,25 @@ addtok(char *p, int val)
 char*
 addutf(char *p, int c)
 {
-	p = addtok(p, c);
-	if(twobyte(c))	 /* 2-byte escape */
-		return addtok(p, advance());
-	if(threebyte(c)){	/* 3-byte escape */
+	uchar b, m;
+	int i;
+
+	p = addtok(p, c);	/* 1-byte UTF runes are special */
+	if(onebyte(c))
+		return p;
+
+	m = 0xc0;
+	b = 0x80;
+	for(i=1; i < UTFmax; i++){
+		if((c&m) == b)
+			break;
 		p = addtok(p, advance());
-		return addtok(p, advance());
+		b = m;
+		m = (m >> 1)|0x80;
 	}
 	return p;
 }
+
 int lastdol;	/* was the last token read '$' or '$#' or '"'? */
 int lastword;	/* was the last token read a word or compound word terminator? */
 

+ 3 - 4
sys/src/cmd/rc/rc.h

@@ -128,13 +128,12 @@ int mypid;
  *	GLOBGLOB matches GLOB
  */
 #define	GLOB	((char)0x01)
+
 /*
- * onebyte(c), twobyte(c), threebyte(c)
- * Is c the first character of a one- two- or three-byte utf sequence?
+ * onebyte(c)
+ * Is c the first character of a one-byte utf sequence?
  */
 #define	onebyte(c)	((c&0x80)==0x00)
-#define	twobyte(c)	((c&0xe0)==0xc0)
-#define	threebyte(c)	((c&0xf0)==0xe0)
 
 char **argp;
 char **args;

+ 1 - 1
sys/src/cmd/sam/cmd.c

@@ -71,7 +71,7 @@ int
 inputc(void)
 {
 	int n, nbuf;
-	char buf[3];
+	char buf[UTFmax];
 	Rune r;
 
     Again:

+ 27 - 24
sys/src/cmd/sam/regexp.c

@@ -9,7 +9,7 @@ typedef struct Inst Inst;
 
 struct Inst
 {
-	long	type;	/* < 0x10000 ==> literal, otherwise action */
+	long	type;	/* <= Runemax ==> literal, otherwise action */
 	union {
 		int rsid;
 		int rsubid;
@@ -46,7 +46,7 @@ struct Ilist
 
 #define	NLIST	127
 
-Ilist	*tl, *nl;	/* This list, next list */
+Ilist	*tl, *nl;		/* This list, next list */
 Ilist	list[2][NLIST+1];	/* +1 for trailing null */
 static	Rangeset sempty;
 
@@ -56,25 +56,28 @@ static	Rangeset sempty;
  *	0x100xx are operators, value == precedence
  *	0x200xx are tokens, i.e. operands for operators
  */
-#define	OPERATOR	0x10000	/* Bitmask of all operators */
-#define	START		0x10000	/* Start, used for marker on stack */
-#define	RBRA		0x10001	/* Right bracket, ) */
-#define	LBRA		0x10002	/* Left bracket, ( */
-#define	OR		0x10003	/* Alternation, | */
-#define	CAT		0x10004	/* Concatentation, implicit operator */
-#define	STAR		0x10005	/* Closure, * */
-#define	PLUS		0x10006	/* a+ == aa* */
-#define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
-#define	ANY		0x20000	/* Any character but newline, . */
-#define	NOP		0x20001	/* No operation, internal use only */
-#define	BOL		0x20002	/* Beginning of line, ^ */
-#define	EOL		0x20003	/* End of line, $ */
-#define	CCLASS		0x20004	/* Character class, [] */
-#define	NCCLASS		0x20005	/* Negated character class, [^] */
-#define	END		0x20077	/* Terminate: match found */
-
-#define	ISATOR		0x10000
-#define	ISAND		0x20000
+enum {
+	OPERATOR = Runemask+1,	/* Bitmask of all operators */
+	START	= OPERATOR,	/* Start, used for marker on stack */
+	RBRA,			/* Right bracket, ) */
+	LBRA,			/* Left bracket, ( */
+	OR,			/* Alternation, | */
+	CAT,			/* Concatenation, implicit operator */
+	STAR,			/* Closure, * */
+	PLUS,			/* a+ == aa* */
+	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
+
+	ANY	= OPERATOR<<1,	/* Any character but newline, . */
+	NOP,			/* No operation, internal use only */
+	BOL,			/* Beginning of line, ^ */
+	EOL,			/* End of line, $ */
+	CCLASS,			/* Character class, [] */
+	NCCLASS,		/* Negated character class, [^] */
+	END,			/* Terminate: match found */
+
+	ISATOR	= OPERATOR,
+	ISAND	= OPERATOR<<1,
+};
 
 /*
  * Parser Information
@@ -459,7 +462,7 @@ nextrec(void){
 			exprp++;
 			return '\n';
 		}
-		return *exprp++|0x10000;
+		return *exprp++|(Runemax+1);
 	}
 	return *exprp++;
 }
@@ -494,7 +497,7 @@ bldcclass(void)
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -516,7 +519,7 @@ classmatch(int classno, int c, int negate)
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;

+ 2 - 2
sys/src/cmd/sed.c

@@ -623,7 +623,7 @@ compsub(Rune *rhs, Rune *end)
 	while ((r = *cp++) != '\0') {
 		if(r == '\\') {
 			if (rhs < end)
-				*rhs++ = 0xFFFF;
+				*rhs++ = Runemax;
 			else
 				return 0;
 			r = *cp++;
@@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
 			sp = place(sp, loc1, loc2);
 			continue;
 		}
-		if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') {
+		if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') {
 			n = c-'0';
 			if (subexp[n].rsp && subexp[n].rep) {
 				sp = place(sp, subexp[n].rsp, subexp[n].rep);

+ 1 - 1
sys/src/cmd/tcs/conv.h

@@ -19,6 +19,6 @@ void tune_in(int fd, long *notused, struct convert *out);
 void tune_out(Rune *base, int n, long *notused);
 
 #define		emit(x)		*(*r)++ = (x)
-#define		NRUNE		65536
+#define		NRUNE		(Runemax+1)
 
 extern long tab[];		/* common table indexed by Runes for reverse mappings */

+ 2 - 4
sys/src/cmd/tr.c

@@ -15,10 +15,8 @@ uchar	bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
 #define	CLEARBIT(a,c)		((a)[(c)/8] &= ~bits[(c)&07])
 #define	BITSET(a,c)		((a)[(c)/8] & bits[(c)&07])
 
-#define	MAXRUNE	0xFFFF
-
-uchar	f[(MAXRUNE+1)/8];
-uchar	t[(MAXRUNE+1)/8];
+uchar	f[(Runemax+1)/8];
+uchar	t[(Runemax+1)/8];
 char 	wbuf[4096];
 char	*wptr;
 

+ 13 - 8
sys/src/cmd/unicode.c

@@ -51,13 +51,13 @@ range(char *argv[])
 			return "bad range";
 		}
 		min = strtoul(q, &q, 16);
-		if(min<0 || min>0xFFFF || *q!='-')
+		if(min<0 || min>Runemax || *q!='-')
 			goto err;
 		q++;
 		if(strchr(hex, *q) == 0)
 			goto err;
 		max = strtoul(q, &q, 16);
-		if(max<0 || max>0xFFFF || max<min || *q!=0)
+		if(max<0 || max>Runemax || max<min || *q!=0)
 			goto err;
 		i = 0;
 		do{
@@ -79,17 +79,22 @@ nums(char *argv[])
 {
 	char *q;
 	Rune r;
-	int w;
+	int w, rsz;
+	char utferr[UTFmax];
 
+	r = Runeerror;
+	rsz = runetochar(utferr, &r);
 	while(*argv){
 		q = *argv;
 		while(*q){
 			w = chartorune(&r, q);
-			if(r==0x80 && (q[0]&0xFF)!=0x80){
-				fprint(2, "unicode: invalid utf string %s\n", *argv);
-				return "bad utf";
+			if(r==Runeerror){
+				if(strlen(q) != rsz || memcmp(q, utferr, rsz) != 0){
+					fprint(2, "unicode: invalid utf string %s\n", *argv);
+					return "bad utf";
+				}
 			}
-			Bprint(&bout, "%.4x\n", r);
+			Bprint(&bout, "%.6x\n", r);
 			q += w;
 		}
 		argv++;
@@ -111,7 +116,7 @@ chars(char *argv[])
 			return "bad char";
 		}
 		m = strtoul(q, &q, 16);
-		if(m<0 || m>0xFFFF || *q!=0)
+		if(m<0 || m>Runemax || *q!=0)
 			goto err;
 		Bprint(&bout, "%C", m);
 		if(!text)

+ 73 - 23
sys/src/cmd/unix/drawterm/libc/rune.c

@@ -14,21 +14,26 @@ enum
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
 
+	SurrogateMin	= 0xD800,
+	SurrogateMax	= 0xDFFF,
+
 	Bad	= Runeerror,
 };
 
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -43,7 +48,7 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * two character sequence
-	 *	0080-07FF => T2 Tx
+	 *	00080-007FF => T2 Tx
 	 */
 	c1 = *(uchar*)(str+1) ^ Tx;
 	if(c1 & Testx)
@@ -60,19 +65,41 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * three character sequence
-	 *	0800-FFFF => T3 Tx Tx
+	 *	00800-0FFFF => T3 Tx Tx
 	 */
 	c2 = *(uchar*)(str+2) ^ Tx;
+
 	if(c2 & Testx)
 		goto bad;
 	if(c < T4) {
 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 		if(l <= Rune2)
 			goto bad;
+		if (SurrogateMin <= l && l <= SurrogateMax)
+			goto bad;
 		*rune = l;
 		return 3;
 	}
 
+	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -105,15 +132,37 @@ runetochar(char *str, Rune *rune)
 		str[1] = Tx | (c & Maskx);
 		return 2;
 	}
+	/*
+	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+	if (SurrogateMin <= c && c <= SurrogateMax)
+		c = Runeerror;
 
 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
 }
 
 int
@@ -136,11 +185,12 @@ runenlen(Rune *r, int nrune)
 		c = *r++;
 		if(c <= Rune1)
 			nb++;
-		else
-		if(c <= Rune2)
+		else if(c <= Rune2)
 			nb += 2;
+		else if(c <= Rune3)
+ 			nb += 3;
 		else
-			nb += 3;
+			nb += 4;
 	}
 	return nb;
 }
@@ -149,14 +199,14 @@ int
 fullrune(char *str, int n)
 {
 	int c;
-
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(c < T4)
+		return n >= 3;
+	return n >= 4;
 }

+ 60 - 10
sys/src/cmd/unix/u9fs/rune.c

@@ -14,21 +14,27 @@ enum
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
+
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
 
+	SurrogateMin	= 0xD800,
+	SurrogateMax	= 0xDFFF,
+
 	Bad	= Runeerror
 };
 
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -43,7 +49,7 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * two character sequence
-	 *	0080-07FF => T2 Tx
+	 *	00080-007FF => T2 Tx
 	 */
 	c1 = *(uchar*)(str+1) ^ Tx;
 	if(c1 & Testx)
@@ -60,19 +66,41 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * three character sequence
-	 *	0800-FFFF => T3 Tx Tx
+	 *	00800-0FFFF => T3 Tx Tx
 	 */
 	c2 = *(uchar*)(str+2) ^ Tx;
+
 	if(c2 & Testx)
 		goto bad;
 	if(c < T4) {
 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 		if(l <= Rune2)
 			goto bad;
+		if (SurrogateMin <= l && l <= SurrogateMax)
+			goto bad;
 		*rune = l;
 		return 3;
 	}
 
+	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -105,15 +133,37 @@ runetochar(char *str, Rune *rune)
 		str[1] = Tx | (c & Maskx);
 		return 2;
 	}
+	/*
+	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+	if (SurrogateMin <= c && c <= SurrogateMax)
+		c = Runeerror;
 
 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
 }
 
 int

+ 1 - 1
sys/src/cmd/vnc/devcons.c

@@ -158,7 +158,7 @@ void
 kbdputc(int ch)
 {
 	int n;
-	char buf[3];
+	char buf[UTFmax];
 	Rune r;
 
 	r = ch;

+ 90 - 281
sys/src/cmd/wc.c

@@ -1,26 +1,84 @@
 /*
- * wc -- count things in utf-encoded text files
- * Bugs:
- *	The only white space characters recognized are ' ', '\t' and '\n', even though
- *	ISO 10646 has many more blanks scattered through it.
- *	Should count characters that cannot occur in any rune (hex f0-ff) separately.
- *	Should count non-canonical runes (e.g. hex c1,80 instead of hex 40).
+ * Count bytes within runes, if it fits in a uvlong, and other things.
  */
 #include <u.h>
 #include <libc.h>
-#define	NBUF	(8*1024)
-uvlong nline, tnline, pline;
-uvlong nword, tnword, pword;
-uvlong nrune, tnrune, prune;
-uvlong nbadr, tnbadr, pbadr;
-uvlong nchar, tnchar, pchar;
-void count(int, char *);
-void report(uvlong, uvlong, uvlong, uvlong, uvlong, char *);
+#include <bio.h>
+
+/* flags, per-file counts, and total counts */
+static int pline, pword, prune, pbadr, pchar;
+static uvlong nline, nword, nrune, nbadr, nchar;
+static uvlong tnline, tnword, tnrune, tnbadr, tnchar;
+
+enum{Space, Word};
+
+static void
+wc(Biobuf *bin)
+{
+	int where;
+	long r;
+
+	nline = 0;
+	nword = 0;
+	nrune = 0;
+	nbadr = 0;
+	where = Space;
+	while ((long)(r = Bgetrune(bin)) >= 0) {
+		nrune++;
+		if(r == Runeerror) {
+			nbadr++;
+			continue;
+		}
+		if(r == '\n')
+			nline++;
+		if(where == Word){
+			if(isspacerune(r))
+				where = Space;
+		}else
+			if(isspacerune(r) == 0){
+				where = Word;
+				nword++;
+			}
+	}
+	nchar = Boffset(bin);
+	tnline += nline;
+	tnword += nword;
+	tnrune += nrune;
+	tnbadr += nbadr;
+	tnchar += nchar;
+}
+
+static void
+report(uvlong nline, uvlong nword, uvlong nrune, uvlong nbadr, uvlong nchar, char *fname)
+{
+	char line[1024], *s, *e;
+
+	s = line;
+	e = line + sizeof line;
+	line[0] = 0;
+	if(pline)
+		s = seprint(s, e, " %7llud", nline);
+	if(pword)
+		s = seprint(s, e, " %7llud", nword);
+	if(prune)
+		s = seprint(s, e, " %7llud", nrune);
+	if(pbadr)
+		s = seprint(s, e, " %7llud", nbadr);
+	if(pchar)
+		s = seprint(s, e, " %7llud", nchar);
+	if(fname != nil)
+		seprint(s, e, " %s",   fname);
+	print("%s\n", line+1);
+}
+
 void
 main(int argc, char *argv[])
 {
-	char *status="";
-	int i, f;
+	char *sts;
+	Biobuf sin, *bin;
+	int i;
+
+	sts = nil;
 	ARGBEGIN {
 	case 'l': pline++; break;
 	case 'w': pword++; break;
@@ -31,279 +89,30 @@ main(int argc, char *argv[])
 		fprint(2, "Usage: %s [-lwrbc] [file ...]\n", argv0);
 		exits("usage");
 	} ARGEND
-	if(pline+pword+prune+pbadr+pchar == 0) {
+	if(pline+pword+prune+pbadr+pchar == 0){
 		pline = 1;
 		pword = 1;
 		pchar = 1;
 	}
-	if(argc==0)
-		count(0, 0);
-	else{
-		for(i=0;i<argc;i++){
-			f=open(argv[i], OREAD);
-			if(f<0){
+	if(argc == 0){
+		Binit(&sin, 0, OREAD);
+		wc(&sin);
+		report(nline, nword, nrune, nbadr, nchar, nil);
+		Bterm(&sin);
+	}else{
+		for(i = 0; i < argc; i++){
+			bin = Bopen(argv[i], OREAD);
+			if(bin == nil){
 				perror(argv[i]);
-				status="can't open";
-			}
-			else{
-				count(f, argv[i]);
-				tnline+=nline;
-				tnword+=nword;
-				tnrune+=nrune;
-				tnbadr+=nbadr;
-				tnchar+=nchar;
-				close(f);
+				sts = "can't open";
+				continue;
 			}
+			wc(bin);
+			report(nline, nword, nrune, nbadr, nchar, argv[i]);
+			Bterm(bin);
 		}
 		if(argc>1)
 			report(tnline, tnword, tnrune, tnbadr, tnchar, "total");
 	}
-	exits(status);
-}
-void
-report(uvlong nline, uvlong nword, uvlong nrune, uvlong nbadr, uvlong nchar, char *fname)
-{
-	char line[1024], word[128];
-	line[0] = '\0';
-	if(pline){
-		sprint(word, " %7llud", nline);
-		strcat(line, word);
-	}
-	if(pword){
-		sprint(word, " %7llud", nword);
-		strcat(line, word);
-	}
-	if(prune){
-		sprint(word, " %7llud", nrune);
-		strcat(line, word);
-	}
-	if(pbadr){
-		sprint(word, " %7llud", nbadr);
-		strcat(line, word);
-	}
-	if(pchar){
-		sprint(word, " %7llud", nchar);
-		strcat(line, word);
-	}
-	if(fname){
-		sprint(word, " %s",   fname);
-		strcat(line, word);
-	}
-	print("%s\n", line+1);
-}
-/*
- * How it works.  Start in statesp.  Each time we read a character,
- * increment various counts, and do state transitions according to the
- * following table.  If we're not in statesp or statewd when done, the
- * file ends with a partial rune.
- *        |                character
- *  state |09,20| 0a  |00-7f|80-bf|c0-df|e0-ef|f0-ff
- * -------+-----+-----+-----+-----+-----+-----+-----
- * statesp|ASP  |ASPN |AWDW |AWDWX|AC2W |AC3W |AWDWX
- * statewd|ASP  |ASPN |AWD  |AWDX |AC2  |AC3  |AWDX
- * statec2|ASPX |ASPNX|AWDX |AWDR |AC2X |AC3X |AWDX
- * statec3|ASPX |ASPNX|AWDX |AC2R |AC2X |AC3X |AWDX
- */
-enum{			/* actions */
-	AC2,		/* enter statec2 */
-	AC2R,		/* enter statec2, don't count a rune */
-	AC2W,		/* enter statec2, count a word */
-	AC2X,		/* enter statec2, count a bad rune */
-	AC3,		/* enter statec3 */
-	AC3W,		/* enter statec3, count a word */
-	AC3X,		/* enter statec3, count a bad rune */
-	ASP,		/* enter statesp */
-	ASPN,		/* enter statesp, count a newline */
-	ASPNX,		/* enter statesp, count a newline, count a bad rune */
-	ASPX,		/* enter statesp, count a bad rune */
-	AWD,		/* enter statewd */
-	AWDR,		/* enter statewd, don't count a rune */
-	AWDW,		/* enter statewd, count a word */
-	AWDWX,		/* enter statewd, count a word, count a bad rune */
-	AWDX,		/* enter statewd, count a bad rune */
-};
-uchar statesp[256]={	/* looking for the start of a word */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 00-07 */
-AWDW, ASP,  ASPN, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 08-0f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 10-17 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 18-1f */
-ASP,  AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 20-27 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 28-2f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 30-37 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 38-3f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 40-47 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 48-4f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 50-57 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 58-5f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 60-67 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 68-6f */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 70-77 */
-AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW,	/* 78-7f */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 80-87 */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 88-8f */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 90-97 */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 98-9f */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* a0-a7 */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* a8-af */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* b0-b7 */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* b8-bf */
-AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W,	/* c0-c7 */
-AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W,	/* c8-cf */
-AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W,	/* d0-d7 */
-AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W,	/* d8-df */
-AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W,	/* e0-e7 */
-AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W,	/* e8-ef */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* f0-f7 */
-AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* f8-ff */
-};
-uchar statewd[256]={	/* looking for the next character in a word */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 00-07 */
-AWD,  ASP,  ASPN, AWD,  AWD,  AWD,  AWD,  AWD,	/* 08-0f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 10-17 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 18-1f */
-ASP,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 20-27 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 28-2f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 30-37 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 38-3f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 40-47 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 48-4f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 50-57 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 58-5f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 60-67 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 68-6f */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 70-77 */
-AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,  AWD,	/* 78-7f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 80-87 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 88-8f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 90-97 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 98-9f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* a0-a7 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* a8-af */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* b0-b7 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* b8-bf */
-AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,	/* c0-c7 */
-AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,	/* c8-cf */
-AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,	/* d0-d7 */
-AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,  AC2,	/* d8-df */
-AC3,  AC3,  AC3,  AC3,  AC3,  AC3,  AC3,  AC3,	/* e0-e7 */
-AC3,  AC3,  AC3,  AC3,  AC3,  AC3,  AC3,  AC3,	/* e8-ef */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f0-f7 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f8-ff */
-};
-uchar statec2[256]={	/* looking for 10xxxxxx to complete a rune */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 00-07 */
-AWDX, ASPX, ASPNX,AWDX, AWDX, AWDX, AWDX, AWDX,	/* 08-0f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 10-17 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 18-1f */
-ASPX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 20-27 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 28-2f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 30-37 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 38-3f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 40-47 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 48-4f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 50-57 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 58-5f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 60-67 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 68-6f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 70-77 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 78-7f */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* 80-87 */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* 88-8f */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* 90-97 */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* 98-9f */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* a0-a7 */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* a8-af */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* b0-b7 */
-AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR,	/* b8-bf */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* c0-c7 */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* c8-cf */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* d0-d7 */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* d8-df */
-AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X,	/* e0-e7 */
-AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X,	/* e8-ef */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f0-f7 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f8-ff */
-};
-uchar statec3[256]={	/* looking for 10xxxxxx,10xxxxxx to complete a rune */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 00-07 */
-AWDX, ASPX, ASPNX,AWDX, AWDX, AWDX, AWDX, AWDX,	/* 08-0f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 10-17 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 18-1f */
-ASPX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 20-27 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 28-2f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 30-37 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 38-3f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 40-47 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 48-4f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 50-57 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 58-5f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 60-67 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 68-6f */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 70-77 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* 78-7f */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* 80-87 */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* 88-8f */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* 90-97 */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* 98-9f */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* a0-a7 */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* a8-af */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* b0-b7 */
-AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R,	/* b8-bf */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* c0-c7 */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* c8-cf */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* d0-d7 */
-AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X,	/* d8-df */
-AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X,	/* e0-e7 */
-AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X,	/* e8-ef */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f0-f7 */
-AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX,	/* f8-ff */
-};
-void
-count(int f, char *name)
-{
-	int n;
-	uchar buf[NBUF];
-	uchar *bufp, *ebuf;
-	uchar *state=statesp;
-
-	nline = 0;
-	nword = 0;
-	nrune = 0;
-	nbadr = 0;
-	nchar = 0;
-
-	for(;;){
-		n=read(f, buf, NBUF);
-		if(n<=0)
-			break;
-		nchar+=n;
-		nrune+=n;	/* might be too large, gets decreased later */
-		bufp=buf;
-		ebuf=buf+n;
-		do{
-			switch(state[*bufp]){
-			case AC2:   state=statec2;                   break;
-			case AC2R:  state=statec2; --nrune;          break;
-			case AC2W:  state=statec2; nword++;          break;
-			case AC2X:  state=statec2;          nbadr++; break;
-			case AC3:   state=statec3;                   break;
-			case AC3W:  state=statec3; nword++;          break;
-			case AC3X:  state=statec3;          nbadr++; break;
-			case ASP:   state=statesp;                   break;
-			case ASPN:  state=statesp; nline++;          break;
-			case ASPNX: state=statesp; nline++; nbadr++; break;
-			case ASPX:  state=statesp;          nbadr++; break;
-			case AWD:   state=statewd;                   break;
-			case AWDR:  state=statewd; --nrune;          break;
-			case AWDW:  state=statewd; nword++;          break;
-			case AWDWX: state=statewd; nword++; nbadr++; break;
-			case AWDX:  state=statewd;          nbadr++; break;
-			}
-		}while(++bufp!=ebuf);
-	}
-	if(state!=statesp && state!=statewd)
-		nbadr++;
-	if(n<0)
-		perror(name);
-	report(nline, nword, nrune, nbadr, nchar, name);
+	exits(sts);
 }