Browse Source

Plan 9 from Bell Labs 2013-05-01

David du Colombier 6 years ago
parent
commit
83d6593e6c

+ 1 - 1
386/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long	uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef union FPdbleword FPdbleword;
 typedef long		jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
amd64/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long long uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef union FPdbleword FPdbleword;
 typedef uintptr		jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
arm/include/u.h

@@ -9,7 +9,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long	uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef 	union FPdbleword FPdbleword;
 typedef long	jmp_buf[2];
 #define	JMPBUFSP	0

+ 0 - 8
dist/replica/plan9.proto

@@ -31,12 +31,6 @@ usr	- sys sys
 # architectures owned by sys
 386	- sys sys
 	+	- sys sys
-68000	- sys sys
-	+	- sys sys
-68020	- sys sys
-	+	- sys sys
-alpha	- sys sys
-	+	- sys sys
 amd64	- sys sys
 	+	- sys sys
 arm	- sys sys
@@ -49,8 +43,6 @@ power64	- sys sys
 	+	- sys sys
 sparc	- sys sys
 	+	- sys sys
-sparc64	- sys sys
-	+	- sys sys
 
 # everything else is owned by sys
 acme	- sys sys

+ 0 - 8
dist/replica/plan9binary.proto

@@ -31,12 +31,6 @@ usr	- sys sys
 # architectures owned by sys
 386	- sys sys
 	+	- sys sys
-68000	- sys sys
-	+	- sys sys
-68020	- sys sys
-	+	- sys sys
-alpha	- sys sys
-	+	- sys sys
 amd64	- sys sys
 	+	- sys sys
 arm	- sys sys
@@ -49,8 +43,6 @@ power64	- sys sys
 	+	- sys sys
 sparc	- sys sys
 	+	- sys sys
-sparc64	- sys sys
-	+	- sys sys
 
 # everything else is owned by sys
 acme	- sys sys

+ 1 - 1
mips/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long	uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef 	union FPdbleword FPdbleword;
 typedef long	jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
power/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long	uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef union FPdbleword FPdbleword;
 typedef long		jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
power64/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long long uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef 	union FPdbleword FPdbleword;
 typedef uintptr	jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
sparc/include/u.h

@@ -8,7 +8,7 @@ typedef	long long	vlong;
 typedef	unsigned long long uvlong;
 typedef unsigned long	uintptr;
 typedef unsigned long	usize;
-typedef	ushort		Rune;
+typedef	uint		Rune;
 typedef 	union FPdbleword FPdbleword;
 typedef long	jmp_buf[2];
 #define	JMPBUFSP	0

+ 1 - 1
sys/include/ape/limits.h

@@ -3,7 +3,7 @@
 /* 8 bit chars (signed), 16 bit shorts, 32 bit ints/longs */
 
 #define CHAR_BIT	8
-#define MB_LEN_MAX	3
+#define MB_LEN_MAX	4
 
 #define UCHAR_MAX	0xff
 #define USHRT_MAX	0xffff

+ 1 - 1
sys/include/ape/stdlib.h

@@ -6,7 +6,7 @@
 
 #define EXIT_FAILURE 1
 #define EXIT_SUCCESS 0
-#define MB_CUR_MAX 3
+#define MB_CUR_MAX 4
 #define RAND_MAX 32767
 
 typedef struct { int quot, rem; } div_t;

+ 1 - 1
sys/include/ape/u.h

@@ -12,7 +12,7 @@ typedef unsigned int	uint;
 typedef   signed char	schar;
 typedef	long long	vlong;
 typedef	unsigned long long uvlong;
-typedef	ushort		Rune;
+typedef	unsigned int		Rune;
 typedef 	union FPdbleword FPdbleword;
 typedef	char*	p9va_list;
 

+ 4 - 3
sys/include/ape/utf.h

@@ -7,14 +7,15 @@
 extern "C" { 
 #endif
 
-typedef unsigned short Rune;	/* 16 bits */
+typedef unsigned int Rune;	/* 32 bits */
 
 enum
 {
-	UTFmax		= 3,		/* maximum bytes per rune */
+	UTFmax		= 4,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0x80,		/* decoding error in UTF */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0x10FFFF,	/* 21-bit rune */
 };
 
 /*

+ 1 - 1
sys/include/bio.h

@@ -7,7 +7,7 @@ typedef	struct	Biobufhdr	Biobufhdr;
 enum
 {
 	Bsize		= 8*1024,
-	Bungetsize	= 4,		/* space for ungetc */
+	Bungetsize	= UTFmax+1,		/* space for ungetc */
 	Bmagic		= 0x314159,
 	Beof		= -1,
 	Bbad		= -2,

+ 6 - 4
sys/include/libc.h

@@ -41,12 +41,12 @@ extern	int	tokenize(char*, char**, int);
 
 enum
 {
-	UTFmax		= 3,		/* maximum bytes per rune */
+	UTFmax		= 4,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
 	Runeerror	= 0xFFFD,	/* decoding error in UTF */
-	Runemax		= 0xFFFF,	/* largest 16-bit character */
-	Runemask	= 0xFFFF,	/* bits used by runes (see grep) */
+	Runemax		= 0x10FFFF,	/* 21-bit rune */
+	Runemask	= 0x1FFFFF,	/* bits used by runes (see grep) */
 };
 
 /*
@@ -80,12 +80,14 @@ extern	Rune*	runestrstr(Rune*, Rune*);
 extern	Rune	tolowerrune(Rune);
 extern	Rune	totitlerune(Rune);
 extern	Rune	toupperrune(Rune);
+extern	Rune tobaserune(Rune);
 extern	int	isalpharune(Rune);
+extern	int	isbaserune(Rune);
+extern	int	isdigitrune(Rune);
 extern	int	islowerrune(Rune);
 extern	int	isspacerune(Rune);
 extern	int	istitlerune(Rune);
 extern	int	isupperrune(Rune);
-extern	int	isdigitrune(Rune);
 
 /*
  * malloc

+ 0 - 1
sys/lib/dist/setup

@@ -16,7 +16,6 @@ if (! test -e $dist/web.protect/plan9.iso) {
 	echo import web.protect failed: $dist/web.protect/plan9.iso missing \
 		>[1=2]
 
-	9fs outfsother
 	bind /n/outfsother/web.protect $dist/web.protect	# HACK
 	if (! test -e $dist/web.protect/plan9.iso) {
 		echo bind outfsother failed: $dist/web.protect/plan9.iso \

+ 2 - 2
sys/man/1/tr

@@ -62,9 +62,9 @@ character whose
 value is given by those digits.
 The character sequence
 .L \ex
-followed by 1, 2, 3, or 4 hexadecimal digits stands
+followed by 1 to 6 hexadecimal digits stands
 for the character whose
-16-bit value is given by those digits.
+21-bit value is given by those digits.
 A 
 .L \e
 followed by any other character stands

+ 1 - 1
sys/man/2/bio

@@ -257,7 +257,7 @@ Otherwise, a zero is returned.
 calls
 .I Bputc
 to output the low order
-16 bits of
+21 bits of
 .I c
 as a rune
 in

+ 1 - 1
sys/man/2/rune

@@ -54,7 +54,7 @@ bytes starting at
 and returns the number of bytes copied.
 .BR UTFmax ,
 defined as
-.B 3
+.B 4
 in
 .BR <libc.h> ,
 is the maximum number of bytes required to represent a rune.

+ 1 - 1
sys/man/3/cons

@@ -51,7 +51,7 @@ and a backspace
 the previous non-kill, non-erase character from the input buffer.
 Killing and erasing only delete characters back to, but not including,
 the last newline.
-Characters typed at the keyboard actually produce 16-bit runes (see
+Characters typed at the keyboard actually produce 21-bit runes (see
 .IR utf (6)),
 but the runes are translated into the variable-length
 .SM UTF

+ 10 - 8
sys/man/6/utf

@@ -7,7 +7,7 @@ based on the Unicode Standard and on the ISO multibyte
 .SM UTF-8
 encoding (Universal Character
 Set Transformation Format, 8 bits wide).
-The Unicode Standard represents its characters in 16
+The Unicode Standard represents its characters in 21
 bits;
 .SM UTF-8
 represents such
@@ -19,7 +19,7 @@ is shortened to
 .PP
 In Plan 9, a
 .I rune
-is a 16-bit quantity representing a Unicode character.
+is a 21-bit quantity representing a Unicode character.
 Internally, programs may store characters as runes.
 However, any external manifestation of textual information,
 in files or at the interface between programs, uses a
@@ -65,19 +65,21 @@ a rune x is converted to a multibyte
 sequence
 as follows:
 .PP
-01.   x in [00000000.0bbbbbbb] → 0bbbbbbb
+01.   x in [000000.00000000.0bbbbbbb] → 0bbbbbbb
 .br
-10.   x in [00000bbb.bbbbbbbb] → 110bbbbb, 10bbbbbb
+10.   x in [000000.00000bbb.bbbbbbbb] → 110bbbbb, 10bbbbbb
 .br
-11.   x in [bbbbbbbb.bbbbbbbb] → 1110bbbb, 10bbbbbb, 10bbbbbb
+11.   x in [000000.bbbbbbbb.bbbbbbbb] → 1110bbbb, 10bbbbbb, 10bbbbbb
+.br
+100. x in [0bbbbb.bbbbbbbb.bbbbbbbb] → 11110bbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
 .br
 .PP
 Conversion 01 provides a one-byte sequence that spans the
 .SM ASCII
 character set in a compatible way.
-Conversions 10 and 11 represent higher-valued characters
-as sequences of two or three bytes with the high bit set.
-Plan 9 does not support the 4, 5, and 6 byte sequences proposed by X-Open.
+Conversions 10, 11 and 100 represent higher-valued characters
+as sequences of two, three or four bytes with the high bit set.
+Plan 9 does not support the 5 and 6 byte sequences proposed by X-Open.
 When there are multiple ways to encode a value, for example rune 0,
 the shortest encoding is used.
 .PP

+ 19 - 14
sys/src/9/port/latin1.c

@@ -1,5 +1,5 @@
-#include <u.h>
-
+#include	"u.h"
+#include	"../port/lib.h"
 /*
  * The code makes two assumptions: strlen(ld) is 1 or 2; latintab[i].ld can be a
  * prefix of latintab[j].ld only when j<i.
@@ -15,23 +15,23 @@ struct cvlist
 };
 
 /*
- * Given 5 characters k[0]..k[4], find the rune or return -1 for failure.
+ * Given n characters k[0]..k[n-1], find the rune or return -1 for failure.
  */
 long
-unicode(Rune *k)
+unicode(Rune *k, int n)
 {
-	long i, c;
+	long c;
+	Rune *r;
 
-	k++;	/* skip 'X' */
 	c = 0;
-	for(i=0; i<4; i++,k++){
+	for(r = &k[1]; r<&k[n]; r++){		/* +1 to skip [Xx] */
 		c <<= 4;
-		if('0'<=*k && *k<='9')
-			c += *k-'0';
-		else if('a'<=*k && *k<='f')
-			c += 10 + *k-'a';
-		else if('A'<=*k && *k<='F')
-			c += 10 + *k-'A';
+		if('0'<=*r && *r<='9')
+			c += *r-'0';
+		else if('a'<=*r && *r<='f')
+			c += 10 + *r-'a';
+		else if('A'<=*r && *r<='F')
+			c += 10 + *r-'A';
 		else
 			return -1;
 	}
@@ -52,9 +52,14 @@ latin1(Rune *k, int n)
 
 	if(k[0] == 'X')
 		if(n>=5)
-			return unicode(k);
+			return unicode(k, 5);
 		else
 			return -5;
+	if(k[0] == 'x')
+		if(n>=UTFmax*2+1)
+			return unicode(k, UTFmax*2+1);
+		else
+			return -(UTFmax+1);
 	for(l=latintab; l->ld!=0; l++)
 		if(k[0] == l->ld[0]){
 			if(n == 1)

+ 6 - 4
sys/src/9/port/lib.h

@@ -35,10 +35,12 @@ extern	int	cistrncmp(char*, char*, int);
 
 enum
 {
-	UTFmax		= 3,	/* maximum bytes per rune */
-	Runesync	= 0x80,	/* cannot represent part of a UTF sequence */
-	Runeself	= 0x80,	/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0x80,	/* decoding error in UTF */
+	UTFmax		= 4,		/* maximum bytes per rune */
+	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
+	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0x10FFFF,	/* 21-bit rune */
+	Runemask	= 0x1FFFFF,	/* bits used by runes (see grep) */
 };
 
 /*

+ 74 - 56
sys/src/ape/lib/ap/gen/mbwc.c

@@ -1,4 +1,5 @@
 #include <stdlib.h>
+#include <limits.h>
 
 /*
  * Use the FSS-UTF transformation proposed by posix.
@@ -7,67 +8,80 @@
  *	Tx	10xxxxxx	6 free bits
  *	T1	110xxxxx	5 free bits
  *	T2	1110xxxx	4 free bits
+ *	T3	11110xxx	3 free bits
+ *	T4	111110xx	2 free bits
+ *	T5	1111110x	1 free bit
  *
  *	Encoding is as follows.
  *	From hex	Thru hex	Sequence		Bits
- *	00000000	0000007F	T0			7
+ *	00000000	0000007F	T0				7
  *	00000080	000007FF	T1 Tx			11
- *	00000800	0000FFFF	T2 Tx Tx		16
+ *	00000800	0000FFFF	T2 Tx Tx			16
+ *	00010000	001FFFFF	T3 Tx Tx Tx		21
+ *	00200000	03FFFFFF	T4 Tx Tx Tx Tx		26
+ *	04000000	7FFFFFFF	T5 Tx Tx  Tx Tx Tx	31
  */
+int
+mbtowc(wchar_t *pwc, const char *s, size_t n);
 
 int
 mblen(const char *s, size_t n)
 {
-
 	return mbtowc(0, s, n);
 }
 
+enum {
+	C0MSK = 0x7F,
+	C1MSK = 0x7FF,
+	T1 = 0xC0,
+	T2 = 0xE0,
+	NT1BITS = 11,
+	NSHFT = 5,
+	NCSHFT = NSHFT + 1,
+	WCHARMSK = (1<< (8*MB_LEN_MAX - 1)) - 1,
+};
+
 int
 mbtowc(wchar_t *pwc, const char *s, size_t n)
 {
-	int c, c1, c2;
-	long l;
+	unsigned long long c[MB_LEN_MAX];
+	unsigned long long l, m, wm, b;
+	int i;
 
 	if(!s)
 		return 0;
 
 	if(n < 1)
 		goto bad;
-	c = s[0] & 0xff;
-	if((c & 0x80) == 0x00) {
+
+	c[0] = s[0] & 0xff;		/* first one is special */
+	if((c[0] & 0x80) == 0x00) {
 		if(pwc)
-			*pwc = c;
-		if(c == 0)
+			*pwc = c[0];
+		if(c[0] == 0)
 			return 0;
 		return 1;
 	}
 
-	if(n < 2)
-		goto bad;
-	c1 = (s[1] ^ 0x80) & 0xff;
-	if((c1 & 0xC0) != 0x00)
-		goto bad;
-	if((c & 0xE0) == 0xC0) {
-		l = ((c << 6) | c1) & 0x7FF;
-		if(l < 0x080)
+	m = T2;
+	b = m^0x20;
+	l = c[0];
+	wm = C1MSK;
+	for(i = 1; i < MB_LEN_MAX + 1; i++){
+		if(n < i+1)
 			goto bad;
-		if(pwc)
-			*pwc = l;
-		return 2;
-	}
-
-	if(n < 3)
-		goto bad;
-	c2 = (s[2] ^ 0x80) & 0xff;
-	if((c2 & 0xC0) != 0x00)
-		goto bad;
-	if((c & 0xF0) == 0xE0) {
-		l = ((((c << 6) | c1) << 6) | c2) & 0xFFFF;
-		if(l < 0x0800)
+		c[i] = (s[i] ^ 0x80) & 0xff;
+		l = (l << NCSHFT) | c[i];
+		if((c[i] & 0xC0) != 0x00)
 			goto bad;
-		if(pwc)
-			*pwc = l;
-		return 3;
+		if((c[0] & m) == b) {
+			if(pwc)
+				*pwc = l & wm;
+			return i + 1;
+		}
+		b = m;
+		m = (m >> 1) | 0x80;
+		wm = (wm << NSHFT) | wm;
 	}
 
 	/*
@@ -81,27 +95,32 @@ bad:
 int
 wctomb(char *s, wchar_t wchar)
 {
-	long c;
+	unsigned long long c, maxc, m;
+	int i, j;
 
 	if(!s)
 		return 0;
 
-	c = wchar & 0xFFFF;
-	if(c < 0x80) {
+	maxc = 0x80;
+	c = wchar & WCHARMSK;
+	if(c < maxc) {
 		s[0] = c;
 		return 1;
 	}
 
-	if(c < 0x800) {
-		s[0] = 0xC0 | (c >> 6);
-		s[1] = 0x80 | (c & 0x3F);
-		return 2;
+	m = T1;
+	for(i = 2; i < MB_LEN_MAX + 1; i++){
+		maxc <<= 4;
+		if(c < maxc || i == MB_LEN_MAX){
+			s[0] = m | (c >> ((i - 1) * NCSHFT));
+			for(j = i - 1; j >= 1; j--){
+				s[i - j] = 0x80|((c>>(6 * (j - 1)))&0x3f);
+			}
+			return i;
+		}
+		m = (m >> 1) | 0x80;
 	}
-
-	s[0] = 0xE0 |  (c >> 12);
-	s[1] = 0x80 | ((c >> 6) & 0x3F);
-	s[2] = 0x80 |  (c & 0x3F);
-	return 3;
+	return MB_LEN_MAX;
 }
 
 size_t
@@ -117,7 +136,7 @@ mbstowcs(wchar_t *pwcs, const char *s, size_t n)
 				break;
 			s++;
 		} else {
-			d = mbtowc(pwcs, s, 3);
+			d = mbtowc(pwcs, s, MB_LEN_MAX);
 			if(d <= 0)
 				return (size_t)((d<0) ? -1 : i);
 			s += d;
@@ -133,10 +152,10 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 	int i, d;
 	long c;
 	char *p, *pe;
-	char buf[3];
+	char buf[MB_LEN_MAX];
 
 	p = s;
-	pe = p+n-3;
+	pe = p+n-MB_LEN_MAX;
 	while(p < pe) {
 		c = *pwcs++;
 		if(c < 0x80)
@@ -146,15 +165,15 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 		if(c == 0)
 			return p-s;
 	}
-	while(p < pe+3) {
+	while(p < pe+MB_LEN_MAX) {
 		c = *pwcs++;
 		d = wctomb(buf, c);
-		if(p+d <= pe+3) {
-			*p++ = buf[0];
-			if(d > 1) {
-				*p++ = buf[2];
-				if(d > 2)
-					*p++ = buf[3];
+		if(p+d <= pe+MB_LEN_MAX) {
+			*p++ = buf[0];		/* first one is special */
+			for(i = 2; i < MB_LEN_MAX + 1; i++){
+				if(d <= i -1)
+					break;
+				*p++ = buf[i];
 			}
 		}
 		if(c == 0)
@@ -162,4 +181,3 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 	}
 	return p-s;
 }
-

+ 74 - 23
sys/src/ape/lib/utf/rune.c

@@ -23,27 +23,33 @@ enum
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
 
+	SurrogateMin	= 0xD800,
+	SurrogateMax	= 0xDFFF,
+
 	Bad	= Runeerror,
 };
 
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -58,7 +64,7 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * two character sequence
-	 *	0080-07FF => T2 Tx
+	 *	00080-007FF => T2 Tx
 	 */
 	c1 = *(uchar*)(str+1) ^ Tx;
 	if(c1 & Testx)
@@ -75,20 +81,42 @@ chartorune(Rune *rune, char *str)
 
 	/*
 	 * three character sequence
-	 *	0800-FFFF => T3 Tx Tx
+	 *	00800-0FFFF => T3 Tx Tx
 	 */
 	c2 = *(uchar*)(str+2) ^ Tx;
+
 	if(c2 & Testx)
 		goto bad;
 	if(c < T4) {
 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 		if(l <= Rune2)
 			goto bad;
+		if (SurrogateMin <= l && l <= SurrogateMax)
+			goto bad;
 		*rune = l;
 		return 3;
 	}
 
 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
+	/*
 	 * bad decoding
 	 */
 bad:
@@ -120,15 +148,37 @@ runetochar(char *str, Rune *rune)
 		str[1] = Tx | (c & Maskx);
 		return 2;
 	}
+	/*
+	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+	if (SurrogateMin <= c && c <= SurrogateMax)
+		c = Runeerror;
 
 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-10FFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
 }
 
 int
@@ -151,11 +201,12 @@ runenlen(Rune *r, int nrune)
 		c = *r++;
 		if(c <= Rune1)
 			nb++;
-		else
-		if(c <= Rune2)
+		else if(c <= Rune2)
 			nb += 2;
+		else if(c <= Rune3)
+ 			nb += 3;
 		else
-			nb += 3;
+			nb += 4;
 	}
 	return nb;
 }
@@ -164,14 +215,14 @@ int
 fullrune(char *str, int n)
 {
 	int c;
-
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(c < T4)
+		return n >= 3;
+	return n >= 4;
 }

+ 2 - 2
sys/src/cmd/cc/lex.c

@@ -1070,7 +1070,7 @@ loop:
 		 */
 		i = 2;
 		if(longflg)
-			i = 4;
+			i = 6;
 		l = 0;
 		for(; i>0; i--) {
 			c = getc();
@@ -1100,7 +1100,7 @@ loop:
 		 */
 		i = 2;
 		if(longflg)
-			i = 5;
+			i = 8;
 		l = c - '0';
 		for(; i>0; i--) {
 			c = getc();

+ 20 - 0
sys/src/cmd/page/pdfprolog.c

@@ -0,0 +1,20 @@
+"/Page null def\n"
+"/Page# 0 def\n"
+"/PDFSave null def\n"
+"/DSCPageCount 0 def\n"
+"/DoPDFPage {dup /Page# exch store pdfgetpage pdfshowpage } def\n"
+"\n"
+"/pdfshowpage_mysetpage {	% <pagedict> pdfshowpage_mysetpage <pagedict>\n"
+"  dup /CropBox pget {\n"
+"      boxrect\n"
+"      2 array astore /PageSize exch 4 2 roll\n"
+"      4 index /Rotate pget {\n"
+"        dup 0 lt {360 add} if 90 idiv {exch neg} repeat\n"
+"      } if\n"
+"      exch neg exch 2 array astore /PageOffset exch\n"
+"      << 5 1 roll >> setpagedevice\n"
+"  } if\n"
+"} bind def\n"
+"\n"
+"GS_PDF_ProcSet begin\n"
+"pdfdict begin\n"

+ 1 - 1
sys/src/cmd/tcs/plan9.h

@@ -1,4 +1,4 @@
-typedef unsigned short Rune;		/* 16 bits */
+typedef unsigned long Rune;		/* 21 bits */
 typedef unsigned char uchar;
 #define		Runeerror	0x80	/* decoding error in UTF */
 #define		Runeself	0x80	/* rune and UTF sequences are the same (<) */

+ 89 - 85
sys/src/libc/port/rune.c

@@ -1,26 +1,24 @@
 #include	<u.h>
 #include	<libc.h>
 
+#define Bit(i) (7-(i))
+/* i 1's followed by 8-i 0's; T(2) is 1100 0000 */
+#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
+/* largest value that fits in an i-byte sequence (i >= 2); RuneX(2) is 0000 0000 0000 0111 1111 1111 */
+#define	RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
+
 enum
 {
-	Bit1	= 7,
-	Bitx	= 6,
-	Bit2	= 5,
-	Bit3	= 4,
-	Bit4	= 3,
+	Bitx	= Bit(1),
 
-	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
-	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
-	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
-	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
-	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	Tx	= T(1),			/* 1000 0000 */
+	Rune1 = (1<<(Bit(0)+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Maskx	= (1<<Bitx)-1,		/* 0011 1111 */
+	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */
 
-	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
-	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
+	SurrogateMin	= 0xD800,
+	SurrogateMax	= 0xDFFF,
 
 	Bad	= Runeerror,
 };
@@ -28,49 +26,42 @@ enum
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
-	long l;
+	int c[UTFmax], i;
+	Rune l;
 
 	/*
-	 * one character sequence
+	 * N character sequence
 	 *	00000-0007F => T1
+	 *	00080-007FF => T2 Tx
+	 *	00800-0FFFF => T3 Tx Tx
+	 *	10000-10FFFF => T4 Tx Tx Tx
 	 */
-	c = *(uchar*)str;
-	if(c < Tx) {
-		*rune = c;
-		return 1;
-	}
 
-	/*
-	 * two character sequence
-	 *	0080-07FF => T2 Tx
-	 */
-	c1 = *(uchar*)(str+1) ^ Tx;
-	if(c1 & Testx)
-		goto bad;
-	if(c < T3) {
-		if(c < T2)
-			goto bad;
-		l = ((c << Bitx) | c1) & Rune2;
-		if(l <= Rune1)
-			goto bad;
-		*rune = l;
-		return 2;
+	c[0] = *(uchar*)(str);
+	if(c[0] < Tx){
+		*rune = c[0];
+		return 1;
 	}
+	l = c[0];
 
-	/*
-	 * three character sequence
-	 *	0800-FFFF => T3 Tx Tx
-	 */
-	c2 = *(uchar*)(str+2) ^ Tx;
-	if(c2 & Testx)
-		goto bad;
-	if(c < T4) {
-		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
-		if(l <= Rune2)
+	for(i = 1; i < UTFmax; i++) {
+		c[i] = *(uchar*)(str+i);
+		c[i] ^= Tx;
+		if(c[i] & Testx)
 			goto bad;
-		*rune = l;
-		return 3;
+		l = (l << Bitx) | c[i];
+		if(c[0] < T(i + 2)) {
+			l &= RuneX(i + 1);
+			if(i == 1) {
+				if(c[0] < T(2) || l <= Rune1)
+					goto bad;
+			} else if(l <= RuneX(i) || l > Runemax)
+				goto bad;
+			if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+				goto bad;
+			*rune = l;
+			return i + 1;
+		}
 	}
 
 	/*
@@ -84,12 +75,9 @@ bad:
 int
 runetochar(char *str, Rune *rune)
 {
-	long c;
+	int i, j;
+	Rune c;
 
-	/*
-	 * one character sequence
-	 *	00000-0007F => 00-7F
-	 */
 	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
@@ -97,23 +85,35 @@ runetochar(char *str, Rune *rune)
 	}
 
 	/*
+	 * one character sequence
+	 *	00000-0007F => 00-7F
 	 * two character sequence
 	 *	0080-07FF => T2 Tx
-	 */
-	if(c <= Rune2) {
-		str[0] = T2 | (c >> 1*Bitx);
-		str[1] = Tx | (c & Maskx);
-		return 2;
-	}
-
-	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
+	 * four character sequence (21-bit value)
+	 *     10000-10FFFF => T4 Tx Tx Tx
+	 * If the Rune is out of range or a surrogate half,
+	 * convert it to the error rune.
+	 * Do this test when i==3 because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	for(i = 2; i < UTFmax + 1; i++){
+		if(i == 3){
+			if(c > Runemax)
+				c = Runeerror;
+			if(SurrogateMin <= c && c <= SurrogateMax)
+				c = Runeerror;
+		}
+		if (c <= RuneX(i) || i == UTFmax ) {
+			str[0] = T(i) |  (c >> (i - 1)*Bitx);
+			for(j = 1; j < i; j++)
+				str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+			return i;
+		}
+	}
+	return UTFmax;
 }
 
 int
@@ -129,18 +129,21 @@ runelen(long c)
 int
 runenlen(Rune *r, int nrune)
 {
-	int nb, c;
+	int nb, i;
+	Rune c;
 
 	nb = 0;
 	while(nrune--) {
 		c = *r++;
-		if(c <= Rune1)
+		if(c <= Rune1){
 			nb++;
-		else
-		if(c <= Rune2)
-			nb += 2;
-		else
-			nb += 3;
+		} else {
+			for(i = 2; i < UTFmax + 1; i++)
+				if(c <= RuneX(i) || i == UTFmax){
+					nb += i;
+					break;
+				}
+		}
 	}
 	return nb;
 }
@@ -148,15 +151,16 @@ runenlen(Rune *r, int nrune)
 int
 fullrune(char *str, int n)
 {
-	int c;
-
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	int  i;
+	Rune c;
+
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	for(i = 3; i < UTFmax + 1; i++)
+		if(c < T(i))
+			return n >= i - 1;
+	return n >= UTFmax;
 }

File diff suppressed because it is too large
+ 551 - 553
sys/src/libc/port/runebase.c


File diff suppressed because it is too large
+ 2 - 1177
sys/src/libc/port/runetype.c


File diff suppressed because it is too large
+ 1641 - 0
sys/src/libc/port/runetypebody-6.2.0.h