Browse Source

Plan 9 from Bell Labs 2012-05-12

David du Colombier 12 years ago
parent
commit
2c35c532a3
56 changed files with 14123 additions and 39 deletions
  1. 35 19
      sys/man/8/9boot
  2. 6 6
      sys/man/8/booting
  3. 7 4
      sys/man/8/prep
  4. 10 1
      sys/src/9/mkfile
  5. 11 6
      sys/src/9/pc/l.s
  6. 33 0
      sys/src/9/pcboot/bindpc
  7. 64 0
      sys/src/9/pcboot/boot
  8. 578 0
      sys/src/9/pcboot/bootld.c
  9. 139 0
      sys/src/9/pcboot/bootmkfile
  10. 72 0
      sys/src/9/pcboot/callsmkfile
  11. 96 0
      sys/src/9/pcboot/cga.tiny.c
  12. 559 0
      sys/src/9/pcboot/conf.c
  13. 529 0
      sys/src/9/pcboot/dat.h
  14. 804 0
      sys/src/9/pcboot/devbios.c
  15. 402 0
      sys/src/9/pcboot/dir.c
  16. 413 0
      sys/src/9/pcboot/diskload.c
  17. 608 0
      sys/src/9/pcboot/dosboot.c
  18. 156 0
      sys/src/9/pcboot/dosfs.h
  19. 314 0
      sys/src/9/pcboot/expand.c
  20. 4 0
      sys/src/9/pcboot/expand.h
  21. 63 0
      sys/src/9/pcboot/fns.h
  22. 103 0
      sys/src/9/pcboot/fs.c
  23. 11 0
      sys/src/9/pcboot/inflate.c
  24. 180 0
      sys/src/9/pcboot/inflate.guts.c
  25. 141 0
      sys/src/9/pcboot/iso9660.h
  26. 1 0
      sys/src/9/pcboot/ktzero.s
  27. 663 0
      sys/src/9/pcboot/l.s
  28. 248 0
      sys/src/9/pcboot/l16r.s
  29. 212 0
      sys/src/9/pcboot/l32p.s
  30. 549 0
      sys/src/9/pcboot/l32v.s
  31. 36 0
      sys/src/9/pcboot/l64p.s
  32. 322 0
      sys/src/9/pcboot/ldecomp.s
  33. 37 0
      sys/src/9/pcboot/load
  34. 37 0
      sys/src/9/pcboot/loadusb
  35. 737 0
      sys/src/9/pcboot/main.c
  36. 42 0
      sys/src/9/pcboot/mboot.s
  37. 11 0
      sys/src/9/pcboot/mbootstart.s
  38. 213 0
      sys/src/9/pcboot/mem.h
  39. 600 0
      sys/src/9/pcboot/memory.c
  40. 144 0
      sys/src/9/pcboot/mkfile
  41. 995 0
      sys/src/9/pcboot/mmu.c
  42. 38 0
      sys/src/9/pcboot/multiboot.c
  43. 9 0
      sys/src/9/pcboot/no-inflate.c
  44. 582 0
      sys/src/9/pcboot/parts.c
  45. 94 0
      sys/src/9/pcboot/pxe.h
  46. 1235 0
      sys/src/9/pcboot/pxeload.c
  47. 96 0
      sys/src/9/pcboot/rand.c
  48. 93 0
      sys/src/9/pcboot/realmode.c
  49. 272 0
      sys/src/9/pcboot/realmode0.s
  50. 182 0
      sys/src/9/pcboot/sdbios.c
  51. 400 0
      sys/src/9/pcboot/stub.c
  52. 767 0
      sys/src/9/pcboot/trap.c
  53. 12 0
      sys/src/9/pcboot/unbindpc
  54. 30 0
      sys/src/9/pcboot/warp64.c
  55. 125 0
      sys/src/9/pcboot/words
  56. 3 3
      sys/src/boot/pc/mkfile

+ 35 - 19
sys/man/8/9boot

@@ -78,21 +78,27 @@ at location
 .B 0x7C00
 (31K).
 .I 9boot
-begins execution at virtual address
-.B 0x80007C00
-and reads a
+reads a
 .IR plan9.ini (8)
 file from
 .B /cfg/pxe
 via PXE,
 then loads the named
 .I bootfile
-via PXE,
-using each ethernet in sequence,
+via TFTP,
+trying each ethernet in sequence,
 at the entry address specified by the header,
 usually virtual
 .BR 0xF0100020 .
-After loading, control is passed to the entry location.
+After loading,
+.I 9boot
+creates a GNU Multiboot header in low memory for
+the benefit of the loaded kernel
+and
+control is passed to the entry location.
+So far, only
+.B amd64
+kernels expect Multiboot headers.
 .PP
 .I 9load
 is a similar bootstrap program,
@@ -132,10 +138,10 @@ detects whether the BIOS supports LBA and
 acts appropriately, defaulting to CHS mode
 when LBA is not present.
 The PBSs cannot do this due to code size limitations.
-The Plan 9 MBR is suitable for booting non-Plan 9
+The Plan 9 MBR is suitable for booting non-Plan-9
 operating systems,
 and (modulo the large disk constraints just described)
-non-Plan 9 MBRs are suitable for booting Plan 9.
+non-Plan-9 MBRs are suitable for booting Plan 9.
 .PP
 .I 9load
 begins execution at virtual address
@@ -313,7 +319,7 @@ can be located via a DHCP server,
 where
 .I ether
 is the lower-case MAC address of a recognised ethernet adapter,
-the contents are obtained and used as a
+the contents are obtained by TFTP and used as a
 .IR plan9.ini .
 .PP
 .I 9load
@@ -326,7 +332,7 @@ In order to find configuration information,
 searches all units on devices
 .\" .BR fd
 .\" and
-.BI sd Cn \fR
+.B sd?[0-9]*
 (all
 .B sd
 devices),
@@ -391,7 +397,7 @@ If there was no
 option,
 .I 9load
 searches
-.BI sd Cn \fR
+.B sd?[0-9]*
 FAT partitions for a kernel
 (any file named
 .B 9pc*
@@ -455,6 +461,14 @@ may be read for NVRAM contents or
 or
 .IR kfs (4)
 file systems can be mounted as the root file system.
+On ISO 9660 CDs,
+.I 9load
+treats the contents of a file named
+.B /bootdisk.img
+as a 
+.B 9fat
+partition,
+and it is assumed to contain the image of a FAT file system.
 A more extensive partitioning is typically done by
 .I fdisk
 and
@@ -469,7 +483,7 @@ or
 cannot parse partition tables,
 as it lacks disk drivers, so add
 .L readparts=
-to your
+to the machine's
 .B /cfg/pxe
 file, per
 .IR plan9.ini (8),
@@ -495,25 +509,27 @@ for information on ensuring this.
 these programs reside here
 .TP
 .BI /cfg/pxe
-directory of
-.I plan9.ini
+directory of configuration
+.RI ( plan9.ini )
 files on your TFTP server
 .PP
 .IB "FAT-filesystem" :\e9load
 .br
 .IB "FAT-filesystem" :\eplan9.ini
-.br
-.IB "FAT-filesystem" :\eplan9\eplan9.ini
+.\" .br
+.\" .IB "FAT-filesystem" :\eplan9\eplan9.ini
 .SH SOURCE
-.TF /sys/src/9/pcboot
+.TF "/sys/src/9/^(pc port ip)"
 .PD 0
 .TP
 .B /sys/src/boot/pc
-first-stage disk boot sectors
+first-stage disk boot sectors (MBR, PBS)
 .TP
 .B /sys/src/9/pcboot
+PC-bootstrap-specific source
 .TP
-.B /sys/src/9/pc
+.B /sys/src/9/^(pc port ip)
+common kernel source
 .PD
 .SH "SEE ALSO"
 .IR cons (3),

+ 6 - 6
sys/man/8/booting

@@ -24,7 +24,7 @@ reboots the machine;
 other methods of rebooting are mentioned for some machines.
 .SS PCs
 To boot a PC, it is necessary to get
-.B /386/9load
+.B /386/9boot
 or
 .B /386/9load
 loaded into memory.
@@ -36,14 +36,14 @@ will load
 .B 9load
 when the PC is reset or powered on.
 Other methods are described in 
-.IR 9load (8).
-.I 9load
+.IR 9boot (8).
+.I 9boot
 or
 .I 9load
 then locates and loads a Plan 9 kernel, using configuration information
 from the matching file in
 .B /cfg/pxe
-.RI ( 9load )
+.RI ( 9boot )
 or the file
 .B plan9.ini
 stored in the
@@ -51,7 +51,7 @@ stored in the
 configuration partition or on a DOS file system
 .RI ( 9load ).
 See 
-.IR 9load (8)
+.IR 9boot (8)
 for details.
 .PP
 Once the kernel is booted, it behaves like the others.
@@ -280,7 +280,7 @@ when reset.
 .
 .SH "SEE ALSO"
 .IR ndb (6),
-.IR 9load (8),
+.IR 9boot (8),
 .IR boot (8),
 .IR init (8),
 .IR plan9.ini (8)

+ 7 - 4
sys/man/8/prep

@@ -165,7 +165,7 @@ A
 .IR swap (8)
 swap partition.
 .PD
-.PP
+.SS "fdisk and prep"
 .I Fdisk
 edits the PC partition table and is usually
 invoked with a disk like
@@ -478,7 +478,7 @@ Set the partition type.  If it is not given,
 .I fdisk
 will display a list of choices and then prompt for it.
 .PD
-.PP
+.SS "format and pbs"
 .I Format
 prepares for use the disk partition or the floppy diskette in the file named
 .IR disk ,
@@ -617,7 +617,7 @@ option prints debugging information.
 The file
 .B /386/pbs
 is an example of a suitable
-.I bfile
+.I bootblock
 to make the disk a boot disk.
 It gets loaded by the BIOS at 0x7C00,
 reads the first sector of the
@@ -632,7 +632,9 @@ The file
 .B /386/pbslba
 is similar, but because it uses LBA addressing (not supported
 by older BIOSes), it can access more than the first 8.5GB of the disk.
-.PP
+.B /386/pbsraw
+is suitable for CDs.
+.SS mbr
 .I Mbr
 installs a new boot block in sector 0 (the master boot record)
 of a disk such as
@@ -733,6 +735,7 @@ assembler source for
 .IR sd (3),
 .IR usb (4),
 .IR 9boot (8),
+.IR mk9660 (8),
 .IR mkusbboot (8),
 .IR partfs (8)
 .SH BUGS

+ 10 - 1
sys/src/9/mkfile

@@ -13,15 +13,24 @@ all:V:
 		cd $i
 		mk
 	}
+	# build pc boots last
+	@{ cd pc; mk clean }
+	@{ cd pcboot; mk }
+pcboot:V:
+	@{ cd pc; mk clean }
+	@{ cd pcboot; mk }
 
 installall install:V:
 	for(i in $ARCH) @{
 		cd $i
 		mk install
 	}
+	@{ cd pc; mk clean }
+	@{ cd pcboot; mk install }
+	@{ cd pc; mk clean }
 
 clean:V:
-	for(i in $ARCH) @{
+	for(i in $ARCH pcboot) @{
 		cd $i
 		mk clean
 	}

+ 11 - 6
sys/src/9/pc/l.s

@@ -756,8 +756,13 @@ _aamloop:
  * FNxxx variations) so WAIT instructions must be explicitly placed in the
  * code as necessary.
  */
-#define	FPOFF							 ;\
+#define	FPOFF(l)						 ;\
+	MOVL	CR0, AX 					 ;\
+	ANDL	$0xC, AX			/* EM, TS */	 ;\
+	CMPL	AX, $0x8					 ;\
+	JEQ 	l						 ;\
 	WAIT							 ;\
+l:								 ;\
 	MOVL	CR0, AX						 ;\
 	ANDL	$~0x4, AX			/* EM=0 */	 ;\
 	ORL	$0x28, AX			/* NE=1, TS=1 */ ;\
@@ -769,7 +774,7 @@ _aamloop:
 	MOVL	AX, CR0
 	
 TEXT fpoff(SB), $0				/* disable */
-	FPOFF
+	FPOFF(l1)
 	RET
 
 TEXT fpinit(SB), $0				/* enable and init */
@@ -785,9 +790,9 @@ TEXT fpinit(SB), $0				/* enable and init */
 	RET
 
 TEXT fpx87save(SB), $0				/* save state and disable */
-	MOVL	p+0(FP), AX			/* points to pointer */
+	MOVL	p+0(FP), AX
 	FSAVE	0(AX)				/* no WAIT */
-	FPOFF
+	FPOFF(l2)
 	RET
 
 TEXT fpx87restore(SB), $0			/* enable and restore state */
@@ -809,13 +814,13 @@ TEXT fpenv(SB), $0				/* save state without waiting */
 TEXT fpclear(SB), $0				/* clear pending exceptions */
 	FPON
 	FCLEX					/* no WAIT */
-	FPOFF
+	FPOFF(l3)
 	RET
 
 TEXT fpssesave0(SB), $0				/* save state and disable */
 	MOVL	p+0(FP), AX
 	FXSAVE					/* no WAIT */
-	FPOFF
+	FPOFF(l4)
 	RET
 
 TEXT fpsserestore0(SB), $0			/* enable and restore state */

+ 33 - 0
sys/src/9/pcboot/bindpc

@@ -0,0 +1,33 @@
+#!/bin/rc
+# bindpc pfx sfx - bind files from ../pc into .
+# creating lots of stubs with aux/stub bogs down eventually, alas.
+#
+# pfx is the name prefix of the configuration files to duplicate and
+# sfx the suffix appended to each duplicate; both may be omitted.
+
+# presumably etherigbe.c only shows up here once ../pc has been bound,
+# so its presence means there is nothing left to do - TODO confirm
+if (test -e etherigbe.c)
+	exit ''
+
+rfork e
+# name of this directory, re-addressed via its parent
+thisboot=`{basename `{pwd}}
+boot=../$thisboot
+echo bindpc $*
+pfx=$1
+sfx=$2
+
+# mostly for mk clean or *.clean
+if (~ $#pfx 0)
+	pfx=''
+if (~ $#sfx 0)
+	sfx=''
+
+bind -bc $boot $boot
+
+# duplicate $pfx as $pfx$sfx
+# candidates: names starting with $pfx, excluding any containing a dot,
+# containing "mkfile", or already ending in $sfx
+cfgs=`{ls -d $pfx^* | grep -v '\.|mkfile|'^$sfx^'$'}
+# make empty stub files to serve as mount points for the duplicates
+aux/stub -d /tmp/blank
+for (f in $cfgs)
+	aux/stub /tmp/blank/$f$sfx
+bind -a /tmp/blank .
+for (f in $cfgs)
+	bind $f $f$sfx
+# finally make the ../pc sources visible after this directory's own files
+bind -a ../pc $boot
+
+# bind systab.h ../port/systab.h
+exit ''

+ 64 - 0
sys/src/9/pcboot/boot

@@ -0,0 +1,64 @@
+# boot - 9pxeload (not 9load) as a variant of 9pccpu.
+#	compressed text + data segments must fit in 1st 512K; currently 393K.
+dev
+	root
+	cons
+	arch
+	rtc
+
+	ether		netif
+	ip		arp chandial ip ipaux ipv6 iproute netlog nullmedium pktmedium ptclbsum386 inferno
+
+	uart
+
+link
+# order of ethernet drivers should match that in ../pc/pc so that
+# devices are detected in the same order by bootstraps and kernels
+# and thus given the same controller numbers.
+	ether2000	ether8390
+	ether2114x	pci
+	ether589	etherelnk3
+	ether79c970	pci
+	ether8003	ether8390
+	ether8139	pci
+	ether8169	pci ethermii
+	ether82543gc	pci
+	ether82563	pci
+	ether82557	pci
+	ether83815	pci
+	etherdp83820	pci
+	etherec2t	ether8390
+	etherelnk3	pci
+	etherga620	pci
+	etherigbe	pci ethermii
+	ethervgbe	pci ethermii
+	ethervt6102	pci ethermii
+	ethervt6105m	pci ethermii
+#	ethersink
+	ethersmc	devi82365 cis
+	etherwavelan	wavelan devi82365 cis pci
+	ethermedium
+	etherm10g
+	ether82598	pci
+
+misc
+	bootld
+	conf
+	dir
+	nomtrr
+	no-inflate
+	pxeload
+	rand
+	stub
+	uarti8250
+
+ip
+	udp
+# would need tcp to read via 9p
+	ipifc
+	icmp
+	icmp6
+
+port
+	int cpuserver = 1;
+	char hellomsg[] = "\nPlan 9 from Bell Labs PXE boot loader";

+ 578 - 0
sys/src/9/pcboot/bootld.c

@@ -0,0 +1,578 @@
+/*
+ * load kernel into memory
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/netif.h"
+#include	"../ip/ip.h"
+#include	"pxe.h"
+
+#include	<a.out.h>
+#include 	"/sys/src/libmach/elf.h"
+
+#undef KADDR
+#undef PADDR
+// #define PADDR(a)	(paddr)((void *)(a))
+
+#define KADDR(a)	((void*)((ulong)(a) | KZERO))
+#define PADDR(a)	((ulong)(a) & ~KSEGM)
+
+extern int debug;
+extern void pagingoff(ulong);
+
+static uchar elfident[7] = {
+	'\177', 'E', 'L', 'F', '\1', '\1', '\1'
+};
+static Ehdr ehdr, rehdr;
+static Phdr *phdr;
+static int curphdr;
+static ulong curoff;
+static ulong elftotal;
+static long (*swal)(long);
+static ushort (*swab)(ushort);
+
+/*
+ * beswab - read the two bytes of s, in memory order, as a big-endian
+ * short: the byte at the lower address is the most significant.
+ */
+ushort
+beswab(ushort s)
+{
+	uchar *b;
+	int hi, lo;
+
+	b = (uchar*)&s;
+	hi = b[0]<<8;
+	lo = b[1];
+	return hi | lo;
+}
+
+/*
+ * beswal - read the first four bytes of l, in memory order, as a
+ * big-endian long: the byte at the lowest address is the most significant.
+ */
+long
+beswal(long l)
+{
+	uchar *b;
+	int b0, b1, b2, b3;
+
+	b = (uchar*)&l;
+	b0 = b[0]<<24;
+	b1 = b[1]<<16;
+	b2 = b[2]<<8;
+	b3 = b[3];
+	return b0 | b1 | b2 | b3;
+}
+
+/*
+ * beswav - read the eight bytes of v, in memory order, as a
+ * big-endian vlong: the byte at the lowest address is the most significant.
+ */
+uvlong
+beswav(uvlong v)
+{
+	int i;
+	uchar *b;
+	uvlong r;
+
+	b = (uchar*)&v;
+	r = 0;
+	/* shift in one byte at a time, most significant first */
+	for(i = 0; i < 8; i++)
+		r = r<<8 | b[i];
+	return r;
+}
+
+/*
+ * leswab - read the two bytes of s, in memory order, as a little-endian
+ * short: the byte at the lower address is the least significant.
+ */
+ushort
+leswab(ushort s)
+{
+	uchar *b;
+	int hi, lo;
+
+	b = (uchar*)&s;
+	lo = b[0];
+	hi = b[1]<<8;
+	return hi | lo;
+}
+
+/*
+ * leswal - read the first four bytes of l, in memory order, as a
+ * little-endian long: the byte at the lowest address is the least significant.
+ */
+long
+leswal(long l)
+{
+	uchar *b;
+	int b0, b1, b2, b3;
+
+	b = (uchar*)&l;
+	b0 = b[0];
+	b1 = b[1]<<8;
+	b2 = b[2]<<16;
+	b3 = b[3]<<24;
+	return b3 | b2 | b1 | b0;
+}
+
+/*
+ * hswal - convert a header to canonical form by replacing each of
+ * the n longs starting at lp with the result of swap applied to it.
+ */
+static void
+hswal(long *lp, int n, long (*swap) (long))
+{
+	for(; n != 0; n--, lp++)
+		*lp = swap(*lp);
+}
+
+/*
+ * readehdr - validate the ELF header accumulated in ehdr, convert its
+ * fields using the byte order named in the ident bytes, and set b up
+ * to read the rest of the file through the end of the program header
+ * table into a freshly allocated buffer (state READPHDR).
+ * An unconverted copy of the header is kept in rehdr.
+ * Returns 1 on success, 0 if the header is not acceptable.
+ */
+static int
+readehdr(Boot *b)
+{
+	int i;
+
+	/* byte-swap the header according to the DATA format */
+	if(ehdr.ident[CLASS] != ELFCLASS32) {
+		print("bad ELF class - not 32 bit\n");
+		return 0;
+	}
+	if(ehdr.ident[DATA] == ELFDATA2LSB) {
+		swab = leswab;
+		swal = leswal;
+	} else if(ehdr.ident[DATA] == ELFDATA2MSB) {
+		swab = beswab;
+		swal = beswal;
+	} else {
+		print("bad ELF encoding - not big or little endian\n");
+		return 0;
+	}
+	memmove(&rehdr, &ehdr, sizeof(Ehdr));
+
+	ehdr.type = swab(ehdr.type);
+	ehdr.machine = swab(ehdr.machine);
+	ehdr.version = swal(ehdr.version);
+	ehdr.elfentry = swal(ehdr.elfentry);
+	ehdr.phoff = swal(ehdr.phoff);
+	ehdr.shoff = swal(ehdr.shoff);
+	ehdr.flags = swal(ehdr.flags);
+	ehdr.ehsize = swab(ehdr.ehsize);
+	ehdr.phentsize = swab(ehdr.phentsize);
+	ehdr.phnum = swab(ehdr.phnum);
+	ehdr.shentsize = swab(ehdr.shentsize);
+	ehdr.shnum = swab(ehdr.shnum);
+	ehdr.shstrndx = swab(ehdr.shstrndx);
+	if(ehdr.type != EXEC || ehdr.version != CURRENT)
+		return 0;
+	if(ehdr.phentsize != sizeof(Phdr))
+		return 0;
+	/*
+	 * the buffer allocated below starts at file offset sizeof(Ehdr)
+	 * and phdr is located relative to it, so a table claimed to
+	 * start inside the ELF header would put phdr before the buffer.
+	 */
+	if(ehdr.phoff < sizeof(Ehdr)){
+		print("bad ELF program header offset\n");
+		return 0;
+	}
+
+	if(debug)
+		print("readehdr OK entry %#lux\n", ehdr.elfentry);
+
+	/* bytes still to read: from end of Ehdr to end of the phdr table */
+	curoff = sizeof(Ehdr);
+	i = ehdr.phoff+ehdr.phentsize*ehdr.phnum - curoff;
+	b->state = READPHDR;
+	b->bp = (char*)smalloc(i);
+	b->wp = b->bp;
+	b->ep = b->wp + i;
+	phdr = (Phdr*)(b->bp + ehdr.phoff-sizeof(Ehdr));
+	if(debug)
+		print("phdr...");
+
+	return 1;
+}
+
+/*
+ * nextphdr - advance to the next LOAD program header, starting at
+ * curphdr, and point b at where the following input bytes must go:
+ * a scratch buffer for padding that precedes the segment in the file
+ * (state READEPAD), or the segment's load address (state READEDATA).
+ * When the headers are exhausted, record the entry point in b->hdr
+ * and switch to TRYEBOOT.
+ * Returns 1 on success; 0 if a segment lies before the current input
+ * offset or if the headers ran out with curphdr still 0.
+ */
+static int
+nextphdr(Boot *b)
+{
+	Phdr *php;
+	ulong entry, offset;
+	char *physaddr;
+
+	if(debug)
+		print("nextphdr %d\n", curphdr);
+
+	for(; curphdr < ehdr.phnum; curphdr++){
+		php = phdr+curphdr;
+		if(php->type != LOAD)
+			continue;
+		offset = php->offset;
+		physaddr = (char*)KADDR(PADDR(php->paddr));
+		if(offset < curoff){
+			/*
+			 * Can't (be bothered to) rewind the
+			 * input, it might be from tftp. If we
+			 * did then we could boot FreeBSD kernels
+			 * too maybe.
+			 */
+			return 0;
+		}
+		if(offset > curoff){
+			/* swallow the gap between curoff and the segment */
+			b->state = READEPAD;
+			b->bp = (char*)smalloc(offset - curoff);
+			b->wp = b->bp;
+			b->ep = b->wp + offset - curoff;
+			if(debug)
+				print("nextphdr %lud...\n", offset - curoff);
+			return 1;
+		}
+		/* input is positioned at the segment: read it into place */
+		b->state = READEDATA;
+		b->bp = physaddr;
+		b->wp = b->bp;
+		b->ep = b->wp+php->filesz;
+		print("%ud+", php->filesz);
+		elftotal += php->filesz;
+		if(debug)
+			print("nextphdr %ud@%#p\n", php->filesz, physaddr);
+
+		return 1;
+	}
+
+	if(curphdr != 0){
+		print("=%lud\n", elftotal);
+		b->state = TRYEBOOT;
+		entry = ehdr.elfentry & ~KSEGM;
+		PLLONG(b->hdr.entry, entry);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * readepad - the padding ahead of the current program header's
+ * segment has been consumed; move the input offset to the start of
+ * that segment and let nextphdr set up the read.
+ */
+static int
+readepad(Boot *b)
+{
+	if(debug)
+		print("readepad %d\n", curphdr);
+	curoff = phdr[curphdr].offset;
+
+	return nextphdr(b);
+}
+
+/*
+ * readedata - the file contents of the current segment are in place;
+ * zero whatever part of its memory image exceeds the file image,
+ * account for the input consumed and go on to the next header.
+ */
+static int
+readedata(Boot *b)
+{
+	Phdr *seg;
+
+	seg = &phdr[curphdr];
+	if(debug)
+		print("readedata %d\n", curphdr);
+	if(seg->memsz > seg->filesz){
+		print("%lud",  seg->memsz-seg->filesz);
+		elftotal += seg->memsz-seg->filesz;
+		memset((char*)KADDR(PADDR(seg->paddr)+seg->filesz), 0,
+			seg->memsz-seg->filesz);
+	}
+	curphdr++;
+	curoff = seg->offset+seg->filesz;
+
+	return nextphdr(b);
+}
+
+/*
+ * readphdr - the program header table has been read; convert it a
+ * long at a time with swal, note that the input is now positioned
+ * just past the table and start on the first header.
+ */
+static int
+readphdr(Boot *b)
+{
+	ulong tblsize;
+
+	tblsize = ehdr.phentsize*ehdr.phnum;
+	hswal((long*)phdr, tblsize/sizeof(long), swal);
+	if(debug)
+		print("phdr curoff %lud vaddr %#lux paddr %#lux\n",
+			curoff, phdr->vaddr, phdr->paddr);
+
+	curphdr = 0;
+	curoff = ehdr.phoff+tblsize;
+
+	return nextphdr(b);
+}
+
+/*
+ * addbytes - copy as much as possible from the source cursor *sbuf
+ * (limit esbuf) to the destination cursor *dbuf (limit edbuf),
+ * advancing both cursors.
+ * Returns 0 if the destination was already full on entry, -1 if the
+ * source was empty, otherwise the room remaining in the destination
+ * (which is 0 when this call filled it).
+ */
+static int
+addbytes(char **dbuf, char *edbuf, char **sbuf, char *esbuf)
+{
+	int room, avail;
+
+	room = edbuf - *dbuf;
+	if(room <= 0)
+		return 0;
+	avail = esbuf - *sbuf;
+	if(avail < room)
+		room = avail;
+	if(room <= 0)
+		return -1;
+
+	memmove(*dbuf, *sbuf, room);
+	*dbuf += room;
+	*sbuf += room;
+	return edbuf - *dbuf;
+}
+
+/*
+ * impulse - quiesce the machine; called by warp9 just before it
+ * leaves for the loaded kernel.  The steps are order-dependent:
+ * wait for console output, raise the priority level, detach the
+ * serial output queue, then shut down devices and interrupts.
+ */
+void
+impulse(void)
+{
+	delay(500);				/* drain uart */
+	splhi();
+
+	/* turn off buffered serial console */
+	serialoq = nil;
+
+	/* shutdown devices */
+	chandevshutdown();
+	arch->introff();
+}
+
+/*
+ * prstackuse - debugging aid: scan the current process's kernel
+ * stack upward from its base for the first word that no longer holds
+ * the Stkpat fill pattern, and print the used and untouched sizes.
+ * (presumably the stack is filled with Stkpat when allocated - TODO confirm)
+ */
+void
+prstackuse(int)
+{
+	char *bottom, *limit;
+	ulong *wp, pat;
+
+	pat = Stkpat<<24 | Stkpat<<16 | Stkpat<<8 | Stkpat;
+	bottom = up->kstack;
+	limit = up->kstack + KSTACK - (sizeof(Sargs) + BY2WD);
+	wp = (ulong *)bottom;
+	while((char *)wp < limit && *wp == pat)
+		wp++;
+	print("proc stack: used %ld bytes, %ld left (stack pattern)\n",
+		limit - (char *)wp, (char *)wp - bottom);
+}
+
+/*
+ * this code is simplified from reboot().  It doesn't need to relocate
+ * the new kernel nor deal with other processors.
+ *
+ * entry is the entry address of the loaded kernel; PADDR is applied
+ * to it here, so callers may pass either form of the address.
+ */
+void
+warp9(ulong entry)
+{
+//	prstackuse(0);			/* debugging */
+	impulse();
+
+	/* get out of KZERO space, turn off paging and jump to entry */
+	pagingoff(PADDR(entry));
+}
+
+/*
+ * bootpass - feed the next nbuf bytes of a kernel image, at vbuf,
+ * to the loading state machine b.
+ *
+ * Input is copied to wherever the current state wants it
+ * (b->wp up to b->ep); each time that region fills, the state is
+ * advanced.  Plan 9 a.out (I_MAGIC, S_MAGIC), 32-bit ELF and gzipped
+ * images are recognised from the first sizeof(Exechdr) bytes.
+ *
+ * nbuf == 0 means end of input: a completely read kernel is then
+ * started via warp9 or warp64 (a gzipped one is uncompressed first).
+ *
+ * Returns MORE if more input is wanted, ENOUGH once no more input is
+ * needed, and FAIL on error; every failure leaves b->state == FAILED
+ * so that later calls also return FAIL.
+ */
+int
+bootpass(Boot *b, void *vbuf, int nbuf)
+{
+	char *buf, *ebuf;
+	uchar *sdata;
+	Exechdr *hdr;
+	ulong entry, pentry, text, data, bss, magic;
+	uvlong entry64;
+
+	if(b->state == FAILED)
+		return FAIL;
+
+	if(nbuf == 0)
+		goto Endofinput;
+
+	buf = vbuf;
+	ebuf = buf+nbuf;
+	/* addbytes returns 0 when the current destination is full */
+	while(addbytes(&b->wp, b->ep, &buf, ebuf) == 0) {
+		switch(b->state) {
+		case INITKERNEL:
+			b->state = READEXEC;
+			b->bp = (char*)&b->hdr;
+			b->wp = b->bp;
+			b->ep = b->bp+sizeof(Exechdr);
+			break;
+		case READEXEC:
+			hdr = &b->hdr;
+			magic = GLLONG(hdr->magic);
+			if(magic == I_MAGIC || magic == S_MAGIC) {
+				pentry = PADDR(GLLONG(hdr->entry));
+				text = GLLONG(hdr->text);
+				data = GLLONG(hdr->data);
+				if (pentry < MB)
+					panic("kernel entry %#p below 1 MB",
+						pentry);
+				if (PGROUND(pentry + text) + data >
+				    MB + Kernelmax)
+					panic("kernel larger than %d bytes",
+						Kernelmax);
+				b->state = READ9TEXT;
+				b->bp = (char*)KADDR(pentry);
+				b->wp = b->bp;
+				b->ep = b->wp+text;
+
+				if(magic == I_MAGIC){
+					/* these header bytes were already read; they belong to the text */
+					memmove(b->bp, b->hdr.uvl, sizeof(b->hdr.uvl));
+					b->wp += sizeof(b->hdr.uvl);
+				}
+
+				print("%lud", text);
+			} else if(memcmp(b->bp, elfident, 4) == 0){
+				b->state = READEHDR;
+				b->bp = (char*)&ehdr;
+				b->wp = b->bp;
+				b->ep = b->wp + sizeof(Ehdr);
+				/* carry over the bytes already consumed as an Exechdr */
+				memmove(b->bp, &b->hdr, sizeof(Exechdr));
+				b->wp += sizeof(Exechdr);
+				print("elf...");
+			} else if(b->bp[0] == 0x1F && (uchar)b->bp[1] == 0x8B &&
+			    b->bp[2] == 0x08) {
+				b->state = READGZIP;
+				/* could use Unzipbuf instead of smalloc() */
+				b->bp = (char*)smalloc(Kernelmax);
+				b->wp = b->bp;
+				b->ep = b->wp + Kernelmax;
+				/* carry over the bytes already consumed as an Exechdr */
+				memmove(b->bp, &b->hdr, sizeof(Exechdr));
+				b->wp += sizeof(Exechdr);
+				print("gz...");
+			} else {
+				print("bad kernel format (magic %#lux)\n",
+					magic);
+				b->state = FAILED;
+				return FAIL;
+			}
+			break;
+
+		case READ9TEXT:
+			/* data is loaded at the first page boundary after the text */
+			hdr = &b->hdr;
+			b->state = READ9DATA;
+			b->bp = (char*)PGROUND((uintptr)
+				KADDR(PADDR(GLLONG(hdr->entry))) +
+				GLLONG(hdr->text));
+			b->wp = b->bp;
+			b->ep = b->wp + GLLONG(hdr->data);
+			print("+%ld", GLLONG(hdr->data));
+			break;
+
+		case READ9DATA:
+			/* clear the bss, which follows the data */
+			hdr = &b->hdr;
+			bss = GLLONG(hdr->bss);
+			memset(b->ep, 0, bss);
+			print("+%ld=%ld\n",
+				bss, GLLONG(hdr->text)+GLLONG(hdr->data)+bss);
+			b->state = TRYBOOT;
+			return ENOUGH;
+
+		case READEHDR:
+			if(!readehdr(b)){
+				print("readehdr failed\n");
+				b->state = FAILED;
+				return FAIL;
+			}
+			break;
+
+		case READPHDR:
+			if(!readphdr(b)){
+				b->state = FAILED;
+				return FAIL;
+			}
+			break;
+
+		case READEPAD:
+			if(!readepad(b)){
+				b->state = FAILED;
+				return FAIL;
+			}
+			break;
+
+		case READEDATA:
+			if(!readedata(b)){
+				b->state = FAILED;
+				return FAIL;
+			}
+			/* nextphdr switches to TRYEBOOT after the last segment */
+			if(b->state == TRYEBOOT)
+				return ENOUGH;
+			break;
+
+		case TRYBOOT:
+		case TRYEBOOT:
+		case READGZIP:
+			return ENOUGH;
+
+		case READ9LOAD:
+		case INIT9LOAD:
+			panic("9load");
+
+		default:
+			panic("bootstate");
+		}
+	}
+	return MORE;
+
+
+Endofinput:
+	/* end of input */
+	switch(b->state) {
+	case INITKERNEL:
+	case READEXEC:
+	case READ9TEXT:
+	case READ9DATA:
+	case READEHDR:
+	case READPHDR:
+	case READEPAD:
+	case READEDATA:
+		print("premature EOF\n");
+		b->state = FAILED;
+		return FAIL;
+
+	case TRYBOOT:
+		entry = GLLONG(b->hdr.entry);
+		magic = GLLONG(b->hdr.magic);
+		if(magic == I_MAGIC){
+			print("entry: %#lux\n", entry);
+			warp9(PADDR(entry));
+		}
+		else if(magic == S_MAGIC){
+			entry64 = beswav(b->hdr.uvl[0]);
+			warp64(entry64);
+		}
+		/* reached only if the kernel was not entered */
+		b->state = FAILED;
+		return FAIL;
+
+	case TRYEBOOT:
+		entry = GLLONG(b->hdr.entry);
+		if(ehdr.machine == I386){
+			print("entry: %#lux\n", entry);
+			warp9(PADDR(entry));
+		}
+		else if(ehdr.machine == AMD64){
+			print("entry: %#lux\n", entry);
+			warp64(entry);
+		}
+		/* reached only if the kernel was not entered */
+		b->state = FAILED;
+		return FAIL;
+
+	case READGZIP:
+		/* apparently the whole gzipped kernel is now at b->bp */
+		hdr = &b->hdr;
+		if(b->bp[0] != 0x1F || (uchar)b->bp[1] != 0x8B || b->bp[2] != 0x08)
+			print("lost magic\n");
+
+		print("%ld => ", b->wp - b->bp);
+		/* fill hdr from gzipped b->bp to get various sizes */
+		if(gunzip((uchar*)hdr, sizeof *hdr, (uchar*)b->bp, b->wp - b->bp)
+		    < sizeof *hdr) {
+			print("badly compressed kernel\n");
+			b->state = FAILED;
+			return FAIL;
+		}
+
+		magic = GLLONG(hdr->magic);
+		entry = GLLONG(hdr->entry);
+		text = GLLONG(hdr->text);
+		data = GLLONG(hdr->data);
+		bss = GLLONG(hdr->bss);
+		print("%lud+%lud+%lud=%lud\n", text, data, bss, text+data+bss);
+
+		/* fill entry from gzipped b->bp */
+		if(gunzip((uchar *)KADDR(PADDR(entry)) - sizeof(Exec),
+		     sizeof(Exec)+text+data,
+		     (uchar*)b->bp, b->wp-b->bp) < sizeof(Exec)+text+data) {
+			print("error uncompressing kernel\n");
+			b->state = FAILED;
+			return FAIL;
+		}
+		/* relocate data to start at page boundary */
+		sdata = KADDR(PADDR(entry+text));
+		memmove((void*)PGROUND((uintptr)sdata), sdata, data);
+
+		if(magic == I_MAGIC){
+			print("entry: %#lux\n", entry);
+			warp9(PADDR(entry));
+		}
+		else if(magic == S_MAGIC){
+			entry64 = beswav(hdr->uvl[0]);
+			warp64(entry64);
+		} else
+			print("bad magic %#lux\n", magic);
+		b->state = FAILED;
+		return FAIL;
+
+	case INIT9LOAD:
+	case READ9LOAD:
+		panic("end 9load");
+
+	default:
+		panic("bootdone");
+	}
+	b->state = FAILED;
+	return FAIL;
+}

+ 139 - 0
sys/src/9/pcboot/bootmkfile

@@ -0,0 +1,139 @@
+# make 9boot(pbs) and 9load(usb)
+# inherit KTZERO, START, MBOOT, CONF, BASE and SFX from mkfile
+x=`{bindpc $BASE $SFX}
+CONFLIST=$CONF
+EXTRACOPIES=piestand lookout bovril boundary 
+
+objtype=386
+</$objtype/mkfile
+p=9
+
+DEVS=`{rc ../port/mkdevlist $CONF}
+
+PORT=\
+	alarm.$O\
+	alloc.$O\
+	allocb.$O\
+	auth.$O\
+	chan.$O\
+	dev.$O\
+	edf.$O\
+	latin1.$O\
+	page.$O\
+	parse.$O\
+	pgrp.$O\
+	portclock.$O\
+	print.$O\
+	proc.$O\
+	qio.$O\
+	qlock.$O\
+	rdb.$O\
+	rebootcmd.$O\
+	segment.$O\
+	taslock.$O\
+	tod.$O\
+	xalloc.$O\
+
+OBJ=\
+	ktzero.$O\
+	$START\
+	l32p.$O\
+	l32v.$O\
+	$MBOOT\
+	l64p.$O\
+	realmode0.$O\
+	l.$O\
+	cga.$O\
+	fault.$O\
+	i8253.$O\
+	i8259.$O\
+	kbd.$O\
+	main.$O\
+	memory.$O\
+	mmu.$O\
+	multiboot.$O\
+	trap.$O\
+	warp64.$O\
+	$CONF.root.$O\
+	$CONF.rootc.$O\
+	$DEVS\
+	$PORT\
+
+LIB=\
+	/$objtype/lib/libflate.a\
+	/$objtype/lib/libip.a\
+	/$objtype/lib/libc.a\
+
+ETHER=`{echo devether.c ether*.c | sed 's/\.c/.'$O'/g'}
+
+$p$CONF:Q:	$CONF.c $OBJ $LIB
+	$CC $CFLAGS '-DKERNDATE='`{date -n} $CONF.c
+	echo linking bootstraps
+	$LD -o $target -H3 -T$KTZERO -l $OBJ $CONF.$O $LIB	# dos .com
+	$LD -o $target^debug -T$KTZERO -l $OBJ $CONF.$O $LIB
+#	$LD -o $target.exe -H4 -T$KTZERO -l $OBJ $CONF.$O $LIB	# dos .exe
+#	$LD -o $target.elf -H5 -R4096 -T$KTZERO -l $OBJ $CONF.$O $LIB
+	size $target^debug
+
+# don't strip the gzipped kernels -- too frustrating when that's all you have!
+$p%.gz:D:	$p%
+	gzip -9 <$p$stem >$p$stem.gz
+
+install:V:	$p$CONF $p${CONF}debug		# $p$CONF.elf
+	cp $p$CONF $p$CONF^debug /$objtype/ &
+	for(i in $EXTRACOPIES)
+		{ 9fs $i && cp $p$CONF $p$CONF^debug /n/$i/$objtype &&
+			echo -n $i... & }
+	wait
+	echo
+
+<../port/portmkfile
+<|../port/mkbootrules $CONF
+
+%.$O:		/$objtype/include/ureg.h /sys/include/pool.h ../port/netif.h
+%.$O:		../port/sd.h ../ip/ip.h dosfs.h pxe.h
+$ETHER: 	etherif.h ethermii.h
+ether2000.$O ether8003.$O ether8390.$O etherec2t.$O: ether8390.h
+l.$O l16r.$O l32p.$O l32v.$O l64p.$O: /sys/src/boot/pc/x16.h
+sd%.$O:		/sys/include/disk.h
+auth.$O:	/sys/include/authsrv.h
+bootld.$O:	/sys/include/a.out.h /sys/src/libmach/elf.h
+devcons.$O:	/sys/include/authsrv.h
+devfloppy.$O:	floppy.h
+inflate.$O:	/sys/include/flate.h
+main.$O:	reboot.h
+l16r.$O mbootstart.$O:	mboot.s
+sd53c8xx.$O:	sd53c8xx.i
+sdiahci.$O:	ahci.h
+trap.$O:	/sys/include/tos.h
+
+init.h:
+	>$target
+
+reboot.h:D:	rebootcode.s
+	$AS rebootcode.s
+	$LD -l -s -T0x11000 -R4 -o reboot.out rebootcode.$O
+	{echo 'uchar rebootcode[]={'
+	 xd -1x reboot.out |
+		sed -e '1,2d' -e 's/^[0-9a-f]+ //' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g'
+	 echo '};'} > reboot.h
+
+acid:V:
+	8c -a -w -I. i8253.c>acid
+
+<callsmkfile
+
+%.checkether:VQ:
+	for (i in ether*.c){
+		x=`{echo $i | sed 's/\.c//'}
+		if(! ~ $x ether8390 && ! grep -s '^	'^$x^'([ 	]|$)' $stem)
+			echo $x not included in $stem
+	}
+	exit 0
+
+%.clean:V:
+	@ {
+		rfork n
+		unmount .
+		rm -f $stem.c [9bz]$stem [9bz]$stem.gz boot$stem.* reboot.h apbootstrap.h 9boot* 9load*
+	}

+ 72 - 0
sys/src/9/pcboot/callsmkfile

@@ -0,0 +1,72 @@
+PORTC=\
+	../port/alarm.c\
+	../port/alloc.c\
+	../port/allocb.c\
+	auth.c\
+	chan.c\
+	conf.c\
+	../port/dev.c\
+	../port/devroot.c\
+	../port/devuart.c\
+	../port/edf.c\
+	../port/latin1.c\
+	../port/netif.c\
+	../port/page.c\
+	../port/parse.c\
+	../port/pgrp.c\
+	../port/portclock.c\
+	../port/print.c\
+	../port/proc.c\
+	../port/qio.c\
+	../port/qlock.c\
+	../port/rdb.c\
+	../port/rebootcmd.c\
+	../port/segment.c\
+	../port/taslock.c\
+	../port/tod.c\
+	../port/xalloc.c\
+
+IPC=../ip/arp.c\
+	../ip/chandial.c\
+	../ip/devip.c\
+	../ip/ethermedium.c\
+	../ip/icmp.c\
+	../ip/icmp6.c\
+	../ip/inferno.c\
+	../ip/ip.c\
+	../ip/ipaux.c\
+	../ip/ipifc.c\
+	../ip/ipmux.c\
+	../ip/iproute.c\
+	../ip/ipv6.c\
+	../ip/loopbackmedium.c\
+	../ip/netdevmedium.c\
+	../ip/netlog.c\
+	../ip/ptclbsum.c\
+	../ip/udp.c\
+
+# print it in landscape orientation; it's way more useful than 80 columns
+disk.calls:D: main.c $CONF.c $PORTC \
+	boot.c cga.c devarch.c devcons.c devrtc.c \
+	diskload.c fault.c i8253.c i8259.c \
+	kbd.c memory.c mmu.c multiboot.c pci.c \
+	stub.c trap.c uarti8250.c \
+	warp64.c $CONF.rootc.c			#  ${DEVS:%.$O=%.c}
+	calls -w 132 -f main -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >main.$target
+	calls -w 132 -f bootloadproc -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >bootloadproc.$target
+	calls -w 132 -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >$target
+pxe.calls:D: main.c $CONF.c $IPC $PORTC \
+	boot.c cga.c devarch.c devcons.c devether.c devrtc.c \
+	ethermii.c ether82563.c fault.c i8253.c i8259.c \
+	kbd.c memory.c mmu.c multiboot.c pci.c \
+	pxeload.c rand.c stub.c trap.c uarti8250.c \
+	warp64.c $CONF.rootc.c			#  ${DEVS:%.$O=%.c}
+	calls -w 132 -f main -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >main.$target
+	calls -w 132 -f bootloadproc -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >bootloadproc.$target
+	calls -w 132 -I. -I../pc -I../port -I../ip $prereq |
+		grep -v '\[external\]' >$target

+ 96 - 0
sys/src/9/pcboot/cga.tiny.c

@@ -0,0 +1,96 @@
+#include <u.h>
+#include <libc.h>
+#include "expand.h"
+
+enum {
+	Black,
+	Blue,
+	Green,
+	Cyan,
+	Red,
+	Magenta,
+	Brown,
+	Grey,
+
+	Bright = 0x08,
+	Blinking = 0x80,
+
+	Yellow = Bright|Brown,
+	White = Bright|Grey,
+};
+	
+enum {
+	Width = 80*2,
+	Height = 25,
+	Attr = (Black<<4)|Grey,
+};
+
+#define cga	((uchar*)0xB8000)
+int cgapos;
+
+/*
+ * cgaregr - read a CGA controller register: write its index to port
+ * 0x3D4, then fetch the value from the following port.
+ */
+static uchar
+cgaregr(int index)
+{
+	int v;
+
+	outb(0x3D4, index);
+	v = inb(0x3D4+1);
+	return v & 0xFF;
+}
+
+/*
+ * cgaregw - write data to a CGA controller register: write the
+ * register's index to port 0x3D4, then the value to the following port.
+ */
+static void
+cgaregw(int index, int data)
+{
+	outb(0x3D4, index);		/* select register */
+	outb(0x3D4+1, data);		/* store value */
+}
+
+/*
+ * movecursor - set the hardware cursor to the character cell at
+ * cgapos (cgapos counts bytes, two per cell) and give that cell the
+ * default attribute.
+ */
+static void
+movecursor(void)
+{
+	int cell;
+
+	cell = cgapos/2;
+	cgaregw(0x0E, (cell>>8) & 0xFF);	/* high byte of position */
+	cgaregw(0x0F, cell & 0xFF);		/* low byte of position */
+	cga[cgapos+1] = Attr;
+}
+
+/*
+ * cgaputc - write character c to the CGA text screen at cgapos.
+ * Newline moves to the start of the next row, tab pads with blanks
+ * to the next multiple of 8 columns, backspace blanks the previous
+ * cell.  When the position runs off the bottom, the screen is
+ * scrolled up one row and the last row cleared.  The hardware cursor
+ * is updated on every call.
+ */
+void
+cgaputc(int c)
+{
+	int n;
+	uchar *row;
+
+	switch(c){
+	case '\n':
+		cgapos = (cgapos/Width + 1)*Width;
+		break;
+	case '\t':
+		for(n = 8 - ((cgapos/2)&7); n > 0; n--)
+			cgaputc(' ');
+		break;
+	case '\b':
+		if(cgapos >= 2)
+			cgapos -= 2;
+		cgaputc(' ');
+		cgapos -= 2;
+		break;
+	default:
+		cga[cgapos++] = c;
+		cga[cgapos++] = Attr;
+		break;
+	}
+	if(cgapos >= Width*Height){
+		/* scroll: drop the top row, blank the bottom one */
+		memmove(cga, &cga[Width], Width*(Height-1));
+		row = &cga[Width*(Height-1)];
+		for(n = 0; n < Width/2; n++){
+			*row++ = ' ';
+			*row++ = Attr;
+		}
+		cgapos = Width*(Height-1);
+	}
+	movecursor();
+}
+
+/*
+ * cgainit - initialise cgapos from the cursor position held in
+ * controller registers 0x0E (high byte) and 0x0F (low byte),
+ * converting from character cells to bytes.
+ */
+void
+cgainit(void)
+{
+	int hi, lo;
+
+	hi = cgaregr(0x0E);
+	lo = cgaregr(0x0F);
+	cgapos = (hi<<8 | lo) * 2;
+}

+ 559 - 0
sys/src/9/pcboot/conf.c

@@ -0,0 +1,559 @@
+/*
+ * parse plan9.ini or /cfg/pxe/%E file into low memory
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/netif.h"
+#include	"../ip/ip.h"
+#include	"pxe.h"
+
+typedef struct {
+	char*	name;
+	int	start;
+	int	end;
+} Mblock;
+
+typedef struct {
+	char*	tag;
+	Mblock*	mb;
+} Mitem;
+
+static Mblock mblock[MAXCONF];
+static int nmblock;
+static Mitem mitem[MAXCONF];
+static int nmitem;
+static char* mdefault;
+static char mdefaultbuf[10];
+static int mtimeout;
+
+/*
+ * split "name, rest" in place.  the text after the first comma (less one
+ * leading space) is returned through *residue, or nil if there is no
+ * comma.  the return value is the first space-delimited word of what
+ * precedes the comma.
+ *
+ * a single leading space is skipped BEFORE the word is cut at its first
+ * space; doing it the other way round turned " name" into "".
+ */
+static char*
+comma(char* line, char** residue)
+{
+	char *q, *r;
+
+	if((q = strchr(line, ',')) != nil){
+		*q++ = 0;
+		if(*q == ' ')
+			q++;
+	}
+	*residue = q;
+
+	if(*line == ' ')
+		line++;
+	if((r = strchr(line, ' ')) != nil)
+		*r = 0;
+	return line;
+}
+
+/*
+ * look up the block named by the first comma-separated field of name;
+ * *residue is set by comma() to whatever follows the comma.
+ */
+static Mblock*
+findblock(char* name, char** residue)
+{
+	char *want;
+	Mblock *mb;
+
+	want = comma(name, residue);
+	for(mb = mblock; mb < &mblock[nmblock]; mb++)
+		if(strcmp(want, mb->name) == 0)
+			return mb;
+	return nil;
+}
+
+/*
+ * look up the menu item whose block is named by the first comma-separated
+ * field of name; *residue is set by comma() to whatever follows the comma.
+ */
+static Mitem*
+finditem(char* name, char** residue)
+{
+	char *want;
+	Mitem *mi;
+
+	want = comma(name, residue);
+	for(mi = mitem; mi < &mitem[nmitem]; mi++)
+		if(strcmp(want, mi->mb->name) == 0)
+			return mi;
+	return nil;
+}
+
+static Chan *conschan;
+
+/*
+ * prompt on the console and read one line into buf (size bytes).
+ * if def is a non-empty string it is shown in the prompt and copied into
+ * buf when the user enters an empty line or, with timeout > 0, when no
+ * input is queued within timeout seconds.  without a default, empty
+ * lines just cause a re-prompt, as do lines that fill the buffer.
+ * timeout is in seconds.  always returns 0.
+ */
+int
+getstr(char *prompt, char *buf, int size, char *def, int timeout)
+{
+	int len, isdefault;
+	static char pbuf[PRINTSIZE];
+
+	if(conschan == nil)
+		conschan = enamecopen("#c/cons", ORDWR);
+	if(conschan == nil)
+		panic("can't open #c/cons");
+	buf[0] = 0;
+	isdefault = (def && *def);
+	if(isdefault == 0){
+		timeout = 0;
+		snprint(pbuf, sizeof pbuf, "%s: ", prompt);
+	}
+	else if(timeout)
+		snprint(pbuf, sizeof pbuf, "%s[default==%s (%ds timeout)]: ",
+			prompt, def, timeout);
+	else
+		snprint(pbuf, sizeof pbuf, "%s[default==%s]: ", prompt, def);
+	for (;;) {
+		print("%s", pbuf);
+		if (timeout > 0) {
+			/* count down in 100ms steps until input shows up */
+			for(timeout *= 1000; timeout > 0; timeout -= 100) {
+				tsleep(&up->sleep, return0, 0, 100);
+				if (qlen(kbdq) > 0)	/* if input queued */
+					break;
+			}
+			if (timeout <= 0) {		/* use default */
+				print("\n");
+				len = 0;
+				break;
+			}
+			/*
+			 * the user has started typing, so the countdown is
+			 * over.  without this the milliseconds left would be
+			 * taken for seconds if we have to prompt again.
+			 */
+			timeout = 0;
+		}
+		buf[0] = '\0';
+		len = devtab[conschan->type]->read(conschan, buf, size - 1,
+			conschan->offset);
+		if(len >= 0)
+			buf[len] = '\0';
+		switch(len){
+		case 0:				/* eof */
+		case 1:				/* newline */
+			len = 0;
+			buf[len] = '\0';
+			if(!isdefault)
+				continue;
+			break;
+		}
+		if(len < size - 1)
+			break;
+		print("line too long\n");
+	}
+	if(len == 0 && isdefault){
+		strncpy(buf, def, size);
+		buf[size-1] = '\0';	/* strncpy won't terminate if def fills buf */
+	}
+	return 0;
+}
+
+/* report (1 or 0) whether a configuration variable is called name, ignoring case */
+int
+isconf(char *name)
+{
+	char **np;
+
+	for(np = confname; np < &confname[nconf]; np++)
+		if(cistrcmp(*np, name) == 0)
+			return 1;
+	return 0;
+}
+
+char*
+getconf(char *name)	/* value of variable name (case ignored), or nil; asks if there are several */
+{
+	int i, n, nmatch;
+	char buf[120];
+
+	nmatch = 0;
+	for(i = 0; i < nconf; i++)	/* count the entries called name */
+		if(cistrcmp(confname[i], name) == 0)
+			nmatch++;
+
+	switch(nmatch) {
+	default:	/* more than one: list them and let the user choose */
+		print("\n");
+		nmatch = 0;
+		for(i = 0; i < nconf; i++)
+			if(cistrcmp(confname[i], name) == 0)
+				print("%d. %s\n", ++nmatch, confval[i]);
+		print("%d. none of the above\n", ++nmatch);
+		do {
+			getstr(name, buf, sizeof(buf), nil, 0);
+			n = atoi(buf);
+		} while(n < 1 || n > nmatch);	/* insist on a listed number */
+
+		for(i = 0; i < nconf; i++)	/* n'th match; "none of the above" falls out to nil */
+			if(cistrcmp(confname[i], name) == 0)
+				if(--n == 0)
+					return confval[i];
+		break;
+
+	case 1:		/* exactly one: return it */
+		for(i = 0; i < nconf; i++)
+			if(cistrcmp(confname[i], name) == 0)
+				return confval[i];
+		break;
+
+	case 0:		/* not set */
+		break;
+	}
+	return nil;
+}
+
+/*
+ * handle a [menu] block, if any, in the len bytes of cleaned-up
+ * configuration text at str.  the text is copied to scratch and split
+ * into lines and [blocks] there; if a menu is found the user is asked to
+ * choose an item, and str is then rewritten in place to hold
+ * "menuitem=<block>" followed by the lines of every "common" block and of
+ * the chosen block.  a selection followed by 'p' prints the result and
+ * asks again.
+ *
+ * NOTE(review): a block is only checked for the name "menu" when another
+ * block header follows it, so a [menu] that is the last block in the
+ * file is not recognised - confirm whether that is intended.
+ */
+static void
+parsemenu(char* str, char* scratch, int len)
+{
+	Mitem *mi;
+	Mblock *mb, *menu;
+	char buf[20], *p, *q, *line[MAXCONF];
+	int i, inblock, n, show;
+
+	inblock = 0;
+	menu = nil;
+	memmove(scratch, str, len);
+	n = getfields(scratch, line, MAXCONF, 0, "\n");
+	if(n >= MAXCONF)
+		print("warning: possibly too many lines in plan9.ini\n");
+
+	/* pass 1: record the extent (in lines) of each [block] */
+	for(i = 0; i < n; i++){
+		p = line[i];
+		if(inblock && *p == '['){
+			mblock[nmblock].end = i;
+			if(strcmp(mblock[nmblock].name, "menu") == 0)
+				menu = &mblock[nmblock];
+			nmblock++;
+			inblock = 0;
+		}
+		if(*p == '['){
+			/* lines before the first header form an implicit [common] */
+			if(nmblock == 0 && i != 0){
+				mblock[nmblock].name = "common";
+				mblock[nmblock].start = 0;
+				mblock[nmblock].end = i;
+				nmblock++;
+			}
+			q = strchr(p+1, ']');
+			if(q == nil || *(q+1) != 0){
+				print("malformed menu block header - %s\n", p);
+				return;
+			}
+			*q = 0;
+			mblock[nmblock].name = p+1;
+			mblock[nmblock].start = i+1;
+			inblock = 1;
+		}
+	}
+
+	if(inblock){
+		mblock[nmblock].end = i;
+		nmblock++;
+	}
+	if(menu == nil)
+		return;
+	if(nmblock < 2){
+		print("incomplete menu specification\n");
+		return;
+	}
+
+	/* pass 2: interpret the lines of the [menu] block */
+	for(i = menu->start; i < menu->end; i++){
+		p = line[i];
+		if(cistrncmp(p, "menuitem=", 9) == 0){
+			p += 9;
+			if((mb = findblock(p, &q)) == nil){
+				print("no block for menuitem %s\n", p);
+				return;
+			}
+			if(q != nil)
+				mitem[nmitem].tag = q;
+			else
+				mitem[nmitem].tag = mb->name;
+			mitem[nmitem].mb = mb;
+			nmitem++;
+		}
+		else if(cistrncmp(p, "menudefault=", 12) == 0){
+			p += 12;
+			if((mi = finditem(p, &q)) == nil){
+				print("no item for menudefault %s\n", p);
+				return;
+			}
+			if(q != nil)
+				mtimeout = strtol(q, 0, 0);
+			snprint(mdefaultbuf, sizeof mdefaultbuf, "%ld",
+				mi-mitem+1);
+			mdefault = mdefaultbuf;
+		}
+		else if(cistrncmp(p, "menuconsole=", 12) == 0){
+			p += 12;
+			p = comma(p, &q);
+			i8250config(p);
+		}
+		else{
+			print("invalid line in [menu] block - %s\n", p);
+			return;
+		}
+	}
+
+again:
+	print("\nPlan 9 Startup Menu:\n====================\n");
+	for(i = 0; i < nmitem; i++)
+		print("    %d. %s\n", i+1, mitem[i].tag);
+	for(;;){
+		getstr("Selection", buf, sizeof(buf), mdefault, mtimeout);
+		mtimeout = 0;		/* the timeout applies to the first prompt only */
+		i = strtol(buf, &p, 0)-1;
+		if(i < 0 || i >= nmitem)
+			goto again;
+		switch(*p){
+		case 'p':
+		case 'P':
+			show = 1;
+			print("\n");
+			break;
+		case 0:
+		case '\n':
+			show = 0;
+			break;
+		default:
+			continue;
+		}
+		mi = &mitem[i];
+
+		/* rebuild the configuration text from the chosen blocks */
+		p = str;
+		p += sprint(p, "menuitem=%s\n", mi->mb->name);
+		for(i = 0; i < nmblock; i++){
+			mb = &mblock[i];
+			if(mi->mb != mb && cistrcmp(mb->name, "common") != 0)
+				continue;
+			for(n = mb->start; n < mb->end; n++)
+				p += sprint(p, "%s\n", line[n]);
+		}
+
+		if(show){
+			/*
+			 * print in pieces, advancing by what each print managed.
+			 * the text comes from the configuration file, so it must
+			 * be an argument, never the format string.
+			 */
+			for(q = str; q < p; q += i){
+				if((i = print("%s", q)) <= 0)
+					break;
+			}
+			goto again;
+		}
+		break;
+	}
+	print("\n");
+}
+
+/* dig out tables created by l16r.s in real mode; "MAP" records are copied into mmap[] */
+void
+readlsconf(void)
+{
+	int i, n;
+	uchar *p;
+	MMap *map;
+	u64int addr, len;
+
+	/*
+	 * we could be running above 1MB, so put bios tables in low memory,
+	 * not after end.
+	 */
+	p = (uchar*)KADDR(BIOSTABLES);
+	for(n = 0; n < nelem(mmap); n++){
+		if(*p == 0)	/* a zero byte ends the table */
+			break;
+		if(memcmp(p, "APM\0", 4) == 0){
+			p += 20;	/* step over the 20-byte APM record */
+			continue;
+		}
+		else if(memcmp(p, "MAP\0", 4) == 0){
+			map = (MMap*)p;
+
+			switch(map->type){
+			default:
+				if(v_flag)
+					print("type %ud", map->type);
+				break;
+			case 1:
+				if(v_flag)
+					print("Memory");
+				break;
+			case 2:
+				if(v_flag)
+					print("reserved");
+				break;
+			case 3:
+				if(v_flag)
+					print("ACPI Reclaim Memory");
+				break;
+			case 4:
+				if(v_flag)
+					print("ACPI NVS Memory");
+				break;
+			}
+			addr = (((u64int)map->base[1])<<32)|map->base[0];	/* [1] is the high half */
+			len = (((u64int)map->length[1])<<32)|map->length[0];
+			if(v_flag)
+				print("\t%#16.16lluX %#16.16lluX (%llud)\n",
+					addr, addr+len, len);
+
+			if(nmmap < nelem(mmap)){
+				memmove(&mmap[nmmap], map, sizeof(MMap));
+				mmap[nmmap].size = 20;	/* replaces the "MAP\0" tag copied into the first word */
+				nmmap++;
+			}
+			p += 24;	/* step over the 24-byte MAP record */
+			continue;
+		}
+		else{
+			 /* ideally we shouldn't print here */
+			print("\nweird low-memory map at %#p:\n", p);
+			for(i = 0; i < 24; i++)
+				print(" %2.2uX", *(p+i));
+			print("\n");
+			delay(5000);
+		}
+		break;	/* unrecognised record: stop scanning */
+	}
+}
+
+void
+addconf(char *fmt, ...)	/* append formatted text to the string at BOOTARGS */
+{
+	va_list arg;
+
+	va_start(arg, fmt);
+	vseprint(BOOTARGS+strlen(BOOTARGS), BOOTARGS+BOOTARGSLEN, fmt, arg);	/* start at the NUL, stop at the end of the area */
+	va_end(arg);
+}
+
+/*
+ * print the string at BOOTARGS, one line per print() call.
+ * a final line without a terminating newline is printed too; it used to
+ * make the loop continue from a nil pointer.
+ */
+void
+dumpbootargs(void)
+{
+	char *p, *nl;
+
+	/* in the boot, we can only print PRINTSIZE (256) bytes at a time. */
+	print("boot args: ");
+	for (p = (char *)BOOTARGS; *p != '\0'; p = nl) {
+		nl = strchr(p, '\n');
+		if (nl == nil) {
+			/* last line has no newline: print it and stop */
+			print("%s\n", p);
+			break;
+		}
+		++nl;
+		print("%.*s", (int)(nl - p), p);
+	}
+}
+
+void
+changeconf(char *fmt, ...)	/* replace a "what=" line in BOOTARGS by the formatted text */
+{
+	va_list arg;
+	char *p, *q, pref[20], buf[128];
+
+	va_start(arg, fmt);
+	vseprint(buf, buf+sizeof buf, fmt, arg);
+	va_end(arg);
+
+	pref[0] = '\n';		/* pref becomes "\nwhat=" */
+	strncpy(pref+1, buf, 19);
+	pref[19] = '\0';
+	if(p = strchr(pref, '='))
+		*(p+1) = '\0';
+	else	/* NOTE(review): the search and the append below still happen after this warning */
+		print("warning: did not change %s in plan9.ini\n", buf);
+
+	/* find old line by looking for \nwhat= */
+	if(strncmp(BOOTARGS, pref+1, strlen(pref+1)) == 0)
+		p = BOOTARGS;	/* it is the very first line */
+	else if(p = strstr(BOOTARGS, pref))
+		p++;		/* step over the '\n' that pref starts with */
+	else
+		p = nil;
+
+	/* move rest of args up, deleting what= line. */
+	if(p != nil && (q = strchr(p, '\n')) != nil)	/* an old line with no '\n' after it is left in place */
+		memmove(p, q+1, strlen(q+1)+1);
+
+	/* add replacement to end */
+	addconf("%s", buf);
+}
+
+/*
+ *  read configuration file
+ */
+static char id[8] = "ZORT 0\r\n";
+
+/*
+ * digest the NUL-terminated configuration text in inibuf: clean it up in
+ * place, run the menu (if any), leave a copy at BOOTARGS (prefixed by id
+ * unless the text already starts with it) and split the text into
+ * name=value pairs recorded in confname[]/confval[].  always returns 0.
+ */
+int
+dotini(char *inibuf)
+{
+	int blankline, i, incomment, inspace, n;
+	char *cp, *p, *q, *line[MAXCONF];
+
+	cp = inibuf;
+
+	/*
+	 * Strip out '\r', change '\t' -> ' '.
+	 * Change runs of spaces into single spaces.
+	 * Strip out trailing spaces, blank lines.
+	 *
+	 * We do this before we make the copy so that if we
+	 * need to change the copy, it is already fairly clean.
+	 * The main need is in the case when plan9.ini has been
+	 * padded with lots of trailing spaces, as is the case
+	 * for those created during a distribution install.
+	 */
+	p = cp;
+	blankline = 1;
+	incomment = inspace = 0;
+	for(q = cp; *q; q++){
+		if(*q == '\r')
+			continue;
+		if(*q == '\t')
+			*q = ' ';
+		if(*q == ' '){
+			inspace = 1;
+			continue;
+		}
+		if(*q == '\n'){
+			if(!blankline){
+				if(!incomment)
+					*p++ = '\n';
+				blankline = 1;
+			}
+			incomment = inspace = 0;
+			continue;
+		}
+		if(inspace){
+			/* emit one space for the run, but not at start of line */
+			if(!blankline && !incomment)
+				*p++ = ' ';
+			inspace = 0;
+		}
+		if(blankline && *q == '#')
+			incomment = 1;
+		blankline = 0;
+		if(!incomment)
+			*p++ = *q;
+	}
+	if(p > cp && p[-1] != '\n')
+		*p++ = '\n';
+	*p++ = 0;
+	n = p-cp;
+
+	parsemenu(cp, BOOTARGS, n);
+
+	/*
+	 * Keep a copy.
+	 * We could change this to pass the parsed strings
+	 * to the booted programme instead of the raw
+	 * string, then it only gets done once.
+	 */
+	if(strncmp(cp, id, sizeof(id))){
+		memmove(BOOTARGS, id, sizeof(id));
+		if(n+1+sizeof(id) >= BOOTARGSLEN)
+			n -= sizeof(id);
+		memmove(BOOTARGS+sizeof(id), cp, n+1);
+	}
+	else
+		memmove(BOOTARGS, cp, n+1);
+
+	n = getfields(cp, line, MAXCONF, 0, "\n");
+	for(i = 0; i < n; i++){
+		cp = strchr(line[i], '=');
+		if(cp == 0)
+			continue;
+		/* confname[] and confval[] hold MAXCONF entries; don't run off the end */
+		if(nconf >= MAXCONF){
+			print("warning: too many variables in plan9.ini\n");
+			break;
+		}
+		*cp++ = 0;
+		if(cp - line[i] >= NAMELEN+1)
+			*(line[i]+NAMELEN-1) = 0;	/* clip over-long names */
+		confname[nconf] = line[i];
+		confval[nconf] = cp;
+		nconf++;
+	}
+	return 0;
+}

+ 529 - 0
sys/src/9/pcboot/dat.h

@@ -0,0 +1,529 @@
+typedef struct BIOS32si	BIOS32si;
+typedef struct BIOS32ci	BIOS32ci;
+typedef struct Conf	Conf;
+typedef struct Confmem	Confmem;
+typedef struct FPsave	FPsave;
+typedef struct ISAConf	ISAConf;
+typedef struct Label	Label;
+typedef struct Lock	Lock;
+typedef struct MMU	MMU;
+typedef struct Mach	Mach;
+typedef struct Notsave	Notsave;
+typedef struct PCArch	PCArch;
+typedef struct Pcidev	Pcidev;
+typedef struct PCMmap	PCMmap;
+typedef struct PCMslot	PCMslot;
+typedef struct Page	Page;
+typedef struct PMMU	PMMU;
+typedef struct Proc	Proc;
+typedef struct Segdesc	Segdesc;
+typedef vlong		Tval;
+typedef struct Ureg	Ureg;
+typedef struct Vctl	Vctl;
+
+#pragma incomplete BIOS32si
+#pragma incomplete Pcidev
+#pragma incomplete Ureg
+
+#define MAXSYSARG	5	/* for mount(fd, afd, mpt, flag, arg) */
+
+/*
+ * Where configuration info is left for the loaded programme.
+ * There are 3584 bytes available at CONFADDR.
+ */
+#define BOOTLINE	((char*)CONFADDR)
+#define BOOTLINELEN	64
+#define BOOTARGS	((char*)(CONFADDR+BOOTLINELEN))
+#define	BOOTARGSLEN	(3584-0x200-BOOTLINELEN)
+#define	MAXCONF		100
+
+char *confname[MAXCONF];
+char *confval[MAXCONF];
+int nconf;
+
+#define KMESGSIZE 64
+#define PCICONSSIZE 64
+#define STAGESIZE 64
+
+#define NAMELEN 28
+
+#define	GSHORT(p)	(((p)[1]<<8)|(p)[0])	/* get 16 bits stored low byte first */
+#define	GLONG(p)	((GSHORT(p+2)<<16)|GSHORT(p))	/* get 32 bits stored low byte first */
+#define	GLSHORT(p)	(((p)[0]<<8)|(p)[1])	/* get 16 bits stored high byte first */
+#define	GLLONG(p)	(((ulong)GLSHORT(p)<<16)|GLSHORT(p+2))	/* get 32 bits stored high byte first */
+#define	PLLONG(p,v)	(p)[3]=(v);(p)[2]=(v)>>8;(p)[1]=(v)>>16;(p)[0]=(v)>>24	/* put 32 bits high byte first; expands to four statements */
+
+enum {
+	Stkpat =	0,
+};
+
+/*
+ *  parameters for sysproc.c
+ */
+#define AOUT_MAGIC	(I_MAGIC)
+
+struct Lock
+{
+	ulong	magic;
+	ulong	key;
+	ulong	sr;
+	ulong	pc;
+	Proc	*p;
+	Mach	*m;
+	ushort	isilock;
+	long	lockcycles;
+};
+
+struct Label
+{
+	ulong	sp;
+	ulong	pc;
+};
+
+
+/*
+ * FPsave.status
+ */
+enum
+{
+	/* this is a state */
+	FPinit=		0,
+	FPactive=	1,
+	FPinactive=	2,
+
+	/* the following is a bit that can be or'd into the state */
+	FPillegal=	0x100,
+};
+
+struct	FPsave
+{
+	ushort	control;
+	ushort	r1;
+	ushort	status;
+	ushort	r2;
+	ushort	tag;
+	ushort	r3;
+	ulong	pc;
+	ushort	selector;
+	ushort	r4;
+	ulong	operand;
+	ushort	oselector;
+	ushort	r5;
+	uchar	regs[80];	/* floating point registers */
+};
+
+struct Confmem
+{
+	ulong	base;
+	ulong	npage;
+	ulong	kbase;
+	ulong	klimit;
+};
+
+struct Conf
+{
+	ulong	nmach;		/* processors */
+	ulong	nproc;		/* processes */
+	ulong	monitor;	/* has monitor? */
+	Confmem	mem[4];		/* physical memory */
+	ulong	npage;		/* total physical pages of memory */
+	ulong	upages;		/* user page pool */
+	ulong	nimage;		/* number of page cache image headers */
+	ulong	nswap;		/* number of swap pages */
+	int	nswppo;		/* max # of pageouts per segment pass */
+	ulong	base0;		/* base of bank 0 */
+	ulong	base1;		/* base of bank 1 */
+	ulong	copymode;	/* 0 is copy on write, 1 is copy on reference */
+	ulong	ialloc;		/* max interrupt time allocation in bytes */
+	ulong	pipeqsize;	/* size in bytes of pipe queues */
+	int	nuart;		/* number of uart devices */
+};
+
+/*
+ *  MMU stuff in proc
+ */
+#define NCOLOR 1
+struct PMMU
+{
+	Page*	mmupdb;			/* page directory base */
+	Page*	mmufree;		/* unused page table pages */
+	Page*	mmuused;		/* used page table pages */
+	Page*	kmaptable;		/* page table used by kmap */
+	uint	lastkmap;		/* last entry used by kmap */
+	int	nkmap;			/* number of current kmaps */
+};
+
+/*
+ *  things saved in the Proc structure during a notify
+ */
+struct Notsave
+{
+	ulong	svflags;
+	ulong	svcs;
+	ulong	svss;
+};
+
+#include "../port/portdat.h"
+
+typedef struct {
+	ulong	link;			/* link (old TSS selector) */
+	ulong	esp0;			/* privilege level 0 stack pointer */
+	ulong	ss0;			/* privilege level 0 stack selector */
+	ulong	esp1;			/* privilege level 1 stack pointer */
+	ulong	ss1;			/* privilege level 1 stack selector */
+	ulong	esp2;			/* privilege level 2 stack pointer */
+	ulong	ss2;			/* privilege level 2 stack selector */
+	ulong	xcr3;			/* page directory base register - not used because we don't use trap gates */
+	ulong	eip;			/* instruction pointer */
+	ulong	eflags;			/* flags register */
+	ulong	eax;			/* general registers */
+	ulong 	ecx;
+	ulong	edx;
+	ulong	ebx;
+	ulong	esp;
+	ulong	ebp;
+	ulong	esi;
+	ulong	edi;
+	ulong	es;			/* segment selectors */
+	ulong	cs;
+	ulong	ss;
+	ulong	ds;
+	ulong	fs;
+	ulong	gs;
+	ulong	ldt;			/* selector for task's LDT */
+	ulong	iomap;			/* I/O map base address + T-bit */
+} Tss;
+
+struct Segdesc
+{
+	ulong	d0;
+	ulong	d1;
+};
+
+struct Mach
+{
+	int	machno;			/* physical id of processor (KNOWN TO ASSEMBLY) */
+	ulong	splpc;			/* pc of last caller to splhi */
+
+	ulong*	pdb;			/* page directory base for this processor (va) */
+	Tss*	tss;			/* tss for this processor */
+	Segdesc	*gdt;			/* gdt for this processor */
+
+	Proc*	proc;			/* current process on this processor */
+	Proc*	externup;		/* extern register Proc *up */
+
+	Page*	pdbpool;
+	int	pdbcnt;
+
+	ulong	ticks;			/* of the clock since boot time */
+	Label	sched;			/* scheduler wakeup */
+	Lock	alarmlock;		/* access to alarm list */
+	void*	alarm;			/* alarms bound to this clock */
+	int	inclockintr;
+
+	Proc*	readied;		/* for runproc */
+	ulong	schedticks;		/* next forced context switch */
+
+	int	tlbfault;
+	int	tlbpurge;
+	int	pfault;
+	int	cs;
+	int	syscall;
+	int	load;
+	int	intr;
+	int	flushmmu;		/* make current proc flush its mmu state */
+	int	ilockdepth;
+	Perf	perf;			/* performance counters */
+
+	ulong	spuriousintr;
+	int	lastintr;
+
+	int	loopconst;
+
+	Lock	apictimerlock;
+	int	cpumhz;
+	uvlong	cyclefreq;		/* Frequency of user readable cycle counter */
+	uvlong	cpuhz;
+	int	cpuidax;
+	int	cpuiddx;
+	char	cpuidid[16];
+	char*	cpuidtype;
+	int	havetsc;
+	int	havepge;
+	uvlong	tscticks;
+	int	pdballoc;
+	int	pdbfree;
+
+	vlong	mtrrcap;
+	vlong	mtrrdef;
+	vlong	mtrrfix[11];
+	vlong	mtrrvar[32];		/* 256 max. */
+
+	int	stack[1];
+};
+
+/*
+ * KMap the structure doesn't exist, but the functions do.
+ */
+typedef struct KMap		KMap;
+#define	VA(k)		((void*)(k))
+KMap*	kmap(Page*);
+void	kunmap(KMap*);
+
+struct
+{
+	Lock;
+	int	machs;			/* bitmap of active CPUs */
+	int	exiting;		/* shutdown */
+	int	ispanic;		/* shutdown in response to a panic */
+	int	thunderbirdsarego;	/* lets the added processors continue to schedinit */
+	int	rebooting;		/* just idle cpus > 0 */
+}active;
+
+/*
+ *  routines for things outside the PC model, like power management
+ */
+struct PCArch
+{
+	char*	id;
+	int	(*ident)(void);		/* this should be in the model */
+	void	(*reset)(void);		/* this should be in the model */
+	int	(*serialpower)(int);	/* 1 == on, 0 == off */
+	int	(*modempower)(int);	/* 1 == on, 0 == off */
+
+	void	(*intrinit)(void);
+	int	(*intrenable)(Vctl*);
+	int	(*intrvecno)(int);
+	int	(*intrdisable)(int);
+	void	(*introff)(void);
+	void	(*intron)(void);
+
+	void	(*clockenable)(void);
+	uvlong	(*fastclock)(uvlong*);
+	void	(*timerset)(uvlong);
+
+	void	(*resetothers)(void);	/* put other cpus into reset */
+};
+
+/* cpuid instruction result register bits */
+enum {
+	/* dx */
+	Fpuonchip = 1<<0,
+//	Pse	= 1<<3,		/* page size extensions */
+	Tsc	= 1<<4,		/* time-stamp counter */
+	Cpumsr	= 1<<5,		/* model-specific registers, rdmsr/wrmsr */
+	Pae	= 1<<6,		/* physical-addr extensions */
+	Mce	= 1<<7,		/* machine-check exception */
+	Cmpxchg8b = 1<<8,
+	Cpuapic	= 1<<9,
+	Mtrr	= 1<<12,	/* memory-type range regs.  */
+	Pge	= 1<<13,	/* page global extension */
+//	Pse2	= 1<<17,	/* more page size extensions */
+	Clflush = 1<<19,
+	Mmx	= 1<<23,
+	Fxsr	= 1<<24,	/* have SSE FXSAVE/FXRSTOR */
+	Sse	= 1<<25,	/* thus sfence instr. */
+	Sse2	= 1<<26,	/* thus mfence & lfence instr.s */
+};
+
+/*
+ *  a parsed plan9.ini line
+ */
+#define NISAOPT		8
+
+struct ISAConf {
+	char	*type;
+	ulong	port;
+	int	irq;
+	ulong	dma;
+	ulong	mem;
+	ulong	size;
+	ulong	freq;
+
+	int	nopt;
+	char	*opt[NISAOPT];
+};
+
+extern PCArch	*arch;			/* PC architecture */
+
+/*
+ * Each processor sees its own Mach structure at address MACHADDR.
+ * However, the Mach structures must also be available via the per-processor
+ * MMU information array machp, mainly for disambiguation and access to
+ * the clock which is only maintained by the bootstrap processor (0).
+ */
+Mach* machp[MAXMACH];
+	
+#define	MACHP(n)	(machp[n])
+
+extern Mach	*m;
+#define up	(((Mach*)MACHADDR)->externup)
+
+/*
+ *  hardware info about a device
+ */
+typedef struct {
+	ulong	port;	
+	int	size;
+} Devport;
+
+struct DevConf
+{
+	ulong	intnum;			/* interrupt number */
+	char	*type;			/* card type, malloced */
+	int	nports;			/* Number of ports */
+	Devport	*ports;			/* The ports themselves */
+};
+
+typedef struct BIOS32ci {		/* BIOS32 Calling Interface */
+	u32int	eax;
+	u32int	ebx;
+	u32int	ecx;
+	u32int	edx;
+	u32int	esi;
+	u32int	edi;
+} BIOS32ci;
+
+/* misc. */
+extern int	v_flag;
+
+/* APM goo */
+typedef struct Apminfo {
+	int haveinfo;
+	int ax;
+	int cx;
+	int dx;
+	int di;
+	int ebx;
+	int esi;
+} Apminfo;
+extern Apminfo	apm;
+
+/*
+ * Multiboot grot.
+ */
+typedef struct Mbi Mbi;
+struct Mbi {
+	u32int	flags;
+	u32int	memlower;
+	u32int	memupper;
+	u32int	bootdevice;
+	u32int	cmdline;
+	u32int	modscount;
+	u32int	modsaddr;
+	u32int	syms[4];
+	u32int	mmaplength;
+	u32int	mmapaddr;
+	u32int	driveslength;
+	u32int	drivesaddr;
+	u32int	configtable;
+	u32int	bootloadername;
+	u32int	apmtable;
+	u32int	vbe[6];
+};
+
+enum {						/* flags */
+	Fmem		= 0x00000001,		/* mem* valid */
+	Fbootdevice	= 0x00000002,		/* bootdevice valid */
+	Fcmdline	= 0x00000004,		/* cmdline valid */
+	Fmods		= 0x00000008,		/* mod* valid */
+	Fsyms		= 0x00000010,		/* syms[] has a.out info */
+	Felf		= 0x00000020,		/* syms[] has ELF info */
+	Fmmap		= 0x00000040,		/* mmap* valid */
+	Fdrives		= 0x00000080,		/* drives* valid */
+	Fconfigtable	= 0x00000100,		/* configtable* valid */
+	Fbootloadername	= 0x00000200,		/* bootloadername* valid */
+	Fapmtable	= 0x00000400,		/* apmtable* valid */
+	Fvbe		= 0x00000800,		/* vbe[] valid */
+};
+
+typedef struct Mod Mod;
+struct Mod {
+	u32int	modstart;
+	u32int	modend;
+	u32int	string;
+	u32int	reserved;
+};
+
+typedef struct MMap MMap;
+struct MMap {
+	u32int	size;
+	u32int	base[2];	/* 64-bit value in two halves: [0] low, [1] high */
+	u32int	length[2];	/* likewise */
+	u32int	type;
+};
+
+MMap mmap[32+1];
+int nmmap;
+
+Mbi *multibootheader;
+
+enum {
+	Maxfile = 4096,
+};
+
+/* from 9load */
+
+enum {	/* returned by bootpass */
+	MORE, ENOUGH, FAIL
+};
+enum {
+	INITKERNEL,
+	READEXEC,
+	READ9TEXT,
+	READ9DATA,
+	READGZIP,
+	READEHDR,
+	READPHDR,
+	READEPAD,
+	READEDATA,
+	TRYBOOT,
+	TRYEBOOT,
+	INIT9LOAD,
+	READ9LOAD,
+	FAILED
+};
+
+typedef struct Execbytes Execbytes;
+struct	Execbytes
+{
+	uchar	magic[4];		/* magic number */
+	uchar	text[4];	 	/* size of text segment */
+	uchar	data[4];	 	/* size of initialized data */
+	uchar	bss[4];	  		/* size of uninitialized data */
+	uchar	syms[4];	 	/* size of symbol table */
+	uchar	entry[4];	 	/* entry point */
+	uchar	spsz[4];		/* size of sp/pc offset table */
+	uchar	pcsz[4];		/* size of pc/line number table */
+};
+
+typedef struct {
+	Execbytes;
+	uvlong uvl[1];
+} Exechdr;
+
+typedef struct Boot Boot;
+struct Boot {
+	int state;
+
+	Exechdr hdr;
+
+	char *bp;	/* base ptr */
+	char *wp;	/* write ptr */
+	char *ep;	/* end ptr */
+};
+
+extern int	debugload;
+extern Apminfo	apm;
+extern char	*defaultpartition;
+extern int	iniread;
+extern u32int	memstart;
+extern u32int	memend;
+extern int	noclock;
+extern int	pxe;
+extern int	vga;
+
+extern int	biosinited;
+
+extern void _KTZERO(void);
+#define KTZERO ((uintptr)_KTZERO)

+ 804 - 0
sys/src/9/pcboot/devbios.c

@@ -0,0 +1,804 @@
+/*
+ * read-only driver for BIOS LBA devices.
+ * devbios must be initialised first and no disks may be accessed
+ * via non-BIOS means (i.e., talking to the disk controller directly).
+ * EDD 4.0 defines the INT 0x13 functions.
+ *
+ * heavily dependent upon correct BIOS implementation.
+ * some bioses (e.g., vmware) seem to hang for two minutes then report
+ * a disk timeout on reset and extended read operations.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"../port/sd.h"
+#include	"dosfs.h"
+
+#define TYPE(q)		((ulong)(q).path & 0xf)
+#define UNIT(q)		(((ulong)(q).path>>4) & 0xff)
+#define L(q)		(((ulong)(q).path>>12) & 0xf)
+#define QID(u, t) 	((u)<<4 | (t))
+
+typedef struct Biosdev Biosdev;
+typedef struct Dap Dap;
+typedef uvlong Devbytes, Devsects;
+typedef uchar Devid;
+typedef struct Edrvparam Edrvparam;
+
+enum {
+	Debug = 0,
+	Pause = 0,			/* delay to read debugging */
+
+	Minsectsz	= 512,		/* for disks */
+	Maxsectsz	= 2048,		/* for optical (CDs, etc.) */
+
+	Highshort	= ((1ul<<16) - 1) << 16,  /* upper short of a long */
+
+	Maxdevs		= 8,
+	CF		= 1,		/* carry flag: indicates an error */
+	Flopid		= 0,		/* first floppy */
+	Baseid		= 0x80,		/* first disk */
+
+	Diskint		= 0x13,		/* "INT 13" for bios disk i/o */
+
+	/* cx capability bits in Biosckext results */
+	Fixeddisk	= 1<<0,		/* fixed disk access subset */
+	Drlock		= 1<<1,
+	Edd		= 1<<2,		/* enhanced disk drive support */
+	Bit64ext	= 1<<3,
+
+	/* bios calls: int 0x13 disk services w buffer at es:bx */
+	Biosinit	= 0,		/* initialise disk & floppy ctlrs */
+	Biosdrvsts,			/* status of last int 0x13 call */
+	Biosdrvparam	= 8,
+	Biosctlrinit,
+	Biosreset	=  0xd,		/* reset disk */
+	Biosdrvrdy	= 0x10,
+	/* extended int 0x13 calls w dap at ds:si */
+	Biosckext	= 0x41,
+	Biosrdsect,
+	Biosedrvparam	= 0x48,
+
+	/* magic numbers for bios calls */
+	Imok		= 0x55aa,
+	Youreok		= 0xaa55,
+};
+enum {
+	Qzero,				/* assumed to be 0 by devattach */
+	Qtopdir		= 1,
+	Qtopbase,
+	Qtopctl		= Qtopbase,
+	Qtopend,
+
+	Qunitdir,
+	Qunitbase,
+	Qctl		= Qunitbase,
+	Qdata,
+
+	Qtopfiles	= Qtopend-Qtopbase,
+};
+
+struct Biosdev {
+	Devbytes size;
+	Devbytes offset;
+	Devid	id;			/* drive number; e.g., 0x80 */
+	ushort	sectsz;
+	Chan	*rootchan;
+	Bootfs;
+};
+
+struct Dap {				/* a device address packet */
+	uchar	size;
+	uchar	_unused1;
+	uchar	nsects;
+	uchar	_unused2;
+	union {
+		ulong	addr;		/* actual address (nominally seg:off) */
+		struct {
+			ushort	addroff;	/* :offset */
+			ushort	addrseg;	/* segment: */
+		};
+	};
+	uvlong	stsect;			/* starting sector */
+
+	uvlong	addr64;			/* instead of addr, if addr is ~0 */
+	ulong	lnsects;		/* nsects to match addr64 */
+	ulong	_unused3;
+};
+
+struct Edrvparam {
+	ushort	size;			/* max. buffer (struct) size */
+	ushort	flags;
+	ulong	physcyls;
+	ulong	physheads;
+	ulong	phystracksects;
+	uvlong	physsects;
+	ushort	sectsz;
+
+	/* pointer is required to be unaligned, bytes 26-29.  ick. */
+//	void	*dpte;			/* ~0ull: invalid */
+	ushort	dpteoff;		/* device parameter table extension */
+	ushort	dpteseg;
+
+	/* remainder from edd 3.0 spec */
+	ushort	key;			/* 0xbedd if device path info present */
+	uchar	dpilen;			/* must be 44 (says edd 4.0) */
+	uchar	_unused1;
+	ushort	_unused2;
+	char	bustype[4];		/* "PCI" or "ISA" */
+	char	ifctype[8]; /* "ATA", "ATAPI", "SCSI", "USB", "1394", "FIBRE" */
+	uvlong	ifcpath;
+	uvlong	devpath[2];
+	uchar	_unused3;
+	uchar	dpicksum;
+};
+
+int biosinited;
+int biosndevs;
+
+void *biosgetfspart(int i, char *name, int chatty);
+
+static Biosdev bdev[Maxdevs];
+static Ureg regs;
+static RWlock devs;
+
+static int	dreset(Devid drive);
+static Devbytes	extgetsize(Biosdev *);
+static int	drivecap(Devid drive);
+
+/* convert ah error code to a string (just common cases) */
+static char *
+strerr(uchar err)
+{
+	switch (err) {
+	case 0:
+		return "no error";
+	case 1:
+		return "bad command";
+	case 0x80:
+		return "disk timeout";
+	default:
+		return "unknown";
+	}
+}
+
+/* panic, naming tag, if p has any of the Highshort bits set */
+static void
+assertlow64k(uintptr p, char *tag)
+{
+	if ((p & Highshort) == 0)
+		return;
+	panic("devbios: %s address %#p not in bottom 64k", tag, p);
+}
+
+static void
+initrealregs(Ureg *ureg)	/* zero every field of *ureg */
+{
+	memset(ureg, 0, sizeof *ureg);
+}
+
+/*
+ * issue INT 0x13 function op (placed in ah) with the given bx, dx and si.
+ * *ureg is zeroed here first, so anything the caller stored in it
+ * beforehand is lost; afterwards it holds the registers left by the BIOS.
+ * returns 0 on success, -1 if the carry flag came back set.
+ */
+static int
+biosdiskcall(Ureg *ureg, uchar op, ulong bx, ulong dx, ulong si)
+{
+	int s;
+	uchar err;
+
+	s = splhi();		/* don't let the bios call be interrupted */
+	initrealregs(ureg);
+	ureg->ax = op << 8;
+	ureg->bx = bx;
+	ureg->dx = dx;		/* often drive id */
+	assertlow64k(si, "dap");
+	if(si && (si & Highshort) != ((si + Maxsectsz - 1) & Highshort))
+		print("biosdiskcall: dap address %#lux too near segment boundary\n",
+			si);
+
+	ureg->si = si;		/* ds:si forms data address packet addr */
+	ureg->ds = 0;		/* bottom 64K */
+	ureg->es = 0;		/* es:bx is conventional buffer */
+	ureg->di = 0;		/* buffer segment? */
+	ureg->flags = 0;
+
+	/*
+	 * *ureg is copied into low memory (realmoderegs) and thence into
+	 * the machine registers before the BIOS call, and the registers are
+	 * copied into realmoderegs and thence into *ureg after.
+	 *
+	 * realmode loads these registers: di, si, ax, bx, cx, dx, ds, es.
+	 */
+	ureg->trap = Diskint;
+	realmode(ureg);
+
+	if (ureg->flags & CF) {
+		if (dx == Baseid) {	/* failures are only reported for the first disk */
+			err = ureg->ax >> 8;
+			print("\nbiosdiskcall: int %#x op %#ux drive %#lux "
+				"failed, ah error code %#ux (%s)\n",
+				Diskint, op, dx, err, strerr(err));
+		}
+		splx(s);
+		return -1;
+	}
+	splx(s);
+	return 0;
+}
+
+/*
+ * Find out what the bios knows about devices.
+ * our boot device could be usb; ghod only knows where it will appear.
+ *
+ * fills in bdev[] and biosndevs, and returns a mask with bit i set for
+ * each bdev[i] found.  returns 0 without doing anything if it has run
+ * before; panics if no drive is found.
+ */
+int
+biosinit0(void)
+{
+	int cap, mask, lastbit, ndrive;
+	Devbytes size;
+	Devid devid;
+	Biosdev *bdp;
+	static int beenhere;
+
+	delay(Pause);		/* pause to read the screen (DEBUG) */
+	if (biosinited || beenhere)
+		return 0;
+	beenhere = 1;
+
+	ndrive = *(uchar *)KADDR(0x475);		/* from bda */
+	if (Debug)
+		print("%d bios drive(s)\n", ndrive);
+	mask = lastbit = 0;
+	/* devid is a uchar, so it wraps to 0 after 0xff and ends the scan */
+	for (devid = Baseid, biosndevs = 0; devid != 0 && biosndevs < Maxdevs &&
+	    biosndevs < ndrive; devid++) {
+		cap = drivecap(devid);
+		/* don't reset; it seems to hang the bios */
+		if(cap < 0 || (cap & (Fixeddisk|Edd)) != (Fixeddisk|Edd)
+		    /* || devid != Baseid && dreset(devid) < 0 || */)
+			continue;		/* no suitable device */
+
+		bdp = &bdev[biosndevs];
+		bdp->id = devid;
+		size = extgetsize(bdp);
+		if (size == 0)
+			continue;		/* no device */
+		bdp->size = size;
+
+		/*
+		 * found a live one.  only now note it in the mask, so that
+		 * a drive rejected above doesn't leave a bit behind for a
+		 * bdev[] slot that was never filled.
+		 */
+		lastbit = 1 << biosndevs;
+		mask |= lastbit;
+
+		print("bios%d: drive %#ux: %,llud bytes, %d-byte sectors\n",
+			biosndevs, devid, size, bdp->sectsz);
+		biosndevs++;
+	}
+	USED(lastbit);
+
+	if (Debug && ndrive != biosndevs)
+		print("devbios: expected %d drives, found %d\n", ndrive, biosndevs);
+
+	/*
+	 * some bioses seem to only be able to read from drive number 0x80 and
+	 * can't read from the highest drive number, even if there is only one.
+	 */
+	if (biosndevs > 0)
+		biosinited = 1;
+	else
+		panic("devbios: no bios drives seen"); /* 9loadusb needs ≥ 1 */
+	delay(Pause);		/* pause to read the screen (DEBUG) */
+	return mask;
+}
+
+static void
+biosreset(void)	/* probe the bios drives; the mask biosinit0 returns is not needed here */
+{
+	biosinit0();
+}
+
+static void
+biosinit(void)	/* deliberately empty */
+{
+}
+
+static Chan*
+biosattach(char *spec)	/* spec, if given, is a drive index below Maxdevs; default 0 */
+{
+	ulong drive;
+	char *p;
+	Chan *chan;
+
+	drive = 0;
+	if(spec && *spec){
+		drive = strtoul(spec, &p, 0);
+		if((drive == 0 && p == spec) || *p || (drive >= Maxdevs))	/* not a number, trailing junk, or out of range */
+			error(Ebadarg);
+	}
+	if(bdev[drive].rootchan)	/* attached before: hand back the same channel */
+		return bdev[drive].rootchan;
+
+	chan = devattach(L'☹', spec);
+	if(waserror()){
+		chanfree(chan);
+		nexterror();
+	}
+	chan->dev = drive;
+	bdev[drive].rootchan = chan;	/* remembered for later attaches */
+	/* arbitrary initialisation can go here */
+	poperror();
+	return chan;
+}
+
+static int
+unitgen(Chan *c, ulong type, Dir *dp)	/* fill *dp for a unit's "ctl" or "data" file; 1 if done, -1 for other types */
+{
+	int perm, t;
+	ulong vers;
+	vlong size;
+	char *p;
+	Qid q;
+
+	perm = 0644;
+	size = 0;
+//	d = unit2dev(UNIT(c->qid));
+//	vers = d->vers;
+	vers = 0;
+	t = QTFILE;
+
+	switch(type){
+	default:
+		return -1;
+	case Qctl:
+		p = "ctl";
+		break;
+	case Qdata:
+		p = "data";
+		perm = 0640;
+		break;
+	}
+	mkqid(&q, QID(UNIT(c->qid), type), vers, t);	/* qid path combines the unit number and the file type */
+	devdir(c, q, p, size, eve, perm, dp);
+	return 1;
+}
+
+static int
+topgen(Chan *c, ulong type, Dir *d)	/* fill *d for a top-level file; 1 if done, -1 for unknown types */
+{
+	int perm;
+	vlong size;
+	char *p;
+	Qid q;
+
+	size = 0;
+	switch(type){
+	default:
+		return -1;
+	case Qdata:	/* NOTE(review): biosgen only ever passes Qtopbase here, which is not Qdata - confirm intent */
+		p = "data";
+		perm = 0644;
+		break;
+	}
+	mkqid(&q, type, 0, QTFILE);
+	devdir(c, q, p, size, eve, perm, d);
+	return 1;
+}
+
+static int
+biosgen(Chan *c, char *, Dirtab *, int, int s, Dir *dp)	/* directory generator handed to devwalk, devstat and devopen */
+{
+	Qid q;
+
+	if(c->qid.path == 0){	/* the root of the device */
+		switch(s){
+		case DEVDOTDOT:
+			q.path = 0;
+			q.type = QTDIR;
+			devdir(c, q, "#☹", 0, eve, 0555, dp);
+			break;
+		case 0:		/* its single entry, the "bios" directory */
+			q.path = Qtopdir;
+			q.type = QTDIR;
+			devdir(c, q, "bios", 0, eve, 0555, dp);
+			break;
+		default:
+			return -1;
+		}
+		return 1;
+	}
+
+	switch(TYPE(c->qid)){
+	default:
+		return -1;
+	case Qtopdir:
+		if(s == DEVDOTDOT){
+			mkqid(&q, Qzero, 0, QTDIR);
+			devdir(c, q, "bios", 0, eve, 0555, dp);
+			return 1;
+		}
+		if(s < Qtopfiles)	/* the top-level files come first */
+			return topgen(c, Qtopbase + s, dp);
+		s -= Qtopfiles;
+		if(s >= 1)		/* followed by exactly one unit directory */
+			return -1;
+		mkqid(&q, QID(s, Qunitdir), 0, QTDIR);
+		devdir(c, q, "bios", 0, eve, 0555, dp);
+		return 1;
+	case Qdata:
+		return unitgen(c, TYPE(c->qid), dp);
+	}
+}
+
+/* walk entry of biosdevtab: generic devwalk driven by biosgen (no Dirtab) */
+static Walkqid*
+bioswalk(Chan *c, Chan *nc, char **name, int nname)
+{
+	return devwalk(c, nc, name, nname, nil, 0, biosgen);
+}
+
+/* stat entry of biosdevtab: generic devstat driven by biosgen (no Dirtab) */
+static int
+biosstat(Chan *c, uchar *db, int n)
+{
+	return devstat(c, db, n, nil, 0, biosgen);
+}
+
+/* open entry of biosdevtab: generic devopen driven by biosgen (no Dirtab) */
+static Chan*
+biosopen(Chan *c, int omode)
+{
+	return devopen(c, omode, 0, 0, biosgen);
+}
+
+/* close entry of biosdevtab; there is no per-channel state to release */
+static void
+biosclose(Chan *)
+{
+}
+
+#ifdef UNUSED
+/*
+ * compiled only when UNUSED is defined.
+ * boot file from the 9fat of bios device dev.  an optional leading "dos!"
+ * is stripped; the remaining name must be non-empty and contain no '!'.
+ * returns -1 on bad syntax or if biosgetfspart fails, else fsboot's result.
+ */
+int
+biosboot(int dev, char *file, Boot *b)
+{
+	Bootfs *fs;
+
+	if(strncmp(file, "dos!", 4) == 0)
+		file += 4;
+	if(strchr(file, '!') != nil || strcmp(file, "") == 0) {
+		print("syntax is bios0!file\n");
+		return -1;
+	}
+
+	fs = biosgetfspart(dev, "9fat", 1);
+	if(fs == nil)
+		return -1;
+	return fsboot(fs, file, b);
+}
+#endif
+
+/*
+ * read the single sector numbered offset from drive bdp->id into the
+ * BIOSXCHG buffer and copy its first n bytes into a.
+ * n must be in [0, bdp->sectsz].  returns n, or -1 on bad arguments or
+ * if the bios call fails; panics if the bios reports a sector count
+ * other than 1.
+ */
+long
+sectread(Biosdev *bdp, void *a, long n, Devsects offset)
+{
+	uchar *xch;
+	uintptr xchaddr;
+	Dap *dap;
+
+	if(bdp->sectsz <= 0 || n < 0 || n > bdp->sectsz)
+		return -1;
+	xch = (uchar *)BIOSXCHG;
+	assertlow64k(PADDR(xch), "biosxchg");
+	if(Debug)
+		/* scribble on the buffer to provoke trouble */
+		memset(xch, 'r', bdp->sectsz);
+
+	/* read into BIOSXCHG; alloc space for a worst-case (optical) sector */
+	/* the Dap (request packet) lives just past that sector buffer */
+	dap = (Dap *)(xch + Maxsectsz);
+	assertlow64k(PADDR(dap), "Dap");
+	memset(dap, 0, sizeof *dap);
+	dap->size = sizeof *dap;
+	dap->nsects = 1;
+	dap->stsect = offset;
+
+	/* describe the same buffer address in every form the Dap offers */
+	xchaddr = PADDR(xch);
+	assertlow64k(xchaddr, "sectread buffer");
+	dap->addr = xchaddr;		/* ulong version */
+	dap->addroff = xchaddr;		/* pedantic seg:off */
+	dap->addrseg = 0;
+	dap->addr64 = xchaddr;		/* paranoid redundancy */
+	dap->lnsects = 1;
+
+	/*
+	 * ensure that entire buffer fits in low memory.
+	 * (this only warns; the read is attempted regardless.)
+	 */
+	if((dap->addr & Highshort) !=
+	    ((dap->addr + Minsectsz - 1) & Highshort))
+		print("devbios: sectread: address %#lux too near seg boundary\n",
+			dap->addr);
+	if (Debug)
+		print("reading bios drive %#ux sector %lld -> %#lux...",
+			bdp->id, offset, dap->addr);
+	delay(Pause);			/* pause to read the screen (DEBUG) */
+
+	/*
+	 * int 13 read sector expects buffer seg in di?,
+	 * dap in si, 0x42 in ah, drive in dl.
+	 */
+	if (biosdiskcall(&regs, Biosrdsect, 0, bdp->id, PADDR(dap)) < 0) {
+		print("devbios: sectread: bios failed to read %ld @ sector %lld of %#ux\n",
+			n, offset, bdp->id);
+		return -1;
+	}
+	if (dap->nsects != 1)
+		panic("devbios: sector read ok but read %d sectors",
+			dap->nsects);
+	if (Debug)
+		print("OK\n");
+
+	/* copy into caller's buffer */
+	memmove(a, xch, n);
+	if(0 && Debug)
+		print("-%ux %ux %ux %ux--%16.16s-\n",
+			xch[0], xch[1], xch[2], xch[3], (char *)xch + 480);
+	delay(Pause);		/* pause to read the screen (DEBUG) */
+	return n;
+}
+
+/*
+ * ask the bios to reset drive (Biosinit call).
+ * returns 0 if the bios left ax zero, -1 otherwise.
+ * seems to hang bioses, at least vmware's.
+ */
+static int
+dreset(Devid drive)
+{
+	print("devbios: resetting %#ux...", drive);
+	/* ignore carry flag for Biosinit */
+	biosdiskcall(&regs, Biosinit, 0, drive, 0);
+	print("\n");
+	return regs.ax? -1: 0;		/* ax != 0 on error */
+}
+
+/*
+ * check drive for bios disk extensions (Biosckext call).
+ * returns the capabilities bitmap from cx, or -1 if the call fails or
+ * bx does not come back as Youreok.
+ */
+static int
+drivecap(Devid drive)
+{
+	int cap;
+
+	if (biosdiskcall(&regs, Biosckext, Imok, drive, 0) < 0)
+		/*
+		 * we have an old bios without extensions, in theory.
+		 * in practice, there may just be no drive for this number.
+		 */
+		return -1;
+	if(regs.bx != Youreok){
+		print("devbios: buggy bios: drive %#ux extension check "
+			 "returned %lux in bx\n", drive, regs.bx);
+		return -1;
+	}
+	cap = regs.cx;
+	if (Debug) {
+		print("bios drive %#ux extensions version %#x.%d cx %#ux\n",
+			drive, (uchar)(regs.ax >> 8), (uchar)regs.ax, cap);
+		/*
+		 * NOTE(review): this version check can reject the drive, but
+		 * it only runs when Debug is set, so the result of drivecap
+		 * depends on Debug -- confirm that is intended.
+		 */
+		if ((uchar)(regs.ax >> 8) < 0x30) {
+			print("drivecap: extensions prior to 0x30\n");
+			return -1;
+		}
+		print("\tsubsets supported:");
+		if (cap & Fixeddisk)
+			print(" fixed disk access;");
+		if (cap & Drlock)
+			print(" drive locking;");
+		if (cap & Edd)
+			print(" enhanced disk support;");
+		if (cap & Bit64ext)
+			print(" 64-bit extensions;");
+		print("\n");
+	}
+	delay(Pause);			/* pause to read the screen (DEBUG) */
+	return cap;
+}
+
+/*
+ * extended get size; reads bdp->id, fills in bdp->sectsz.
+ * returns the drive's size in bytes (physsects * sector size),
+ * or 0 if the bios call fails or the sector size is zero or
+ * larger than Maxsectsz (bdp->sectsz is then left untouched).
+ */
+static Devbytes
+extgetsize(Biosdev *bdp)
+{
+	ulong sectsz;
+	Edrvparam *edp;
+
+	/* the parameter block is built in the BIOSXCHG buffer */
+	edp = (Edrvparam *)BIOSXCHG;
+	memset(edp, 0, sizeof *edp);
+	edp->size = sizeof *edp;
+	edp->dpteseg = edp->dpteoff = ~0;	/* no pointer */
+	edp->dpilen = 44;
+
+	if (biosdiskcall(&regs, Biosedrvparam, 0, bdp->id, PADDR(edp)) < 0)
+		return 0;		/* old bios without extensions */
+	if(Debug) {
+		print("bios drive %#ux info flags %#ux", bdp->id, edp->flags);
+		if (edp->key == 0xbedd)
+			print("; edd 3.0  %.4s %.8s",
+				edp->bustype, edp->ifctype);
+		else
+			print("; NOT edd 3.0 compliant (key %#ux)", edp->key);
+		print("\n");
+	}
+	if (edp->sectsz <= 0) {
+		print("devbios: drive %#ux: sector size <= 0\n", bdp->id);
+		edp->sectsz = 1;		/* don't divide by 0 */
+		return 0;
+	}
+	sectsz = edp->sectsz;
+	if (sectsz > Maxsectsz) {
+		print("devbios: sector size %lud > %d\n", sectsz, Maxsectsz);
+		return 0;
+	}
+	bdp->sectsz = sectsz;
+	return edp->physsects * sectsz;
+}
+
+/*
+ * size of bios device dev in sectors (its byte size divided by its
+ * sector size), or -1 if dev is out of range or has no sector size yet.
+ */
+vlong
+biossize(uint dev)
+{
+	Biosdev *unit;
+
+	if (dev < biosndevs) {
+		unit = &bdev[dev];
+		if (unit->sectsz > 0)
+			return unit->size / unit->sectsz;
+	}
+	return -1;
+}
+
+/*
+ * sector size of bios device dev in bytes, or -1 if dev is out of range
+ * or has no sector size yet.
+ */
+long
+biossectsz(uint dev)
+{
+	Biosdev *unit;
+
+	if (dev < biosndevs) {
+		unit = &bdev[dev];
+		if (unit->sectsz > 0)
+			return unit->sectsz;
+	}
+	return -1;
+}
+
+/*
+ * read n bytes into a from bios device fs->dev, starting at that
+ * device's current offset (bdev[dev].offset, as set by biosseek), one
+ * sector at a time via sectread.  the device offset is advanced past
+ * what was read.
+ * returns the number of bytes read, n itself if n <= 0, or -1 on error
+ * (bad device number, zero sector size, negative offset or count, or
+ * a failed sector read).
+ */
+long
+biosread0(Bootfs *fs, void *a, long n)
+{
+	int want, got, part, dev;
+	long totnr, stuck;
+	Devbytes offset;
+	Biosdev *bdp;
+
+	dev = fs->dev;				/* only use of fs */
+	/* valid devices are 0..biosndevs-1, as in biossize and biossectsz */
+	if(dev < 0 || dev >= biosndevs)
+		return -1;
+	if (n <= 0)
+		return n;
+	bdp = &bdev[dev];
+	offset = bdp->offset;
+	stuck = 0;
+	/* give up after 4 consecutive short (partial-sector) reads */
+	for (totnr = 0; totnr < n && stuck < 4; totnr += got) {
+		if (bdp->sectsz == 0) {
+			print("devbios: zero sector size\n");
+			return -1;
+		}
+		want = bdp->sectsz;
+		if (totnr + want > n)
+			want = n - totnr;
+		if(0 && Debug && debugload)
+			print("bios%d, read: %ld @ off %lld, want: %d, id: %#ux\n",
+				dev, n, offset, want, bdp->id);
+		part = offset % bdp->sectsz;
+		if (part != 0) {	/* back up to start of sector */
+			offset -= part;
+			totnr  -= part;
+			if (totnr < 0) {
+				print("biosread0: negative count %ld\n", totnr);
+				return -1;
+			}
+		}
+		if ((vlong)offset < 0) {
+			print("biosread0: negative offset %lld\n", offset);
+			return -1;
+		}
+		got = sectread(bdp, (char *)a + totnr, want,
+			offset / bdp->sectsz);
+		if(got <= 0)
+			return -1;
+		offset += got;
+		bdp->offset = offset;
+		if (got < bdp->sectsz)
+			stuck++;	/* we'll have to re-read this sector */
+		else
+			stuck = 0;
+	}
+	return totnr;
+}
+
+/*
+ * set the current byte offset of bios device fs->dev to off, for use by
+ * the next biosread0.  returns off, or -1 if off is negative or fs->dev
+ * is not a known device.  off is not checked against the device size.
+ */
+vlong
+biosseek(Bootfs *fs, vlong off)
+{
+	int dev;
+
+	if (off < 0) {
+		print("biosseek(fs, %lld) is illegal\n", off);
+		return -1;
+	}
+	dev = fs->dev;
+	/* valid devices are 0..biosndevs-1, as in biossize and biossectsz */
+	if(dev < 0 || dev >= biosndevs) {
+		print("biosseek: fs->dev %d >= biosndevs %d\n", dev, biosndevs);
+		return -1;
+	}
+	bdev[dev].offset = off;	/* do not know size... (yet) */
+	return off;
+}
+
+/*
+ * read entry of biosdevtab.  directories are read through devdirread
+ * and biosgen; a data file is read by seeking its unit to off and then
+ * calling biosread0.  any other file type raises Eperm.
+ */
+static long
+biosread(Chan *c, void *db, long n, vlong off)
+{
+	Biosdev *bp;
+
+	switch(TYPE(c->qid)){
+	default:
+		/* presumably error() does not return; otherwise this falls through */
+		error(Eperm);
+	case Qzero:
+	case Qtopdir:
+		return devdirread(c, db, n, 0, 0, biosgen);
+	case Qdata:
+		bp = &bdev[UNIT(c->qid)];
+		if (bp->rootchan == nil)
+			panic("biosread: nil root chan for bios%ld",
+				UNIT(c->qid));
+		/* the unit's offset is shared state, so set it before each read */
+		biosseek(&bp->Bootfs, off);
+		return biosread0(&bp->Bootfs, db, n);
+	}
+}
+
+/*
+ * return a Bootfs for partition name of bios device i, or nil on failure.
+ * only the partition named "9fat" is accepted.  if chatty is non-zero,
+ * failures are reported with print.
+ * the result points at a single static Bootfs, so each call overwrites
+ * the one returned by the previous call.
+ */
+void *
+biosgetfspart(int i, char *name, int chatty)
+{
+	static Bootfs fs;
+
+	if(strcmp(name, "9fat") != 0){	// TODO use any bootfile partition given
+		if(chatty)
+			print("unknown partition bios%d!%s (use bios%d!9fat)\n",
+				i, name, i);
+		return nil;
+	}
+
+	fs.dev = i;
+	fs.diskread = biosread0;
+	fs.diskseek = biosseek;
+
+	/* NOTE(review): the disk path is fixed and does not depend on i -- confirm */
+	if(dosinit(&fs, "#S/sdB0/9fat") < 0){
+		if(chatty)
+			print("bios%d!%s does not contain a FAT file system\n",
+				i, name);
+		return nil;
+	}
+	return &fs;
+}
+
+/* write entry of biosdevtab: writing is not supported, so always raise an error */
+static long
+bioswrite(Chan *, void *, long, vlong)
+{
+	error("bios devices are read-only in bootstrap");
+	return 0;
+}
+
+/*
+ * device table for devbios: device character '☹', name "bios".
+ * entries named bios* are defined in this file; the dev* entries are
+ * the generic defaults.
+ */
+Dev biosdevtab = {
+	L'☹',
+	"bios",
+
+	biosreset,
+	biosinit,
+	devshutdown,
+	biosattach,
+	bioswalk,
+	biosstat,
+	biosopen,
+	devcreate,
+	biosclose,
+	biosread,
+	devbread,
+	bioswrite,
+	devbwrite,
+	devremove,
+	devwstat,
+	devpower,
+	devconfig,
+};

+ 402 - 0
sys/src/9/pcboot/dir.c

@@ -0,0 +1,402 @@
+/*
+ * directory reading
+ * from /sys/src/libc/9sys/dirread.c
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"ureg.h"
+
+enum
+{
+	DIRSIZE	= STATFIXLEN + 16 * 4		/* enough for encoded stat buf + some reasonable strings */
+};
+
+/*
+ * stat the open channel chan and return the result as a malloced Dir
+ * (strings stored in the same allocation), or nil on failure.
+ * the caller frees the result.
+ */
+Dir*
+dirchstat(Chan *chan)
+{
+	Dir *d;
+	uchar *buf;
+	int n, nd, i;
+
+	nd = DIRSIZE;
+	for(i=0; i<2; i++){	/* should work by the second try */
+		d = malloc(sizeof(Dir) + BIT16SZ + nd);
+		if(d == nil)
+			return nil;
+		/* the encoded stat is read into the space after the Dir itself */
+		buf = (uchar*)&d[1];
+		n = devtab[chan->type]->stat(chan, buf, BIT16SZ + nd);
+		if(n < BIT16SZ){
+			free(d);
+			return nil;
+		}
+		nd = GBIT16((uchar*)buf);	/* upper bound on size of Dir + strings */
+		if(nd <= n){
+			convM2D(buf, n, d, (char*)&d[1]);
+			return d;
+		}
+		/* else sizeof(Dir)+BIT16SZ+nd is plenty */
+		free(d);
+	}
+	return nil;
+}
+
+/*
+ * convert the ts bytes of encoded stat entries in buf into a malloced
+ * array of Dirs stored through d, with their strings in the same
+ * allocation after the array.
+ * returns the number of entries, 0 if ts <= 0 (*d is then nil), or -1 if
+ * buf does not consist of exactly ts bytes of valid entries or memory
+ * runs out (*d is then nil).  the caller frees *d.
+ */
+long
+dirpackage(uchar *buf, long ts, Dir **d)
+{
+	char *s;
+	long ss, i, n, nn, m;
+
+	*d = nil;
+	if(ts <= 0)
+		return 0;
+
+	/*
+	 * first find number of all stats, check they look like stats, & size all associated strings
+	 */
+	ss = 0;
+	n = 0;
+	for(i = 0; i < ts; i += m){
+		/* never read a size field or an entry that runs past buf+ts */
+		if(ts - i < BIT16SZ)
+			break;
+		m = BIT16SZ + GBIT16(&buf[i]);
+		if(m > ts - i || statcheck(&buf[i], m) < 0)
+			break;
+		ss += m;
+		n++;
+	}
+
+	/* any early exit above leaves i short of ts */
+	if(i != ts)
+		return -1;
+
+	*d = malloc(n * sizeof(Dir) + ss);
+	if(*d == nil)
+		return -1;
+
+	/*
+	 * then convert all buffers
+	 */
+	s = (char*)*d + n * sizeof(Dir);
+	nn = 0;
+	for(i = 0; i < ts; i += m){
+		m = BIT16SZ + GBIT16((uchar*)&buf[i]);
+		if(nn >= n || convM2D(&buf[i], m, *d + nn, s) != m){
+			free(*d);
+			*d = nil;
+			return -1;
+		}
+		nn++;
+		s += m;
+	}
+
+	return nn;
+}
+
+/*
+ * directory reading, from sysfile.c
+ */
+
+/*
+ * read up to n bytes of directory entries into va from the union
+ * directory c.  resumes at union element c->uri, using c->umc as the
+ * open channel for that element, and moves on to the next element when
+ * the current one yields nothing.  returns the byte count from the
+ * first element that produced data, or 0 when all are exhausted.
+ */
+long
+unionread(Chan *c, void *va, long n)
+{
+	int i;
+	long nr;
+	Mhead *m;
+	Mount *mount;
+
+	qlock(&c->umqlock);
+	m = c->umh;
+	rlock(&m->lock);
+	mount = m->mount;
+	/* bring mount in sync with c->uri and c->umc */
+	for(i = 0; mount != nil && i < c->uri; i++)
+		mount = mount->next;
+
+	nr = 0;
+	while(mount != nil){
+		/* Error causes component of union to be skipped */
+		if(mount->to && !waserror()){
+			/* open this element on first use */
+			if(c->umc == nil){
+				c->umc = cclone(mount->to);
+				c->umc = devtab[c->umc->type]->open(c->umc, OREAD);
+			}
+	
+			nr = devtab[c->umc->type]->read(c->umc, va, n, c->umc->offset);
+			c->umc->offset += nr;
+			poperror();
+		}
+		if(nr > 0)
+			break;
+
+		/* Advance to next element */
+		c->uri++;
+		if(c->umc){
+			cclose(c->umc);
+			c->umc = nil;
+		}
+		mount = mount->next;
+	}
+	runlock(&m->lock);
+	qunlock(&c->umqlock);
+	return nr;
+}
+
+/*
+ * restart union reading on c: go back to the first element (c->uri = 0)
+ * and close the channel open on the current element, if any.
+ */
+void
+unionrewind(Chan *c)
+{
+	qlock(&c->umqlock);
+	c->uri = 0;
+	if(c->umc){
+		cclose(c->umc);
+		c->umc = nil;
+	}
+	qunlock(&c->umqlock);
+}
+
+/*
+ * decode the fixed-size leading fields of the encoded stat entry at p
+ * into d (type, dev, qid, mode, atime, mtime, length); the strings that
+ * follow are not touched.  e is the end of the buffer holding the entry.
+ * returns the entry's total length including its size field, or -1 if
+ * the entry would extend past e.
+ */
+static int
+dirfixed(uchar *p, uchar *e, Dir *d)
+{
+	int len;
+
+	len = GBIT16(p)+BIT16SZ;
+	if(p + len > e)
+		return -1;
+
+	p += BIT16SZ;	/* ignore size */
+	/* the encoded type is passed through devno() to obtain d->type */
+	d->type = devno(GBIT16(p), 1);
+	p += BIT16SZ;
+	d->dev = GBIT32(p);
+	p += BIT32SZ;
+	d->qid.type = GBIT8(p);
+	p += BIT8SZ;
+	d->qid.vers = GBIT32(p);
+	p += BIT32SZ;
+	d->qid.path = GBIT64(p);
+	p += BIT64SZ;
+	d->mode = GBIT32(p);
+	p += BIT32SZ;
+	d->atime = GBIT32(p);
+	p += BIT32SZ;
+	d->mtime = GBIT32(p);
+	p += BIT32SZ;
+	d->length = GBIT64(p);
+
+	return len;
+}
+
+/*
+ * locate the name string inside the encoded stat entry at p.
+ * stores the name's length through n and returns a pointer to its
+ * first byte (the string is counted, not NUL-terminated).
+ */
+static char*
+dirname(uchar *p, int *n)
+{
+	uchar *namefld;
+
+	/* step over size, type, dev, qid (type, vers, path), mode, atime, mtime, length */
+	namefld = p + BIT16SZ + BIT16SZ + BIT32SZ
+		+ BIT8SZ + BIT32SZ + BIT64SZ
+		+ BIT32SZ + BIT32SZ + BIT32SZ + BIT64SZ;
+	*n = GBIT16(namefld);
+	return (char*)(namefld + BIT16SZ);
+}
+
+/*
+ * replace the name in the n-byte encoded stat entry at p with the len
+ * bytes at name, shifting the data after the old name as needed.
+ * maxn is the capacity of the buffer at p.
+ * returns the entry's new length, or BIT16SZ if the entry is empty
+ * (n == BIT16SZ) or the result would not fit in maxn bytes.
+ */
+static long
+dirsetname(char *name, int len, uchar *p, long n, long maxn)
+{
+	char *oname;
+	int olen;
+	long nn;
+
+	if(n == BIT16SZ)
+		return BIT16SZ;
+
+	oname = dirname(p, &olen);
+
+	nn = n+len-olen;
+	/* the size field is rewritten even if the entry then turns out not to fit */
+	PBIT16(p, nn-BIT16SZ);
+	if(nn > maxn)
+		return BIT16SZ;
+
+	/* move everything after the old name to where it belongs after the new one */
+	if(len != olen)
+		memmove(oname+len, oname+olen, p+n-(uchar*)(oname+olen));
+	PBIT16((uchar*)(oname-2), len);
+	memmove(oname, name, len);
+	return nn;
+}
+
+/*
+ * Mountfix might have caused the fixed results of the directory read
+ * to overflow the buffer.  Catch the overflow in c->dirrock.
+ */
+/*
+ * move the last directory entry in [p, *pe) to the end of c->dirrock,
+ * growing the rock as needed, and pull *pe back to the start of that
+ * entry so the caller's buffer no longer contains it.
+ */
+static void
+mountrock(Chan *c, uchar *p, uchar **pe)
+{
+	uchar *e, *r;
+	int len, n;
+
+	e = *pe;
+
+	/* find last directory entry */
+	for(;;){
+		len = BIT16SZ+GBIT16(p);
+		if(p+len >= e)
+			break;
+		p += len;
+	}
+
+	/* save it away */
+	qlock(&c->rockqlock);
+	if(c->nrock+len > c->mrock){
+		/* grow the rock to the next multiple of 1024 bytes */
+		n = ROUND(c->nrock+len, 1024);
+		r = smalloc(n);
+		memmove(r, c->dirrock, c->nrock);
+		free(c->dirrock);
+		c->dirrock = r;
+		c->mrock = n;
+	}
+	memmove(c->dirrock+c->nrock, p, len);
+	c->nrock += len;
+	qunlock(&c->rockqlock);
+
+	/* drop it */
+	*pe = p;
+}
+
+/*
+ * Satisfy a directory read with the results saved in c->dirrock.
+ */
+/*
+ * copy as many whole saved entries as fit in the n bytes at op, remove
+ * them from the rock, and store the number of bytes copied through nn.
+ * returns 1 if anything was copied, 0 if not (*nn is then not set).
+ */
+int
+mountrockread(Chan *c, uchar *op, long n, long *nn)
+{
+	long dirlen;
+	uchar *rp, *erp, *ep, *p;
+
+	/* common case */
+	if(c->nrock == 0)
+		return 0;
+
+	/* copy out what we can */
+	qlock(&c->rockqlock);
+	rp = c->dirrock;
+	erp = rp+c->nrock;
+	p = op;
+	ep = p+n;
+	while(rp+BIT16SZ <= erp){
+		dirlen = BIT16SZ+GBIT16(rp);
+		if(p+dirlen > ep)
+			break;
+		memmove(p, rp, dirlen);
+		p += dirlen;
+		rp += dirlen;
+	}
+
+	/* not even the first saved entry fit */
+	if(p == op){
+		qunlock(&c->rockqlock);
+		return 0;
+	}
+
+	/* shift the rest */
+	if(rp != erp)
+		memmove(c->dirrock, rp, erp-rp);
+	c->nrock = erp - rp;
+
+	*nn = p - op;
+	qunlock(&c->rockqlock);
+	return 1;
+}
+
+/* discard any entries saved in c's rock (the buffer itself is kept) */
+void
+mountrewind(Chan *c)
+{
+	c->nrock = 0;
+}
+
+/*
+ * Rewrite the results of a directory read to reflect current 
+ * name space bindings and mounts.  Specifically, replace
+ * directory entries for bind and mount points with the results
+ * of statting what is mounted there.  Except leave the old names.
+ */
+long
+mountfix(Chan *c, uchar *op, long n, long maxn)
+{
+	char *name;
+	int nbuf, nname;
+	Chan *nc;
+	Mhead *mh;
+	Mount *m;
+	uchar *p;
+	int dirlen, rest;
+	long l;
+	uchar *buf, *e;
+	Dir d;
+
+	/*
+	 * op holds n bytes of entries in a buffer of capacity maxn.
+	 * p walks the entries; e tracks the (possibly moving) end of data.
+	 */
+	p = op;
+	buf = nil;
+	nbuf = 0;
+	for(e=&p[n]; p+BIT16SZ<e; p+=dirlen){
+		dirlen = dirfixed(p, e, &d);
+		if(dirlen < 0)
+			break;
+		nc = nil;
+		mh = nil;
+		if(findmount(&nc, &mh, d.type, d.dev, d.qid)){
+			/*
+			 * If it's a union directory and the original is
+			 * in the union, don't rewrite anything.
+			 */
+			for(m=mh->mount; m; m=m->next)
+				if(eqchantdqid(m->to, d.type, d.dev, d.qid, 1))
+					goto Norewrite;
+
+			name = dirname(p, &nname);
+			/*
+			 * Do the stat but fix the name.  If it fails, leave old entry.
+			 * BUG: If it fails because there isn't room for the entry,
+			 * what can we do?  Nothing, really.  Might as well skip it.
+			 */
+			if(buf == nil){
+				buf = smalloc(4096);
+				nbuf = 4096;
+			}
+			if(waserror())
+				goto Norewrite;
+			l = devtab[nc->type]->stat(nc, buf, nbuf);
+			l = dirsetname(name, nname, buf, l, nbuf);
+			if(l == BIT16SZ)
+				error("dirsetname");
+			poperror();
+
+			/*
+			 * Shift data in buffer to accommodate new entry,
+			 * possibly overflowing into rock.
+			 */
+			rest = e - (p+dirlen);
+			if(l > dirlen){
+				/* push trailing entries into the rock until the new one fits */
+				while(p+l+rest > op+maxn){
+					mountrock(c, p, &e);
+					if(e == p){
+						/* the entry being rewritten was itself rocked */
+						dirlen = 0;
+						goto Norewrite;
+					}
+					rest = e - (p+dirlen);
+				}
+			}
+			if(l != dirlen){
+				memmove(p+l, p+dirlen, rest);
+				dirlen = l;
+				e = p+dirlen+rest;
+			}
+
+			/*
+			 * Rewrite directory entry.
+			 */
+			memmove(p, buf, l);
+
+		    Norewrite:
+			cclose(nc);
+			putmhead(mh);
+		}
+	}
+	if(buf)
+		free(buf);
+
+	/* the walk must end exactly at the end of the data */
+	if(p != e)
+		error("oops in rockfix");
+
+	return e-op;
+}

+ 413 - 0
sys/src/9/pcboot/diskload.c

@@ -0,0 +1,413 @@
+/*
+ * 9load - load next kernel from disk and start it
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"dosfs.h"
+#include	"../port/sd.h"
+
+/* from <libc.h> */
+#define	DIRMAX	(sizeof(Dir)+STATMAX)	/* max length of Dir structure */ 
+#define	STATMAX	65535U	/* max length of machine-independent stat structure */
+
+enum {
+	Bufsize = 8192,
+};
+
+int	dosdirread(File *f, char ***nmarray);
+int	isconf(char *name);
+
+static int progress = 1;
+static Bootfs fs;
+
+/*
+ * from 9load's bootp.c:
+ */
+
+/*
+ * print the name, size and contents (at most Maxfile bytes) of file.
+ * returns 0 on success, -1 if the file can't be read.
+ */
+static int
+dumpfile(char *file)
+{
+	int n;
+	char *buf;
+
+	buf = smalloc(Maxfile + 1);
+	n = readfile(file, buf, Maxfile);
+	if (n < 0) {
+		free(buf);		/* don't leak the buffer on failure */
+		return -1;
+	}
+	buf[n] = 0;
+	print("%s (%d bytes):\n", file, n);
+	print("%s\n", buf);
+	free(buf);
+	return 0;
+}
+
+/*
+ * read up to n bytes from c into p at the channel's own offset,
+ * advancing c->offset and c->devoffset.  for directories, the data
+ * comes from the rock, the union, or the device, and is then passed
+ * through mountfix.  returns the number of bytes placed in p.
+ */
+long
+dirread0(Chan *c, uchar *p, long n)
+{
+	long nn, nnn;
+	vlong off;
+
+	/*
+	 * The offset is passed through on directories, normally.
+	 * Sysseek complains, but pread is used by servers like exportfs,
+	 * that shouldn't need to worry about this issue.
+	 *
+	 * Notice that c->devoffset is the offset that c's dev is seeing.
+	 * The number of bytes read on this fd (c->offset) may be different
+	 * due to rewritings in rockfix.
+	 */
+	/* use and maintain channel's offset */
+	off = c->offset;
+	if(off < 0)
+		error(Enegoff);
+
+	if(off == 0){	/* rewind to the beginning of the directory */
+		c->offset = 0;
+		c->devoffset = 0;
+		mountrewind(c);
+		unionrewind(c);
+	}
+
+	if(c->qid.type & QTDIR){
+		if(mountrockread(c, p, n, &nn)){
+			/* do nothing: mountrockread filled buffer */
+		}else if(c->umh)
+			nn = unionread(c, p, n);
+		else{
+			if(off != c->offset)
+				error(Edirseek);
+			nn = devtab[c->type]->read(c, p, n, c->devoffset);
+		}
+		/* nn is what the source supplied; nnn is what remains in p after rewriting */
+		nnn = mountfix(c, p, nn, n);
+	}else
+		nnn = nn = devtab[c->type]->read(c, p, n, off);
+
+	lock(c);
+	c->devoffset += nn;
+	c->offset += nnn;
+	unlock(c);
+
+	/* nnn == 54, sizeof(Dir) == 60 */
+	return nnn;
+}
+
+/*
+ * read the next batch of entries from directory c and store them
+ * through d as a malloced array of Dirs (see dirpackage).
+ * returns the number of entries, or a negative value on failure.
+ */
+long
+dirread(Chan *c, Dir **d)
+{
+	uchar *statbuf;
+	long n;
+
+	statbuf = malloc(DIRMAX);
+	if(statbuf == nil)
+		return -1;
+
+	n = dirread0(c, statbuf, DIRMAX);
+	if(n < 0){
+		free(statbuf);
+		return n;
+	}
+
+	/* convert machine-independent representation to Dirs */
+	n = dirpackage(statbuf, n, d);
+	free(statbuf);
+	return n;
+}
+
+/*
+ * read #S/<dirp->name>/ctl and, for each plausible "part" line in it,
+ * report the partition and have readparts read the disk's partition
+ * tables.  returns 0 on success, -1 if the ctl file can't be read.
+ */
+static int
+addsdev(Dir *dirp)
+{
+	int n, f, lines, flds;
+	vlong start, end;
+	char *buf, *part;
+	char *line[64], *fld[5];
+	char ctl[64], disk[64];
+
+	buf = smalloc(Maxfile + 1);
+	snprint(ctl, sizeof ctl, "#S/%s/ctl", dirp->name);
+	n = readfile(ctl, buf, Maxfile);
+	if (n < 0) {
+		free(buf);		/* don't leak the buffer on failure */
+		return -1;
+	}
+	buf[n] = 0;
+
+	lines = getfields(buf, line, nelem(line), 0, "\r\n");
+	part = nil;
+	for (f = 0; f < lines; f++) {
+		/* only lines of the form "part name start end" are of interest */
+		flds = tokenize(line[f], fld, nelem(fld));
+		if (flds < 4 || strcmp(fld[0], "part") != 0)
+			continue;
+		kstrdup(&part, fld[1]);
+		start = strtoull(fld[2], nil, 0);
+		end   = strtoull(fld[3], nil, 0);
+		if (end > (vlong)100*(vlong)MB*MB) {
+			print("addsdev: implausible partition #S/%s/%s %lld %lld\n",
+				dirp->name, part, start, end);
+			continue;
+		}
+		/*
+		 * We are likely to only see a "data" partition on each disk.
+		 *
+		 * Read the on-disk partition tables & set in-core partitions
+		 * (disk, part, start, end).
+		 */
+		print("found partition #S/%s/%s %,lld %,lld\n",
+			dirp->name, part, start, end);
+		snprint(disk, sizeof disk, "#S/%s", dirp->name);
+		readparts(disk);
+	}
+	/* release the last name copied by kstrdup, if any */
+	if (part != nil)
+		free(part);
+	free(buf);
+	return 0;
+}
+
+static File file;
+
+/*
+ * look for kernels on a 9fat; if there's just one, return it.
+ * could treat x and x.gz as one kernel.
+ *
+ * a kernel is any root-directory name starting with "9pc" or "9k8".
+ * returns that name if exactly one exists, else nil; if there are
+ * several, they are listed on the console.  the name array obtained
+ * from dosdirread is not freed, as the result points into it.
+ */
+static char *
+findonekernel(Bootfs *fs)
+{
+	int n, kerns;
+	char *bootfile, *name;
+	char **array;
+
+	if(fswalk(fs, "", &file) <= 0) {
+		print("can't walk to ''\n");
+		return nil;
+	}
+	/* dosdirread doesn't set array when it fails; don't walk garbage */
+	array = nil;
+	if(dosdirread(&file, &array) < 0 || array == nil) {
+		print("can't read directory ''\n");
+		return nil;
+	}
+	bootfile = nil;
+	kerns = 0;
+	for (n = 0; (name = array[n]) != nil; n++)
+		if(strncmp(name, "9pc", 3) == 0 ||
+		   strncmp(name, "9k8", 3) == 0){
+			bootfile = name;
+			kerns++;
+		}
+	if (kerns > 1) {
+		/* list only the kernels, not every file in the directory */
+		print("found these kernels:");
+		for (n = 0; (name = array[n]) != nil; n++)
+			if(strncmp(name, "9pc", 3) == 0 ||
+			   strncmp(name, "9k8", 3) == 0)
+				print(" %s", name);
+		print("\n");
+	}
+	return kerns == 1? bootfile: nil;
+}
+
+/*
+ * prompt for a boot file specification, reading it into buf (len
+ * bytes) with any newline trimmed.  if bootfp is non-nil, a copy of
+ * the answer is also stored through it with kstrdup.
+ */
+void
+askbootfile(char *buf, int len, char **bootfp)
+{
+	getstr("Boot from", buf, len, "sdC0!9fat!9pccpu", 60);
+	trimnl(buf);
+	if (bootfp == nil)
+		return;
+	kstrdup(bootfp, buf);
+}
+
+/*
+ * try to boot a kernel stored directly in the partition (or file) at
+ * path: read it in Bufsize pieces and feed each piece to bootpass.
+ * always returns -1, since it only returns if the kernel was not
+ * started.
+ */
+int
+partboot(char *path)
+{
+	long n;
+	char *buf;
+	Boot boot;
+	Boot *b;
+	Chan *ch;
+
+	b = &boot;
+	memset(b, 0, sizeof *b);
+	b->state = INITKERNEL;
+	ch = namecopen(path, OREAD);
+	if (ch == nil) {
+		print("can't open partition %s\n", path);
+		return -1;
+	}
+	print("loading %s\n", path);
+	buf = smalloc(Bufsize);
+	/* NOTE(review): ch->offset is not advanced here; presumably the device or bootpass accounts for that -- confirm */
+	while((n = devtab[ch->type]->read(ch, buf, Bufsize, ch->offset)) > 0)
+		if(bootpass(b, buf, n) != MORE)
+			break;
+	bootpass(b, nil, 0);		/* attempts to boot */
+
+	free(buf);
+	cclose(ch);
+	return -1;
+}
+
+/*
+ * try to boot bootfile, given as disk!part or disk!part!file.
+ * without a file, the kernel is loaded straight from the partition;
+ * otherwise the partition is treated as a fat file system and file is
+ * loaded from it.  returns only if booting failed.
+ * fsroot must be nil or a fat root directory already dosinit'ed.
+ */
+static void
+trybootfile(char *bootfile, Bootfs *fsroot)
+{
+	int nf;
+	char fat[64];
+	char *disk, *part, *file, *bootcopy;
+	char *fields[4];
+	Boot boot;
+	static int didaddconf;
+
+	/* disk, part and file point into bootcopy, so free it only on the way out */
+	bootcopy = file = nil;
+	kstrdup(&bootcopy, bootfile);
+	nf = getfields(bootcopy, fields, nelem(fields), 0, "!");
+	switch(nf){
+	case 3:
+		file = fields[2];
+		/* fall through */
+	case 2:
+		disk = fields[0];
+		part = fields[1];
+		break;
+	default:
+		print("bad bootfile syntax: %s\n", bootfile);
+		free(bootcopy);
+		return;
+	}
+
+	/* configure the sd devices once, on the first attempt */
+	if(didaddconf == 0) {
+		didaddconf = 1;
+		sdaddallconfs(sdaddconf);
+	}
+
+	snprint(fat, sizeof fat, "#S/%s/%s", disk, part);
+	if (file == nil) { /* if no file, try to load from partition directly */
+		partboot(fat);
+		free(bootcopy);
+		return;
+	}
+
+	if (fsroot == nil) {
+		fsroot = &fs;
+		memset(fsroot, 0, sizeof *fsroot);
+		if (dosinit(fsroot, fat) < 0) {
+			print("dosinit %s failed\n", fat);
+			free(bootcopy);
+			return;
+		}
+	}
+
+	/* load kernel and jump to it */
+	memset(&boot, 0, sizeof boot);
+	boot.state = INITKERNEL;
+	fsboot(fsroot, file, &boot);
+
+	/* failed to boot */
+	free(bootcopy);
+}
+
+/*
+ * for a given disk's 9fat, find & load plan9.ini, parse it,
+ * extract kernel filename, load that kernel and jump to it.
+ * if plan9.ini names no bootfile, fall back to the only kernel in the
+ * 9fat, then to the disk's "kernel" partition, then to asking the user.
+ * returns only if booting failed.
+ */
+static void
+trydiskboot(char *disk)
+{
+	int n;
+	char fat[80];
+	char *ini, *bootfile;
+
+	/* mount the disk's 9fat */
+	memset(&fs, 0, sizeof fs);
+	snprint(fat, sizeof fat, "#S/%s/9fat", disk);
+	if (dosinit(&fs, fat) < 0) {
+		print("dosinit %s failed\n", fat);
+		return;
+	}
+
+	/* open plan9.ini, read it */
+	ini = smalloc(Maxfile+1);
+	if(fswalk(&fs, "plan9.ini", &file) <= 0) {
+		print("no plan9.ini in %s\n", fat);
+		n = 0;
+	} else {
+		n = fsread(&file, ini, Maxfile);
+		if (n < 0)
+			/* name the file; ini holds no data after a failed read */
+			panic("error reading plan9.ini in %s", fat);
+	}
+	ini[n] = 0;
+
+	/*
+	 * take note of plan9.ini contents.  consumes ini to make config vars,
+	 * thus we can't free ini.
+	 */
+	dotini(ini);
+	i8250console();			/* (re)configure serial port */
+
+	/* from here on, fat is reused as a scratch buffer for names */
+	bootfile = nil;			/* for kstrdup in askbootfile */
+	if(isconf("bootfile")) {
+		kstrdup(&bootfile, getconf("bootfile"));
+		if(strcmp(bootfile, "manual") == 0)
+			askbootfile(fat, sizeof fat, &bootfile);
+
+		/* pass arguments to kernels that can use them */
+		strecpy(BOOTLINE, BOOTLINE+BOOTLINELEN, bootfile);
+	} else if ((bootfile = findonekernel(&fs)) != nil) {  /* look in fat */
+		snprint(fat, sizeof fat, "%s!9fat!%s", disk, bootfile);
+		bootfile = fat;
+		print("no bootfile named in plan9.ini; found %s\n", bootfile);
+	} else {
+		/* if #S/disk/kernel partition exists, load from it. */
+		snprint(fat, sizeof fat, "#S/%s/kernel", disk);
+		partboot(fat);
+		/* last resort: ask the user */
+		askbootfile(fat, sizeof fat, &bootfile);
+	}
+	trybootfile(bootfile, &fs);
+
+	/* failed; try again */
+}
+
+/*
+ * find all the disks in #S, read their partition tables and set those
+ * partitions in core, mainly so that we can access 9fat file systems.
+ * for each disk's 9fat, read plan9.ini and boot the named kernel.
+ * if no disk boots, keep asking the user for a boot file.  never returns.
+ */
+void
+bootloadproc(void *)
+{
+	int n, dirs, sdev;
+	char kern[64];
+	char *sdevs[128];
+	Chan *sdch;
+	Dir *dirp, *dp;
+
+	/*
+	 * kstrdup is handed the address of each sdevs slot, so the slots
+	 * must start out nil rather than as stack garbage.
+	 */
+	memset(sdevs, 0, sizeof sdevs);
+
+	sdch = nil;
+	/* after any error below, clean up and start over from here */
+	while(waserror()) {
+		print("error caught at top level in bootload\n");
+		if(sdch) {
+			cclose(sdch);
+			sdch = nil;
+		}
+	}
+	bind("#S", "/dev", MAFTER);		/* try to force an attach */
+	sdch = namecopen("#S", OREAD);
+	if (sdch == nil)
+		panic("no disks (no #S)");
+	sdev = 0;
+	while ((dirs = dirread(sdch, &dirp)) > 0) {
+		/* every entry of #S other than sdctl is taken to be a disk */
+		for (dp = dirp; dirs-- > 0; dp++)
+			if (strcmp(dp->name, "sdctl") != 0) {
+				addsdev(dp);
+				if (sdev >= nelem(sdevs))
+					print("too many sdevs; ignoring %s\n",
+						dp->name);
+				else
+					kstrdup(&sdevs[sdev++], dp->name);
+			}
+		free(dirp);
+	}
+	cclose(sdch);
+	sdch = nil;
+	if (sdev == 0)
+		panic("no disks (in #S)");
+
+	print("disks:");
+	for (n = 0; n < sdev; n++)
+		print(" %s", sdevs[n]);
+	print("\n");
+
+	for (n = 0; n < sdev; n++) {
+		print("trying %s...", sdevs[n]);
+		trydiskboot(sdevs[n]);
+	}
+	USED(sdch);
+	for (;;) {
+		askbootfile(kern, sizeof kern, nil);
+		trybootfile(kern, nil);
+	}
+	// poperror();
+}

+ 608 - 0
sys/src/9/pcboot/dosboot.c

@@ -0,0 +1,608 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"dosfs.h"
+
+enum {
+	Dosfilemax = 8,
+	Dosextmax = 3,
+};
+
+/*
+ *  predeclared
+ */
+static void	bootdump(Dosboot*);
+static void	setname(Dosfile*, char*);
+
+/*
+ *  debugging
+ */
+#define chatty	0
+#define chat	if(chatty)print
+
+/*
+ *  block io buffers
+ */
+enum
+{
+	Nbio=	16,
+};
+typedef struct	Clustbuf	Clustbuf;
+struct Clustbuf
+{
+	int	age;
+	long	sector;
+	uchar	*iobuf;
+	Dos	*dos;
+	int	size;
+};
+Clustbuf	bio[Nbio];
+
+/*
+ *  get an io block from an io buffer
+ *
+ *  returns the cache entry holding the cluster that starts at sector
+ *  (relative to dos->start), reading it from disk if it is not cached,
+ *  or 0 if it can't be read.  the entry may be reused by a later call.
+ */
+Clustbuf*
+getclust(Dos *dos, long sector)
+{
+	Bootfs *fs;
+	Clustbuf *p, *oldest;
+	int size;
+
+	chat("getclust @ %ld\n", sector);
+
+	/*
+	 *  if we have it, just return it
+	 */
+	for(p = bio; p < &bio[Nbio]; p++){
+		if(sector == p->sector && dos == p->dos){
+			p->age = m->ticks;
+			chat("getclust %ld in cache\n", sector);
+			return p;
+		}
+	}
+
+	/*
+	 *  otherwise, reuse the oldest entry
+	 */
+	oldest = bio;
+	for(p = &bio[1]; p < &bio[Nbio]; p++){
+		if(p->age <= oldest->age)
+			oldest = p;
+	}
+	p = oldest;
+
+	/*
+	 *  the entry's contents are about to be overwritten; forget what
+	 *  it held so that a failed read can't leave a stale cache hit.
+	 */
+	p->dos = nil;
+
+	/*
+	 *  make sure the buffer is big enough
+	 */
+	size = dos->clustsize*dos->sectsize;
+	if(p->iobuf==0 || p->size < size){
+		/* release the too-small buffer instead of leaking it */
+		if(p->iobuf != 0)
+			free(p->iobuf);
+		p->iobuf = smalloc(size);
+	}
+	p->size = size;
+
+	/*
+	 *  read in the cluster
+	 */
+	fs = (Bootfs*)dos;		/* assume dos is embedded at start of an Bootfs */
+	chat("getclust addr %llud %p %s\n", ((sector+dos->start)*(vlong)dos->sectsize),
+		fs, fs->disk);
+	fs->devch->offset = (sector+dos->start) * (vlong)dos->sectsize;
+	if(myreadn(fs->devch, p->iobuf, size) != size){
+		chat("can't read block\n");
+		return 0;
+	}
+	USED(fs);
+	p->age = m->ticks;
+	p->dos = dos;
+	p->sector = sector;
+	chat("getclust %ld read\n", sector);
+	return p;
+}
+
+/*
+ *  walk the fat one level ( n is a current cluster number ).
+ *  return the new cluster number or -1 if no more.
+ *  -1 is also returned if n is out of range, the fat width is not
+ *  12, 16 or 32 bits, or the fat can't be read.
+ */
+static long
+fatwalk(Dos *dos, int n)
+{
+	ulong k, sect;
+	Clustbuf *p;
+	int o;
+
+	chat("fatwalk %d\n", n);
+
+	if(n < 2 || n >= dos->fatclusters)
+		return -1;
+
+	/* k = byte offset of entry n within the fat */
+	switch(dos->fatbits){
+	case 12:
+		k = (3*n)/2; break;
+	case 16:
+		k = 2*n; break;
+	case 32:
+		k = 4*n; break;
+	default:
+		return -1;
+	}
+	if(k >= dos->fatsize*dos->sectsize)
+		panic("getfat");
+
+	if (dos->sectsize == 0 || dos->clustsize == 0)
+		panic("fatwalk: zero sector or cluster size");
+	/* the fat is read in cluster-sized pieces: find the piece and offset */
+	sect = (k/(dos->sectsize*dos->clustsize))*dos->clustsize + dos->fataddr;
+	o = k%(dos->sectsize*dos->clustsize);
+	p = getclust(dos, sect);
+	if(p == nil)
+		return -1;
+	k = p->iobuf[o++];
+	/* an entry may straddle two pieces */
+	if(o >= dos->sectsize*dos->clustsize){
+		p = getclust(dos, sect+dos->clustsize);
+		if(p == nil)
+			return -1;
+		o = 0;
+	}
+	k |= p->iobuf[o++]<<8;
+	if(dos->fatbits == 12){
+		/* odd entries use the high 12 bits, even ones the low 12 */
+		if(n&1)
+			k >>= 4;
+		else
+			k &= 0xfff;
+		if(k >= 0xff8)
+			k = -1;
+	}
+	else if (dos->fatbits == 32){
+		if(o >= dos->sectsize*dos->clustsize){
+			p = getclust(dos, sect+dos->clustsize);
+			if(p == nil)
+				return -1;
+			o = 0;
+		}
+		k |= p->iobuf[o++]<<16;
+		k |= (ulong)p->iobuf[o]<<24;
+		/*
+		 * only the low 28 bits of a FAT32 entry are the cluster
+		 * number; the top 4 are reserved and must be ignored, so
+		 * end of chain is 0x0ffffff8 and up.
+		 */
+		k &= 0x0fffffff;
+		if (k >= 0x0ffffff8)
+			k = -1;
+	}
+	else
+		k = k < 0xfff8 ? k : -1;
+	chat("fatwalk %d -> %lud\n", n, k);
+	return k;
+}
+
+/*
+ *  map a file's logical cluster address to a physical sector address
+ *
+ *  ltarget is the index of the wanted cluster within the file.
+ *  returns the sector at which that cluster starts, or -1 if the file
+ *  has no such cluster.  for chained files, the position reached is
+ *  remembered in fp->lcurrent and fp->pcurrent to speed up later calls.
+ */
+static long
+fileaddr(Dosfile *fp, long ltarget)
+{
+	Dos *dos = fp->dos;
+	long l;
+	long p;
+
+	chat("fileaddr %8.8s %ld\n", fp->name, ltarget);
+	/*
+	 *  root directory is contiguous and easy (unless FAT32)
+	 */
+	if(fp->pstart == 0 && dos->rootsize != 0) {
+		if(ltarget*dos->sectsize*dos->clustsize >= dos->rootsize*sizeof(Dosdir))
+			return -1;
+		l = dos->rootaddr + ltarget*dos->clustsize;
+		chat("fileaddr %ld -> %ld\n", ltarget, l);
+		return l;
+	}
+
+	/*
+	 *  anything else requires a walk through the fat
+	 */
+	if(ltarget >= fp->lcurrent && fp->pcurrent){
+		/* start at the current point */
+		l = fp->lcurrent;
+		p = fp->pcurrent;
+	} else {
+		/* go back to the beginning */
+		l = 0;
+		p = fp->pstart;
+	}
+	while(l != ltarget){
+		/* walk the fat */
+		p = fatwalk(dos, p);
+		if(p < 0)
+			return -1;
+		l++;
+	}
+	fp->lcurrent = l;
+	fp->pcurrent = p;
+
+	/*
+	 *  clusters start at 2 instead of 0 (why? - presotto)
+	 */
+	l =  dos->dataaddr + (p-2)*dos->clustsize;
+	chat("fileaddr %ld -> %ld\n", ltarget, l);
+	return l;
+}
+
+/*
+ *  read from a dos file
+ *
+ *  reads up to n bytes into a from fp's current offset and advances
+ *  the offset.  reads of ordinary files are cut off at the file's
+ *  length; directories have no such limit.
+ *  returns the number of bytes read (0 at end of file), or -1 if a
+ *  cluster can't be located or read.
+ */
+long
+dosread(Dosfile *fp, void *a, long n)
+{
+	long addr;
+	long rv;
+	int i;
+	int off;
+	Clustbuf *p;
+	uchar *from, *to;
+
+	if((fp->attr & DOSDIR) == 0){
+		if(fp->offset >= fp->length)
+			return 0;
+		if(fp->offset+n > fp->length)
+			n = fp->length - fp->offset;
+	}
+
+	to = a;
+	/* each pass copies the part of one cluster that is wanted */
+	for(rv = 0; rv < n; rv+=i){
+		/*
+		 *  read the cluster
+		 */
+		addr = fileaddr(fp, fp->offset/fp->dos->clustbytes);
+		if(addr < 0)
+			return -1;
+		p = getclust(fp->dos, addr);
+		if(p == 0)
+			return -1;
+
+		/*
+		 *  copy the bytes we need
+		 */
+		off = fp->offset % fp->dos->clustbytes;
+		from = &p->iobuf[off];
+		i = n - rv;
+		if(i > fp->dos->clustbytes - off)
+			i = fp->dos->clustbytes - off;
+		memmove(to, from, i);
+		to += i;
+		fp->offset += i;
+	}
+
+	return rv;
+}
+
+/*
+ *  walk a directory returns
+ * 	-1 if something went wrong
+ *	 0 if not found
+ *	 1 if found
+ *
+ *  f must be a directory.  on success, f is changed in place to refer
+ *  to the entry called name, positioned at its start.
+ */
+int
+doswalk(File *f, char *name)
+{
+	Dosdir d;
+	long n;
+	Dosfile *file;
+
+	chat("doswalk %s\n", name);
+
+	file = &f->dos;
+
+	if((file->attr & DOSDIR) == 0){
+		chat("walking non-directory!\n");
+		return -1;
+	}
+
+	/* setname stores name in file->name and file->ext for the comparisons below */
+	setname(file, name);
+
+	file->offset = 0;	/* start at the beginning */
+	while((n = dosread(file, &d, sizeof(d))) == sizeof(d)){
+		chat("comparing to %8.8s.%3.3s\n", (char*)d.name, (char*)d.ext);
+		if(memcmp(file->name, d.name, sizeof(d.name)) != 0)
+			continue;
+		if(memcmp(file->ext, d.ext, sizeof(d.ext)) != 0)
+			continue;
+		if(d.attr & DOSVLABEL){
+			chat("%8.8s.%3.3s is a LABEL\n", (char*)d.name, (char*)d.ext);
+			continue;
+		}
+		file->attr = d.attr;
+		file->pstart = GSHORT(d.start);
+		/* FAT32 keeps the high half of the start cluster separately */
+		if (file->dos->fatbits == 32)
+			file->pstart |= GSHORT(d.highstart) << 16;
+		file->length = GLONG(d.length);
+		file->pcurrent = 0;
+		file->lcurrent = 0;
+		file->offset = 0;
+		return 1;
+	}
+	/* a short or empty read means the name isn't there; a negative one is an error */
+	return n >= 0 ? 0 : -1;
+}
+
+/* convert the ASCII letters A-Z in the string s to lower case, in place */
+void
+lowercase(char *s)
+{
+	char c;
+
+	while((c = *s) != '\0'){
+		if(c >= 'A' && c <= 'Z')
+			*s = c + ('a' - 'A');
+		s++;
+	}
+}
+
+/*
+ * overwrite the trailing blanks in the first len bytes of s with NULs.
+ * bytes at s[len] and beyond are not touched.
+ */
+void
+trim(char *s, int len)
+{
+	int i;
+
+	for(i = len - 1; i >= 0 && s[i] == ' '; i--)
+		s[i] = '\0';
+}
+
+/*
+ *  read a directory and return the file names in a malloced
+ *	array whose address is stored through nmarray.
+ *	each name is a separately allocated, lower-cased string,
+ *	"name" or "name.ext".  the array has room for one slot more
+ *	than the names stored (presumably left nil by smalloc -
+ *	confirm that smalloc zeroes).
+ * 	-1 if something went wrong
+ *	else number of dir. entries
+ */
+int
+dosdirread(File *f, char ***nmarray)
+{
+	int entries;
+	long i;
+	char buf[Dosfilemax+1+Dosextmax+1];
+	char **nms;
+	Dosdir d;
+	Dosfile *file;
+
+	chat("dosdirread\n");
+	file = &f->dos;
+	if((file->attr & DOSDIR) == 0){
+		chat("walking non-directory!\n");
+		return -1;
+	}
+
+	/* first pass: count the entries to size the array of char*s */
+	file->offset = 0;		/* start at the beginning */
+	for(entries = 0; dosread(file, &d, sizeof d) == sizeof d; entries++)
+		;
+	nms = smalloc(sizeof(char *) * (entries + 1));
+
+	/* second pass: populate the array, skipping blank names */
+	file->offset = 0;		/* rewind */
+	for(i = 0; i < entries && dosread(file, &d, sizeof d) == sizeof d; ){
+		trim((char *)d.name, Dosfilemax);
+		trim((char *)d.ext, Dosextmax);
+		if (d.name[0] == '\0')
+			continue;
+		if (d.ext[0] == '\0')
+			/* d.ext follows d.name, so its NUL terminates a full name */
+			kstrdup(&nms[i], (char *)d.name);
+		else {
+			snprint(buf, sizeof buf, "%.*s.%.*s",
+				Dosfilemax, (char *)d.name,
+				Dosextmax, (char *)d.ext);
+			kstrdup(&nms[i], buf);
+		}
+		lowercase(nms[i++]);
+	}
+	*nmarray = nms;
+
+	/* report how many names were stored, as the header promises */
+	return i;
+}
+
+/*
+ *  instructions that boot blocks can start with
+ */
+#define	JMPSHORT	0xeb
+#define JMPNEAR		0xe9
+
+/*
+ *  read in a segment
+ */
+long
+dosreadseg(File *f, void *va, long len)
+{
+	char *dst;
+	long got, want, nr;
+	Dosfile *fp;
+
+	fp = &f->dos;
+	dst = va;
+	got = 0;
+	while(got < len){
+		/* ask for at most 8K at a time */
+		want = len - got;
+		if(want > 8*1024)
+			want = 8*1024;
+		nr = dosread(fp, dst + got, want);
+		if(nr <= 0)
+			break;		/* end of file or error: return what we have */
+		print(".");
+		got += nr;
+	}
+	return got;
+}
+
+/*
+ *  open the named disk, read its first sector and, if that looks
+ *  like a FAT boot sector, fill in fs->dos, set up fs->root as the
+ *  root directory and install the dos read and walk routines in fs.
+ *  returns 0 on success, -1 if the disk can't be opened or read, or
+ *  if the boot sector is not a plausible FAT BPB.
+ */
+int
+dosinit(Bootfs *fs, char *disk)
+{
+	Clustbuf *p;
+	Dosboot *b;
+	int i;
+	Dos *dos;
+	Dosfile *root;
+
+	/* fs->disk isn't set yet, so report the name we were handed */
+	chat("dosinit0 %p %s\n", fs, disk);
+
+	fs->disk = disk;
+	fs->devch = namecopen(disk, OREAD);
+	if (fs->devch == nil) {
+		print("dosinit: can't open %s\n", disk);
+		return -1;
+	}
+
+	dos = &fs->dos;
+	/* defaults till we know better */
+	dos->sectsize = 512;
+	dos->clustsize = 1;
+
+	/* get first sector */
+	p = getclust(dos, 0);
+	if(p == 0){
+		chat("can't read boot block\n");
+		return -1;
+	}
+
+	chat("dosinit0a\n");
+
+	p->dos = 0;				/* don't cache this block */
+	b = (Dosboot *)p->iobuf;
+	if(b->magic[0] != JMPNEAR && (b->magic[0] != JMPSHORT || b->magic[2] != 0x90)){
+		chat("no dos file system %x %x %x %x\n",
+			b->magic[0], b->magic[1], b->magic[2], b->magic[3]);
+		return -1;
+	}
+
+	if(chatty)
+		bootdump(b);
+
+	if(b->clustsize == 0) {
+unreasonable:
+		if(chatty){
+			print("unreasonable FAT BPB: ");
+			for(i=0; i<3+8+2+1; i++)
+				print(" %.2ux", p->iobuf[i]);
+			print("\n");
+		}
+		return -1;
+	}
+
+	chat("dosinit1\n");
+
+	/*
+	 * Determine the systems' wondrous properties.
+	 * There are heuristics here, but there's no real way
+	 * of knowing if this is a reasonable FAT.
+	 */
+	dos->fatbits = 0;
+	dos->sectsize = GSHORT(b->sectsize);
+	if(dos->sectsize & 0xFF)
+		goto unreasonable;
+	dos->clustsize = b->clustsize;
+	dos->clustbytes = dos->sectsize*dos->clustsize;
+	/*
+	 * reject silly cluster sizes before sectsize is used as a divisor:
+	 * clustsize is non-zero here, so this also rules out a sectsize
+	 * of 0, which the multiple-of-256 test above lets through.
+	 */
+	if(dos->clustbytes < 512 || dos->clustbytes > 64*1024)
+		goto unreasonable;
+	dos->nresrv = GSHORT(b->nresrv);
+	dos->nfats = b->nfats;
+	dos->fatsize = GSHORT(b->fatsize);
+	dos->rootsize = GSHORT(b->rootsize);
+	dos->volsize = GSHORT(b->volsize);
+	if(dos->volsize == 0)
+		dos->volsize = GLONG(b->bigvolsize);
+	dos->mediadesc = b->mediadesc;
+	if(dos->fatsize == 0) {
+		/* a zero 16-bit FAT size is taken to mean FAT32 */
+		chat("fat32\n");
+		dos->rootsize = 0;
+		dos->fatsize = GLONG(b->bigfatsize);
+		dos->fatbits = 32;
+	}
+	dos->fataddr = dos->nresrv;
+	if (dos->rootsize == 0) {
+		/* root directory is a cluster chain; data follows the FATs */
+		dos->rootaddr = 0;
+		dos->rootclust = GLONG(b->rootdirstartclust);
+		dos->dataaddr = dos->fataddr + dos->nfats*dos->fatsize;
+	} else {
+		/* fixed-size root directory sits between the FATs and the data */
+		dos->rootaddr = dos->fataddr + dos->nfats*dos->fatsize;
+		i = dos->rootsize*sizeof(Dosdir) + dos->sectsize - 1;
+		i = i/dos->sectsize;
+		dos->dataaddr = dos->rootaddr + i;
+	}
+	dos->fatclusters = 2+(dos->volsize - dos->dataaddr)/dos->clustsize;
+	if(dos->fatbits != 32) {
+		if(dos->fatclusters < 4087)
+			dos->fatbits = 12;
+		else
+			dos->fatbits = 16;
+	}
+	dos->freeptr = 2;
+
+	chat("dosinit2\n");
+
+	/*
+	 *  set up the root
+	 */
+
+	fs->root.fs = fs;
+	root = &fs->root.dos;
+	root->dos = dos;
+	root->pstart = dos->rootsize == 0 ? dos->rootclust : 0;
+	root->pcurrent = root->lcurrent = 0;
+	root->offset = 0;
+	root->attr = DOSDIR;
+	root->length = dos->rootsize*sizeof(Dosdir);
+
+	chat("dosinit3\n");
+
+	fs->read = dosreadseg;
+	fs->walk = doswalk;
+	return 0;
+}
+
+/*
+ *  print the fields of boot sector b for debugging.
+ *  does nothing unless chatty is set.
+ */
+static void
+bootdump(Dosboot *b)
+{
+	if(chatty == 0)
+		return;
+	print("magic: 0x%2.2x 0x%2.2x 0x%2.2x ",
+		b->magic[0], b->magic[1], b->magic[2]);
+	print("version: \"%8.8s\"\n", (char*)b->version);
+	print("sectsize: %d ", GSHORT(b->sectsize));
+	print("allocsize: %d ", b->clustsize);
+	print("nresrv: %d ", GSHORT(b->nresrv));
+	print("nfats: %d\n", b->nfats);
+	print("rootsize: %d ", GSHORT(b->rootsize));
+	print("volsize: %d ", GSHORT(b->volsize));
+	print("mediadesc: 0x%2.2x\n", b->mediadesc);
+	print("fatsize: %d ", GSHORT(b->fatsize));
+	print("trksize: %d ", GSHORT(b->trksize));
+	print("nheads: %d ", GSHORT(b->nheads));
+	print("nhidden: %d ", GLONG(b->nhidden));
+	print("bigvolsize: %d\n", GLONG(b->bigvolsize));
+/*
+	these fields are commented out of struct Dosboot in dosfs.h:
+
+	print("driveno: %d\n", b->driveno);
+	print("reserved0: 0x%2.2x\n", b->reserved0);
+	print("bootsig: 0x%2.2x\n", b->bootsig);
+	print("volid: 0x%8.8x\n", GLONG(b->volid));
+	print("label: \"%11.11s\"\n", b->label);
+*/
+}
+
+
+/*
+ *  set up a dos file name
+ */
+/*
+ *  convert from to the space-padded, upper-cased 8.3 form kept in
+ *  fp->name (8 bytes) and fp->ext (3 bytes).  neither field is
+ *  NUL-terminated.  base-name characters beyond the eighth and
+ *  extension characters beyond the third are dropped.
+ */
+static void
+setname(Dosfile *fp, char *from)
+{
+	char *to;
+	int sawdot;
+
+	sawdot = 0;
+	to = fp->name;
+	for(; *from && to-fp->name < 8; from++, to++){
+		if(*from == '.'){
+			from++;
+			sawdot = 1;
+			break;
+		}
+		if(*from >= 'a' && *from <= 'z')
+			*to = *from + 'A' - 'a';
+		else
+			*to = *from;
+	}
+	while(to - fp->name < 8)
+		*to++ = ' ';
+
+	/*
+	 * if the base name was longer than 8 characters, discard the
+	 * excess rather than letting it spill into the extension.
+	 */
+	if(!sawdot)
+		while(*from && *from != '.')
+			from++;
+
+	/* from might be 12345678.123: don't save the '.' in ext */
+	if(*from == '.')
+		from++;
+
+	to = fp->ext;
+	for(; *from && to-fp->ext < 3; from++, to++){
+		if(*from >= 'a' && *from <= 'z')
+			*to = *from + 'A' - 'a';
+		else
+			*to = *from;
+	}
+	while(to-fp->ext < 3)
+		*to++ = ' ';
+
+	chat("name is %8.8s.%3.3s\n", fp->name, fp->ext);
+}

+ 156 - 0
sys/src/9/pcboot/dosfs.h

@@ -0,0 +1,156 @@
+typedef struct Dosboot	Dosboot;
+typedef struct Dos	Dos;
+typedef struct Dosdir	Dosdir;
+typedef struct Dosfile	Dosfile;
+typedef struct Dospart	Dospart;
+typedef struct File File;
+typedef struct Bootfs Bootfs;
+
+int fsread(File *file, void *a, long n);
+int fsboot(Bootfs *fs, char *path, Boot *b);
+int fswalk(Bootfs *fs, char *path, File *f);
+
+/*
+ * one partition table entry, kept as raw bytes (16 in all);
+ * multi-byte fields are byte arrays and must be decoded by the reader.
+ */
+struct Dospart
+{
+	uchar flag;		/* active flag */
+	uchar shead;		/* starting head */
+	uchar scs[2];		/* starting cylinder/sector */
+	uchar type;		/* partition type */
+	uchar ehead;		/* ending head */
+	uchar ecs[2];		/* ending cylinder/sector */
+	uchar start[4];		/* starting sector */
+	uchar len[4];		/* length in sectors */
+};
+
+#define FAT12	0x01
+#define FAT16	0x04
+#define EXTEND	0x05
+#define FATHUGE	0x06
+#define FAT32	0x0b
+#define FAT32X	0x0c
+#define EXTHUGE	0x0f
+#define DMDDO	0x54
+#define PLAN9	0x39
+#define LEXTEND 0x85
+
+/*
+ * in-memory state of one file or directory on a dos file system
+ */
+struct Dosfile{
+	Dos	*dos;		/* owning dos file system */
+	char	name[8];	/* space-padded base name; not NUL-terminated */
+	char	ext[3];		/* space-padded extension; not NUL-terminated */
+	uchar	attr;		/* DOS* attribute bits, e.g. DOSDIR */
+	long	length;		/* size in bytes */
+	long	pstart;		/* physical start cluster address */
+	long	pcurrent;	/* physical current cluster address */
+	long	lcurrent;	/* logical current cluster address */
+	long	offset;		/* current read position in bytes */
+};
+
+/*
+ * geometry of one dos file system, as worked out by dosinit
+ * from the boot sector
+ */
+struct Dos{
+	long	start;		/* start of file system */
+	int	sectsize;	/* in bytes */
+	int	clustsize;	/* in sectors */
+	int	clustbytes;	/* in bytes */
+	int	nresrv;		/* sectors */
+	int	nfats;		/* usually 2 */
+	int	rootsize;	/* number of entries */
+	int	volsize;	/* in sectors */
+	int	mediadesc;
+	int	fatsize;	/* in sectors */
+	int	fatclusters;
+	int	fatbits;	/* 12, 16 or 32 */
+	long	fataddr;	/* sector number */
+	long	rootaddr;	/* sector number of a fixed root directory, else 0 */
+	long	rootclust;	/* first cluster of the root when rootsize is 0 */
+	long	dataaddr;	/* sector number where the data area starts */
+	long	freeptr;
+};
+
+typedef struct Dosboot Dosboot;
+typedef struct Dosdir Dosdir;
+
+/*
+ * boot sector as it appears on disk.  every field is a byte or byte
+ * array; the code reads the multi-byte ones with GSHORT and GLONG.
+ */
+struct Dosboot{
+	uchar	magic[3];	/* jump instruction; checked against JMPNEAR/JMPSHORT */
+	uchar	version[8];
+	uchar	sectsize[2];	/* bytes per sector */
+	uchar	clustsize;	/* sectors per cluster */
+	uchar	nresrv[2];	/* reserved sectors before the first FAT */
+	uchar	nfats;
+	uchar	rootsize[2];	/* root directory entries; 0 if root is a cluster chain */
+	uchar	volsize[2];	/* sectors; 0 means use bigvolsize */
+	uchar	mediadesc;
+	uchar	fatsize[2];	/* sectors per FAT; 0 means use bigfatsize */
+	uchar	trksize[2];
+	uchar	nheads[2];
+	uchar	nhidden[4];
+	uchar	bigvolsize[4];
+/* fat 32 */
+	uchar	bigfatsize[4];
+	uchar	extflags[2];
+	uchar	fsversion[2];
+	uchar	rootdirstartclust[4];
+	uchar	fsinfosect[2];
+	uchar	backupbootsect[2];
+/* ???
+	uchar	driveno;
+	uchar	reserved0;
+	uchar	bootsig;
+	uchar	volid[4];
+	uchar	label[11];
+	uchar	reserved1[8];
+*/
+};
+
+/*
+ * directory entry as it appears on disk (32 bytes of raw fields)
+ */
+struct Dosdir{
+	uchar	name[8];	/* space-padded base name */
+	uchar	ext[3];		/* space-padded extension */
+	uchar	attr;		/* DOS* attribute bits */
+	uchar	lowercase;
+	uchar	hundredth;
+	uchar	ctime[2];
+	uchar	cdate[2];
+	uchar	adate[2];
+	uchar	highstart[2];	/* high 16 bits of start cluster; used when fatbits is 32 */
+	uchar	mtime[2];
+	uchar	mdate[2];
+	uchar	start[2];	/* low 16 bits of start cluster */
+	uchar	length[4];	/* file size in bytes */
+};
+
+#define	DOSRONLY	0x01
+#define	DOSHIDDEN	0x02
+#define	DOSSYSTEM	0x04
+#define	DOSVLABEL	0x08
+#define	DOSDIR	0x10
+#define	DOSARCH	0x20
+
+// #pragma incomplete Bootfs
+
+/*
+ * a file on a boot file system: file-system-specific state
+ * plus the Bootfs it belongs to
+ */
+struct File{
+	union{
+		Dosfile	dos;	/* state when the file system is dos */
+		int walked;
+	};
+	Bootfs	*fs;		/* owning file system; fsread dispatches through it */
+	char	*path;		/* path handed to fswalk */
+};
+
+/*
+ * a file system the boot loader can read from
+ */
+struct Bootfs{
+	union {
+		Dos dos;	/* geometry when the file system is dos */
+	};
+	Chan	*devch;		/* channel opened on disk by dosinit */
+	char	*disk;		/* name of the underlying disk */
+
+	/* for *bios.c */
+	int	dev;				/* device id */
+	long	(*diskread)(Bootfs*, void*, long);	/* disk read routine */
+	vlong	(*diskseek)(Bootfs*, vlong);	/* disk seek routine */
+
+	long	(*read)(File*, void*, long);	/* read from a file; called by fsread */
+	int	(*walk)(File*, char*);		/* walk one path element; called by fswalk */
+	File	root;				/* root directory; fswalk starts from a copy */
+};
+
+extern int	dosinit(Bootfs*, char *);
+
+#define BADPTR(x) ((ulong)(x) < 0x80000000)

+ 314 - 0
sys/src/9/pcboot/expand.c

@@ -0,0 +1,314 @@
+/*
+ * expand gzipped boot loader appended to this binary and execute it.
+ *
+ * due to Russ Cox, rsc@swtch.com.
+ * see http://plan9.bell-labs.com/wiki/plan9/Replacing_9load
+ */
+#include <u.h>
+#include <libc.h>
+#include <a.out.h>
+#include <flate.h>
+#include "mem.h"
+#include "expand.h"
+
+#include "inflate.guts.c"
+
+#define KB		1024
+#define MB		(1024*1024)
+
+extern char edata[];
+
+/* ldecomp.s */
+void mb586(void);
+void splhi(void);
+void wbinvd(void);
+
+/* inflate.guts.c */
+int gunzip(uchar*, int, uchar*, int);
+
+int isexec(void*);
+int isgzip(uchar*);
+void run(void*);
+
+#pragma varargck type "d" ulong
+#pragma varargck type "x" ulong
+
+static uchar *kernel = (uchar*)Bootkernaddr;
+static char *dbrk = (char*)Mallocbase;
+
+/*
+ * reverse the order of the four low bytes of p
+ */
+ulong
+swap(ulong p)
+{
+	ulong b0, b1, b2, b3;
+
+	b0 = p << 24;			/* lowest byte to the top */
+	b1 = (p << 8) & 0x00FF0000;
+	b2 = (p >> 8) & 0x0000FF00;
+	b3 = p >> 24;			/* highest byte to the bottom */
+	return b0 | b3 | b1 | b2;
+}
+
+enum {
+	/* keyboard controller ports & cmds */
+	Data=		0x60,		/* data port */
+	Status=		0x64,		/* status port */
+	 Inready=	0x01,		/*  input character ready */
+	 Outbusy=	0x02,		/*  output busy */
+	Cmd=		0x64,		/* command port (write only) */
+
+	/* system control port a */
+	Sysctla=	0x92,
+	 Sysctlreset=	1<<0,
+	 Sysctla20ena=	1<<1,
+};
+
+/*
+ * report whether address line 20 is enabled: write different values
+ * at address 0 and at 1MB and see whether both survive.  if they read
+ * back equal, the two addresses alias and a20 is taken to be off.
+ * the original word at address 0 is restored; the word at 1MB is not.
+ */
+static int
+isa20on(void)
+{
+	int r;
+	ulong o;
+	ulong *zp, *mb1p;
+
+	zp = 0;
+	mb1p = (ulong *)MB;
+	o = *zp;			/* save the word at 0 */
+
+	*zp = 0x1234;
+	*mb1p = 0x8765;
+	/* mb586 and wbinvd are in ldecomp.s; presumably they make the stores visible before the compare */
+	mb586();
+	wbinvd();
+	r = *zp != *mb1p;
+
+	*zp = o;
+	return r;
+}
+
+/*
+ * spin for roughly ms milliseconds; the count is a guess, not calibrated
+ */
+static void
+delay(uint ms)				/* approximate */
+{
+	int spin;
+
+	for(; ms > 0; ms--){
+		spin = 1000*1000;
+		while(spin > 0)
+			spin--;
+	}
+}
+
+/*
+ * wait for the keyboard controller to go quiet, discarding any
+ * pending input.  polls the status port up to 1000 times, a delay(1)
+ * apart.  returns 0 once neither Outbusy nor Inready is set, -1 if
+ * the controller never settles.
+ */
+static int
+kbdinit(void)
+{
+	int c, try;
+
+	/* wait for a quiescent controller */
+	for(try = 1000; try > 0; try--){
+		c = inb(Status);
+		if((c & (Outbusy | Inready)) == 0)
+			return 0;	/* quiet, even if only on the last poll */
+		if(c & Inready)
+			inb(Data);	/* drain the pending byte */
+		delay(1);
+	}
+	return -1;
+}
+
+/*
+ *  wait for output no longer busy (but not forever,
+ *  there might not be a keyboard controller).
+ */
+static void
+outready(void)
+{
+	int tries;
+
+	/* give up after 1000 polls so a missing controller can't hang us */
+	tries = 1000;
+	while(tries > 0 && (inb(Status) & Outbusy)){
+		delay(1);
+		tries--;
+	}
+}
+
+/*
+ *  ask 8042 to enable the use of address bit 20
+ */
+/*
+ * returns -1 if the controller never becomes quiescent (kbdinit
+ * fails), otherwise 0 after sending the command.  0 means only that
+ * the bytes were written, not that a20 is known to be on.
+ */
+int
+i8042a20(void)
+{
+	if (kbdinit() < 0)
+		return -1;
+	outready();
+	outb(Cmd, 0xD1);	/* command byte; presumably "write output port" - confirm against 8042 docs */
+	outready();
+	outb(Data, 0xDF);	/* value for that command; presumably has the a20 bit set */
+	outready();
+	return 0;
+}
+
+/*
+ * make sure address line 20 is enabled: try the keyboard controller
+ * first, fall back to system control port a if that fails, and
+ * complain if a20 still looks off afterwards.
+ */
+void
+a20init(void)
+{
+	int ctl;
+
+	if(!isa20on()){
+		/* original method, via kbd ctlr */
+		if(i8042a20() < 0){
+			/* newer method, last resort */
+			ctl = inb(Sysctla);
+			if((ctl & Sysctla20ena) == 0){
+				ctl &= ~Sysctlreset;	/* never write the reset bit */
+				outb(Sysctla, ctl | Sysctla20ena);
+			}
+		}
+		if(!isa20on())
+			print("a20 didn't come on!\n");
+	}
+}
+
+/*
+ * entry point of the expander: move the payload that follows this
+ * program's data up to Bootkernaddr, gunzip it if it is gzipped,
+ * and run it if it is an executable.  does not return; every
+ * failure path ends in exits, which spins forever.
+ */
+void
+_main(void)
+{
+	int ksize;
+	Exec *exec;
+
+	splhi();
+	a20init();		/* don't wrap addresses at 1MB boundaries */
+	/* everything from edata up to Lowmemsz is treated as payload */
+	ksize = Lowmemsz - (ulong)edata;	/* payload size */
+	memmove(kernel, edata, ksize);
+	/* payload is safely copied; now clear from edata to end */
+	memset(edata, 0, end - edata);
+
+	cgainit();
+	if(isgzip(kernel)) {
+		print("gz...");
+		/* park the compressed image at Unzipbuf and inflate back into kernel */
+		memmove((uchar*)Unzipbuf, kernel, ksize);
+
+		/* we have to uncompress the entire kernel to get OK status */
+		if(gunzip(kernel, Bootkernmax, (uchar*)Unzipbuf, ksize) < 0){
+			print("gzip failed.");
+			exits(0);
+		}
+	}
+	if(isexec(kernel))
+		run(kernel);
+
+	/* only reached if the payload is not a recognized executable */
+	exec = (Exec *)kernel;
+	print("unrecognized program; magic # 0x%x\n", swap(exec->magic));
+	exits(0);
+}
+
+/*
+ * true if v starts with an Exec header whose byte-swapped
+ * magic is I_MAGIC or S_MAGIC
+ */
+int
+isexec(void *v)
+{
+	Exec *hdr;
+	ulong magic;
+
+	hdr = v;
+	magic = swap(hdr->magic);
+	if(magic == I_MAGIC)
+		return 1;
+	return magic == S_MAGIC;
+}
+
+/*
+ * start the executable image at v: slide its data segment up to
+ * the next page boundary after the text, then call its entry point.
+ * header fields are passed through swap before use.  if the entry
+ * point ever returns, report it and spin in exits.
+ */
+void
+run(void *v)
+{
+	ulong entry, text, data;
+	uchar *base;
+	Exec *exec;
+
+	base = v;
+	exec = v;
+	entry = swap(exec->entry) & ~KSEGM;	/* strip the KSEGM bits from the entry address */
+	text = swap(exec->text);
+	data = swap(exec->data);
+	/*
+	 * align data segment on the expected page boundary.
+	 * sizeof(Exec)+text is offset from base to data segment.
+	 */
+	memmove(base+PGROUND(sizeof(Exec)+text), base+sizeof(Exec)+text, data);
+
+	print("starting protected-mode loader at 0x%x\n", entry);
+	((void(*)(void))entry)();
+
+	print("exec failed");
+	exits(0);
+}
+
+/*
+ * true if p begins with the two gzip magic bytes 0x1F 0x8B
+ */
+int
+isgzip(uchar *p)
+{
+	if(p[0] != 0x1F)
+		return 0;
+	return p[1] == 0x8B;
+}
+
+/*
+ * trivial bump allocator: hand out the next n bytes (rounded up to
+ * a multiple of BY2WD) starting at dbrk.  there is no limit check
+ * and memory is never reused; see free.
+ */
+void*
+malloc(ulong n)
+{
+	char *start;
+
+	start = dbrk;
+	dbrk = start + ROUND(n, BY2WD);
+	return start;
+}
+
+/*
+ * does nothing: memory handed out by malloc above is never reclaimed
+ */
+void
+free(void*)
+{
+}
+
+/*
+ * write the NUL-terminated string s to the cga console
+ */
+void
+puts(char *s)
+{
+	while(*s != '\0')
+		cgaputc(*s++);
+}
+
+/*
+ * minimal formatted print to the cga console.
+ * verbs: %x and %p (ulong, lower-case hex), %d (long, decimal),
+ * %s (string), %% (a literal percent sign).  no flags, widths or
+ * precisions are understood; any other verb is echoed as written and
+ * consumes no argument.  always returns 0.
+ */
+int
+print(char *fmt, ...)
+{
+	long d;
+	ulong x;
+	char *p, *s, buf[24], lit[3];
+	va_list arg;
+	static char *hex = "0123456789abcdef";
+
+	va_start(arg, fmt);
+	for(p = fmt; *p; p++){
+		if(*p != '%') {
+			cgaputc(*p);
+			continue;
+		}
+		switch(*++p){
+		case 'p':
+		case 'x':
+			x = va_arg(arg, ulong);
+			s = buf+sizeof buf;
+			*--s = 0;
+			do{			/* do-while so that 0 prints as "0" */
+				*--s = hex[x&15];
+				x >>= 4;
+			}while(x != 0);
+			break;
+		case 'd':
+			d = va_arg(arg, long);
+			/* take the magnitude unsigned so the most negative value works */
+			x = d;
+			if(d < 0)
+				x = -x;
+			s = buf+sizeof buf;
+			*--s = 0;
+			do{
+				*--s = (x%10)+'0';
+				x /= 10;
+			}while(x != 0);
+			if(d < 0)
+				*--s = '-';
+			break;
+		case 's':
+			s = va_arg(arg, char*);
+			break;
+		case '%':
+			s = "%";
+			break;
+		case 0:
+			/* format ended in a lone '%' */
+			va_end(arg);
+			return 0;
+		default:
+			/* unknown verb: echo it instead of printing through an unset pointer */
+			lit[0] = '%';
+			lit[1] = *p;
+			lit[2] = 0;
+			s = lit;
+			break;
+		}
+		puts(s);
+	}
+	va_end(arg);
+	return 0;
+}
+
+/*
+ * there is nowhere to exit to: ignore the argument and spin forever
+ */
+void
+exits(char*)
+{
+	for(;;)
+		;
+}

+ 4 - 0
sys/src/9/pcboot/expand.h

@@ -0,0 +1,4 @@
+void cgainit(void);
+void cgaputc(int);
+int inb(int);
+void outb(int, int);

+ 63 - 0
sys/src/9/pcboot/fns.h

@@ -0,0 +1,63 @@
+#include "../pc/fns.h"
+
+vlong biossize(uint);
+long biossectsz(uint);
+void bootloadproc(void *);
+void changeconf(char *fmt, ...);
+Dir *dirchstat(Chan *chan);
+int getstr(char *prompt, char *buf, int size, char *def, int timeout);
+int gunzip(uchar*, int, uchar*, int);
+void i8042a20(void);
+void (*i8237alloc)(void);
+void impulse(void);
+uintptr mapping(uintptr);
+void mkmultiboot(void);
+void mmuinit0(void);
+long mountfix(Chan *c, uchar *op, long n, long maxn);
+void mountrewind(Chan *c);
+int mountrockread(Chan *c, uchar *op, long n, long *nn);
+Chan *namecopen(char *, int);
+void readlsconf(void);
+void trimnl(char *s);
+void unionrewind(Chan *c);
+void warp64(uvlong);
+
+/* boot.c */
+int bootpass(Boot *b, void *vbuf, int nbuf);
+
+/* conf.c */
+void addconf(char *fmt, ...);
+int dotini(char *inibuf);
+
+/* devether.c */
+Chan *etherattach(char *spec);
+int parseether(uchar*, char*);
+
+/* fs.c */
+typedef struct File File;
+
+/* parts.c */
+int readparts(char *);
+
+/* pxe.c */
+int bind(char *old, char *new, int flag);
+long unmount(char *old, char *new);
+
+/* rand.c */
+void srand(long);
+int nrand(int);
+
+/* stub.c */
+long chdir(char *dir);
+Chan *namecopen(char *name, int mode);
+Chan *enamecopen(char *name, int mode);
+Chan *nameccreate(char *name, int mode);
+Chan *enameccreate(char *name, int mode);
+int myreadn(Chan *c, void *vp, long n);
+int readfile(char *file, void *buf, int len);
+
+long dirpackage(uchar *buf, long ts, Dir **d);
+
+/* libip */
+int equivip4(uchar *, uchar *);
+int equivip6(uchar *, uchar *);

+ 103 - 0
sys/src/9/pcboot/fs.c

@@ -0,0 +1,103 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"dosfs.h"
+
+enum {
+	Bufsize = 8192,
+};
+
+/*
+ *  grab next element from a path, return the pointer to unprocessed portion of
+ *  path.
+ */
+/*
+ * skips leading slashes, then copies the next component of path
+ * (up to the next '/', ' ' or NUL) into elem and NUL-terminates it.
+ * elem must have room for NAMELEN bytes, so a component may be at
+ * most NAMELEN-1 characters; longer ones are rejected.
+ * returns the unprocessed remainder of path, or 0 if there is no
+ * further component or the component is too long.
+ */
+char *
+nextelem(char *path, char *elem)
+{
+	int i;
+
+	while(*path == '/')
+		path++;
+	if(*path==0 || *path==' ')
+		return 0;
+	for(i=0; *path!='\0' && *path!='/' && *path!=' '; i++){
+		/* leave room for the terminating NUL in a NAMELEN-byte buffer */
+		if(i==NAMELEN-1){
+			print("name component too long\n");
+			return 0;
+		}
+		*elem++ = *path++;
+	}
+	*elem = '\0';
+	return path;
+}
+
+/*
+ * walk path on fs one element at a time, starting from a copy of
+ * the root left in *f.
+ * returns 1 if every element was found, 0 if one was missing, -1 on
+ * error.  a failure inside nextelem (e.g. an over-long component)
+ * ends the walk as if the path were complete and so returns 1.
+ */
+int
+fswalk(Bootfs *fs, char *path, File *f)
+{
+	char element[NAMELEN];
+	int r;
+
+	*f = fs->root;
+	if(BADPTR(fs->walk))
+		panic("fswalk bad pointer fs->walk");
+
+	f->path = path;
+	for(;;){
+		path = nextelem(path, element);
+		if(path == 0)
+			break;
+		r = fs->walk(f, element);
+		if(r == -1)
+			return -1;
+		if(r == 0)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ *  boot
+ */
+/*
+ * find path on fs and feed its contents to bootpass in Bufsize
+ * pieces, then call bootpass once more with a nil buffer to attempt
+ * the boot.  always returns -1: reaching a return at all means the
+ * boot did not happen.
+ */
+int
+fsboot(Bootfs *fs, char *path, Boot *b)
+{
+	long n;
+	char *buf;
+	File file;
+	int found;
+
+	memset(&file, 0, sizeof file);
+	found = fswalk(fs, path, &file);
+	if(found == -1){
+		print("error walking to %s\n", path);
+		return -1;
+	}
+	if(found == 0){
+		print("%s not found\n", path);
+		return -1;
+	}
+	if(found == 1)
+		print("found %s\n", path);
+
+	buf = smalloc(Bufsize);
+	for(;;){
+		n = fsread(&file, buf, Bufsize);
+		if(n <= 0)
+			break;		/* end of file or read error */
+		if(bootpass(b, buf, n) != MORE)
+			break;		/* loader wants no more input */
+	}
+
+	bootpass(b, nil, 0);	/* tries boot */
+	free(buf);
+	return -1;
+}
+
+/*
+ * read up to n bytes of file into a through the owning file
+ * system's read routine; panics if either pointer on the way
+ * fails the BADPTR check.
+ */
+int
+fsread(File *file, void *a, long n)
+{
+	Bootfs *fs;
+
+	fs = file->fs;
+	if(BADPTR(fs))
+		panic("bad pointer file->fs in fsread");
+	if(BADPTR(fs->read))
+		panic("bad pointer file->fs->read in fsread");
+	return fs->read(file, a, n);
+}

+ 11 - 0
sys/src/9/pcboot/inflate.c

@@ -0,0 +1,11 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"../port/error.h"
+#include	<flate.h>
+
+#include	"inflate.guts.c"

+ 180 - 0
sys/src/9/pcboot/inflate.guts.c

@@ -0,0 +1,180 @@
+/* included by expand and 9boot with different header files */
+
+typedef struct Biobuf	Biobuf;
+
+struct Biobuf
+{
+	uchar *bp;
+	uchar *p;
+	uchar *ep;
+};
+
+static ulong	Boffset(Biobuf *bp);
+static int	crcwrite(void *out, void *buf, int n);
+static ulong	get4(Biobuf *b);
+static int	getc(void*);
+static int	header(Biobuf*);
+static ulong	offset(Biobuf*);
+static int	trailer(Biobuf*, Biobuf*);
+
+/* GZIP flags */
+enum {
+	Ftext=		(1<<0),
+	Fhcrc=		(1<<1),
+	Fextra=		(1<<2),
+	Fname=		(1<<3),
+	Fcomment=	(1<<4),
+
+	GZCRCPOLY	= 0xedb88320UL,
+};
+
+static ulong	*crctab;
+static ulong	crc;
+
+/*
+ * expand the gzip image in[0..inn) into out[0..outn).
+ * returns the uncompressed length on success, or a (non-zero)
+ * Flate error code on failure; the caller in expand.c tests for < 0.
+ * the length returned is the full uncompressed size even if that is
+ * more than outn; crcwrite counts bytes it has no room to store, so
+ * callers should compare the result with outn.
+ */
+int
+gunzip(uchar *out, int outn, uchar *in, int inn)
+{
+	Biobuf bin, bout;
+	int err;
+
+	crc = 0;
+	/* build the crc table once rather than on every call */
+	if(crctab == nil)
+		crctab = mkcrctab(GZCRCPOLY);
+	if(crctab == nil)
+		return FlateNoMem;
+	err = inflateinit();
+	if(err != FlateOk){
+		print("inflateinit failed: %s\n", flateerr(err));
+		return err;
+	}
+
+	bin.bp = bin.p = in;
+	bin.ep = in+inn;
+	bout.bp = bout.p = out;
+	bout.ep = out+outn;
+
+	err = header(&bin);
+	if(err != FlateOk)
+		return err;
+
+	err = inflate(&bout, crcwrite, &bin, getc);
+	if(err != FlateOk){
+		/* don't go on to check a trailer we may not have reached */
+		print("inflate failed: %s\n", flateerr(err));
+		return err;
+	}
+
+	err = trailer(&bout, &bin);
+	if(err != FlateOk)
+		return err;
+
+	return Boffset(&bout);
+}
+
+/*
+ * consume the gzip member header at the front of bin, leaving bin
+ * positioned at the start of the compressed data.
+ * returns FlateOk, or FlateCorrupted if the magic or compression
+ * method is wrong or the input ends inside the header.
+ */
+static int
+header(Biobuf *bin)
+{
+	int c, i, flag, lo, hi;
+
+	if(getc(bin) != 0x1f || getc(bin) != 0x8b){
+		print("bad magic\n");
+		return FlateCorrupted;
+	}
+	if(getc(bin) != 8){
+		print("unknown compression type\n");
+		return FlateCorrupted;
+	}
+
+	flag = getc(bin);
+	if(flag < 0)
+		return FlateCorrupted;
+
+	/* mod time */
+	get4(bin);
+
+	/* extra flags */
+	getc(bin);
+
+	/* OS type */
+	getc(bin);
+
+	/* extra field: a two-byte little-endian length, then that many bytes */
+	if(flag & Fextra){
+		lo = getc(bin);
+		hi = getc(bin);
+		if(lo < 0 || hi < 0)
+			return FlateCorrupted;
+		for(i = lo | hi<<8; i > 0; i--)
+			if(getc(bin) < 0)
+				return FlateCorrupted;
+	}
+
+	/* name: NUL-terminated; stop at end of input rather than spin */
+	if(flag&Fname)
+		while((c = getc(bin)) != 0)
+			if(c < 0)
+				return FlateCorrupted;
+
+	/* comment: likewise */
+	if(flag&Fcomment)
+		while((c = getc(bin)) != 0)
+			if(c < 0)
+				return FlateCorrupted;
+
+	/* crc16 */
+	if(flag&Fhcrc) {
+		getc(bin);
+		getc(bin);
+	}
+
+	return FlateOk;
+}
+
+/*
+ * check the two 4-byte trailer words that follow the compressed
+ * data in bin: first the crc of the output, then its length.
+ */
+static int
+trailer(Biobuf *bout, Biobuf *bin)
+{
+	ulong want;
+
+	/* crc32 */
+	want = get4(bin);
+	if(want != crc){
+		print("crc mismatch\n");
+		return FlateCorrupted;
+	}
+
+	/* length */
+	want = get4(bin);
+	if(want != Boffset(bout)){
+		print("bad output len\n");
+		return FlateCorrupted;
+	}
+	return FlateOk;
+}
+
+/*
+ * read four bytes from b, least significant first, and combine them
+ */
+static ulong
+get4(Biobuf *b)
+{
+	ulong v;
+	int c, shift;
+
+	v = 0;
+	for(shift = 0; shift < 32; shift += 8){
+		c = getc(b);
+		v |= c << shift;
+	}
+	return v;
+}
+
+/*
+ * next input byte from the Biobuf in, or -1 (after printing "EOF")
+ * once the buffer is exhausted.  takes void* so it can be passed to
+ * inflate as its input routine.
+ */
+static int
+getc(void *in)
+{
+	Biobuf *bp;
+
+	bp = in;
+	if(bp->p < bp->ep)
+		return *bp->p++;
+	print("EOF");
+	return -1;
+}
+
+/*
+ * number of bytes between the start of bp's buffer and its current position
+ */
+static ulong
+Boffset(Biobuf *bp)
+{
+	ulong n;
+
+	n = bp->p - bp->bp;
+	return n;
+}
+
+/*
+ * output routine handed to inflate: fold buf[0..n) into the running
+ * crc and append it to the output Biobuf.
+ * only as many bytes as fit before bp->ep are copied, but bp->p is
+ * still advanced by the full n and n is returned, so Boffset keeps
+ * reporting the true uncompressed length and p may run past ep.
+ */
+static int
+crcwrite(void *out, void *buf, int n)
+{
+	Biobuf *bp;
+	int nn;
+
+	crc = blockcrc(crctab, crc, buf, n);
+	bp = out;
+	nn = n;
+	/* clamp the copy (not the accounting) to the room left */
+	if(nn > bp->ep-bp->p)
+		nn = bp->ep-bp->p;
+	if(nn > 0)
+		memmove(bp->p, buf, nn);
+	bp->p += n;
+	return n;
+}

+ 141 - 0
sys/src/9/pcboot/iso9660.h

@@ -0,0 +1,141 @@
+/*
+ * ISO 9660 CD format
+ */
+
+#define	VOLDESC	16	/* sector number */
+
+/*
+ * L means little-endian, M means big-endian, and LM means little-endian
+ * then again big-endian.
+ */
+typedef uchar		Byte2L[2];
+typedef uchar		Byte2M[2];
+typedef uchar		Byte4LM[4];
+typedef uchar		Byte4L[4];
+typedef uchar		Byte4M[4];
+typedef uchar		Byte8LM[8];
+typedef union Drec	Drec;
+typedef union Voldesc	Voldesc;
+
+enum {
+	BootIso		= 0,
+	PrimaryIso	= 1,
+	SupplementaryIso = 2,
+	PartitionIso	= 3,
+	TerminatorIso	= 255,
+};
+
+enum {
+	Cdsec	= 2048,
+	Maxname = 256,
+};
+
+/*
+ * one volume descriptor sector, viewable as raw bytes or through
+ * the ISO 9660 (z) or High Sierra (r) field layouts.
+ * NOTE(review): the members of z.boot add up to 2049 bytes, one more
+ * than Cdsec, while z.desc adds up to exactly 2048 - confirm whether
+ * the `unused' byte in boot is intended.
+ */
+union Voldesc {			/* volume descriptor */
+	uchar	byte[Cdsec];
+	union {			/* for CD001, the ECMA standard */
+		struct {
+			uchar	type;
+			uchar	stdid[5];
+			uchar	version;
+			uchar	unused;
+			uchar	sysid[32];
+			uchar	bootid[32];
+			uchar	data[1977];
+		} boot;
+		struct {
+			uchar	type;
+			uchar	stdid[5];
+			uchar	version;
+			uchar	flags;
+			uchar	sysid[32];
+			uchar	volid[32];
+			Byte8LM	partloc;
+			Byte8LM	size;
+			uchar	escapes[32];
+			Byte4LM	vsetsize;
+			Byte4LM	vseqno;
+			Byte4LM	blksize;
+			Byte8LM	ptabsize;
+			Byte4L	lptable;
+			Byte4L	optlptable;
+			Byte4M	mptable;
+			Byte4M	optmptable;
+			uchar	rootdir[34];
+			uchar	volsetid[128];
+			uchar	pubid[128];
+			uchar	prepid[128];
+			uchar	appid[128];
+			uchar	copyright[37];
+			uchar	abstract[37];
+			uchar	bibliography[37];
+			uchar	cdate[17];
+			uchar	mdate[17];
+			uchar	expdate[17];
+			uchar	effdate[17];
+			uchar	fsversion;
+			uchar	unused3[1];
+			uchar	appuse[512];
+			uchar	unused4[653];
+		} desc;
+	} z;
+	union {			/* for CDROM, the `High Sierra' standard */
+		struct {
+			Byte8LM	number;
+			uchar	type;
+			uchar	stdid[5];
+			uchar	version;
+			uchar	flags;
+			uchar	sysid[32];
+			uchar	volid[32];
+			Byte8LM	partloc;
+			Byte8LM	size;
+			uchar	escapes[32];
+			Byte4LM	vsetsize;
+			Byte4LM	vseqno;
+			Byte4LM	blksize;
+			uchar	quux[40];
+			uchar	rootdir[34];
+			uchar	volsetid[128];
+			uchar	pubid[128];
+			uchar	prepid[128];
+			uchar	appid[128];
+			uchar	copyright[32];
+			uchar	abstract[32];
+			uchar	cdate[16];
+			uchar	mdate[16];
+			uchar	expdate[16];
+			uchar	effdate[16];
+			uchar	fsversion;
+		} desc;
+	} r;
+};
+
+/*
+ * directory record.  the first struct is the full field layout;
+ * the second overlays it so that r_flags names the byte at offset
+ * 24, which is where the first layout has tzone (the High Sierra
+ * flags byte, per the comment on tzone).
+ */
+union Drec {
+	struct {
+		uchar	reclen;
+		uchar	attrlen;
+		Byte8LM	addr;
+		Byte8LM	size;
+		uchar	date[6];
+		uchar	tzone;		/* flags in high sierra */
+		uchar	flags;		/* ? in high sierra */
+		uchar	unitsize;	/* ? in high sierra */
+		uchar	gapsize;	/* ? in high sierra */
+		Byte4LM	vseqno;		/* ? in high sierra */
+		uchar	namelen;
+		uchar	name[1];	/* first byte of the name; namelen bytes presumably follow on disk */
+	};
+	struct {
+		uchar	r_pad[24];
+		uchar	r_flags;
+	};
+};
+
+/*
+ * per-file state for reading an ISO 9660 or High Sierra file system
+ */
+struct Isofile {
+	short	fmt;		/* 'z' if iso, 'r' if high sierra */
+	short	blksize;
+	vlong	offset;		/* true offset when reading directory */
+	long	odelta;		/* true size of directory just read */
+	vlong	doffset;	/* plan9 offset when reading directory */
+	Drec	d;		/* directory record for this file */
+};

+ 1 - 0
sys/src/9/pcboot/ktzero.s

@@ -0,0 +1 @@
+TEXT _KTZERO(SB), $0

+ 663 - 0
sys/src/9/pcboot/l.s

@@ -0,0 +1,663 @@
+/*
+ * non-startup assembly-language assist
+ */
+
+#include "mem.h"
+#include "/sys/src/boot/pc/x16.h"
+#undef DELAY
+
+#define PADDR(a)	((a) & ~KZERO)
+#define KADDR(a)	(KZERO|(a))
+
+/*
+ * Some machine instructions not handled by 8[al].
+ */
+#define OP16		BYTE $0x66
+#define DELAY		BYTE $0xEB; BYTE $0x00	/* JMP .+2 */
+#define CPUID		BYTE $0x0F; BYTE $0xA2	/* CPUID, argument in AX */
+#define WRMSR		BYTE $0x0F; BYTE $0x30	/* WRMSR, argument in AX/DX (lo/hi) */
+#define RDTSC 		BYTE $0x0F; BYTE $0x31	/* RDTSC, result in AX/DX (lo/hi) */
+#define RDMSR		BYTE $0x0F; BYTE $0x32	/* RDMSR, result in AX/DX (lo/hi) */
+#define HLT		BYTE $0xF4
+#define INVLPG	BYTE $0x0F; BYTE $0x01; BYTE $0x39	/* INVLPG (%ecx) */
+#define WBINVD	BYTE $0x0F; BYTE $0x09
+
+/*
+ * Macros for calculating offsets within the page directory base
+ * and page tables. Note that these are assembler-specific hence
+ * the '<<2'.
+ */
+#define PDO(a)		(((((a))>>22) & 0x03FF)<<2)
+#define PTO(a)		(((((a))>>12) & 0x03FF)<<2)
+
+TEXT pagingoff(SB), $0
+	/*
+	 * pagingoff(entry): first step of leaving paged mode to start a
+	 * loaded kernel.  maps the KZERO page-directory entry at address 0
+	 * as well, puts entry in DX for _stop32pg to jump to, and continues
+	 * at the physical address of _nopaging.
+	 */
+	DELAY				/* JMP .+2 */
+
+	/*
+	 *  use a jump to an absolute location to get the PC out of
+	 *  KZERO.  first establishes double mapping of first few MB.
+	 */
+	MOVL	CR3, CX				/* load address of PDB */
+	ADDL	$KZERO, CX
+	MOVL	PDO(KZERO)(CX), DX		/* double-map KZERO at 0 */
+	MOVL	DX, PDO(0)(CX)
+
+	MOVL	CR3, CX
+	MOVL	CX, CR3				/* load and flush the mmu */
+
+	MOVL	  entry+0(FP), DX
+
+	LEAL	_nopaging-KZERO(SB),AX
+	JMP*	AX				/* jump to identity-map */
+
+TEXT _nopaging(SB), $0
+	/*
+	 * reached from pagingoff, running at a low address with DX
+	 * holding the entry point.  switches to the stack at RMSTACK,
+	 * loads the GDT through the low address of _gdtptr16r, clears
+	 * PG in CR0 and continues at the low address of _stop32pg.
+	 */
+	DELAY				/* JMP .+2 */
+
+	/* switch to low stack */
+	MOVL	SP, AX
+	MOVL	$RMSTACK, SP
+//	PUSHL	AX
+
+	/* change gdt to physical pointer */
+	MOVL	_gdtptr16r-KZERO(SB), GDTR
+
+	/*
+	 * turn off paging
+	 */
+	MOVL	CR0,AX
+	ANDL	$~PG, AX
+	MOVL	AX,CR0
+	DELAY				/* JMP .+2 */
+
+	MOVL	$_stop32pg-KZERO(SB), AX
+	JMP*	AX				/* forward into the past */
+
+TEXT _stop32pg(SB), $0
+	/*
+	 * final hand-off: BX gets the word stored at multibootheader,
+	 * AX the multiboot magic, then control goes to the entry point
+	 * that pagingoff left in DX.
+	 */
+	MOVL	multibootheader-KZERO(SB), BX	/* multiboot data pointer */
+	MOVL	$0x2badb002, AX			/* multiboot magic */
+
+	JMP*	DX				/* into the loaded kernel */
+
+	/* halt loop; nothing in this routine branches here */
+_idle:
+	HLT
+	JMP	_idle
+
+/*
+ * BIOS32.
+ */
+TEXT bios32call(SB), $0
+	/*
+	 * bios32call(ci, ptr): load AX, BX, CX, DX, SI, DI from the six
+	 * longs at ci, far-call through ptr, then store the registers back
+	 * into ci.  returns 0 in AX if the call came back with carry
+	 * clear, 1 if carry was set.
+	 */
+	MOVL	ci+0(FP), BP
+	MOVL	0(BP), AX
+	MOVL	4(BP), BX
+	MOVL	8(BP), CX
+	MOVL	12(BP), DX
+	MOVL	16(BP), SI
+	MOVL	20(BP), DI
+	PUSHL	BP				/* keep ci across the call */
+
+	MOVL	12(SP), BP			/* ptr */
+	BYTE $0xFF; BYTE $0x5D; BYTE $0x00	/* CALL FAR 0(BP) */
+
+	POPL	BP
+	MOVL	DI, 20(BP)
+	MOVL	SI, 16(BP)
+	MOVL	DX, 12(BP)
+	MOVL	CX, 8(BP)
+	MOVL	BX, 4(BP)
+	MOVL	AX, 0(BP)
+
+	/* relies on the carry flag from the call surviving the moves above */
+	XORL	AX, AX
+	JCC	_bios32xxret
+	INCL	AX
+
+_bios32xxret:
+	RET
+
+TEXT cgapost2(SB), 0, $16
+	/*
+	 * cgapost2(code): show the low byte of code as two hex digits.
+	 * calls kaddr on 0xb8000 and writes the digits, each followed
+	 * by attribute byte 7, at byte offsets 3996..3999 from the
+	 * address it returns (presumably the last two character cells
+	 * of an 80x25 text screen).
+	 */
+	MOVL	$0xb8000,CX
+	MOVL	CX,(SP)
+	CALL	,kaddr+0(SB)
+	MOVL	code+0(FP),BP
+	MOVL	AX,BX				/* BX = address returned by kaddr */
+	MOVL	BP,CX
+	SARL	$4,CX
+	ANDL	$15,CX				/* high nibble of the low byte */
+	MOVBLZX	hex(SB)(CX*1),AX
+	MOVB	AX,3996(BX)
+	MOVB	$7,3997(BX)
+	MOVL	BP,DX
+	ANDL	$15,DX				/* low nibble */
+	MOVBLZX	hex(SB)(DX*1),CX
+	MOVB	CX,3998(BX)
+	MOVB	$7,3999(BX)
+	RET
+
+/*
+ * Read/write various system registers.
+ * CR4 and the 'model specific registers' should only be read/written
+ * after it has been determined the processor supports them
+ */
+TEXT ltr(SB), $0				/* TR - task register */
+	/* ltr(tptr): load the task register from the low 16 bits of tptr */
+	MOVL	tptr+0(FP), AX
+	MOVW	AX, TASK
+	RET
+
+TEXT invlpg(SB), $0
+	/* invlpg(va): execute INVLPG on the address va, passed in CX */
+	/* 486+ only */
+	MOVL	va+0(FP), CX
+	INVLPG
+	RET
+
+TEXT wbinvd(SB), $0
+	/* execute the WBINVD instruction (emitted as bytes by the macro above) */
+	WBINVD
+	RET
+
+/*
+ * stub for:
+ * time stamp counter; low-order 32 bits of 64-bit cycle counter
+ * Runs at fasthz/4 cycles per second (m->clkin>>3)
+ */
+TEXT lcycles(SB),1,$0
+	/* RDTSC leaves its result in AX/DX (lo/hi); AX is what is returned */
+	RDTSC
+	RET
+
+/*
+ * Try to determine the CPU type which requires fiddling with EFLAGS.
+ * If the Id bit can be toggled then the CPUID instruction can be used
+ * to determine CPU identity and features. First have to check if it's
+ * a 386 (Ac bit can't be set). If it's not a 386 and the Id bit can't be
+ * toggled then it's an older 486 of some kind.
+ *
+ *	cpuid(fun, regs[4]);
+ */
+TEXT cpuid(SB), $0
+	/*
+	 * cpuid(fn, regs): store AX, BX, CX, DX for function fn into
+	 * regs[0..3].  if the CPUID instruction is not available, regs[0]
+	 * is 0x300 (386) or 0x400 (486) when fn is 1 and 0 otherwise,
+	 * and regs[1..3] are 0.
+	 */
+	MOVL	$0x240000, AX
+	PUSHL	AX
+	POPFL					/* set Id|Ac */
+	PUSHFL
+	POPL	BX				/* retrieve value */
+	MOVL	$0, AX
+	PUSHL	AX
+	POPFL					/* clear Id|Ac, EFLAGS initialised */
+	PUSHFL
+	POPL	AX				/* retrieve value */
+	XORL	BX, AX				/* AX = bits that differ between the two reads */
+	TESTL	$0x040000, AX			/* Ac */
+	JZ	_cpu386				/* can't set this bit on 386 */
+	TESTL	$0x200000, AX			/* Id */
+	JZ	_cpu486				/* can't toggle this bit on some 486 */
+	MOVL	fn+0(FP), AX
+	CPUID
+	JMP	_cpuid
+_cpu486:
+	MOVL	$0x400, AX
+	JMP	_maybezapax
+_cpu386:
+	MOVL	$0x300, AX
+_maybezapax:
+	CMPL	fn+0(FP), $1			/* the made-up value is only reported for fn 1 */
+	JE	_zaprest
+	XORL	AX, AX
+_zaprest:
+	XORL	BX, BX
+	XORL	CX, CX
+	XORL	DX, DX
+_cpuid:
+	MOVL	regs+4(FP), BP
+	MOVL	AX, 0(BP)
+	MOVL	BX, 4(BP)
+	MOVL	CX, 8(BP)
+	MOVL	DX, 12(BP)
+	RET
+
+/*
+ * Floating point.
+ * Note: the encodings for the FCLEX, FINIT, FSAVE, FSTCW, FSTENV and FSTSW
+ * instructions do NOT have the WAIT prefix byte (i.e. they act like their
+ * FNxxx variations) so WAIT instructions must be explicitly placed in the
+ * code as necessary.
+ */
+#define	FPOFF(l)						 ;\
+	MOVL	CR0, AX 					 ;\
+	ANDL	$0xC, AX			/* EM, TS */	 ;\
+	CMPL	AX, $0x8					 ;\
+	JEQ 	l						 ;\
+	WAIT							 ;\
+l:								 ;\
+	MOVL	CR0, AX						 ;\
+	ANDL	$~0x4, AX			/* EM=0 */	 ;\
+	ORL	$0x28, AX			/* NE=1, TS=1 */ ;\
+	MOVL	AX, CR0
+
+#define	FPON							 ;\
+	MOVL	CR0, AX						 ;\
+	ANDL	$~0xC, AX			/* EM=0, TS=0 */ ;\
+	MOVL	AX, CR0
+	
+TEXT fpoff(SB), $0				/* disable */
+	/* FPOFF (macro above) rewrites CR0 with EM clear and NE, TS set; l1 is its local label */
+	FPOFF(l1)
+	RET
+
+TEXT fpinit(SB), $0				/* enable and init */
+	/* clear EM and TS in CR0 (FPON), reset the FPU, then load control word 0x0232 */
+	FPON
+	FINIT
+	WAIT
+	/* setfcr(FPPDBL|FPRNR|FPINVAL|FPZDIV|FPOVFL) */
+	/* note that low 6 bits are masks, not enables, on this chip */
+	PUSHW	$0x0232
+	FLDCW	0(SP)				/* load the control word just pushed */
+	POPW	AX				/* discard it again */
+	WAIT
+	RET
+
+/*
+ * Test-And-Set
+ */
+TEXT tas(SB), $0
+	/*
+	 * tas(lock): exchange 0xDEADDEAD with the long at lock and
+	 * return its previous contents in AX.
+	 */
+	MOVL	$0xDEADDEAD, AX
+	MOVL	lock+0(FP), BX
+	XCHGL	AX, (BX)			/* lock->key */
+	RET
+
+TEXT _xinc(SB), $0				/* void _xinc(long*); */
+	/* increment the long at l with a LOCK prefix */
+	MOVL	l+0(FP), AX
+	LOCK;	INCL 0(AX)
+	RET
+
+TEXT _xdec(SB), $0				/* long _xdec(long*); */
+	/*
+	 * decrement the long at l with a LOCK prefix and return the sign
+	 * of the result: -1 if negative, 0 if zero, 1 if positive.
+	 */
+	MOVL	l+0(FP), BX
+	XORL	AX, AX				/* assume the result is zero */
+	LOCK;	DECL 0(BX)
+	JLT	_xdeclt
+	JGT	_xdecgt
+	RET
+_xdecgt:
+	INCL	AX
+	RET
+_xdeclt:
+	DECL	AX
+	RET
+
+TEXT xchgw(SB), $0
+	/* xchgw(p, v): exchange the 16-bit word at p with v; the old word is left in AX */
+	MOVL	v+4(FP), AX
+	MOVL	p+0(FP), BX
+	XCHGW	AX, (BX)
+	RET
+
+TEXT cmpswap486(SB), $0
+	/*
+	 * cmpswap486(addr, old, new): locked compare-and-exchange on the
+	 * long at addr.  returns 1 if the exchange happened, 0 if not.
+	 */
+	MOVL	addr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	BYTE $0x0F; BYTE $0xB1; BYTE $0x0B	/* CMPXCHGL CX, (BX) */
+	JNZ didnt
+	MOVL	$1, AX
+	RET
+didnt:
+	XORL	AX,AX
+	RET
+
+TEXT mul64fract(SB), $0
+/*
+ * Multiply two 64-bit numbers and keep the middle 64 bits from the 128-bit result
+ * See ../port/tod.c for motivation.
+ *
+ * mul64fract(r, a, b): r points at the 64-bit result.  in the comments
+ * below a0/b0 are the words at a+4/b+12, a1/b1 the words at a+8/b+16,
+ * and r1/r2 the words at 0(CX)/4(CX).
+ */
+	MOVL	r+0(FP), CX
+	XORL	BX, BX				/* BX = 0 */
+
+	MOVL	a+8(FP), AX
+	MULL	b+16(FP)			/* a1*b1 */
+	MOVL	AX, 4(CX)			/* r2 = lo(a1*b1) */
+
+	MOVL	a+8(FP), AX
+	MULL	b+12(FP)			/* a1*b0 */
+	MOVL	AX, 0(CX)			/* r1 = lo(a1*b0) */
+	ADDL	DX, 4(CX)			/* r2 += hi(a1*b0) */
+
+	MOVL	a+4(FP), AX
+	MULL	b+16(FP)			/* a0*b1 */
+	ADDL	AX, 0(CX)			/* r1 += lo(a0*b1) */
+	ADCL	DX, 4(CX)			/* r2 += hi(a0*b1) + carry */
+
+	MOVL	a+4(FP), AX
+	MULL	b+12(FP)			/* a0*b0 */
+	ADDL	DX, 0(CX)			/* r1 += hi(a0*b0) */
+	ADCL	BX, 4(CX)			/* r2 += carry */
+	RET
+
+/*
+ *  label consists of a stack pointer and a PC
+ */
+TEXT gotolabel(SB), $0
+	/*
+	 * gotolabel(label): resume at the point saved by setlabel.
+	 * execution continues after that setlabel call, which now
+	 * appears to return 1.
+	 */
+	MOVL	label+0(FP), AX
+	MOVL	0(AX), SP			/* restore sp */
+	MOVL	4(AX), AX			/* put return pc on the stack */
+	MOVL	AX, 0(SP)
+	MOVL	$1, AX				/* return 1 */
+	RET
+
+TEXT setlabel(SB), $0
+	/*
+	 * setlabel(label): record the current stack pointer and the
+	 * caller's return pc in label.  returns 0 here; a later
+	 * gotolabel on the same label comes back with 1.
+	 */
+	MOVL	label+0(FP), AX
+	MOVL	SP, 0(AX)			/* store sp */
+	MOVL	0(SP), BX			/* store return pc */
+	MOVL	BX, 4(AX)
+	MOVL	$0, AX				/* return 0 */
+	RET
+
+/*
+ * Attempt at power saving. -rsc
+ *
+ * With interrupts off, test the global nrdy.  If it is non-zero,
+ * re-enable interrupts and return at once; if it is zero, re-enable
+ * interrupts and execute HLT before returning.
+ */
+TEXT halt(SB), $0
+	CLI					/* interrupts off */
+	CMPL	nrdy(SB), $0
+	JEQ	_nothingready
+	STI					/* interrupts on */
+	RET
+
+_nothingready:
+	STI			/* interrupts on: service before rescheduling */
+	HLT
+	RET
+
+/*
+ * Interrupt/exception handling.
+ * Each entry in the vector table calls either _strayintr or _strayintrx depending
+ * on whether an error code has been automatically pushed onto the stack
+ * (_strayintrx) or not, in which case a dummy entry must be pushed before retrieving
+ * the trap type from the vector table entry and placing it on the stack as part
+ * of the Ureg structure.
+ * The size of each entry in the vector table (6 bytes) is known in trapinit().
+ */
+/*
+ * Entry for vectors with no error code on the stack.  Reached by a CALL
+ * from vectortable, so 0(SP) is the address of the BYTE following that
+ * CALL.  Pushing AX leaves that address in the slot above the saved AX
+ * (the slot _strayintrx finds an error code in); AX is then loaded with
+ * the address for intrcommon to fetch the trap type through.
+ */
+TEXT _strayintr(SB), $0
+	PUSHL	AX			/* save AX */
+	MOVL	4(SP), AX		/* return PC from vectortable(SB) */
+	JMP	intrcommon
+
+/*
+ * Entry for vectors where an error code is already on the stack below
+ * the CALL's return address.  The XCHGL both saves AX and picks up the
+ * return address (which points at the trap-type BYTE in vectortable).
+ *
+ * intrcommon: on entry AX points at the trap-type byte and 0(SP) holds
+ * the interrupted AX.  The code saves DS/ES/FS/GS and the general
+ * registers, loads DS and ES with KDSEL, replaces the saved-AX slot with
+ * the trap type, and calls trap() with a pointer to the saved registers.
+ * There is no RET after the CALL: execution continues into the code that
+ * follows (forkret).
+ */
+TEXT _strayintrx(SB), $0
+	XCHGL	AX, (SP)		/* swap AX with vectortable CALL PC */
+intrcommon:
+	PUSHL	DS			/* save DS */
+	PUSHL	$(KDSEL)
+	POPL	DS			/* fix up DS */
+	MOVBLZX	(AX), AX		/* trap type -> AX */
+	XCHGL	AX, 4(SP)		/* exchange trap type with saved AX */
+
+	PUSHL	ES			/* save ES */
+	PUSHL	$(KDSEL)
+	POPL	ES			/* fix up ES */
+
+	PUSHL	FS			/* save the rest of the Ureg struct */
+	PUSHL	GS
+	PUSHAL
+
+	PUSHL	SP			/* Ureg* argument to trap */
+	CALL	trap(SB)
+
+/*
+ * Unwind the frame built by intrcommon, in reverse order: drop the
+ * Ureg* argument, restore the general registers and GS/FS/ES/DS, skip
+ * the trap type and error code slots, then IRETL.
+ * Also entered by falling through after the CALL to trap() above.
+ */
+TEXT forkret(SB), $0
+	POPL	AX			/* discard the Ureg* argument */
+	POPAL
+	POPL	GS
+	POPL	FS
+	POPL	ES
+	POPL	DS
+	ADDL	$8, SP			/* pop error code and trap type */
+	IRETL
+
+/*
+ * One entry per vector 0x00-0xFF.  Each entry is a CALL followed by a
+ * single BYTE holding the vector number; the CALL's return address
+ * therefore points at that byte, which is how _strayintr/_strayintrx
+ * recover the trap type.  Entries for 0x08, 0x0A-0x0E and 0x11 use
+ * _strayintrx, all others use _strayintr.
+ * Do not add or remove bytes within an entry: the entry size is relied
+ * on elsewhere (see the comment above _strayintr).
+ */
+TEXT vectortable(SB), $0
+	CALL _strayintr(SB); BYTE $0x00		/* divide error */
+	CALL _strayintr(SB); BYTE $0x01		/* debug exception */
+	CALL _strayintr(SB); BYTE $0x02		/* NMI interrupt */
+	CALL _strayintr(SB); BYTE $0x03		/* breakpoint */
+	CALL _strayintr(SB); BYTE $0x04		/* overflow */
+	CALL _strayintr(SB); BYTE $0x05		/* bound */
+	CALL _strayintr(SB); BYTE $0x06		/* invalid opcode */
+	CALL _strayintr(SB); BYTE $0x07		/* no coprocessor available */
+	CALL _strayintrx(SB); BYTE $0x08	/* double fault */
+	CALL _strayintr(SB); BYTE $0x09		/* coprocessor segment overflow */
+	CALL _strayintrx(SB); BYTE $0x0A	/* invalid TSS */
+	CALL _strayintrx(SB); BYTE $0x0B	/* segment not available */
+	CALL _strayintrx(SB); BYTE $0x0C	/* stack exception */
+	CALL _strayintrx(SB); BYTE $0x0D	/* general protection error */
+	CALL _strayintrx(SB); BYTE $0x0E	/* page fault */
+	CALL _strayintr(SB); BYTE $0x0F		/*  */
+	CALL _strayintr(SB); BYTE $0x10		/* coprocessor error */
+	CALL _strayintrx(SB); BYTE $0x11	/* alignment check */
+	CALL _strayintr(SB); BYTE $0x12		/* machine check */
+	CALL _strayintr(SB); BYTE $0x13
+	CALL _strayintr(SB); BYTE $0x14
+	CALL _strayintr(SB); BYTE $0x15
+	CALL _strayintr(SB); BYTE $0x16
+	CALL _strayintr(SB); BYTE $0x17
+	CALL _strayintr(SB); BYTE $0x18
+	CALL _strayintr(SB); BYTE $0x19
+	CALL _strayintr(SB); BYTE $0x1A
+	CALL _strayintr(SB); BYTE $0x1B
+	CALL _strayintr(SB); BYTE $0x1C
+	CALL _strayintr(SB); BYTE $0x1D
+	CALL _strayintr(SB); BYTE $0x1E
+	CALL _strayintr(SB); BYTE $0x1F
+	CALL _strayintr(SB); BYTE $0x20		/* VectorLAPIC */
+	CALL _strayintr(SB); BYTE $0x21
+	CALL _strayintr(SB); BYTE $0x22
+	CALL _strayintr(SB); BYTE $0x23
+	CALL _strayintr(SB); BYTE $0x24
+	CALL _strayintr(SB); BYTE $0x25
+	CALL _strayintr(SB); BYTE $0x26
+	CALL _strayintr(SB); BYTE $0x27
+	CALL _strayintr(SB); BYTE $0x28
+	CALL _strayintr(SB); BYTE $0x29
+	CALL _strayintr(SB); BYTE $0x2A
+	CALL _strayintr(SB); BYTE $0x2B
+	CALL _strayintr(SB); BYTE $0x2C
+	CALL _strayintr(SB); BYTE $0x2D
+	CALL _strayintr(SB); BYTE $0x2E
+	CALL _strayintr(SB); BYTE $0x2F
+	CALL _strayintr(SB); BYTE $0x30
+	CALL _strayintr(SB); BYTE $0x31
+	CALL _strayintr(SB); BYTE $0x32
+	CALL _strayintr(SB); BYTE $0x33
+	CALL _strayintr(SB); BYTE $0x34
+	CALL _strayintr(SB); BYTE $0x35
+	CALL _strayintr(SB); BYTE $0x36
+	CALL _strayintr(SB); BYTE $0x37
+	CALL _strayintr(SB); BYTE $0x38
+	CALL _strayintr(SB); BYTE $0x39
+	CALL _strayintr(SB); BYTE $0x3A
+	CALL _strayintr(SB); BYTE $0x3B
+	CALL _strayintr(SB); BYTE $0x3C
+	CALL _strayintr(SB); BYTE $0x3D
+	CALL _strayintr(SB); BYTE $0x3E
+	CALL _strayintr(SB); BYTE $0x3F
+//	CALL _syscallintr(SB); BYTE $0x40	/* VectorSYSCALL */
+	CALL _strayintr(SB); BYTE $0x40
+	CALL _strayintr(SB); BYTE $0x41
+	CALL _strayintr(SB); BYTE $0x42
+	CALL _strayintr(SB); BYTE $0x43
+	CALL _strayintr(SB); BYTE $0x44
+	CALL _strayintr(SB); BYTE $0x45
+	CALL _strayintr(SB); BYTE $0x46
+	CALL _strayintr(SB); BYTE $0x47
+	CALL _strayintr(SB); BYTE $0x48
+	CALL _strayintr(SB); BYTE $0x49
+	CALL _strayintr(SB); BYTE $0x4A
+	CALL _strayintr(SB); BYTE $0x4B
+	CALL _strayintr(SB); BYTE $0x4C
+	CALL _strayintr(SB); BYTE $0x4D
+	CALL _strayintr(SB); BYTE $0x4E
+	CALL _strayintr(SB); BYTE $0x4F
+	CALL _strayintr(SB); BYTE $0x50
+	CALL _strayintr(SB); BYTE $0x51
+	CALL _strayintr(SB); BYTE $0x52
+	CALL _strayintr(SB); BYTE $0x53
+	CALL _strayintr(SB); BYTE $0x54
+	CALL _strayintr(SB); BYTE $0x55
+	CALL _strayintr(SB); BYTE $0x56
+	CALL _strayintr(SB); BYTE $0x57
+	CALL _strayintr(SB); BYTE $0x58
+	CALL _strayintr(SB); BYTE $0x59
+	CALL _strayintr(SB); BYTE $0x5A
+	CALL _strayintr(SB); BYTE $0x5B
+	CALL _strayintr(SB); BYTE $0x5C
+	CALL _strayintr(SB); BYTE $0x5D
+	CALL _strayintr(SB); BYTE $0x5E
+	CALL _strayintr(SB); BYTE $0x5F
+	CALL _strayintr(SB); BYTE $0x60
+	CALL _strayintr(SB); BYTE $0x61
+	CALL _strayintr(SB); BYTE $0x62
+	CALL _strayintr(SB); BYTE $0x63
+	CALL _strayintr(SB); BYTE $0x64
+	CALL _strayintr(SB); BYTE $0x65
+	CALL _strayintr(SB); BYTE $0x66
+	CALL _strayintr(SB); BYTE $0x67
+	CALL _strayintr(SB); BYTE $0x68
+	CALL _strayintr(SB); BYTE $0x69
+	CALL _strayintr(SB); BYTE $0x6A
+	CALL _strayintr(SB); BYTE $0x6B
+	CALL _strayintr(SB); BYTE $0x6C
+	CALL _strayintr(SB); BYTE $0x6D
+	CALL _strayintr(SB); BYTE $0x6E
+	CALL _strayintr(SB); BYTE $0x6F
+	CALL _strayintr(SB); BYTE $0x70
+	CALL _strayintr(SB); BYTE $0x71
+	CALL _strayintr(SB); BYTE $0x72
+	CALL _strayintr(SB); BYTE $0x73
+	CALL _strayintr(SB); BYTE $0x74
+	CALL _strayintr(SB); BYTE $0x75
+	CALL _strayintr(SB); BYTE $0x76
+	CALL _strayintr(SB); BYTE $0x77
+	CALL _strayintr(SB); BYTE $0x78
+	CALL _strayintr(SB); BYTE $0x79
+	CALL _strayintr(SB); BYTE $0x7A
+	CALL _strayintr(SB); BYTE $0x7B
+	CALL _strayintr(SB); BYTE $0x7C
+	CALL _strayintr(SB); BYTE $0x7D
+	CALL _strayintr(SB); BYTE $0x7E
+	CALL _strayintr(SB); BYTE $0x7F
+	CALL _strayintr(SB); BYTE $0x80		/* Vector[A]PIC */
+	CALL _strayintr(SB); BYTE $0x81
+	CALL _strayintr(SB); BYTE $0x82
+	CALL _strayintr(SB); BYTE $0x83
+	CALL _strayintr(SB); BYTE $0x84
+	CALL _strayintr(SB); BYTE $0x85
+	CALL _strayintr(SB); BYTE $0x86
+	CALL _strayintr(SB); BYTE $0x87
+	CALL _strayintr(SB); BYTE $0x88
+	CALL _strayintr(SB); BYTE $0x89
+	CALL _strayintr(SB); BYTE $0x8A
+	CALL _strayintr(SB); BYTE $0x8B
+	CALL _strayintr(SB); BYTE $0x8C
+	CALL _strayintr(SB); BYTE $0x8D
+	CALL _strayintr(SB); BYTE $0x8E
+	CALL _strayintr(SB); BYTE $0x8F
+	CALL _strayintr(SB); BYTE $0x90
+	CALL _strayintr(SB); BYTE $0x91
+	CALL _strayintr(SB); BYTE $0x92
+	CALL _strayintr(SB); BYTE $0x93
+	CALL _strayintr(SB); BYTE $0x94
+	CALL _strayintr(SB); BYTE $0x95
+	CALL _strayintr(SB); BYTE $0x96
+	CALL _strayintr(SB); BYTE $0x97
+	CALL _strayintr(SB); BYTE $0x98
+	CALL _strayintr(SB); BYTE $0x99
+	CALL _strayintr(SB); BYTE $0x9A
+	CALL _strayintr(SB); BYTE $0x9B
+	CALL _strayintr(SB); BYTE $0x9C
+	CALL _strayintr(SB); BYTE $0x9D
+	CALL _strayintr(SB); BYTE $0x9E
+	CALL _strayintr(SB); BYTE $0x9F
+	CALL _strayintr(SB); BYTE $0xA0
+	CALL _strayintr(SB); BYTE $0xA1
+	CALL _strayintr(SB); BYTE $0xA2
+	CALL _strayintr(SB); BYTE $0xA3
+	CALL _strayintr(SB); BYTE $0xA4
+	CALL _strayintr(SB); BYTE $0xA5
+	CALL _strayintr(SB); BYTE $0xA6
+	CALL _strayintr(SB); BYTE $0xA7
+	CALL _strayintr(SB); BYTE $0xA8
+	CALL _strayintr(SB); BYTE $0xA9
+	CALL _strayintr(SB); BYTE $0xAA
+	CALL _strayintr(SB); BYTE $0xAB
+	CALL _strayintr(SB); BYTE $0xAC
+	CALL _strayintr(SB); BYTE $0xAD
+	CALL _strayintr(SB); BYTE $0xAE
+	CALL _strayintr(SB); BYTE $0xAF
+	CALL _strayintr(SB); BYTE $0xB0
+	CALL _strayintr(SB); BYTE $0xB1
+	CALL _strayintr(SB); BYTE $0xB2
+	CALL _strayintr(SB); BYTE $0xB3
+	CALL _strayintr(SB); BYTE $0xB4
+	CALL _strayintr(SB); BYTE $0xB5
+	CALL _strayintr(SB); BYTE $0xB6
+	CALL _strayintr(SB); BYTE $0xB7
+	CALL _strayintr(SB); BYTE $0xB8
+	CALL _strayintr(SB); BYTE $0xB9
+	CALL _strayintr(SB); BYTE $0xBA
+	CALL _strayintr(SB); BYTE $0xBB
+	CALL _strayintr(SB); BYTE $0xBC
+	CALL _strayintr(SB); BYTE $0xBD
+	CALL _strayintr(SB); BYTE $0xBE
+	CALL _strayintr(SB); BYTE $0xBF
+	CALL _strayintr(SB); BYTE $0xC0
+	CALL _strayintr(SB); BYTE $0xC1
+	CALL _strayintr(SB); BYTE $0xC2
+	CALL _strayintr(SB); BYTE $0xC3
+	CALL _strayintr(SB); BYTE $0xC4
+	CALL _strayintr(SB); BYTE $0xC5
+	CALL _strayintr(SB); BYTE $0xC6
+	CALL _strayintr(SB); BYTE $0xC7
+	CALL _strayintr(SB); BYTE $0xC8
+	CALL _strayintr(SB); BYTE $0xC9
+	CALL _strayintr(SB); BYTE $0xCA
+	CALL _strayintr(SB); BYTE $0xCB
+	CALL _strayintr(SB); BYTE $0xCC
+	CALL _strayintr(SB); BYTE $0xCD
+	CALL _strayintr(SB); BYTE $0xCE
+	CALL _strayintr(SB); BYTE $0xCF
+	CALL _strayintr(SB); BYTE $0xD0
+	CALL _strayintr(SB); BYTE $0xD1
+	CALL _strayintr(SB); BYTE $0xD2
+	CALL _strayintr(SB); BYTE $0xD3
+	CALL _strayintr(SB); BYTE $0xD4
+	CALL _strayintr(SB); BYTE $0xD5
+	CALL _strayintr(SB); BYTE $0xD6
+	CALL _strayintr(SB); BYTE $0xD7
+	CALL _strayintr(SB); BYTE $0xD8
+	CALL _strayintr(SB); BYTE $0xD9
+	CALL _strayintr(SB); BYTE $0xDA
+	CALL _strayintr(SB); BYTE $0xDB
+	CALL _strayintr(SB); BYTE $0xDC
+	CALL _strayintr(SB); BYTE $0xDD
+	CALL _strayintr(SB); BYTE $0xDE
+	CALL _strayintr(SB); BYTE $0xDF
+	CALL _strayintr(SB); BYTE $0xE0
+	CALL _strayintr(SB); BYTE $0xE1
+	CALL _strayintr(SB); BYTE $0xE2
+	CALL _strayintr(SB); BYTE $0xE3
+	CALL _strayintr(SB); BYTE $0xE4
+	CALL _strayintr(SB); BYTE $0xE5
+	CALL _strayintr(SB); BYTE $0xE6
+	CALL _strayintr(SB); BYTE $0xE7
+	CALL _strayintr(SB); BYTE $0xE8
+	CALL _strayintr(SB); BYTE $0xE9
+	CALL _strayintr(SB); BYTE $0xEA
+	CALL _strayintr(SB); BYTE $0xEB
+	CALL _strayintr(SB); BYTE $0xEC
+	CALL _strayintr(SB); BYTE $0xED
+	CALL _strayintr(SB); BYTE $0xEE
+	CALL _strayintr(SB); BYTE $0xEF
+	CALL _strayintr(SB); BYTE $0xF0
+	CALL _strayintr(SB); BYTE $0xF1
+	CALL _strayintr(SB); BYTE $0xF2
+	CALL _strayintr(SB); BYTE $0xF3
+	CALL _strayintr(SB); BYTE $0xF4
+	CALL _strayintr(SB); BYTE $0xF5
+	CALL _strayintr(SB); BYTE $0xF6
+	CALL _strayintr(SB); BYTE $0xF7
+	CALL _strayintr(SB); BYTE $0xF8
+	CALL _strayintr(SB); BYTE $0xF9
+	CALL _strayintr(SB); BYTE $0xFA
+	CALL _strayintr(SB); BYTE $0xFB
+	CALL _strayintr(SB); BYTE $0xFC
+	CALL _strayintr(SB); BYTE $0xFD
+	CALL _strayintr(SB); BYTE $0xFE
+	CALL _strayintr(SB); BYTE $0xFF

+ 248 - 0
sys/src/9/pcboot/l16r.s

@@ -0,0 +1,248 @@
+/*
+ * Protected-mode bootstrap, to be jumped to by a Primary Bootstrap Sector.
+ * Load with -H3 -R4 -T0xNNNNNNNN to get a binary image with no header.
+ * Note that the processor is in 'real' mode on entry, so care must be taken
+ * as the assembler assumes protected mode, hence the sometimes weird-looking
+ * code to assure 16-bit instructions are issued.
+ */
+#include "mem.h"
+#include "/sys/src/boot/pc/x16.h"
+
+#undef BIOSCALL		/* we don't know what evil the bios gets up to */
+#define BIOSCALL(b)	INT $(b); CLI
+
+#define CMPL(r0, r1)	BYTE $0x66; CMP(r0, r1)
+#define LLI(i, rX)	BYTE $0x66;		/* i -> rX */		\
+			BYTE $(0xB8+rX);				\
+			LONG $i;
+
+/*
+ * Start:
+ *	disable interrupts;
+ *	set all segments;
+ *	create temporary stack.
+ */
+/*
+ * Real-mode entry point.  Only interrupts are disabled here; the code
+ * then branches over the multiboot header and the small data areas that
+ * follow it, and the rest of the start-up (segments, stack) continues at
+ * pastmboothdr.
+ */
+TEXT _start16r(SB), $0
+	CLI				/* interrupts off */
+
+	/* make the jump conditional to keep 8l from moving _multibootheader */
+	/* NOTE(review): 1 - 1 leaves zero, so the JEQ looks always-taken; confirm LWI/SUBI in x16.h */
+	LWI(1, rAX)
+	SUBI(1, rAX)
+	JEQ	pastmboothdr
+
+/*
+ * Must be 4-byte aligned & within 8K of the image's start.
+ */
+	/* the two NOPs are padding in front of the included header */
+	NOP
+	NOP
+#include "mboot.s"
+
+/*
+ * Greeting string "\r\nPlan 9 from Bell Labs", terminated by a zero
+ * byte ('\z').  _cgaputs below prints it one character at a time and
+ * stops at the terminator.
+ */
+TEXT _hello(SB), $0
+	BYTE $'\r';
+	BYTE $'\n';
+	BYTE $'P'; BYTE $'l'; BYTE $'a'; BYTE $'n';
+	BYTE $' '; BYTE $'9'; BYTE $' '; BYTE $'f';
+	BYTE $'r'; BYTE $'o'; BYTE $'m'; BYTE $' ';
+	BYTE $'B'; BYTE $'e'; BYTE $'l'; BYTE $'l';
+	BYTE $' '; BYTE $'L'; BYTE $'a'; BYTE $'b';
+	BYTE $'s'; 
+	BYTE $'\z';
+
+/*
+ * Four bytes of scratch storage in the text segment, used below to keep
+ * a copy of DI (the write position in the BIOS tables) across BIOS calls.
+ */
+TEXT _DI(SB), $0
+	BYTE $0; BYTE $0; BYTE $0; BYTE $0;
+
+/*
+ * Four bytes of scratch storage in the text segment, used below to keep
+ * a copy of the ES value that addresses the BIOS tables.
+ */
+TEXT _ES(SB), $0
+	BYTE $0; BYTE $0; BYTE $0; BYTE $0;
+
+	/* continued from before _multibootheader */
+	/*
+	 * Make DS match CS so the data embedded in this code can be
+	 * addressed, put the stack at PXEBASE in segment 0, then ask the
+	 * BIOS (INT 15, AX=2401) to enable the A20 line.
+	 */
+pastmboothdr:
+	MFSR(rCS, rAX)
+	MTSR(rAX, rDS)			/* set the data segment */
+
+	LWI(0, rAX)			/* always put stack in first 64k */
+	MTSR(rAX, rSS)
+	LWI(PXEBASE, rSP)		/* set the stack */
+
+	LWI(0x2401, rAX)		/* enable a20 line */
+	BIOSCALL(0x15)
+
+/*
+ * Do any real-mode BIOS calls before going to protected mode.
+ * Data will be stored at BIOSTABLES, not ROUND(end, BY2PG) as before.
+ */
+
+/*
+ * Check for CGA mode.
+ * Query the current video mode with INT 10, AH=0F; if the low seven bits
+ * of the reply are not 3, switch to mode 3.  Then print _hello through
+ * INT 10, AH=0E, one character per call, until the zero terminator.
+ */
+_cgastart:
+	LWI(0x0F00, rAX)		/* get current video mode in AL */
+	BIOSCALL(0x10)
+	ANDI(0x007F, rAX)
+	SUBI(0x0003, rAX)		/* is it mode 3? */
+	JEQ	_cgaputs
+
+	LWI(0x0003, rAX)		/* turn on text mode 3 */
+	BIOSCALL(0x10)
+
+_cgaputs:				/* output a cheery wee message */
+	LWI(_hello(SB), rSI)
+	CLR(rBX)
+_cgaputsloop:
+	LODSB				/* next character of _hello -> AL */
+	ORB(rAL, rAL)
+	JEQ	_cgaend			/* zero byte: end of string */
+
+	LBI(0x0E,rAH)
+	BIOSCALL(0x10)
+	JMP	_cgaputsloop
+_cgaend:
+
+/*
+ * Reset floppy disc system.
+ * If successful, make sure the motor is off.
+ *
+ * Afterwards point ES:DI at the BIOS tables: ES is BIOSTABLES rounded
+ * down to a page boundary and shifted right by 4 to form a real-mode
+ * segment, DI is 0.  Both are also saved in _ES/_DI so they can be
+ * reloaded after BIOS calls.
+ */
+_floppystart:
+	CLR(rAX)
+	CLR(rDX)
+	BIOSCALL(0x13)
+	ORB(rAL, rAL)
+	JNE	_floppyend
+	OUTPORTB(0x3F2, 0x00)		/* turn off motor */
+_floppyend:
+
+	LLI(BIOSTABLES, rAX)	/* tables in low memory, not after end */
+	OPSIZE; ANDL $~(BY2PG-1), AX
+	OPSIZE; SHRL $4, AX
+	SW(rAX, _ES(SB))
+	CLR(rDI)
+	SW(rDI, _DI(SB))
+
+	MTSR(rAX, rES)
+	
+/*
+ * Check for APM1.2 BIOS support.
+ *
+ * Issue INT 15 with AX=5304 and then AX=5303.  If the second call
+ * returns with carry set, only ES and DI are reloaded from their saved
+ * copies and nothing is written.  Otherwise the registers the BIOS
+ * returned are pushed (16 bytes in all), ES:DI is reloaded, the tag
+ * "APM\0" is stored at ES:DI and the 8 saved words are popped and
+ * stored after it.
+ */
+_apmstart:
+	LWI(0x5304, rAX)		/* disconnect anyone else */
+	CLR(rBX)
+	BIOSCALL(0x15)
+
+	LWI(0x5303, rAX)		/* connect */
+	CLR(rBX)
+	CLC
+	BIOSCALL(0x15)
+
+	JCC	_apmpush
+	/* carry set: restore ES:DI and skip the APM record */
+	LW(_ES(SB), rAX)
+	MTSR(rAX, rES)
+	LW(_DI(SB), rDI)
+	JCS	_apmend
+
+_apmpush:
+	OPSIZE; PUSHR(rSI)		/* save returned APM data on stack */
+	OPSIZE; PUSHR(rBX)
+	PUSHR(rDI)
+	PUSHR(rDX)
+	PUSHR(rCX)
+	PUSHR(rAX)
+
+	LW(_ES(SB), rAX)
+	MTSR(rAX, rES)
+	LW(_DI(SB), rDI)
+
+	LWI(0x5041, rAX)		/* first 4 bytes are APM\0 */
+	STOSW
+	LWI(0x004D, rAX)
+	STOSW
+
+	LWI(8, rCX)			/* pop the saved APM data */
+_apmpop:
+	POPR(rAX)
+	STOSW
+	LOOP	_apmpop
+_apmend:
+
+/*
+ * Try to retrieve the 0xE820 memory map.
+ * This is weird because some BIOS do not seem to preserve
+ * ES/DI on failure. Consequently they may not be valid
+ * at _e820end:.
+ *
+ * Each pass writes a 4-byte zero terminator at ES:DI, then calls
+ * INT 15, EAX=E820 asking for a 20-byte entry.  On success the
+ * terminator is overwritten with the tag "MAP\0", DI is advanced past
+ * the 20 bytes, the new DI is saved in _DI and a fresh terminator is
+ * written.  The loop ends on carry, on a reply that is not "SMAP"/20
+ * bytes, when BX comes back zero, or once the pass count kept on the
+ * stack exceeds 32.  On exit DI is reloaded from _DI and ES is set to 0.
+ */
+
+_e820start:
+	SW(rDI, _DI(SB))		/* save DI */
+	CLR(rAX)			/* write terminator */
+	STOSW
+	STOSW
+
+	CLR(rBX)
+	PUSHR(rBX)			/* keep loop count on stack */
+					/* BX is the continuation value */
+_e820loop:
+	POPR(rAX)
+	INC(rAX)
+	PUSHR(rAX)			/* doesn't affect FLAGS */
+	CMPI(32, rAX)			/* mmap[32+1] in C code */
+	JGT	_e820pop
+
+	LLI(20, rCX)			/* buffer size */
+	LLI(0x534D4150, rDX)		/* signature - ASCII "SMAP" */
+	LLI(0x0000E820, rAX)		/* function code */
+
+	BIOSCALL(0x15)
+
+	JCS	_e820pop		/* some kind of error */
+	LLI(0x534D4150, rDX)
+	CMPL(rDX, rAX)			/* verify correct BIOS version */
+	JNE	_e820pop
+	LLI(20, rDX)
+	CMPL(rDX, rCX)			/* verify correct count */
+	JNE	_e820pop
+
+	SUBI(4, rDI)			/* overwrite terminator */
+	LWI(0x414D, rAX)		/* first 4 bytes are "MAP\0" */
+	STOSW
+	LWI(0x0050, rAX)
+	STOSW
+
+	ADDI(20, rDI)			/* bump to next entry */
+
+	SW(rDI, _DI(SB))		/* save DI */
+	CLR(rAX)			/* write terminator */
+	STOSW
+	STOSW
+
+	OR(rBX, rBX)			/* zero if last entry */
+	JNE	_e820loop
+
+_e820pop:
+	POPR(rAX)			/* loop count */
+	LW(_DI(SB), rDI)
+	CLR(rAX)
+	MTSR(rAX, rES)
+_e820end:
+
+/*
+ * Done with BIOS calls for now.  realmode calls may use the BIOS later.
+ */
+
+/*
+ * Load a basic GDT to map 4GB, turn on the protected mode bit in CR0,
+ * set all the segments to point to the new GDT then jump to the 32-bit code.
+ *
+ * The data segment registers all get GDT selector 1; the far jump uses
+ * GDT selector 2 and the physical (KZERO-relative) address of _start32p.
+ * _gdtptr16r is not defined in this file.
+ */
+_real:
+	LGDT(_gdtptr16r(SB))		/* load a basic gdt */
+
+	MFCR(rCR0, rAX)
+	ORI(1, rAX)
+	MTCR(rAX, rCR0)			/* turn on protected mode */
+	DELAY				/* JMP .+2 */
+
+	LWI(SELECTOR(1, SELGDT, 0), rAX)/* set all segments */
+	MTSR(rAX, rDS)
+	MTSR(rAX, rES)
+	MTSR(rAX, rFS)
+	MTSR(rAX, rGS)
+	MTSR(rAX, rSS)
+
+	FARJUMP32(SELECTOR(2, SELGDT, 0), _start32p-KZERO(SB))

+ 212 - 0
sys/src/9/pcboot/l32p.s

@@ -0,0 +1,212 @@
+#include "mem.h"
+
+#define KB		1024
+#define MB		(1024*1024)
+
+/*
+ * Some machine instructions not handled by 8[al].
+ */
+#define	DELAY		BYTE $0xEB; BYTE $0x00	/* JMP .+2 */
+#define FARJUMP(s, o)	BYTE $0xEA;		/* far jump to ptr32:16 */\
+			LONG $o; WORD $s
+
+#define NOP		BYTE $0x90
+#define HLT		BYTE $0xF4
+
+/*
+ * Macro for calculating offset within the page directory base.
+ * Note that this is assembler-specific hence the '<<2'.
+ */
+#define PDO(a)		(((((a))>>22) & 0x03FF)<<2)
+
+/*
+ * May enter here either from the 16-bit real-mode startup or
+ * from other 32-bit protected mode code. For the latter case must
+ * make sure the GDT is set as it would be by the 16-bit code:
+ *	disable interrupts;
+ *	load the GDT with the table in _gdt32p;
+ *	load all the data segments
+ *	load the code segment via a far jump.
+ */
+/*
+ * 32-bit entry point: a hand-encoded short jump over the GDT data that
+ * follows.  The displacement is spelled out as the sizes of what is
+ * skipped: 2 NOP bytes, 3 descriptors of two 4-byte LONGs each, and the
+ * WORD + LONG of _gdtptr32p.
+ */
+TEXT _start32p(SB), $0
+/* if distance to _start32p32 changes, update the offset after 0xEB. */
+//	jmp	.+32 (_start32p32).
+	BYTE $0xEB; BYTE $(2+3*2*4+2+4)
+	NOP; NOP
+
+/*
+ * Three-entry GDT placed in the text: an all-zero entry, then an entry
+ * flagged SEGDATA|SEGW and an entry flagged SEGEXEC|SEGR, both with
+ * limit bits 0xFFFF / 0xF and SEGG set.
+ */
+TEXT _gdt32p(SB), $0
+	LONG	$0x0000; LONG $0
+	LONG	$0xFFFF; LONG $(SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(0)|SEGDATA|SEGW)
+	LONG	$0xFFFF; LONG $(SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR)
+
+/*
+ * GDT pointer operand: a 16-bit size (3 entries of 8 bytes) followed by
+ * the KZERO-relative address of _gdt32p.
+ *
+ * _start32p32 is the target of the short jump in _start32p.  It stores
+ * the incoming AX and BX at offsets 48 and 52 from _multibootheader,
+ * loads GDTR from _gdtptr32p, loads every data segment register with
+ * KDSEL and far-jumps through KESEL to _start32pg.
+ */
+TEXT _gdtptr32p(SB), $0
+	WORD	$(3*8)
+	LONG	$_gdt32p-KZERO(SB)
+
+_start32p32:
+	CLI
+
+	/* keep the values passed in AX and BX before they are reused */
+	MOVL	AX, _multibootheader+(48-KZERO)(SB)
+	MOVL	BX, _multibootheader+(52-KZERO)(SB)
+
+	MOVL	$_gdtptr32p-KZERO(SB), AX
+	MOVL	(AX), GDTR
+
+	MOVL	$KDSEL, AX
+	MOVW	AX, DS
+	MOVW	AX, ES
+	MOVW	AX, FS
+	MOVW	AX, GS
+	MOVW	AX, SS
+
+	DELAY
+	FARJUMP(KESEL, _start32pg-KZERO(SB))
+
+/*
+ * Make the basic page tables for processor 0. Eight pages are needed for
+ * the basic set:
+ * a page directory, 5 pages of page tables for mapping the first 20MB of
+ * physical memory, a single physical and virtual page for the Mach structure,
+ * and a page to be used later for the GDT.
+ *
+ * The remaining PTEs will be allocated later when memory is sized.
+ * Could clear BSS here too when clearing the space for the tables,
+ * but that would violate the separation of church and state.
+ * The first aligned page after end[] was used for passing BIOS parameters
+ * by the real-mode startup, now it's BIOSTABLES.
+ */
+/*
+ * Register use in this routine:
+ *	DX - physical address of the page directory, found by rounding
+ *	     end up to a page boundary and skipping one further page;
+ *	     later reused to walk the page-table pages;
+ *	DI - copy of the page directory address kept for later steps;
+ *	BX - PTE permission bits, later the running PTE value;
+ *	CX - repeat counts, then the page directory address for CR3.
+ */
+TEXT _start32pg(SB), $0			/* CHECK alignment (16) */
+	DELAY
+
+	MOVL	$end-KZERO(SB), DX	/* clear pages for the tables etc. */
+	/* start must be page aligned, skip boot params page */
+	ADDL	$(2*BY2PG-1), DX
+	ANDL	$~(BY2PG-1), DX
+
+	/*
+	 * zero mach page & gdt in low memory
+	 */
+	MOVL	$(CPU0MACH-KZERO), DI
+	XORL	AX, AX
+	MOVL	$(2*BY2PG), CX		/* mach (phys & virt) & gdt */
+	SHRL	$2, CX
+	CLD
+	REP;	STOSL			/* zero mach & gdt pages */
+
+	/*
+	 * zero pdb and pte for low memory
+	 */
+	MOVL	DX, DI			/* first page after end & boot params */
+	XORL	AX, AX
+	MOVL	$((1+LOWPTEPAGES)*BY2PG), CX /* pdb, n pte */
+	SHRL	$2, CX
+	CLD
+	REP;	STOSL			/* zero pdb & pte pages */
+
+	/*
+	 * populate pdb for low memory (20MB)
+	 */
+	MOVL	DX, AX			/* bootstrap processor PDB (page 0) */
+	MOVL	DX, DI			/* save it for later */
+	MOVL	$(PTEWRITE|PTEVALID), BX/* page permissions */
+
+	ADDL	$BY2PG, DX		/* -> PA of first page of page table (page 1)  */
+	ADDL	$PDO(KZERO), AX		/* page directory offset for KZERO */
+	MOVL	DX, (AX)		/* PTE's for KZERO */
+	ORL	BX, (AX)
+
+	/* should be LOWPTEPAGES-1 repetitions of this fragment */
+	ADDL	$BY2PG, DX		/* -> PA of second page of page table (page 2)  */
+	ADDL	$4, AX			/* page dir. offset for KZERO+4MB */
+	MOVL	DX, (AX)		/* PTE's for KZERO */
+	ORL	BX, (AX)
+
+	ADDL	$BY2PG, DX		/* -> PA of third page of page table (page 3)  */
+	ADDL	$4, AX			/* page dir. offset for KZERO+8MB */
+	MOVL	DX, (AX)		/* PTE's for KZERO */
+	ORL	BX, (AX)
+
+	ADDL	$BY2PG, DX		/* -> PA of fourth page of page table (page 4)  */
+	ADDL	$4, AX			/* page dir. offset for KZERO+12MB */
+	MOVL	DX, (AX)		/* PTE's for KZERO */
+	ORL	BX, (AX)
+
+	ADDL	$BY2PG, DX		/* -> PA of fifth page of page table (page 5)  */
+	ADDL	$4, AX			/* page dir. offset for KZERO+16MB */
+	MOVL	DX, (AX)		/* PTE's for KZERO */
+	ORL	BX, (AX)
+
+	/*
+	 * populate page tables for low memory
+	 */
+	/* BX starts as the permission bits, i.e. a PTE for physical page 0 */
+	MOVL	DI, AX
+	ADDL	$BY2PG, AX		/* PA of first page of page table */
+	MOVL	$(MemMin/BY2PG), CX
+_setpte:
+	MOVL	BX, (AX)
+	ADDL	$(1<<PGSHIFT), BX
+	ADDL	$4, AX
+	LOOP	_setpte
+
+	/*
+	 * map the Mach page
+	 */
+	MOVL	DI, AX
+	ADDL	$BY2PG, AX		/* PA of first page of page table */
+	MOVL	$(CPU0MACH-KZERO), DX	/* -> PA of Mach structure */
+	MOVL	$CPU0MACH, BX		/* VA of Mach structure */
+	SHRL	$10, BX			/* create offset into PTEs */
+	ANDL	$(0x3FF<<2), BX
+
+	ADDL	BX, AX			/* PTE offset for Mach structure */
+	MOVL	DX, (AX)		/* create PTE for Mach */
+	MOVL	$(PTEWRITE|PTEVALID), BX/* page permissions */
+	ORL	BX, (AX)
+
+/*
+ * Now ready to use the new map. Initialise CR0 (assume the BIOS gets
+ * it mostly correct for this processor type w.r.t. caches and FPU).
+ * It is necessary on some processors to follow mode switching
+ * with a JMP instruction to clear the prefetch queues.
+ * The instruction to switch modes and the following jump instruction
+ * should be identity-mapped; to this end double map KZERO at
+ * virtual 0 and undo the mapping once virtual nirvana has been attained.
+ * If paging is already on, don't load CR3 before setting CR0, in which
+ * case most of this is a NOP and the 2nd load of CR3 actually does
+ * the business.
+ */
+	MOVL	DI, CX			/* load address of PDB */
+	/* double-map first 20 MB, since we are running at 7 or 9 MB */
+	/* should be LOWPTEPAGES repetitions */
+	MOVL	PDO(KZERO)(CX), DX	/* double-map KZERO at 0 */
+	MOVL	DX, PDO(0)(CX)
+	MOVL	PDO(KZERO+4*MB)(CX), DX
+	MOVL	DX, PDO(4*MB)(CX)
+	MOVL	PDO(KZERO+8*MB)(CX), DX
+	MOVL	DX, PDO(8*MB)(CX)
+	MOVL	PDO(KZERO+12*MB)(CX), DX
+	MOVL	DX, PDO(12*MB)(CX)
+	MOVL	PDO(KZERO+16*MB)(CX), DX
+	MOVL	DX, PDO(16*MB)(CX)
+
+	MOVL	CR0, DX
+	MOVL	DX, AX
+	ANDL	$PG, AX			/* check paging not already on */
+	JNE	_nocr3load
+	MOVL	CX, CR3			/* paging off, safe to load cr3 */
+_nocr3load:
+	ORL	$(PG|0x10000), DX	/* PG|WP */
+	ANDL	$~0x6000000A, DX	/* ~(CD|NW|TS|MP) */
+
+	/* if _startpg's address has none of KZERO's bits set, skip enabling paging here */
+	MOVL	$_startpg(SB), AX
+	TESTL	$KZERO, AX		/* want to run protected or virtual? */
+	JEQ	_to32v			/* protected */
+	MOVL	DX, CR0			/* turn on paging */
+	JMP*	AX			/* headfirst into the new world */
+
+/*
+ * Reached through the indirect jump at the end of _start32pg once CR0
+ * has been written.  CX still holds the page directory address; it is
+ * loaded into CR3, and control passes to _start32v.  The double mapping
+ * at virtual 0 is left in place (the undo below is commented out).
+ * _to32v is also jumped to directly by _start32pg.
+ */
+TEXT _startpg(SB), $0
+//	MOVL	$0, PDO(0)(CX)		/* undo double-map of KZERO at 0 */
+	MOVL	CX, CR3			/* load and flush the mmu */
+_to32v:
+	MOVL	$_start32v(SB), AX
+	JMP*	AX			/* into the dorkness */

+ 549 - 0
sys/src/9/pcboot/l32v.s

@@ -0,0 +1,549 @@
+#include "/sys/src/boot/pc/x16.h"
+#include "mem.h"
+
+#define KB		1024
+#define MB		(1024*1024)
+
+#define WRMSR		BYTE $0x0F; BYTE $0x30	/* WRMSR, argument in AX/DX (lo/hi) */
+#define RDTSC 		BYTE $0x0F; BYTE $0x31	/* RDTSC, result in AX/DX (lo/hi) */
+#define RDMSR		BYTE $0x0F; BYTE $0x32	/* RDMSR, result in AX/DX (lo/hi) */
+#define CPUID		BYTE $0x0F; BYTE $0xA2	/* CPUID, argument in AX */
+
+/*
+ * First code run at the final addresses: clear BSS, record where the
+ * page directory, free memory, Mach structure and GDT live, switch to a
+ * stack at the top of the Mach area, probe which of the Id and Ac bits
+ * in EFLAGS can be toggled, and call main.
+ */
+TEXT _start32v(SB),$0
+	CLI
+
+	MOVL	$edata(SB), DI
+	XORL	AX, AX
+	MOVL	$end(SB), CX
+	SUBL	DI, CX			/* end-edata bytes */
+	SHRL	$2, CX			/* end-edata doublewords */
+
+	CLD
+	REP; STOSL			/* clear BSS */
+
+	MOVL	CR3, AX
+	/* 1+LOWPTEPAGES zeroed pages at (AX): pdb, pte */
+	ADDL	$KZERO, AX		/* VA of PDB */
+	MOVL	AX, mach0pdb(SB)
+	ADDL	$((1+LOWPTEPAGES)*BY2PG), AX /* skip pdb & n pte */
+	MOVL	AX, memstart(SB)	/* start of available memory */
+
+	/* 2 zeroed pages at CPU0MACH: mach, gdt */
+	MOVL	$CPU0MACH, AX
+	MOVL	AX, mach0m(SB)		/* ... VA of Mach */
+	MOVL	AX, m(SB)		/* initialise global Mach pointer */
+	MOVL	$0, 0(AX)		/* initialise m->machno */
+	ADDL	$MACHSIZE, AX
+	MOVL	AX, SP			/* switch to new stack in Mach */
+	MOVL	$CPU0GDT, mach0gdt(SB)
+
+	MOVL	$0x240000, AX		/* try to set Id|Ac in EFLAGS */
+	PUSHL	AX
+	POPFL
+
+	PUSHFL				/* retrieve EFLAGS -> BX */
+	POPL	BX
+
+	MOVL	$0, AX			/* clear Id|Ac, EFLAGS initialised */
+	PUSHL	AX
+	POPFL
+
+	PUSHFL				/* push EFLAGS again; the value stays on the stack */
+
+	/* NOTE(review): the XOR result is left at 0(SP) for the CALL, presumably as main's argument; confirm against main.c */
+	XORL	BX, (SP)		/* togglable bits */
+	CALL	main(SB)
+
+/*
+ * Park a processor. Should never fall through a return from main to here,
+ * should only be called by application processors when shutting down.
+ *
+ * Loops forever: enable interrupts, HLT, repeat.  The label _idle is
+ * also used as a jump target by the warp routines below.
+ */
+TEXT idle(SB), $0
+_idle:
+	STI
+	HLT
+	JMP	_idle
+
+/*
+ * Enable interrupts, execute a single HLT, and return to the caller
+ * once execution resumes.
+ */
+TEXT hlt(SB), $0
+	STI
+	HLT
+	RET
+
+/*
+ * Jump to the address given as the first argument without changing the
+ * paging state: BX is loaded from multibootheader and AX with the
+ * multiboot magic number, interrupts are disabled, and control leaves
+ * through CX.  The trailing JMP _idle is not reached by that path.
+ */
+TEXT _warp9o(SB), $0
+	MOVL	entry+0(FP), CX
+	MOVL	multibootheader-KZERO(SB), BX	/* multiboot data pointer */
+	MOVL	$0x2badb002, AX			/* multiboot magic */
+
+	CLI
+	JMP*	CX
+
+	JMP	_idle
+
+/*
+ * Macro for calculating offset within the page directory base.
+ * Note that this is assembler-specific hence the '<<2'.
+ */
+#define PDO(a)		(((((a))>>22) & 0x03FF)<<2)
+
+/*
+ * Start a new kernel with paging turned off.  First of three stages:
+ * keep the entry address in BP, copy the page directory entry for KZERO
+ * into the entry for virtual 0, reload CR3 to flush the MMU, and jump
+ * to _start32id at its KZERO-relative (identity-mapped) address.
+ */
+TEXT _warp9(SB), $0
+	CLI
+	MOVL	entry+0(FP), BP
+
+	MOVL	CR3, CX				/* load address of PDB */
+	ADDL	$KZERO, CX
+	MOVL	PDO(KZERO)(CX), DX		/* double-map KZERO at 0 */
+	MOVL	DX, PDO(0)(CX)
+
+	MOVL	CR3, CX
+	MOVL	CX, CR3				/* load and flush the mmu */
+
+	MOVL	$_start32id<>-KZERO(SB), AX
+	JMP*	AX				/* jump to identity-map */
+
+/*
+ * Second stage of _warp9, running at identity-mapped addresses: clear
+ * the PG bit in CR0 and jump to _stop32pg at its KZERO-relative address.
+ * The target is loaded into AX before CR0 is written.
+ */
+TEXT _start32id<>(SB), $0
+	MOVL	CR0, DX				/* turn off paging */
+	ANDL	$~PG, DX
+
+	MOVL	$_stop32pg<>-KZERO(SB), AX
+	MOVL	DX, CR0
+	JMP*	AX				/* forward into the past */
+
+/*
+ * Final stage of _warp9, after paging has been turned off: load BX from
+ * multibootheader and AX with the multiboot magic number, then jump to
+ * the entry address kept in BP.  The trailing JMP _idle is not reached
+ * by that path.
+ */
+TEXT _stop32pg<>(SB), $0
+	MOVL	multibootheader-KZERO(SB), BX	/* multiboot data pointer */
+	MOVL	$0x2badb002, AX			/* multiboot magic */
+
+	JMP*	BP
+
+	JMP	_idle
+
+/*
+ *  read one byte from the I/O port given as the first argument;
+ *  AX is cleared beforehand, so the rest of the return value is zero
+ */
+TEXT	inb(SB),$0
+	XORL	AX,AX			/* clear the whole return register */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	INB
+	RET
+
+/*
+ * read a short from the I/O port given as the first argument;
+ * the OPSIZE prefix narrows INL and AX is cleared beforehand
+ */
+TEXT	ins(SB), $0
+	XORL	AX, AX			/* clear the whole return register */
+	MOVL	p+0(FP), DX		/* DX = port number */
+	OPSIZE; INL
+	RET
+
+/*
+ * read a long from the I/O port given as the first argument
+ */
+TEXT	inl(SB), $0
+	XORL	AX, AX			/* start from a zero return value */
+	MOVL	p+0(FP), DX		/* DX = port number */
+	INL
+	RET
+
+/*
+ *  write the byte b to I/O port p
+ */
+TEXT	outb(SB),$0
+	MOVL	b+4(FP),AX		/* AX = value; OUTB sends its low byte */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	OUTB
+	RET
+
+/*
+ * write the short s to I/O port p; the OPSIZE prefix narrows OUTL
+ */
+TEXT	outs(SB), $0
+	MOVL	s+4(FP), AX		/* AX = value */
+	MOVL	p+0(FP), DX		/* DX = port number */
+	OPSIZE; OUTL
+	RET
+
+/*
+ * write the long s to I/O port p
+ */
+TEXT	outl(SB), $0
+	MOVL	s+4(FP), AX		/* AX = value */
+	MOVL	p+0(FP), DX		/* DX = port number */
+	OUTL
+	RET
+
+/*
+ *  read c bytes from I/O port p into the buffer at a
+ */
+TEXT	insb(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of bytes */
+	MOVL	a+4(FP),DI		/* DI = destination buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; INSB
+	RET
+
+/*
+ *  read c shorts from I/O port p into the buffer at a
+ */
+TEXT	inss(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of shorts */
+	MOVL	a+4(FP),DI		/* DI = destination buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; OPSIZE; INSL
+	RET
+
+/*
+ *  write c bytes from the buffer at a to I/O port p
+ */
+TEXT	outsb(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of bytes */
+	MOVL	a+4(FP),SI		/* SI = source buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; OUTSB
+	RET
+
+/*
+ *  write c shorts from the buffer at a to I/O port p
+ */
+TEXT	outss(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of shorts */
+	MOVL	a+4(FP),SI		/* SI = source buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; OPSIZE; OUTSL
+	RET
+
+/*
+ *  read c longs from I/O port p into the buffer at a
+ */
+TEXT	insl(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of longs */
+	MOVL	a+4(FP),DI		/* DI = destination buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; INSL
+	RET
+
+/*
+ *  write c longs from the buffer at a to I/O port p
+ */
+TEXT	outsl(SB),$0
+	CLD				/* string operations move upwards */
+	MOVL	c+8(FP),CX		/* CX = number of longs */
+	MOVL	a+4(FP),SI		/* SI = source buffer */
+	MOVL	p+0(FP),DX		/* DX = port number */
+	REP; OUTSL
+	RET
+
+/*
+ *  routines to load/read various system registers
+ */
+/*
+ * idtptr is a 6-byte operand for loading IDTR.  putidt fills it in from
+ * its two arguments - the first goes into the 4 bytes at offset 2, the
+ * low 16 bits of the second into the 2 bytes at offset 0 - and then
+ * loads IDTR from it.
+ */
+GLOBL	idtptr(SB),$6
+TEXT	putidt(SB),$0		/* interrupt descriptor table */
+	MOVL	t+0(FP),AX
+	MOVL	AX,idtptr+2(SB)
+	MOVL	l+4(FP),AX
+	MOVW	AX,idtptr(SB)
+	MOVL	idtptr(SB),IDTR
+	RET
+
+/* load GDTR from the memory operand whose address is the first argument */
+TEXT lgdt(SB), $0			/* GDTR - global descriptor table */
+	MOVL	gdtptr+0(FP), AX
+	MOVL	(AX), GDTR
+	RET
+
+/* load IDTR from the memory operand whose address is the first argument */
+TEXT lidt(SB), $0			/* IDTR - interrupt descriptor table */
+	MOVL	idtptr+0(FP), AX
+	MOVL	(AX), IDTR
+	RET
+
+/* load CR3 with the first argument */
+TEXT	putcr3(SB),$0		/* top level page table pointer */
+	MOVL	t+0(FP),AX
+	MOVL	AX,CR3
+	RET
+
+/* return the current value of CR0 in AX */
+TEXT	getcr0(SB),$0		/* coprocessor bits */
+	MOVL	CR0,AX
+	RET
+
+/* return the current value of CR2 in AX */
+TEXT	getcr2(SB),$0		/* fault address */
+	MOVL	CR2,AX
+	RET
+
+/* return the current value of CR3 in AX */
+TEXT	getcr3(SB),$0		/* page directory base */
+	MOVL	CR3,AX
+	RET
+
+/* return the current value of CR4 in AX */
+TEXT	getcr4(SB), $0		/* CR4 - extensions */
+	MOVL	CR4, AX
+	RET
+
+/* load CR4 with the first argument */
+TEXT putcr4(SB), $0
+	MOVL	cr4+0(FP), AX
+	MOVL	AX, CR4
+	RET
+
+/*
+ * Execute RDTSC and store the 64-bit result through the pointer passed
+ * as the first argument: AX at offset 0, DX at offset 4.
+ */
+TEXT _cycles(SB), $0				/* time stamp counter */
+	RDTSC
+	MOVL	vlong+0(FP), CX			/* &vlong */
+	MOVL	AX, 0(CX)			/* lo */
+	MOVL	DX, 4(CX)			/* hi */
+	RET
+
+/*
+ * Read the model-specific register whose index is the first argument
+ * and store the 64-bit result through the pointer passed as the second
+ * argument: AX at offset 0, DX at offset 4.
+ */
+TEXT rdmsr(SB), $0				/* model-specific register */
+	MOVL	index+0(FP), CX
+	RDMSR
+	MOVL	vlong+4(FP), CX			/* &vlong */
+	MOVL	AX, 0(CX)			/* lo */
+	MOVL	DX, 4(CX)			/* hi */
+	RET
+	
+/*
+ * Write a model-specific register: index selects the register, lo and
+ * hi supply the two halves of the value (WRMSR takes them in AX and DX).
+ */
+TEXT wrmsr(SB), $0
+	MOVL	hi+8(FP), DX			/* upper half of the value */
+	MOVL	lo+4(FP), AX			/* lower half of the value */
+	MOVL	index+0(FP), CX			/* register index */
+	WRMSR
+	RET
+
+/*
+ * memory barriers
+ */
+/*
+ * Barrier built from IRETL: pop the caller's return PC, push EFLAGS, CS
+ * and that PC to form an interrupt-return frame, and IRETL back to the
+ * caller through it.
+ */
+TEXT mb386(SB), $0
+	POPL	AX				/* return PC */
+	PUSHFL
+	PUSHL	CS
+	PUSHL	AX
+	IRETL
+
+/* barrier built from CPUID, executed with AX = 0 */
+TEXT mb586(SB), $0
+	XORL	AX, AX
+	CPUID
+	RET
+
+/* hand-encoded SFENCE instruction (bytes 0F AE F8) */
+TEXT sfence(SB), $0
+	BYTE $0x0f
+	BYTE $0xae
+	BYTE $0xf8
+	RET
+
+/* hand-encoded LFENCE instruction (bytes 0F AE E8) */
+TEXT lfence(SB), $0
+	BYTE $0x0f
+	BYTE $0xae
+	BYTE $0xe8
+	RET
+
+/* hand-encoded MFENCE instruction (bytes 0F AE F0) */
+TEXT mfence(SB), $0
+	BYTE $0x0f
+	BYTE $0xae
+	BYTE $0xf0
+	RET
+
+/*
+ *  special traps
+ *
+ *  One stub per handled vector.  Each stub leaves two longs on the stack
+ *  for intrcommon - an error-code slot and the trap number - and jumps
+ *  to it.  Most stubs push a 0 for the error-code slot themselves;
+ *  intr8 and intr10-intr14 push only the trap number (presumably because
+ *  the processor has already pushed an error code for those vectors).
+ *  intrbad uses the out-of-range trap number 0x1ff.
+ *
+ *  intrcommon saves DS, ES, FS, GS and the general registers, loads DS
+ *  and ES with KDSEL, calls trap() with a pointer to the saved frame,
+ *  then restores everything, drops the two longs and returns with IRETL.
+ */
+TEXT	intr0(SB),$0
+	PUSHL	$0
+	PUSHL	$0
+	JMP	intrcommon
+TEXT	intr1(SB),$0
+	PUSHL	$0
+	PUSHL	$1
+	JMP	intrcommon
+TEXT	intr2(SB),$0
+	PUSHL	$0
+	PUSHL	$2
+	JMP	intrcommon
+TEXT	intr3(SB),$0
+	PUSHL	$0
+	PUSHL	$3
+	JMP	intrcommon
+TEXT	intr4(SB),$0
+	PUSHL	$0
+	PUSHL	$4
+	JMP	intrcommon
+TEXT	intr5(SB),$0
+	PUSHL	$0
+	PUSHL	$5
+	JMP	intrcommon
+TEXT	intr6(SB),$0
+	PUSHL	$0
+	PUSHL	$6
+	JMP	intrcommon
+TEXT	intr7(SB),$0
+	PUSHL	$0
+	PUSHL	$7
+	JMP	intrcommon
+TEXT	intr8(SB),$0
+	PUSHL	$8
+	JMP	intrcommon
+TEXT	intr9(SB),$0
+	PUSHL	$0
+	PUSHL	$9
+	JMP	intrcommon
+TEXT	intr10(SB),$0
+	PUSHL	$10
+	JMP	intrcommon
+TEXT	intr11(SB),$0
+	PUSHL	$11
+	JMP	intrcommon
+TEXT	intr12(SB),$0
+	PUSHL	$12
+	JMP	intrcommon
+TEXT	intr13(SB),$0
+	PUSHL	$13
+	JMP	intrcommon
+TEXT	intr14(SB),$0
+	PUSHL	$14
+	JMP	intrcommon
+TEXT	intr15(SB),$0
+	PUSHL	$0
+	PUSHL	$15
+	JMP	intrcommon
+TEXT	intr16(SB),$0
+	PUSHL	$0
+	PUSHL	$16
+	JMP	intrcommon
+TEXT	intr24(SB),$0
+	PUSHL	$0
+	PUSHL	$24
+	JMP	intrcommon
+TEXT	intr25(SB),$0
+	PUSHL	$0
+	PUSHL	$25
+	JMP	intrcommon
+TEXT	intr26(SB),$0
+	PUSHL	$0
+	PUSHL	$26
+	JMP	intrcommon
+TEXT	intr27(SB),$0
+	PUSHL	$0
+	PUSHL	$27
+	JMP	intrcommon
+TEXT	intr28(SB),$0
+	PUSHL	$0
+	PUSHL	$28
+	JMP	intrcommon
+TEXT	intr29(SB),$0
+	PUSHL	$0
+	PUSHL	$29
+	JMP	intrcommon
+TEXT	intr30(SB),$0
+	PUSHL	$0
+	PUSHL	$30
+	JMP	intrcommon
+TEXT	intr31(SB),$0
+	PUSHL	$0
+	PUSHL	$31
+	JMP	intrcommon
+TEXT	intr32(SB),$0
+	PUSHL	$0
+	PUSHL	$32
+	JMP	intrcommon
+TEXT	intr33(SB),$0
+	PUSHL	$0
+	PUSHL	$33
+	JMP	intrcommon
+TEXT	intr34(SB),$0
+	PUSHL	$0
+	PUSHL	$34
+	JMP	intrcommon
+TEXT	intr35(SB),$0
+	PUSHL	$0
+	PUSHL	$35
+	JMP	intrcommon
+TEXT	intr36(SB),$0
+	PUSHL	$0
+	PUSHL	$36
+	JMP	intrcommon
+TEXT	intr37(SB),$0
+	PUSHL	$0
+	PUSHL	$37
+	JMP	intrcommon
+TEXT	intr38(SB),$0
+	PUSHL	$0
+	PUSHL	$38
+	JMP	intrcommon
+TEXT	intr39(SB),$0
+	PUSHL	$0
+	PUSHL	$39
+	JMP	intrcommon
+TEXT	intr64(SB),$0
+	PUSHL	$0
+	PUSHL	$64
+	JMP	intrcommon
+TEXT	intrbad(SB),$0
+	PUSHL	$0
+	PUSHL	$0x1ff
+	JMP	intrcommon
+
+intrcommon:
+	PUSHL	DS
+	PUSHL	ES
+	PUSHL	FS
+	PUSHL	GS
+	PUSHAL
+	MOVL	$(KDSEL),AX
+	MOVW	AX,DS
+	MOVW	AX,ES
+	LEAL	0(SP),AX	/* AX = address of the saved registers */
+	PUSHL	AX		/* argument to trap */
+	CALL	trap(SB)
+	POPL	AX		/* discard the argument */
+	POPAL
+	POPL	GS
+	POPL	FS
+	POPL	ES
+	POPL	DS
+	ADDL	$8,SP	/* error code and trap type */
+	IRETL
+
+
+/*
+ *  interrupt level is interrupts on or off.
+ *  kprof knows that spllo to spldone is splx routines.
+ */
+/* enable interrupts; returns the EFLAGS value from before the STI in AX */
+TEXT	spllo(SB),$0
+	PUSHFL
+	POPL	AX
+	STI
+	RET
+
+/* disable interrupts; returns the EFLAGS value from before the CLI in AX */
+TEXT	splhi(SB),$0
+	PUSHFL
+	POPL	AX
+	CLI
+	RET
+
+/* load EFLAGS from the first argument (a value returned by spllo/splhi) */
+TEXT	splx(SB),$0
+	MOVL	s+0(FP),AX
+	PUSHL	AX
+	POPFL
+	RET
+
+/* does nothing; marks the end of the spl routines (see the kprof note above spllo) */
+TEXT spldone(SB), $0
+	RET
+
+/* returns EFLAGS masked with 0x200 in AX: non-zero when interrupts are enabled */
+TEXT islo(SB), $0
+	PUSHFL
+	POPL	AX
+	ANDL	$0x200, AX			/* interrupt enable flag */
+	RET
+
+/*
+ *  basic timing loop to determine CPU frequency
+ *
+ *  Executes an AAM instruction c times, with LOOP counting CX down from
+ *  the first argument.  The caller is expected to time the call.
+ */
+TEXT	aamloop(SB),$0
+
+	MOVL	c+0(FP),CX
+aaml1:
+	AAM
+	LOOP	aaml1
+	RET

+ 36 - 0
sys/src/9/pcboot/l64p.s

@@ -0,0 +1,36 @@
+#include "mem.h"
+
+/*
+ * Macro for calculating offset within the page directory base.
+ * Note that this is assembler-specific hence the '<<2'.
+ */
+#define PDO(a)		(((((a))>>22) & 0x03FF)<<2)
+
+/*
+ * _warp64(entry): with interrupts off, copy the page-directory entry
+ * for KZERO into the entry for virtual address 0, reload CR3 to flush,
+ * and continue at the -KZERO address of _start32id.
+ * entry is kept in BP for the final jump in _stop32pg.
+ */
+TEXT _warp64(SB), $0
+	CLI
+	MOVL	entry+0(FP), BP			/* entry */
+
+	MOVL	CR3, CX				/* load address of PDB */
+	ADDL	$KZERO, CX
+	MOVL	PDO(KZERO)(CX), DX		/* double-map KZERO at 0 */
+	MOVL	DX, PDO(0)(CX)
+
+	MOVL	CR3, CX
+	MOVL	CX, CR3				/* load and flush the mmu */
+
+	MOVL	$_start32id<>-KZERO(SB), AX
+	JMP*	AX				/* jump to identity-map */
+
+/*
+ * Reached through the mapping at 0 set up by _warp64: clear PG in CR0
+ * and carry on at the -KZERO address of _stop32pg.  The target is
+ * loaded into AX before CR0 is written.
+ */
+TEXT _start32id<>(SB), $0
+	MOVL	CR0, DX				/* turn off paging */
+	ANDL	$~PG, DX
+
+	MOVL	$_stop32pg<>-KZERO(SB), AX
+	MOVL	DX, CR0
+	JMP*	AX				/* forward into the past */
+
+/*
+ * Paging is off: load BX from the word at multibootheader, put
+ * 0x2badb002 in AX (the value the Multiboot specification has a
+ * boot loader pass in EAX) and jump to the entry saved in BP.
+ */
+TEXT _stop32pg<>(SB), $0
+	MOVL multibootheader-KZERO(SB), BX
+	MOVL $0x2badb002, AX
+
+	JMP* BP

+ 322 - 0
sys/src/9/pcboot/ldecomp.s

@@ -0,0 +1,322 @@
+/*
+ * Bootstrap loader decompressor.  Starts at 0x10000 (where pbs puts it)
+ * or 0x7c00 (where pxe puts it) and memmoves kernel (immediately following)
+ * into standard kernel location.
+ */
+#include "mem.h"
+#include "/sys/src/boot/pc/x16.h"
+
+#undef BIOSCALL		/* we don't know what evil the bios gets up to */
+#define BIOSCALL(b)	INT $(b); CLI
+
+#define CMPL(r0, r1)	BYTE $0x66; CMP(r0, r1)
+#define LLI(i, rX)	BYTE $0x66;		/* i -> rX */		\
+			BYTE $(0xB8+rX);				\
+			LONG $i;
+#define CPUID		BYTE $0x0F; BYTE $0xA2	/* CPUID, argument in AX */
+#define WBINVD		BYTE $0x0F; BYTE $0x09
+
+/*
+ * origin: real-mode entry point.  In order: set up DS, SS and SP, ask
+ * the BIOS to enable A20 and select video mode 3, print _hello, record
+ * APM and E820 data in the BIOSTABLES area, load the GDT described by
+ * loadgdtptr, set the low bit of the machine status word, reload the
+ * data segment registers with KDSEL and far-jump to _main through KESEL.
+ */
+TEXT origin(SB), $0
+/*
+ *	turn off interrupts
+ */
+	CLI
+
+	/*
+	 * This part of l.s is used only in the boot kernel.
+	 * It assumes that we are in real address mode, i.e.,
+	 * that we look like an 8086.
+	 *
+	 * Make sure the segments are reasonable.
+	 * If we were started directly from the BIOS
+	 * (i.e. no MS-DOS) then DS may not be
+	 * right.
+	 */
+	MOVW	CS, AX
+	MOVW	AX, DS
+
+	/* from ../l16r.s */
+	LWI(0, rAX)			/* always put stack in first 64k */
+	MTSR(rAX, rSS)
+	LWI(origin(SB), rSP)		/* set the stack pointer */
+
+	DELAY
+
+	LWI(0x2401, rAX)		/* enable a20 line */
+	BIOSCALL(0x15)
+
+	XORL	AX, AX
+	MOVB	$0x03, AL
+//	LWI(3, rAX)
+	INT	$0x10			/* set video mode in AL */
+
+/*
+ * Check for CGA mode.
+ */
+_cgastart:
+	LWI(0x0F00, rAX)		/* get current video mode in AL */
+	BIOSCALL(0x10)
+	ANDI(0x007F, rAX)
+	SUBI(0x0003, rAX)		/* is it mode 3? */
+	JEQ	_cgamode3
+
+	LWI(0x0003, rAX)		/* turn on text mode 3 */
+	BIOSCALL(0x10)
+_cgamode3:
+	LWI(_hello(SB), rSI)
+	CALL	_cgaputs(SB)
+
+/*
+ * start of transplanted apm & e820 scan code from l16r.s
+ */
+	LLI(BIOSTABLES, rAX)	/* tables in low memory, not after end */
+	OPSIZE; ANDL $~(BY2PG-1), AX
+	OPSIZE; SHRL $4, AX
+	SW(rAX, _ES(SB))
+	CLR(rDI)
+	SW(rDI, _DI(SB))
+
+	MTSR(rAX, rES)
+	
+/*
+ * Check for APM1.2 BIOS support.
+ */
+	DELAY
+	LWI(0x5304, rAX)		/* disconnect anyone else */
+	CLR(rBX)
+	BIOSCALL(0x15)
+	JCS	_apmfail
+
+	LWI(0x5303, rAX)		/* connect */
+	CLR(rBX)
+	CLC
+	BIOSCALL(0x15)
+	JCC	_apmpush
+_apmfail:
+	LW(_ES(SB), rAX)		/* no support */
+	MTSR(rAX, rES)
+	LW(_DI(SB), rDI)
+	DELAY
+	/*
+	 * NOTE(review): this branch relies on carry still being set from
+	 * the failed BIOS call, i.e. on the loads and DELAY above leaving
+	 * the flags alone -- confirm against x16.h.
+	 */
+	JCS	_apmend
+
+_apmpush:
+	OPSIZE; PUSHR(rSI)		/* save returned APM data on stack */
+	OPSIZE; PUSHR(rBX)
+	PUSHR(rDI)
+	PUSHR(rDX)
+	PUSHR(rCX)
+	PUSHR(rAX)
+
+	LW(_ES(SB), rAX)
+	MTSR(rAX, rES)
+	LW(_DI(SB), rDI)
+	DELAY
+
+	LWI(0x5041, rAX)		/* first 4 bytes are APM\0 */
+	STOSW
+	LWI(0x004D, rAX)
+	STOSW
+
+	/*
+	 * 8 words: presumably the two OPSIZE pushes above are 32 bits
+	 * each, plus the four 16-bit ones.
+	 */
+	LWI(8, rCX)			/* pop the saved APM data */
+_apmpop:
+	POPR(rAX)
+	STOSW
+	LOOP	_apmpop
+_apmend:
+
+/*
+ * Try to retrieve the 0xE820 memory map.
+ * This is weird because some BIOS do not seem to preserve
+ * ES/DI on failure. Consequently they may not be valid
+ * at _e820end:.
+ */
+	SW(rDI, _DI(SB))		/* save DI */
+	CLR(rAX)			/* write terminator (for APM?) */
+	STOSW
+	STOSW
+
+	CLR(rBX)
+	PUSHR(rBX)			/* keep loop count on stack */
+					/* BX is the continuation value */
+_e820loop:
+	POPR(rAX)
+	INC(rAX)
+	PUSHR(rAX)			/* doesn't affect FLAGS */
+	CMPI(32, rAX)			/* mmap[32+1] in C code */
+	JGT	_e820pop
+
+	LLI(20, rCX)			/* buffer size */
+	LLI(0x534D4150, rDX)		/* signature - ASCII "SMAP" */
+	LLI(0x0000E820, rAX)		/* function code */
+
+	BIOSCALL(0x15)			/* writes 20 bytes at (es,di) */
+
+	JCS	_e820pop		/* some kind of error */
+	LLI(0x534D4150, rDX)
+	CMPL(rDX, rAX)			/* verify correct BIOS version */
+	JNE	_e820pop
+	LLI(20, rDX)
+	CMPL(rDX, rCX)			/* verify correct count */
+	JNE	_e820pop
+
+	SUBI(4, rDI)			/* overwrite terminator */
+	LWI(0x414D, rAX)		/* first 4 bytes are "MAP\0" */
+	STOSW
+	LWI(0x0050, rAX)
+	STOSW
+
+	ADDI(20, rDI)			/* bump to next entry */
+
+	SW(rDI, _DI(SB))		/* save DI */
+	CLR(rAX)			/* write terminator */
+	STOSW
+	STOSW
+
+	OR(rBX, rBX)			/* zero if last entry */
+	JNE	_e820loop
+
+_e820pop:
+	POPR(rAX)			/* loop count */
+	LW(_DI(SB), rDI)
+	CLR(rAX)
+	MTSR(rAX, rES)
+	DELAY
+_e820end:
+/*
+ * end of transplanted apm & e820 scan code from l16r.s
+ */
+
+/*
+ * 	goto protected mode
+ */
+/*	MOVL	loadgdtptr(SB),GDTR /**/
+	 BYTE	$0x0f
+	 BYTE	$0x01
+	 BYTE	$0x16
+	 WORD	$loadgdtptr(SB)
+
+	DELAY
+	LWI(1, rAX)			/* only the low bit of the status word is set */
+	/* MOV AX,MSW */
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF0
+
+/*
+ *	clear prefetch queue (weird code to avoid optimizations)
+ */
+	DELAY
+
+/*
+ *	set all segs
+ */
+/*	MOVW	$KDSEL,AX	/**/
+	 BYTE	$0xc7
+	 BYTE	$0xc0
+	 WORD	$KDSEL
+	MOVW	AX,DS
+	MOVW	AX,SS
+	MOVW	AX,ES
+	MOVW	AX,FS
+	MOVW	AX,GS
+
+	MOVW	$(20*1024*1024-4), SP		/* new stack pointer */
+	DELAY
+
+	/* god only knows what the damned bios has been up to... */
+	CLI
+
+	/* jump to C (main) */
+/*	JMPFAR	KESEL:$main(SB) /**/
+	 BYTE	$0x66
+	 BYTE	$0xEA
+	 LONG	$_main(SB)
+	 WORD	$KESEL
+
+/*
+ * output a cheery wee message (rSI)
+ * Each byte fetched by LODSB is handed to BIOS int 0x10 with AH=0x0E;
+ * a zero byte ends the string.  BX is cleared once before the loop.
+ */
+TEXT _cgaputs(SB), $0
+//_cgaputs:
+	CLR(rBX)
+_cgaputsloop:
+	LODSB
+	ORB(rAL, rAL)			/* zero byte: end of string */
+	JEQ	_cgaend
+
+	LBI(0x0E,rAH)
+	BIOSCALL(0x10)
+	JMP	_cgaputsloop
+_cgaend:
+	RET
+
+/* the string origin prints through _cgaputs: "\r\n9boot " followed by a terminating \z byte */
+TEXT _hello(SB), $0
+	BYTE $'\r'; BYTE $'\n'
+	BYTE $'9'; BYTE $'b'; BYTE $'o'; BYTE $'o'
+	BYTE $'t'; BYTE $' '
+	BYTE $'\z'
+
+/*
+ * offset into bios tables using segment ES.  stos? use (es,di)
+ * origin stores DI here (SW) and reloads it (LW) around BIOS calls.
+ */
+TEXT _DI(SB), $0
+	LONG	$0
+
+/*
+ * segment address of bios tables (BIOSTABLES >> 4)
+ * written once by origin and reloaded into ES after the APM calls.
+ */
+TEXT _ES(SB), $0
+	LONG	$0
+
+/*
+ *  pointer to initial gdt: the 6-byte operand origin loads into GDTR
+ *  (16-bit limit, then 32-bit base).  The limit is the offset of the
+ *  last valid byte of the table, so it is one less than the size of
+ *  the four 8-byte descriptors in loadgdt.
+ */
+TEXT	loadgdtptr(SB),$0
+	WORD	$(4*8-1)
+	LONG	$loadgdt(SB)
+
+/*
+ *  gdt to get us to 32-bit/segmented/unpaged mode
+ *  Entry 1 is the one KDSEL selects and entry 2 the one KESEL selects
+ *  (see SELECTOR in mem.h); loadgdtptr points here.
+ */
+TEXT	loadgdt(SB),$0
+
+	/* null descriptor */
+	LONG	$0
+	LONG	$0
+
+	/* data segment descriptor for 4 gigabytes (PL 0) */
+	LONG	$(0xFFFF)
+	LONG	$(SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(0)|SEGDATA|SEGW)
+
+	/* exec segment descriptor for 4 gigabytes (PL 0) */
+	LONG	$(0xFFFF)
+	LONG	$(SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR)
+
+	/* exec segment descriptor for 4 gigabytes (PL 0) 16-bit */
+	LONG	$(0xFFFF)
+	LONG	$(SEGG|(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR)
+
+/*
+ *  output a byte: outb(p, b) loads port p into DX and b into AX,
+ *  then executes OUTB
+ */
+TEXT	outb(SB),$0
+	MOVL	p+0(FP),DX
+	MOVL	b+4(FP),AX
+	OUTB
+	RET
+
+/*
+ *  input a byte: inb(p) loads port p into DX, clears AX so the upper
+ *  bits are zero, then executes INB
+ */
+TEXT	inb(SB),$0
+	MOVL	p+0(FP),DX
+	XORL	AX,AX
+	INB
+	RET
+
+/*
+ * mb586: execute CPUID with AX = 0.
+ * NOTE(review): presumably used only for CPUID's serialising effect
+ * (isa20on() calls it between its stores and loads) -- confirm.
+ */
+TEXT mb586(SB), $0
+	XORL	AX, AX
+	CPUID
+	RET
+
+/* wbinvd: execute the WBINVD instruction (emitted as raw bytes by the macro at the top of this file) */
+TEXT wbinvd(SB), $0
+	WBINVD
+	RET
+
+/* splhi: leave the flags as they were on entry in AX, then disable interrupts */
+TEXT	splhi(SB),$0
+	PUSHFL
+	POPL	AX
+	CLI
+	RET

+ 37 - 0
sys/src/9/pcboot/load

@@ -0,0 +1,37 @@
+# load - 9load (not 9pxeload) as a variant of 9pccpu.
+#	has to fit in bottom 640K, currently is about 378K.
+dev
+	root
+	cons
+	arch
+	rtc
+#	floppy
+	sd
+
+	uart
+
+misc
+	bootld
+	conf
+	dir
+	diskload
+#	dma
+	dosboot
+	fs
+	inflate
+	nomtrr
+	parts
+	pci
+	rand
+	stub
+	uarti8250
+
+	sdata		pci sdscsi
+	sd53c8xx	pci sdscsi
+	sdmylex		pci sdscsi
+	sdiahci		pci sdscsi
+#	sdflop
+
+port
+	int cpuserver = 1;
+	char hellomsg[] = "disk loader";

+ 37 - 0
sys/src/9/pcboot/loadusb

@@ -0,0 +1,37 @@
+# loadusb - 9loadusb as a variant of 9pccpu.
+#	has to fit in bottom 640K, currently is about 310K.
+dev
+	root
+	cons
+	arch
+	rtc
+# order matters: bios must precede sd
+	bios
+	sd
+
+	uart
+
+misc
+	bootld
+	conf
+	dir
+	diskload
+	dosboot
+	fs
+	inflate
+	nomtrr
+	parts
+	pci
+	rand
+	realmode
+	stub
+	uarti8250
+
+	sdbios		pci sdscsi
+
+port
+	int cpuserver = 1;
+# disabling the clock makes 9loadusb run faster, but also breaks
+# (timed) keyboard input, alas.
+#	int noclock = 1;
+	char hellomsg[] = "bios (usb) loader";

+ 737 - 0
sys/src/9/pcboot/main.c

@@ -0,0 +1,737 @@
+/*
+ * 9boot - load next 386 or amd64 kernel via pxe (bootp, tftp) and start it
+ *	and
+ * 9load - load next 386 or amd64 kernel from disk and start it
+ *
+ * intel says that pxe can only load into the bottom 640K, and
+ * intel's pxe boot agent takes 128K, leaving only 512K for 9boot.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"reboot.h"
+#include	"ip.h"		/* for eipfmt */
+
+enum {
+	Datamagic = 0xbabeabed,
+};
+
+Mach *m;
+
+ulong* mach0pdb;
+Mach* mach0m;
+Segdesc* mach0gdt;
+u32int memstart;
+u32int memend;
+int noclock;
+
+extern int pcivga;
+extern char hellomsg[];
+
+/*
+ * Where configuration info is left for the loaded programme.
+ */
+char bootdisk[KNAMELEN];
+Conf conf;
+
+uchar *sp;	/* user stack of init proc */
+int delaylink;
+int debug;
+int v_flag;
+
+/*
+ * Cross-check the bootstrap's idea of its own state: the page directory
+ * named by CR3 must be the one recorded in m->pdb and mach0pdb, and
+ * m and m->gdt must match mach0m and mach0gdt.  Any mismatch panics.
+ */
+static void
+sanity(void)
+{
+	uintptr pdbva;
+
+	pdbva = (uintptr)KADDR(getcr3());
+	if (pdbva == 0)
+		panic("zero cr3");
+	if (!((uintptr)m->pdb == pdbva && (uintptr)mach0pdb == pdbva))
+		panic("not all same: cr3 %#p m->pdb %#p mach0pdb %#p",
+			pdbva, m->pdb, mach0pdb);
+	if (mach0m != m)
+		panic("m %#p != mach0m %#p", m, mach0m);
+	if (mach0gdt != m->gdt)
+		panic("m->gdt %#p != mach0gdt %#p", m->gdt, mach0gdt);
+	/* debugging aid, compiled out; the local's address stands in for sp */
+	if (0)
+		iprint("m->pdb %#p m %#p sp %#p m->gdt %#p\n",
+			m->pdb, m, &pdbva, m->gdt);
+}
+
+enum {
+	/* system control port a */
+	Sysctla=	0x92,
+	 Sysctlreset=	1<<0,
+	 Sysctla20ena=	1<<1,
+};
+
+/*
+ * Report whether the A20 line is enabled.
+ * Writes different values to address 0 and to address MB (1MB) and
+ * compares them: if they read back equal, the two addresses alias,
+ * i.e. addresses are still wrapping at 1MB.
+ * Returns non-zero when the two locations are distinct.
+ * Both words are saved first and put back afterwards, so the probe
+ * leaves memory as it found it whether or not A20 is on.
+ */
+static int
+isa20on(void)
+{
+	int r;
+	ulong o, o1;
+	ulong *zp, *mb1p;
+
+	zp = 0;
+	mb1p = (ulong *)MB;
+	o = *zp;
+	o1 = *mb1p;
+
+	*zp = 0x1234;
+	*mb1p = 0x8765;
+	mb586();
+	wbinvd();
+	r = *zp != *mb1p;
+
+	/*
+	 * restore the 1MB word first: if the two addresses alias,
+	 * the final store still leaves the original word at 0.
+	 */
+	*mb1p = o1;
+	*zp = o;
+	return r;
+}
+
+/*
+ * Make sure the A20 line is on.  Nothing is touched if it already is;
+ * otherwise try the keyboard controller, and failing that system
+ * control port A.  Complains on the console if neither worked.
+ */
+void
+a20init(void)
+{
+	int ctl;
+
+	if (!isa20on()) {
+		/* original method, via kbd ctlr */
+		i8042a20();
+		if (!isa20on()) {
+			/* newer method, last resort */
+			ctl = inb(Sysctla);
+			if ((ctl & Sysctla20ena) == 0)
+				outb(Sysctla, (ctl & ~Sysctlreset) | Sysctla20ena);
+			if (isa20on() == 0)
+				iprint("a20 didn't come on!\n");
+		}
+	}
+}
+
+/*
+ * C entry point of the bootstrap.  Brings the machine up in a fixed
+ * order (a20, Mach, console and traps, memory, devices, first process)
+ * and ends by calling schedinit().  The cgapost() calls mark progress
+ * along the way.
+ */
+void
+main(void)
+{
+	Proc *savup;
+	static ulong vfy = Datamagic;	/* initialised data: checked below */
+	static char novga[] = "\nno vga; serial console only\n";
+
+	/* up lives in bss, so a non-nil value here means bss wasn't cleared */
+	savup = up;
+	up = nil;
+	/* m has been set by l32v.s */
+
+	/*
+	 * disable address wraps at 1MB boundaries.
+	 * if we're 9boot, ldecomp.s already did this.
+	 */
+	a20init();
+
+	mach0init();
+//	options();		/* we don't get options passed to us */
+	ioinit();
+	/* we later call i8250console after plan9.ini has been read */
+	i8250config("0");	/* configure serial port 0 with defaults */
+	quotefmtinstall();
+ 	fmtinstall('i', eipfmt);
+ 	fmtinstall('I', eipfmt);
+ 	fmtinstall('E', eipfmt);
+ 	fmtinstall('V', eipfmt);
+ 	fmtinstall('M', eipfmt);
+	screeninit();			/* cga setup */
+	cgapost(0xc);
+
+	trapinit0();
+	mmuinit0();
+
+	kbdinit();
+	i8253init();
+	cpuidentify();
+	readlsconf();
+	meminit();
+	confinit();
+	archinit();
+	xinit();
+	if(i8237alloc != nil)
+		i8237alloc();		/* dma (for floppy) init */
+	trapinit();
+	printinit();
+	sanity();
+	cgapost(1);
+
+	/*
+	 * soekris servers have no built-in video but each has a serial port.
+	 * they must see serial output, if any, before cga output because
+	 * otherwise the soekris bios will translate cga output to serial
+	 * output, which will garble serial console output.
+	 */
+	pcimatch(nil, 0, 0);		/* force scan of pci table */
+	if (!pcivga) {
+		screenputs = nil;
+		uartputs(novga, sizeof novga - 1);
+	}
+	print(" %s\n\n", hellomsg);
+
+	if (vfy != Datamagic)
+		panic("data segment incorrectly aligned or loaded");
+	if (savup)
+		print("up was non-nil (%#p) upon entry to main; bss wasn't zeroed!\n",
+			savup);
+
+//	xsummary();
+	cpuidprint();
+	mmuinit();
+	if(arch->intrinit)	/* launches other processors on an mp */
+		arch->intrinit();
+	timersinit();
+	mathinit();
+	kbdenable();
+	/*
+	 * 9loadusb runs much faster if we don't use the clock.
+	 * perhaps we're competing with the bios for the use of it?
+	 */
+	if(!noclock && arch->clockenable)
+		arch->clockenable();
+	procinit0();
+	initseg();
+	if(delaylink){
+		bootlinks();
+		pcimatch(0, 0, 0);
+	}else
+		links();
+	conf.monitor = 1;
+	cgapost(0xcd);
+	chandevreset();
+	cgapost(2);
+	pageinit();	/* must follow xinit, and conf.mem must be populated */
+	i8253link();
+	userinit();
+
+	active.thunderbirdsarego = 1;
+	cgapost(0xb0);
+	schedinit();
+}
+
+/*
+ * Set up the Mach structure of the only processor the bootstrap uses:
+ * record it as machine 0 with the page directory and gdt left in
+ * mach0pdb and mach0gdt, let machinit() reset the rest, and mark the
+ * processor active.
+ */
+void
+mach0init(void)
+{
+	conf.nmach = 1;
+	MACHP(0) = mach0m;
+	/* these three fields are the ones machinit() preserves across its memset */
+	m->machno = 0;
+	m->pdb = mach0pdb;
+	m->gdt = mach0gdt;
+
+	machinit();
+
+	active.machs = 1;
+	active.exiting = 0;
+}
+
+/*
+ * Reset *m to zeroes while keeping its machine number, page directory
+ * and gdt pointers, then fill in the defaults needed this early.
+ */
+void
+machinit(void)
+{
+	int no;
+	ulong *savedpdb;
+	Segdesc *savedgdt;
+
+	/* remember the three fields that must survive the wipe */
+	savedgdt = m->gdt;
+	savedpdb = m->pdb;
+	no = m->machno;
+
+	memset(m, 0, sizeof *m);
+
+	m->machno = no;
+	m->pdb = savedpdb;
+	m->gdt = savedgdt;
+	m->perf.period = 1;
+
+	/*
+	 * Polled uart output at boot needs some delay constant
+	 * before cpuidentify has measured the real one;
+	 * 100000 should be plenty until then.
+	 */
+	m->loopconst = 100000;
+}
+
+/*
+ * Body of the first process (userinit() points its saved pc here).
+ * Gives the process a root and current directory, initialises the
+ * devices, starts the alarm and bootload kprocs and then just keeps
+ * calling sched(); it never returns.
+ */
+void
+init0(void)
+{
+	int i;
+	char buf[2*KNAMELEN];
+
+	up->nerrlab = 0;
+
+	spllo();
+
+	/*
+	 * These are o.k. because rootinit is null.
+	 * Then early kproc's will have a root and dot.
+	 */
+	up->slash = namec("#/", Atodir, 0, 0);
+	pathclose(up->slash->path);
+	up->slash->path = newpath("/");
+	up->dot = cclone(up->slash);
+
+	chandevinit();
+
+	/* the leading 0 disables this whole block; it is kept for reference */
+	if(0 && !waserror()){			/* not needed by boot */
+		snprint(buf, sizeof(buf), "%s %s", arch->id, conffile);
+		ksetenv("terminal", buf, 0);
+		ksetenv("cputype", "386", 0);
+		if(cpuserver)
+			ksetenv("service", "cpu", 0);
+		else
+			ksetenv("service", "terminal", 0);
+		for(i = 0; i < nconf; i++){
+			if(confname[i][0] != '*')
+				ksetenv(confname[i], confval[i], 0);
+			ksetenv(confname[i], confval[i], 1);
+		}
+		poperror();
+	}
+	kproc("alarm", alarmkproc, 0);
+	kproc("bootload", bootloadproc, 0);
+
+//	touser(sp);	/* user mode isn't really implemented in the boot */
+	for (;;)
+		sched();
+}
+
+/*
+ * Create the first process: allocate it and its groups, name it
+ * "*init*", point its saved pc at init0 and its saved sp near the top
+ * of its kernel stack, and make it ready to run.
+ */
+void
+userinit(void)
+{
+	Proc *p;
+
+	p = newproc();
+	p->pgrp = newpgrp();
+	p->egrp = smalloc(sizeof(Egrp));
+	p->egrp->ref = 1;
+	p->fgrp = dupfgrp(nil);
+	p->rgrp = newrgrp();
+	p->procmode = 0640;
+
+	kstrdup(&eve, "");
+	kstrdup(&p->text, "*init*");
+	kstrdup(&p->user, eve);
+
+	p->fpstate = FPinit;
+	fpoff();
+
+	/*
+	 * Kernel Stack
+	 *
+	 * N.B. make sure there's enough space for syscall to check
+	 *	for valid args and 
+	 *	4 bytes for gotolabel's return PC
+	 */
+	p->sched.pc = (ulong)init0;
+	p->sched.sp = (ulong)p->kstack+KSTACK-(sizeof(Sargs)+BY2WD);
+
+	/* NB: no user stack nor text segments are set up */
+
+	ready(p);
+}
+
+/*
+ * Copy the string p, terminator included, onto the stack being built
+ * at the global sp (which grows downwards) and return its new address.
+ */
+uchar *
+pusharg(char *p)
+{
+	int len;
+
+	len = strlen(p) + 1;		/* the +1 takes the NUL along */
+	sp = sp - len;
+	memmove(sp, p, len);
+	return sp;
+}
+
+/*
+ * we're a bootstrap loader, so we aren't passed any options.
+ *
+ * Builds an argv for "/386/9dos" in the page at base, leaving the
+ * global sp pointing at it.  The pointers stored are rebased from base
+ * to (USTKTOP - BY2PG).  cp is always the empty string here, so none
+ * of the fd/sd/ether branches below can be taken.
+ */
+void
+bootargs(void *base)
+{
+ 	int i, ac;
+	uchar *av[32];
+	uchar **lsp;
+	char *cp = "";
+	char buf[64];
+
+	sp = (uchar*)base + BY2PG - MAXSYSARG*BY2WD;
+
+	ac = 0;
+	av[ac++] = pusharg("/386/9dos");
+
+	/* when boot is changed to only use rc, this code can go away */
+//	cp[BOOTLINELEN-1] = 0;
+	buf[0] = 0;
+	if(strncmp(cp, "fd", 2) == 0){
+		snprint(buf, sizeof buf, "local!#f/fd%lddisk", strtol(cp+2, 0, 0));
+		av[ac++] = pusharg(buf);
+	} else if(strncmp(cp, "sd", 2) == 0){
+		snprint(buf, sizeof buf, "local!#S/sd%c%c/fs", *(cp+2), *(cp+3));
+		av[ac++] = pusharg(buf);
+	} else if(strncmp(cp, "ether", 5) == 0)
+		av[ac++] = pusharg("-n");
+
+	/* 4 byte word align stack */
+	sp = (uchar*)((ulong)sp & ~3);
+
+	/* build argc, argv on stack */
+	sp -= (ac+1)*sizeof(sp);
+	lsp = (uchar**)sp;
+	for(i = 0; i < ac; i++)
+		*lsp++ = av[i] + ((USTKTOP - BY2PG) - (ulong)base);
+	*lsp = 0;
+	sp += (USTKTOP - BY2PG) - (ulong)base - sizeof(ulong);
+}
+
+/*
+ * Fill in the sizing fields of conf (pages, processes, images, swap)
+ * and set the allocator limits.  userpcnt is 0, so every page counts
+ * as a kernel page and conf.upages comes out as 0.
+ */
+void
+confinit(void)
+{
+	int i, userpcnt;
+	ulong kpages;
+
+	userpcnt = 0;			/* bootstrap; no user mode  */
+	conf.npage = 0;
+	for(i=0; i<nelem(conf.mem); i++)
+		conf.npage += conf.mem[i].npage;
+
+	/*
+	 * NOTE(review): the sum computed above is overwritten by the next
+	 * statement, so conf.npage is always MemMax/BY2PG -- confirm that
+	 * this is intended.
+	 */
+	conf.npage = MemMax / BY2PG;
+	conf.nproc = 20;		/* need a few kprocs */
+	if(cpuserver)
+		conf.nproc *= 3;
+	if(conf.nproc > 2000)
+		conf.nproc = 2000;
+	conf.nimage = 40;
+	conf.nswap = conf.nproc*80;
+	conf.nswppo = 4096;
+
+	kpages = conf.npage - (conf.npage*userpcnt)/100;
+
+	/*
+	 * can't go past the end of virtual memory
+	 * (ulong)-KZERO is 2^32 - KZERO
+	 */
+	if(kpages > ((ulong)-KZERO)/BY2PG)
+		kpages = ((ulong)-KZERO)/BY2PG;
+
+	conf.upages = conf.npage - kpages;
+	conf.ialloc = (kpages/2)*BY2PG;
+
+	/*
+	 * Guess how much is taken by the large permanent
+	 * datastructures. Mntcache and Mntrpc are not accounted for
+	 * (probably ~300KB).
+	 */
+	kpages *= BY2PG;		/* from here on kpages is a byte count */
+	kpages -= conf.upages*sizeof(Page)
+		+ conf.nproc*sizeof(Proc)
+		+ conf.nimage*sizeof(Image)
+		+ conf.nswap
+		+ conf.nswppo*sizeof(Page);
+	mainmem->maxsize = kpages;
+	if(!cpuserver){
+		/*
+		 * give terminals lots of image memory, too; the dynamic
+		 * allocation will balance the load properly, hopefully.
+		 * be careful with 32-bit overflow.
+		 */
+		imagmem->maxsize = kpages;
+	}
+}
+
+/*
+ *  math coprocessor segment overrun
+ *  Both arguments are ignored; the current process is ended with
+ *  pexit().  Nothing in the visible part of this file refers to it.
+ */
+static void
+mathover(Ureg*, void*)
+{
+	pexit("math overrun", 0);
+}
+
+/* deliberately empty: main() calls it, but the bootstrap sets up no floating point here */
+void
+mathinit(void)
+{
+}
+
+/*
+ *  set up floating point for a new process:
+ *  mark its state as FPinit and call fpoff()
+ */
+void
+procsetup(Proc*p)
+{
+	p->fpstate = FPinit;
+	fpoff();
+}
+
+/*
+ * Counterpart of procsave for a process about to run: subtract the
+ * current cycle count from p->pcycles.  Processes with p->kp set are
+ * left untouched.
+ */
+void
+procrestore(Proc *p)
+{
+	uvlong now;
+
+	if(!p->kp){
+		cycles(&now);
+		p->pcycles -= now;
+	}
+}
+
+/*
+ *  Save the mach dependent part of the process state.
+ *  Here that means adding the current cycle count to p->pcycles and
+ *  flushing the TLB for the reason given below.
+ */
+void
+procsave(Proc *p)
+{
+	uvlong t;
+
+	cycles(&t);
+	p->pcycles += t;
+
+	/*
+	 * While this processor is in the scheduler, the process could run
+	 * on another processor and exit, returning the page tables to
+	 * the free list where they could be reallocated and overwritten.
+	 * When this processor eventually has to get an entry from the
+	 * trashed page tables it will crash.
+	 *
+	 * If there's only one processor, this can't happen.
+	 * You might think it would be a win not to do this in that case,
+	 * especially on VMware, but it turns out not to matter.
+	 */
+	mmuflushtlb(PADDR(m->pdb));
+}
+
+/*
+ * Take this processor out of active.machs, flag the system as exiting
+ * and wait (up to about 5 seconds) for the other processors and the
+ * console to go quiet.  After a panic, a non-cpuserver halts forever
+ * here; otherwise the function returns after a further delay.
+ */
+static void
+shutdown(int ispanic)
+{
+	int ms, once;
+
+	lock(&active);
+	if(ispanic)
+		active.ispanic = ispanic;
+	else if(m->machno == 0 && (active.machs & (1<<m->machno)) == 0)
+		active.ispanic = 0;
+	/* non-zero only the first time this processor gets here */
+	once = active.machs & (1<<m->machno);
+	/*
+	 * setting exiting will make hzclock() on each processor call exit(0),
+	 * which calls shutdown(0) and arch->reset(), which on mp systems is
+	 * mpshutdown, which idles non-bootstrap cpus and returns on bootstrap
+	 * processors (to permit a reboot).  clearing our bit in machs avoids
+	 * calling exit(0) from hzclock() on this processor.
+	 */
+	active.machs &= ~(1<<m->machno);
+	active.exiting = 1;
+	unlock(&active);
+
+	if(once)
+		iprint("cpu%d: exiting\n", m->machno);
+
+	/* wait for any other processors to shutdown */
+	spllo();
+	for(ms = 5*1000; ms > 0; ms -= TK2MS(2)){
+		delay(TK2MS(2));
+		if(active.machs == 0 && consactive() == 0)
+			break;
+	}
+
+	if(active.ispanic){
+		if(!cpuserver)
+			for(;;)
+				halt();
+		if(getconf("*debug"))
+			delay(5*60*1000);
+		else
+			delay(10000);
+	}else
+		delay(1000);
+}
+
+/*
+ * Start the kernel that has been loaded: quiesce other processors and
+ * the devices, map low memory at its physical addresses, copy the
+ * rebootcode trampoline to REBOOTADDR and call it with the physical
+ * addresses of entry and code, and with size.  Does not return.
+ */
+void
+reboot(void *entry, void *code, ulong size)
+{
+	int i;
+	void (*f)(ulong, ulong, ulong);
+	ulong *pdb;
+
+	/* we do pass options to the kernel we loaded, however, at CONFADDR. */
+	// writeconf();
+
+	/*
+	 * the boot processor is cpu0.  execute this function on it
+	 * so that the new kernel has the same cpu0.  this only matters
+	 * because the hardware has a notion of which processor was the
+	 * boot processor and we look at it at start up.
+	 */
+	if (m->machno != 0) {
+		procwired(up, 0);
+		sched();
+	}
+
+	if(conf.nmach > 1) {
+		/*
+		 * the other cpus could be holding locks that will never get
+		 * released (e.g., in the print path) if we put them into
+		 * reset now, so force them to shutdown gracefully first.
+		 */
+		lock(&active);
+		active.rebooting = 1;
+		unlock(&active);
+		shutdown(0);
+		if(arch->resetothers)
+			arch->resetothers();
+		delay(20);
+	}
+
+	/*
+	 * should be the only processor running now
+	 */
+	active.machs = 0;
+	if (m->machno != 0)
+		print("on cpu%d (not 0)!\n", m->machno);
+
+	print("shutting down...\n");
+	delay(200);
+
+	splhi();
+
+	/* turn off buffered serial console */
+	serialoq = nil;
+
+	/* shutdown devices */
+	chandevshutdown();
+	arch->introff();
+
+	/*
+	 * Modify the machine page table to directly map low memory
+	 * This allows the reboot code to turn off the page mapping
+	 */
+	pdb = m->pdb;
+	/* one page-directory entry covers 4MB; LOWPTEPAGES of them cover MemMin */
+	for (i = 0; i < LOWPTEPAGES; i++)
+		pdb[PDX(i*4*MB)] = pdb[PDX(KZERO + i*4*MB)];
+	mmuflushtlb(PADDR(pdb));
+
+	/* setup reboot trampoline function */
+	f = (void*)REBOOTADDR;
+	memmove(f, rebootcode, sizeof(rebootcode));
+
+	print("rebooting...\n");
+
+	/* off we go - never to return */
+	coherence();
+	(*f)(PADDR(entry), PADDR(code), size);
+}
+
+
+/* shut down as shutdown() describes, re-enable interrupts and hand over to arch->reset() */
+void
+exit(int ispanic)
+{
+	shutdown(ispanic);
+	spllo();
+	arch->reset();
+}
+
+/*
+ * Look up the configuration entry named class followed by ctlrno
+ * (e.g. "ether0") and parse it into *isa.
+ * Returns 0 if there is no such entry, 1 otherwise.
+ * The value is split into isa->opt[]; the options type=, port=, irq=,
+ * dma=, mem=, size= and freq= (matched without regard to case) are
+ * also stored in the corresponding fields.  type defaults to "".
+ */
+int
+isaconfig(char *class, int ctlrno, ISAConf *isa)
+{
+	char key[32], *val, *opt, *end;
+	int n;
+
+	snprint(key, sizeof key, "%s%d", class, ctlrno);
+	val = getconf(key);
+	if(val == nil)
+		return 0;
+
+	isa->type = "";
+	isa->nopt = tokenize(val, isa->opt, NISAOPT);
+	for(n = 0; n < isa->nopt; n++){
+		opt = isa->opt[n];
+		if(cistrncmp(opt, "type=", 5) == 0){
+			isa->type = opt + 5;
+			continue;
+		}
+		if(cistrncmp(opt, "port=", 5) == 0){
+			isa->port = strtoul(opt+5, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "irq=", 4) == 0){
+			isa->irq = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "dma=", 4) == 0){
+			isa->dma = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "mem=", 4) == 0){
+			isa->mem = strtoul(opt+4, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "size=", 5) == 0){
+			isa->size = strtoul(opt+5, &end, 0);
+			continue;
+		}
+		if(cistrncmp(opt, "freq=", 5) == 0)
+			isa->freq = strtoul(opt+5, &end, 0);
+	}
+	return 1;
+}
+
+/*
+ * Compare a and b, treating 'A'-'Z' as 'a'-'z'.
+ * Returns 0 if they are equal, otherwise the difference between the
+ * first pair of (folded) characters that differ.
+ */
+int
+cistrcmp(char *a, char *b)
+{
+	int x, y, d;
+
+	do{
+		x = *a++;
+		y = *b++;
+		if('A' <= x && x <= 'Z')
+			x += 'a' - 'A';
+		if('A' <= y && y <= 'Z')
+			y += 'a' - 'A';
+		d = x - y;
+		if(d != 0)
+			return d;
+	}while(y != 0);		/* equal so far; stop at the shared NUL */
+	return 0;
+}
+
+/*
+ * Like cistrcmp, but looks at no more than the first n characters.
+ * Returns 0 if they match (always, when n <= 0), otherwise the
+ * difference between the first pair of folded characters that differ.
+ */
+int
+cistrncmp(char *a, char *b, int n)
+{
+	unsigned x, y;
+
+	for(; n > 0; n--){
+		x = *a++;
+		y = *b++;
+		if(x >= 'A' && x <= 'Z')
+			x += 'a' - 'A';
+		if(y >= 'A' && y <= 'Z')
+			y += 'a' - 'A';
+		if(x != y)
+			return x - y;
+		if(y == 0)
+			break;
+	}
+	return 0;
+}
+
+int less_power_slower;
+
+/*
+ *  put the processor in the halt state if we've no processes to run.
+ *  an interrupt will get us going again.
+ *  with more than one processor this only halts when less_power_slower
+ *  is set.
+ */
+void
+idlehands(void)
+{
+	/*
+	 * we used to halt only on single-core setups. halting in an smp system 
+	 * can result in a startup latency for processes that become ready.
+	 * if less_power_slower is true, we care more about saving energy
+	 * than reducing this latency.
+	 */
+	if(conf.nmach == 1 || less_power_slower)
+		halt();
+}
+
+/* cut s off at its first newline, if it has one; s is modified in place */
+void
+trimnl(char *s)
+{
+	char *eol;
+
+	eol = strchr(s, '\n');
+	if (eol == nil)
+		return;
+	*eol = '\0';
+}

+ 42 - 0
sys/src/9/pcboot/mboot.s

@@ -0,0 +1,42 @@
+/* included by l16r.s and mbootstart.s */
+
+/*
+ * Must be 4-byte aligned & within 8K of the image's start.
+ */
+/*
+ * Multiboot header.  The flags word has bits 0, 1 and 16 set; in the
+ * Multiboot specification bit 16 is the one that makes the five
+ * address fields that follow the checksum meaningful.
+ * The checksum is chosen so that magic + flags + checksum is zero.
+ * The last two words are not part of the header proper: they are
+ * scratch space for the AX and BX values noted in their comments.
+ */
+TEXT _multibootheader(SB), $0			/* CHECK alignment (4) */
+	LONG	$0x1BADB002			/* magic */
+	LONG	$0x00010003			/* flags */
+	LONG	$-(0x1BADB002 + 0x00010003)	/* checksum */
+	LONG	$_multibootheader-KZERO(SB)	/* header_addr */
+	LONG	$_start32p-KZERO(SB)		/* load_addr */
+	LONG	$edata-KZERO(SB)		/* load_end_addr */
+	LONG	$end-KZERO(SB)			/* bss_end_addr */
+	LONG	$_start32p-KZERO(SB)		/* entry_addr */
+	LONG	$0				/* mode_type */
+	LONG	$0				/* width */
+	LONG	$0				/* height */
+	LONG	$0				/* depth */
+
+	LONG	$0				/* +48: saved AX - magic */
+	LONG	$0				/* +52: saved BX - info* */
+
+/*
+ * There's no way with 8[al] to make this into local data, hence
+ * the TEXT definitions. Also, it should be in the same segment as
+ * the LGDT instruction.
+ * In real mode only 24-bits of the descriptor are loaded so the
+ * -KZERO is superfluous for the usual mappings.
+ * The segments are
+ *	NULL
+ *	DATA 32b 4GB PL0
+ *	EXEC 32b 4GB PL0
+ *	EXEC 16b 4GB PL0
+ */
+TEXT _gdt16r(SB), $0
+	LONG $0x0000; LONG $0	/* NULL */
+	LONG $0xFFFF; LONG $(SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(0)|SEGDATA|SEGW)	/* DATA 32b */
+	LONG $0xFFFF; LONG $(SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR)	/* EXEC 32b */
+	LONG $0xFFFF; LONG $(SEGG     |(0xF<<16)|SEGP|SEGPL(0)|SEGEXEC|SEGR)	/* EXEC 16b: no SEGD */
+/*
+ * Operand for loading GDTR with _gdt16r: 16-bit limit, then base.
+ * The limit is the offset of the table's last valid byte, i.e. one
+ * less than the size of its four 8-byte descriptors.
+ */
+TEXT _gdtptr16r(SB), $0
+	WORD	$(4*8-1)
+	LONG	$_gdt16r-KZERO(SB)

+ 11 - 0
sys/src/9/pcboot/mbootstart.s

@@ -0,0 +1,11 @@
+#include "mem.h"
+#include "/sys/src/boot/pc/x16.h"
+
+/*
+ * Must be 4-byte aligned & within 8K of the image's start.
+ */
+/*
+ * NOTE(review): the three NOPs presumably pad the start of the text so
+ * that the header pulled in from mboot.s lands on the 4-byte boundary
+ * required above -- confirm against the linked image.
+ */
+TEXT _nop(SB), $0
+	NOP
+	NOP
+	NOP
+#include "mboot.s"

+ 213 - 0
sys/src/9/pcboot/mem.h

@@ -0,0 +1,213 @@
+/*
+ * Memory and machine-specific definitions.  Used in C and assembler.
+ */
+
+#define MIN(a, b)	((a) < (b)? (a): (b))
+#define MAX(a, b)	((a) > (b)? (a): (b))
+
+/*
+ * Sizes
+ */
+#define	BI2BY		8			/* bits per byte */
+#define	BI2WD		32			/* bits per word */
+#define	BY2WD		4			/* bytes per word */
+#define	BY2V		8			/* bytes per double word */
+#define	BY2PG		4096			/* bytes per page */
+#define	WD2PG		(BY2PG/BY2WD)		/* words per page */
+#define	BY2XPG		(4096*1024)		/* bytes per big page */
+#define	PGSHIFT		12			/* log(BY2PG) */
+#define	ROUND(s, sz)	(((s)+((sz)-1))&~((sz)-1))
+#define	PGROUND(s)	ROUND(s, BY2PG)
+#define CACHELINESZ	32			/* pentium & later */
+#define	BLOCKALIGN	8
+
+#define	MAXMACH		1			/* max # cpus system can run */
+/*
+ * we use a larger-than-normal kernel stack in the bootstraps as we have
+ * significant arrays (buffers) on the stack.  we typically consume about
+ * 4.5K of stack.
+ */
+#define KSTACK		(4*BY2PG)		/* Size of kernel stack */
+#define MACHSIZE	BY2PG
+
+/*
+ * Time
+ */
+#define	HZ		(100)			/* clock frequency */
+#define	MS2HZ		(1000/HZ)		/* millisec per clock tick */
+#define	TK2SEC(t)	((t)/HZ)		/* ticks to seconds */
+
+/*
+ *  Address spaces
+ *
+ *  Kernel is at 2GB-4GB
+ */
+#define	KZERO		0x80000000		/* base of kernel address space */
+#define	KSEGM		0xF0000000
+#define	VPT		(KZERO-VPTSIZE)
+#define	VPTSIZE		BY2XPG
+#define	NVPT		(VPTSIZE/BY2WD)
+#define	KMAP		(VPT-KMAPSIZE)
+#define	KMAPSIZE	BY2XPG
+#define	VMAP		(KMAP-VMAPSIZE)
+#define	VMAPSIZE	(0x10000000-VPTSIZE-KMAPSIZE)
+#define	UZERO		0			/* base of user address space */
+#define	UTZERO		(UZERO+BY2PG)		/* first address in user text */
+#define	USTKTOP		(VMAP-BY2PG)		/* byte just beyond user stack */
+#define	USTKSIZE	(16*1024*1024)		/* size of user stack */
+#define	TSTKTOP		(USTKTOP-USTKSIZE)	/* end of new stack in sysexec */
+#define	TSTKSIZ 	100			/* pages in new stack; limits exec args */
+
+/*
+ * Fundamental addresses - bottom 64kB saved for return to real mode
+ *
+ * we need some fixed address space for things that normally fit below the
+ * kernel and some of it needs to be addressable from real mode, thus under 1MB.
+ *
+ * pxe loading starts us at 0x7c00 and non-pxe loading starts us at 0x10000.
+ *
+ * this assertion must hold: PDX(TMPADDR) == PDX(MACHADDR).
+ * see realmode0.s for a description of the Chinese puzzle.
+ */
+/* first 1K is real-mode IVT (vectors) */
+/* next 256 bytes are bios data area (bda) */
+/* 0x500 to 0x800 unused [768] */
+/*
+ * RMCODE should be the lowest address used in real mode.
+ * all real-mode buffers, etc. should follow it.
+ */
+#define RMCODE		(KZERO+0x800)	/* copy of initial KTZERO [2K, magic] */
+#define	REBOOTADDR	(RMCODE-KZERO)	/* reboot code - physical address [128] */
+#define RMSIZE		2048
+/* 0x1000 to 0x1200 unused [512] */
+/* CONFADDR must match def'n in kernel being loaded */
+#define	CONFADDR	(KZERO+0x1200)	/* cfg passed to kernel [3.5K, fixed] */
+#define BIOSXCHG	(KZERO+0x2000)	/* BIOS data exchange [2K+32] */
+/* 0x2820 to 0x2900 unused [224] */
+#define	RMUADDR		(KZERO+0x2900)	/* real mode Ureg [128 (76 actual)] */
+/* 0x2980 to 0x3000 unused [1664] */
+#define IDTADDR		(KZERO+0x3000)	/* protected-mode idt [2K] */
+/* 0x3800 to 0x4000 unused [2K] */
+
+/* we only use one processor for bootstrapping, so merge MACHADDR & CPU0MACH */
+#define	MACHADDR	(KZERO+0x4000)	/* as seen by current processor */
+#define	CPU0MACH	MACHADDR	/* Mach for bootstrap processor */
+#define	CPU0GDT		(KZERO+0x5000)	/* bootstrap processor GDT after Mach */
+#define	TMPADDR		(KZERO+0x6000)	/* used for temporary mappings */
+/* 0x7000—0x7800 unused [2K], could be extra real-mode stack */
+#define PXEBASE 	0x7c00		/* pxe loads us here */
+#define RMSTACK 	PXEBASE		/* real phys. stack base [1K below] */
+#define PBSBASE		0x10000		/* pbs loads us here (at 64K) */
+
+/*
+ * we used to use 9boot's `end', rounded up, but that was when the boot loader
+ * was in the first 640K; now end is up around 10MB (at least for 9boot).
+ * various machines nibble away at the top of the lowest 640K.
+ */
+#define BIOSTABLES	(512*1024)
+// #define BIOSTABLES	(600*1024)	/* fails on amd64 */
+// #define BIOSTABLES	0x3800		/* 2K: fails on amd64 */
+
+#define Bootkernaddr	(9*MB)	/* where to put decompressed boot kernel */
+#define Bootkernmax	(4*MB)	/* max size */
+#define Unzipbuf	(13*MB)
+#define Mallocbase	(16*MB)
+/*
+ * MemMin is what the bootstrap code in l*.s has already mapped;
+ * MemMax is the limit of physical memory to scan.
+ */
+#define MemMin		(20*MB)	/* don't have PTEs for more allocated yet */
+#define MemMax		(32*MB)
+
+#define Lowmemsz	(640*KB)
+
+#define LOWPTEPAGES	(MemMin / (4*MB))
+
+#define Kernelmax	(8*MB)	/* max size of real kernel, not an address */
+
+/*
+ *  known x86 segments (in GDT) and their selectors
+ */
+#define	NULLSEG	0	/* null segment */
+#define	KDSEG	1	/* kernel data/stack */
+#define	KESEG	2	/* kernel executable */	
+#define	UDSEG	3	/* user data/stack */
+#define	UESEG	4	/* user executable */
+#define	TSSSEG	5	/* task segment */
+#define	APMCSEG		6	/* APM code segment */
+#define	APMCSEG16	7	/* APM 16-bit code segment */
+#define	APMDSEG		8	/* APM data segment */
+#define	KESEG16		9	/* kernel executable 16-bit */
+#define	NGDT		10	/* number of GDT entries required */
+/* #define	APM40SEG	8	/* APM segment 0x40 */
+
+#define	SELGDT	(0<<2)	/* selector is in gdt */
+#define	SELLDT	(1<<2)	/* selector is in ldt */
+
+#define	SELECTOR(i, t, p)	(((i)<<3) | (t) | (p))
+
+#define	NULLSEL	SELECTOR(NULLSEG, SELGDT, 0)
+#define	KDSEL	SELECTOR(KDSEG, SELGDT, 0)
+#define	KESEL	SELECTOR(KESEG, SELGDT, 0)
+#define	UESEL	SELECTOR(UESEG, SELGDT, 3)
+#define	UDSEL	SELECTOR(UDSEG, SELGDT, 3)
+#define	TSSSEL	SELECTOR(TSSSEG, SELGDT, 0)
+#define	APMCSEL 	SELECTOR(APMCSEG, SELGDT, 0)
+#define	APMCSEL16	SELECTOR(APMCSEG16, SELGDT, 0)
+#define	APMDSEL		SELECTOR(APMDSEG, SELGDT, 0)
+/* #define	APM40SEL	SELECTOR(APM40SEG, SELGDT, 0) */
+
+/*
+ *  fields in segment descriptors
+ *  (bit positions within the second, high-order word of a descriptor,
+ *  which is where the gdt tables in the .s files OR them together)
+ */
+#define	SEGDATA	(0x10<<8)	/* data/stack segment */
+#define	SEGEXEC	(0x18<<8)	/* executable segment */
+#define	SEGTSS	(0x9<<8)	/* TSS segment */
+#define	SEGCG	(0x0C<<8)	/* call gate */
+#define	SEGIG	(0x0E<<8)	/* interrupt gate */
+#define	SEGTG	(0x0F<<8)	/* trap gate */
+#define	SEGTYPE	(0x1F<<8)
+
+#define	SEGP	(1<<15)		/* segment present */
+#define	SEGPL(x) ((x)<<13)	/* descriptor privilege level */
+#define	SEGB	(1<<22)		/* big 1==32-bit stack pointer, 4GB bound (for data/stack) */
+#define	SEGG	(1<<23)		/* granularity 1==limit counted in 4k units */
+#define	SEGE	(1<<10)		/* expand down */
+#define	SEGW	(1<<9)		/* writable (for data/stack) */
+#define	SEGR	(1<<9)		/* readable (for code) */
+#define	SEGD	(1<<22)		/* default 1==32bit (for code) */
+
+/*
+ *  virtual MMU
+ */
+#define	PTEMAPMEM	(1024*1024)	
+#define	PTEPERTAB	(PTEMAPMEM/BY2PG)
+#define	SEGMAPSIZE	1984
+#define	SSEGMAPSIZE	16
+#define	PPN(x)		((x)&~(BY2PG-1))
+
+/*
+ *  physical MMU
+ */
+#define	PTEVALID	(1<<0)
+#define	PTEWT		(1<<3)
+#define	PTEUNCACHED	(1<<4)
+#define	PTEWRITE	(1<<1)
+#define	PTERONLY	(0<<1)
+#define	PTEKERNEL	(0<<2)
+#define	PTEUSER		(1<<2)
+#define	PTESIZE		(1<<7)
+#define	PTEGLOBAL	(1<<8)
+
+/* CR0 */
+#define PG	0x80000000
+
+/*
+ * Macros for calculating offsets within the page directory base
+ * and page tables. 
+ */
+#define	PDX(va)		((((ulong)(va))>>22) & 0x03FF)
+#define	PTX(va)		((((ulong)(va))>>12) & 0x03FF)
+
+#define	getpgcolor(a)	0
+

+ 600 - 0
sys/src/9/pcboot/memory.c

@@ -0,0 +1,600 @@
+/*
+ * Size memory and create the kernel page-tables on the fly while doing so.
+ * Called from main(), this code should only be run by the bootstrap processor.
+ *
+ * MemMin is what the bootstrap code in l.s has already mapped;
+ * MemMax is the limit of physical memory to scan.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "ureg.h"
+
+#define MEMDEBUG	0
+
+enum {
+	MemUPA		= 0,		/* unbacked physical address */
+	MemRAM		= 1,		/* physical memory */
+	MemUMB		= 2,		/* upper memory block (<16MB) */
+	MemReserved	= 3,
+	NMemType	= 4,
+
+	KB		= 1024,
+};
+
+typedef struct Map Map;
+/* one contiguous range of addresses; an entry with size 0 ends a table */
+struct Map {
+	ulong	size;		/* length of the range in bytes */
+	ulong	addr;		/* first address of the range */
+};
+
+typedef struct RMap RMap;
+/* a named, lock-protected table of Map entries kept in address order */
+struct RMap {
+	char*	name;		/* printed by mapprint and in mapfree's warning */
+	Map*	map;		/* first entry of the table */
+	Map*	mapend;		/* last slot; mapfree never stores a new entry at or past it */
+
+	Lock;
+};
+
+/*
+ * Memory allocation tracking.
+ * One table per kind of address space.  Each RMap's mapend points at
+ * the last element of its array, so that slot is never filled and a
+ * zero-size terminator always remains for the scanning loops.
+ */
+static Map mapupa[16];
+static RMap rmapupa = {
+	"unallocated unbacked physical addresses",
+	mapupa,
+	&mapupa[nelem(mapupa)-1],
+};
+
+static Map mapram[16];
+static RMap rmapram = {
+	"physical memory",
+	mapram,
+	&mapram[nelem(mapram)-1],
+};
+
+static Map mapumb[64];
+static RMap rmapumb = {
+	"upper memory block",
+	mapumb,
+	&mapumb[nelem(mapumb)-1],
+};
+
+static Map mapumbrw[16];
+static RMap rmapumbrw = {
+	"UMB device memory",
+	mapumbrw,
+	&mapumbrw[nelem(mapumbrw)-1],
+};
+
+static void map(ulong base, ulong len, int type);
+
+/*
+ * Print the name of rmap, then one line per range in it:
+ * start address, end address and size in bytes.
+ * The table is not locked while it is read.
+ */
+void
+mapprint(RMap *rmap)
+{
+	Map *e;
+
+	print("%s\n", rmap->name);
+	e = rmap->map;
+	while(e->size != 0){
+		print("\t%8.8luX %8.8luX (%lud)\n", e->addr, e->addr+e->size, e->size);
+		e++;
+	}
+}
+
+
+/*
+ * Print three 16-bit values assembled from pairs of nvram bytes,
+ * then dump all four address maps.
+ * NOTE(review): offsets 0x17/0x18, 0x30/0x31 and 0x15/0x16 are
+ * presumably the CMOS memory-size words, counted in KB; the first
+ * is also printed as an address, MB+maxpa*KB -- confirm against
+ * the CMOS layout.
+ */
+void
+memdebug(void)
+{
+	ulong maxpa, maxpa1, maxpa2;
+
+	/* high byte first, then low byte */
+	maxpa = (nvramread(0x18)<<8)|nvramread(0x17);
+	maxpa1 = (nvramread(0x31)<<8)|nvramread(0x30);
+	maxpa2 = (nvramread(0x16)<<8)|nvramread(0x15);
+	print("maxpa = %luX -> %luX, maxpa1 = %luX maxpa2 = %luX\n",
+		maxpa, MB+maxpa*KB, maxpa1, maxpa2);
+
+	mapprint(&rmapram);
+	mapprint(&rmapumb);
+	mapprint(&rmapumbrw);
+	mapprint(&rmapupa);
+}
+
+/*
+ * Give the range [addr, addr+size) back to rmap.
+ * The table is kept in address order and ends with an entry of
+ * size 0.  The range is merged with the entry below and/or above it
+ * when they are adjacent; otherwise a new entry is inserted and the
+ * entries after it are each moved up one slot.  If that would store
+ * at or past rmap->mapend, the range being carried at that point is
+ * reported and dropped.
+ */
+void
+mapfree(RMap* rmap, ulong addr, ulong size)
+{
+	Map *mp;
+	ulong t;
+
+	if(size == 0)		/* size is unsigned, so this is all `<= 0' ever meant */
+		return;
+
+	lock(rmap);
+	/* stop at the first entry above addr, or at the terminator */
+	for(mp = rmap->map; mp->addr <= addr && mp->size; mp++)
+		;
+
+	if(mp > rmap->map && (mp-1)->addr+(mp-1)->size == addr){
+		/* abuts the entry below: extend that entry upwards */
+		(mp-1)->size += size;
+		if(addr+size == mp->addr){
+			/* now also touches the entry above: absorb it, close the gap */
+			(mp-1)->size += mp->size;
+			while(mp->size){
+				mp++;
+				(mp-1)->addr = mp->addr;
+				(mp-1)->size = mp->size;
+			}
+		}
+	}
+	else{
+		if(addr+size == mp->addr && mp->size){
+			/* abuts the entry above only: extend that entry downwards */
+			mp->addr -= size;
+			mp->size += size;
+		}
+		else do{
+			/*
+			 * insert here: swap the carried (addr, size) with each
+			 * entry in turn until the old terminator has been moved.
+			 */
+			if(mp >= rmap->mapend){
+				/* size is a ulong: print it unsigned, as mapprint does */
+				print("mapfree: %s: losing %#luX, %lud\n",
+					rmap->name, addr, size);
+				break;
+			}
+			t = mp->addr;
+			mp->addr = addr;
+			addr = t;
+			t = mp->size;
+			mp->size = size;
+			mp++;
+		}while(size = t);
+	}
+	unlock(rmap);
+}
+
+/*
+ * Take size bytes out of rmap and return the address of the range,
+ * or 0 if the request cannot be met (so 0 can never be returned as
+ * a valid address).
+ * If addr is non-zero only the entry containing [addr, addr+size) is
+ * considered; otherwise the first entry with enough room is used.
+ * If align > 0 the start is rounded up to a multiple of align.
+ * Whatever part of the entry lies below the chosen start is handed
+ * back with mapfree after the lock has been released.
+ */
+ulong
+mapalloc(RMap* rmap, ulong addr, int size, int align)
+{
+	Map *mp;
+	ulong maddr, oaddr;
+
+	lock(rmap);
+	for(mp = rmap->map; mp->size; mp++){
+		maddr = mp->addr;
+
+		if(addr){
+			/*
+			 * A specific address range has been given:
+			 *   if the current map entry is greater then
+			 *   the address is not in the map;
+			 *   if the current map entry does not overlap
+			 *   the beginning of the requested range then
+			 *   continue on to the next map entry;
+			 *   if the current map entry does not entirely
+			 *   contain the requested range then the range
+			 *   is not in the map.
+			 */
+			if(maddr > addr)
+				break;
+			if(mp->size < addr - maddr)	/* maddr+mp->size < addr, but no overflow */
+				continue;
+			if(addr - maddr > mp->size - size)	/* addr+size > maddr+mp->size, but no overflow */
+				break;
+			maddr = addr;
+		}
+
+		if(align > 0)
+			maddr = ((maddr+align-1)/align)*align;
+		/* not enough room left in this entry above maddr */
+		if(mp->addr+mp->size-maddr < size)
+			continue;
+
+		/*
+		 * shrink the entry to what lies above the allocation;
+		 * the part below (oaddr up to maddr) is returned further down.
+		 */
+		oaddr = mp->addr;
+		mp->addr = maddr+size;
+		mp->size -= maddr-oaddr+size;
+		if(mp->size == 0){
+			/* entry used up: slide the following entries down over it */
+			do{
+				mp++;
+				(mp-1)->addr = mp->addr;
+			}while((mp-1)->size = mp->size);
+		}
+
+		unlock(rmap);
+		/* mapfree takes the lock itself, so call it only after unlocking */
+		if(oaddr != maddr)
+			mapfree(rmap, oaddr, maddr-oaddr);
+
+		return maddr;
+	}
+	unlock(rmap);
+
+	return 0;
+}
+
+/*
+ * Take one page-aligned page straight out of the ram map and
+ * return its kernel virtual address, or nil if the map has none.
+ * mmuwalk uses this for page tables before mmuinit has run.
+ */
+void*
+rampage(void)
+{
+	ulong pa;
+
+	pa = mapalloc(&rmapram, 0, BY2PG, BY2PG);
+	if(pa != 0)
+		return KADDR(pa);
+	return nil;
+}
+
+/*
+ * Remove from the UMB map the ranges named by the "umbexclude"
+ * configuration variable, if it is set.  The value is read as a
+ * comma-separated list of start-end pairs, end inclusive, parsed by
+ * strtoul with base 0.  Parsing stops at the first malformed or
+ * empty range.  Note that the commas in the configuration string
+ * are overwritten with NULs as they are consumed.
+ */
+static void
+umbexclude(void)
+{
+	int size;
+	ulong addr;
+	char *op, *p, *rptr;
+
+	if((p = getconf("umbexclude")) == nil)
+		return;
+
+	while(p && *p != '\0' && *p != '\n'){
+		op = p;		/* start of this pair, for the diagnostics */
+		addr = strtoul(p, &rptr, 0);
+		if(rptr == nil || rptr == p || *rptr != '-'){
+			print("umbexclude: invalid argument <%s>\n", op);
+			break;
+		}
+		p = rptr+1;
+
+		/* the end address is inclusive, hence the +1 */
+		size = strtoul(p, &rptr, 0) - addr + 1;
+		if(size <= 0){
+			print("umbexclude: bad range <%s>\n", op);
+			break;
+		}
+		if(rptr != nil && *rptr == ',')
+			*rptr++ = '\0';
+		p = rptr;
+
+		/* claim the range so nothing else is given it; the result is ignored */
+		mapalloc(&rmapumb, addr, size, 0);
+	}
+}
+
+/*
+ * Probe 0xD0000-0xF0000 and record what is found in the UMB maps,
+ * then apply the "umbexclude" configuration variable.
+ */
+static void
+umbscan(void)
+{
+	uchar o[2], *p;
+
+	/*
+	 * Scan the Upper Memory Blocks (0xA0000->0xF0000) for pieces
+	 * which aren't used; they can be used later for devices which
+	 * want to allocate some virtual address space.
+	 * Check for two things:
+	 * 1) device BIOS ROM. This should start with a two-byte header
+	 *    of 0x55 0xAA, followed by a byte giving the size of the ROM
+	 *    in 512-byte chunks. These ROM's must start on a 2KB boundary.
+	 * 2) device memory. This is read-write.
+	 * There are some assumptions: there's VGA memory at 0xA0000 and
+	 * the VGA BIOS ROM is at 0xC0000. Also, if there's no ROM signature
+	 * at 0xE0000 then the whole 64KB up to 0xF0000 is theoretically up
+	 * for grabs; check anyway.
+	 */
+	p = KADDR(0xD0000);
+	while(p < (uchar*)KADDR(0xE0000)){
+		/*
+		 * Check for the ROM signature, skip if valid.
+		 */
+		if(p[0] == 0x55 && p[1] == 0xAA){
+			/*
+			 * A length byte of zero would leave p where it is
+			 * and spin here forever; step over one 2KB unit
+			 * instead.
+			 */
+			if(p[2] == 0)
+				p += 2*KB;
+			else
+				p += p[2]*512;
+			continue;
+		}
+
+		/*
+		 * Is it writeable? If yes, then stick it in
+		 * the UMB device memory map. A floating bus will
+		 * return 0xff, so add that to the map of the
+		 * UMB space available for allocation.
+		 * If it is neither of those, ignore it.
+		 */
+		o[0] = p[0];
+		p[0] = 0xCC;
+		o[1] = p[2*KB-1];
+		p[2*KB-1] = 0xCC;
+		if(p[0] == 0xCC && p[2*KB-1] == 0xCC){
+			/* both writes stuck: put the old contents back */
+			p[0] = o[0];
+			p[2*KB-1] = o[1];
+			mapfree(&rmapumbrw, PADDR(p), 2*KB);
+		}
+		else if(p[0] == 0xFF && p[1] == 0xFF)
+			mapfree(&rmapumb, PADDR(p), 2*KB);
+		p += 2*KB;
+	}
+
+	/*
+	 * The 64KB at 0xE0000 is handled as one piece: it is added to
+	 * the UMB map only if there is no ROM signature and neither
+	 * test write sticks.  The old contents are not saved here.
+	 */
+	p = KADDR(0xE0000);
+	if(p[0] != 0x55 || p[1] != 0xAA){
+		p[0] = 0xCC;
+		p[64*KB-1] = 0xCC;
+		if(p[0] != 0xCC && p[64*KB-1] != 0xCC)
+			mapfree(&rmapumb, PADDR(p), 64*KB);
+	}
+
+	umbexclude();
+}
+
+enum {
+	Pteflags = (1<<12) - 1,		/* low 12 bits of an entry: flags, not address */
+};
+
+/*
+ * Print, with iprint, the physical address of the page directory
+ * pdb and every non-zero entry among its 1024 slots.
+ * Any flag bits in the pointer passed in are stripped first.
+ */
+void
+dumppdb(ulong *pdb)
+{
+	ulong *pp;
+
+	pdb = (ulong *)((uintptr)pdb & ~Pteflags);
+	iprint("pdb at phys %#8.8p:\n", PADDR(pdb));
+	for (pp = pdb; pp < pdb + 1024; pp++)
+		if (*pp)
+			iprint("pdb[%3ld]: %#8.8lux\n", pp - pdb, *pp);
+}
+
+/*
+ * Print, with iprint, the non-zero entries among the first `first'
+ * slots of the page table that pdb[sub] refers to.
+ * Reports "unmapped" instead if that table's physical address is 0.
+ */
+void
+dumppte(ulong *pdb, int sub, int first)
+{
+	ulong *pp, *pte;
+
+	/* convert the entry to a kernel pointer, then drop its flag bits */
+	pte = KADDR(pdb[sub]);
+	pte = (ulong *)((uintptr)pte & ~Pteflags);
+	if (PADDR(pte) == 0) {
+		iprint("pdb[%d] unmapped\n", sub);
+		return;
+	}
+	iprint("pdb[%d] pte at phys %#8.8p:\n", sub, PADDR(pte));
+	for (pp = pte; pp < pte + first; pp++)
+		if (*pp)
+			iprint("pte[%3ld]: %#8.8lux\n", pp - pte, *pp);
+	iprint("...\n");
+}
+
+/*
+ * Look va up in this processor's page tables and return the
+ * page-table entry for it with the flag bits removed.
+ * Neither level is checked for validity.
+ */
+uintptr
+mapping(uintptr va)
+{
+	ulong pde, *ptab;
+
+	pde = m->pdb[PDX(va)];
+	ptab = KADDR(pde & ~Pteflags);
+	return ptab[PTX(va)] & ~Pteflags;
+}
+
+/*
+ * adjust the maps and make the mmu mappings match the maps:
+ * RAM from Mallocbase to MemMax goes into the ram map and is mapped
+ * at KZERO; MemMax up to KZERO goes into the unbacked-address map.
+ */
+static void
+lowraminit(void)
+{
+	/*
+	 * low memory is in use by bootstrap kernels and ROMs.
+	 * MemReserved is untouchable, so use MemRAM.
+	 * address zero is special to mapalloc, and thus to map, so avoid it.
+	 * we can thus load the new kernel directly at 1MB and up.
+	 */
+//	map(BY2PG, MB - BY2PG, MemRAM)	/* executing this map call is fatal */
+	/*
+	 * NOTE(review): nothing visible in this file has added a range
+	 * to rmapram before this point; if so, this call finds nothing,
+	 * returns 0 and changes nothing -- confirm.
+	 */
+	mapalloc(&rmapram, BY2PG, Mallocbase - BY2PG, 0);
+
+	/*
+	 * declare all RAM above Mallocbase to be free.
+	 */
+	map(Mallocbase, MemMax - Mallocbase, MemRAM);
+
+	/* declare rest of physical address space above RAM to be available */
+	map(MemMax, KZERO-MemMax, MemUPA);
+
+	/* force the new mappings to take effect */
+	mmuflushtlb(PADDR(m->pdb));
+}
+
+/*
+ * add region at physical base of len bytes to map for `type', and
+ * set up page tables to map virtual KZERO|base to physical base.
+ * Only MemRAM and MemUMB regions get page-table entries; MemUPA is
+ * only recorded in its map and MemReserved (or an unknown type) is
+ * ignored altogether.
+ */
+static void
+map(ulong base, ulong len, int type)
+{
+	ulong n, flags, maxkpa;
+	
+//	iprint("map %.8lux %.8lux %d (", base, base+len, type);
+	/*
+	 * Split any call crossing MemMin to make below simpler.
+	 */
+	if(base < MemMin && len > MemMin-base){
+		n = MemMin - base;
+		map(base, n, type);
+		map(MemMin, len-n, type);
+		return;
+	}
+	
+	/* record the region in the map for its type and pick the pte flags */
+	switch(type){
+	case MemRAM:
+		mapfree(&rmapram, base, len);
+		flags = PTEWRITE|PTEVALID;
+		break;
+	case MemUMB:
+		mapfree(&rmapumb, base, len);
+		flags = PTEWRITE|PTEUNCACHED|PTEVALID;
+		break;
+	case MemUPA:
+		mapfree(&rmapupa, base, len);
+		flags = 0;
+		break;
+	default:
+	case MemReserved:
+		flags = 0;
+		break;
+	}
+
+	/*
+	 * Only map from KZERO to 2^32.
+	 */
+	if(flags){
+		/* -KZERO is the number of bytes between KZERO and 2^32 */
+		maxkpa = -KZERO;
+		if(base >= maxkpa)
+			return;
+		if(len > maxkpa-base)
+			len = maxkpa - base;
+		pdbmap(m->pdb, base|flags, base+KZERO, len);
+	}
+}
+
+/*
+ * Set up memory for the boot loader without sizing it:
+ * point the page directory at the low page tables, both at 0 and
+ * at KZERO; set cache attributes for 640KB-1MB; scan the upper
+ * memory blocks; fill in the ram and unbacked-address maps; and
+ * copy the ram map into conf.mem.
+ */
+void
+meminit(void)
+{
+	int i, kzsub;
+	Map *mp;
+	Confmem *cm;
+	ulong pa, *pte;
+	ulong lost, physpte;
+
+	/* no need to size memory, we don't need much. */
+	/* the page tables start one page past the start of the pdb */
+	pte = m->pdb + BY2PG/BY2WD;		/* see l*.s */
+
+	/* populate pdb with double-mapping of low memory */
+	/* NOTE(review): kzsub equals PDX(KZERO) if PGSHIFT is 12 -- confirm */
+	kzsub = ((uintptr)KZERO >> (2*PGSHIFT - 4)) / sizeof(ulong);
+	physpte = (uintptr)PADDR(pte);
+	for (i = 0; i < LOWPTEPAGES; i++)
+		m->pdb[kzsub + i] = m->pdb[i] =
+			PTEVALID | PTEKERNEL | PTEWRITE | (physpte + i * BY2PG);
+
+	/*
+	 * Set special attributes for memory between 640KB and 1MB:
+	 *   VGA memory is writethrough;
+	 *   BIOS ROM's/UMB's are uncached;
+	 * then scan for useful memory.
+	 */
+	for(pa = 0xA0000; pa < 0xC0000; pa += BY2PG){
+		pte = mmuwalk(m->pdb, (ulong)KADDR(pa), 2, 0);
+		*pte |= PTEWT;
+	}
+	for(pa = 0xC0000; pa < 0x100000; pa += BY2PG){
+		pte = mmuwalk(m->pdb, (ulong)KADDR(pa), 2, 0);
+		*pte |= PTEUNCACHED;
+	}
+	mmuflushtlb(PADDR(m->pdb));
+
+	umbscan();
+	lowraminit();
+
+	/*
+	 * Set the conf entries describing banks of allocatable memory.
+	 */
+	for(i=0; i<nelem(mapram) && i<nelem(conf.mem); i++){
+		mp = &rmapram.map[i];
+		cm = &conf.mem[i];
+		cm->base = mp->addr;
+		cm->npage = mp->size/BY2PG;
+		if (i == 0 && cm->npage == 0)
+			panic("meminit: no memory in conf.mem");
+	}
+	/* ram map entries that did not fit in conf.mem are not used */
+	lost = 0;
+	for(; i<nelem(mapram); i++)
+		lost += rmapram.map[i].size;
+	if(lost)
+		print("meminit - lost %lud bytes\n", lost);
+
+	if(MEMDEBUG)
+		memdebug();
+}
+
+/*
+ * Allocate size bytes from the upper memory block map, at addr if
+ * addr is non-zero, and return the kernel virtual address of the
+ * range, or 0 if the map cannot satisfy the request.
+ */
+ulong
+umbmalloc(ulong addr, int size, int align)
+{
+	ulong pa;
+
+	pa = mapalloc(&rmapumb, addr, size, align);
+	if(pa == 0)
+		return 0;
+	return (ulong)KADDR(pa);
+}
+
+/*
+ * Return to the upper memory block map the size bytes at kernel
+ * virtual address addr.
+ */
+void
+umbfree(ulong addr, int size)
+{
+	ulong pa;
+
+	pa = PADDR(addr);
+	mapfree(&rmapumb, pa, size);
+}
+
+/*
+ * Allocate size bytes of read-write UMB device memory and return
+ * its kernel virtual address, or 0 on failure.
+ * The UMB device memory map is tried first.  Failing that, the
+ * range is taken from the ordinary UMB map and accepted only if
+ * test writes to its first and last bytes read back correctly.
+ */
+ulong
+umbrwmalloc(ulong addr, int size, int align)
+{
+	ulong a;
+	uchar o[2], *p;
+
+	if(a = mapalloc(&rmapumbrw, addr, size, align))
+		return(ulong)KADDR(a);
+
+	/*
+	 * Perhaps the memory wasn't visible before
+	 * the interface is initialised, so try again.
+	 */
+	if((a = umbmalloc(addr, size, align)) == 0)
+		return 0;
+	/* a is already a kernel virtual address here */
+	p = (uchar*)a;
+	o[0] = p[0];
+	p[0] = 0xCC;
+	o[1] = p[size-1];
+	p[size-1] = 0xCC;
+	if(p[0] == 0xCC && p[size-1] == 0xCC){
+		/* writable: restore the old contents and keep the range */
+		p[0] = o[0];
+		p[size-1] = o[1];
+		return a;
+	}
+	/* not writable: give the range back to the UMB map */
+	umbfree(a, size);
+
+	return 0;
+}
+
+/*
+ * Return to the UMB device memory map the size bytes at kernel
+ * virtual address addr.
+ */
+void
+umbrwfree(ulong addr, int size)
+{
+	ulong pa;
+
+	pa = PADDR(addr);
+	mapfree(&rmapumbrw, pa, size);
+}
+
+/*
+ * Hand out physical address space that nothing else is using,
+ * for configuring devices.  Returns the physical address, or 0
+ * after printing a complaint and the state of the map.
+ * Unlike the older upamalloc, upaalloc does not map the range
+ * into virtual memory; call vmap for that.
+ */
+ulong
+upaalloc(int size, int align)
+{
+	ulong pa;
+
+	pa = mapalloc(&rmapupa, 0, size, align);
+	if(pa != 0)
+		return pa;
+
+	print("out of physical address space allocating %d\n", size);
+	mapprint(&rmapupa);
+	return 0;
+}
+
+/*
+ * Return size bytes of physical address space at pa to the
+ * unbacked-address map.
+ */
+void
+upafree(ulong pa, int size)
+{
+	mapfree(&rmapupa, pa, size);
+}
+
+/*
+ * Try to remove [pa, pa+size) from the unbacked-address map so
+ * that upaalloc will not hand it out.  Failure is silent: it can
+ * happen when we're using the E820 map, which might have already
+ * reserved some of the regions claimed by the pci devices.
+ */
+void
+upareserve(ulong pa, int size)
+{
+	ulong got;
+
+	got = mapalloc(&rmapupa, pa, size, 0);
+	if(got == pa || got == 0)
+		return;
+
+	/* we were given some other range: hand it straight back */
+	mapfree(&rmapupa, got, size);
+}
+
+/*
+ * Print the memory maps; simply calls memdebug.
+ */
+void
+memorysummary(void)
+{
+	memdebug();
+}
+

+ 144 - 0
sys/src/9/pcboot/mkfile

@@ -0,0 +1,144 @@
+# build pxe loaders (from pxe + pbs) and disk loaders (disk + usb)
+objtype=386
+</$objtype/mkfile
+
+TARG=\
+	9boot\
+	9bootpbs\
+	9load\
+	9loadusb\
+
+BIN=/386
+
+# values passed as KTZERO to the sub-mk's below:
+# PXEBASE for the pxe-loaded flavours, PBSBASE for those loaded by pbs
+PXEBASE=0x80007c00
+PBSBASE=0x80010000
+# assume we've been decompressed into place by our header.
+# BOOTBASE must match expand.c's Kerneladdr|KZERO; +0x20 allows for a.out header
+BOOTBASE=0x80900020
+# names handed to 9fs by 9boot.install; each gets a copy under /n/name/$objtype
+EXTRACOPIES=
+
+SFX=''				# defaults
+BASE=boot
+
+# remember the caller's NPROC so the sub-mk's can be given it back
+ONPROC=$NPROC
+NPROC=1				# don't build all boot flavours at once
+
+%.$O:	%.s
+	$AS $AFLAGS $stem.s
+
+%.$O:	%.c
+	$CC $CFLAGS $stem.c
+
+all:V: $TARG
+
+install:V: 9boot.install 9bootpbs.install 9load.install 9loadusb.install
+
+pcclean pclean:V:
+	@ { cd ../pc && mk clean }
+clean:V:
+	rm -f 8.* bootmain bootmain.c
+	@ {
+		rfork ne; bind bootmkfile mkfile
+		CONF=bootpbs SFX=pbs mk clean
+		BASE=load CONF=loadusb mk clean
+		BASE=load CONF=loadusb SFX=pxe mk clean
+		for (BASE in boot load)
+			CONF=$BASE mk clean
+	}
+
+%.calls:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PXEBASE CONF=boot mk $target }
+
+#
+# pxe-loaded pxe loader
+#
+9bootmain 9bootmaindebug:V:
+	cp boot bootmain
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$BOOTBASE CONF=bootmain NPROC=$ONPROC MBOOT=mbootstart.$O
+	mk all }
+
+# creating $O.expand also creates $O.expanddebug, out of mk's sight
+9boot:D: $O.expand 9bootmaindebug
+	cp 9bootmaindebug 9bootdebug		# for 9boot.install
+	{ cat $O.expand; strip -o /fd/1 9bootmaindebug | gzip -9 } >$target
+
+# creating 9boot also creates 9bootdebug, out of mk's sight
+9boot.install:V: 9boot
+	cp 9boot 9bootdebug /$objtype/ &
+	for(i in $EXTRACOPIES)
+		{ 9fs $i && cp $prereq /n/$i/$objtype && echo -n $i... & }
+	wait
+	echo
+
+#
+# disk-resident pxe loader, loaded by pbs
+#
+9bootpbs:V:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE CONF=bootpbs SFX=pbs
+	NPROC=$ONPROC START=l16r.$O; mk all }
+
+9bootpbs.install:V: 9bootpbs
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE CONF=bootpbs SFX=pbs
+	NPROC=$ONPROC START=l16r.$O; mk install }
+
+#
+# disk-resident disk loaders, loaded by pbs
+#
+9load:V:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE BASE=load CONF=load
+	NPROC=$ONPROC START=l16r.$O; mk all }
+
+9load.install:V: 9load
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE BASE=load CONF=load
+	NPROC=$ONPROC START=l16r.$O; mk install }
+
+9loadusb:V:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE BASE=load CONF=loadusb
+	NPROC=$ONPROC START=l16r.$O; mk all }
+
+9loadusb.install:V: 9loadusb
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PBSBASE BASE=load CONF=loadusb
+	NPROC=$ONPROC START=l16r.$O; mk install }
+
+#
+# pxe-loaded disk loaders for convenient debugging only
+#
+9loadpxe:V:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PXEBASE BASE=load CONF=loadpxe SFX=pxe
+	NPROC=$ONPROC START=l16r.$O; mk all }
+
+9loadpxe.install:V: 9loadpxe
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PXEBASE BASE=load CONF=loadpxe SFX=pxe
+	NPROC=$ONPROC START=l16r.$O; mk install }
+
+9loadusbpxe:V:
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PXEBASE BASE=load CONF=loadusbpxe SFX=pxe
+	NPROC=$ONPROC START=l16r.$O; mk all }
+
+9loadusbpxe.install:V: 9loadusbpxe
+	@ { rfork ne; bind bootmkfile mkfile
+	KTZERO=$PXEBASE BASE=load CONF=loadusbpxe SFX=pxe
+	NPROC=$ONPROC START=l16r.$O; mk install }
+
+#
+# 8.expand: the decompressing header for 9boot
+#
+# for pbs, 0x10000 (64K), for pxe, 0x7c00 (31K)
+LOADADDR=0x7c00
+
+cga.tiny.$O expand.$O: expand.h inflate.guts.c
+
+$O.expand: ldecomp.$O cga.tiny.$O expand.$O
+	$LD -o $target^debug -R1 -T$LOADADDR $prereq
+	$LD -o $target   -H3 -R1 -T$LOADADDR $prereq

+ 995 - 0
sys/src/9/pcboot/mmu.c

@@ -0,0 +1,995 @@
+/*
+ * Memory mappings.  Life was easier when 2G of memory was enough.
+ *
+ * The kernel memory starts at KZERO, with the text loaded at KZERO+1M
+ * (9load sits under 1M during the load).  The memory from KZERO to the
+ * top of memory is mapped 1-1 with physical memory, starting at physical
+ * address 0.  All kernel memory and data structures (i.e., the entries stored
+ * into conf.mem) must sit in this physical range: if KZERO is at 0xF0000000,
+ * then the kernel can only have 256MB of memory for itself.
+ * 
+ * The 256M below KZERO comprises three parts.  The lowest 4M is the
+ * virtual page table, a virtual address representation of the current 
+ * page table tree.  The second 4M is used for temporary per-process
+ * mappings managed by kmap and kunmap.  The remaining 248M is used
+ * for global (shared by all procs and all processors) device memory
+ * mappings and managed by vmap and vunmap.  The total amount (256M)
+ * could probably be reduced somewhat if desired.  The largest device
+ * mapping is that of the video card, and even though modern video cards
+ * have embarrassing amounts of memory, the video drivers only use one
+ * frame buffer worth (at most 16M).  Each is described in more detail below.
+ *
+ * The VPT is a 4M frame constructed by inserting the pdb into itself.
+ * This short-circuits one level of the page tables, with the result that 
+ * the contents of second-level page tables can be accessed at VPT.  
+ * We use the VPT to edit the page tables (see mmu) after inserting them
+ * into the page directory.  It is a convenient mechanism for mapping what
+ * might be otherwise-inaccessible pages.  The idea was borrowed from
+ * the Exokernel.
+ *
+ * The VPT doesn't solve all our problems, because we still need to 
+ * prepare page directories before we can install them.  For that, we
+ * use tmpmap/tmpunmap, which map a single page at TMPADDR.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+
+/*
+ * Simple segment descriptors with no translation.
+ * Each macro expands to an initializer for the two words of a
+ * Segdesc.  The first three leave the base at 0 and set every limit
+ * bit together with SEGG; p is the privilege level.
+ * EXEC16SEGM is EXECSEGM without SEGD, i.e. 16-bit code.
+ * TSSSEGM describes a Tss at address b, with sizeof(Tss) in the
+ * limit field.
+ */
+#define	DATASEGM(p) 	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
+#define	EXECSEGM(p) 	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
+#define	EXEC16SEGM(p) 	{ 0xFFFF, SEGG|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
+#define	TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
+			  ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }
+
+void realmodeintrinst(void);
+void _stop32pg(void);
+
+/*
+ * Prototype global descriptor table.  mmuinit0 and mmuinit copy it
+ * into m->gdt; mmuinit then rewrites the TSS descriptor, which is
+ * only a placeholder (base 0) here.
+ */
+Segdesc gdt[NGDT] =
+{
+[NULLSEG]	{ 0, 0},		/* null descriptor */
+[KDSEG]		DATASEGM(0),		/* kernel data/stack */
+[KESEG]		EXECSEGM(0),		/* kernel code */
+[UDSEG]		DATASEGM(3),		/* user data/stack */
+[UESEG]		EXECSEGM(3),		/* user code */
+[TSSSEG]	TSSSEGM(0,0),		/* tss segment */
+[KESEG16]		EXEC16SEGM(0),	/* kernel code 16-bit */
+};
+
+static int didmmuinit;
+static void taskswitch(ulong, ulong);
+static void memglobal(void);
+
+/*
+ * vpt is the virtual page table at VPT, viewed as one flat array of
+ * page-table entries indexed by virtual page number (VPTX).
+ * vpd is the part of that array which covers VPT itself: since the
+ * pdb is inserted into itself there, vpd[i] is pdb entry i.
+ */
+#define	vpt ((ulong*)VPT)
+#define	VPTX(va)		(((ulong)(va))>>12)
+#define	vpd (vpt+VPTX(VPT))
+
+/*
+ * Copy the prototype GDT into this processor's Mach structure.
+ */
+void
+mmuinit0(void)
+{
+	memmove(m->gdt, gdt, sizeof gdt);
+}
+
+/*
+ * Finish setting up this processor's mmu state: mark kernel
+ * mappings global, install the VPT entry in the pdb, allocate and
+ * describe the TSS, load the GDT and IDT registers, then load CR3
+ * and the task register.  Panics if the TSS cannot be allocated.
+ */
+void
+mmuinit(void)
+{
+	ulong x, *p;
+	ushort ptr[3];
+
+	/* from here on mmuwalk takes page-table pages from xspanalloc */
+	didmmuinit = 1;
+
+	if(0) print("vpt=%#.8ux vpd=%#p kmap=%#.8ux\n",
+		VPT, vpd, KMAP);
+
+	memglobal();
+	/* insert the pdb into itself to create the VPT */
+	m->pdb[PDX(VPT)] = PADDR(m->pdb)|PTEWRITE|PTEVALID;
+	
+	m->tss = malloc(sizeof(Tss));
+	if(m->tss == nil)
+		panic("mmuinit: no memory");
+	memset(m->tss, 0, sizeof(Tss));
+	m->tss->iomap = 0xDFFF<<16;
+
+	/*
+	 * We used to keep the GDT in the Mach structure, but it
+	 * turns out that that slows down access to the rest of the
+	 * page.  Since the Mach structure is accessed quite often,
+	 * it pays off anywhere from a factor of 1.25 to 2 on real
+	 * hardware to separate them (the AMDs are more sensitive
+	 * than Intels in this regard).  Under VMware it pays off
+	 * a factor of about 10 to 100.
+	 */
+	memmove(m->gdt, gdt, sizeof gdt);
+	/* point the TSS descriptor at the Tss just allocated */
+	x = (ulong)m->tss;
+	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
+	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;
+
+	/* ptr is a 16-bit limit followed by a 32-bit address in two halves */
+	ptr[0] = sizeof(gdt)-1;
+	x = (ulong)m->gdt;
+	ptr[1] = x & 0xFFFF;
+	ptr[2] = (x>>16) & 0xFFFF;
+	lgdt(ptr);
+
+	ptr[0] = sizeof(Segdesc)*256-1;
+	x = IDTADDR;
+	ptr[1] = x & 0xFFFF;
+	ptr[2] = (x>>16) & 0xFFFF;
+	lidt(ptr);
+
+	/*
+	 * this kills 9load but not 9boot.  9load dies at the taskswitch.
+	 * should track down exactly why some day.
+	 */
+	/* make most kernel text unwritable */
+if(0)	for(x = PGROUND((ulong)_stop32pg); x < (ulong)etext; x += BY2PG){
+		/*
+		 * leave the page holding realmodeintrinst writable.
+		 * == binds tighter than &, so the mask must be
+		 * parenthesised or the test can never be true.
+		 */
+		if (x == ((ulong)realmodeintrinst & ~(BY2PG-1)))
+			continue;
+		p = mmuwalk(m->pdb, x, 2, 0);
+		if(p == nil)
+			panic("mmuinit");
+		*p &= ~PTEWRITE;
+	}
+
+	taskswitch(PADDR(m->pdb), (ulong)m + MACHSIZE);
+	ltr(TSSSEL);
+}
+
+/* 
+ * On processors that support it, we set the PTEGLOBAL bit in
+ * page table and page directory entries that map kernel memory.
+ * Doing this tells the processor not to bother flushing them
+ * from the TLB when doing the TLB flush associated with a 
+ * context switch (write to CR3).  Since kernel memory mappings
+ * are never removed, this is safe.  (If we ever remove kernel memory
+ * mappings, we can do a full flush by turning off the PGE bit in CR4,
+ * writing to CR3, and then turning the PGE bit back on.) 
+ *
+ * See also mmukmap below.
+ * 
+ * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
+ */
+static void
+memglobal(void)
+{
+	int pdx, ptx;
+	ulong *pdb, *ptab;
+
+	/* only need to do this once, on bootstrap processor */
+	if(m->machno != 0 || !m->havepge)
+		return;
+
+	pdb = m->pdb;
+	for(pdx = PDX(KZERO); pdx < 1024; pdx++){
+		if(!(pdb[pdx] & PTEVALID))
+			continue;
+		pdb[pdx] |= PTEGLOBAL;
+
+		/* a large-page entry has no page table underneath it */
+		if(pdb[pdx] & PTESIZE)
+			continue;
+		ptab = KADDR(pdb[pdx]&~(BY2PG-1));
+		for(ptx = 0; ptx < 1024; ptx++){
+			if(ptab[ptx] & PTEVALID)
+				ptab[ptx] |= PTEGLOBAL;
+		}
+	}
+}
+
+/*
+ * Something has been deleted from the current process's address
+ * space: throw away all its user-space and device-mapping mmu
+ * state and switch to what is left.  The mappings still needed
+ * will be paged back in on demand.
+ */
+void
+flushmmu(void)
+{
+	int pri;
+
+	pri = splhi();
+	up->newtlb = 1;		/* makes mmuswitch call mmuptefree */
+	mmuswitch(up);
+	splx(pri);
+}
+
+/*
+ * Remove the mapping for the single page at va from the tlb.
+ * Processors of family 3 or below get CR3 reloaded instead of
+ * an invlpg.
+ */
+void
+flushpg(ulong va)
+{
+	if(X86FAMILY(m->cpuidax) < 4){
+		putcr3(getcr3());
+		return;
+	}
+	invlpg(va);
+}
+	
+/*
+ * Allocate a new page for a page directory. 
+ * We keep a small cache of pre-initialized
+ * page directories in each mach.
+ * A page from the cache is returned as it is; a fresh page is
+ * filled with a copy of m->pdb and given its own VPT entry.
+ */
+static Page*
+mmupdballoc(void)
+{
+	int s;
+	Page *page;
+	ulong *pdb;
+
+	s = splhi();
+	m->pdballoc++;
+	if(m->pdbpool == 0){
+		/* cache empty: interrupts are enabled around the newpage call */
+		spllo();
+		page = newpage(0, 0, 0);
+		page->va = (ulong)vpd;
+		splhi();
+		pdb = tmpmap(page);
+		memmove(pdb, m->pdb, BY2PG);
+		pdb[PDX(VPT)] = page->pa|PTEWRITE|PTEVALID;	/* set up VPT */
+		tmpunmap(pdb);
+	}else{
+		/* take the first page off the cache list */
+		page = m->pdbpool;
+		m->pdbpool = page->next;
+		m->pdbcnt--;
+	}
+	splx(s);
+	return page;
+}
+
+/*
+ * Dispose of the page directory page p.  It goes into this
+ * processor's cache of page directories while the cache holds
+ * fewer than 10, and onto proc's mmufree list otherwise.
+ * Must be called with interrupts off.
+ */
+static void
+mmupdbfree(Proc *proc, Page *p)
+{
+	if(islo())
+		panic("mmupdbfree: islo");
+	m->pdbfree++;
+
+	if(m->pdbcnt < 10){
+		p->next = m->pdbpool;
+		m->pdbpool = p;
+		m->pdbcnt++;
+		return;
+	}
+	p->next = proc->mmufree;
+	proc->mmufree = p;
+}
+
+/*
+ * A user-space memory segment has been deleted, or the
+ * process is exiting.  Clear all the pde entries for user-space
+ * memory mappings and device mappings.  Any entries that
+ * are needed will be paged back in as necessary.
+ * The page-table pages on proc->mmuused are then moved to the
+ * front of proc->mmufree.
+ */
+static void
+mmuptefree(Proc* proc)
+{
+	int s;
+	ulong *pdb;
+	Page **last, *page;
+
+	if(proc->mmupdb == nil || proc->mmuused == nil)
+		return;
+	s = splhi();
+	pdb = tmpmap(proc->mmupdb);
+	last = &proc->mmuused;
+	for(page = *last; page; page = page->next){
+		/* daddr is the pdb index stored for this page by putmmu */
+		pdb[page->daddr] = 0;
+		last = &page->next;
+	}
+	tmpunmap(pdb);
+	splx(s);
+	/* last is now the tail link of the used list: splice the free list on */
+	*last = proc->mmufree;
+	proc->mmufree = proc->mmuused;
+	proc->mmuused = 0;
+}
+
+/*
+ * Record stack as the stack pointer, and KDSEL as the stack
+ * segment, for privilege levels 0, 1 and 2 in this processor's
+ * TSS, then load CR3 with the physical address pdb.
+ */
+static void
+taskswitch(ulong pdb, ulong stack)
+{
+	Tss *t;
+
+	t = m->tss;
+	t->ss0 = t->ss1 = t->ss2 = KDSEL;
+	t->esp0 = t->esp1 = t->esp2 = stack;
+	putcr3(pdb);
+}
+
+/*
+ * Make proc's address space the current one on this processor.
+ * If proc->newtlb is set its user page tables are discarded first.
+ * A process without a page directory of its own runs on the
+ * processor's prototype, m->pdb.
+ */
+void
+mmuswitch(Proc* proc)
+{
+	ulong *pdb, ksp;
+
+	if(proc->newtlb){
+		mmuptefree(proc);
+		proc->newtlb = 0;
+	}
+
+	ksp = (ulong)(proc->kstack+KSTACK);
+	if(proc->mmupdb == nil){
+		taskswitch(PADDR(m->pdb), ksp);
+		return;
+	}
+
+	/* give the process's pdb this processor's MACHADDR entry */
+	pdb = tmpmap(proc->mmupdb);
+	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
+	tmpunmap(pdb);
+	taskswitch(proc->mmupdb->pa, ksp);
+}
+
+/*
+ * Release any pages allocated for a page directory base or page-tables
+ * for this process:
+ *   switch to the prototype pdb for this processor (m->pdb);
+ *   call mmuptefree() to place all pages used for page-tables (proc->mmuused)
+ *   onto the process' free list (proc->mmufree). This has the side-effect of
+ *   cleaning any user entries in the pdb (proc->mmupdb);
+ *   if there's a pdb put it in the cache of pre-initialised pdb's
+ *   for this processor (m->pdbpool) or on the process' free list;
+ *   finally, place any pages freed back into the free pool (palloc).
+ * This routine is only called from schedinit() with palloc locked.
+ * It panics if interrupts are enabled or if the kmap table is
+ * still in use or inconsistent.
+ */
+void
+mmurelease(Proc* proc)
+{
+	Page *page, *next;
+	ulong *pdb;
+
+	if(islo())
+		panic("mmurelease: islo");
+	/* NOTE(review): mmuinit uses (ulong)m + MACHSIZE for this stack; confirm BY2PG is intended */
+	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
+	if(proc->kmaptable){
+		if(proc->mmupdb == nil)
+			panic("mmurelease: no mmupdb");
+		if(--proc->kmaptable->ref)
+			panic("mmurelease: kmap ref %d", proc->kmaptable->ref);
+		if(proc->nkmap)
+			panic("mmurelease: nkmap %d", proc->nkmap);
+		/*
+		 * remove kmaptable from pdb before putting pdb up for reuse.
+		 */
+		pdb = tmpmap(proc->mmupdb);
+		if(PPN(pdb[PDX(KMAP)]) != proc->kmaptable->pa)
+			panic("mmurelease: bad kmap pde %#.8lux kmap %#.8lux",
+				pdb[PDX(KMAP)], proc->kmaptable->pa);
+		pdb[PDX(KMAP)] = 0;
+		tmpunmap(pdb);
+		/*
+		 * move kmaptable to free list.
+		 */
+		pagechainhead(proc->kmaptable);
+		proc->kmaptable = 0;
+	}
+	if(proc->mmupdb){
+		mmuptefree(proc);
+		mmupdbfree(proc, proc->mmupdb);
+		proc->mmupdb = 0;
+	}
+	/* every page on the free list must hold exactly one reference */
+	for(page = proc->mmufree; page; page = next){
+		next = page->next;
+		if(--page->ref)
+			panic("mmurelease: page->ref %d", page->ref);
+		pagechainhead(page);
+	}
+	/* pages went back to palloc: wake whoever is waiting on palloc.r */
+	if(proc->mmufree && palloc.r.p)
+		wakeup(&palloc.r);
+	proc->mmufree = 0;
+}
+
+/*
+ * Allocate and install pdb for the current process.
+ * Does nothing if the process already has one.
+ */
+static void
+upallocpdb(void)
+{
+	int s;
+	ulong *pdb;
+	Page *page;
+	
+	if(up->mmupdb != nil)
+		return;
+	page = mmupdballoc();
+	s = splhi();
+	if(up->mmupdb != nil){
+		/*
+		 * Perhaps we got an interrupt while
+		 * mmupdballoc was sleeping and that
+		 * interrupt allocated an mmupdb?
+		 * Seems unlikely.
+		 */
+		mmupdbfree(up, page);
+		splx(s);
+		return;
+	}
+	/* give the new pdb this processor's MACHADDR entry, as mmuswitch does */
+	pdb = tmpmap(page);
+	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
+	tmpunmap(pdb);
+	up->mmupdb = page;
+	/* start using the new page directory straight away */
+	putcr3(up->mmupdb->pa);
+	splx(s);
+}
+
+/*
+ * Update the mmu in response to a user fault.  pa may have PTEWRITE set.
+ * Installs the mapping va -> pa in the current process's page
+ * tables, first allocating a page directory and a page-table page
+ * if they are missing.  The third argument is unused.
+ */
+void
+putmmu(ulong va, ulong pa, Page*)
+{
+	int old, s;
+	Page *page;
+
+	if(up->mmupdb == nil)
+		upallocpdb();
+
+	/*
+	 * We should be able to get through this with interrupts
+	 * turned on (if we get interrupted we'll just pick up 
+	 * where we left off) but we get many faults accessing
+	 * vpt[] near the end of this function, and they always happen
+	 * after the process has been switched out and then 
+	 * switched back, usually many times in a row (perhaps
+	 * it cannot switch back successfully for some reason).
+	 * 
+	 * In any event, I'm tired of searching for this bug.  
+	 * Turn off interrupts during putmmu even though
+	 * we shouldn't need to.		- rsc
+	 */
+	
+	s = splhi();
+	if(!(vpd[PDX(va)]&PTEVALID)){
+		/* no page table covers va yet: reuse a freed one or get a new page */
+		if(up->mmufree == 0){
+			spllo();
+			page = newpage(0, 0, 0);
+			splhi();
+		}
+		else{
+			page = up->mmufree;
+			up->mmufree = page->next;
+		}
+		vpd[PDX(va)] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
+		/* page is now mapped into the VPT - clear it */
+		memset((void*)(VPT+PDX(va)*BY2PG), 0, BY2PG);
+		/* remember the pdb slot so mmuptefree can clear it later */
+		page->daddr = PDX(va);
+		page->next = up->mmuused;
+		up->mmuused = page;
+	}
+	old = vpt[VPTX(va)];
+	vpt[VPTX(va)] = pa|PTEUSER|PTEVALID;
+	/* a valid entry was replaced, so the tlb may hold the old one */
+	if(old&PTEVALID)
+		flushpg(va);
+	if(getcr3() != up->mmupdb->pa)
+		print("bad cr3 %#.8lux %#.8lux\n", getcr3(), up->mmupdb->pa);
+	splx(s);
+}
+
+/*
+ * Consistency check of the user MMU, for error reporting only:
+ * if va is mapped in the current process's page tables but to a
+ * page other than pa, print the mismatch.  Nothing is changed.
+ */
+void
+checkmmu(ulong va, ulong pa)
+{
+	ulong pte;
+
+	if(up->mmupdb == 0)
+		return;
+	if(!(vpd[PDX(va)]&PTEVALID))
+		return;
+
+	pte = vpt[VPTX(va)];
+	if(!(pte&PTEVALID))
+		return;
+	if(PPN(pte) != pa)
+		print("%ld %s: va=%#08lux pa=%#08lux pte=%#08lux\n",
+			up->pid, up->text,
+			va, pa, pte);
+}
+
+/*
+ * Walk the page-table pointed to by pdb and return a pointer
+ * to the entry for virtual address va at the requested level.
+ * If the entry is invalid and create isn't requested then bail
+ * out early. Otherwise, for the 2nd level walk, allocate a new
+ * page-table page and register it in the 1st level.  This is used
+ * only to edit kernel mappings, which use pages from kernel memory,
+ * so it's okay to use KADDR to look at the tables.
+ *
+ * Level 1 returns the pdb entry itself, level 2 the page-table
+ * entry; any other level returns 0.  A level 2 walk panics if the
+ * pdb entry has PTESIZE set or if no page-table page can be had.
+ */
+ulong*
+mmuwalk(ulong* pdb, ulong va, int level, int create)
+{
+	ulong *table;
+	void *map;
+
+	table = &pdb[PDX(va)];
+	if(!(*table & PTEVALID) && create == 0)
+		return 0;
+
+	switch(level){
+
+	default:
+		return 0;
+
+	case 1:
+		return table;
+
+	case 2:
+		if(*table & PTESIZE)
+			panic("mmuwalk2: va %luX entry %luX", va, *table);
+		if(!(*table & PTEVALID)){
+			/*
+			 * Have to call low-level allocator from
+			 * memory.c if we haven't set up the xalloc
+			 * tables yet.
+			 */
+			if(didmmuinit)
+				map = xspanalloc(BY2PG, BY2PG, 0);
+			else
+				map = rampage();
+			/* NOTE(review): the message names xspanalloc even when rampage failed */
+			if(map == nil)
+				panic("mmuwalk xspanalloc failed");
+			/* NOTE(review): the new page is not cleared here; presumably both allocators return zeroed memory -- confirm */
+			*table = PADDR(map)|PTEWRITE|PTEVALID;
+		}
+		table = KADDR(PPN(*table));
+		return &table[PTX(va)];
+	}
+}
+
+/*
+ * Device mappings are shared by all procs and processors and
+ * live in the virtual range VMAP to VMAP+VMAPSIZE.  The master
+ * copy of the mappings is stored in mach0->pdb, and they are
+ * paged in from there as necessary by vmapsync during faults.
+ */
+
+static Lock vmaplock;
+
+static int findhole(ulong *a, int n, int count);
+static ulong vmapalloc(ulong size);
+static void pdbunmap(ulong*, ulong, int);
+
+/*
+ * Add a device mapping to the vmap range.
+ * Maps size bytes of physical address space starting at pa,
+ * uncached and writable, and returns the virtual address that
+ * corresponds to pa.  Returns nil if pa lies in page 0, if no
+ * virtual space is free, or if pdbmap fails.
+ */
+void*
+vmap(ulong pa, int size)
+{
+	int osize;
+	ulong o, va;
+	
+	/*
+	 * might be asking for less than a page.
+	 */
+	osize = size;
+	/* o is pa's offset within its page; map whole pages and add o back at the end */
+	o = pa & (BY2PG-1);
+	pa -= o;
+	size += o;
+
+	size = ROUND(size, BY2PG);
+	if(pa == 0){
+		print("vmap pa=0 pc=%#p\n", getcallerpc(&pa));
+		return nil;
+	}
+	ilock(&vmaplock);
+	if((va = vmapalloc(size)) == 0 
+	|| pdbmap(MACHP(0)->pdb, pa|PTEUNCACHED|PTEWRITE, va, size) < 0){
+		iunlock(&vmaplock);
+		return 0;
+	}
+	iunlock(&vmaplock);
+	/* avoid trap on local processor
+	for(i=0; i<size; i+=4*MB)
+		vmapsync(va+i);
+	*/
+	/* osize is only needed by the commented-out print below */
+	USED(osize);
+//	print("  vmap %#.8lux %d => %#.8lux\n", pa+o, osize, va+o);
+	return (void*)(va + o);
+}
+
+/*
+ * Search the n entries of a for the first run of count
+ * consecutive zero entries.  Returns the index of the first
+ * entry of the run, or -1 if there is no such run.
+ */
+static int
+findhole(ulong *a, int n, int count)
+{
+	int i, run;
+
+	run = 0;
+	for(i = 0; i < n; i++){
+		run = (a[i] == 0) ? run+1 : 0;
+		if(run >= count)
+			return i - run + 1;
+	}
+	return -1;
+}
+
+/*
+ * Look for free space in the vmap.
+ * Returns a virtual address in the VMAP range with room for size
+ * bytes, or 0 if none is found.  Nothing is marked as taken here;
+ * the space only becomes used once the caller installs mappings.
+ */
+static ulong
+vmapalloc(ulong size)
+{
+	int i, n, o;
+	ulong *vpdb;
+	int vpdbsize;
+	
+	/* the pdb entries that cover the VMAP range, one per 4MB */
+	vpdb = &MACHP(0)->pdb[PDX(VMAP)];
+	vpdbsize = VMAPSIZE/(4*MB);
+
+	if(size >= 4*MB){
+		/* large request: needs a run of completely unused pdb entries */
+		n = (size+4*MB-1) / (4*MB);
+		if((o = findhole(vpdb, vpdbsize, n)) != -1)
+			return VMAP + o*4*MB;
+		return 0;
+	}
+	n = (size+BY2PG-1) / BY2PG;
+	/* small request: first look for room in a page table already in use */
+	for(i=0; i<vpdbsize; i++)
+		if((vpdb[i]&PTEVALID) && !(vpdb[i]&PTESIZE))
+			if((o = findhole(KADDR(PPN(vpdb[i])), WD2PG, n)) != -1)
+				return VMAP + i*4*MB + o*BY2PG;
+	/* otherwise start on an unused pdb entry */
+	if((o = findhole(vpdb, vpdbsize, 1)) != -1)
+		return VMAP + o*4*MB;
+		
+	/*
+	 * could span page directory entries, but not worth the trouble.
+	 * not going to be very much contention.
+	 */
+	return 0;
+}
+
+/*
+ * Remove a device mapping from the vmap range.
+ * Since pdbunmap does not remove page tables, just entries,
+ * the call need not be interlocked with vmap.
+ *
+ * v and size describe the region as returned by / passed to vmap;
+ * they are widened to whole pages here.  Panics if the resulting
+ * range is not entirely inside VMAP..VMAP+VMAPSIZE.
+ */
+void
+vunmap(void *v, int size)
+{
+	int i;
+	ulong va, o;
+	Mach *nm;
+	Proc *p;
+	
+	/*
+	 * might not be aligned
+	 */
+	va = (ulong)v;
+	o = va&(BY2PG-1);
+	va -= o;
+	size += o;
+	size = ROUND(size, BY2PG);
+	
+	if(size < 0 || va < VMAP || va+size > VMAP+VMAPSIZE)
+		panic("vunmap va=%#.8lux size=%#x pc=%#.8lux",
+			va, size, getcallerpc(&v));
+
+	/* clear the entries in the master copy of the mappings */
+	pdbunmap(MACHP(0)->pdb, va, size);
+	
+	/*
+	 * Flush mapping from all the tlbs and copied pdbs.
+	 * This can be (and is) slow, since it is called only rarely.
+	 * It is possible for vunmap to be called with up == nil,
+	 * e.g. from the reset/init driver routines during system
+	 * boot. In that case it suffices to flush the MACH(0) TLB
+	 * and return.
+	 */
+	if(!active.thunderbirdsarego){
+		if(MACHP(0)->pdb == 0)
+			panic("vunmap: nil m->pdb pc=%#p", getcallerpc(&v));
+		if(PADDR(MACHP(0)->pdb) == 0)
+			panic("vunmap: nil PADDR(m->pdb)");
+		/* reloading cr3 flushes the local TLB */
+		putcr3(PADDR(MACHP(0)->pdb));
+		return;
+	}
+	/* mark every other live process as needing a fresh tlb */
+	for(i=0; i<conf.nproc; i++){
+		p = proctab(i);
+		if(p->state == Dead)
+			continue;
+		if(p != up)
+			p->newtlb = 1;
+	}
+	/* ask every other processor to flush */
+	for(i=0; i<conf.nmach; i++){
+		nm = MACHP(i);
+		if(nm != m)
+			nm->flushmmu = 1;
+	}
+	flushmmu();
+	/* spin until each active processor has cleared its flushmmu flag */
+	for(i=0; i<conf.nmach; i++){
+		nm = MACHP(i);
+		if(nm != m)
+			while((active.machs&(1<<nm->machno)) && nm->flushmmu)
+				;
+	}
+}
+
+/*
+ * Add kernel mappings for va -> pa for a section of size bytes.
+ *
+ * The low 12 bits of pa carry the PTE flag bits to apply to every
+ * entry; they are stripped from the address before use.  4MB pages
+ * are used where the processor supports them and alignment and the
+ * remaining size allow; otherwise individual 4K entries are filled
+ * in via mmuwalk.  Panics if any target entry is already valid.
+ * Always returns 0.
+ */
+int
+pdbmap(ulong *pdb, ulong pa, ulong va, int size)
+{
+	int pse;
+	ulong pgsz, *pte, *table;
+	ulong flag, off;
+
+	flag = pa&0xFFF;
+	pa &= ~0xFFF;
+
+	/* large pages need both the cpuid feature bit and the cr4 enable bit */
+	if((MACHP(0)->cpuiddx & 0x08) && (getcr4() & 0x10))
+		pse = 1;
+	else
+		pse = 0;
+
+	/* pgsz is set by whichever branch maps the current piece */
+	for(off=0; off<size; off+=pgsz){
+		table = &pdb[PDX(va+off)];
+		if((*table&PTEVALID) && (*table&PTESIZE))
+			panic("vmap: pdb pte valid and big page: "
+				"va=%#.8lux pa=%#.8lux pde=%#.8lux",
+				va+off, pa+off, *table);
+
+		/*
+		 * Check if it can be mapped using a 4MB page:
+		 * va, pa aligned and size >= 4MB and processor can do it.
+		 */
+		if(pse && (pa+off)%(4*MB) == 0 && (va+off)%(4*MB) == 0 &&
+		    (size-off) >= 4*MB){
+			*table = (pa+off)|flag|PTESIZE|PTEVALID;
+			pgsz = 4*MB;
+		}else{
+			/* level-2 walk with create=1: allocates the page table if needed */
+			pte = mmuwalk(pdb, va+off, 2, 1);
+			if(*pte&PTEVALID)
+				panic("vmap: va=%#.8lux pa=%#.8lux pte=%#.8lux",
+					va+off, pa+off, *pte);
+			*pte = (pa+off)|flag|PTEVALID;
+			pgsz = BY2PG;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Remove mappings.  Must already exist, for sanity.
+ * Only used for kernel mappings, so okay to use KADDR.
+ *
+ * Clears the entries covering va..va+size in pdb.  4MB (PTESIZE)
+ * directory entries are cleared as a unit; otherwise the individual
+ * page-table entries are cleared and the page table itself is kept.
+ * Panics if any part of the range is not currently mapped.
+ */
+static void
+pdbunmap(ulong *pdb, ulong va, int size)
+{
+	ulong vae;
+	ulong *table;
+	
+	vae = va+size;
+	while(va < vae){
+		table = &pdb[PDX(va)];
+		if(!(*table & PTEVALID))
+			panic("vunmap: not mapped");
+		if(*table & PTESIZE){
+			*table = 0;
+			/*
+			 * Step to the start of the next 4MB region.  Rounding
+			 * va up instead would leave an already-aligned va
+			 * unchanged, and the next pass would then panic on
+			 * the entry just cleared.
+			 */
+			va = (va & ~(4*MB-1)) + 4*MB;
+			continue;
+		}
+		table = KADDR(PPN(*table));
+		if(!(table[PTX(va)] & PTEVALID))
+			panic("vunmap: not mapped");
+		table[PTX(va)] = 0;
+		va += BY2PG;
+	}
+}
+
+/*
+ * Handle a fault by bringing vmap up to date.
+ * Only copy pdb entries and they never go away,
+ * so no locking needed.
+ *
+ * Returns 1 if the master directory entry for va was copied
+ * into the current page directory, 0 if va is outside the vmap
+ * range or the master copy has no valid mapping for it.
+ */
+int
+vmapsync(ulong va)
+{
+	ulong pde, *pt;
+
+	if(!(va >= VMAP && va < VMAP+VMAPSIZE))
+		return 0;
+
+	pde = MACHP(0)->pdb[PDX(va)];
+	if((pde&PTEVALID) == 0)
+		return 0;
+
+	if((pde&PTESIZE) == 0){
+		/* 4K mappings: copying only helps if va itself is mapped */
+		pt = KADDR(PPN(pde));
+		if((pt[PTX(va)]&PTEVALID) == 0)
+			return 0;
+	}
+
+	/*
+	 * TLB doesn't cache negative results, so no flush needed.
+	 */
+	vpd[PDX(va)] = pde;
+	return 1;
+}
+
+
+/*
+ * KMap is used to map individual pages into virtual memory.
+ * It is rare to have more than a few KMaps at a time (in the 
+ * absence of interrupts, only two at a time are ever used,
+ * but interrupts can stack).  The mappings are local to a process,
+ * so we can use the same range of virtual address space for
+ * all processes without any coordination.
+ */
+#define kpt (vpt+VPTX(KMAP))	/* the page-table entries that cover the KMAP range */
+#define NKPT (KMAPSIZE/BY2PG)	/* number of page slots in the KMAP range */
+
+/*
+ * Map page into the current process's KMAP range and return the
+ * virtual address of the slot used.  Panics if there is no current
+ * process, if the nkmap count is inconsistent, or if every slot
+ * is in use.
+ */
+KMap*
+kmap(Page *page)
+{
+	int i, o, s;
+
+	if(up == nil)
+		panic("kmap: up=0 pc=%#.8lux", getcallerpc(&page));
+	if(up->mmupdb == nil)
+		upallocpdb();
+	if(up->nkmap < 0)
+		panic("kmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap);
+	
+	/*
+	 * Splhi shouldn't be necessary here, but paranoia reigns.
+	 * See comment in putmmu above.
+	 */
+	s = splhi();
+	up->nkmap++;
+	if(!(vpd[PDX(KMAP)]&PTEVALID)){
+		/* allocate page directory */
+		if(KMAPSIZE > BY2XPG)
+			panic("bad kmapsize");
+		if(up->kmaptable != nil)
+			panic("kmaptable");
+		/* interrupts are re-enabled only around the newpage call */
+		spllo();
+		up->kmaptable = newpage(0, 0, 0);
+		splhi();
+		vpd[PDX(KMAP)] = up->kmaptable->pa|PTEWRITE|PTEVALID;
+		flushpg((ulong)kpt);
+		memset(kpt, 0, BY2PG);
+		/* fresh table: slot 0 is free by construction */
+		kpt[0] = page->pa|PTEWRITE|PTEVALID;
+		up->lastkmap = 0;
+		splx(s);
+		return (KMap*)KMAP;
+	}
+	if(up->kmaptable == nil)
+		panic("no kmaptable");
+	/* search round-robin, starting just after the most recently used slot */
+	o = up->lastkmap+1;
+	for(i=0; i<NKPT; i++){
+		if(kpt[(i+o)%NKPT] == 0){
+			o = (i+o)%NKPT;
+			kpt[o] = page->pa|PTEWRITE|PTEVALID;
+			up->lastkmap = o;
+			splx(s);
+			return (KMap*)(KMAP+o*BY2PG);
+		}
+	}
+	panic("out of kmap");
+	return nil;
+}
+
+/*
+ * Release a mapping made by kmap.  k must be an address inside the
+ * KMAP range that is currently mapped; anything else panics.
+ * Clears the page-table entry and flushes that page from the TLB.
+ */
+void
+kunmap(KMap *k)
+{
+	ulong addr;
+
+	addr = (ulong)k;
+	if(up->mmupdb == nil || (vpd[PDX(KMAP)]&PTEVALID) == 0)
+		panic("kunmap: no kmaps");
+	if(!(addr >= KMAP && addr < KMAP+KMAPSIZE))
+		panic("kunmap: bad address %#.8lux pc=%#p", addr, getcallerpc(&k));
+	if((vpt[VPTX(addr)]&PTEVALID) == 0)
+		panic("kunmap: not mapped %#.8lux pc=%#p", addr, getcallerpc(&k));
+
+	/* keep the per-process count of live kmaps in step */
+	if(--up->nkmap < 0)
+		panic("kunmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap);
+
+	vpt[VPTX(addr)] = 0;
+	flushpg(addr);
+}
+
+/*
+ * Temporary one-page mapping used to edit page directories.
+ *
+ * The fasttmp #define controls whether the code optimizes
+ * the case where the page is already mapped in the physical
+ * memory window.  
+ */
+#define fasttmp 1
+
+/*
+ * Return a kernel virtual address through which page p can be
+ * accessed.  Must be called with interrupts disabled (panics if
+ * islo()).  With fasttmp set, pages below -KZERO are returned
+ * as their KADDR; otherwise the page is mapped at TMPADDR.
+ * Undo with tmpunmap.
+ */
+void*
+tmpmap(Page *p)
+{
+	ulong i;
+	ulong *entry;
+	
+	if(islo())
+		panic("tmpaddr: islo");
+
+	if(fasttmp && p->pa < -KZERO)
+		return KADDR(p->pa);
+
+	/*
+	 * PDX(TMPADDR) == PDX(MACHADDR), so this
+	 * entry is private to the processor and shared 
+	 * between up->mmupdb (if any) and m->pdb.
+	 */
+	entry = &vpt[VPTX(TMPADDR)];
+	if(!(*entry&PTEVALID)){
+		/* dump the page-table entries from KZERO to CPU0MACH before giving up */
+		for(i=KZERO; i<=CPU0MACH; i+=BY2PG)
+			print("%#p: *%#p=%#p (vpt=%#p index=%#p)\n", i, &vpt[VPTX(i)], vpt[VPTX(i)], vpt, VPTX(i));
+		panic("tmpmap: no entry");
+	}
+	/* when idle the entry maps TMPADDR's own physical page; anything else means it is in use */
+	if(PPN(*entry) != PPN(TMPADDR-KZERO))
+		panic("tmpmap: already mapped entry=%#.8lux", *entry);
+	*entry = p->pa|PTEWRITE|PTEVALID;
+	flushpg(TMPADDR);
+	return (void*)TMPADDR;
+}
+
+/*
+ * Undo a tmpmap.  v is the address tmpmap returned.  Must be called
+ * with interrupts disabled (panics if islo()).  Addresses at or above
+ * KZERO other than TMPADDR came from the fasttmp shortcut and need
+ * no work; for TMPADDR the page-table entry is pointed back at
+ * TMPADDR's own physical page.
+ */
+void
+tmpunmap(void *v)
+{
+	ulong *entry;
+	
+	if(islo())
+		panic("tmpaddr: islo");
+	if(fasttmp && (ulong)v >= KZERO && v != (void*)TMPADDR)
+		return;
+	if(v != (void*)TMPADDR)
+		panic("tmpunmap: bad address");
+	entry = &vpt[VPTX(TMPADDR)];
+	/* an entry still pointing at TMPADDR's own page means nothing was tmpmap'ed */
+	if(!(*entry&PTEVALID) || PPN(*entry) == PPN(PADDR(TMPADDR)))
+		panic("tmpmap: not mapped entry=%#.8lux", *entry);
+	*entry = PPN(TMPADDR-KZERO)|PTEWRITE|PTEVALID;
+	flushpg(TMPADDR);
+}
+
+/*
+ * These could go back to being macros once the kernel is debugged,
+ * but the extra checking is nice to have.
+ */
+/*
+ * Convert physical address pa to the kernel virtual address that
+ * maps it in the KZERO window.  Only pa < -KZERO can be represented
+ * (the same limit cankaddr applies); pa == -KZERO would wrap round
+ * and alias the address of physical 0, so it is rejected as well.
+ */
+void*
+kaddr(ulong pa)
+{
+	if(pa >= (ulong)-KZERO)
+		panic("kaddr: pa=%#.8lux >= -KZERO pc=%#p", pa, getcallerpc(&pa));
+	return (void*)(pa | KZERO);
+}
+
+/*
+ * Convert kernel virtual address v to a physical address by
+ * masking off the KSEGM bits.  Panics if v is below KZERO.
+ */
+ulong
+paddr(void *v)
+{
+	ulong kva;
+
+	kva = (ulong)v;
+	if(!(kva >= KZERO))
+		panic("paddr: va=%#.8lux < KZERO pc=%#p", kva, getcallerpc(&v));
+	return kva & ~KSEGM;
+}
+
+/*
+ * More debugging.
+ *
+ * Stub: does nothing in this kernel; the arguments are
+ * only marked as used.
+ */
+void
+countpagerefs(ulong *ref, int print)
+{
+	USED(ref, print);
+}
+
+/* Stub: takes two ulong arguments and does nothing. */
+void
+checkfault(ulong, ulong)
+{
+}
+
+/*
+ * Return the number of bytes that can be accessed via KADDR(pa).
+ * If pa is not a valid argument to KADDR, return 0.
+ */
+ulong
+cankaddr(ulong pa)
+{
+	/* everything from pa up to (but excluding) -KZERO is reachable */
+	return pa >= -KZERO ? 0 : -KZERO - pa;
+}
+

+ 38 - 0
sys/src/9/pcboot/multiboot.c

@@ -0,0 +1,38 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+
+Mbi	mbhdr;		/* initial target of multibootheader; mkmultiboot repoints it */
+int	nmmap;		/* number of valid entries in mmap[]; 0 means no memory map is passed on */
+
+/* these need to end up in low memory */
+Mbi	*multibootheader = &mbhdr;
+MMap	mmap[32+1];	/* memory map; copied after the header by mkmultiboot */
+
+/*
+ * Build a multiboot information block in the BIOS table area:
+ * a zeroed Mbi header immediately followed by a copy of mmap[].
+ * The command line is always set (to BOOTLINE); the memory map
+ * fields are filled in only when nmmap is non-zero.
+ * On return multibootheader holds the PHYSICAL address of the
+ * header, so it must not be dereferenced as a kernel pointer
+ * afterwards.
+ */
+void
+mkmultiboot(void)
+{
+	MMap *lmmap;
+
+	/* reuse the bios table memory */
+	multibootheader = (Mbi *)KADDR(BIOSTABLES);
+	memset(multibootheader, 0, sizeof *multibootheader);
+
+	/* the memory map lives directly after the header */
+	lmmap = (MMap *)(multibootheader + 1);
+	memmove(lmmap, mmap, sizeof mmap);
+
+	multibootheader->cmdline = PADDR(BOOTLINE);
+	multibootheader->flags |= Fcmdline;
+	if(nmmap != 0){
+		multibootheader->mmapaddr = PADDR(lmmap);
+		multibootheader->mmaplength = nmmap*sizeof(MMap);
+		multibootheader->flags |= Fmmap;
+	}
+	/* switch to the physical address now that the fields are filled in */
+	multibootheader = (Mbi *)PADDR(multibootheader);
+	if(v_flag)
+		print("PADDR(&multibootheader) %#p\n", multibootheader);
+}

+ 9 - 0
sys/src/9/pcboot/no-inflate.c

@@ -0,0 +1,9 @@
+#include	"u.h"
+#include	"../port/lib.h"
+
+/*
+ * Stand-in for the real gunzip in bootstraps built without
+ * inflate support: reports that gzipped kernels are unsupported
+ * and returns -1.  All four arguments are ignored.
+ */
+int
+gunzip(uchar *, int, uchar *, int)
+{
+	print("booting gzipped kernels is not supported by this bootstrap.\n");
+	return -1;
+}

+ 582 - 0
sys/src/9/pcboot/parts.c

@@ -0,0 +1,582 @@
+/*
+ * read disk partition tables, intended for early use on systems
+ * that don't use (the new) 9load.  borrowed from old 9load.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"dosfs.h"
+#include	"../port/sd.h"
+#include	"iso9660.h"
+
+/* split l into at most an fields at any character in del, collapsing runs of delimiters */
+#define gettokens(l, a, an, del)	getfields(l, a, an, 1, del)
+
+enum {
+	/* compile-time switches for the diagnostic prints in this file */
+	Trace	= 0,
+	Parttrace = 0,
+	Debugboot = 0,
+
+	Maxsec	= 2048,			/* size of the sector buffers allocated below */
+	Normsec	= 512,			/* mag disks */
+
+	/* from devsd.c */
+	PartLOG		= 8,
+	NPart		= (1<<PartLOG),
+};
+
+/*
+ * An SDunit (embedded anonymously) plus the two channels this file
+ * uses to reach the device: ctlc for writing "part" messages and
+ * data for reading sectors.
+ */
+typedef struct PSDunit PSDunit;
+struct PSDunit {
+	SDunit;
+	Chan	*ctlc;
+	Chan	*data;
+};
+
+/* sector buffers shared by all units; allocated in partition/setpartitions */
+static uchar *mbrbuf, *partbuf;
+/* scratch buffers: buf for ctl messages and the ctl file name, buf2 for the data file name */
+static char buf[128], buf2[128];
+
+/*
+ * Record partition `name' covering sectors start..end on unit,
+ * both in the local table (sdaddpart) and in devsd by writing
+ * a "part" message to the unit's ctl channel.
+ */
+static void
+psdaddpart(PSDunit* unit, char* name, uvlong start, uvlong end)
+{
+	int msglen, written;
+	Chan *ctl;
+
+	sdaddpart(unit, name, start, end);
+
+	/* update devsd's in-memory partition table. */
+	ctl = unit->ctlc;
+	msglen = snprint(buf, sizeof buf, "part %s %lld %lld\n", name, start, end);
+	written = devtab[ctl->type]->write(ctl, buf, msglen, ctl->offset);
+	if (written != msglen)
+		print("can't update devsd's partition table\n");
+
+	if (Debugboot)
+		print("part %s %lld %lld\n", name, start, end);
+}
+
+/*
+ * Read len bytes at byte offset off within partition pp of unit
+ * into va, via the unit's data channel.
+ * Returns the number of bytes read, or 0 if the request starts
+ * beyond the partition or the read fails; it never returns a
+ * negative value, so callers must not test for < 0.
+ */
+static long
+psdread(PSDunit *unit, SDpart *pp, void* va, long len, vlong off)
+{
+	long l, secsize;
+	uvlong bno, nb;
+
+	/*
+	 * Check the request is within partition bounds.
+	 */
+	secsize = unit->secsize;
+	if (secsize == 0)
+		panic("psdread: %s: zero sector size", unit->name);
+	bno = off/secsize + pp->start;	/* first absolute sector of the request */
+	nb = (off+len+secsize-1)/secsize + pp->start - bno;	/* sectors spanned */
+	if(bno+nb > pp->end)
+		nb = pp->end - bno;
+	if(bno >= pp->end || nb == 0)
+		return 0;
+
+	/* note: nb only gates the request; the read below still asks for len bytes */
+	unit->data->offset = bno * secsize;
+	l = myreadn(unit->data, va, len);
+	if (l < 0)
+		return 0;
+	return l;
+}
+
+/*
+ * Read the one sector at byte offset off of partition part into a.
+ * If mbr is set, additionally require the 0x55 0xAA signature at
+ * offsets 0x1FE/0x1FF.  Returns 0 on success, -1 on a failed read
+ * or a missing signature.
+ */
+static int
+sdreadblk(PSDunit *unit, SDpart *part, void *a, vlong off, int mbr)
+{
+	uchar *sec;
+
+	assert(a);			/* sdreadblk */
+	if(psdread(unit, part, a, unit->secsize, off) != unit->secsize){
+		if(Trace)
+			print("%s: read %lud at %lld failed\n", unit->name,
+				unit->secsize, (vlong)part->start*unit->secsize+off);
+		return -1;
+	}
+
+	/* nothing more to verify unless the caller wants the signature checked */
+	if(!mbr)
+		return 0;
+
+	sec = a;
+	if(sec[0x1FE] == 0x55 && sec[0x1FF] == 0xAA)
+		return 0;
+
+	if(Trace)
+		print("%s: bad magic %.2ux %.2ux at %lld\n",
+			unit->name, sec[0x1FE], sec[0x1FF],
+			(vlong)part->start*unit->secsize+off);
+	return -1;
+}
+
+/*
+ *  read partition table.  The partition table is just ascii strings.
+ *
+ *  The table starts with the MAGIC line; each following line is
+ *  "name start end".  Parsing stops at the first malformed line or
+ *  at an entry with start >= end or end beyond the unit.
+ *  Uses the shared partbuf, which must already be allocated.
+ */
+#define MAGIC "plan9 partitions"
+static void
+oldp9part(PSDunit *unit)
+{
+	SDpart *pp;
+	char *field[3], *line[NPart+1];
+	ulong n;
+	uvlong start, end;
+	int i;
+	static SDpart fakepart;	/* stand-in partition describing the table's own sector */
+
+	/*
+	 * We prefer partition tables on the second to last sector,
+	 * but some old disks use the last sector instead.
+	 */
+
+	pp = &fakepart;
+	kstrdup(&pp->name, "partition");
+	pp->start = unit->sectors - 2;
+	pp->end = unit->sectors - 1;
+
+	if(Debugboot)
+		print("oldp9part %s\n", unit->name);
+	if(sdreadblk(unit, pp, partbuf, 0, 0) < 0)
+		return;
+
+	if(strncmp((char*)partbuf, MAGIC, sizeof(MAGIC)-1) != 0) {
+		/* not found on 2nd last sector; look on last sector */
+		pp->start++;
+		pp->end++;
+		if(sdreadblk(unit, pp, partbuf, 0, 0) < 0)
+			return;
+		if(strncmp((char*)partbuf, MAGIC, sizeof(MAGIC)-1) != 0)
+			return;
+		print("%s: using old plan9 partition table on last sector\n", unit->name);
+	}else
+		print("%s: using old plan9 partition table on 2nd-to-last sector\n", unit->name);
+
+	/* we found a partition table, so add a partition partition */
+	psdaddpart(unit, pp->name, pp->start, pp->end);
+
+	/*
+	 * parse partition table
+	 */
+	partbuf[unit->secsize-1] = '\0';	/* make sure the text is terminated */
+	n = gettokens((char*)partbuf, line, NPart+1, "\n");
+	if(n && strncmp(line[0], MAGIC, sizeof(MAGIC)-1) == 0)
+		for(i = 1; i < n; i++){
+			if(gettokens(line[i], field, 3, " ") != 3)
+				break;
+			start = strtoull(field[1], 0, 0);
+			end = strtoull(field[2], 0, 0);
+			if(start >= end || end > unit->sectors)
+				break;
+			psdaddpart(unit, field[0], start, end);
+		}
+}
+
+/*
+ * Look up the partition called `name' on `unit'.
+ * Returns a pointer to the matching entry of unit->part, or nil
+ * if there is none.  Slots whose name is nil (the table is
+ * zero-allocated and only partly filled in) are skipped rather
+ * than passed to strcmp.
+ */
+static SDpart*
+sdfindpart(PSDunit *unit, char *name)
+{
+	int i;
+	SDpart *pp;
+
+	if(Parttrace)
+		print("findpart %d %s %s: ", unit->npart, unit->name, name);
+	for(i=0; i<unit->npart; i++) {
+		pp = &unit->part[i];
+		if(pp->name == nil)
+			continue;		/* unused slot */
+		if(Parttrace)
+			print("%s...", pp->name);
+		if(strcmp(pp->name, name) == 0){
+			if(Parttrace)
+				print("\n");
+			return pp;
+		}
+	}
+	if(Parttrace)
+		print("not found\n");
+	return nil;
+}
+
+/*
+ * look for a plan 9 partition table on drive `unit' in the second
+ * sector (sector 1) of partition `name'.
+ * if found, add the partitions defined in the table.
+ *
+ * each table line has the form "part name start end", with start
+ * and end relative to the start of partition `name'.  parsing stops
+ * at the first line that does not fit that form or has
+ * start >= end or end beyond the unit.
+ * uses the shared partbuf, which must already be allocated.
+ */
+static void
+p9part(PSDunit *unit, char *name)
+{
+	SDpart *p;
+	char *field[4], *line[NPart+1];
+	uvlong start, end;
+	int i, n;
+
+	if(Debugboot)
+		print("p9part %s %s\n", unit->name, name);
+	p = sdfindpart(unit, name);
+	if(p == nil)
+		return;
+
+	/* byte offset secsize within the partition, i.e. its sector 1 */
+	if(sdreadblk(unit, p, partbuf, unit->secsize, 0) < 0)
+		return;
+	partbuf[unit->secsize-1] = '\0';
+
+	if(strncmp((char*)partbuf, "part ", 5) != 0)
+		return;
+
+	n = gettokens((char*)partbuf, line, NPart+1, "\n");
+	if(n == 0)
+		return;
+	for(i = 0; i < n; i++){
+		if(strncmp(line[i], "part ", 5) != 0)
+			break;
+		if(gettokens(line[i], field, 4, " ") != 4)
+			break;
+		start = strtoull(field[2], 0, 0);
+		end   = strtoull(field[3], 0, 0);
+		if(start >= end || end > unit->sectors)
+			break;
+		/* convert to absolute sector numbers before recording */
+		psdaddpart(unit, field[1], p->start+start, p->start+end);
+	}
+}
+
+/* Return non-zero if MBR partition type t is one of the FAT types. */
+static int
+isdos(int t)
+{
+	if(t == FAT12 || t == FAT16 || t == FATHUGE)
+		return 1;
+	if(t == FAT32 || t == FAT32X)
+		return 1;
+	return 0;
+}
+
+/* Return non-zero if MBR partition type t is one of the extended-partition types. */
+static int
+isextend(int t)
+{
+	if(t == EXTEND)
+		return 1;
+	if(t == EXTHUGE)
+		return 1;
+	if(t == LEXTEND)
+		return 1;
+	return 0;
+}
+
+/*
+ * Fetch the first dos and all plan9 partitions out of the MBR partition table.
+ * We return -1 if we did not find a plan9 partition.
+ *
+ * Also returns -1 if any partition-table sector cannot be read or
+ * lacks the MBR signature.  Plan 9 partitions are named "plan9",
+ * "plan9.1", ... and each is searched for a plan 9 partition table.
+ * Uses the shared mbrbuf, which must already be allocated.
+ */
+static int
+mbrpart(PSDunit *unit)
+{
+	Dospart *dp;
+	uvlong taboffset, start, end;
+	uvlong firstxpart, nxtxpart;
+	int havedos, i, nplan9;
+	char name[10];
+
+	taboffset = 0;
+	dp = (Dospart*)&mbrbuf[0x1BE];	/* the four table entries inside the sector buffer */
+	{
+		/* get the MBR (allowing for DMDDO) */
+		if(sdreadblk(unit, &unit->part[0], mbrbuf,
+		    (vlong)taboffset * unit->secsize, 1) < 0)
+			return -1;
+		/*
+		 * NOTE(review): if the sector at offset 63 itself holds a
+		 * DMDDO entry, this scan restarts indefinitely - confirm
+		 * whether that can happen in practice.
+		 */
+		for(i=0; i<4; i++)
+			if(dp[i].type == DMDDO) {
+				if(Trace)
+					print("DMDDO partition found\n");
+				taboffset = 63;
+				if(sdreadblk(unit, &unit->part[0], mbrbuf,
+				    (vlong)taboffset * unit->secsize, 1) < 0)
+					return -1;
+				i = -1;	/* start over */
+			}
+	}
+
+	/*
+	 * Read the partitions, first from the MBR and then
+	 * from successive extended partition tables.
+	 */
+	nplan9 = 0;
+	havedos = 0;
+	firstxpart = 0;
+	for(;;) {
+		if(sdreadblk(unit, &unit->part[0], mbrbuf,
+		    (vlong)taboffset * unit->secsize, 1) < 0)
+			return -1;
+		if(Trace) {
+			if(firstxpart)
+				print("%s ext %llud ", unit->name, taboffset);
+			else
+				print("%s mbr ", unit->name);
+		}
+		nxtxpart = 0;
+		for(i=0; i<4; i++) {
+			if(Trace)
+				print("dp %d...", dp[i].type);
+			/* entries are relative to the table they appear in */
+			start = taboffset+GLONG(dp[i].start);
+			end = start+GLONG(dp[i].len);
+
+			if(dp[i].type == PLAN9) {
+				if(nplan9 == 0)
+					strncpy(name, "plan9", sizeof name);
+				else
+					snprint(name, sizeof name, "plan9.%d",
+						nplan9);
+				psdaddpart(unit, name, start, end);
+				p9part(unit, name);
+				nplan9++;
+			}
+
+			/*
+			 * We used to take the active partition (and then the first
+			 * when none are active).  We have to take the first here,
+			 * so that the partition we call ``dos'' agrees with the
+			 * partition disk/fdisk calls ``dos''.
+			 */
+			if(havedos==0 && isdos(dp[i].type)){
+				havedos = 1;
+				psdaddpart(unit, "dos", start, end);
+			}
+
+			/* nxtxpart is relative to firstxpart (or 0), not taboffset */
+			if(isextend(dp[i].type)){
+				nxtxpart = start-taboffset+firstxpart;
+				if(Trace)
+					print("link %llud...", nxtxpart);
+			}
+		}
+		if(Trace)
+			print("\n");
+
+		/* no extended entry in this table: the chain ends here */
+		if(!nxtxpart)
+			break;
+		if(!firstxpart)
+			firstxpart = nxtxpart;
+		taboffset = nxtxpart;
+	}
+	return nplan9 ? 0 : -1;
+}
+
+/*
+ * To facilitate booting from CDs, we create a partition for
+ * the FAT filesystem image embedded in a bootable CD.
+ *
+ * Reads the volume descriptor at sector VOLDESC, then scans the
+ * root directory for an entry named "bootdisk.img" and, if found,
+ * adds its extent as partition "9fat".
+ * Returns 0 if the partition was added, -1 if the unit does not
+ * use Cdsec-byte sectors, a read fails, the volume descriptor is
+ * not a primary ISO 9660 one, or the file is not in the root
+ * directory.
+ */
+static int
+part9660(PSDunit *unit)
+{
+	ulong a, n, i, j;
+	uchar drecsz;
+	uchar *p;
+	uchar buf[Maxsec];
+	Drec *rootdrec, *drec;
+	Voldesc *v;
+	static char stdid[] = "CD001\x01";
+
+	if(unit->secsize == 0)
+		unit->secsize = Cdsec;
+	if(unit->secsize != Cdsec)
+		return -1;
+
+	/*
+	 * psdread reports failure by returning 0, never a negative
+	 * value, so insist on a full sector (as sdreadblk does)
+	 * instead of testing for < 0.
+	 */
+	if(psdread(unit, &unit->part[0], buf, Cdsec, VOLDESC*Cdsec) != Cdsec)
+		return -1;
+	if(buf[0] != PrimaryIso ||
+	    memcmp((char*)buf+1, stdid, sizeof stdid - 1) != 0)
+		return -1;
+
+	v = (Voldesc *)buf;
+	rootdrec = (Drec *)v->z.desc.rootdir;
+	assert(rootdrec);
+	/* extent address and size of the root directory, assembled low byte first */
+	p = rootdrec->addr;
+	a = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
+	p = rootdrec->size;
+	n = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
+//	print("part9660: read %uld %uld\n", n, a);	/* debugging */
+
+	if(n < Cdsec){
+		print("warning: bad boot file size %ld in iso directory\n", n);
+		n = Cdsec;
+	}
+
+	/* walk the root directory one sector at a time, record by record */
+	drec = nil;
+	for(j = 0; j*Cdsec < n; j++){
+		if(psdread(unit, &unit->part[0], buf, Cdsec, (a + j)*Cdsec) != Cdsec)
+			return -1;
+		for(i = 0; i + j*Cdsec <= n && i < Cdsec; i += drecsz){
+			drec = (Drec *)&buf[i];
+			drecsz = drec->reclen;
+			/* a zero length or a record running off the sector ends this sector */
+			if(drecsz == 0 || drecsz + i > Cdsec)
+				break;
+			if(cistrncmp("bootdisk.img", (char *)drec->name, 12) == 0)
+				goto Found;
+		}
+	}
+Found:
+	/* falling out of the outer loop (rather than jumping here) means no match */
+	if(j*Cdsec >= n || drec == nil)
+		return -1;
+
+	p = drec->addr;
+	a = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
+	p = drec->size;
+	n = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
+
+	print("found partition %s!9fat; %lud+%lud\n", unit->name, a, n);
+	n /= Cdsec;		/* bytes -> sectors */
+	psdaddpart(unit, "9fat", a, a+n);
+	return 0;
+}
+
+/* bit flags for the partition-table styles that partition() will look for */
+enum {
+	NEW = 1<<0,	/* MBR, or a plan 9 table in the "data" partition */
+	OLD = 1<<1	/* old-style table handled by oldp9part */
+};
+
+/*
+ * read unit->data to look for partition tables.
+ * if found, stash partitions in environment and write them to ctl too.
+ *
+ * an ISO 9660 boot image takes precedence.  otherwise the
+ * "partition" configuration variable selects which table styles to
+ * try ("new...", "old...", or both when unset or unrecognised).
+ */
+static void
+partition(PSDunit *unit)
+{
+	int type;
+	char *p;
+
+	if(unit->part == 0)
+		return;
+
+	if(part9660(unit) == 0)
+		return;
+
+	type = NEW|OLD;
+	p = getconf("partition");
+	if(p != nil){
+		if(strncmp(p, "new", 3) == 0)
+			type = NEW;
+		else if(strncmp(p, "old", 3) == 0)
+			type = OLD;
+	}
+
+	/* the sector buffers are shared and allocated on first use */
+	if(mbrbuf == nil) {
+		mbrbuf = malloc(Maxsec);
+		partbuf = malloc(Maxsec);
+		if(mbrbuf==nil || partbuf==nil) {
+			free(mbrbuf);
+			free(partbuf);
+			partbuf = mbrbuf = nil;
+			return;
+		}
+	}
+
+	/*
+	 * there might be no mbr (e.g. on a very large device), so look for
+	 * a bare plan 9 partition table if mbrpart fails.
+	 */
+	if(type & NEW){
+		if(mbrpart(unit) < 0)
+			p9part(unit, "data");
+	}else if(type & OLD)
+		oldp9part(unit);
+}
+
+/*
+ * Set unit->sectors and unit->secsize from the "geometry" line of
+ * the unit's ctl file (the name is derived from unit->name by
+ * replacing its last path element with "ctl").  If there is no
+ * usable geometry line, fake one with Cdsec-byte sectors.
+ * If the ctl file cannot be read, the unit is left unchanged.
+ */
+static void
+rdgeom(PSDunit *unit)
+{
+	int n, f, lines;
+	char *buf, *p;
+	char *line[64], *fld[5];
+	char ctl[64];
+	static char geom[] = "geometry";
+
+	buf = smalloc(Maxfile + 1);
+	strncpy(ctl, unit->name, sizeof ctl);
+	ctl[sizeof ctl - 1] = '\0';	/* strncpy does not terminate on truncation */
+	p = strrchr(ctl, '/');
+	if (p)				/* was "/data" */
+		snprint(p, sizeof ctl - (p - ctl), "/ctl");
+	n = readfile(ctl, buf, Maxfile);
+	if (n < 0) {
+		print("rdgeom: can't read %s\n", ctl);
+		free(buf);		/* don't leak the buffer on failure */
+		return;
+	}
+	buf[n] = 0;
+
+	/* find the first line of the form "geometry sectors secsize ..." */
+	lines = getfields(buf, line, nelem(line), 0, "\r\n");
+	for (f = 0; f < lines; f++)
+		if (tokenize(line[f], fld, nelem(fld)) >= 3 &&
+		    strcmp(fld[0], geom) == 0)
+			break;
+	if(f < lines){
+		unit->sectors = strtoull(fld[1], nil, 0);
+		unit->secsize = strtoull(fld[2], nil, 0);
+	}
+	if (f >= lines || unit->sectors == 0){
+		/* no geometry line, so fake it */
+		unit->secsize = Cdsec;
+		unit->sectors = ~0ull / unit->secsize;
+	}
+	if(unit->secsize == 0)
+		print("rdgeom: %s: zero sector size read from ctl file\n",
+			unit->name);
+	free(buf);
+	unit->ctlc->offset = 0;
+}
+
+/*
+ * Build a temporary unit for the disk reached through the ctl and
+ * data channels, read its geometry, and run partition() on it so
+ * that any partitions found are announced to devsd.
+ * name is the path of the disk's data file.
+ */
+static void
+setpartitions(char *name, Chan *ctl, Chan *data)
+{
+	PSDunit sdunit;
+	PSDunit *unit;
+	SDpart *part0;
+
+	unit = &sdunit;
+	memset(unit, 0, sizeof *unit);
+	unit->ctlc = ctl;
+	unit->data = data;
+
+	unit->secsize = Normsec;	/* default: won't work for CDs */
+	unit->sectors = ~0ull / unit->secsize;
+	kstrdup(&unit->name, name);
+	rdgeom(unit);
+	unit->part = mallocz(sizeof(SDpart) * SDnpart, 1);
+	if(unit->part == nil){
+		print("setpartitions: %s: no memory for partition table\n", name);
+		return;
+	}
+	unit->npart = SDnpart;
+
+	/* partition 0 is always "data", spanning the unit */
+	part0 = &unit->part[0];
+	part0->end = unit->sectors - 1;
+	kstrdup(&part0->name, "data");
+	part0->valid = 1;
+
+	/*
+	 * mbrbuf and partbuf are shared across calls and allocated
+	 * (with failure handling) by partition() on first use;
+	 * allocating them again here would leak the old buffers
+	 * every time another disk is examined.
+	 */
+	partition(unit);
+	free(unit->part);
+}
+
+/*
+ * read disk partition tables so that readnvram via factotum
+ * can see them.
+ *
+ * disk is the directory of the disk's files; its ctl and data files
+ * are opened, examined if both opens succeeded, and closed again.
+ * Always returns 0.
+ */
+int
+readparts(char *disk)
+{
+	Chan *ctl, *data;
+
+	snprint(buf, sizeof buf, "%s/ctl", disk);
+	ctl  = namecopen(buf, ORDWR);
+	snprint(buf2, sizeof buf2, "%s/data", disk);
+	data = namecopen(buf2, OREAD);
+	if (ctl != nil && data != nil)
+		setpartitions(buf2, ctl, data);
+	/* either open may have failed; only close what was opened */
+	if (ctl != nil)
+		cclose(ctl);
+	if (data != nil)
+		cclose(data);
+	return 0;
+}
+
+/*
+ * Leave partitions around for devsd in next kernel to pick up.
+ * (Needed by boot process; more extensive
+ * partitioning is done by termrc or cpurc).
+ *
+ * Emits one configuration line "<unit>part=name start end/..."
+ * listing every valid partition after the first.
+ */
+void
+sdaddconf(SDunit *unit)
+{
+	int idx;
+	char *sep;
+	SDpart *part;
+
+	/*
+	 * If there were no partitions (just data and partition), don't bother.
+	 */
+	if(unit->npart <= 1)
+		return;
+	if(unit->npart == 2 && strcmp(unit->part[1].name, "partition") == 0)
+		return;
+
+	addconf("%spart=", unit->name);
+	/* skip 0, which is "data" */
+	for(idx = 1; idx < unit->npart; idx++){
+		part = &unit->part[idx];
+		if (!part->valid)
+			continue;
+		/* entries are separated by "/", except in front of slot 1 */
+		sep = idx==1 ? "" : "/";
+		addconf("%s%s %lld %lld", sep, part->name,
+			part->start, part->end);
+	}
+	addconf("\n");
+}

+ 94 - 0
sys/src/9/pcboot/pxe.h

@@ -0,0 +1,94 @@
+/* from <ip.h> */
+
+/*
+ * Protocol constants used by the pxe loader: ethernet/ip/udp header
+ * values and sizes, bootp ports and opcodes, and tftp opcodes.
+ */
+enum
+{
+	ETHER_HDR	= 14,
+	ET_IP		= 0x800,
+
+	IP_VER		= 0x40,
+	IP_HLEN		= 0x05,			
+ 	IP_UDPPROTO	= 17,
+
+	UDP_EHSIZE	= 22,
+	UDP_PHDRSIZE	= 12,
+	UDP_HDRSIZE	= 20,
+
+	/* bootp udp ports and message opcodes */
+	BPportsrc	= 68,
+	BPportdst	= 67,
+	Bootrequest 	= 1,
+	Bootreply   	= 2,
+
+	/* tftp udp port, retry timeout and packet opcodes */
+	TFTPport	= 69,
+//	Timeout		= 5000,	/* milliseconds */
+	Timeout		= 2000,	/* milliseconds */
+	Tftp_READ	= 1,
+	Tftp_WRITE	= 2,
+	Tftp_DATA	= 3,
+	Tftp_ACK	= 4,
+	Tftp_ERROR	= 5,
+	Tftp_OACK	= 6,		/* extension: option(s) ack */
+	Defsegsize	= 512,		/* presumably the tftp block size used when none is negotiated - confirm */
+
+	/* lengths of some bootp fields */
+	Maxhwlen=	16,
+	Maxfilelen=	128,
+	Maxoptlen=	312-4,
+
+	/* bootp option types */
+	OBend=			255,
+	OBpad=			0,
+	OBmask=			1,
+};
+
+/*
+ *  user level udp headers with control message "headers"
+ */
+enum 
+{
+	Udphdrsize=	52,	/* size of a Udphdr */
+};
+
+/*
+ * Address header carried in front of udp data: three IPaddrlen-byte
+ * addresses followed by two 2-byte ports.
+ */
+typedef struct Udphdr Udphdr;
+struct Udphdr
+{
+	uchar	raddr[IPaddrlen];	/* V6 remote address */
+	uchar	laddr[IPaddrlen];	/* V6 local address */
+	uchar	ifcaddr[IPaddrlen];	/* V6 ifc addr msg was received on */
+	uchar	rport[2];		/* remote port */
+	uchar	lport[2];		/* local port */
+};
+
+/*
+ * from 9load
+ *
+ * Layout of a bootp/dhcp message.  All multi-byte fields are
+ * declared as byte arrays, so the struct has no padding or
+ * byte-order dependence.  The vendor area is split into a 4-byte
+ * magic followed by Maxoptlen bytes of option data.
+ */
+typedef struct Bootp Bootp;
+struct Bootp
+{
+	uchar	op;		/* opcode */
+	uchar	htype;		/* hardware type */
+	uchar	hlen;		/* hardware address len */
+	uchar	hops;		/* hops */
+	uchar	xid[4];		/* a random number */
+	uchar	secs[2];	/* elapsed since client started booting */
+	uchar	flags[2];	/* unused in bootp, flags in dhcp */
+	uchar	ciaddr[4];	/* client IP address (client tells server) */
+	uchar	yiaddr[4];	/* client IP address (server tells client) */
+	uchar	siaddr[4];	/* server IP address */
+	uchar	giaddr[4];	/* gateway IP address */
+	uchar	chaddr[16];	/* client hardware address */
+	char	sname[64];	/* server host name (optional) */
+	char	file[128];	/* boot file name */
+
+//	char	vend[128];	/* vendor-specific goo */
+	uchar	optmagic[4];
+	uchar	optdata[Maxoptlen];
+};
+
+/* an ip address (IPaddrlen bytes) together with a port number */
+typedef struct Pxenetaddr Pxenetaddr;
+struct Pxenetaddr
+{
+	uchar	ip[IPaddrlen];
+	ushort	port;
+};
+
+/* defined elsewhere; presumably enables verbose diagnostics - confirm */
+extern int chatty;

+ 1235 - 0
sys/src/9/pcboot/pxeload.c

@@ -0,0 +1,1235 @@
+/*
+ * 9boot - load next kernel via pxe (bootp, tftp) and start it
+ *
+ * intel says that pxe can only load into the bottom 640K,
+ * and intel's boot agent takes 128K, leaving only 512K for 9boot.
+ *
+ * some of this code is from the old 9load's bootp.c.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/netif.h"
+#include	"../ip/ip.h"
+#include	"pxe.h"
+
+#define TFTPDEF "135.104.9.6"	/* IP of default tftp server */
+
+/* compile-time knobs and buffer sizes for the tftp client */
+enum {
+	Tftpusehdrs =	0,	/* flag: use announce+headers for tftp? */
+	Debug =		0,	/* non-zero enables extra diagnostic prints */
+
+	Tftphdrsz =	4,	/* 2-byte opcode + 2-byte block number */
+	/*
+	 * this can be bigger than the ether mtu and
+	 * will work due to ip fragmentation, at least on v4.
+	 */
+	Prefsegsize =	1400,	/* blksize we ask the server for */
+	Maxsegsize =	2048,	/* capacity of Tftp.data */
+	Bufsz =		Maxsegsize + 2,	/* size of the read-request buffer */
+};
+
+typedef struct Ethaddr Ethaddr;
+typedef struct Kernname Kernname;
+typedef struct Openeth Openeth;
+typedef struct Tftp Tftp;
+
+/* a received tftp packet: fixed header followed by up to Maxsegsize bytes */
+struct Tftp {
+	uchar	header[Tftphdrsz];	/* [0..1] opcode, [2..3] block # or error code */
+	uchar	data[Maxsegsize];
+};
+
+/* what to boot, as parsed from "bootfile" in the pxe config file */
+struct Kernname {
+	char	*edev;		/* "etherN" part before '!', or nil if none given */
+	char	*bootfile;	/* kernel path; nil until getkernname sets it */
+};
+
+/* per-interface state while attempting to boot over one ethernet */
+struct Openeth {
+	/* names */
+	int	ctlrno;		/* N in etherN */
+	char	ethname[16];	/* ether%d */
+	char	netethname[32];	/* /net/ether%d */
+	char	filename[128];	/* from bootp, for tftp */
+
+	Chan	*ifcctl;	/* /net/ipifc/clone */
+	Chan	*ethctl;	/* /net/etherN/0/ctl, for promiscuous mode */
+
+	/* udp connection */
+	Chan	*udpctl;	/* open only while announcing; nil otherwise */
+	Chan	*udpdata;	/* data file used by udpsend and udprecv */
+	Pxenetaddr *netaddr;	/* peer of the most recent send or receive */
+	int	rxactive;	/* set once a packet has been received */
+};
+
+/* NOTE(review): no use of this struct is visible in pxeload.c -- confirm before relying on it */
+struct Ethaddr {		/* communication with sleep procs */
+	Openeth	*oe;
+	Pxenetaddr *a;
+};
+
+static char ethernm[] = "ether";
+
+/*
+ * there can be at most one concurrent tftp session until we move these
+ * variables into Openeth or some other struct.
+ */
+static ushort tftpport;
+static int tftpblockno;
+static int tftpphase;
+static int progress;
+static int segsize;
+static Tftp *tftpb;
+
+static uchar myea[Eaddrlen];
+static Pxenetaddr myaddr;		/* actually, local ip addr & port */
+static Pxenetaddr tftpserv;		/* actually, remote ip addr & port */
+static Pxenetaddr bootpserv;
+
+/*
+ * read the mac address of controller oe->ctlrno from #l%d/ether%d/addr.
+ * returns a pointer to a static buffer that is overwritten by the next
+ * call; the buffer is all zeroes if the address file could not be read.
+ */
+uchar *
+etheraddr(Openeth *oe)
+{
+	int n;
+	char name[32], buf[32];
+	/*
+	 * must be static: the caller keeps using the result after we
+	 * return, so an automatic array would be a dangling pointer.
+	 */
+	static uchar ea[Eaddrlen];
+
+	memset(ea, 0, sizeof ea);
+	snprint(name, sizeof name, "#l%d/ether%d/addr", oe->ctlrno, oe->ctlrno);
+	n = readfile(name, buf, sizeof buf - 1);
+	if (n < 0)
+		return ea;
+	buf[n] = '\0';
+	parseether(ea, buf);
+	return ea;
+}
+
+/*
+ * send dlen bytes of data to address a on oe's udp data channel.
+ * during bootp (or when Tftpusehdrs is set) the payload is prefixed
+ * with a Plan 9 udp pseudo-header carrying the local and remote
+ * address and port.  write errors are reported but not propagated;
+ * callers simply retransmit later.
+ */
+static void
+udpsend(Openeth *oe, Pxenetaddr *a, void *data, int dlen)
+{
+	int n;
+	uchar *buf;
+	Chan *c;
+	Etherpkt pkt;
+	Udphdr *uh;
+
+	buf = data;
+	if (dlen > sizeof pkt)
+		panic("udpsend: packet too big");
+
+	oe->netaddr = a;
+	/*
+	 * add Plan 9 UDP pseudo-headers
+	 */
+	if (!tftpphase || Tftpusehdrs) {
+		/* headers + payload must fit in pkt *before* we copy into it */
+		if (dlen + sizeof *uh > sizeof pkt)
+			panic("udpsend: packet too big");
+		memset(&pkt, 0, sizeof pkt);
+		uh = (Udphdr*)&pkt;
+		memmove(uh + 1, data, dlen);
+		USED(buf);
+		buf = (uchar *)uh;
+		dlen += sizeof *uh;
+
+		ipmove(uh->laddr, myaddr.ip);
+		hnputs(uh->lport, myaddr.port);
+		ipmove(uh->raddr, a->ip);
+		hnputs(uh->rport, a->port);
+		if(Debug)
+			print("udpsend %I!%d -> %I!%d ", uh->laddr,
+				nhgets(uh->lport), uh->raddr, nhgets(uh->rport));
+	}
+	if (waserror()) {
+		iprint("udp write error\n");
+		return;			/* send another req later */
+	}
+	c = oe->udpdata;
+	assert(oe->udpdata != nil);
+	n = devtab[c->type]->write(c, buf, dlen, c->offset);
+	poperror();
+	c->offset += n;
+	if (n != dlen)
+		print("udpsend: wrote %d/%d\n", n, dlen);
+	else if (progress)
+		print(".");		/* progress indicator while waiting on bootp */
+}
+
+/*
+ * send a tftp ERROR packet with the given code and message to a.
+ * msg is truncated to fit the 32-byte message area.
+ * if report is non-zero, also print the error on the console.
+ */
+static void
+nak(Openeth *oe, Pxenetaddr *a, int code, char *msg, int report)
+{
+	char buf[4 + 32];
+
+	buf[0] = 0;
+	buf[1] = Tftp_ERROR;
+	buf[2] = 0;
+	buf[3] = code;
+	strncpy(buf+4, msg, sizeof buf - 4 - 1);
+	/* strncpy doesn't terminate when msg fills the space; strlen needs it */
+	buf[sizeof buf - 1] = '\0';
+	udpsend(oe, a, buf, 4 + strlen(buf+4) + 1);
+	if(report)
+		print("\ntftp: error(%d): %s\n", code, msg);
+}
+
+/*
+ * does the pseudo-header h describe a packet from the source a?
+ * without pseudo-headers (tftp over a dialed conversation) there is
+ * nothing to compare, so everything matches.
+ */
+static int
+tuplematch(Pxenetaddr *a, Udphdr *h)
+{
+	if (tftpphase && !Tftpusehdrs)
+		return 1;
+
+	/*
+	 * headers mode: still doing bootp, or tftp with headers by choice.
+	 * reject unless we accept any source port or it came from ours...
+	 */
+	if (a->port != 0 && nhgets(h->rport) != a->port)
+		return 0;
+	/* ...and it came from the ip we want or we sent to broadcast... */
+	if (!equivip6(h->raddr, a->ip) && !equivip6(a->ip, IPv4bcast))
+		return 0;
+	/* ...and it is addressed to us or to broadcast. */
+	return equivip6(h->laddr, myaddr.ip) || equivip6(h->laddr, IPv4bcast);
+}
+
+/*
+ * extract UDP payload into data and set a.
+ * h points at len bytes read in headers mode (pseudo-header + payload).
+ * returns the payload length, or -1 if the packet is from the wrong
+ * port or address, is too short to hold a pseudo-header, or won't fit
+ * in dlen bytes.
+ */
+static int
+udppayload(Udphdr *h, int len, Pxenetaddr *a, uchar *data, int dlen)
+{
+	if(Debug)
+		print("udprecv %I!%d to %I!%d...\n",
+			h->raddr, nhgets(h->rport), h->laddr, nhgets(h->lport));
+
+	if(a->port != 0 && nhgets(h->rport) != a->port) {
+		if(Debug)
+			print("udpport %ux not %ux\n", nhgets(h->rport), a->port);
+		return -1;
+	}
+
+	if(!equivip6(a->ip, IPv4bcast) && !equivip6(a->ip, h->raddr)) {
+		if(Debug)
+			print("bad ip %I not %I\n", h->raddr, a->ip);
+		return -1;
+	}
+
+	len -= sizeof *h;		/* don't count pseudo-headers */
+	if(len < 0)			/* runt: a negative count would wreck memmove */
+		return -1;
+	if(len > dlen) {
+		print("udp packet too big: %d > %d; from addr %I\n",
+			len, dlen, h->raddr);
+		return -1;
+	}
+	memmove(data, h + 1, len);	/* skip pseudo-headers */
+
+	/* set a from remote address */
+	ipmove(a->ip, h->raddr);
+	a->port = nhgets(h->rport);
+	return len;
+}
+
+/* return the length reported by a stat of ch, or -1 if the stat fails */
+static int
+chanlen(Chan *ch)
+{
+	int n;
+	Dir *d;
+
+	if ((d = dirchstat(ch)) == nil)
+		return -1;
+	n = d->length;			/* qlen(cv->rq) in devip */
+	free(d);
+	return n;
+}
+
+/*
+ * wait for a udp packet from a and copy its payload into data
+ * (at most dlen bytes).  polls the channel's queue length rather than
+ * blocking in read, giving up after 1 second if nothing has been
+ * received yet on this connection (oe->rxactive == 0) or after
+ * Timeout ms otherwise.  packets that fail tuplematch are discarded.
+ * returns the payload length, or a value <= 0 on time-out, read
+ * failure or a rejected packet.
+ */
+static int
+udprecv(Openeth *oe, Pxenetaddr *a, void *data, int dlen)
+{
+	int len, buflen, chlen;
+	ulong timo, now;
+	char *buf;
+	Chan *c;
+	Etherpkt pkt;
+
+	oe->netaddr = a;
+	/* timo is frequency of tftp ack and broadcast bootp retransmission */
+	if(oe->rxactive == 0)
+		timo = 1000;
+	else
+		timo = Timeout;
+	now = TK2MS(m->ticks);
+	timo += now;			/* deadline */
+
+	c = oe->udpdata;
+	spllo();			/* paranoia */
+	do {
+		/*
+		 * wait for data to arrive or time-out.
+		 * alarms only work for user procs, so we poll to avoid getting
+		 * stuck in ipread.
+		 */
+		for (chlen = chanlen(c); chlen == 0 && now < timo;
+		     chlen = chanlen(c)) {
+			/* briefly give somebody else a chance to run */
+			tsleep(&up->sleep, return0, 0, 0);
+			now = TK2MS(m->ticks);
+		}
+		/* still empty after the deadline, or the stat itself failed */
+		if (chlen <= 0) {
+			print("T");
+			return -1;		/* timed out */
+		}
+
+		/* on a read error, report it, pause a second, then retry the read */
+		while (waserror()) {
+			print("read err: %s\n", up->errstr);
+			tsleep(&up->sleep, return0, 0, 1000);
+		}
+
+		/*
+		 * using Plan 9 UDP pseudo-headers?
+		 */
+		if (tftpphase && !Tftpusehdrs) {
+			buf = data;	/* read directly in caller's buffer */
+			buflen = dlen;
+		} else {
+			buf = (char *)&pkt;  /* read pkt with hdrs */
+			buflen = sizeof pkt;
+		}
+		/* devtab[c->type]->read calls ipread */
+		len = devtab[c->type]->read(c, buf, buflen, c->offset);
+		poperror();
+
+		if (len <= 0)
+			return len;
+		c->offset += len;
+	} while (!tuplematch(oe->netaddr, (Udphdr *)buf));
+
+	/*
+	 * using Plan 9 UDP pseudo-headers? extract payload into caller's buf.
+	 */
+	if (!tftpphase || Tftpusehdrs)
+		len = udppayload((Udphdr *)&pkt, len, a, data, dlen);
+	if (len >= 0)
+		oe->rxactive = 1;	/* later waits use the longer Timeout */
+	return len;
+}
+
+/* send a tftp ACK for block blkno to a */
+static void
+ack(Openeth *oe, Pxenetaddr *a, int blkno)
+{
+	char pkt[4];
+
+	/* opcode, then block number, both big-endian */
+	pkt[0] = 0;
+	pkt[1] = Tftp_ACK;
+	pkt[2] = blkno >> 8;
+	pkt[3] = blkno;
+	udpsend(oe, a, pkt, sizeof pkt);
+}
+
+/*
+ * skip the NUL-terminated word starting at wd.  returns a pointer just
+ * past its NUL, or nil if no NUL is found before ep (malformed packet).
+ */
+static char *
+skipwd(char *wd, char *ep)
+{
+	while (wd < ep && *wd != '\0')
+		wd++;
+	if (wd >= ep)
+		return nil;
+	return wd + 1;		/* skip terminating NUL */
+}
+
+/*
+ * look up option `opt' (case-insensitively) in the len bytes at pkt,
+ * which hold NUL-terminated name, value pairs as in a tftp OACK.
+ * returns the value converted as a decimal integer, or -1 if the
+ * option is absent or the packet is malformed (a name or value that
+ * is not terminated within the packet).
+ */
+static int
+optval(char *opt, char *pkt, int len)
+{
+	char *val, *next, *ep, *p;
+
+	ep = pkt + len;
+	for (p = pkt; p < ep && *p != '\0'; p = next) {
+		val = skipwd(p, ep);		/* value follows the name */
+		if (val == nil)
+			return -1;
+		next = skipwd(val, ep);		/* next name follows the value */
+		if (next == nil)
+			return -1;	/* never hand strtol an unterminated value */
+		if (cistrcmp(p, opt) == 0)
+			return strtol(val, 0, 10);
+	}
+	return -1;
+}
+
+/*
+ * send a tftp read request to `a' for name.  if we get a data packet back,
+ * ack it and stash it in tftp for later.
+ *
+ * format of a request packet, from the RFC:
+ *
+ *          2 bytes     string    1 byte     string   1 byte
+ *          ------------------------------------------------
+ *         | Opcode |  Filename  |   0  |    Mode    |   0  |
+ *          ------------------------------------------------
+ */
+/*
+ * returns the number of data bytes stashed in tftp->data from the first
+ * DATA packet, or -1 on error or if the server never answers.
+ * sets segsize to the negotiated block size (Defsegsize if the server
+ * ignores our blksize option) and tftpblockno to the last block seen.
+ */
+static int
+tftpread1st(Openeth *oe, Pxenetaddr *a, char *name, Tftp *tftp)
+{
+	int i, n, len, rlen, oport, sendack;
+	static char *buf;
+
+	if (buf == nil)
+		buf = malloc(Bufsz);
+	if (buf == nil) {
+		print("tftpread1st: out of memory\n");
+		return -1;
+	}
+	/* read request: opcode, name, mode, then the blksize option pair */
+	buf[0] = 0;
+	buf[1] = Tftp_READ;
+	len = 2 + snprint(buf+2, Bufsz - 2, "%s", name) + 1;
+	len += snprint(buf+len, Bufsz - len, "octet") + 1;
+	len += snprint(buf+len, Bufsz - len, "blksize") + 1; /* option */
+	len += snprint(buf+len, Bufsz - len, "%d", Prefsegsize) + 1;
+
+	/*
+	 * keep sending the same packet until we get an answer.
+	 */
+	if (Debug)
+		print("tftpread1st %s\n", name);
+	oe->netaddr = a;
+	/*
+	 * the first packet or two sent seem to get dropped,
+	 * so use a shorter time-out on the first packet.
+	 */
+	oe->rxactive = 0;
+	oport = a->port;
+	tftpblockno = 0;
+	segsize = Defsegsize;
+	sendack = 0;
+	for(i = 0; i < 10; i++){
+		a->port = oport;
+		if (sendack)
+			ack(oe, a, tftpblockno);	/* ack of the OACK */
+		else
+			udpsend(oe, a, buf, len);	/* tftp read name */
+
+		if((rlen = udprecv(oe, a, tftp, sizeof(Tftp))) < Tftphdrsz)
+			continue;		/* runt or time-out */
+
+		switch((tftp->header[0]<<8)|tftp->header[1]){
+
+		case Tftp_ERROR:
+			print("tftpread1st: error (%d): %s\n",
+				(tftp->header[2]<<8)|tftp->header[3], (char*)tftp->data);
+			return -1;
+
+		case Tftp_OACK:
+			n = optval("blksize", (char *)tftp->header+2, rlen-2);
+			/*
+			 * tftp->data holds only Maxsegsize bytes, so a bigger
+			 * block size would overflow it on later reads.
+			 */
+			if (n <= 0 || n > Maxsegsize) {
+				nak(oe, a, 0, "bad blksize option value", 0);
+				return -1;
+			}
+			segsize = n;
+			/* no bytes stashed in tftp.data */
+			i = 0;			/* server is alive: restart retries */
+			sendack = 1;
+			break;
+
+		case Tftp_DATA:
+			tftpblockno = 1;
+			len = (tftp->header[2]<<8)|tftp->header[3];
+			if(len != tftpblockno){
+				print("tftpread1st: block error: %d\n", len);
+				nak(oe, a, 1, "block error", 0);
+				return -1;
+			}
+			rlen -= Tftphdrsz;
+			if(rlen < segsize)
+				/* ACK now, in case we don't later */
+				ack(oe, a, tftpblockno);
+			return rlen;
+
+		default:
+			print("tftpread1st: unexpected pkt type recv'd\n");
+			nak(oe, a, 0, "unexpected pkt type recv'd", 0);
+			return -1;
+		}
+	}
+
+	print("tftpread1st: failed to connect to server (%I!%d)\n", a->ip, oport);
+	return -1;
+}
+
+/*
+ * acknowledge block tftpblockno and read the next block into tftp,
+ * retrying up to 10 times.  dlen is the expected data size (segsize).
+ * returns the number of data bytes received (possibly 0), or -1 on
+ * a protocol error or if every attempt fails.  a block shorter than
+ * dlen is the last one and is acknowledged immediately.
+ */
+static int
+tftpread(Openeth *oe, Pxenetaddr *a, Tftp *tftp, int dlen)
+{
+	int try, blockno, len;
+
+	dlen += Tftphdrsz;		/* from here on, dlen counts the header too */
+
+	/*
+	 * keep sending ACKs until we get an answer.
+	 */
+	for(try = 0; try < 10; try++) {
+		ack(oe, a, tftpblockno);
+
+		len = udprecv(oe, a, tftp, dlen);
+		/*
+		 * NB: not `<='; just a header is legal and happens when
+		 * file being read is a multiple of segsize bytes long.
+		 */
+		if(len < Tftphdrsz){
+			if(Debug)
+				print("tftpread: too short %d <= %d\n",
+					len, Tftphdrsz);
+			continue;
+		}
+		switch((tftp->header[0]<<8)|tftp->header[1]){
+		case Tftp_ERROR:
+			print("tftpread: error (blk %d): %s\n",
+				(tftp->header[2]<<8)|tftp->header[3],
+				(char*)tftp->data);
+			nak(oe, a, 0, "error pkt recv'd", 0);
+			return -1;
+		case Tftp_OACK:
+			print("tftpread: oack pkt recv'd too late\n");
+			nak(oe, a, 0, "oack pkt recv'd too late", 0);
+			return -1;
+		default:
+			print("tftpread: unexpected pkt type recv'd\n");
+			nak(oe, a, 0, "unexpected pkt type recv'd", 0);
+			return -1;
+		case Tftp_DATA:
+			break;
+		}
+		blockno = (tftp->header[2]<<8)|tftp->header[3];
+		/* a block we already have: ack again and keep waiting */
+		if(blockno <= tftpblockno){
+			if(Debug)
+				print("tftpread: blkno %d <= %d\n",
+					blockno, tftpblockno);
+			continue;
+		}
+
+		if(blockno == tftpblockno+1) {
+			tftpblockno++;
+			if(len < dlen)	/* last packet? send final ack */
+				ack(oe, a, tftpblockno);
+			return len-Tftphdrsz;
+		}
+		/* a block from the future: complain and retry */
+		print("tftpread: block error: %d, expected %d\n",
+			blockno, tftpblockno+1);
+	}
+
+	return -1;
+}
+
+/*
+ * broadcast a bootp request for file.  stash any answer in rep.
+ */
+/*
+ * file, if non-nil and non-empty, is "filename" or "sysname!filename";
+ * the parts are placed in the request's file and sname fields.
+ * also records our ethernet address in myea and sets myaddr and
+ * bootpserv for the bootp ports.
+ * returns 0 once a reply for our hardware address (and server name,
+ * if one was given) arrives, -1 after 10 unanswered attempts.
+ */
+static int
+bootpbcast(Openeth *oe, char *file, Bootp *rep)
+{
+	Bootp req;
+	int i;
+	uchar *ea;
+	char name[128], *filename, *sysname;
+	static char zeroes[IPaddrlen];
+
+	oe->filename[0] = '\0';
+	if (Debug)
+		if (file == nil)
+			print("bootpopen: %s...", oe->ethname);
+		else
+			print("bootpopen: %s!%s...", oe->ethname, file);
+	if((ea = etheraddr(oe)) == nil){
+		print("bad ether %s\n", oe->ethname);
+		return -1;
+	}
+
+	filename = nil;
+	sysname = 0;
+	if(file && *file){
+		/* strncpy won't terminate an over-long file; strchr needs it */
+		strncpy(name, file, sizeof name - 1);
+		name[sizeof name - 1] = '\0';
+		if(filename = strchr(name, '!')){
+			sysname = name;
+			*filename++ = 0;
+		}
+		else
+			filename = name;
+	}
+
+	/*
+	 * form a bootp request packet
+	 */
+	memset(&req, 0, sizeof(req));
+	req.op = Bootrequest;
+	req.htype = 1;			/* ethernet */
+	req.hlen = Eaddrlen;		/* ethernet */
+	memmove(req.chaddr, ea, Eaddrlen);
+	req.flags[0] = 0x80;		/* request broadcast reply */
+	if(filename != nil) {
+		strncpy(req.file, filename, sizeof(req.file));
+		strncpy(oe->filename, filename, sizeof oe->filename);
+	}
+	if(sysname != nil)		/* if server name given, supply it */
+		strncpy(req.sname, sysname, sizeof(req.sname));
+
+	if (memcmp(myaddr.ip, zeroes, sizeof myaddr.ip) == 0)
+		ipmove(myaddr.ip, IPv4bcast);	/* didn't know my ip yet */
+	myaddr.port = BPportsrc;
+	memmove(myea, ea, Eaddrlen);
+
+	/* send to 255.255.255.255!67 */
+	ipmove(bootpserv.ip, IPv4bcast);
+	bootpserv.port = BPportdst;
+
+	/*
+	 * send it until we get a matching answer
+	 */
+	memset(rep, 0, sizeof *rep);
+	for(i = 10; i > 0; i--) {
+		req.xid[0] = i;			/* try different xids */
+		udpsend(oe, &bootpserv, &req, sizeof(req));
+
+		if(udprecv(oe, &bootpserv, rep, sizeof(*rep)) <= 0)
+			continue;
+		/* ignore replies meant for some other machine */
+		if(memcmp(req.chaddr, rep->chaddr, Eaddrlen) != 0)
+			continue;
+		if(rep->htype != 1 || rep->hlen != Eaddrlen)
+			continue;
+		if(sysname == 0 || strcmp(sysname, rep->sname) == 0)
+			break;
+	}
+	if(i <= 0) {
+		if (file == nil)
+			print("bootp on %s timed out\n", oe->ethname);
+		else
+			print("bootp on %s for %s timed out\n", oe->ethname, file);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * request file via tftp from server named in rep.
+ * initial data packet will be stashed in tftpb.
+ */
+/*
+ * the file to fetch is `file' if non-nil, else oe->filename, else the
+ * file named in the bootp reply (unless that is a boot loader).
+ * takes our address from rep->yiaddr and the server's from
+ * rep->siaddr, prompting for the latter if the reply has none.
+ * returns the byte count of the first data packet, or -1 on failure.
+ */
+static int
+tftpopen(Openeth *oe, char *file, Bootp *rep)
+{
+	char *filename;
+	char buf[128];
+	static uchar ipv4noaddr[IPv4addrlen];
+
+	/*
+	 * read file from tftp server in bootp answer
+	 */
+	filename = oe->filename;
+	if (file)
+		filename = file;
+	if(filename == 0 || *filename == 0){
+		if(strcmp(rep->file, "/386/9boot") == 0 ||
+		   strcmp(rep->file, "/386/9pxeload") == 0) {
+			print("won't load another boot loader (%s)\n", rep->file);
+			return -1;		/* avoid infinite loop */
+		}
+		filename = rep->file;
+	}
+
+	print("\n");
+	if(rep->sname[0] != '\0')
+		print("%s ", rep->sname);
+
+	v4tov6(myaddr.ip, rep->yiaddr);
+	myaddr.port = tftpport;
+	if (equivip4(rep->siaddr, ipv4noaddr)) { /* no server address? */
+		getstr("tftp server IP address", buf, sizeof buf, TFTPDEF, 0);
+		v4parseip(rep->siaddr, buf);
+	}
+	v4tov6(tftpserv.ip, rep->siaddr);
+	tftpserv.port = TFTPport;
+	if (tftpb == nil)
+		tftpb = malloc(sizeof *tftpb);
+	if (tftpb == nil) {
+		/* without a buffer, the receive path would write through nil */
+		print("tftpopen: out of memory\n");
+		return -1;
+	}
+
+	print("(%V!%d): %s ", rep->siaddr, tftpserv.port, filename);
+
+	return tftpread1st(oe, &tftpserv, filename, tftpb);
+}
+
+/*
+ * fetch file via tftp and feed it block by block to bootpass.
+ * the final bootpass(b, nil, 0) boots the kernel if it can, so this
+ * function only returns (always -1) when booting did not happen.
+ */
+int
+tftpboot(Openeth *oe, char *file, Bootp *rep, Boot *b)
+{
+	int n;
+
+	if((n = tftpopen(oe, file, rep)) < 0)
+		return -1;
+
+	progress = 0;			/* no more dots; we're on a roll now */
+	print(" ");			/* after "sys (ip!port): kernel ..." */
+	/* n is the size of the block now in tftpb->data; a short block ends the file */
+	while(bootpass(b, tftpb->data, n) == MORE){
+		n = tftpread(oe, &tftpserv, tftpb, segsize);
+		if(n < segsize)
+			break;
+	}
+	if(0 < n && n < segsize)	/* got to end of file */
+		bootpass(b, tftpb->data, n);
+	else
+		nak(oe, &tftpserv, 3, "ok", 0);	/* tftpclose to abort transfer */
+	bootpass(b, nil, 0);	/* boot if possible */
+	return -1;
+}
+
+/*
+ * create a new ip interface and bind it to oe's ethernet.
+ * leave the channel to /net/ipifc/clone open (in oe->ifcctl).
+ * returns 0; failures are reported and re-raised with nexterror.
+ */
+static int
+binddevip(Openeth *oe)
+{
+	Chan *icc;
+	char buf[32];
+
+	if (waserror()) {
+		print("binddevip: can't bind ether %s: %s\n",
+			oe->netethname, up->errstr);
+		nexterror();
+	}
+	/* get a new ip interface */
+	oe->ifcctl = icc = namecopen("/net/ipifc/clone", ORDWR);
+	if(icc == nil)
+		error("can't open /net/ipifc/clone");
+
+	/*
+	 * specify medium as ethernet, bind the interface to it.
+	 * this should trigger chandial of types 0x800, 0x806 and 0x86dd.
+	 */
+	snprint(buf, sizeof buf, "bind ether %s", oe->netethname);
+	devtab[icc->type]->write(icc, buf, strlen(buf), 0);  /* bind ether %s */
+	poperror();
+	return 0;
+}
+
+/*
+ * set the default route to gateway gaddr by writing an "add" request
+ * to /net/iproute.  the first argument is unused.
+ * returns 0 on success, -1 if the open or the write raised an error.
+ */
+static int
+adddefroute(char *, uchar *gaddr)
+{
+	char buf[64];
+	Chan *rc;
+
+	rc = nil;		/* so the error path knows whether to close it */
+	if (waserror()) {
+		if (rc)
+			cclose(rc);
+		return -1;
+	}
+	rc = enamecopen("/net/iproute", ORDWR);
+
+	/* the request syntax differs for v4 and v6 gateways */
+	if(isv4(gaddr))
+		snprint(buf, sizeof buf, "add 0 0 %I", gaddr);
+	else
+		snprint(buf, sizeof buf, "add :: /0 %I", gaddr);
+	devtab[rc->type]->write(rc, buf, strlen(buf), 0);
+	poperror();
+	cclose(rc);
+	return 0;
+}
+
+/* true unless ip equals IPnoaddr or v4prefix */
+static int
+validip(uchar *ip)
+{
+	if (ipcmp(ip, IPnoaddr) == 0)
+		return 0;
+	return ipcmp(ip, v4prefix) != 0;
+}
+
+/*
+ * open conversation 0's ctl file on oe's ethernet (reading clone to
+ * create it if 0/ctl can't be opened) and switch on promiscuous mode.
+ * the channel is kept in oe->ethctl; closing it turns the mode off.
+ * returns 0 on success, -1 if the ether directory is missing or an
+ * error was raised, in which case oe->ethctl is left nil.
+ */
+static int
+openetherdev(Openeth *oe)
+{
+	int n;
+	char num[16];
+	Chan *c;
+	static char promisc[] = "promiscuous";
+
+	if (chdir(oe->netethname) < 0)
+		return -1;			/* out of ethers */
+
+	oe->ethctl = nil;
+	if (waserror()) {
+		print("error opening /net/ether%d/0/ctl: %s\n",
+			oe->ctlrno, up->errstr);
+		if (oe->ethctl) {
+			cclose(oe->ethctl);
+			oe->ethctl = nil;
+		}
+		chdir("/");			/* don't hold conv. open */
+		return -1;
+	}
+	/* names below are relative to the chdir above */
+	oe->ethctl = c = namecopen("0/ctl", ORDWR);	/* should be ipv4 */
+	if (c == nil) {
+		/* read clone file to make conversation 0 since not present */
+		oe->ethctl = c = enamecopen("clone", ORDWR);
+		n = devtab[c->type]->read(c, num, sizeof num - 1, 0);
+		if (n < 0)
+			print("no %s/clone: %s\n", oe->netethname, up->errstr);
+		else {
+			num[n] = 0;
+			print("%s/clone returned %s\n", oe->netethname, num);
+		}
+	}
+	/* shouldn't be needed to read bootp (broadcast) reply */
+	devtab[c->type]->write(c, promisc, sizeof promisc-1, 0);
+	poperror();
+	chdir("/");
+	/* leave oe->ethctl open to keep promiscuous mode on */
+	return 0;
+}
+
+/*
+ * add a logical interface to the ip stack with the null address
+ * (IPnoaddr) so that bootp can be done before we know our own
+ * address, then put the ethernet into promiscuous mode.
+ * always returns 0; openetherdev's result is ignored, so oe->ethctl
+ * may be nil afterwards.  the write is not guarded by waserror here.
+ */
+int
+minip4cfg(Openeth *oe)
+{
+	int n;
+	char buf[64];
+
+	n = snprint(buf, sizeof buf, "add %I", IPnoaddr);
+	devtab[oe->ifcctl->type]->write(oe->ifcctl, buf, n, 0);	/* add %I */
+
+	openetherdev(oe);
+	return 0;
+}
+
+/*
+ * remove the :: address added by minip4cfg and turn promiscuous mode
+ * off by closing oe->ethctl.  the ether ctl channel is closed whether
+ * or not the remove succeeds, and only if it was actually opened
+ * (minip4cfg ignores openetherdev failures, leaving it nil).
+ * returns 0 on success, -1 if the write to the interface failed.
+ */
+int
+unminip4cfg(Openeth *oe)
+{
+	int n, r;
+	char buf[64];
+
+	r = 0;
+	n = snprint(buf, sizeof buf, "remove %I /128", IPnoaddr);
+	if (waserror()) {
+		print("failed write to ifc: %s: %s\n", buf, up->errstr);
+		r = -1;
+	} else {
+		devtab[oe->ifcctl->type]->write(oe->ifcctl, buf, n, 0);	/* remove %I */
+		poperror();
+	}
+	if (oe->ethctl) {
+		cclose(oe->ethctl);		/* turn promiscuous mode off */
+		oe->ethctl = nil;
+	}
+	return r;
+}
+
+/*
+ * parse p, looking for option `op'.  if non-nil, np points to minimum length.
+ * return nil if option is too small, else ptr to opt, and
+ * store actual length via np if non-nil.
+ */
+/*
+ * p points at a list of options: a code byte, a length byte, then
+ * that many bytes of value.  OBpad is a lone byte with no length;
+ * OBend terminates the list.
+ * NOTE(review): the scan has no length bound, so it relies on the
+ * list containing OBend -- confirm callers guarantee that for data
+ * received from the network.
+ */
+uchar*
+optget(uchar *p, int op, int *np)
+{
+	int len, code;
+
+	while ((code = *p++) != OBend) {
+		if(code == OBpad)
+			continue;
+		len = *p++;
+		if(code != op) {
+			p += len;		/* skip the value of an uninteresting option */
+			continue;
+		}
+		if(np != nil){
+			if(*np > len) {
+				return 0;	/* found, but shorter than the caller needs */
+			}
+			*np = len;
+		}
+		return p;			/* points at the option's value */
+	}
+	return 0;
+}
+
+/*
+ * fetch option op, which must hold at least a 4-byte v4 address, from
+ * the option list at p and store it in ip as a v6 address.
+ * returns 1 if found, 0 otherwise.
+ */
+int
+optgetaddr(uchar *p, int op, uchar *ip)
+{
+	int minlen;
+	uchar *val;
+
+	minlen = 4;
+	val = optget(p, op, &minlen);
+	if(val != nil){
+		v4tov6(ip, val);
+		return 1;
+	}
+	return 0;
+}
+
+int beprimary = 1;
+
+/*
+ * add a logical interface to the ip stack using the address the
+ * bootp server assigned (rep->yiaddr), with the subnet mask from the
+ * reply's options or 255.255.255.0 if none was supplied.  if the
+ * reply names a gateway and beprimary is 1, also add a default route.
+ * returns -1 if the assigned address is not valid, else 0.
+ * the interface write is not guarded by waserror here.
+ */
+int
+ip4cfg(Openeth *oe, Bootp *rep)
+{
+	int n;
+	uchar gaddr[IPaddrlen], v6mask[IPaddrlen];
+	uchar v4mask[IPv4addrlen];
+	char buf[64];
+	static uchar zeroes[4];
+
+	/* gaddr is used as scratch here to validate our own address */
+	v4tov6(gaddr, rep->yiaddr);
+	if(!validip(gaddr))
+		return -1;
+
+	/* dig subnet mask, if any, out of options.  if none, guess. */
+	if(optgetaddr(rep->optdata, OBmask, v6mask)) {
+		v6tov4(v4mask, v6mask);
+		n = snprint(buf, sizeof buf, "add %V %M", rep->yiaddr, v4mask);
+	} else
+		n = snprint(buf, sizeof buf, "add %V 255.255.255.0", rep->yiaddr);
+
+	devtab[oe->ifcctl->type]->write(oe->ifcctl, buf, n, 0);
+
+	v4tov6(gaddr, rep->giaddr);
+	if(beprimary==1 && validip(gaddr) && !equivip4(rep->giaddr, zeroes))
+		adddefroute("/net", gaddr);
+	return 0;
+}
+
+/*
+ * open a new udp conversation via /net/udp/clone, leaving its ctl
+ * channel in oe->udpctl.  returns the conversation number read from
+ * the clone file.  panics if the open or read raises an error.
+ */
+static int
+openudp(Openeth *oe)
+{
+	int n;
+	char buf[16];
+	Chan *cc;
+
+	/* read clone file for conversation number */
+	if (waserror())
+		panic("openudp: can't open /net/udp/clone");
+	cc = enamecopen("/net/udp/clone", ORDWR);
+	oe->udpctl = cc;
+	n = devtab[cc->type]->read(cc, buf, sizeof buf - 1, 0);
+	poperror();
+	buf[n] = '\0';
+	return atoi(buf);
+}
+
+/*
+ * populate /net for controller oe->ctlrno: bind the ip device (#I%d)
+ * and the ether device (#l%d) after whatever is already there, then
+ * create and bind an ip interface.  errors are printed, not propagated.
+ */
+static void
+initbind(Openeth *oe)
+{
+	char buf[8];
+
+	if (waserror()) {
+		print("error while binding: %s\n", up->errstr);
+		return;
+	}
+	snprint(buf, sizeof buf, "#I%d", oe->ctlrno);
+	bind(buf, "/net", MAFTER);
+	snprint(buf, sizeof buf, "#l%d", oe->ctlrno);
+	bind(buf, "/net", MAFTER);
+	binddevip(oe);		/* re-raises on failure; caught above */
+	poperror();
+}
+
+/* close whichever of oe's udp ctl and data channels are open */
+static void
+closeudp(Openeth *oe)
+{
+	Chan *c;
+
+	if ((c = oe->udpctl) != nil) {
+		cclose(c);
+		oe->udpctl = nil;
+	}
+	if ((c = oe->udpdata) != nil) {
+		cclose(c);
+		oe->udpdata = nil;
+	}
+}
+
+/*
+ * announce on udp port `port' in headers mode and open the
+ * conversation's data file as oe->udpdata.  the ctl channel is closed
+ * again once the data file is open.  returns the conversation number.
+ * if the announce sequence raises an error, it is reported, the udp
+ * channels are closed and the error is passed on with nexterror.
+ */
+static int
+announce(Openeth *oe, char *port)
+{
+	int udpconv;
+	char buf[32];
+	static char hdrs[] = "headers";
+
+	while (waserror()) {
+		print("can't announce udp!*!%s: %s\n", port, up->errstr);
+		closeudp(oe);
+		nexterror();
+	}
+	udpconv = openudp(oe);
+	if (udpconv < 0)
+		panic("can't open udp conversation: %s", up->errstr);
+
+	/* headers is only effective after a udp announce */
+	snprint(buf, sizeof buf, "announce %s", port);
+	devtab[oe->udpctl->type]->write(oe->udpctl, buf, strlen(buf), 0);
+	devtab[oe->udpctl->type]->write(oe->udpctl, hdrs, sizeof hdrs - 1, 0);
+	poperror();
+
+	/* now okay to open the data file */
+	snprint(buf, sizeof buf, "/net/udp/%d/data", udpconv);
+	/*
+	 * we must use create, not open, to get Conv->rq and ->wq
+	 * allocated by udpcreate.
+	 */
+	oe->udpdata = enameccreate(buf, ORDWR);
+	cclose(oe->udpctl);
+	oe->udpctl = nil;
+	return udpconv;
+}
+
+/*
+ * read the rest of an open tftp file into the len-byte buffer va,
+ * NUL-terminating the result.  openread is the size of the first
+ * block, already sitting in tftpb->data.  data beyond len-1 bytes is
+ * dropped.  returns the number of bytes stored, or -1 if a read fails.
+ */
+static long
+tftprdfile(Openeth *oe, int openread, void* va, long len)
+{
+	int n;
+	char *p, *v;
+
+	n = openread;	/* have read this many bytes already into tftpb->data */
+	p = v = va;
+	len--;				/* leave room for NUL */
+	while(n > 0) {
+		if((p-v)+n > len)
+			n = len - (p-v);	/* clip to the space remaining */
+		memmove(p, tftpb->data, n);
+		p += n;
+		*p = 0;
+		if(n != segsize)	/* short (or clipped) block: we're done */
+			break;
+
+		if((n = tftpread(oe, &tftpserv, tftpb, segsize)) < 0)
+			return -1;
+	}
+	return p-v;
+}
+
+/*
+ * start a fresh udp conversation for a tftp transfer: close any old
+ * one, pick a new local port in [5000, 5000+20480) and either announce
+ * it (headers mode) or dial the server at rep->siaddr from that port.
+ * enters tftp phase (tftpphase = 1).
+ * returns 0 on success, -1 if an error was raised.
+ */
+static int
+newtftpconn(Openeth *oe, Bootp *rep)
+{
+	char num[16], dialstr[64];
+
+	if (waserror()) {
+		print("can't dial: %s\n", up->errstr);
+		return -1;
+	}
+	closeudp(oe);
+
+	tftpphase = 1;
+	tftpport = 5000 + nrand(20480);
+	snprint(num, sizeof num, "%d", tftpport);
+	if (Tftpusehdrs)
+		announce(oe, num);
+	else {
+		snprint(dialstr, sizeof dialstr, "/net/udp!%V!%d",
+			rep->siaddr, TFTPport);
+		oe->udpdata = chandial(dialstr, num, nil, nil);
+		oe->udpctl = nil;
+	}
+	poperror();
+	return 0;
+}
+
+/*
+ * phase 1: learn our ip configuration by bootp and install it.
+ * a temporary null-address interface and promiscuous mode are set up
+ * for the exchange and torn down again afterwards.
+ * returns 0 with the bootp reply in rep, or -1 if bootp failed.
+ */
+static int
+setipcfg(Openeth *oe, Bootp *rep)
+{
+	int r;
+
+	tftpphase = 0;		/* bootp uses udp pseudo-headers */
+	progress = 1;		/* print a dot per packet sent */
+
+	/* /net/iproute is unpopulated here; add at least broadcast */
+	minip4cfg(oe);
+	announce(oe, "68");
+	r = bootpbcast(oe, nil, rep);
+	closeudp(oe);
+	unminip4cfg(oe);
+	if(r < 0)
+		return -1;
+
+	ip4cfg(oe, rep);
+	if (Debug)
+		print("got & set ip config\n");
+	return 0;
+}
+
+/*
+ * phase 2: fetch /cfg/pxe/<our mac address> by tftp, hand it to dotini
+ * as plan9.ini, and work out which kernel to boot.  "bootfile" may be
+ * "path" or "etherN!path", optionally followed by arguments; if it is
+ * unset, the user is prompted.  fills in kp->edev and kp->bootfile
+ * and copies the boot line, with arguments, to BOOTLINE.
+ * returns 0 on success (or if kp->bootfile was already set), -1 on
+ * failure.
+ */
+static int
+getkernname(Openeth *oe, Bootp *rep, Kernname *kp)
+{
+	int n;
+	char *ini, *p;
+	char cfgpxe[32], buf[64];
+
+	if (kp->bootfile) {
+		print("getkernname: already have bootfile %s\n", kp->bootfile);
+		return 0;
+	}
+	if (newtftpconn(oe, rep) < 0)
+		return -1;
+
+	/* use our mac address instead of relying on a bootp answer */
+	snprint(cfgpxe, sizeof cfgpxe, "/cfg/pxe/%E", myea);
+	/*
+	 * use bootp answer (rep) to open cfgpxe.
+	 * reads first pkt of cfgpxe into tftpb->data.
+	 */
+	n = tftpopen(oe, cfgpxe, rep);
+	if (n < 0) {
+		print("\nfailed.\n");
+		return -1;
+	}
+	if (Debug)
+		print("\nopened %s\n", cfgpxe);	/* was "\o", an invalid escape */
+
+	ini = smalloc(2*BOOTARGSLEN);
+	/* starts by copying data from tftpb->data into ini */
+	n = tftprdfile(oe, n, ini, 2*BOOTARGSLEN);
+	if (n < 0) {
+		print("error reading %s\n", cfgpxe);
+		free(ini);
+		return -1;
+	}
+	print(" read %d bytes", n);
+
+	/*
+	 * take note of plan9.ini contents.  consumes ini to make config vars,
+	 * thus we can't free ini.
+	 */
+	dotini(ini);
+	i8250console();		/* configure serial port with defaults */
+	kp->edev = nil;
+	kp->bootfile = getconf("bootfile");
+	if (kp->bootfile == nil) {
+		getstr("\nBoot from:", buf, sizeof(buf), "ether0!/386/9pccpu",
+			60);
+		trimnl(buf);
+		kstrdup(&kp->bootfile, buf);
+	}
+	// print("kp->bootfile %s\n", kp->bootfile);
+	p = strchr(kp->bootfile, '!');
+	if (p != nil) {
+		/* split "etherN!path": edev keeps the old string, bootfile a copy */
+		kp->edev = kp->bootfile;
+		*p++ = '\0';
+		kp->bootfile = nil;	/* so kstrdup doesn't free edev's string */
+		kstrdup(&kp->bootfile, p);
+		if (strncmp(kp->edev, ethernm, sizeof ethernm - 1) != 0) {
+			print("bad ether device %s\n", kp->edev);
+			return -1;
+		}
+	}
+
+	/* pass arguments to kernels that can use them */
+	strecpy(BOOTLINE, BOOTLINE+BOOTLINELEN, kp->bootfile);
+	p = strchr(kp->bootfile, ' ');
+	if(p != nil)
+		*p = '\0';		/* bootfile itself is just the path */
+	return 0;
+}
+
+/* unbind and close oe's ip interface, if it has one */
+static void
+unbinddevip(Openeth *oe)
+{
+	Chan *ctl;
+	static char unbind[] = "unbind";
+
+	ctl = oe->ifcctl;
+	if (ctl == nil)
+		return;
+	devtab[ctl->type]->write(ctl, unbind, sizeof unbind - 1, 0);
+	cclose(ctl);
+	oe->ifcctl = nil;
+}
+
+/*
+ * phase 1: get our ip (v4) configuration via bootp, set new ip configuration.
+ * phase 2: load /cfg/pxe, parse it, extract kernel filename.
+ * phase 3: load kernel and jump to it.
+ */
+/*
+ * run the three boot phases on oe.  returns only if booting failed or
+ * the config file asked for a different ethernet (kp->edev names a
+ * controller other than oe->ctlrno); either way the udp conversation
+ * and ip interface are torn down first.
+ */
+static void
+tftpload(Openeth *oe, Kernname *kp)
+{
+	Bootp rep;
+	Boot boot;
+
+	if(waserror()) {
+		print("tftpload: %s\n", up->errstr);
+		closeudp(oe);
+		unbinddevip(oe);
+		return;
+	}
+
+	memset(&rep, 0, sizeof rep);
+	/* each phase runs only if the previous one succeeded */
+	if (setipcfg(oe, &rep) >= 0 &&
+	    getkernname(oe, &rep, kp) >= 0 &&
+	    (!kp->edev ||
+	     oe->ctlrno == strtol(kp->edev + sizeof ethernm - 1, 0, 10)) &&
+	    newtftpconn(oe, &rep) >= 0) {
+		memset(&boot, 0, sizeof boot);
+		boot.state = INITKERNEL;
+		tftpboot(oe, kp->bootfile, &rep, &boot);
+	}
+
+	/* we failed or bootfile asked for another ether */
+	poperror();
+	closeudp(oe);
+	unbinddevip(oe);
+}
+
+/*
+ * try to boot via pxe on ethernet controller eth.  sets up /net for
+ * that controller, runs tftpload and, if that returns (i.e., we did
+ * not boot), unmounts /net and releases the per-attempt state.
+ * always returns 0.
+ */
+static int
+etherload(int eth, Kernname *kp)
+{
+	Openeth *oe;
+
+	print("pxe on ether%d ", eth);
+	oe = smalloc(sizeof *oe);
+	memset(oe, 0, sizeof *oe);
+	oe->ctlrno = eth;
+	snprint(oe->ethname, sizeof oe->ethname, "ether%d", oe->ctlrno);
+	snprint(oe->netethname, sizeof oe->netethname, "/net/ether%d",
+		oe->ctlrno);
+	initbind(oe);
+
+	tftpload(oe, kp);
+
+	/* failed to boot; keep going */
+	unmount(nil, "/net");
+
+	/*
+	 * we are called again on every retry, so don't leak the state.
+	 * tftpload has already closed the udp and ifc channels; the
+	 * ether ctl channel can survive if removing the :: address failed.
+	 */
+	if (oe->ethctl) {
+		cclose(oe->ethctl);
+		oe->ethctl = nil;
+	}
+	free(oe);
+	return 0;
+}
+
+/*
+ * count ethernet interfaces by attaching ether0, ether1, ... until an
+ * attach fails (raises an error or returns nil), printing each number
+ * that works.  returns the count.
+ */
+static int
+nethers(void)
+{
+	int neth;
+	char num[4];
+	Chan *cc;
+
+	/* count interfaces */
+	print("attaching ethers:");
+	for (neth = 0; ; neth++) {
+		cc = nil;
+		if (waserror()) {		/* no more interfaces */
+			if (cc)
+				cclose(cc);
+			break;
+		}
+
+		snprint(num, sizeof num, "%d", neth);
+		cc = etherattach(num);
+		if (cc)
+			cclose(cc);		/* only probing; don't keep it */
+		poperror();
+		if (cc == nil)
+			break;			/* no more interfaces */
+		print(" %d", neth);
+	}
+	print("\n");
+	return neth;
+}
+
+/*
+ * kernel process that boots over the network.  never returns: it
+ * cycles through the ethernets until one of them boots a kernel.
+ * an error raised anywhere below is printed and, after a 30 second
+ * pause, the loop is entered afresh.
+ */
+void
+bootloadproc(void *)
+{
+	int eth, neth;
+	Kernname kernnm;
+
+	neth = nethers();
+	if(neth <= 0) {
+		print("error counting interfaces, assuming 1\n");
+		neth = 1;
+	}
+
+	srand(TK2MS(m->ticks));			/* for local port numbers */
+	nrand(20480);				/* 1st # is always 0; toss it */
+	kernnm.edev = kernnm.bootfile = nil;
+
+	while(waserror()) {
+		print("%s\n", up->errstr);
+		tsleep(&up->sleep, return0, 0, 30*1000);
+	}
+	for (;;) {
+		/* try each interface in turn: first get /cfg/pxe file */
+		for (eth = 0; eth < neth && kernnm.edev == nil; eth++)
+			etherload(eth, &kernnm);
+		/* the config file named a specific ether: boot from that one */
+		if (kernnm.edev != nil) {
+			eth = strtol(kernnm.edev + sizeof ethernm - 1, 0, 10);
+			etherload(eth, &kernnm);
+		}
+		/*
+		 * couldn't boot on any ether.  don't give up;
+		 * perhaps the boot servers are down, so try again later.
+		 */
+		print("failed to boot via pxe; will try again.\n");
+		tsleep(&up->sleep, return0, 0, 15*1000);
+	}
+}

+ 96 - 0
sys/src/9/pcboot/rand.c

@@ -0,0 +1,96 @@
+/*
+ * libc pseudo-random number generators, but without locks.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+
+/* stubs to emulate random.c */
+/* stub: ignores its arguments and returns 0 */
+ulong
+randomread(void *, ulong)
+{
+	return 0;
+}
+
+/* stub: does nothing */
+void
+randominit(void)
+{
+}
+
+/*
+ *	algorithm by
+ *	D. P. Mitchell & J. A. Reeds
+ */
+
+#define	LEN	607
+#define	TAP	273
+#define	MASK	0x7fffffffL
+#define	A	48271
+#define	M	2147483647
+#define	Q	44488
+#define	R	3399
+#define	NORM	(1.0/(1.0+MASK))
+
+static	ulong	rng_vec[LEN];
+static	ulong*	rng_tap = rng_vec;
+static	ulong*	rng_feed = 0;
+
+/*
+ * seed the additive generator: reset the tap and feed pointers and
+ * fill rng_vec from a linear congruential sequence started at seed.
+ */
+static void
+isrand(long seed)
+{
+	enum { Warmup = 20 };		/* values discarded before filling */
+	long v;
+	int k;
+
+	rng_tap = rng_vec;
+	rng_feed = rng_vec+LEN-TAP;
+
+	/* force the seed into 1..M-1 */
+	seed = seed%M;
+	if(seed < 0)
+		seed += M;
+	if(seed == 0)
+		seed = 89482311;
+
+	/*
+	 *	Initialize by x[n+1] = 48271 * x[n] mod (2**31 - 1),
+	 *	computed from x%Q and x/Q.
+	 */
+	v = seed;
+	for(k = 0; k < Warmup+LEN; k++) {
+		v = A*(v % Q) - R*(v / Q);
+		if(v < 0)
+			v += M;
+		if(k >= Warmup)
+			rng_vec[k-Warmup] = v;
+	}
+}
+
+/* seed the generator used by lrand; a thin wrapper around isrand */
+void
+srand(long seed)
+{
+	isrand(seed);
+}
+
+/*
+ * return the next pseudo-random number, masked with MASK.
+ * the generator seeds itself with 1 if srand was never called.
+ */
+long
+lrand(void)
+{
+	ulong sum;
+
+	/* step the tap pointer back, wrapping at the start of the vector */
+	if(--rng_tap < rng_vec) {
+		if(rng_feed == 0) {		/* never seeded */
+			isrand(1);
+			rng_tap--;
+		}
+		rng_tap += LEN;
+	}
+	/* likewise for the feed pointer */
+	if(--rng_feed < rng_vec)
+		rng_feed += LEN;
+
+	sum = (*rng_feed + *rng_tap) & MASK;
+	*rng_feed = sum;
+	return sum;
+}

+ 93 - 0
sys/src/9/pcboot/realmode.c

@@ -0,0 +1,93 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"../port/error.h"
+
+/*
+ * Back the processor into real mode to run a BIOS call,
+ * then return.  This must be used carefully, since it 
+ * completely disables hardware interrupts (e.g., the i8259)
+ * while running.  It is *not* using VM86 mode. 
+ * Maybe that's really the right answer, but real mode
+ * is fine for now.  We don't expect to use this very much --
+ * just for BIOS INT 13 disk i/o.
+ */
+
+void realmode0(void);		/* in realmode0.s */
+void realmodeintrinst(void);	/* in realmode0.s */
+void realmodeend(void);		/* in realmode0.s */
+
+extern ushort rmseg;		/* in realmode0.s */
+
+static Ureg rmu;
+static QLock rmlock;
+static int beenhere;
+
+/*
+ * run the software interrupt ureg->trap in real mode with the
+ * registers in *ureg, and copy the resulting registers back to *ureg.
+ * registers travel via the fixed low-memory area RMUADDR, and the
+ * interrupt number is patched into the INT instruction at
+ * realmodeintrinst.  serialised by rmlock.
+ */
+void
+realmode(Ureg *ureg)
+{
+	int s, sz;
+	ulong cr3;
+	uchar *ip;
+
+	qlock(&rmlock);
+	if (!beenhere)
+		iprint("into bios in real mode...");
+	*(Ureg *)RMUADDR = *ureg;
+
+	/*
+	 * in pxe-loaded bootstraps, the l.s real-mode code is already
+	 * below 64K, but for pbs-loaded bootstraps, we need to copy it there.
+	 */
+	ip = (void *)realmodeintrinst;		/* the INT instruction */
+	ip[1] = ureg->trap;			/* insert INT number */
+	coherence();
+	if ((uintptr)KTZERO == KZERO+PXEBASE)	/* pxe-loaded? */
+		rmseg = 0;			/* into JMPFAR instr. */
+	else {
+		/* copy l.s so that it can be run from 16-bit mode */
+		sz = (char *)realmodeend - (char *)KTZERO;
+		if (sz > RMSIZE)
+			panic("RMCODE < %d bytes", sz);
+		rmseg = (RMCODE - KZERO) >> 4;	/* into JMPFAR instr. */
+		memmove((void*)RMCODE, (void*)KTZERO, sz);
+	}
+	coherence();
+
+	s = splhi();
+	m->pdb[PDX(0)] = m->pdb[PDX(KZERO)];	/* identity map low */
+	cr3 = getcr3();			/* remember the current page table base */
+	putcr3(PADDR(m->pdb));
+	if (arch)
+		arch->introff();
+	else
+		i8259off();
+
+	realmode0();
+	splhi();				/* who knows what the bios did */
+
+	if(m->tss){
+		/*
+		 * Called from memory.c before initialization of mmu.
+		 * Don't turn interrupts on before the kernel is ready!
+		 */
+		if (arch)
+			arch->intron();
+		else
+			i8259on();
+	}
+	m->pdb[PDX(0)] = 0;	/* remove low mapping */
+	putcr3(cr3);
+	splx(s);
+
+	*ureg = *(Ureg *)RMUADDR;
+	if (!beenhere) {
+		beenhere = 1;
+		iprint("and back\n");
+	}
+	qunlock(&rmlock);
+}

+ 272 - 0
sys/src/9/pcboot/realmode0.s

@@ -0,0 +1,272 @@
+#include "mem.h"
+#include "/sys/src/boot/pc/x16.h"
+#undef DELAY
+
+#define PADDR(a)	((a) & ~KZERO)
+#define KADDR(a)	(KZERO|(a))
+
+/*
+ * Some machine instructions not handled by 8[al].
+ */
+#define OP16		BYTE $0x66
+#define DELAY		BYTE $0xEB; BYTE $0x00	/* JMP .+2 */
+#define CPUID		BYTE $0x0F; BYTE $0xA2	/* CPUID, argument in AX */
+#define WRMSR		BYTE $0x0F; BYTE $0x30	/* WRMSR, argument in AX/DX (lo/hi) */
+#define RDTSC 		BYTE $0x0F; BYTE $0x31	/* RDTSC, result in AX/DX (lo/hi) */
+#define RDMSR		BYTE $0x0F; BYTE $0x32	/* RDMSR, result in AX/DX (lo/hi) */
+#define HLT		BYTE $0xF4
+#define INVLPG	BYTE $0x0F; BYTE $0x01; BYTE $0x39	/* INVLPG (%ecx) */
+#define WBINVD	BYTE $0x0F; BYTE $0x09
+
+/*
+ * Macros for calculating offsets within the page directory base
+ * and page tables. Note that these are assembler-specific hence
+ * the '<<2'.
+ */
+#define PDO(a)		(((((a))>>22) & 0x03FF)<<2)
+#define PTO(a)		(((((a))>>12) & 0x03FF)<<2)
+
+TEXT m0rgdtptr(SB), $0		/* GDT pointer using the physical address; loaded by physcode */
+	WORD	$(NGDT*8-1)		/* limit: NGDT 8-byte descriptors */
+	LONG	$(CPU0GDT-KZERO)	/* base: CPU0GDT with KZERO subtracted */
+
+TEXT m0gdtptr(SB), $0		/* GDT pointer using the KZERO address; reloaded by again32kzero */
+	WORD	$(NGDT*8-1)		/* limit: NGDT 8-byte descriptors */
+	LONG	$CPU0GDT		/* base */
+
+TEXT m0idtptr(SB), $0		/* protected-mode IDT pointer; reloaded by again32kzero */
+	WORD $(256*8-1)			/* limit: 256 8-byte gates */
+	LONG $IDTADDR			/* base */
+
+/*
+ * Save registers.
+ * Pushes AX, BX, CX, DX, BP, DI, SI and the flags, then exchanges
+ * the return PC with the saved flags so that RET goes back to the
+ * caller with those eight words still on the stack; restoreregs
+ * pops them again.  Note that the dummy POPLs below load AX, so
+ * the AX value that gets pushed is not the caller's.
+ */
+TEXT saveregs(SB), $0
+	/* appease 8l */
+	SUBL $32, SP
+	POPL AX
+	POPL AX
+	POPL AX
+	POPL AX
+	POPL AX
+	POPL AX
+	POPL AX
+	POPL AX
+	
+	PUSHL	AX
+	PUSHL	BX
+	PUSHL	CX
+	PUSHL	DX
+	PUSHL	BP
+	PUSHL	DI
+	PUSHL	SI
+	PUSHFL
+
+	XCHGL	32(SP), AX	/* swap return PC and saved flags */
+	XCHGL	0(SP), AX
+	XCHGL	32(SP), AX	/* third exchange also puts AX back */
+	RET
+
+TEXT restoreregs(SB), $0	/* inverse of saveregs: pop the flags and seven registers it left on the stack */
+	/* appease 8l */
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	PUSHL	AX
+	ADDL	$32, SP
+	
+	XCHGL	32(SP), AX	/* swap return PC and saved flags */
+	XCHGL	0(SP), AX
+	XCHGL	32(SP), AX
+
+	POPFL
+	POPL	SI
+	POPL	DI
+	POPL	BP
+	POPL	DX
+	POPL	CX
+	POPL	BX
+	POPL	AX
+	RET			/* return PC now sits where the flags were saved */
+
+/*
+ * Assumed to be in protected mode at time of call.
+ * Switch to real mode, execute an interrupt, and
+ * then switch back to protected mode.  
+ *
+ * Assumes:
+ *	- no device interrupts are going to come in
+ *	- 0-16MB is identity mapped in page tables
+ *	- l.s real-mode code is in low memory already but
+ *	  may need to be copied into the first 64K (if loaded by pbs)
+ *	- can use code segment rmseg in real mode to get at l.s code
+ *	- the above Chinese puzzle pretty much forces RMUADDR to be 0x8000 or 0
+ *	  and rmseg to be 0x800 or 0.
+ */
+
+TEXT realmodeidtptr(SB), $0	/* IDT pointer loaded by physcode before leaving protected mode */
+	WORD	$(4*256-1)		/* limit: 256 4-byte entries */
+	LONG	$0			/* base: address 0 */
+
+TEXT realmode0(SB), $0		/* entry point called from realmode() in C */
+	CALL	saveregs(SB)		/* matched by CALL restoreregs in again32kzero */
+
+	/* switch to low code address */
+	LEAL	physcode-KZERO(SB), AX
+	JMP *AX				/* continue at physcode, at its address minus KZERO */
+
+TEXT physcode(SB), $0		/* reached from realmode0 at the low address */
+
+	/* switch to low stack */
+	MOVL	SP, AX
+	MOVL	$RMSTACK, SP
+	PUSHL	AX			/* old SP is kept at RMSTACK-4 until again32kzero pops it */
+	
+	/* paranoia: make sure modified INT & JMPFAR instr.s are seen below */
+	BYTE $0x0f; BYTE $0xae; BYTE $0xf8	/* SFENCE */
+	BYTE $0x0f; BYTE $0xae; BYTE $0xe8	/* LFENCE */
+	BYTE $0x0f; BYTE $0xae; BYTE $0xf0	/* MFENCE */
+
+	/* change gdt to physical pointer */
+	MOVL	m0rgdtptr-KZERO(SB), GDTR
+
+	/* load IDT with real-mode version*/
+	MOVL	realmodeidtptr-KZERO(SB), IDTR
+
+	/* disable paging */
+	MOVL	CR0, AX
+	ANDL	$(~PG), AX
+	MOVL	AX, CR0
+	/* JMP .+2 to clear prefetch queue*/
+	BYTE $0xEB; BYTE $0x00
+
+	/* jump to 16-bit code segment */
+/*	JMPFAR	SELECTOR(KESEG16, SELGDT, 0):$again16bit(SB) /**/
+	 BYTE	$0xEA			/* hand-assembled form of the JMPFAR shown above */
+	 LONG	$again16bit-KZERO(SB)
+	 WORD	$SELECTOR(KESEG16, SELGDT, 0)
+
+TEXT again16bit(SB), $0
+	/*
+	 * Now in 16-bit compatibility mode.
+	 * These are 32-bit instructions being interpreted
+	 * as 16-bit instructions.  I'm being lazy and
+	 * not using the macros because I know when
+	 * the 16- and 32-bit instructions look the same
+	 * or close enough.
+	 */
+
+	/* disable protected mode and jump to real mode cs */
+	OPSIZE; MOVL CR0, AX
+	OPSIZE; XORL BX, BX
+	OPSIZE; INCL BX			/* BX = 1 */
+	OPSIZE; XORL BX, AX		/* flip bit 0 of the CR0 value */
+	OPSIZE; MOVL AX, CR0
+
+	/* JMPFAR rmseg:now16real */
+	 BYTE $0xEA
+	 WORD	$now16real-KZERO(SB)
+TEXT rmseg(SB), $0		/* segment word of the JMPFAR above; written by realmode() before each call */
+	 WORD	$0
+
+TEXT now16real(SB), $0		/* target of the JMPFAR rmseg:now16real in again16bit */
+	/* copy the registers for the bios call */
+	CLR(rAX)
+	MTSR(rAX, rSS)			/* 0000 -> rSS */
+	LWI((RMSTACK-4), rSP)		/* preserve AX pushed in physcode */
+
+	LWI((RMUADDR-KZERO), rBP)	/* BP -> the Ureg that realmode() stored at RMUADDR */
+
+	/* offsets are in Ureg */
+	LXW(44, xBP, rAX)
+	MOVW	AX, DS			/* Ureg offset 44 supplies DS */
+	LXW(40, xBP, rAX)
+	MOVW	AX, ES			/* Ureg offset 40 supplies ES */
+
+	OPSIZE; LXW(0, xBP, rDI)
+	OPSIZE; LXW(4, xBP, rSI)
+	OPSIZE; LXW(16, xBP, rBX)
+	OPSIZE; LXW(20, xBP, rDX)
+	OPSIZE; LXW(24, xBP, rCX)
+	OPSIZE; LXW(28, xBP, rAX)
+
+	CLC
+
+	/* assume that SP and SS persist across INT; execution falls through into realmodeintrinst */
+
+TEXT realmodeintrinst(SB), $0	/* label on the INT so that realmode() can patch its second byte */
+	INT $0x00		/* the $0x00 is overwritten with ureg->trap by realmode() */
+	CLI			/* who knows what evil the bios got up to */
+	/* save the registers after the call */
+
+//	CLR(rBP)
+//	MTSR(rBP, rSS)			/* 0000 -> rSS */
+//	LWI((RMSTACK-4), rSP)
+
+	OPSIZE; PUSHFL			/* flags as left by the BIOS; popped and stored at offset 64 below */
+	OPSIZE; PUSHL AX		/* AX is needed as scratch; its value is popped and stored at offset 28 */
+
+	LWI((RMUADDR-KZERO), rBP)
+	OPSIZE; SXW(rDI, 0, xBP)
+	OPSIZE; SXW(rSI, 4, xBP)
+	OPSIZE; SXW(rBX, 16, xBP)
+	OPSIZE; SXW(rDX, 20, xBP)
+	OPSIZE; SXW(rCX, 24, xBP)
+	OPSIZE; POPL AX
+	OPSIZE; SXW(rAX, 28, xBP)
+
+	MOVW	DS, AX
+	OPSIZE; SXW(rAX, 44, xBP)
+	MOVW	ES, AX
+	OPSIZE; SXW(rAX, 40, xBP)
+
+	OPSIZE; POPL AX
+	OPSIZE; SXW(rAX, 64, xBP)	/* flags */
+
+	/* re-enter protected mode and jump to 32-bit code */
+	OPSIZE; MOVL $1, AX
+	OPSIZE; MOVL AX, CR0		/* CR0 = 1: only bit 0 set */
+	
+/*	JMPFAR	SELECTOR(KESEG, SELGDT, 0):$again32bit(SB) /**/
+	 OPSIZE
+	 BYTE $0xEA			/* hand-assembled form of the JMPFAR shown above */
+	 LONG	$again32bit-KZERO(SB)
+	 WORD	$SELECTOR(KESEG, SELGDT, 0)
+
+TEXT again32bit(SB), $0		/* 32-bit protected mode again, paging still off */
+	MOVW	$SELECTOR(KDSEG, SELGDT, 0),AX	/* reload every data segment register with KDSEG */
+	MOVW	AX,DS
+	MOVW	AX,SS
+	MOVW	AX,ES
+	MOVW	AX,FS
+	MOVW	AX,GS
+
+	/* enable paging and jump to kzero-address code */
+	MOVL	CR0, AX
+	ORL	$(PG|0x10000), AX	/* PG|WP */
+	MOVL	AX, CR0
+	LEAL	again32kzero(SB), AX
+	JMP*	AX
+
+TEXT again32kzero(SB), $0
+	/* breathe a sigh of relief - back in 32-bit protected mode */
+
+	/* switch to old stack */	
+	PUSHL	AX	/* match popl below for 8l */
+	MOVL	$(RMSTACK-4), SP
+	POPL	SP	/* SP = the value physcode pushed at RMSTACK-4 */
+
+	/* restore idt */
+	MOVL	m0idtptr(SB),IDTR
+
+	/* restore gdt */
+	MOVL	m0gdtptr(SB), GDTR
+
+	CALL	restoreregs(SB)		/* matches CALL saveregs in realmode0 */
+	RET				/* back to the caller of realmode0 */
+
+TEXT realmodeend(SB), $0	/* end marker: realmode() copies the bytes from KTZERO up to here */

+ 182 - 0
sys/src/9/pcboot/sdbios.c

@@ -0,0 +1,182 @@
+/*
+ * read-only sd driver for BIOS devices with partitions.
+ * will probably only work with bootstrap kernels, as the normal kernel
+ * deals directly with the clock and disk controllers, which seems
+ * to confuse many BIOSes.
+ *
+ * devbios must be initialised first and no disks may be accessed
+ * via non-BIOS means.
+ */
+
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"pool.h"
+#include	"../port/error.h"
+#include	"../port/netif.h"
+#include	"../port/sd.h"
+#include	"dosfs.h"
+#include	<disk.h>
+
+long	biosread0(Bootfs *, void *, long);
+vlong	biosseek(Bootfs *fs, vlong off);
+
+extern SDifc sdbiosifc;
+extern int biosndevs;
+
+/*
+ * store the low 32 bits of ul at p, most significant byte first.
+ * returns the address just past the four bytes written.
+ */
+uchar *
+putbeul(ulong ul, uchar *p)
+{
+	int shift;
+
+	for(shift = 24; shift >= 0; shift -= 8)
+		*p++ = ul >> shift;
+	return p;
+}
+
+/*
+ * store the 64-bit value uvl at p, most significant byte first.
+ * returns the address just past the eight bytes written.
+ */
+uchar *
+putbeuvl(uvlong uvl, uchar *p)
+{
+	int shift;
+
+	for(shift = 56; shift >= 0; shift -= 8)
+		*p++ = uvl >> shift;
+	return p;
+}
+
+/* verify: report 1 once devbios has been initialised, 0 before that */
+int
+biosverify(SDunit* )
+{
+	return biosinited? 1: 0;
+}
+
+/*
+ * online: fill in unit->secsize and unit->sectors from the BIOS.
+ * returns 1 if the unit is usable; returns 0 if unit is nil, the
+ * sector size is implausible or the medium has no sectors.
+ * panics if devbios has not been initialised.
+ */
+int
+biosonline(SDunit* unit)
+{
+	uint subno;
+
+	if (!biosinited)
+		panic("sdbios: biosonline: sdbios not inited");
+	if (unit == nil)
+		return 0;
+	/* unit must not be dereferenced until it is known to be non-nil */
+	subno = unit->subno;
+	unit->secsize = biossectsz(subno);
+	if (unit->secsize <= 0) {
+		print("sdbios: biosonline: implausible sector size on medium\n");
+		return 0;
+	}
+	unit->sectors = biossize(subno);
+	if (unit->sectors <= 0) {
+		unit->sectors = 0;
+		print("sdbios: biosonline: no sectors on medium\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * rio: carry out the SCSI command in r->cmd using the BIOS.
+ * Extended (10-byte) reads and the two read-capacity commands do
+ * real work; writes fail with SDeio; any other command is ignored
+ * and reports SDok.  r->rlen is set to the number of bytes placed
+ * in r->data, and r->data itself is left pointing at the caller's
+ * buffer.  Returns SDeio if devbios has not been initialised.
+ */
+static int
+biosrio(SDreq* r)
+{
+	int nb;
+	long got;
+	vlong off;
+	uchar *p;
+	Bootfs fs;			/* just for fs->dev, which is zero */
+	SDunit *unit;
+
+	if (!biosinited)
+		return SDeio;
+	unit = r->unit;
+	/*
+	 * Most SCSI commands can be passed unchanged except for
+	 * the padding on the end. The few which require munging
+	 * are not used internally. Mode select/sense(6) could be
+	 * converted to the 10-byte form but it's not worth the
+	 * effort. Read/write(6) are easy.
+	 */
+	r->rlen = 0;
+	r->status = SDok;
+	switch(r->cmd[0]){
+	case ScmdRead:
+	case ScmdExtread:
+		if (r->cmd[0] == ScmdRead)
+			panic("biosrio: ScmdRead read op");
+		/*
+		 * assemble the big-endian 32-bit LBA unsigned: computing
+		 * cmd[2]<<24 as an int would sign-extend into off for
+		 * block addresses at or beyond 2^31.
+		 */
+		off = (uvlong)r->cmd[2]<<24 | r->cmd[3]<<16 | r->cmd[4]<<8 | r->cmd[5];
+		nb =  r->cmd[7]<<8  | r->cmd[8];	/* often 4 */
+		USED(nb);		/* is nb*unit->secsize == r->dlen? */
+		memset(&fs, 0, sizeof fs);
+		biosseek(&fs, off * unit->secsize);
+		got = biosread0(&fs, r->data, r->dlen);
+		if (got < 0)
+			r->status = SDeio;
+		else
+			r->rlen = got;
+		break;
+	case ScmdWrite:
+	case ScmdExtwrite:
+		r->status = SDeio;	/* boot programs don't write */
+		break;
+
+		/*
+		 * Read capacity returns the LBA of the last sector.
+		 * The reply is built through p so that r->data keeps
+		 * pointing at the start of the caller's buffer, and
+		 * r->rlen records how much of it was filled in.
+		 */
+	case ScmdRcapacity:
+		p = putbeul(r->unit->sectors - 1, r->data);
+		p = putbeul(r->unit->secsize, p);
+		r->rlen = p - (uchar*)r->data;
+		return SDok;
+	case ScmdRcapacity16:
+		p = putbeuvl(r->unit->sectors - 1, r->data);
+		p = putbeul(r->unit->secsize, p);
+		r->rlen = p - (uchar*)r->data;
+		return SDok;
+	/* ignore others */
+	}
+	return r->status;
+}
+
+/*
+ * pnp: called between biosreset and biosinit.
+ * allocates the single SDev (id 'B') covering all biosndevs BIOS
+ * units; returns nil if the allocation fails.
+ * panics if devbios has not been initialised yet.
+ */
+static SDev*
+biospnp(void)
+{
+	SDev *sdev;
+
+	if (!biosinited)
+		panic("sdbios: biospnp: bios devbios not yet inited");
+
+	sdev = malloc(sizeof(SDev));
+	if(sdev == nil)
+		return nil;
+
+	sdev->ifc = &sdbiosifc;
+	sdev->idno = 'B';
+	sdev->nunit = biosndevs;
+	iprint("sdbios: biospnp: %d unit(s) at sd%C0\n",
+		sdev->nunit, sdev->idno);
+	return sdev;
+}
+
+SDifc sdbiosifc = {			/* method table for the BIOS driver; unsupported operations are nil */
+	"bios",				/* name */
+
+	biospnp,			/* pnp */
+	nil,				/* legacy */
+	nil,				/* enable */
+	nil,				/* disable */
+
+	biosverify,			/* verify */
+	biosonline,			/* online */
+	biosrio,			/* rio */
+	nil,				/* rctl */
+	nil,				/* wctl */
+
+	scsibio,			/* bio */
+	nil,				/* probe */
+	nil,				/* clear */
+	nil,				/* rtopctl */
+	nil,				/* wtopctl */
+};

+ 400 - 0
sys/src/9/pcboot/stub.c

@@ -0,0 +1,400 @@
+/*
+ * stubs to make bootstrap kernels link, copies of a few functions
+ * to avoid including system calls yet have access to i/o functions,
+ * and some convenience routines.
+ */
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+void (*proctrace)(Proc*, int, vlong);
+
+/* devmnt.c */
+
+void
+muxclose(Mnt *)		/* stub: does nothing */
+{
+}
+
+Chan*
+mntauth(Chan *, char *)		/* stub: always returns nil */
+{
+	return nil;
+}
+
+long
+mntversion(Chan *, char *, int, int)	/* stub: always returns 0 */
+{
+	return 0;
+}
+
+/* swap.c */
+
+Image swapimage;
+
+void
+putswap(Page *)		/* stub: does nothing */
+{
+}
+
+void
+dupswap(Page *)		/* stub: does nothing */
+{
+}
+
+void
+kickpager(void)		/* stub: does nothing */
+{
+}
+
+int
+swapcount(ulong)	/* stub: always returns 0 */
+{
+	return 0;
+}
+
+void
+pagersummary(void)	/* stub: does nothing */
+{
+}
+
+void
+setswapchan(Chan *)	/* stub: does nothing */
+{
+}
+
+/* devenv.c */
+
+void
+closeegrp(Egrp *)	/* stub: does nothing */
+{
+}
+
+void
+ksetenv(char *, char *, int)	/* stub: the setting is discarded */
+{
+}
+
+/* devproc.c */
+
+/*
+ * build a new SG_TEXT segment with the same base, size, image and
+ * file extent (fstart, flen) as s.  The image gains a reference and
+ * the new segment is marked flushme.
+ */
+Segment*
+data2txt(Segment *s)
+{
+	Segment *text;
+
+	text = newseg(SG_TEXT, s->base, s->size);
+	text->flushme = 1;
+	text->fstart = s->fstart;
+	text->flen = s->flen;
+	text->image = s->image;
+	incref(text->image);
+	return text;
+}
+
+/* sysproc.c */
+
+int
+return0(void*)		/* always returns 0, whatever the argument */
+{
+	return 0;
+}
+
+/* syscallfmt.c */
+void
+syscallfmt(int, ulong, va_list)		/* stub: does nothing */
+{
+}
+
+void
+sysretfmt(int, va_list, long, uvlong, uvlong)	/* stub: does nothing */
+{
+}
+
+/* sysfile.c */
+
+int
+newfd(Chan *)		/* stub: always fails, returning -1 */
+{
+	return -1;
+}
+
+/*
+ * check the n-byte stat buffer at s: raise Ebadstat if statcheck
+ * rejects it, then pass the name it carries to validname unless the
+ * name is "/" or is too long for the local buffer.
+ */
+void
+validstat(uchar *s, int n)
+{
+	int len;
+	char name[64];
+
+	if(statcheck(s, n) < 0)
+		error(Ebadstat);
+
+	/* step to the 16-bit count in front of the first string, the name */
+	s += STATFIXLEN - 4*BIT16SZ;
+	len = GBIT16(s);
+	s += BIT16SZ;
+
+	/*
+	 * a name that doesn't fit is left for the server to judge;
+	 * the check is only for its protection, and handling long
+	 * names here would need an allocation and a waserror.
+	 */
+	if(len+1 > sizeof name)
+		return;
+	memmove(name, s, len);
+	name[len] = '\0';
+
+	/* the name could be '/', which is not given to validname */
+	if(strcmp(name, "/") == 0)
+		return;
+	validname(name, 0);
+}
+
+/*
+ * map file descriptor fd of the current process to its Chan.
+ * raises Ebadfd if fd is out of range or not open.
+ * raises Ebadusefd if chkmnt is set and the Chan has CMSG set, or
+ * if mode (when non-negative) is incompatible with the mode the
+ * Chan was opened with; a Chan opened ORDWR satisfies any mode.
+ * if iref is set, the Chan is returned with an extra reference,
+ * which is dropped again before Ebadusefd is raised.
+ */
+Chan*
+fdtochan(int fd, int mode, int chkmnt, int iref)
+{
+	Chan *c;
+	Fgrp *f;
+
+	f = up->fgrp;
+
+	lock(f);
+	c = nil;
+	if(fd >= 0 && fd < f->nfd)
+		c = f->fd[fd];
+	if(c == nil){
+		unlock(f);
+		error(Ebadfd);
+	}
+	if(iref)
+		incref(c);
+	unlock(f);
+
+	if(chkmnt && (c->flag&CMSG))
+		goto Baduse;
+	if(mode<0 || c->mode==ORDWR)
+		return c;
+	if(((mode&OTRUNC) && c->mode==OREAD) || (mode&~OTRUNC) != c->mode)
+		goto Baduse;
+	return c;
+
+Baduse:
+	if(iref)
+		cclose(c);
+	error(Ebadusefd);
+	return c;	/* not reached: error does not return */
+}
+
+/*
+ * reduce open flags o to an access mode: OTRUNC, OCEXEC and ORCLOSE
+ * are dropped, OEXEC becomes OREAD, and anything above OEXEC raises
+ * Ebadarg.
+ */
+int
+openmode(ulong o)
+{
+	ulong acc;
+
+	acc = o & ~(OTRUNC|OCEXEC|ORCLOSE);
+	if(acc > OEXEC)
+		error(Ebadarg);
+	return acc == OEXEC? OREAD: acc;
+}
+
+int
+bind(char *old, char *new, int flag)	/* returns cmount's result, or -1 if an error is raised once old has been resolved */
+{
+	int ret;
+	Chan *c0, *c1;
+
+	if((flag&~MMASK) || (flag&MORDER)==(MBEFORE|MAFTER))
+		error(Ebadarg);		/* raised to the caller: no waserror is in place yet */
+
+	c0 = namec(old, Abind, 0, 0);	/* an error here also goes to the caller */
+	if(waserror()){
+		cclose(c0);
+		return -1;
+	}
+
+	c1 = namec(new, Amount, 0, 0);
+	if(waserror()){
+		cclose(c1);
+		nexterror();		/* on to the handler above, which closes c0 */
+	}
+
+	ret = cmount(&c0, c1, flag, nil);
+
+	poperror();
+	cclose(c1);
+	poperror();
+	cclose(c0);
+	return ret;
+}
+
+long
+unmount(char *name, char *old)	/* returns 0, or -1 if an error is raised once old has been resolved */
+{
+	Chan *cmount, *cmounted;
+
+	cmounted = 0;
+	cmount = namec(old, Amount, 0, 0);	/* an error here goes to the caller */
+	if(waserror()) {
+		cclose(cmount);
+		if(cmounted)
+			cclose(cmounted);
+		return -1;
+	}
+
+	if(name)
+		/*
+		 * This has to be namec(..., Aopen, ...) because
+		 * if name is something like /srv/cs or /fd/0,
+		 * opening it is the only way to get at the real
+		 * Chan underneath.
+		 */
+		cmounted = namec(name, Aopen, OREAD, 0);
+	cunmount(cmount, cmounted);	/* cmounted stays nil when name is nil */
+	poperror();
+	cclose(cmount);
+	if(cmounted)
+		cclose(cmounted);
+	return 0;
+}
+
+long
+chdir(char *dir)	/* make dir the current directory (up->dot); returns 0, or -1 if namec raises an error */
+{
+	Chan *c;
+
+	if (waserror())
+		return -1;
+	c = namec(dir, Atodir, 0, 0);
+	if (up->dot)
+		cclose(up->dot);	/* release the previous directory, if any */
+	up->dot = c;
+	poperror();
+	return 0;
+}
+
+Chan *
+namecopen(char *name, int mode)	/* namec(..., Aopen, ...) that returns nil instead of raising an error */
+{
+	Chan *c;
+
+	if (waserror())
+		return nil;
+	c = namec(name, Aopen, mode, 0);
+	poperror();
+	return c;
+}
+
+/* like namecopen, but panics instead of returning nil */
+Chan *
+enamecopen(char *name, int mode)
+{
+	Chan *c;
+
+	if ((c = namecopen(name, mode)) == nil)
+		panic("can't open %s", name);
+	return c;
+}
+
+Chan *
+nameccreate(char *name, int mode)	/* namec(..., Acreate, ...) that returns nil instead of raising an error */
+{
+	Chan *c;
+
+	if (waserror())
+		return nil;
+	c = namec(name, Acreate, mode, 0);	/* the final argument passed is 0 */
+	poperror();
+	return c;
+}
+
+/* like nameccreate, but panics instead of returning nil */
+Chan *
+enameccreate(char *name, int mode)
+{
+	Chan *c;
+
+	if ((c = nameccreate(name, mode)) == nil)
+		panic("can't create %s", name);
+	return c;
+}
+
+/*
+ * read up to n bytes from c into vp by calling the device's read
+ * routine repeatedly at c->offset, advancing c->offset as it goes.
+ * stops early when a read returns 0; returns the number of bytes
+ * stored.
+ */
+int
+myreadn(Chan *c, void *vp, long n)
+{
+	char *dst;
+	long got;
+
+	for(dst = vp; n > 0; dst += got, n -= got){
+		got = devtab[c->type]->read(c, dst, n, c->offset);
+		if(got == 0)
+			break;
+		c->offset += got;
+	}
+	return dst - (char *)vp;
+}
+
+int
+readfile(char *file, void *buf, int len)	/* read up to len bytes of file into buf; returns the count, or -1 on any error */
+{
+	int n;
+	Chan *cc;
+
+	cc = nil;
+	if (waserror()) {
+		if (cc)
+			cclose(cc);	/* only if the open had succeeded */
+		return -1;
+	}
+	cc = namecopen(file, OREAD);
+	if (cc == nil)
+		error("no such file");	/* caught by the waserror above */
+	n = myreadn(cc, buf, len);
+	poperror();
+	cclose(cc);
+	return n;
+}
+
+/*
+ * print the name, size and contents (at most Maxfile bytes) of file.
+ * returns 0, or -1 if the file can't be read.
+ */
+static int
+dumpfile(char *file)
+{
+	int n;
+	char *buf;
+
+	buf = smalloc(Maxfile + 1);	/* +1 for the terminating NUL */
+	n = readfile(file, buf, Maxfile);
+	if (n < 0) {
+		free(buf);		/* don't leak the buffer on failure */
+		return -1;
+	}
+	buf[n] = 0;
+	print("%s (%d bytes):\n", file, n);
+	print("%s\n", buf);
+	free(buf);
+	return 0;
+}
+
+/* main.c */
+
+void
+fpx87restore(FPsave*)	/* stub: does nothing */
+{
+}
+
+void
+fpx87save(FPsave*)	/* stub: does nothing */
+{
+}
+
+void
+fpssesave(FPsave *)	/* stub: does nothing */
+{
+}
+
+void
+fpsserestore(FPsave *)	/* stub: does nothing */
+{
+}

+ 767 - 0
sys/src/9/pcboot/trap.c

@@ -0,0 +1,767 @@
+#include	"u.h"
+#include	"tos.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+#include	"../port/error.h"
+
+enum {
+	Dumpstack = 1,		/* flag: allow stack dump on panic */
+};
+
+static int trapinited;
+
+void	noted(Ureg*, ulong);
+
+static void debugbpt(Ureg*, void*);
+static void fault386(Ureg*, void*);
+static void doublefault(Ureg*, void*);
+static void unexpected(Ureg*, void*);
+static void _dumpstack(Ureg*);
+
+static Lock vctllock;
+static Vctl *vctl[256];
+
+enum
+{
+	Ntimevec = 20		/* number of time buckets for each intr */
+};
+
+/*
+ * register f(ureg, a) as an interrupt handler for irq/tbdf under the
+ * given name.  The new entry goes at the head of its vector's chain.
+ * A nil handler, or a refusal by arch->intrenable, is reported with
+ * print and otherwise ignored.  Panics if the vector already has a
+ * handler whose isr or eoi routine differs from the new entry's.
+ */
+void
+intrenable(int irq, void (*f)(Ureg*, void*), void* a, int tbdf, char *name)
+{
+	int vno;
+	Vctl *nv, *head;
+
+	if(f == nil){
+		print("intrenable: nil handler for %d, tbdf %#uX for %s\n",
+			irq, tbdf, name);
+		return;
+	}
+
+	nv = xalloc(sizeof(Vctl));
+	nv->isintr = 1;
+	nv->irq = irq;
+	nv->tbdf = tbdf;
+	nv->f = f;
+	nv->a = a;
+	strncpy(nv->name, name, KNAMELEN-1);
+	nv->name[KNAMELEN-1] = 0;
+
+	ilock(&vctllock);
+	vno = arch->intrenable(nv);
+	if(vno == -1){
+		iunlock(&vctllock);
+		print("intrenable: couldn't enable irq %d, tbdf %#uX for %s\n",
+			irq, tbdf, nv->name);
+		xfree(nv);
+		return;
+	}
+
+	head = vctl[vno];
+	if(head != nil){
+		/* every handler sharing a vector must share its isr and eoi */
+		if(head->isr != nv->isr || head->eoi != nv->eoi)
+			panic("intrenable: handler: %s %s %#p %#p %#p %#p",
+				head->name, nv->name,
+				head->isr, nv->isr, head->eoi, nv->eoi);
+		nv->next = head;
+	}
+	vctl[vno] = nv;
+	iunlock(&vctllock);
+}
+
+/*
+ * remove the handler that was registered with exactly these
+ * arguments; the assert fails if there is no such handler.
+ * When the vector's chain becomes empty the irq is disabled, if the
+ * architecture provides a routine for that.
+ * returns 0, or -1 if the architecture has no intrvecno routine.
+ */
+int
+intrdisable(int irq, void (*f)(Ureg *, void *), void *a, int tbdf, char *name)
+{
+	Vctl **pv, *v;
+	int vno;
+
+	/*
+	 * For now, none of this will work with the APIC code,
+	 * there is no mapping between irq and vector as the IRQ
+	 * is pretty meaningless.
+	 */
+	if(arch->intrvecno == nil)
+		return -1;
+	vno = arch->intrvecno(irq);
+
+	ilock(&vctllock);
+	for(pv = &vctl[vno]; (v = *pv) != nil; pv = &v->next)
+		if(v->irq == irq && v->tbdf == tbdf && v->f == f && v->a == a
+		&& strcmp(v->name, name) == 0)
+			break;
+	assert(*pv);
+
+	v = *pv;
+	*pv = v->next;		/* unlink the entry */
+
+	if(vctl[vno] == nil && arch->intrdisable != nil)
+		arch->intrdisable(irq);
+	iunlock(&vctllock);
+	xfree(v);
+	return 0;
+}
+
+/*
+ * read routine for the "irqalloc" file: one line per registered
+ * handler giving its vector number, irq and name.  Copies the part
+ * of that text starting at offset, at most n bytes, into vbuf and
+ * returns the number of bytes copied.
+ */
+static long
+irqallocread(Chan*, void *vbuf, long n, vlong offset)
+{
+	char *buf, *p, str[2*(11+1)+KNAMELEN+1+1];
+	int m, vno;
+	long oldn;
+	Vctl *v;
+
+	if(n < 0 || offset < 0)
+		error(Ebadarg);
+
+	oldn = n;
+	buf = vbuf;
+	for(vno=0; vno<nelem(vctl); vno++){
+		for(v=vctl[vno]; v; v=v->next){
+			m = snprint(str, sizeof str, "%11d %11d %.*s\n", vno, v->irq, KNAMELEN, v->name);
+			if(m <= offset){
+				/* the whole line lies before the requested offset */
+				offset -= m;
+				continue;
+			}
+
+			/* drop the first offset bytes of this line */
+			m -= offset;
+			p = str+offset;
+			offset = 0;
+
+			/* copy at most min(n,m) bytes */
+			if(m > n)
+				m = n;
+			memmove(buf, p, m);
+			n -= m;
+			buf += m;
+
+			if(n == 0)
+				return oldn;
+		}
+	}
+	return oldn - n;
+}
+
+/*
+ * install f(ureg, a) as the handler for trap vector vno, which must
+ * be below VectorPIC (panics otherwise).
+ * NOTE(review): if the vector already has a handler, the new entry
+ * takes over that entry's successors and the old head itself is
+ * dropped from the chain -- confirm that replacing is intended.
+ */
+void
+trapenable(int vno, void (*f)(Ureg*, void*), void* a, char *name)
+{
+	Vctl *nv, *old;
+
+	if(vno < 0 || vno >= VectorPIC)
+		panic("trapenable: vno %d", vno);
+
+	nv = xalloc(sizeof(Vctl));
+	nv->tbdf = BUSUNKNOWN;
+	nv->f = f;
+	nv->a = a;
+	strncpy(nv->name, name, KNAMELEN);
+	nv->name[KNAMELEN-1] = 0;
+
+	ilock(&vctllock);
+	old = vctl[vno];
+	if(old != nil)
+		nv->next = old->next;
+	vctl[vno] = nv;
+	iunlock(&vctllock);
+}
+
+static void
+nmienable(void)		/* called once, from trapinit */
+{
+	int x;
+
+	/*
+	 * Hack: should be locked with NVRAM access.
+	 * (ports 0x70 and 0x61 are written here without any lock.)
+	 */
+	outb(0x70, 0x80);		/* NMI latch clear */
+	outb(0x70, 0);
+
+	x = inb(0x61) & 0x07;		/* Enable NMI */
+	outb(0x61, 0x08|x);
+	outb(0x61, x);
+}
+
+/*
+ * Minimal trap setup: just enough to be able to panic on traps
+ * (bugs) during kernel initialization.  Called very early, before
+ * malloc is available.
+ *
+ * Fills all 256 entries of the IDT at IDTADDR with interrupt gates
+ * pointing at consecutive stubs in vectortable, which are taken to
+ * be 6 bytes apart.  VectorBPT and VectorSYSCALL get privilege
+ * level 3; every other vector gets level 0.
+ */
+void
+trapinit0(void)
+{
+	int d1, v;
+	ulong vaddr;
+	Segdesc *idt;
+
+	idt = (Segdesc*)IDTADDR;
+	vaddr = (ulong)vectortable;
+	for(v = 0; v < 256; v++, vaddr += 6){
+		d1 = (vaddr & 0xFFFF0000)|SEGP|SEGIG;
+		if(v == VectorBPT || v == VectorSYSCALL)
+			d1 |= SEGPL(3);
+		else
+			d1 |= SEGPL(0);
+		idt[v].d0 = (vaddr & 0xFFFF)|(KESEL<<16);
+		idt[v].d1 = d1;
+	}
+}
+
+/*
+ * install the handlers for the special traps, enable NMIs, add the
+ * "irqalloc" arch file, and mark trap handling as initialised so
+ * that trap() stops panicking with "not ready".
+ */
+void
+trapinit(void)
+{
+	/*
+	 * Special traps.
+	 * Syscall() is called directly without going through trap().
+	 */
+	trapenable(VectorBPT, debugbpt, nil, "debugpt");
+	trapenable(VectorPF, fault386, nil, "fault386");
+	trapenable(Vector2F, doublefault, nil, "doublefault");
+	trapenable(Vector15, unexpected, nil, "unexpected");
+
+	nmienable();
+	addarchfile("irqalloc", 0444, irqallocread, nil);
+
+	trapinited = 1;
+}
+
+static char* excname[32] = {	/* names for trap numbers 0-31; trap() indexes this by vector number */
+	"divide error",
+	"debug exception",
+	"nonmaskable interrupt",
+	"breakpoint",
+	"overflow",
+	"bounds check",
+	"invalid opcode",
+	"coprocessor not available",
+	"double fault",
+	"coprocessor segment overrun",
+	"invalid TSS",
+	"segment not present",
+	"stack exception",
+	"general protection violation",
+	"page fault",
+	"15 (reserved)",
+	"coprocessor error",
+	"alignment check",
+	"machine check",
+	"19 (reserved)",
+	"20 (reserved)",
+	"21 (reserved)",
+	"22 (reserved)",
+	"23 (reserved)",
+	"24 (reserved)",
+	"25 (reserved)",
+	"26 (reserved)",
+	"27 (reserved)",
+	"28 (reserved)",
+	"29 (reserved)",
+	"30 (reserved)",
+	"31 (reserved)",
+};
+
+/*
+ *  account for the time spent servicing an interrupt: the ticks
+ *  since m->perf.intrts are added to m->perf.inintr and, when no
+ *  process is running, taken off m->perf.inidle.
+ *  (no per-vector histogram is kept here; vno is unused.)
+ */
+void
+intrtime(Mach*, int vno)
+{
+	ulong now, delta;
+
+	USED(vno);
+	now = perfticks();
+	delta = now - m->perf.intrts;
+	m->perf.intrts = now;
+	m->perf.inintr += delta;
+
+	if(up != nil)
+		return;
+	if(m->perf.inidle > delta)
+		m->perf.inidle -= delta;
+}
+
+/*
+ * go to user space: bring the Tos structure at the top of the user
+ * stack up to date (precise time accounting at kernel exit).
+ */
+void
+kexit(Ureg*)
+{
+	uvlong now;
+	Tos *tos;
+
+	cycles(&now);
+	tos = (Tos*)(USTKTOP-sizeof(Tos));
+	tos->kcycles += now - up->kentry;
+	tos->pcycles = up->pcycles;
+	tos->pid = up->pid;
+}
+
+/*
+ *  All traps come here.  It is slower to have all traps call trap()
+ *  rather than directly vectoring the handler.  However, this avoids a
+ *  lot of code duplication and possible bugs.  The only exception is
+ *  VectorSYSCALL.
+ *  Trap is called with interrupts disabled via interrupt-gates.
+ *
+ *  Four cases, tried in this order: a handler chain is registered
+ *  for the vector; an unhandled exception came from user mode; an
+ *  unknown interrupt at or above VectorPIC arrived (counted as
+ *  spurious); anything else dumps the registers and panics.
+ */
+void
+trap(Ureg* ureg)
+{
+	int clockintr, i, vno, user;
+	Vctl *ctl, *v;
+	Mach *mach;
+
+	if(!trapinited){
+		/* fault386 can give a better error message */
+		if(ureg->trap == VectorPF)
+			fault386(ureg, nil);
+		panic("trap %lud: not ready", ureg->trap);
+	}
+
+	if (m == 0)
+		panic("trap: nil m");
+	m->perf.intrts = perfticks();	/* start time for intrtime below */
+	user = (ureg->cs & 0xFFFF) == UESEL;
+
+	clockintr = 0;
+
+	vno = ureg->trap;
+	if(ctl = vctl[vno]){
+		if(ctl->isintr){
+			m->intr++;
+			if(vno >= VectorPIC && vno != VectorSYSCALL)
+				m->lastintr = ctl->irq;
+		}
+
+		if(ctl->isr)
+			ctl->isr(vno);
+		for(v = ctl; v != nil; v = v->next){	/* run every handler chained on this vector */
+			if(v->f)
+				v->f(ureg, v->a);
+		}
+		if(ctl->eoi)
+			ctl->eoi(vno);
+
+		if(ctl->isintr){
+			intrtime(m, vno);
+
+			if(ctl->irq == IrqCLOCK || ctl->irq == IrqTIMER)
+				clockintr = 1;
+
+			if(up && !clockintr)
+				preempted();
+		}
+	}
+	else if(vno < nelem(excname) && user){
+		char buf[ERRMAX];
+
+		spllo();
+		snprint(buf, sizeof buf, "sys: trap: %s", excname[vno]);
+		postnote(up, 1, buf, NDebug);	/* the user process gets a note naming the exception */
+	}
+	else if(vno >= VectorPIC && vno != VectorSYSCALL){
+		/*
+		 * An unknown interrupt.
+		 * Check for a default IRQ7. This can happen when
+		 * the IRQ input goes away before the acknowledge.
+		 * In this case, a 'default IRQ7' is generated, but
+		 * the corresponding bit in the ISR isn't set.
+		 * In fact, just ignore all such interrupts.
+		 */
+
+		/* call all interrupt routines, just in case */
+		for(i = VectorPIC; i <= MaxIrqLAPIC; i++){
+			ctl = vctl[i];
+			if(ctl == nil)
+				continue;
+			if(!ctl->isintr)
+				continue;
+			for(v = ctl; v != nil; v = v->next){
+				if(v->f)
+					v->f(ureg, v->a);
+			}
+			/* should we do this? */
+			if(ctl->eoi)
+				ctl->eoi(i);
+		}
+
+		/* clear the interrupt */
+		i8259isr(vno);
+
+		if(0)print("cpu%d: spurious interrupt %d, last %d\n",
+			m->machno, vno, m->lastintr);
+		if(0)if(conf.nmach > 1){
+			for(i = 0; i < 32; i++){
+				if(!(active.machs & (1<<i)))
+					continue;
+				mach = MACHP(i);
+				if(m->machno == mach->machno)
+					continue;
+				print(" cpu%d: last %d",
+					mach->machno, mach->lastintr);
+			}
+			print("\n");
+		}
+		m->spuriousintr++;
+		return;		/* note: returns without the splhi and sched check below */
+	}
+	else{
+		if(vno == VectorNMI){
+			/*
+			 * Don't re-enable, it confuses the crash dumps.
+			nmienable();
+			 */
+			iprint("cpu%d: PC %#8.8lux\n", m->machno, ureg->pc);
+			while(m->machno != 0)	/* processors other than cpu0 spin here forever */
+				;
+		}
+		dumpregs(ureg);
+		if(vno < nelem(excname))
+			panic("%s", excname[vno]);
+		panic("unknown trap/intr: %d", vno);
+	}
+	splhi();
+
+	/* delaysched set because we held a lock or because our quantum ended */
+	if(up && up->delaysched && clockintr){
+		sched();
+		splhi();
+	}
+}
+
+/*
+ *  dump registers
+ *  prints, with iprint, the flags, trap number, error code, pc,
+ *  stack, general and segment registers saved in ureg, headed by
+ *  the cpu number and (if there is one) the current process.
+ */
+void
+dumpregs2(Ureg* ureg)
+{
+	if(up)
+		iprint("cpu%d: registers for %s %lud\n",
+			m->machno, up->text, up->pid);
+	else
+		iprint("cpu%d: registers for kernel\n", m->machno);
+	iprint("FLAGS=%luX TRAP=%luX ECODE=%luX PC=%luX",
+		ureg->flags, ureg->trap, ureg->ecode, ureg->pc);
+	iprint(" SS=%4.4luX USP=%luX\n", ureg->ss & 0xFFFF, ureg->usp);
+	iprint("  AX %8.8luX  BX %8.8luX  CX %8.8luX  DX %8.8luX\n",
+		ureg->ax, ureg->bx, ureg->cx, ureg->dx);
+	iprint("  SI %8.8luX  DI %8.8luX  BP %8.8luX\n",
+		ureg->si, ureg->di, ureg->bp);
+	iprint("  CS %4.4luX  DS %4.4luX  ES %4.4luX  FS %4.4luX  GS %4.4luX\n",
+		ureg->cs & 0xFFFF, ureg->ds & 0xFFFF, ureg->es & 0xFFFF,
+		ureg->fs & 0xFFFF, ureg->gs & 0xFFFF);
+}
+
+void
+dumpregs(Ureg* ureg)	/* dumpregs2, followed by the processor control registers */
+{
+	vlong mca, mct;
+
+	dumpregs2(ureg);
+
+	/*
+	 * Processor control registers.
+	 * If machine check exception, time stamp counter, page size extensions
+	 * or enhanced virtual 8086 mode extensions are supported, there is a
+	 * CR4. If there is a CR4 and machine check extensions, read the machine
+	 * check address and machine check type registers if RDMSR supported.
+	 * The feature bits tested are those in m->cpuiddx.
+	 */
+	iprint("  CR0 %8.8lux CR2 %8.8lux CR3 %8.8lux",
+		getcr0(), getcr2(), getcr3());
+	if(m->cpuiddx & 0x9A){
+		iprint(" CR4 %8.8lux", getcr4());
+		if((m->cpuiddx & 0xA0) == 0xA0){
+			rdmsr(0x00, &mca);
+			rdmsr(0x01, &mct);
+			iprint("\n  MCA %8.8llux MCT %8.8llux", mca, mct);
+		}
+	}
+	iprint("\n  ur %#p up %#p\n", ureg, up);
+}
+
+
+/*
+ * Fill in enough of Ureg to get a stack trace, and call a function.
+ * Used by debugging interface rdb.
+ * Only pc and sp are set (from this function's caller); every other
+ * field of the Ureg handed to fn is uninitialised.
+ */
+void
+callwithureg(void (*fn)(Ureg*))
+{
+	Ureg ureg;
+	ureg.pc = getcallerpc(&fn);
+	ureg.sp = (ulong)&fn;
+	fn(&ureg);
+}
+
+/*
+ * print a stack trace with iprint, in the form of a ktrace command
+ * followed by address=value pairs for each stack word that looks
+ * like a kernel text address, plus the last few words of the stack.
+ * If ureg->trap is VectorNMI the raw stack words are printed too.
+ *
+ * Nothing is dumped if the compile-time flag Dumpstack is zero, if
+ * the configuration variable *nodumpstack is set to something other
+ * than "0", or if the current stack is neither the process's kernel
+ * stack nor this processor's Mach stack.
+ */
+static void
+_dumpstack(Ureg *ureg)
+{
+	uintptr l, v, i, estack;
+	extern ulong etext;
+	int x;
+	char *s;
+
+	/* Dumpstack non-zero means dumps are allowed, so refuse only when it is zero */
+	if (!Dumpstack) {
+		print("no stack dump\n");
+		return;
+	}
+	if((s = getconf("*nodumpstack")) != nil && strcmp(s, "0") != 0){
+		iprint("dumpstack disabled\n");
+		return;
+	}
+	iprint("dumpstack\n");
+
+	x = 0;
+	x += iprint("ktrace /kernel/path %.8lux %.8lux <<EOF\n", ureg->pc, ureg->sp);
+	i = 0;
+	/* find the end of whichever stack the local variable l lives on */
+	if(up
+	&& (uintptr)&l >= (uintptr)up->kstack
+	&& (uintptr)&l <= (uintptr)up->kstack+KSTACK)
+		estack = (uintptr)up->kstack+KSTACK;
+	else if((uintptr)&l >= (uintptr)m->stack
+	&& (uintptr)&l <= (uintptr)m+MACHSIZE)
+		estack = (uintptr)m+MACHSIZE;
+	else
+		return;
+	x += iprint("estackx %p\n", estack);
+
+	for(l = (uintptr)&l; l < estack; l += sizeof(uintptr)){
+		v = *(uintptr*)l;
+		if((KTZERO < v && v < (uintptr)&etext) || estack-l < 32){
+			/*
+			 * Could Pick off general CALL (((uchar*)v)[-5] == 0xE8)
+			 * and CALL indirect through AX
+			 * (((uchar*)v)[-2] == 0xFF && ((uchar*)v)[-1] == 0xD0),
+			 * but this is too clever and misses faulting address.
+			 */
+			x += iprint("%.8p=%.8p ", l, v);
+			i++;
+		}
+		if(i == 4){
+			i = 0;
+			x += iprint("\n");
+		}
+	}
+	if(i)
+		iprint("\n");
+	iprint("EOF\n");
+
+	if(ureg->trap != VectorNMI)
+		return;
+
+	/* after an NMI, also print every word of the stack, eight to a line */
+	i = 0;
+	for(l = (uintptr)&l; l < estack; l += sizeof(uintptr)){
+		iprint("%.8p ", *(uintptr*)l);
+		if(++i == 8){
+			i = 0;
+			iprint("\n");
+		}
+	}
+	if(i)
+		iprint("\n");
+}
+
+void
+dumpstack(void)		/* run _dumpstack with a Ureg describing our caller */
+{
+	callwithureg(_dumpstack);
+}
+
+/*
+ * handler for VectorBPT: back the pc up onto the instruction that
+ * trapped and post a "sys: breakpoint" note to the current process.
+ * panics if there is no current process.
+ */
+static void
+debugbpt(Ureg* ureg, void*)
+{
+	char note[ERRMAX];
+
+	if(up == nil)
+		panic("kernel bpt");
+
+	/* restore pc to instruction that caused the trap */
+	ureg->pc--;
+
+	snprint(note, sizeof note, "sys: breakpoint");
+	postnote(up, 1, note, NDebug);
+}
+
+static void
+doublefault(Ureg*, void*)	/* handler installed by trapinit for Vector2F: always panics */
+{
+	panic("double fault");
+}
+
+static void
+unexpected(Ureg* ureg, void*)	/* handler installed by trapinit for Vector15: report the trap and carry on */
+{
+	print("unexpected trap %lud; ignoring\n", ureg->trap);
+}
+
+extern void checkfault(ulong, ulong);
+static void
+fault386(Ureg* ureg, void*)	/* page-fault handler (VectorPF); also called directly by trap() before trapinit */
+{
+	ulong addr;
+	int read, user, n, insyscall;
+
+	addr = getcr2();		/* the address that faulted */
+	read = !(ureg->ecode & 2);	/* bit 1 of the error code clear means a read */
+
+	user = (ureg->cs & 0xFFFF) == UESEL;
+	if(!user){
+		if(vmapsync(addr))
+			return;		/* nothing more to do when vmapsync reports success */
+		if(addr >= USTKTOP)
+			panic("kernel fault: bad address pc=%#.8lux addr=%#.8lux", ureg->pc, addr);
+		if(up == nil)
+			panic("kernel fault: no user process pc=%#.8lux addr=%#.8lux", ureg->pc, addr);
+	} else
+		panic("fault386: fault from user mode");
+	if(up == nil)	/* NOTE(review): looks unreachable, since both branches above already panic when up is nil -- confirm */
+		panic("user fault: up=0 pc=%#.8lux addr=%#.8lux", ureg->pc, addr);
+
+	insyscall = up->insyscall;	/* saved and restored around the call to fault */
+	up->insyscall = 1;
+	n = fault(addr, read);
+	if(n < 0){
+		dumpregs(ureg);
+		panic("fault: %#lux", addr);
+	}
+	up->insyscall = insyscall;
+}
+
+/*
+ *  dregs of system calls
+ */
+
+/*
+ *  Syscall is called directly from assembler without going through trap().
+ */
+void
+syscall(Ureg*)		/* any system call is fatal here */
+{
+	/* the bootstrap doesn't implement system calls */
+	panic("syscall");
+}
+
+long
+execregs(ulong entry, ulong ssize, ulong nargs)	/* set the saved user registers so that execution starts at entry */
+{
+	ulong *sp;
+	Ureg *ureg;
+
+	up->fpstate = FPinit;
+	fpoff();
+
+	sp = (ulong*)(USTKTOP - ssize);
+	*--sp = nargs;		/* stored in the word just below the top ssize bytes of the user stack */
+
+	ureg = up->dbgreg;
+	ureg->usp = (ulong)sp;
+	ureg->pc = entry;
+	return USTKTOP-sizeof(Tos);		/* address of kernel/user shared data */
+}
+
+/*
+ *  return the pc recorded in the current process's saved user
+ *  registers (up->dbgreg): where the last exception happened.
+ */
+ulong
+userpc(void)
+{
+	return ((Ureg*)up->dbgreg)->pc;
+}
+
+/*
+ * Copy n bytes from uva to pureg, then put back the registers the
+ * user is not permitted to write from devproc: all the segment
+ * registers, and bits 0xFF00 of the flags.  Only the low byte of
+ * the flags is taken from the new contents.
+ */
+void
+setregisters(Ureg* ureg, char* pureg, char* uva, int n)
+{
+	Ureg old;
+
+	old = *ureg;		/* snapshot taken before the copy */
+	memmove(pureg, uva, n);
+
+	ureg->ss = old.ss;
+	ureg->cs = old.cs;
+	ureg->ds = old.ds;
+	ureg->es = old.es;
+	ureg->fs = old.fs;
+	ureg->gs = old.gs;
+	ureg->flags = (ureg->flags & 0x00FF) | (old.flags & 0xFF00);
+}
+
+static void
+linkproc(void)		/* where a process set up by kprocchild starts running */
+{
+	spllo();
+	up->kpfun(up->kparg);	/* the function and argument recorded by kprocchild */
+	pexit("kproc dying", 0);
+}
+
+void
+kprocchild(Proc* p, void (*func)(void*), void* arg)	/* arrange for p to run func(arg) via linkproc */
+{
+	/*
+	 * gotolabel() needs a word on the stack in
+	 * which to place the return PC used to jump
+	 * to linkproc().
+	 * Hence sp starts one word (BY2WD) below the top
+	 * of p's kernel stack.
+	 */
+	p->sched.pc = (ulong)linkproc;
+	p->sched.sp = (ulong)p->kstack+KSTACK-BY2WD;
+
+	p->kpfun = func;
+	p->kparg = arg;
+}
+
+void
+forkchild(Proc *p, Ureg *ureg)	/* set up p to resume at forkret with a copy of ureg in which AX is 0 */
+{
+	Ureg *cureg;
+
+	/*
+	 * Add 2*BY2WD to the stack to account for
+	 *  - the return PC
+	 *  - trap's argument (ur)
+	 */
+	p->sched.sp = (ulong)p->kstack+KSTACK-(sizeof(Ureg)+2*BY2WD);
+	p->sched.pc = (ulong)forkret;
+
+	cureg = (Ureg*)(p->sched.sp+2*BY2WD);	/* the copy sits just above those two words */
+	memmove(cureg, ureg, sizeof(Ureg));
+	/* return value of syscall in child */
+	cureg->ax = 0;
+
+	/* Things from bottom of syscall which were never executed */
+	p->psstate = 0;
+	p->insyscall = 0;
+}
+
+/* Give enough context in the ureg to produce a kernel stack for
+ * a sleeping process
+ */
+void
+setkernur(Ureg* ureg, Proc* p)	/* only pc and sp of ureg are set, from p's saved scheduling label */
+{
+	ureg->pc = p->sched.pc;
+	ureg->sp = p->sched.sp+4;	/* NOTE(review): the +4 presumably skips one saved word -- confirm */
+}
+
+/* return the pc from p's saved user registers, or 0 if p has none */
+ulong
+dbgpc(Proc *p)
+{
+	Ureg *ureg;
+
+	ureg = p->dbgreg;
+	return ureg != nil? ureg->pc: 0;
+}

+ 12 - 0
sys/src/9/pcboot/unbindpc

@@ -0,0 +1,12 @@
+#!/bin/rc
+# unbindpc - unbind files from ../pc into .
+if (! test -e etherigbe.c)	# presumably etherigbe.c is only visible here while ../pc is bound -- confirm
+	exit ''
+
+rfork e
+for (f in `{ls -d pc?*pxe | grep -v '\.'})	# names matching pc?*pxe that contain no dot
+	unmount $f >[2]/dev/null	# failures are silenced
+unmount .
+unmount /tmp/blank
+# unmount ../port/systab.h
+exit ''

+ 30 - 0
sys/src/9/pcboot/warp64.c

@@ -0,0 +1,30 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+#include	"ureg.h"
+
+typedef unsigned long long u64intptr;
+
+/*
+ * Announce the jump, call mkmultiboot() and impulse(), then hand
+ * control to _warp64() with the entry address stripped of the bits
+ * set in kzero64.  Progress messages are printed only when v_flag
+ * is set.
+ */
+void
+warp64(uvlong entry)
+{
+	extern void _warp64(ulong);
+	u64intptr kzero64;
+	uvlong phys;
+
+	kzero64 = 0xfffffffff0000000ull;
+	phys = entry & ~kzero64;	/* keeps only the low 28 bits of entry */
+
+	print("warp64(%#llux) %#llux %d\n", entry, phys, nmmap);
+	if(v_flag)
+		print("mkmultiboot\n");
+	mkmultiboot();
+	if(v_flag){
+		print("impulse\n");
+		print("_warp64\n");
+	}
+	/*
+	 * No output after impulse(): both remaining progress
+	 * messages were printed above, before the call.
+	 */
+	impulse();
+	_warp64(phys);
+}

+ 125 - 0
sys/src/9/pcboot/words

@@ -0,0 +1,125 @@
+This is a specialised pc kernel that acts as a bootstrap loader for
+386 and amd64 kernels, and replaces the old 9pxeload, 9load and
+9loadusb.  It relies on the pc port (in /sys/src/9/pc) for most of its
+non-portable source files, notably the disk and ethernet drivers, so
+you'll want an up-to-date system (see replica/pull) if installing this
+by hand.  Support for user mode (such as system calls) is almost
+entirely absent.  There is some duplication of code with the pc
+kernel, but we have tried to keep it to a minimum.
+
+Caveats:
+
+9boot doesn't scan disks for partition tables since it doesn't contain
+the disk drivers, but `readparts=' in plan9.ini (see
+/sys/src/9/boot/parts.c) will read them.  9load does read partition
+tables.
+
+9loadusb seems to hang for two minutes when attempting to read via the
+bios on some bioses (e.g., vmware's), then reports a disk timeout on
+the INT 13 extended read operation.
+
+
+How these new bootstraps differ from the old 9load and 9pxeload.
+
+The old 9load was a single process derived from (but separately
+evolved from) an old PC kernel, so it needed modified device drivers
+(primarily for ethernet and disk controllers), which was an ongoing
+maintenance nuisance, and not all ethernet controllers had bootstrap
+drivers.  The new bootstraps are just specialised Plan 9 kernels that
+are loaded into the first 640KB of RAM, so they use unmodified Plan 9
+drivers.  There isn't enough room below 512K to include useful support
+for user mode and system calls, but the new bootstraps do implement
+kernel processes.  The 512K limit is imposed by PXE or PBS loading of
+the new bootstraps into the first 512K or 640K.
+
+The old 9load was capable of loading from disks, floppies, USB disks
+or ethernet (via PXE).  Again due to space limitations, we've had to
+focus the new bootstraps more sharply.  9boot loads via PXE and
+nothing else, which is our usual mode of operation.  9load loads from
+non-USB disks (using BIOS INT 13 calls) but not floppies, which ought
+to be obsolete by now.  If you must boot from floppy and are doing so
+now and your BIOS won't boot from USB, do this to arrange to boot from
+USB disk instead:
+
+	a: && cp /386/9loadusb /n/a:/9load
+
+9loadusb loads from USB disks only.  BIOSes seem to be easily confused
+by intermixing direct I/O and BIOS calls, thus we keep 9load and
+9loadusb distinct.
+
+The new bootstraps can load amd64 as well as 386 kernels, and generate
+GNU Multiboot tables for their benefit.
+
+The new 9load will look on all available disks for FAT file systems.
+If no bootfile is specified in a plan9.ini, it will examine each file
+system to see if it contains a single Plan 9 kernel (9pc* or 9k8*),
+and if so, will boot it.
+
+The new 9boot contains no disk drivers, so it can't read partition
+tables and populate #ec/sd??part for the kernel's benefit, so if you
+need to access a disk partition early in the kernel's execution (e.g.,
+you have an nvram partition), you'll want to add
+
+readparts=
+
+to the /cfg/pxe file for any such machines.
+
+The new bootstraps should run on more machines than the old ones.  We
+discovered new ways to enable the A20 address line and try them all
+until success.  This may fix various odd memory corruption problems
+seen in the past.  We also discovered that BIOS calls may enable
+interrupts, so we disable them again immediately upon return.  This
+may prevent mysterious resets seen with the old bootstraps.
+
+
+CD booting changes
+
+pbsraw can be up to 2k, so we can print and be nice; it uses things
+written up by mk9660.  In any case, it reads a contiguous file and is
+468 bytes long (we dropped the 9fat support here), so it could be used
+to load from any raw partition supporting lba access.
+
+There are corresponding changes to mk9660 to annotate the PBS.  A new
+parameter, -x loader, names a loader that has to be in the root
+directory.  In conjunction with -B, it can be used to boot directly from
+the CD.
+
+9load was changed to deal with a 9fat image file in the root directory
+of a CD image.  It has to be called bootdisk.img and can be of any
+size (as long as it is contiguous).
+
+
+Memory Map
+
+This is how the bootstraps use memory:
+
+0	---------------------
+
+31K	---------------------
+	start of pxe decomp + compressed 9boot
+64K	---------------------
+	start of pbs decomp + compressed 9boot
+	decompresses to 9MB
+	...
+512K	---------------------
+	pxe loader from ROM
+640K	---------------------
+	UMB
+1MB	---------------------
+	kernel
+9MB	---------------------
+	9boot after decomp.
+	decompresses kernel.gz
+	from 13MB to 1MB.
+13MB	---------------------
+	(kernel.gz)
+15MB	---------------------
+	no-man's land
+16MB	---------------------
+	malloc arena for 9boot
+	...
+
+chinese puzzle constraints:
+BIOS calls only work from bottom 1MB (640K, really).
+with paging on, we need to have created page tables for the memory we use.
+a20 has to be enabled before we try to use memory above 1MB.

+ 3 - 3
sys/src/boot/pc/mkfile

@@ -39,9 +39,9 @@ install:V:
 	for (i in $TARG)
 		mk $MKFLAGS $i.install
 
-%.install:V:	$BIN/%
-	for (fs in lookout boundary piestand bovril)
-		9fs $fs && cp $prereq /n/$fs/$prereq
+#%.install:V:	$BIN/%
+#	for (fs in lookout boundary piestand bovril)
+#		9fs $fs && cp $prereq /n/$fs/$prereq
 
 $BIN/%:	%
 	cp $stem $BIN/$stem