Browse Source

Plan 9 from Bell Labs 2003-01-09

David du Colombier 21 years ago
parent
commit
a9fe9057c2
65 changed files with 26159 additions and 6 deletions
  1. 71 3
      dist/replica/plan9.db
  2. 77 0
      dist/replica/plan9.log
  3. 1164 0
      sys/doc/fossil.ms
  4. BIN
      sys/doc/fossil.pdf
  5. 5401 0
      sys/doc/fossil.ps
  6. 34 0
      sys/lib/sysconfig/fl/boot
  7. 7 0
      sys/lib/sysconfig/fl/flproto
  8. 8 0
      sys/lib/sysconfig/fl/venti.conf
  9. 428 0
      sys/man/4/fossil
  10. 806 0
      sys/man/8/fossilcons
  11. 2 1
      sys/src/9/pc/devether.c
  12. 122 0
      sys/src/9/pc/pcfl
  13. 30 0
      sys/src/9/port/dev.c
  14. 2 2
      sys/src/9/port/devfs.c
  15. 209 0
      sys/src/cmd/fossil/9.h
  16. 126 0
      sys/src/cmd/fossil/9auth.c
  17. 121 0
      sys/src/cmd/fossil/9dir.c
  18. 126 0
      sys/src/cmd/fossil/9excl.c
  19. 286 0
      sys/src/cmd/fossil/9fid.c
  20. 1460 0
      sys/src/cmd/fossil/9fsys.c
  21. 177 0
      sys/src/cmd/fossil/9lstn.c
  22. 1118 0
      sys/src/cmd/fossil/9p.c
  23. 109 0
      sys/src/cmd/fossil/9ping.c
  24. 415 0
      sys/src/cmd/fossil/9proc.c
  25. 195 0
      sys/src/cmd/fossil/9srv.c
  26. 960 0
      sys/src/cmd/fossil/9user.c
  27. 112 0
      sys/src/cmd/fossil/Ccli.c
  28. 417 0
      sys/src/cmd/fossil/Ccmd.c
  29. 390 0
      sys/src/cmd/fossil/Ccons.c
  30. 41 0
      sys/src/cmd/fossil/Clog.c
  31. 441 0
      sys/src/cmd/fossil/archive.c
  32. 19 0
      sys/src/cmd/fossil/build
  33. 40 0
      sys/src/cmd/fossil/buildsh
  34. 421 0
      sys/src/cmd/fossil/bwatch.c
  35. 2000 0
      sys/src/cmd/fossil/cache.c
  36. 298 0
      sys/src/cmd/fossil/dat.h
  37. 25 0
      sys/src/cmd/fossil/deadlock
  38. 332 0
      sys/src/cmd/fossil/disk.c
  39. 86 0
      sys/src/cmd/fossil/dump.c
  40. 36 0
      sys/src/cmd/fossil/error.c
  41. 31 0
      sys/src/cmd/fossil/error.h
  42. 1648 0
      sys/src/cmd/fossil/file.c
  43. 657 0
      sys/src/cmd/fossil/flchk.c
  44. 553 0
      sys/src/cmd/fossil/flfmt.c
  45. 13 0
      sys/src/cmd/fossil/flproto
  46. 98 0
      sys/src/cmd/fossil/fns.h
  47. 186 0
      sys/src/cmd/fossil/fossil-acid
  48. 94 0
      sys/src/cmd/fossil/fossil.c
  49. 819 0
      sys/src/cmd/fossil/fs.c
  50. 48 0
      sys/src/cmd/fossil/fs.h
  51. 17 0
      sys/src/cmd/fossil/history
  52. 121 0
      sys/src/cmd/fossil/invariants
  53. 96 0
      sys/src/cmd/fossil/mkfile
  54. 39 0
      sys/src/cmd/fossil/nobwatch.c
  55. 226 0
      sys/src/cmd/fossil/pack.c
  56. 84 0
      sys/src/cmd/fossil/periodic.c
  57. 958 0
      sys/src/cmd/fossil/source.c
  58. 271 0
      sys/src/cmd/fossil/srcload.c
  59. 11 0
      sys/src/cmd/fossil/stdinc.h
  60. 19 0
      sys/src/cmd/fossil/trunc.c
  61. 13 0
      sys/src/cmd/fossil/unpack
  62. 746 0
      sys/src/cmd/fossil/vac.c
  63. 107 0
      sys/src/cmd/fossil/vac.h
  64. 1127 0
      sys/src/cmd/fossil/view.c
  65. 65 0
      sys/src/cmd/fossil/walk.c

+ 71 - 3
dist/replica/plan9.db

@@ -198,6 +198,10 @@
 386/bin/file - 775 sys sys 1039758559 117145
 386/bin/fmt - 775 sys sys 1039758560 63811
 386/bin/fortune - 775 sys sys 1039758560 66329
+386/bin/fossil - 20000000775 sys sys 1042005470 0
+386/bin/fossil/flchk - 775 sys sys 1042005470 226919
+386/bin/fossil/flfmt - 775 sys sys 1042005471 225502
+386/bin/fossil/fossil - 775 sys sys 1042005469 329196
 386/bin/freq - 775 sys sys 1039758560 60443
 386/bin/fs - 20000000775 sys sys 954380769 0
 386/bin/fs/32vfs - 775 sys sys 1039758560 96155
@@ -2798,11 +2802,13 @@ n/c - 20000000555 sys sys 1015089577 0
 n/c: - 20000000555 sys sys 952641484 0
 n/d: - 20000000555 sys sys 958016621 0
 n/dist - 20000000555 sys sys 1020896384 0
+n/fossil - 20000000775 sys sys 1042005455 0
 n/ftp - 20000000555 sys sys 959261485 0
 n/kfs - 20000000555 sys sys 954008414 0
 n/kremvax - 20000000555 sys sys 985197951 0
 n/paq - 20000000555 sys sys 1017722329 0
 n/sid - 20000000555 sys sys 959261486 0
+n/snap - 20000000775 sys sys 1042005458 0
 n/sources - 20000000555 sys sys 1021926252 0
 n/sourcesdump - 20000000775 sys sys 1041013207 0
 n/sourcessnap - 20000000775 sys sys 1041013207 0
@@ -3040,6 +3046,9 @@ sys/doc/contents.ms - 664 sys sys 1019916701 4920
 sys/doc/contents.ps - 664 sys sys 1019916845 232629
 sys/doc/docfonts - 664 sys sys 1038117516 208
 sys/doc/fonts - 664 sys sys 944959649 137
+sys/doc/fossil.ms - 664 sys sys 1042044710 31440
+sys/doc/fossil.pdf - 664 sys sys 1042044711 65284
+sys/doc/fossil.ps - 664 sys sys 1042044711 110396
 sys/doc/fs - 20000000775 sys sys 945616779 0
 sys/doc/fs/fs.html - 664 sys sys 1020013937 21345
 sys/doc/fs/fs.pdf - 664 sys sys 1020384351 47177
@@ -4153,6 +4162,10 @@ sys/lib/sysconfig/auth/files/rewrite - 664 sys sys 1016833537 428
 sys/lib/sysconfig/auth/files/tcp566 - 775 sys sys 1016833537 36
 sys/lib/sysconfig/auth/files/tcp567 - 775 sys sys 1016833537 34
 sys/lib/sysconfig/auth/mkfile - 664 sys sys 1016833657 2937
+sys/lib/sysconfig/fl - 20000000775 sys sys 1042004836 0
+sys/lib/sysconfig/fl/boot - 775 sys sys 1042004836 773
+sys/lib/sysconfig/fl/flproto - 664 sys sys 1042004836 129
+sys/lib/sysconfig/fl/venti.conf - 664 sys sys 1042004837 139
 sys/lib/sysconfig/proto - 20000000775 sys sys 959740591 0
 sys/lib/sysconfig/proto/allproto - 664 sys sys 945018241 2
 sys/lib/sysconfig/proto/armpaqproto - 664 sys sys 1037173885 2270
@@ -4712,6 +4725,7 @@ sys/man/4/dossrv - 664 sys sys 1015024813 4176
 sys/man/4/execnet - 664 sys sys 1019866708 1069
 sys/man/4/exportfs - 664 sys sys 1018386776 3746
 sys/man/4/factotum - 664 sys sys 1021579982 13900
+sys/man/4/fossil - 664 sys sys 1042005418 8705
 sys/man/4/fs - 664 sys sys 1019058716 3387
 sys/man/4/ftpfs - 664 sys sys 1018386777 4113
 sys/man/4/import - 664 sys sys 1034195346 2204
@@ -4809,6 +4823,7 @@ sys/man/8/cpurc - 664 sys sys 971455510 1275
 sys/man/8/cron - 664 sys sys 944959679 1750
 sys/man/8/dhcpd - 664 sys sys 1032654987 5237
 sys/man/8/drawterm - 664 sys sys 958419689 2458
+sys/man/8/fossilcons - 664 sys sys 1042005415 12630
 sys/man/8/fs - 664 sys sys 1037805200 13843
 sys/man/8/fsconfig - 664 sys sys 1037805200 7966
 sys/man/8/httpd - 664 sys sys 1037690024 4516
@@ -5058,7 +5073,7 @@ sys/src/9/pc/cga.c - 664 sys sys 1015014513 1843
 sys/src/9/pc/clock.c - 664 sys sys 1032052912 899
 sys/src/9/pc/dat.h - 664 sys sys 1032052913 6070
 sys/src/9/pc/devarch.c - 664 sys sys 1036812831 16158
-sys/src/9/pc/devether.c - 664 sys sys 1026847635 10042
+sys/src/9/pc/devether.c - 664 sys sys 1042004805 10072
 sys/src/9/pc/devfloppy.c - 664 sys sys 1015014514 19930
 sys/src/9/pc/devi82365.c - 664 sys sys 1020284820 19987
 sys/src/9/pc/devlm78.c - 664 sys sys 1026847635 6038
@@ -5117,6 +5132,7 @@ sys/src/9/pc/pcauth - 664 sys sys 1039753496 600
 sys/src/9/pc/pccd - 664 sys sys 1039753495 1278
 sys/src/9/pc/pccpu - 664 sys sys 1039803186 785
 sys/src/9/pc/pcdisk - 664 sys sys 1039764711 1369
+sys/src/9/pc/pcfl - 664 sys sys 1042004821 1563
 sys/src/9/pc/pcflop - 664 sys sys 1032749195 1353
 sys/src/9/pc/pci.c - 664 sys sys 1032052921 23359
 sys/src/9/pc/pcmciamodem.c - 664 sys sys 1036812832 1499
@@ -5174,7 +5190,7 @@ sys/src/9/port/cache.c - 664 sys sys 1014931171 9242
 sys/src/9/port/chan.c - 664 sys sys 1031706300 28370
 sys/src/9/port/cis.c - 664 sys sys 1014931171 8087
 sys/src/9/port/debugalloc.c - 664 sys sys 1014931171 10402
-sys/src/9/port/dev.c - 664 sys sys 1032990930 6881
+sys/src/9/port/dev.c - 664 sys sys 1042004688 8082
 sys/src/9/port/devaudio.c - 664 sys sys 1026847546 21137
 sys/src/9/port/devbridge.c - 664 sys sys 1026847546 24311
 sys/src/9/port/devcap.c - 664 sys sys 1032052801 4070
@@ -5182,7 +5198,7 @@ sys/src/9/port/devcons.c - 664 sys sys 1036812998 20919
 sys/src/9/port/devdraw.c - 664 sys sys 1039753332 41987
 sys/src/9/port/devdup.c - 664 sys sys 1014931172 2332
 sys/src/9/port/devenv.c - 664 sys sys 1019762849 6562
-sys/src/9/port/devfs.c - 664 sys sys 1041971958 10698
+sys/src/9/port/devfs.c - 664 sys sys 1042044133 10702
 sys/src/9/port/devkprof.c - 664 sys sys 1014931173 3111
 sys/src/9/port/devloopback.c - 664 sys sys 1018721201 14968
 sys/src/9/port/devmnt.c - 664 sys sys 1041443399 21588
@@ -6966,6 +6982,58 @@ sys/src/cmd/fax/subr.c - 664 sys sys 1015090401 1245
 sys/src/cmd/file.c - 664 sys sys 1038186733 20196
 sys/src/cmd/fmt.c - 664 sys sys 1025298248 3897
 sys/src/cmd/fortune.c - 664 sys sys 1035832953 1674
+sys/src/cmd/fossil - 20000000775 sys sys 1042005512 0
+sys/src/cmd/fossil/9.h - 664 sys sys 1042005502 3379
+sys/src/cmd/fossil/9auth.c - 664 sys sys 1042005502 2389
+sys/src/cmd/fossil/9dir.c - 664 sys sys 1042005502 1995
+sys/src/cmd/fossil/9excl.c - 664 sys sys 1042005502 1887
+sys/src/cmd/fossil/9fid.c - 664 sys sys 1042005502 5236
+sys/src/cmd/fossil/9fsys.c - 664 sys sys 1042005503 26731
+sys/src/cmd/fossil/9lstn.c - 664 sys sys 1042005503 2865
+sys/src/cmd/fossil/9p.c - 664 sys sys 1042005503 21328
+sys/src/cmd/fossil/9ping.c - 664 sys sys 1042005503 1563
+sys/src/cmd/fossil/9proc.c - 664 sys sys 1042005503 7358
+sys/src/cmd/fossil/9srv.c - 664 sys sys 1042005504 3215
+sys/src/cmd/fossil/9user.c - 664 sys sys 1042005504 17476
+sys/src/cmd/fossil/Ccli.c - 664 sys sys 1042005504 1624
+sys/src/cmd/fossil/Ccmd.c - 664 sys sys 1042005504 7169
+sys/src/cmd/fossil/Ccons.c - 664 sys sys 1042005504 6524
+sys/src/cmd/fossil/Clog.c - 664 sys sys 1042005505 591
+sys/src/cmd/fossil/archive.c - 664 sys sys 1042005505 9083
+sys/src/cmd/fossil/build - 664 sys sys 1042005505 449
+sys/src/cmd/fossil/buildsh - 775 sys sys 1042005505 561
+sys/src/cmd/fossil/bwatch.c - 664 sys sys 1042005505 6754
+sys/src/cmd/fossil/cache.c - 664 sys sys 1042005506 40534
+sys/src/cmd/fossil/dat.h - 664 sys sys 1042005506 7775
+sys/src/cmd/fossil/deadlock - 775 sys sys 1042005506 413
+sys/src/cmd/fossil/disk.c - 664 sys sys 1042005506 5634
+sys/src/cmd/fossil/dump.c - 664 sys sys 1042005506 1340
+sys/src/cmd/fossil/error.c - 664 sys sys 1042005507 1367
+sys/src/cmd/fossil/error.h - 664 sys sys 1042005507 744
+sys/src/cmd/fossil/file.c - 664 sys sys 1042005507 27500
+sys/src/cmd/fossil/flchk.c - 664 sys sys 1042005507 13670
+sys/src/cmd/fossil/flfmt.c - 664 sys sys 1042005507 10314
+sys/src/cmd/fossil/flproto - 664 sys sys 1042005508 210
+sys/src/cmd/fossil/fns.h - 664 sys sys 1042005508 2950
+sys/src/cmd/fossil/fossil-acid - 664 sys sys 1042005508 3965
+sys/src/cmd/fossil/fossil.c - 664 sys sys 1042005508 1256
+sys/src/cmd/fossil/fs.c - 664 sys sys 1042005508 16962
+sys/src/cmd/fossil/fs.h - 664 sys sys 1042005509 1222
+sys/src/cmd/fossil/history - 664 sys sys 1042005509 482
+sys/src/cmd/fossil/invariants - 664 sys sys 1042005509 4073
+sys/src/cmd/fossil/mkfile - 664 sys sys 1042005509 1589
+sys/src/cmd/fossil/nobwatch.c - 664 sys sys 1042005509 329
+sys/src/cmd/fossil/pack.c - 664 sys sys 1042005510 4683
+sys/src/cmd/fossil/periodic.c - 664 sys sys 1042005510 1091
+sys/src/cmd/fossil/source.c - 664 sys sys 1042005510 18124
+sys/src/cmd/fossil/srcload.c - 664 sys sys 1042005510 4178
+sys/src/cmd/fossil/stdinc.h - 664 sys sys 1042005510 155
+sys/src/cmd/fossil/trunc.c - 664 sys sys 1042005511 280
+sys/src/cmd/fossil/unpack - 775 sys sys 1042005511 286
+sys/src/cmd/fossil/vac.c - 664 sys sys 1042005511 12497
+sys/src/cmd/fossil/vac.h - 664 sys sys 1042005511 2782
+sys/src/cmd/fossil/view.c - 664 sys sys 1042005512 19708
+sys/src/cmd/fossil/walk.c - 664 sys sys 1042005512 963
 sys/src/cmd/freq.c - 664 sys sys 944961364 1682
 sys/src/cmd/getflags - 20000000775 sys sys 954036865 0
 sys/src/cmd/getflags/funcgetflags.c - 664 sys sys 944960998 4621

+ 77 - 0
dist/replica/plan9.log

@@ -16913,3 +16913,80 @@
 1041971572 11 c sys/man/7/INDEX - 664 sys sys 1041971551 89
 1041971572 12 c sys/man/8/INDEX - 664 sys sys 1041971551 2472
 1041973373 0 c sys/src/9/port/devfs.c - 664 sys sys 1041971958 10698
+1042005689 0 a 386/bin/fossil - 20000000775 sys sys 1042005470 0
+1042005689 1 a 386/bin/fossil/flchk - 775 sys sys 1042005470 226919
+1042005689 2 a 386/bin/fossil/flfmt - 775 sys sys 1042005471 225502
+1042005689 3 a 386/bin/fossil/fossil - 775 sys sys 1042005469 329196
+1042005689 4 a n/fossil - 20000000775 sys sys 1042005455 0
+1042005689 5 a n/snap - 20000000775 sys sys 1042005458 0
+1042005689 6 a sys/doc/fossil.ms - 664 sys sys 1042005621 30983
+1042005689 7 a sys/doc/fossil.ps - 664 sys sys 1042005622 105122
+1042005689 8 a sys/lib/sysconfig/fl - 20000000775 sys sys 1042004836 0
+1042005689 9 a sys/lib/sysconfig/fl/boot - 775 sys sys 1042004836 773
+1042005689 10 a sys/lib/sysconfig/fl/flproto - 664 sys sys 1042004836 129
+1042005689 11 a sys/lib/sysconfig/fl/venti.conf - 664 sys sys 1042004837 139
+1042005689 12 a sys/man/4/fossil - 664 sys sys 1042005418 8705
+1042005689 13 a sys/man/8/fossilcons - 664 sys sys 1042005415 12630
+1042005689 14 c sys/src/9/pc/devether.c - 664 sys sys 1042004805 10072
+1042005689 15 a sys/src/9/pc/pcfl - 664 sys sys 1042004821 1563
+1042005689 16 c sys/src/9/port/dev.c - 664 sys sys 1042004688 8082
+1042005689 17 a sys/src/cmd/fossil - 20000000775 sys sys 1042005512 0
+1042005689 18 a sys/src/cmd/fossil/9.h - 664 sys sys 1042005502 3379
+1042005689 19 a sys/src/cmd/fossil/9auth.c - 664 sys sys 1042005502 2389
+1042005689 20 a sys/src/cmd/fossil/9dir.c - 664 sys sys 1042005502 1995
+1042005689 21 a sys/src/cmd/fossil/9excl.c - 664 sys sys 1042005502 1887
+1042005689 22 a sys/src/cmd/fossil/9fid.c - 664 sys sys 1042005502 5236
+1042005689 23 a sys/src/cmd/fossil/9fsys.c - 664 sys sys 1042005503 26731
+1042005689 24 a sys/src/cmd/fossil/9lstn.c - 664 sys sys 1042005503 2865
+1042005689 25 a sys/src/cmd/fossil/9p.c - 664 sys sys 1042005503 21328
+1042005689 26 a sys/src/cmd/fossil/9ping.c - 664 sys sys 1042005503 1563
+1042005689 27 a sys/src/cmd/fossil/9proc.c - 664 sys sys 1042005503 7358
+1042005689 28 a sys/src/cmd/fossil/9srv.c - 664 sys sys 1042005504 3215
+1042005689 29 a sys/src/cmd/fossil/9user.c - 664 sys sys 1042005504 17476
+1042005689 30 a sys/src/cmd/fossil/Ccli.c - 664 sys sys 1042005504 1624
+1042005689 31 a sys/src/cmd/fossil/Ccmd.c - 664 sys sys 1042005504 7169
+1042005689 32 a sys/src/cmd/fossil/Ccons.c - 664 sys sys 1042005504 6524
+1042005689 33 a sys/src/cmd/fossil/Clog.c - 664 sys sys 1042005505 591
+1042005689 34 a sys/src/cmd/fossil/archive.c - 664 sys sys 1042005505 9083
+1042005689 35 a sys/src/cmd/fossil/build - 664 sys sys 1042005505 449
+1042005689 36 a sys/src/cmd/fossil/buildsh - 775 sys sys 1042005505 561
+1042005689 37 a sys/src/cmd/fossil/bwatch.c - 664 sys sys 1042005505 6754
+1042005689 38 a sys/src/cmd/fossil/cache.c - 664 sys sys 1042005506 40534
+1042005689 39 a sys/src/cmd/fossil/dat.h - 664 sys sys 1042005506 7775
+1042005689 40 a sys/src/cmd/fossil/deadlock - 775 sys sys 1042005506 413
+1042005689 41 a sys/src/cmd/fossil/disk.c - 664 sys sys 1042005506 5634
+1042005689 42 a sys/src/cmd/fossil/dump.c - 664 sys sys 1042005506 1340
+1042005689 43 a sys/src/cmd/fossil/error.c - 664 sys sys 1042005507 1367
+1042005689 44 a sys/src/cmd/fossil/error.h - 664 sys sys 1042005507 744
+1042005689 45 a sys/src/cmd/fossil/file.c - 664 sys sys 1042005507 27500
+1042005689 46 a sys/src/cmd/fossil/flchk.c - 664 sys sys 1042005507 13670
+1042005689 47 a sys/src/cmd/fossil/flfmt.c - 664 sys sys 1042005507 10314
+1042005689 48 a sys/src/cmd/fossil/flproto - 664 sys sys 1042005508 210
+1042005689 49 a sys/src/cmd/fossil/fns.h - 664 sys sys 1042005508 2950
+1042005689 50 a sys/src/cmd/fossil/fossil-acid - 664 sys sys 1042005508 3965
+1042005689 51 a sys/src/cmd/fossil/fossil.c - 664 sys sys 1042005508 1256
+1042005689 52 a sys/src/cmd/fossil/fs.c - 664 sys sys 1042005508 16962
+1042005689 53 a sys/src/cmd/fossil/fs.h - 664 sys sys 1042005509 1222
+1042005689 54 a sys/src/cmd/fossil/history - 664 sys sys 1042005509 482
+1042005689 55 a sys/src/cmd/fossil/invariants - 664 sys sys 1042005509 4073
+1042005689 56 a sys/src/cmd/fossil/mkfile - 664 sys sys 1042005509 1589
+1042005689 57 a sys/src/cmd/fossil/nobwatch.c - 664 sys sys 1042005509 329
+1042005689 58 a sys/src/cmd/fossil/pack.c - 664 sys sys 1042005510 4683
+1042005689 59 a sys/src/cmd/fossil/periodic.c - 664 sys sys 1042005510 1091
+1042005689 60 a sys/src/cmd/fossil/source.c - 664 sys sys 1042005510 18124
+1042005689 61 a sys/src/cmd/fossil/srcload.c - 664 sys sys 1042005510 4178
+1042005689 62 a sys/src/cmd/fossil/stdinc.h - 664 sys sys 1042005510 155
+1042005689 63 a sys/src/cmd/fossil/trunc.c - 664 sys sys 1042005511 280
+1042005689 64 a sys/src/cmd/fossil/unpack - 775 sys sys 1042005511 286
+1042005689 65 a sys/src/cmd/fossil/vac.c - 664 sys sys 1042005511 12497
+1042005689 66 a sys/src/cmd/fossil/vac.h - 664 sys sys 1042005511 2782
+1042005689 67 a sys/src/cmd/fossil/view.c - 664 sys sys 1042005512 19708
+1042005689 68 a sys/src/cmd/fossil/walk.c - 664 sys sys 1042005512 963
+1042005689 69 a sys/src/cmd/fossil/words - 664 sys sys 1042005512 1166
+1042006506 0 a sys/doc/fossil.pdf - 664 sys sys 1042006244 65276
+1042006506 1 c sys/doc/fossil.ps - 664 sys sys 1042006244 110324
+1042006877 0 d sys/src/cmd/fossil/words - 664 sys sys 1042005512 0
+1042045363 0 c sys/doc/fossil.ms - 664 sys sys 1042044710 31440
+1042045363 1 c sys/doc/fossil.pdf - 664 sys sys 1042044711 65284
+1042045363 2 c sys/doc/fossil.ps - 664 sys sys 1042044711 110396
+1042045363 3 c sys/src/9/port/devfs.c - 664 sys sys 1042044133 10702

+ 1164 - 0
sys/doc/fossil.ms

@@ -0,0 +1,1164 @@
+... .FP times
+... .fp 1 R R.nomath
+... .fp 5 CW LucidaSansCW83
+.TL
+Fossil, an Archival File Server
+.AU
+Sean Quinlan
+Jim McKie
+Russ Cox
+jmk,rsc@plan9.bell-labs.com
+.AB
+This paper describes the internals and 
+operation of Fossil, an archival file server built for Plan 9.
+Fossil has not yet replaced the current Plan 9 file server
+and
+.CW kfs ,
+but that is our eventual intent.
+Both fossil and this documentation are
+works in progress.  Comments on either are most welcome.
+.AE
+.de HP
+.LP
+..
+.NH 1
+Introduction
+.HP
+Fossil is an archival file server built for Plan 9.
+In a typical configuration, it maintains a traditional file system
+in a local disk partition and periodically archives snapshots of the file system
+to a Venti server.  These archives are made available through
+a file system interface.
+Fossil can also be run without a Venti server, in which case the
+snapshots (if any) occupy local disk space.
+.PP
+The bulk of this paper explains the underlying data structures:
+Venti trees, the Venti archival file system format, and finally Fossil's
+file system format.
+The end of the paper discusses the architecture of the Fossil server.
+.PP
+The presentation of the data structures is very detailed, perhaps
+too detailed for most readers.
+The intent is to record all the details necessary to make structural
+changes to the file system format.
+Feel free to jump ahead when boredom strikes.
+.NH 1
+Venti trees and directory hierarchies
+.HP
+Venti [3] is an archival block storage server.
+Once a block is stored, it can be retrieved by presenting the 20-byte
+SHA1 hash of its contents, called a
+.I score .
+Blocks on Venti have a maximum length of about 56 kilobytes,
+though in practice smaller blocks are used.
+To store a byte stream of arbitrary length, Venti uses a hash tree.
+Conceptually, the data stream is broken into fixed-size (say,
+.I dsize -byte)
+chunks, which are stored on the Venti server.
+The resulting scores are concatenated into a new pointer stream, which is
+broken into fixed-size (say,
+.I psize -byte)
+chunks, which are stored on the Venti server.
+.I Psize "" (
+is different from
+.I dsize
+so that we can ensure that each pointer block holds an
+integral number of pointers.)
+This yields a new pointer stream, and so on, until there is a single block
+and finally a single score describing the entire tree.
+The resulting structure looks like:
+.PS
+.ps 8
+.vs 10
+boxht=0.1
+boxwid=0.1
+
+B0: box invis wid 1 "\f(CWVtDataType\fP"
+move right 0.1
+L0a: box wid 0.2
+move right 0.1
+L0b: box wid 0.2
+move right 0.1
+L0c: box invis wid 0.2 "..."
+move right 0.1
+
+L0d: box wid 0.2
+move right 0.1
+L0e: box wid 0.2
+move right 0.2
+L0f: box invis wid 0.2 "..."
+move right 0.2
+
+L0g: box wid 0.2
+move right 0.1
+L0h: box wid 0.2
+move right 0.1
+L0i: box invis wid 0.2 "..."
+move right 0.1
+
+L0j: box wid 0.2
+move right 0.1
+L0k: box wid 0.2
+move right 0.1
+L0l: box invis wid 0.2 "..."
+move right 0.1
+L0m: box wid 0.2
+
+define boxppddd {
+	line from 0.2<$1.nw,$1.ne> to 0.2<$1.sw,$1.se>
+	line from 0.4<$1.nw,$1.ne> to 0.4<$1.sw,$1.se>
+	X: box invis at 0.1<$1.nw,$1.ne>
+	Y: box invis at 0.1<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $2.nw
+	X: box invis at 0.3<$1.nw,$1.ne>
+	Y: box invis at 0.3<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $3.nw
+	"..." at 0.7<$1.w,$1.e>
+}
+
+define boxppdddp {
+	line from 0.2<$1.nw,$1.ne> to 0.2<$1.sw,$1.se>
+	line from 0.4<$1.nw,$1.ne> to 0.4<$1.sw,$1.se>
+	line from 0.8<$1.nw,$1.ne> to 0.8<$1.sw,$1.se>
+	X: box invis at 0.1<$1.nw,$1.ne>
+	Y: box invis at 0.1<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $2.nw
+	X: box invis at 0.3<$1.nw,$1.ne>
+	Y: box invis at 0.3<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $3.nw
+	"..." at 0.6<$1.w,$1.e>
+	X: box invis at 0.9<$1.nw,$1.ne>
+	Y: box invis at 0.9<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $4.nw
+}
+
+define boxpdddp {
+	line from 0.2<$1.nw,$1.ne> to 0.2<$1.sw,$1.se>
+	line from 0.8<$1.nw,$1.ne> to 0.8<$1.sw,$1.se>
+	X: box invis at 0.1<$1.nw,$1.ne>
+	Y: box invis at 0.1<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $2.nw
+	"..." at 0.5<$1.w,$1.e>
+	X: box invis at 0.9<$1.nw,$1.ne>
+	Y: box invis at 0.9<$1.sw,$1.se>
+	line -> from 0.5<X,Y> to $3.nw
+}
+
+bhd=0.4
+L1abc: box wid 0.5 at 0.5<L0a, L0b>+(0,bhd)
+boxppddd(L1abc, L0a, L0b)
+L1def: box wid 0.5 at 0.5<L0d, L0e>+(0,bhd)
+boxppddd(L1def, L0d, L0e)
+L1ghi: box wid 0.5 at 0.5<L0g, L0h>+(0,bhd)
+boxppddd(L1ghi, L0g, L0h)
+L1jklm: box wid 0.5 at 0.5<L0j, L0k>+(0,bhd)
+boxppdddp(L1jklm, L0j, L0k, L0m)
+B1: box invis wid 1 "\f(CWVtPointerType0\fP" at B0+(0,bhd)
+
+L2abcdef: box wid 0.5 at 0.5<L1abc,L1def>+(0,bhd)
+boxppddd(L2abcdef, L1abc, L1def)
+L2ghijklm: box wid 0.5 at 0.5<L1ghi,L1jklm>+(0,bhd)
+boxpdddp(L2ghijklm, L1ghi, L1jklm)
+B2: box invis wid 1 "\f(CWVtPointerType1\fP" at B1+(0,bhd)
+
+L3atom: box wid 0.5 at 0.5<L2abcdef, L2ghijklm>+(0,bhd)
+boxpdddp(L3atom, L2abcdef, L2ghijklm)
+B3: box invis wid 1 "\f(CWVtPointerType2\fP" at B2+(0,bhd)
+.PE
+.LP
+The leaves are the original data stream.  Those blocks have type
+.CW VtDataType .
+The first pointer stream has type
+.CW VtPointerType0 ,
+the next has type
+.CW VtPointerType1 ,
+and so on.
+The figure ends with a single block of type
+.CW VtPointerType2 ,
+but in general trees can have height up to
+.CW VtPointerType6 .
+For a
+.I dsize
+of 8192 bytes
+and
+.I psize
+of 8180 bytes (409 pointers),
+this gives a maximum stream size of approximately 10 zettabytes
+(2\s-2\u73\d\s+2 or 10\s-2\u22\d\s+2 bytes).
+.PP
+Data blocks are truncated to remove trailing runs of zeros before
+storage to Venti; they are zero-filled back to
+.I dsize
+bytes after retrieval from Venti.
+Similarly, trailing runs of pointers to zero-length blocks are
+removed from and added back to pointer blocks.
+These simple rules happen to make it particularly efficient to store
+large runs of zeros, as might occur in a data stream with ``holes:''
+the zero-length block itself can be interpreted as a tree of any
+depth encoding an all-zero data stream.
+.PP
+Reconstructing the data stream requires the score and type of the
+topmost block in the tree, the data chunk size, the pointer chunk size,
+and the data stream size.
+(From the data stream size and the chunk sizes we could derive the
+depth of the tree and thus the type of the topmost block, but it is convenient
+to allow trees that are deeper than necessary.)
+This information is kept in a 40-byte structure called a
+.CW VtEntry :
+.P1
+VtEntry:
+.ta +\w'    'u +\w'            'u
+	gen[4]	\fRgeneration number\fP
+	psize[2]	\fRsize of pointer blocks\fP
+	dsize[2]	\fRsize of data blocks\fP
+	flags[1]
+	zero[5]
+	size[6]	\fRlength of file\fP
+	score[20]	\fRscore of root block in tree\fP
+.P2
+(In this notation,
+.CW name[sz]
+indicates a
+.CW sz -byte
+field called
+.CW name .
+Integers are stored in big-endian order.
+.CW Size
+really is a 48-bit field.)
+.CW Flags
+is made up of bit fields; its value is the bitwise
+.I or ' `
+of the following flags:
+.P1
+.ta +\w'      'u +\w'                      'u
+0x01	VtEntryActive	\fRentry is allocated\fP
+0x02	VtEntryDir	\fRentry describes a Venti directory (q.v.)\fP
+0x1C	VtEntryDepthMask	\fRmask for tree depth\fP
+0x20	VtEntryLocal	\fRreserved (q.v.)\fP
+.P2
+.LP
+The depth of the described tree is stored in the 3 bits indicated:
+a tree with a topmost node of type
+.CW VtPointerType3
+has depth 4.
+.PP
+With
+.CW VtEntry
+we can build more complicated data structures,
+ones with multiple or nested data streams.
+A data stream consisting of
+.CW VtEntry
+structures is called a Venti directory.
+It is identical in structure to the Venti data stream
+we described earlier except that the bottom-level type is
+.CW VtDirType ,
+and
+the
+.CW VtEntry
+describing a Venti directory has the
+.CW VtEntryDir
+flag bit set.
+The
+.I dsize
+for a Venti directory
+is a multiple of 40 so that each data chunk holds
+an integer number of
+.CW VtEntry
+structures.
+By analogy with Venti directories,
+we call the original data stream a
+Venti file.
+Note that Venti files are assumed
+.I not
+to contain pointers to other Venti blocks.
+The only pointers to Venti blocks occur in 
+.CW VtEntry
+structures in
+Venti directories
+(and in the internal hash tree structure of the
+individual files and directories).
+Note also that these directories are nothing more than pointer lists.
+In particular, there are no names or metadata like in a file system.
+.PP
+To make it easier to pass hierarchies between applications,
+the root of a hierarchy is described in a 300-byte structure
+called a
+.CW VtRoot :
+.P1
+VtRoot:
+.ta +\w'    'u +\w'                'u
+	version[2]	\f(CW2\fP
+	name[128]	\fRname of structure (just a comment)\fP
+	type[128]	\fRstring describing structure (\f(CWvac\fR)\f(CW
+	score[20]	\fRpointer to \f(CWVtDirType\fP block\f(CW
+	blockSize[2]	\fRmaximum block size in structure\fP
+	prev[20]	\fRprevious \f(CWVtRoot\fP in chain, if any\fP
+.P2
+.LP
+This structure is stored to Venti and its score is passed
+between applications, typically in the form
+``\fItype\f(CW:\fIrootscore\fR,''
+where
+.I type
+is the type field from the
+.CW VtRoot
+structure, and
+.I rootscore
+is the score of the
+.CW VtRoot
+block.
+.CW VtRoot
+structures can be chained together using the
+.I prev
+field to encode an archival history
+of the data structure.
+.PP
+For example, a small Venti hierarchy might look like:
+.PS
+.ps 8
+.vs 10
+boxwid=0.1
+boxht=0.1
+f=0.9
+mb=0.16
+
+VtRoot: [
+	right
+	B1: box
+	move right 0.1
+	"\f(CWVtRoot\fP" ljust
+]
+
+Root: [
+	right
+	B1: box fill f
+	B2: box fill f
+	B3: box fill f
+	move right 0.1
+] with .nw at VtRoot.sw+(0.2,-.1)
+Level1: [
+	RootMeta: [
+		box wid mb
+	]
+	MetaSource: [
+		right
+		B1: box wid 5*mb
+	] with .nw at RootMeta.sw+(0,-.1)
+
+	Source: [
+		right
+		B1: box fill f
+		B2: box fill f
+		B3: box fill f
+		B4: box fill f
+		B5: box fill f
+		B6: box fill f
+		B7: box fill f
+		B8: box fill f
+	] with .nw at MetaSource.sw+(0,-.1)
+	SB1: box invis at Source.B1
+	SB2: box invis at Source.B2
+	SB3: box invis at Source.B3
+] with .nw at Root.sw+(0.4,-.1)
+Level2: [
+	MetaSource: [
+		right
+		B1: box wid 5*mb
+	] 
+	Source: [
+		right
+		B1: box fill f
+		B2: box fill f
+		B3: box fill f
+		B4: box fill f
+		B5: box fill f
+		B6: box fill f
+		B7: box fill f
+		B8: box fill f
+	] with .nw at MetaSource.sw+(0,-.1)
+	File: box wid 0.8 with .nw at Source.sw+(0,-.1)
+] with .nw at Level1.sw+(0.6,-.1)
+
+line -> from VtRoot.B1 down boxwid/2+0.1+boxwid/2 then to Root.w
+line -> from Root.B3 down boxwid/2+0.1+boxwid/2 then to Level1.RootMeta.w
+line -> from Root.B2 down boxwid/2+0.1+boxwid+0.1+boxwid/2 then to Level1.MetaSource.w
+line -> from Root.B1 down boxwid/2+0.1+boxwid+0.1+boxwid+0.1+boxwid/2 then to Level1.Source.w
+
+line -> from Level1.SB3 down boxwid/2+0.1+boxwid/2 then to Level2.MetaSource.w
+line -> from Level1.SB2 down boxwid/2+0.1+boxwid+0.1+boxwid/2 then to Level2.Source.w
+line -> from Level1.SB1 down boxwid/2+0.1+boxwid+0.1+boxwid+0.1+boxwid/2 then to Level2.File.w
+
+[
+	KEY: box wid 1.5 invis "Key"
+	line from KEY.sw to KEY.se
+	k = -.1
+	kk=0.5
+	A: [
+		box wid 4*boxwid
+		"Venti file" ljust with .w at last box .w+(kk,0)
+	] with .nw at KEY.sw+(0,2*k)
+	B: [
+		box fill f
+		"Venti entry (\f(CWVtEntry\fP)" ljust with .w at last box .w+(kk,0)
+	] with .nw at A.sw+(0,k)
+	C: [
+		right
+		CC: box fill f
+		box fill f
+		box fill f
+		box fill f
+		"Venti directory" ljust with .w at CC.w+(kk,0)
+	] with .nw at B.sw+(0,k)
+	D: [
+		line -> right 3*boxwid
+		"Venti pointer (score)" ljust with .w at last line .w+(kk, 0)
+	] with .nw at C.sw+(0,k)
+] with .nw at VtRoot.nw+(3,0)
+.PE
+.LP
+Venti files are shown as white boxes, while directories are shown
+as shaded boxes.  Each shaded square represents a
+.CW VtEntry .
+Arrows represent pointers from
+.CW VtEntry
+structures to other
+Venti files or directories.
+.PP
+The hierarchical structure provided by Venti files and directories
+can be used as the base for more complicated data structures.
+Because this structure captures all the information
+about pointers to other blocks, tools written to traverse
+Venti hierarchies can traverse the more complicated
+data structures as well.
+For example,
+.I venti/copy
+(see
+.I ventiaux (8))
+copies a Venti hierarchy from one Venti server to another,
+given the root
+.CW VtEntry .
+Because the traditional file system described in later sections is
+layered on a Venti hierarchy, 
+.I venti/copy
+can copy it without fully understanding its structure.
+.NH 1
+Vac file system format
+.HP
+The Venti archive format
+.I vac
+builds a traditional file system using a Venti hierarchy.
+Each vac file is implemented as a Venti file;
+each vac directory is implemented as a Venti
+directory and a Venti file to provide traditional file system metadata.
+The metadata is stored in a structure called a
+.CW DirEntry :
+.P1
+DirEntry:
+.ta +\w'    'u +\w'            'u
+	magic[4]	\f(CW0x1c4d9072\fP (DirMagic)\fP
+	version[2]	\f(CW9\fP
+	elem[s]	\fRname (final path element only)\fP
+	entry[4]	\fRentry number for Venti file or directory\fP
+	gen[4]	\fRgeneration number\fP
+	mentry[4]	\fRentry number for Venti file holding metadata\fP
+	mgen[4]	\fRgeneration number\fP
+	qid[8]	\fRunique file serial number\fP
+	uid[s]	\fRowner\fP
+	gid[s]	\fRgroup\fP
+	mid[s]	\fRlast modified by\fP
+	mtime[4]	\fRlast modification time\fP
+	ctime[4]	\fRcreation time\fP
+	atime[4]	\fRlast access time\fP
+	mode[4]	\fRmode bits\fP
+.P2
+The notation
+.CW name[s]
+denotes a string stored as a two-byte length
+and then that many bytes.
+The above describes Version 9 of the 
+.CW DirEntry
+format.  Versions 7 and 8 are very similar; they can be
+read by the current
+.I vac
+source code but are not written.
+Earlier versions were not widespread.
+A
+.CW DirEntry
+may be followed by optional extension sections, though none
+are currently used.
+The
+.CW mode
+bits include bits commonly used by
+Unix and Windows, in addition to those used by Plan 9.
+.PP
+The
+.CW entry
+field is an index into the parallel Venti directory.
+The
+.CW gen
+field must match the
+.CW gen 
+field in the corresponding
+.CW VtEntry
+in the directory;
+it is used to detect
+stale indices.
+Similarly,
+.CW mentry
+and
+.CW mgen
+are the index and generation number
+for the metadata Venti file,
+if the
+.CW DirEntry
+describes a vac directory.
+.PP
+The relation between Venti files and directories and
+vac files and directories can be seen in this figure:
+.PS
+.ps 8
+.vs 10
+boxwid=0.1
+boxht=0.1
+f=0.9
+mb=0.16
+
+VtRoot: [
+	right
+	B1: box
+	move right 0.1
+	"\f(CWVtRoot\fP" ljust
+]
+
+SuperRoot: [
+	right
+	B1: box fill f
+	move right 0.1
+	"fs root block" ljust
+] with .nw at VtRoot.sw + (0.2, -.2)
+Root: [
+	right
+	B1: box fill f
+	B2: box fill f
+	B3: box fill f
+	move right 0.1
+	"root directory info block" ljust
+] with .nw at SuperRoot.sw+(0.2, -.2)
+Level1: [
+	RootMeta: [
+		box wid mb
+		move right 0.1
+		"root metadata" ljust
+	]
+	MetaSource: [
+		right
+		B1: box wid mb
+		B2: box wid mb
+		B3: box wid mb
+		B4: box wid mb
+		B5: box wid mb
+	] with .nw at RootMeta.sw+(0,-.2)
+	MB1: box wid mb invis at MetaSource.B1
+	MB2: box wid mb invis at MetaSource.B2
+	MB3: box wid mb invis at MetaSource.B3
+	MB4: box wid mb invis at MetaSource.B4
+	MB5: box wid mb invis at MetaSource.B5
+
+	Source: [
+		right
+		B1: box fill f
+		B2: box fill f
+		B3: box fill f
+		B4: box fill f
+		B5: box fill f
+		B6: box fill f
+		B7: box fill f
+		B8: box fill f
+	] with .nw at MetaSource.sw+(0,-.1)
+	SB1: box invis at Source.B1
+	SB2: box invis at Source.B2
+	SB3: box invis at Source.B3
+	SB4: box invis at Source.B4
+	SB5: box invis at Source.B5
+	SB6: box invis at Source.B6
+	SB7: box invis at Source.B7
+	SB8: box invis at Source.B8
+] with .nw at Root.sw+(0.4,-.2)
+Level2: [
+	MetaSource: [
+		right
+		B1: box wid mb
+		B2: box wid mb
+		B3: box wid mb
+		B4: box wid mb
+		B5: box wid mb
+	] 
+	Source: [
+		right
+		B1: box fill f
+		B2: box fill f
+		B3: box fill f
+		B4: box fill f
+		B5: box fill f
+		B6: box fill f
+		B7: box fill f
+		B8: box fill f
+	] with .nw at MetaSource.sw+(0,-.1)
+	File: box wid 0.8 with .nw at Source.sw+(0,-.2)
+] with .nw at Level1.sw+(0.6,-.2)
+
+line -> from VtRoot.B1 down boxwid/2+0.2+boxwid/2 then to SuperRoot.w
+line -> from SuperRoot.B1 down boxwid/2+0.2+boxwid/2 then to Root.w
+line -> from Root.B3 down boxwid/2+0.2+boxwid/2 then to Level1.RootMeta.w
+line -> from Root.B2 down boxwid/2+0.2+boxwid+0.2+boxwid/2 then to Level1.MetaSource.w
+line -> from Root.B1 down boxwid/2+0.2+boxwid+0.1+boxwid+0.2+boxwid/2 then to Level1.Source.w
+
+line -> from Level1.SB3 down boxwid/2+0.2+boxwid/2 then to Level2.MetaSource.w
+line -> from Level1.SB2 down boxwid/2+0.2+boxwid+0.1+boxwid/2 then to Level2.Source.w
+line -> from Level1.SB1 down boxwid/2+0.2+boxwid+0.1+boxwid+0.2+boxwid/2 then to Level2.File.w
+
+arrowwid = arrowwid/2
+arrowht = arrowht/2
+line -> from Level1.MB1 to Level1.SB1.n
+line -> from Level1.MB2 to Level1.SB2.n
+line -> from Level1.MB2 to Level1.SB3.n
+line -> from Level1.MB4 to Level1.SB7.n
+line -> from Level1.MB5 to Level1.SB5.n
+arrowwid = arrowwid * 2
+arrowht = arrowht * 2
+
+box dashed with .nw at Level1.MetaSource.nw+(-.05,.05) wid 0.8+.05*2 ht .3+.05*2
+box dashed with .nw at Level2.MetaSource.nw+(-.05,.05) wid 0.8+.05*2 ht .3+.05*2
+box dotted with .nw at Level2.File.nw+(-.05,.05) wid 0.8+0.05*2 ht .1+.05*2
+
+[
+	KEY: box wid 1.5 invis "Key"
+	line from KEY.sw to KEY.se
+	k = -.1
+	kk=0.5
+	A: [
+		box wid 4*boxwid
+		"Venti file" ljust with .w at last box .w+(kk,0)
+	] with .nw at KEY.sw+(0,2*k)
+	B: [
+		box fill f
+		"Venti entry (\f(CWEntry\fP)" ljust with .w at last box .w+(kk,0)
+	] with .nw at A.sw+(0,k)
+	C: [
+		right
+		CC: box fill f
+		box fill f
+		box fill f
+		box fill f
+		"Venti directory" ljust with .w at CC.w+(kk,0)
+	] with .nw at B.sw+(0,k)
+	D: [
+		line -> right 3*boxwid
+		"Venti pointer (score)" ljust with .w at last line .w+(kk, 0)
+	] with .nw at C.sw+(0,k)
+	DD: [
+		box dotted wid 4*boxwid
+		"Vac file" ljust with .w at last box .w+(kk,0)
+	] with .nw at D.sw+(0,k)
+	E: [
+		box wid mb
+		"Vac entry (\f(CWDirEntry\fP)" ljust with .w at last box .w+(kk,0)
+	] with .nw at DD.sw+(0,k)
+	G: [
+		box dashed wid 4*boxwid
+		"Vac directory" ljust with .w at last box .w+(kk,0)
+	] with .nw at E.sw+(0,k)
+	H: [
+		arrowwid = arrowwid/2
+		arrowht = arrowht/2
+		line -> right 1.5*boxwid
+		"Vac pointer (integer index)" ljust with .w at last line .w+(kk, 0)
+		arrowwid = arrowwid * 2
+		arrowht = arrowht * 2
+	] with .nw at G.sw+(0,k)
+] with .nw at VtRoot.nw+(3,0)
+.PE
+.LP
+In reality, the story is slightly more complicated.
+The metadata file in a Vac directory
+is not just the concatenation of
+.CW DirEntry
+structures.
+Instead, it is the concatenation of
+.CW MetaBlocks .
+A
+.CW MetaBlock
+contains some number of
+.CW DirEntry
+structures along with a sorted index to make it easy
+to look for a particular
+.CW DirEntry
+by its
+.CW elem 
+field.
+The details are in the source code.
+.PP
+As shown in the diagram,
+the root directory of the file system is summarized by
+three
+.CW VtEntry
+structures describing
+the Venti directory for the children of the root,
+the Venti file for the metadata describing the children of the root,
+and a Venti file holding metadata for the root directory itself.
+These
+.CW VtEntry
+structures are placed in a Venti directory of their own,
+described by the single 
+.CW VtEntry
+in the
+root block.
+.NH 1
+Fossil file system format
+.HP
+Fossil uses the vac format, with some small changes.
+The changes only affect the data on the local disk; the data
+archived to Venti is exactly in vac format.
+.PP
+Blocks stored on local disk may contain scores pointing at local disk
+blocks or at Venti blocks. 
+Local block addresses are stored as 20-byte scores in which the first 16 bytes
+are all zero and the last 4 bytes specify a block number in the disk.
+Before a block is archived, all the
+blocks it points to must be archived, and the local scores in the block
+must be changed to Venti scores.
+Using block addresses rather than content hashes for local data
+makes the local file system easier to manage: if a local block's contents
+change, the pointer to the block does not need to change.
+.NH 2
+Snapshots
+.HP
+Fossil is an archival file server.
+It takes periodic snapshots of the file system,
+which are made accessible through the file system.
+Specifically, the active file system is presented in
+.CW /active .
+Ephemeral snapshots (those that are kept on local disk and eventually deleted)
+are presented in
+\f(CW/snapshot/\fIyyyy\f(CW/\fImmdd\f(CW/\fIhhmm\fR,
+where
+.I yyyy
+is the full year,
+.I mm
+is the month number,
+.I dd
+is the day number,
+.I hh
+is the hour,
+and
+.I mm
+is the minute.
+Archival snapshots (those that are archived to Venti and persist forever)
+are presented in
+\f(CW/archive/\fIyyyy\f(CW/\fImmdds\fR,
+where
+.I yyyy ,
+.I mm ,
+and
+.I dd
+are year, month, and day as before,
+and
+.I s
+is a sequence number if more than one
+archival snapshot is done in a day.
+For the first snapshot,
+.I s
+is null.
+For the subsequent snapshots,
+.I s
+is
+.CW .1 ,
+.CW .2 ,
+.CW .3 ,
+etc.
+.PP
+To implement the snapshots, the file server maintains a
+current
+.I epoch
+for the active file system.
+Each local block has a label that records, among other things,
+the epoch in which the block was allocated.
+If a block was allocated in an epoch earlier than the current one,
+it is immutable and treated as copy-on-write.
+Taking a snapshot can be accomplished by
+recording the address of the current root block and then 
+incrementing the epoch number.
+Notice that the copy-on-write method makes
+snapshots both time efficient and space efficient.
+The only time cost is waiting for all current file system
+requests to finish and then incrementing a counter.
+After a snapshot, blocks only get copied when they are
+next modified, so the per-snapshot
+space requirement is proportional
+to the amount of new data rather than the total
+size of the file system.
+.PP
+The blocks in the archival snapshots are moved to Venti,
+but the blocks in the ephemeral snapshots take up space
+in the local disk file.
+To allow reclamation of this disk space, the file system
+maintains a 
+.I low
+.I epoch ,
+which is the epoch of the earliest ephemeral snapshot
+still available.
+Fossil only allows access to snapshots with epoch numbers
+between the 
+low epoch and the current epoch
+(also called the high epoch).
+Incrementing the low epoch thus makes old
+snapshots inaccessible.
+The space required to store those snapshots can then
+be reclaimed, as described below.
+.NH 2
+Local blocks
+.HP
+The bulk of the local disk file is the local blocks.
+Each block has a 14-byte label associated with it, of the format:
+.P1
+Label:
+.ta +\w'    'u +\w'                'u
+	state[1]	\fRblock state\fP
+	type[1]	\fRblock type\fP
+	epoch[4]	\fRallocation epoch\fP
+	epochClose[4]	\fRclose epoch\fP
+	tag[4]	\fRrandom tag\fP
+.P2
+.LP
+The
+.CW type
+is an analogue of the block types described earlier,
+though different names are used, to distinguish between
+pointer blocks in a hash tree for a data stream
+and pointer blocks for a directory stream.
+The
+.CW epoch
+was mentioned in the last section.
+The other fields are explained below.
+.PP
+There are two distinguished block states
+.CW BsFree
+.CW 0x00 ) (
+and
+.CW BsBad
+.CW 0xFF ), (
+which mark blocks that are available for allocation
+and blocks that are bad and should be avoided.
+If
+.CW state
+is not one of these values, it is a bitwise
+.I or ' `
+of the following flags:
+.P1
+.ta +\w'      'u +\w'                'u
+0x01	BsAlloc	\fRblock is in use\fP
+0x02	BsCopied	\fRblock has been copied\fP
+0x04	BsVenti	\fRblock has been stored on Venti\fP
+0x08	BsClosed	\fRblock has been unlinked from active file system\fP
+.P2
+.LP
+The flags are explained as they arise in the discussions below.
+.PP
+It is convenient to store some extra fields in the
+.CW VtEntry
+structure when it describes a Venti file or directory
+stored on local disk.
+Specifically, we set the
+.CW VtEntryLocal
+flag bit
+and then use the bytes 7-16 of the score (which would
+otherwise be zero, since it is a local score) to hold these fields:
+.P1
+.ta +\w'    'u +\w'                'u
+	archive[1]	\fRboolean: this is an archival snapshot\fP
+	snap[4]	\fRepoch number if root of snapshot\fP
+	tag[4]	\fRrandom tag\fP
+.P2
+.LP
+The extended
+.CW VtEntry
+structure is called an
+.CW Entry .
+The
+.CW tag
+field
+in the
+.CW Label
+and the
+.CW Entry
+is used to identify dangling pointers or other file system corruption:
+all the local blocks in a hash tree must
+have tags matching the tag in the
+.CW Entry .
+If this
+.CW Entry
+points at the root of a snapshot,
+the
+.CW snap
+field is the epoch of the snapshot.
+If the snapshot is intended to be archived to Venti,
+the
+.CW archive
+field is non-zero.
+.NH 2
+Block reclamation
+.HP
+The blocks in the active file system form a tree: each
+block has only one parent.
+Once a copy-on-write block 
+.I b
+is replaced by its copy, it is no longer
+needed by the active file system.
+At this point,
+.I b
+is unlinked from the active file system.
+We say that
+.I b
+is now
+.I closed :
+it is needed only for snapshots.
+When a block is closed, the
+.CW BsClosed
+bit is set in its state, and the current epoch (called the block's closing epoch)
+is stored in the
+.CW epochClose
+label field.
+(Open blocks have an
+.CW epochClose
+of
+.CW ~0 ).
+.PP
+A block is referenced by snapshots with epochs
+between the block's allocation epoch and its closing epoch.
+Once the file system's low epoch grows to be greater than or equal to the block's
+closing epoch, the block is no longer needed for any snapshots
+and can be reused.
+.PP
+In a typical configuration, where nightly archival snapshots
+are taken and written to Venti, it is desirable to reclaim
+the space occupied by now-archived blocks if possible.
+To do this, Fossil keeps track of whether the pointers
+in each block are unique to that block.
+When a block
+.I bb
+is allocated, a pointer to
+.I bb
+is written into exactly one active block (say,
+.I b ).
+In the absence of snapshots, the pointer to
+.I bb
+will remain unique to
+.I b ,
+so that if the pointer is zeroed,
+.I bb
+can be immediately reused.
+Snapshots complicate this invariant:
+when
+.I b
+is copied-on-write, all its pointers
+are no longer unique to it.
+At the time of the copy, the
+.CW BsCopied
+state bit in the block's label
+is set to note the duplication of the pointers contained within.
+.NH 2
+Disk layout
+.HP
+The file system header describes the file system layout and has this format:
+.P1
+.ta +\w'    'u +\w'                'u
+Header:
+	magic[4]	\fR0x3776AE89 (HeaderMagic)\fP
+	version[2]	\fR1 (HeaderVersion)\fP
+	blockSize[2]	\fRfile system block size\fP
+	super[4]	\fRblock offset of super block\fP
+	label[4]	\fRblock offset of labels\fP
+	data[4]	\fRdata blocks\fP
+	end[4]	\fRend of file system\fP
+.P2
+.LP
+The corresponding file system layout is:
+.PS
+.ps 8
+.vs 9
+boxwid=0.75
+boxht=0.15
+Empty: box "empty" ht 0.25
+Header: box "header" with .n at Empty.s
+Empty2: box "empty" with .n at Header.s
+Super: box "super block" with .n at Empty2.s
+Label: box "label" "blocks" with .n at Super.s ht 0.25
+Data: box "data" "blocks" with .n at Label.s ht 0.3
+"  0" ljust at Empty.ne
+"  128kB" ljust at Header.ne
+"  \f5super\fP \(mu \f(CWblockSize\fP" ljust at Super.ne
+"  \f5label\fP \(mu \f(CWblockSize\fP" ljust at Label.ne
+"  \f5data\fP \(mu \f(CWblockSize\fP" ljust at Data.ne
+"  \f5end\fP \(mu \f(CWblockSize\fP" ljust at Data.se
+"" at (-1,0)
+"" at (6,0)
+.PE
+.LP
+The numbers to the right of the blocks are byte offsets
+of the boundaries.
+.LP
+The super block describes the file system itself and looks like:
+.P1
+.ta +\w'    'u +\w'                'u
+Super:
+	magic[4]	\fR0x2340A3B1 (SuperMagic)\fP
+	version[2]	\fR1 (SuperVersion)\fP
+	epochLow[4]	\fRfile system low epoch\fP
+	epochHigh[4]	\fRfile system high (active) epoch\fP
+	qid[8]	\fRnext qid to allocate\fP
+	active[4]	\fRdata block number: root of active file system\fP
+	next[4]	\fRdata block number: root of next file system to archive\fP
+	current[4]	\fRdata block number: root of file system currently being archived\fP
+	last[20]	\fRVenti score of last successful archive\fP
+	name[128]	\fRname of file system (just a comment)\fP
+.P2
+.LP
+.NH 1
+Fossil server
+.HP
+The Fossil server is a user-space program that runs on a standard Plan 9 kernel.
+.NH 2
+Process structure
+.PP
+The file server is structured as a set of processes synchronizing
+mostly through message passing along queues.
+The processes are given names, which can be seen in the output of
+.CW ps 
+.CW -a .
+.PP
+.CW Listen
+processes announce on various network addresses.
+A
+.CW con
+process handles each incoming connection, reading 9P requests
+and adding them to a central message queue.
+.CW Msg
+processes remove 9P requests from the queue,
+handle them, and write the responses to the appropriate
+file descriptors.
+.PP
+The
+.CW disk
+process handles disk I/O requests made by the other processes.
+The
+.CW flush
+process writes dirty blocks from the in-memory block cache to disk.
+The
+.CW unlink
+process frees previously linked blocks once the blocks that point at them
+have been written to disk.
+.PP
+A
+.CW consI
+process reads from each console file (typically a pipe posted in
+.CW /srv ),
+adding the typed characters to the input queue.
+The
+.CW cons
+process echoes input and runs the commands, saving
+output in a ring buffer.
+Because there is only one
+.CW cons
+process, only one console command may be executing at a time.
+A
+.CW consO
+process copies this ring buffer to each console file.
+.PP
+The
+.CW periodic
+process runs periodic events, like
+flushing the root metadata to disk or
+taking snapshots of the file system.
+.NH 2
+Block cache
+.HP
+Fossil maintains an in-memory block cache which 
+holds both local disk blocks and Venti blocks.
+Cache eviction follows a least recently used policy.
+Dirty blocks are restricted to at most half the cache.
+This can be changed by editing
+.CW DirtyPercentage
+in 
+.CW dat.h .
+.PP
+The block cache uses soft updates [1] to ensure that the on-disk
+file system is always self-consistent.
+Thus there is no
+.I halt
+console command
+and no need to check a file system 
+that was shut down without halting.
+.NH 2
+Archiving
+.HP
+A background process writes blocks in archival snapshots to Venti.
+Although
+.CW /archive/\fIyyyy\fP/\fImmdds\fR
+is a copy of only
+.CW /active
+at the time of the snapshot,
+the archival process archives the
+entire file tree rather than just
+the subtree rooted at
+.CW /active .
+The snapshots
+.CW /snapshot/\fIyyyy\fP/\fImmdd\fP/\fIhhmm
+are stored as empty directories.
+Once all the blocks have been archived,
+a 
+.CW VtRoot
+header for the file system is archived.
+The score of that header is recorded in
+.CW super.last
+and also printed on the file server console.
+The score can be used by
+.I flfmt
+to restore a file system (see
+.I fossil (4)).
+.NH 2
+Contrast with the old file server
+.HP
+The most obvious difference between Fossil and the 
+old Plan 9 file server [2] is that Fossil uses a Venti server as 
+its archival storage in place of a WORM juke box.
+There are a few other architectural differences to be 
+aware of.
+.PP
+Fossil is a user-level program run on a standard kernel.
+.PP
+Fossil does not have any way to concatenate, stripe, or
+mirror disk files.  For functionality similar to the old file server's
+configuration strings, use the experimental file stack device 
+(see
+.I devfs (3)).
+.PP
+Fossil speaks only 9P2000.  Old 9P (aka 9P1) is not supported.
+.PP
+... XXX words about converting an old file system to fossil?
+.NH 1
+References
+.LP
+[1] Gregory R. Ganger, Marshall Kirk McKusick, Craig A. N. Soules,
+and Yale N. Patt.
+``Soft Updates: A Solution to the Metadata Update Problem
+in File Systems,''
+.I "ACM Transactions on Computer Systems" ,
+Vol. 18, No. 2, May 2000, pp. 127\-153.
+.LP
+[2] Sean Quinlan, ``A Cached WORM File System,''
+.I "Software\(emPractice and Experience" ,
+Vol. 21, No. 12, December 1991, pp. 1289\-1299.
+.LP
+[3] Sean Quinlan and Sean Dorward, ``Venti: A New Approach to Archival Storage,''
+.I "Usenix Conference on File and Storage Technologies" ,
+2002.

BIN
sys/doc/fossil.pdf


+ 5401 - 0
sys/doc/fossil.ps

@@ -0,0 +1,5401 @@
+%!PS-Adobe-2.0
+%%Version: 0.1
+%%DocumentFonts: (atend)
+%%Pages: (atend)
+%%EndComments
+%
+% Version 3.3.2 prologue for troff files.
+%
+
+/#copies 1 store
+/aspectratio 1 def
+/formsperpage 1 def
+/landscape false def
+/linewidth .3 def
+/magnification 1 def
+/margin 0 def
+/orientation 0 def
+/resolution 720 def
+/rotation 1 def
+/xoffset 0 def
+/yoffset 0 def
+
+/roundpage true def
+/useclippath true def
+/pagebbox [0 0 612 792] def
+
+/R  /Times-Roman def
+/I  /Times-Italic def
+/B  /Times-Bold def
+/BI /Times-BoldItalic def
+/H  /Helvetica def
+/HI /Helvetica-Oblique def
+/HB /Helvetica-Bold def
+/HX /Helvetica-BoldOblique def
+/CW /Courier def
+/CO /Courier def
+/CI /Courier-Oblique def
+/CB /Courier-Bold def
+/CX /Courier-BoldOblique def
+/PA /Palatino-Roman def
+/PI /Palatino-Italic def
+/PB /Palatino-Bold def
+/PX /Palatino-BoldItalic def
+/Hr /Helvetica-Narrow def
+/Hi /Helvetica-Narrow-Oblique def
+/Hb /Helvetica-Narrow-Bold def
+/Hx /Helvetica-Narrow-BoldOblique def
+/KR /Bookman-Light def
+/KI /Bookman-LightItalic def
+/KB /Bookman-Demi def
+/KX /Bookman-DemiItalic def
+/AR /AvantGarde-Book def
+/AI /AvantGarde-BookOblique def
+/AB /AvantGarde-Demi def
+/AX /AvantGarde-DemiOblique def
+/NR /NewCenturySchlbk-Roman def
+/NI /NewCenturySchlbk-Italic def
+/NB /NewCenturySchlbk-Bold def
+/NX /NewCenturySchlbk-BoldItalic def
+/ZD /ZapfDingbats def
+/ZI /ZapfChancery-MediumItalic def
+/S  /S def
+/S1 /S1 def
+/GR /Symbol def
+
+/inch {72 mul} bind def
+/min {2 copy gt {exch} if pop} bind def
+
+/setup {
+	counttomark 2 idiv {def} repeat pop
+
+	landscape {/orientation 90 orientation add def} if
+	/scaling 72 resolution div def
+	linewidth setlinewidth
+	1 setlinecap
+
+	pagedimensions
+	xcenter ycenter translate
+	orientation rotation mul rotate
+	width 2 div neg height 2 div translate
+	xoffset inch yoffset inch neg translate
+	margin 2 div dup neg translate
+	magnification dup aspectratio mul scale
+	scaling scaling scale
+
+	addmetrics
+	0 0 moveto
+} def
+
+/pagedimensions {
+	useclippath userdict /gotpagebbox known not and {
+		/pagebbox [clippath pathbbox newpath] def
+		roundpage currentdict /roundpagebbox known and {roundpagebbox} if
+	} if
+	pagebbox aload pop
+	4 -1 roll exch 4 1 roll 4 copy
+	landscape {4 2 roll} if
+	sub /width exch def
+	sub /height exch def
+	add 2 div /xcenter exch def
+	add 2 div /ycenter exch def
+	userdict /gotpagebbox true put
+} def
+
+/addmetrics {
+	/Symbol /S null Sdefs cf
+	/Times-Roman /S1 StandardEncoding dup length array copy S1defs cf
+} def
+
+/pagesetup {
+	/page exch def
+	currentdict /pagedict known currentdict page known and {
+		page load pagedict exch get cvx exec
+	} if
+} def
+
+/decodingdefs [
+	{counttomark 2 idiv {y moveto show} repeat}
+	{neg /y exch def counttomark 2 idiv {y moveto show} repeat}
+	{neg moveto {2 index stringwidth pop sub exch div 0 32 4 -1 roll widthshow} repeat}
+	{neg moveto {spacewidth sub 0.0 32 4 -1 roll widthshow} repeat}
+	{counttomark 2 idiv {y moveto show} repeat}
+	{neg setfunnytext}
+] def
+
+/setdecoding {/t decodingdefs 3 -1 roll get bind def} bind def
+
+/w {neg moveto show} bind def
+/m {neg dup /y exch def moveto} bind def
+/done {/lastpage where {pop lastpage} if} def
+
+/f {
+	dup /font exch def findfont exch
+	dup /ptsize exch def scaling div dup /size exch def scalefont setfont
+	linewidth ptsize mul scaling 10 mul div setlinewidth
+	/spacewidth ( ) stringwidth pop def
+} bind def
+
+/changefont {
+	/fontheight exch def
+	/fontslant exch def
+	currentfont [
+		1 0
+		fontheight ptsize div fontslant sin mul fontslant cos div
+		fontheight ptsize div
+		0 0
+	] makefont setfont
+} bind def
+
+/sf {f} bind def
+
+/cf {
+	dup length 2 idiv
+	/entries exch def
+	/chtab exch def
+	/newencoding exch def
+	/newfont exch def
+
+	findfont dup length 1 add dict
+	/newdict exch def
+	{1 index /FID ne {newdict 3 1 roll put}{pop pop} ifelse} forall
+
+	newencoding type /arraytype eq {newdict /Encoding newencoding put} if
+
+	newdict /Metrics entries dict put
+	newdict /Metrics get
+	begin
+		chtab aload pop
+		1 1 entries {pop def} for
+		newfont newdict definefont pop
+	end
+} bind def
+
+%
+% A few arrays used to adjust reference points and character widths in some
+% of the printer resident fonts. If square roots are too high try changing
+% the lines describing /radical and /radicalex to,
+%
+%	/radical	[0 -75 550 0]
+%	/radicalex	[-50 -75 500 0]
+%
+% Move braceleftbt a bit - default PostScript character is off a bit.
+%
+
+/Sdefs [
+	/bracketlefttp		[201 500]
+	/bracketleftbt		[201 500]
+	/bracketrighttp		[-81 380]
+	/bracketrightbt		[-83 380]
+	/braceleftbt		[203 490]
+	/bracketrightex		[220 -125 500 0]
+	/radical		[0 0 550 0]
+	/radicalex		[-50 0 500 0]
+	/parenleftex		[-20 -170 0 0]
+	/integral		[100 -50 500 0]
+	/infinity		[10 -75 730 0]
+] def
+
+/S1defs [
+	/underscore		[0 80 500 0]
+	/endash			[7 90 650 0]
+] def
+%
+% Version 3.3.2 drawing procedures for dpost. Automatically pulled in when
+% needed.
+%
+
+/inpath false def
+/savematrix matrix def
+
+/Dl {
+	inpath
+		{neg lineto pop pop}
+		{newpath neg moveto neg lineto stroke}
+	ifelse
+} bind def
+
+/De {
+	/y1 exch 2 div def
+	/x1 exch 2 div def
+	/savematrix savematrix currentmatrix def
+	neg exch x1 add exch translate
+	x1 y1 scale
+	0 0 1 0 360
+	inpath
+		{1 0 moveto arc savematrix setmatrix}
+		{newpath arc savematrix setmatrix stroke}
+	ifelse
+} bind def
+
+/Da {
+	/dy2 exch def
+	/dx2 exch def
+	/dy1 exch def
+	/dx1 exch def
+	dy1 add neg exch dx1 add exch
+	dx1 dx1 mul dy1 dy1 mul add sqrt
+	dy1 dx1 neg atan
+	dy2 neg dx2 atan
+	inpath
+		{arc}
+		{newpath arc stroke}
+	ifelse
+} bind def
+
+/DA {
+	/dy2 exch def
+	/dx2 exch def
+	/dy1 exch def
+	/dx1 exch def
+	dy1 add neg exch dx1 add exch
+	dx1 dx1 mul dy1 dy1 mul add sqrt
+	dy1 dx1 neg atan
+	dy2 neg dx2 atan
+	inpath
+		{arcn}
+		{newpath arcn stroke}
+	ifelse
+} bind def
+
+/Ds {
+	/y2 exch def
+	/x2 exch def
+	/y1 exch def
+	/x1 exch def
+	/y0 exch def
+	/x0 exch def
+	x0 5 x1 mul add 6 div
+	y0 5 y1 mul add -6 div
+	x2 5 x1 mul add 6 div
+	y2 5 y1 mul add -6 div
+	x1 x2 add 2 div
+	y1 y2 add -2 div
+	inpath
+		{curveto}
+		{newpath x0 x1 add 2 div y0 y1 add -2 div moveto curveto stroke}
+	ifelse
+} bind def
+%
+% Tries to round clipping path dimensions, as stored in array pagebbox, so they
+% match one of the known sizes in the papersizes array. Lower left coordinates
+% are always set to 0.
+%
+
+/roundpagebbox {
+    7 dict begin
+	/papersizes [8.5 inch 11 inch 14 inch 17 inch] def
+
+	/mappapersize {
+		/val exch def
+		/slop .5 inch def
+		/diff slop def
+		/j 0 def
+		0 1 papersizes length 1 sub {
+			/i exch def
+			papersizes i get val sub abs
+			dup diff le {/diff exch def /j i def} {pop} ifelse
+		} for
+		diff slop lt {papersizes j get} {val} ifelse
+	} def
+
+	pagebbox 0 0 put
+	pagebbox 1 0 put
+	pagebbox dup 2 get mappapersize 2 exch put
+	pagebbox dup 3 get mappapersize 3 exch put
+    end
+} bind def
+
+%%EndProlog
+%%BeginSetup
+mark
+%
+% Encoding vector and redefinition of findfont for the ISO Latin1 standard.
+% The 18 characters missing from ROM based fonts on older printers are noted
+% below.
+%
+
+/ISOLatin1Encoding [
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/space
+	/exclam
+	/quotedbl
+	/numbersign
+	/dollar
+	/percent
+	/ampersand
+	/quoteright
+	/parenleft
+	/parenright
+	/asterisk
+	/plus
+	/comma
+	/minus
+	/period
+	/slash
+	/zero
+	/one
+	/two
+	/three
+	/four
+	/five
+	/six
+	/seven
+	/eight
+	/nine
+	/colon
+	/semicolon
+	/less
+	/equal
+	/greater
+	/question
+	/at
+	/A
+	/B
+	/C
+	/D
+	/E
+	/F
+	/G
+	/H
+	/I
+	/J
+	/K
+	/L
+	/M
+	/N
+	/O
+	/P
+	/Q
+	/R
+	/S
+	/T
+	/U
+	/V
+	/W
+	/X
+	/Y
+	/Z
+	/bracketleft
+	/backslash
+	/bracketright
+	/asciicircum
+	/underscore
+	/quoteleft
+	/a
+	/b
+	/c
+	/d
+	/e
+	/f
+	/g
+	/h
+	/i
+	/j
+	/k
+	/l
+	/m
+	/n
+	/o
+	/p
+	/q
+	/r
+	/s
+	/t
+	/u
+	/v
+	/w
+	/x
+	/y
+	/z
+	/braceleft
+	/bar
+	/braceright
+	/asciitilde
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/.notdef
+	/dotlessi
+	/grave
+	/acute
+	/circumflex
+	/tilde
+	/macron
+	/breve
+	/dotaccent
+	/dieresis
+	/.notdef
+	/ring
+	/cedilla
+	/.notdef
+	/hungarumlaut
+	/ogonek
+	/caron
+	/space
+	/exclamdown
+	/cent
+	/sterling
+	/currency
+	/yen
+	/brokenbar		% missing
+	/section
+	/dieresis
+	/copyright
+	/ordfeminine
+	/guillemotleft
+	/logicalnot
+	/hyphen
+	/registered
+	/macron
+	/degree			% missing
+	/plusminus		% missing
+	/twosuperior		% missing
+	/threesuperior		% missing
+	/acute
+	/mu			% missing
+	/paragraph
+	/periodcentered
+	/cedilla
+	/onesuperior		% missing
+	/ordmasculine
+	/guillemotright
+	/onequarter		% missing
+	/onehalf		% missing
+	/threequarters		% missing
+	/questiondown
+	/Agrave
+	/Aacute
+	/Acircumflex
+	/Atilde
+	/Adieresis
+	/Aring
+	/AE
+	/Ccedilla
+	/Egrave
+	/Eacute
+	/Ecircumflex
+	/Edieresis
+	/Igrave
+	/Iacute
+	/Icircumflex
+	/Idieresis
+	/Eth			% missing
+	/Ntilde
+	/Ograve
+	/Oacute
+	/Ocircumflex
+	/Otilde
+	/Odieresis
+	/multiply		% missing
+	/Oslash
+	/Ugrave
+	/Uacute
+	/Ucircumflex
+	/Udieresis
+	/Yacute			% missing
+	/Thorn			% missing
+	/germandbls
+	/agrave
+	/aacute
+	/acircumflex
+	/atilde
+	/adieresis
+	/aring
+	/ae
+	/ccedilla
+	/egrave
+	/eacute
+	/ecircumflex
+	/edieresis
+	/igrave
+	/iacute
+	/icircumflex
+	/idieresis
+	/eth			% missing
+	/ntilde
+	/ograve
+	/oacute
+	/ocircumflex
+	/otilde
+	/odieresis
+	/divide			% missing
+	/oslash
+	/ugrave
+	/uacute
+	/ucircumflex
+	/udieresis
+	/yacute			% missing
+	/thorn			% missing
+	/ydieresis
+] def
+
+/NewFontDirectory FontDirectory maxlength dict def
+
+%
+% Apparently no guarantee findfont is defined in systemdict so the obvious
+%
+%	systemdict /findfont get exec
+%
+% can generate an error. So far the only exception is a VT600 (version 48.0).
+%
+
+userdict /@RealFindfont known not {
+	userdict begin
+		/@RealFindfont systemdict begin /findfont load end def
+	end
+} if
+
+/findfont {
+	dup NewFontDirectory exch known not {
+		dup
+		%dup systemdict /findfont get exec	% not always in systemdict
+		dup userdict /@RealFindfont get exec
+		dup /Encoding get StandardEncoding eq {
+			dup length dict begin
+				{1 index /FID ne {def}{pop pop} ifelse} forall
+				/Encoding ISOLatin1Encoding def
+				currentdict
+			end
+			/DummyFontName exch definefont
+		} if
+		NewFontDirectory 3 1 roll put
+	} if
+	NewFontDirectory exch get
+} bind def
+
+%%Patch from lp
+%%EndPatch from lp
+
+setup
+%%EndSetup
+%%Page: 1 1
+/saveobj save def
+mark
+1 pagesetup
+12 /LucidaSans-Demi f
+(Fossil, an Archival) 1975 1230 w
+(File) 3138 1230 w
+(Server) 3395 1230 w
+10 /LucidaSans-Italic f
+(Sean Quinlan) 2557 1470 w
+(Jim McKie) 2643 1590 w
+(Russ Cox) 2663 1710 w
+(jmk,rsc@plan9.bell-labs.com) 2168 1830 w
+(ABSTRACT) 2626 2270 w
+10 /LucidaSansUnicode00 f
+(This) 1330 2530 w
+(paper) 1608 2530 w
+(describes) 1959 2530 w
+(the) 2493 2530 w
+(internals) 2721 2530 w
+(and) 3217 2530 w
+(operation) 3471 2530 w
+(of) 4010 2530 w
+(Fossil,) 4182 2530 w
+(an) 4563 2530 w
+(archival) 1080 2650 w
+(file) 1487 2650 w
+(server) 1671 2650 w
+(built) 2001 2650 w
+(for) 2254 2650 w
+(Plan) 2426 2650 w
+(9.) 2660 2650 w
+(Fossil) 2820 2650 w
+(has not yet replaced the current) 3128 2650 w
+(Plan 9 file server and) 1080 2770 w
+10 /LucidaTypewriter f
+(kfs) 2132 2770 w
+10 /LucidaSansUnicode00 f
+(, but that is our eventual intent.) 2348 2770 w
+(Both) 3958 2770 w
+(fossil) 4209 2770 w
+(and) 4500 2770 w
+(this) 1080 2890 w
+(documentation) 1295 2890 w
+(are) 2059 2890 w
+(works) 2246 2890 w
+(in) 2569 2890 w
+(progress.) 2695 2890 w
+(Comments) 3220 2890 w
+(on) 3777 2890 w
+(either) 3935 2890 w
+(are) 4251 2890 w
+(most) 4438 2890 w
+(welcome.) 1080 3010 w
+10 /LucidaSans-Demi f
+(1.) 720 3370 w
+(Introduction) 873 3370 w
+10 /LucidaSansUnicode00 f
+(Fossil) 720 3526 w
+(is) 1032 3526 w
+(an) 1149 3526 w
+(archival) 1303 3526 w
+(file) 1714 3526 w
+(server) 1902 3526 w
+(built) 2236 3526 w
+(for) 2494 3526 w
+(Plan) 2671 3526 w
+(9.) 2910 3526 w
+(In) 3075 3526 w
+(a) 3204 3526 w
+(typical) 3297 3526 w
+(configuration,) 3651 3526 w
+(it) 4370 3526 w
+(maintains) 4474 3526 w
+(a) 4985 3526 w
+(traditional) 720 3646 w
+(file) 1254 3646 w
+(system) 1441 3646 w
+(in) 1817 3646 w
+(a) 1944 3646 w
+(local) 2035 3646 w
+(disk) 2296 3646 w
+(partition) 2533 3646 w
+(and) 2983 3646 w
+(periodically) 3199 3646 w
+(archives) 3792 3646 w
+(snapshots) 4224 3646 w
+(of) 4752 3646 w
+(the) 4885 3646 w
+(file) 720 3766 w
+(system) 914 3766 w
+(to) 1297 3766 w
+(a) 1438 3766 w
+(Venti) 1536 3766 w
+(server.) 1828 3766 w
+(These) 2232 3766 w
+(archives) 2563 3766 w
+(are) 3003 3766 w
+(made) 3198 3766 w
+(available) 3508 3766 w
+(through) 3975 3766 w
+(a) 4406 3766 w
+(file) 4505 3766 w
+(system) 4700 3766 w
+(interface.) 720 3886 w
+(Fossil) 1247 3886 w
+(can) 1561 3886 w
+(also) 1768 3886 w
+(be) 2003 3886 w
+(run) 2161 3886 w
+(without) 2365 3886 w
+(a) 2769 3886 w
+(Venti) 2863 3886 w
+(server,) 3151 3886 w
+(in) 3518 3886 w
+(which) 3647 3886 w
+(case) 3966 3886 w
+(the) 4217 3886 w
+(snapshots) 4410 3886 w
+(\(if) 4941 3886 w
+(any\) occupy local disk space.) 720 4006 w
+(The) 970 4162 w
+(bulk) 1203 4162 w
+(of) 1468 4162 w
+(this) 1619 4162 w
+(paper) 1851 4162 w
+(explains) 2182 4162 w
+(the) 2641 4162 w
+(underlying) 2849 4162 w
+(data) 3420 4162 w
+(structures:) 3683 4162 w
+(Venti) 4257 4162 w
+(trees,) 4559 4162 w
+(the) 4885 4162 w
+(Venti) 720 4282 w
+(archival) 1007 4282 w
+(file) 1419 4282 w
+(system) 1608 4282 w
+(format,) 1986 4282 w
+(and) 2380 4282 w
+(finally) 2598 4282 w
+(Fossil) 2929 4282 w
+10 /LucidaSansUnicode20 f
+(\031) 3204 4282 w
+10 /LucidaSansUnicode00 f
+(s) 3236 4282 w
+(file) 3324 4282 w
+(system) 3512 4282 w
+(format.) 3889 4282 w
+(The) 4314 4282 w
+(end) 4532 4282 w
+(of) 4750 4282 w
+(the) 4885 4282 w
+(paper discusses the architecture of the Fossil server.) 720 4402 w
+(The) 970 4558 w
+(presentation) 1196 4558 w
+(of) 1851 4558 w
+(the) 1994 4558 w
+(data) 2195 4558 w
+(structures) 2451 4558 w
+(is) 2986 4558 w
+(very) 3112 4558 w
+(detailed,) 3359 4558 w
+(perhaps) 3825 4558 w
+(too) 4262 4558 w
+(detailed) 4467 4558 w
+(for) 4901 4558 w
+(most) 720 4678 w
+(readers.) 1025 4678 w
+(The) 1514 4678 w
+(intent) 1757 4678 w
+(is) 2102 4678 w
+(to) 2244 4678 w
+(record) 2404 4678 w
+(all) 2779 4678 w
+(the) 2954 4678 w
+(details) 3171 4678 w
+(necessary) 3553 4678 w
+(to) 4090 4678 w
+(make) 4250 4678 w
+(structural) 4574 4678 w
+(changes to the file system format.) 720 4798 w
+(Feel free to jump ahead when boredom strikes.) 2443 4798 w
+10 /LucidaSans-Demi f
+(2.) 720 5038 w
+(Venti trees and directory hierarchies) 873 5038 w
+10 /LucidaSansUnicode00 f
+(Venti) 720 5194 w
+([3]) 1007 5194 w
+(is) 1174 5194 w
+(an) 1292 5194 w
+(archival) 1447 5194 w
+(block) 1859 5194 w
+(storage) 2159 5194 w
+(server.) 2560 5194 w
+(Once) 2959 5194 w
+(a) 3245 5194 w
+(block) 3339 5194 w
+(is) 3640 5194 w
+(stored,) 3759 5194 w
+(it) 4139 5194 w
+(can) 4244 5194 w
+(be) 4451 5194 w
+(retrieved) 4609 5194 w
+(by) 720 5314 w
+(presenting) 882 5314 w
+(the) 1448 5314 w
+(20-byte) 1649 5314 w
+(SHA1) 2087 5314 w
+(hash) 2393 5314 w
+(of) 2669 5314 w
+(its) 2813 5314 w
+(contents,) 2976 5314 w
+(called) 3471 5314 w
+(a) 3800 5314 w
+10 /LucidaSans-Italic f
+(score) 3901 5314 w
+10 /LucidaSansUnicode00 f
+(.) 4158 5314 w
+(Blocks) 4268 5314 w
+(on) 4622 5314 w
+(Venti) 4791 5314 w
+(have) 720 5434 w
+(a) 994 5434 w
+(maximum) 1098 5434 w
+(length) 1633 5434 w
+(of) 1990 5434 w
+(about) 2137 5434 w
+(56) 2464 5434 w
+(kilobytes,) 2639 5434 w
+(though) 3156 5434 w
+(in) 3551 5434 w
+(practice) 3691 5434 w
+(smaller) 4123 5434 w
+(blocks) 4526 5434 w
+(are) 4888 5434 w
+(used.) 720 5554 w
+(To) 1055 5554 w
+(store) 1218 5554 w
+(a) 1503 5554 w
+(byte) 1597 5554 w
+(stream) 1844 5554 w
+(of) 2216 5554 w
+(arbitrary) 2353 5554 w
+(length,) 2806 5554 w
+(Venti) 3184 5554 w
+(uses) 3471 5554 w
+(a) 3729 5554 w
+(hash) 3822 5554 w
+(tree.) 4090 5554 w
+(Conceptually,) 4382 5554 w
+(the) 720 5674 w
+(data) 913 5674 w
+(stream) 1161 5674 w
+(is) 1532 5674 w
+(broken) 1651 5674 w
+(into) 2031 5674 w
+(fixed-size) 2259 5674 w
+(\(say,) 2795 5674 w
+10 /LucidaSans-Italic f
+(dsize) 3057 5674 w
+10 /LucidaSansUnicode00 f
+(-byte\)) 3303 5674 w
+(chunks,) 3641 5674 w
+(which) 4058 5674 w
+(are) 4378 5674 w
+(stored) 4569 5674 w
+(on) 4917 5674 w
+(the) 720 5794 w
+(Venti) 932 5794 w
+(server.) 1238 5794 w
+(The) 1656 5794 w
+(resulting) 1894 5794 w
+(scores) 2380 5794 w
+(are) 2748 5794 w
+(concatenated) 2957 5794 w
+(into) 3660 5794 w
+(a) 3906 5794 w
+(new) 4018 5794 w
+(pointer) 4270 5794 w
+(stream,) 4675 5794 w
+(which) 720 5914 w
+(is) 1042 5914 w
+(broken) 1163 5914 w
+(into) 1545 5914 w
+(fixed) 1775 5914 w
+(size) 2062 5914 w
+(\(say,) 2296 5914 w
+10 /LucidaSans-Italic f
+(psize) 2560 5914 w
+10 /LucidaSansUnicode00 f
+(-byte\)) 2806 5914 w
+(chunks,) 3146 5914 w
+(which) 3565 5914 w
+(are) 3887 5914 w
+(stored) 4080 5914 w
+(on) 4430 5914 w
+(the) 4594 5914 w
+(Venti) 4791 5914 w
+(server.) 720 6034 w
+(\() 1114 6034 w
+10 /LucidaSans-Italic f
+(Psize) 1147 6034 w
+10 /LucidaSansUnicode00 f
+(is) 1423 6034 w
+(different from) 1536 6034 w
+10 /LucidaSans-Italic f
+(dsize) 2250 6034 w
+10 /LucidaSansUnicode00 f
+(so that we can ensure that each pointer block holds) 2528 6034 w
+(an integral number) 720 6154 w
+(of) 1682 6154 w
+(pointers.\)) 1813 6154 w
+(This) 2343 6154 w
+(yields) 2581 6154 w
+(a) 2894 6154 w
+(new) 2982 6154 w
+(pointer) 3210 6154 w
+(stream,) 3592 6154 w
+(and) 3990 6154 w
+(so) 4203 6154 w
+(on,) 4348 6154 w
+(until) 4536 6154 w
+(there) 4788 6154 w
+(is) 720 6274 w
+(a) 854 6274 w
+(single) 963 6274 w
+(block) 1306 6274 w
+(and) 1622 6274 w
+(finally) 1856 6274 w
+(a) 2203 6274 w
+(single) 2312 6274 w
+(score) 2655 6274 w
+(describing) 2968 6274 w
+(the) 3528 6274 w
+(entire) 3736 6274 w
+(tree.) 4070 6274 w
+(The) 4377 6274 w
+(resulting) 4611 6274 w
+(structure looks like:) 720 6394 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 1 1
+%%Page: 2 2
+/saveobj save def
+mark
+2 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 2 \255) 2783 480 w
+8 /LucidaTypewriter f
+(VtDataType) 1114 1772 w
+1836 1792 1836 1720 Dl
+1836 1720 1980 1720 Dl
+1980 1720 1980 1792 Dl
+1980 1792 1836 1792 Dl
+2052 1792 2052 1720 Dl
+2052 1720 2196 1720 Dl
+2196 1720 2196 1792 Dl
+2196 1792 2052 1792 Dl
+8 /LucidaSansUnicode00 f
+(...) 2301 1772 w
+2484 1792 2484 1720 Dl
+2484 1720 2628 1720 Dl
+2628 1720 2628 1792 Dl
+2628 1792 2484 1792 Dl
+2700 1792 2700 1720 Dl
+2700 1720 2844 1720 Dl
+2844 1720 2844 1792 Dl
+2844 1792 2700 1792 Dl
+(...) 3021 1772 w
+3276 1792 3276 1720 Dl
+3276 1720 3420 1720 Dl
+3420 1720 3420 1792 Dl
+3420 1792 3276 1792 Dl
+3492 1792 3492 1720 Dl
+3492 1720 3636 1720 Dl
+3636 1720 3636 1792 Dl
+3636 1792 3492 1792 Dl
+(...) 3741 1772 w
+3924 1792 3924 1720 Dl
+3924 1720 4068 1720 Dl
+4068 1720 4068 1792 Dl
+4068 1792 3924 1792 Dl
+4140 1792 4140 1720 Dl
+4140 1720 4284 1720 Dl
+4284 1720 4284 1792 Dl
+4284 1792 4140 1792 Dl
+(...) 4389 1772 w
+4572 1792 4572 1720 Dl
+4572 1720 4716 1720 Dl
+4716 1720 4716 1792 Dl
+4716 1792 4572 1792 Dl
+1836 1504 1836 1432 Dl
+1836 1432 2196 1432 Dl
+2196 1432 2196 1504 Dl
+2196 1504 1836 1504 Dl
+1908 1432 1908 1504 Dl
+1980 1432 1980 1504 Dl
+1872 1468 1836 1720 Dl
+1828 1645 1835 1719 Dl
+1864 1651 1836 1719 Dl
+1944 1468 2052 1720 Dl
+2007 1660 2051 1719 Dl
+2040 1646 2051 1719 Dl
+(...) 2049 1484 w
+2484 1504 2484 1432 Dl
+2484 1432 2844 1432 Dl
+2844 1432 2844 1504 Dl
+2844 1504 2484 1504 Dl
+2556 1432 2556 1504 Dl
+2628 1432 2628 1504 Dl
+2520 1468 2484 1720 Dl
+2476 1645 2483 1719 Dl
+2512 1651 2484 1719 Dl
+2592 1468 2700 1720 Dl
+2655 1660 2699 1719 Dl
+2688 1646 2699 1719 Dl
+(...) 2697 1484 w
+3276 1504 3276 1432 Dl
+3276 1432 3636 1432 Dl
+3636 1432 3636 1504 Dl
+3636 1504 3276 1504 Dl
+3348 1432 3348 1504 Dl
+3420 1432 3420 1504 Dl
+3312 1468 3276 1720 Dl
+3268 1645 3275 1719 Dl
+3304 1651 3276 1719 Dl
+3384 1468 3492 1720 Dl
+3447 1660 3491 1719 Dl
+3480 1646 3491 1719 Dl
+(...) 3489 1484 w
+3924 1504 3924 1432 Dl
+3924 1432 4284 1432 Dl
+4284 1432 4284 1504 Dl
+4284 1504 3924 1504 Dl
+3996 1432 3996 1504 Dl
+4068 1432 4068 1504 Dl
+4212 1432 4212 1504 Dl
+3960 1468 3924 1720 Dl
+3916 1645 3923 1719 Dl
+3952 1651 3924 1719 Dl
+4032 1468 4140 1720 Dl
+4095 1660 4139 1719 Dl
+4128 1646 4139 1719 Dl
+(...) 4101 1484 w
+4248 1468 4572 1720 Dl
+4504 1689 4571 1719 Dl
+4525 1661 4571 1719 Dl
+8 /LucidaTypewriter f
+(VtPointerType0) 998 1484 w
+2160 1216 2160 1144 Dl
+2160 1144 2520 1144 Dl
+2520 1144 2520 1216 Dl
+2520 1216 2160 1216 Dl
+2232 1144 2232 1216 Dl
+2304 1144 2304 1216 Dl
+2196 1180 1836 1432 Dl
+1884 1375 1836 1431 Dl
+1905 1405 1836 1431 Dl
+2268 1180 2484 1432 Dl
+2423 1388 2483 1431 Dl
+2450 1365 2483 1431 Dl
+8 /LucidaSansUnicode00 f
+(...) 2373 1196 w
+3600 1216 3600 1144 Dl
+3600 1144 3960 1144 Dl
+3960 1144 3960 1216 Dl
+3960 1216 3600 1216 Dl
+3672 1144 3672 1216 Dl
+3888 1144 3888 1216 Dl
+3636 1180 3276 1432 Dl
+3324 1375 3276 1431 Dl
+3345 1405 3276 1431 Dl
+(...) 3741 1196 w
+3924 1180 3924 1432 Dl
+3906 1360 3924 1432 Dl
+3942 1360 3924 1432 Dl
+8 /LucidaTypewriter f
+(VtPointerType1) 998 1196 w
+2880 928 2880 856 Dl
+2880 856 3240 856 Dl
+3240 856 3240 928 Dl
+3240 928 2880 928 Dl
+2952 856 2952 928 Dl
+3168 856 3168 928 Dl
+2916 892 2160 1144 Dl
+2222 1104 2160 1143 Dl
+2234 1138 2160 1143 Dl
+8 /LucidaSansUnicode00 f
+(...) 3021 908 w
+3204 892 3600 1144 Dl
+3529 1120 3599 1143 Dl
+3548 1090 3599 1144 Dl
+8 /LucidaTypewriter f
+(VtPointerType2) 998 908 w
+10 /LucidaSansUnicode00 f
+(The) 720 2013 w
+(leaves) 947 2013 w
+(are) 1292 2013 w
+(the) 1490 2013 w
+(original) 1691 2013 w
+(data) 2105 2013 w
+(stream.) 2361 2013 w
+(Those) 2804 2013 w
+(blocks) 3143 2013 w
+(have) 3502 2013 w
+(type) 3773 2013 w
+10 /LucidaTypewriter f
+(VtDataType) 4028 2013 w
+10 /LucidaSansUnicode00 f
+(.) 4748 2013 w
+(The) 4859 2013 w
+(first) 720 2133 w
+(pointer) 1058 2133 w
+(stream) 1550 2133 w
+(has) 2025 2133 w
+(type) 2335 2133 w
+10 /LucidaTypewriter f
+(VtPointerType0) 2685 2133 w
+10 /LucidaSansUnicode00 f
+(,) 3693 2133 w
+(the) 3867 2133 w
+(next) 4164 2133 w
+(has) 4522 2133 w
+(type) 4832 2133 w
+10 /LucidaTypewriter f
+(VtPointerType1) 720 2253 w
+10 /LucidaSansUnicode00 f
+(,) 1728 2253 w
+(and) 1842 2253 w
+(so) 2104 2253 w
+(on.) 2298 2253 w
+(The) 2567 2253 w
+(figure) 2830 2253 w
+(ends) 3199 2253 w
+(with) 3513 2253 w
+(a) 3800 2253 w
+(single) 3937 2253 w
+(block) 4308 2253 w
+(of) 4652 2253 w
+(type) 4832 2253 w
+10 /LucidaTypewriter f
+(VtPointerType2) 720 2373 w
+10 /LucidaSansUnicode00 f
+(,) 1728 2373 w
+(but) 1807 2373 w
+(in) 2015 2373 w
+(general) 2152 2373 w
+(trees) 2559 2373 w
+(can) 2846 2373 w
+(have) 3060 2373 w
+(height) 3331 2373 w
+(up) 3685 2373 w
+(to) 3856 2373 w
+10 /LucidaTypewriter f
+(VtPointerType6) 4000 2373 w
+10 /LucidaSansUnicode00 f
+(.) 5008 2373 w
+(For) 720 2493 w
+(a) 911 2493 w
+10 /LucidaSans-Italic f
+(dsize) 1001 2493 w
+10 /LucidaSansUnicode00 f
+(of) 1282 2493 w
+(8192) 1415 2493 w
+(bytes) 1702 2493 w
+(and) 1996 2493 w
+10 /LucidaSans-Italic f
+(psize) 2211 2493 w
+10 /LucidaSansUnicode00 f
+(of) 2492 2493 w
+(8180) 2625 2493 w
+(bytes) 2912 2493 w
+(\(409) 3206 2493 w
+(pointers\),) 3463 2493 w
+(this) 3963 2493 w
+(gives) 4177 2493 w
+(a) 4463 2493 w
+(maximum) 4554 2493 w
+(stream size of approximately 10 zettabytes \(2) 720 2613 w
+8 /LucidaSansUnicode00 f
+(73) 2954 2573 w
+10 /LucidaSansUnicode00 f
+(or 10) 3086 2613 w
+8 /LucidaSansUnicode00 f
+(22) 3346 2573 w
+10 /LucidaSansUnicode00 f
+(bytes\).) 3478 2613 w
+(Data) 970 2769 w
+(block) 1233 2769 w
+(are) 1536 2769 w
+(truncated) 1729 2769 w
+(to) 2234 2769 w
+(remove) 2373 2769 w
+(trailing) 2773 2769 w
+(runs) 3158 2769 w
+(of) 3415 2769 w
+(zeros) 3554 2769 w
+(before) 3861 2769 w
+(storage) 4216 2769 w
+(to) 4620 2769 w
+(Venti;) 4759 2769 w
+(they) 720 2889 w
+(are) 962 2889 w
+(zero-filled) 1149 2889 w
+(back) 1700 2889 w
+(to) 1961 2889 w
+10 /LucidaSans-Italic f
+(dsize) 2093 2889 w
+10 /LucidaSansUnicode00 f
+(bytes) 2373 2889 w
+(after) 2666 2889 w
+(retrieval) 2926 2889 w
+(from) 3356 2889 w
+(Venti.) 3622 2889 w
+(Similarly,) 3969 2889 w
+(trailing) 4446 2889 w
+(runs) 4824 2889 w
+(of) 720 3009 w
+(pointers) 858 3009 w
+(to) 1298 3009 w
+(zero-length) 1436 3009 w
+(blocks) 2057 3009 w
+(are) 2410 3009 w
+(removed) 2602 3009 w
+(from) 3064 3009 w
+(and) 3336 3009 w
+(added) 3557 3009 w
+(back) 3898 3009 w
+(to) 4166 3009 w
+(pointer) 4305 3009 w
+(blocks.) 4695 3009 w
+(These) 720 3129 w
+(simple) 1048 3129 w
+(rules) 1409 3129 w
+(happen) 1687 3129 w
+(to) 2087 3129 w
+(make) 2224 3129 w
+(it) 2525 3129 w
+(particularly) 2630 3129 w
+(efficient) 3213 3129 w
+(to) 3646 3129 w
+(store) 3783 3129 w
+(large) 4068 3129 w
+(runs) 4350 3129 w
+(of) 4605 3129 w
+(zeros,) 4742 3129 w
+(as) 720 3249 w
+(might) 864 3249 w
+(occur) 1185 3249 w
+(in) 1489 3249 w
+(a) 1618 3249 w
+(data) 1711 3249 w
+(stream) 1959 3249 w
+(with) 2330 3249 w
+10 /LucidaSansUnicode20 f
+(\030\030) 2573 3249 w
+10 /LucidaSansUnicode00 f
+(holes:) 2637 3249 w
+10 /LucidaSansUnicode20 f
+(\031\031) 2928 3249 w
+10 /LucidaSansUnicode00 f
+(the) 3030 3249 w
+(zero-length) 3223 3249 w
+(block) 3842 3249 w
+(itself) 4142 3249 w
+(can) 4419 3249 w
+(be) 4625 3249 w
+(inter\255) 4782 3249 w
+(preted as a tree of any depth encoding an all-zero data stream.) 720 3369 w
+(Reconstructing) 970 3525 w
+(the) 1728 3525 w
+(data) 1916 3525 w
+(stream) 2159 3525 w
+(requires) 2525 3525 w
+(the) 2957 3525 w
+(score) 3145 3525 w
+(and) 3439 3525 w
+(type) 3653 3525 w
+(of) 3895 3525 w
+(the) 4027 3525 w
+(topmost) 4216 3525 w
+(block) 4653 3525 w
+(in) 4949 3525 w
+(the) 720 3645 w
+(tree,) 917 3645 w
+(the) 1181 3645 w
+(data) 1377 3645 w
+(chunk) 1628 3645 w
+(size,) 1964 3645 w
+(the) 2230 3645 w
+(pointer) 2426 3645 w
+(chunk) 2816 3645 w
+(size,) 3152 3645 w
+(and) 3418 3645 w
+(the) 3639 3645 w
+(data) 3835 3645 w
+(stream) 4086 3645 w
+(size.) 4460 3645 w
+(\(From) 4758 3645 w
+(the) 720 3765 w
+(data) 909 3765 w
+(stream) 1153 3765 w
+(size) 1520 3765 w
+(and) 1747 3765 w
+(the) 1961 3765 w
+(chunk) 2150 3765 w
+(sizes) 2479 3765 w
+(we) 2758 3765 w
+(could) 2926 3765 w
+(derive) 3227 3765 w
+(the) 3559 3765 w
+(depth) 3749 3765 w
+(of) 4065 3765 w
+(the) 4198 3765 w
+(tree) 4388 3765 w
+(and) 4613 3765 w
+(thus) 4828 3765 w
+(the) 720 3885 w
+(type) 918 3885 w
+(of) 1169 3885 w
+(the) 1310 3885 w
+(topmost) 1508 3885 w
+(block,) 1954 3885 w
+(but) 2291 3885 w
+(it) 2496 3885 w
+(is) 2605 3885 w
+(convenient) 2728 3885 w
+(to) 3299 3885 w
+(allow) 3440 3885 w
+(trees) 3734 3885 w
+(that) 4018 3885 w
+(are) 4252 3885 w
+(deeper) 4447 3885 w
+(than) 4824 3885 w
+(necessary.\)) 720 4005 w
+(This information is kept in a 40-byte structure called a) 1324 4005 w
+10 /LucidaTypewriter f
+(VtEntry) 4024 4005 w
+10 /LucidaSansUnicode00 f
+(:) 4528 4005 w
+9 /LucidaTypewriter f
+(VtEntry:) 1008 4175 w
+(gen[4]) 1268 4285 w
+9 /LucidaSansUnicode00 f
+(generation) 2048 4285 w
+(number) 2546 4285 w
+9 /LucidaTypewriter f
+(psize[2]) 1268 4395 w
+9 /LucidaSansUnicode00 f
+(size) 2048 4395 w
+(of) 2250 4395 w
+(pointer) 2367 4395 w
+(blocks) 2710 4395 w
+9 /LucidaTypewriter f
+(dsize[2]) 1268 4505 w
+9 /LucidaSansUnicode00 f
+(size) 2048 4505 w
+(of) 2250 4505 w
+(data) 2367 4505 w
+(blocks) 2586 4505 w
+9 /LucidaTypewriter f
+(flags[1]) 1268 4615 w
+(zero[5]) 1268 4725 w
+(size[6]) 1268 4835 w
+9 /LucidaSansUnicode00 f
+(length) 2048 4835 w
+(of) 2354 4835 w
+(file) 2471 4835 w
+9 /LucidaTypewriter f
+(score[20]) 1268 4945 w
+9 /LucidaSansUnicode00 f
+(score) 2048 4945 w
+(of) 2311 4945 w
+(root) 2428 4945 w
+(block) 2637 4945 w
+(in) 2902 4945 w
+(tree) 3013 4945 w
+10 /LucidaSansUnicode00 f
+(\(In) 720 5125 w
+(this) 877 5125 w
+(notation,) 1089 5125 w
+10 /LucidaTypewriter f
+(name[sz]) 1558 5125 w
+10 /LucidaSansUnicode00 f
+(indicates) 2167 5125 w
+(a) 2633 5125 w
+10 /LucidaTypewriter f
+(sz) 2722 5125 w
+10 /LucidaSansUnicode00 f
+(-byte) 2866 5125 w
+(field) 3166 5125 w
+(called) 3414 5125 w
+10 /LucidaTypewriter f
+(name) 3731 5125 w
+10 /LucidaSansUnicode00 f
+(.) 4019 5125 w
+(Integers) 4117 5125 w
+(are) 4545 5125 w
+(stored) 4731 5125 w
+(in) 720 5245 w
+(big-endian) 846 5245 w
+(order.) 1420 5245 w
+10 /LucidaTypewriter f
+(Size) 1781 5245 w
+10 /LucidaSansUnicode00 f
+(really) 2104 5245 w
+(is) 2401 5245 w
+(a) 2516 5245 w
+(48-bit) 2606 5245 w
+(field.\)) 2954 5245 w
+10 /LucidaTypewriter f
+(Flags) 3300 5245 w
+10 /LucidaSansUnicode00 f
+(is) 3695 5245 w
+(made) 3810 5245 w
+(up) 4112 5245 w
+(of) 4272 5245 w
+(the) 4404 5245 w
+(following) 4593 5245 w
+(bit fields.) 720 5365 w
+10 /LucidaSansUnicode20 f
+(\030) 1242 5365 w
+10 /LucidaSans-Italic f
+(or) 1274 5365 w
+10 /LucidaSansUnicode20 f
+(\031) 1379 5365 w
+10 /LucidaSansUnicode00 f
+(of the following flags:) 1443 5365 w
+9 /LucidaTypewriter f
+(0x01) 1008 5535 w
+(VtEntryActive) 1398 5535 w
+9 /LucidaSansUnicode00 f
+(entry) 2828 5535 w
+(is) 3080 5535 w
+(allocated) 3181 5535 w
+9 /LucidaTypewriter f
+(0x02) 1008 5645 w
+(VtEntryDir) 1398 5645 w
+9 /LucidaSansUnicode00 f
+(entry) 2828 5645 w
+(describes) 3080 5645 w
+(a) 3524 5645 w
+(Venti) 3603 5645 w
+(directory) 3856 5645 w
+(\(q.v.\)) 4273 5645 w
+9 /LucidaTypewriter f
+(0x1C) 1008 5755 w
+(VtEntryDepthMask) 1398 5755 w
+9 /LucidaSansUnicode00 f
+(mask) 2828 5755 w
+(for) 3089 5755 w
+(tree) 3243 5755 w
+(depth) 3442 5755 w
+9 /LucidaTypewriter f
+(0x20) 1008 5865 w
+(VtEntryLocal) 1398 5865 w
+9 /LucidaSansUnicode00 f
+(reserved) 2828 5865 w
+(\(q.v.\)) 3231 5865 w
+10 /LucidaSansUnicode00 f
+(The) 720 6081 w
+(depth) 940 6081 w
+(of) 1261 6081 w
+(the) 1399 6081 w
+(described) 1594 6081 w
+(tree) 2107 6081 w
+(is) 2337 6081 w
+(stored) 2457 6081 w
+(in) 2806 6081 w
+(the) 2937 6081 w
+(3) 3132 6081 w
+(bits) 3235 6081 w
+(indicated:) 3455 6081 w
+(a) 3972 6081 w
+(tree) 4067 6081 w
+(with) 4297 6081 w
+(a) 4542 6081 w
+(topmost) 4637 6081 w
+(node of type) 720 6201 w
+10 /LucidaTypewriter f
+(VtPointerType3) 1364 6201 w
+10 /LucidaSansUnicode00 f
+(has depth 4.) 2404 6201 w
+(With) 970 6357 w
+10 /LucidaTypewriter f
+(VtEntry) 1220 6357 w
+10 /LucidaSansUnicode00 f
+(we) 1760 6357 w
+(can) 1929 6357 w
+(build) 2133 6357 w
+(more) 2415 6357 w
+(complicated) 2703 6357 w
+(data) 3328 6357 w
+(structures,) 3575 6357 w
+(ones) 4133 6357 w
+(with) 4400 6357 w
+(multiple) 4642 6357 w
+(or) 720 6477 w
+(nested) 870 6477 w
+(data) 1243 6477 w
+(streams.) 1501 6477 w
+(A) 1997 6477 w
+(data) 2114 6477 w
+(stream) 2372 6477 w
+(consisting) 2753 6477 w
+(of) 3296 6477 w
+10 /LucidaTypewriter f
+(VtEntry) 3441 6477 w
+10 /LucidaSansUnicode00 f
+(structures) 3992 6477 w
+(is) 4528 6477 w
+(called) 4655 6477 w
+(a) 4985 6477 w
+(Venti) 720 6597 w
+(directory.) 1007 6597 w
+(It) 1540 6597 w
+(is) 1644 6597 w
+(identical) 1762 6597 w
+(in) 2211 6597 w
+(structure) 2340 6597 w
+(to) 2816 6597 w
+(the) 2952 6597 w
+(Venti) 3145 6597 w
+(data) 3432 6597 w
+(stream) 3680 6597 w
+(we) 4051 6597 w
+(described) 4222 6597 w
+(earlier) 4733 6597 w
+(except) 720 6717 w
+(that) 1078 6717 w
+(the) 1303 6717 w
+(bottom-level) 1492 6717 w
+(type) 2158 6717 w
+(is) 2399 6717 w
+10 /LucidaTypewriter f
+(VtDirType) 2512 6717 w
+10 /LucidaSansUnicode00 f
+(,) 3160 6717 w
+(and) 3225 6717 w
+(the) 3438 6717 w
+10 /LucidaTypewriter f
+(VtEntry) 3626 6717 w
+10 /LucidaSansUnicode00 f
+(describing) 4163 6717 w
+(a) 4703 6717 w
+(Venti) 4791 6717 w
+(directory has the) 720 6837 w
+10 /LucidaTypewriter f
+(VtEntryDir) 1570 6837 w
+10 /LucidaSansUnicode00 f
+(flag) 2323 6837 w
+(bit) 2539 6837 w
+(set.) 2701 6837 w
+(The) 2942 6837 w
+10 /LucidaSans-Italic f
+(dsize) 3156 6837 w
+10 /LucidaSansUnicode00 f
+(for) 3435 6837 w
+(a) 3607 6837 w
+(Venti) 3695 6837 w
+(directory) 3977 6837 w
+(is) 4441 6837 w
+(a) 4554 6837 w
+(multiple) 4642 6837 w
+(of) 720 6957 w
+(40) 868 6957 w
+(so) 1044 6957 w
+(that) 1206 6957 w
+(each) 1447 6957 w
+(data) 1721 6957 w
+(chunk) 1981 6957 w
+(holds) 2326 6957 w
+(an) 2642 6957 w
+(integer) 2809 6957 w
+(number) 3202 6957 w
+(of) 3628 6957 w
+10 /LucidaTypewriter f
+(VtEntry) 3775 6957 w
+10 /LucidaSansUnicode00 f
+(structures.) 4328 6957 w
+(By) 4930 6957 w
+(analogy) 720 7077 w
+(with) 1141 7077 w
+(Venti) 1391 7077 w
+(directories,) 1686 7077 w
+(we) 2279 7077 w
+(call) 2458 7077 w
+(the) 2668 7077 w
+(original) 2869 7077 w
+(data) 3283 7077 w
+(stream) 3539 7077 w
+(a) 3918 7077 w
+(Venti) 4019 7077 w
+(file.) 4314 7077 w
+(Note) 4575 7077 w
+(that) 4849 7077 w
+(Venti) 720 7197 w
+(files) 1006 7197 w
+(are) 1245 7197 w
+(assumed) 1434 7197 w
+10 /LucidaSans-Italic f
+(not) 1902 7197 w
+10 /LucidaSansUnicode00 f
+(to) 2095 7197 w
+(contain) 2229 7197 w
+(pointers) 2622 7197 w
+(to) 3058 7197 w
+(other) 3192 7197 w
+(Venti) 3485 7197 w
+(blocks.) 3770 7197 w
+(The) 4183 7197 w
+(only) 4400 7197 w
+(pointers) 4640 7197 w
+(to) 720 7317 w
+(Venti) 866 7317 w
+(blocks) 1163 7317 w
+(occur) 1524 7317 w
+(in) 1838 7317 w
+10 /LucidaTypewriter f
+(VtEntry) 1977 7317 w
+10 /LucidaSansUnicode00 f
+(structures) 2529 7317 w
+(in) 3066 7317 w
+(Venti) 3205 7317 w
+(directories) 3502 7317 w
+(\(and) 4065 7317 w
+(in) 4326 7317 w
+(the) 4465 7317 w
+(internal) 4669 7317 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 2 2
+%%Page: 3 3
+/saveobj save def
+mark
+3 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 3 \255) 2783 480 w
+(hash) 720 840 w
+(tree) 988 840 w
+(structure) 1216 840 w
+(of) 1692 840 w
+(the) 1828 840 w
+(individual) 2021 840 w
+(files) 2532 840 w
+(and) 2772 840 w
+(directories\).) 2990 840 w
+(Note) 3640 840 w
+(also) 3906 840 w
+(that) 4140 840 w
+(these) 4369 840 w
+(directo\255) 4669 840 w
+(ries) 720 960 w
+(are) 936 960 w
+(nothing) 1127 960 w
+(more) 1541 960 w
+(than) 1831 960 w
+(pointer) 2087 960 w
+(lists.) 2476 960 w
+(In) 2777 960 w
+(particular,) 2908 960 w
+(there) 3443 960 w
+(are) 3735 960 w
+(no) 3927 960 w
+(names) 4090 960 w
+(or) 4447 960 w
+(metadata) 4589 960 w
+(like in a file system.) 720 1080 w
+(To) 970 1236 w
+(make) 1132 1236 w
+(it) 1432 1236 w
+(easier) 1536 1236 w
+(to) 1862 1236 w
+(pass) 1998 1236 w
+(hierarchies) 2256 1236 w
+(between) 2827 1236 w
+(applications,) 3272 1236 w
+(the) 3927 1236 w
+(root) 4121 1236 w
+(of) 4360 1236 w
+(a) 4497 1236 w
+(hierarchy) 4591 1236 w
+(is described in a 300-byte structure called a) 720 1356 w
+10 /LucidaTypewriter f
+(VtRoot) 2906 1356 w
+10 /LucidaSansUnicode00 f
+(:) 3338 1356 w
+9 /LucidaTypewriter f
+(VtRoot:) 1008 1526 w
+(version[2]) 1268 1636 w
+(2) 2308 1636 w
+(name[128]) 1268 1746 w
+9 /LucidaSansUnicode00 f
+(name) 2308 1746 w
+(of) 2577 1746 w
+(structure) 2694 1746 w
+(\(just) 3117 1746 w
+(a) 3338 1746 w
+(comment\)) 3417 1746 w
+9 /LucidaTypewriter f
+(type[128]) 1268 1856 w
+9 /LucidaSansUnicode00 f
+(string) 2308 1856 w
+(describing) 2591 1856 w
+(structure) 3077 1856 w
+(\() 3500 1856 w
+9 /LucidaTypewriter f
+(vac) 3530 1856 w
+9 /LucidaSansUnicode00 f
+(\)) 3725 1856 w
+9 /LucidaTypewriter f
+(score[20]) 1268 1966 w
+9 /LucidaSansUnicode00 f
+(pointer) 2308 1966 w
+(to) 2651 1966 w
+9 /LucidaTypewriter f
+(VtDirType) 2768 1966 w
+9 /LucidaSansUnicode00 f
+(block) 3382 1966 w
+9 /LucidaTypewriter f
+(blockSize[2]) 1268 2076 w
+9 /LucidaSansUnicode00 f
+(maximum) 2308 2076 w
+(block) 2776 2076 w
+(size) 3041 2076 w
+(in) 3243 2076 w
+(structure) 3354 2076 w
+9 /LucidaTypewriter f
+(prev[20]) 1268 2186 w
+9 /LucidaSansUnicode00 f
+(previous) 2308 2186 w
+9 /LucidaTypewriter f
+(VtRoot) 2711 2186 w
+9 /LucidaSansUnicode00 f
+(in) 3130 2186 w
+(chain,) 3241 2186 w
+(if) 3533 2186 w
+(any) 3621 2186 w
+10 /LucidaSansUnicode00 f
+(This structure is stored to Venti and its score is) 720 2402 w
+(passed) 3057 2402 w
+(between) 3429 2402 w
+(applications,) 3869 2402 w
+(typically) 4519 2402 w
+(in) 4949 2402 w
+(the) 720 2522 w
+(form) 925 2522 w
+10 /LucidaSansUnicode20 f
+(\030\030) 1207 2522 w
+10 /LucidaSans-Italic f
+(type) 1271 2522 w
+10 /LucidaTypewriter f
+(:) 1478 2522 w
+10 /LucidaSans-Italic f
+(rootscore) 1550 2522 w
+10 /LucidaSansUnicode00 f
+(,) 2007 2522 w
+10 /LucidaSansUnicode20 f
+(\031\031) 2039 2522 w
+10 /LucidaSansUnicode00 f
+(where) 2153 2522 w
+10 /LucidaSans-Italic f
+(type) 2495 2522 w
+10 /LucidaSansUnicode00 f
+(is) 2751 2522 w
+(the) 2880 2522 w
+(type) 3084 2522 w
+(field) 3341 2522 w
+(from) 3604 2522 w
+(the) 3885 2522 w
+10 /LucidaTypewriter f
+(VtRoot) 4089 2522 w
+10 /LucidaSansUnicode00 f
+(structure,) 4570 2522 w
+(and) 720 2642 w
+10 /LucidaSans-Italic f
+(rootscore) 945 2642 w
+10 /LucidaSansUnicode00 f
+(is) 1447 2642 w
+(the) 1572 2642 w
+(score) 1772 2642 w
+(of) 2077 2642 w
+(the) 2220 2642 w
+10 /LucidaTypewriter f
+(VtRoot) 2420 2642 w
+10 /LucidaSansUnicode00 f
+(block.) 2898 2642 w
+10 /LucidaTypewriter f
+(VtRoot) 3270 2642 w
+10 /LucidaSansUnicode00 f
+(structures) 3748 2642 w
+(can) 4283 2642 w
+(be) 4497 2642 w
+(chained) 4662 2642 w
+(together using the) 720 2762 w
+10 /LucidaSans-Italic f
+(prev) 1649 2762 w
+10 /LucidaSansUnicode00 f
+(field to encode an archival history of the data structure.) 1897 2762 w
+(For example, a small Venti hierarchy might look like:) 970 2918 w
+1260 3132 1260 3060 Dl
+1260 3060 1332 3060 Dl
+1332 3060 1332 3132 Dl
+1332 3132 1260 3132 Dl
+8 /LucidaTypewriter f
+(VtRoot) 1404 3112 w
+gsave
+newpath
+1404 3276 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1404 3276 1404 3204 Dl
+1404 3204 1476 3204 Dl
+1476 3204 1476 3276 Dl
+1476 3276 1404 3276 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1476 3276 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1476 3276 1476 3204 Dl
+1476 3204 1548 3204 Dl
+1548 3204 1548 3276 Dl
+1548 3276 1476 3276 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1548 3276 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1548 3276 1548 3204 Dl
+1548 3204 1620 3204 Dl
+1620 3204 1620 3276 Dl
+1620 3276 1548 3276 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+1692 3420 1692 3348 Dl
+1692 3348 1807 3348 Dl
+1807 3348 1807 3420 Dl
+1807 3420 1692 3420 Dl
+1692 3564 1692 3492 Dl
+1692 3492 2268 3492 Dl
+2268 3492 2268 3564 Dl
+2268 3564 1692 3564 Dl
+gsave
+newpath
+1692 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1692 3708 1692 3636 Dl
+1692 3636 1764 3636 Dl
+1764 3636 1764 3708 Dl
+1764 3708 1692 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1764 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1764 3708 1764 3636 Dl
+1764 3636 1836 3636 Dl
+1836 3636 1836 3708 Dl
+1836 3708 1764 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1836 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1836 3708 1836 3636 Dl
+1836 3636 1908 3636 Dl
+1908 3636 1908 3708 Dl
+1908 3708 1836 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1908 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1908 3708 1908 3636 Dl
+1908 3636 1980 3636 Dl
+1980 3636 1980 3708 Dl
+1980 3708 1908 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1980 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1980 3708 1980 3636 Dl
+1980 3636 2052 3636 Dl
+2052 3636 2052 3708 Dl
+2052 3708 1980 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2052 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2052 3708 2052 3636 Dl
+2052 3636 2124 3636 Dl
+2124 3636 2124 3708 Dl
+2124 3708 2052 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2124 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2124 3708 2124 3636 Dl
+2124 3636 2196 3636 Dl
+2196 3636 2196 3708 Dl
+2196 3708 2124 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2196 3708 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2196 3708 2196 3636 Dl
+2196 3636 2268 3636 Dl
+2268 3636 2268 3708 Dl
+2268 3708 2196 3708 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+2124 3852 2124 3780 Dl
+2124 3780 2700 3780 Dl
+2700 3780 2700 3852 Dl
+2700 3852 2124 3852 Dl
+gsave
+newpath
+2124 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2124 3996 2124 3924 Dl
+2124 3924 2196 3924 Dl
+2196 3924 2196 3996 Dl
+2196 3996 2124 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2196 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2196 3996 2196 3924 Dl
+2196 3924 2268 3924 Dl
+2268 3924 2268 3996 Dl
+2268 3996 2196 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2268 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2268 3996 2268 3924 Dl
+2268 3924 2340 3924 Dl
+2340 3924 2340 3996 Dl
+2340 3996 2268 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2340 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2340 3996 2340 3924 Dl
+2340 3924 2412 3924 Dl
+2412 3924 2412 3996 Dl
+2412 3996 2340 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2412 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2412 3996 2412 3924 Dl
+2412 3924 2484 3924 Dl
+2484 3924 2484 3996 Dl
+2484 3996 2412 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2484 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2484 3996 2484 3924 Dl
+2484 3924 2556 3924 Dl
+2556 3924 2556 3996 Dl
+2556 3996 2484 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2556 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2556 3996 2556 3924 Dl
+2556 3924 2628 3924 Dl
+2628 3924 2628 3996 Dl
+2628 3996 2556 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2628 3996 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2628 3996 2628 3924 Dl
+2628 3924 2700 3924 Dl
+2700 3924 2700 3996 Dl
+2700 3996 2628 3996 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+2124 4140 2124 4068 Dl
+2124 4068 2700 4068 Dl
+2700 4068 2700 4140 Dl
+2700 4140 2124 4140 Dl
+1296 3096 1296 3240 Dl
+1296 3240 1404 3240 Dl
+1332 3258 1404 3240 Dl
+1332 3222 1404 3240 Dl
+1584 3240 1584 3384 Dl
+1584 3384 1692 3384 Dl
+1620 3402 1692 3384 Dl
+1620 3366 1692 3384 Dl
+1512 3240 1512 3528 Dl
+1512 3528 1692 3528 Dl
+1620 3546 1692 3528 Dl
+1620 3510 1692 3528 Dl
+1440 3240 1440 3672 Dl
+1440 3672 1692 3672 Dl
+1620 3690 1692 3672 Dl
+1620 3654 1692 3672 Dl
+1872 3672 1872 3816 Dl
+1872 3816 2124 3816 Dl
+2052 3834 2124 3816 Dl
+2052 3798 2124 3816 Dl
+1800 3672 1800 3960 Dl
+1800 3960 2124 3960 Dl
+2052 3978 2124 3960 Dl
+2052 3942 2124 3960 Dl
+1728 3672 1728 4104 Dl
+1728 4104 2124 4104 Dl
+2052 4122 2124 4104 Dl
+2052 4086 2124 4104 Dl
+8 /LucidaSansUnicode00 f
+(Key) 3891 3112 w
+3420 3132 4500 3132 Dl
+3420 3348 3420 3276 Dl
+3420 3276 3708 3276 Dl
+3708 3276 3708 3348 Dl
+3708 3348 3420 3348 Dl
+(Venti) 3780 3328 w
+(file) 4006 3328 w
+gsave
+newpath
+3420 3492 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3420 3492 3420 3420 Dl
+3420 3420 3492 3420 Dl
+3492 3420 3492 3492 Dl
+3492 3492 3420 3492 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+(Venti) 3780 3472 w
+(entry) 4006 3472 w
+(\() 4232 3472 w
+8 /LucidaTypewriter f
+(VtEntry) 4258 3472 w
+8 /LucidaSansUnicode00 f
+(\)) 4664 3472 w
+gsave
+newpath
+3420 3636 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3420 3636 3420 3564 Dl
+3420 3564 3492 3564 Dl
+3492 3564 3492 3636 Dl
+3492 3636 3420 3636 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3492 3636 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3492 3636 3492 3564 Dl
+3492 3564 3564 3564 Dl
+3564 3564 3564 3636 Dl
+3564 3636 3492 3636 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3564 3636 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3564 3636 3564 3564 Dl
+3564 3564 3636 3564 Dl
+3636 3564 3636 3636 Dl
+3636 3636 3564 3636 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3636 3636 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3636 3636 3636 3564 Dl
+3636 3564 3708 3564 Dl
+3708 3564 3708 3636 Dl
+3708 3636 3636 3636 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+(Venti) 3780 3616 w
+(directory) 4006 3616 w
+3420 3708 3636 3708 Dl
+3564 3726 3636 3708 Dl
+3564 3690 3636 3708 Dl
+(Venti) 3780 3724 w
+(pointer) 4006 3724 w
+(\(score\)) 4312 3724 w
+10 /LucidaSansUnicode00 f
+(Venti) 720 4361 w
+(files) 1022 4361 w
+(are) 1277 4361 w
+(shown) 1483 4361 w
+(as) 1850 4361 w
+(white) 2010 4361 w
+(boxes,) 2325 4361 w
+(while) 2703 4361 w
+(directories) 3010 4361 w
+(are) 3579 4361 w
+(shown) 3785 4361 w
+(as) 4152 4361 w
+(shaded) 4312 4361 w
+(boxes.) 4716 4361 w
+(Each) 720 4481 w
+(shaded) 977 4481 w
+(square) 1362 4481 w
+(represents) 1725 4481 w
+(a) 2274 4481 w
+10 /LucidaTypewriter f
+(VtEntry) 2363 4481 w
+10 /LucidaSansUnicode00 f
+(.) 2867 4481 w
+(Arrows) 2965 4481 w
+(represent) 3339 4481 w
+(pointers) 3836 4481 w
+(from) 4270 4481 w
+10 /LucidaTypewriter f
+(VtEntry) 4536 4481 w
+10 /LucidaSansUnicode00 f
+(structures to other Venti files or directories.) 720 4601 w
+(The hierarchical structure provided by Venti files and directories) 970 4757 w
+(can) 4128 4757 w
+(be) 4329 4757 w
+(used) 4481 4757 w
+(as) 4746 4757 w
+(the) 4885 4757 w
+(base) 720 4877 w
+(for) 979 4877 w
+(more) 1152 4877 w
+(complicated) 1437 4877 w
+(data) 2059 4877 w
+(structures.) 2303 4877 w
+(Because) 2889 4877 w
+(this) 3311 4877 w
+(structure) 3523 4877 w
+(captures) 3994 4877 w
+(all) 4443 4877 w
+(the) 4589 4877 w
+(infor\255) 4777 4877 w
+(mation about pointers to other blocks,) 720 4997 w
+(tools) 2628 4997 w
+(written) 2900 4997 w
+(to) 3272 4997 w
+(traverse) 3403 4997 w
+(Venti) 3825 4997 w
+(hierarchies) 4107 4997 w
+(can) 4673 4997 w
+(tra\255) 4874 4997 w
+(verse) 720 5117 w
+(the) 1041 5117 w
+(more) 1261 5117 w
+(complicated) 1576 5117 w
+(data) 2228 5117 w
+(structures) 2502 5117 w
+(as) 3055 5117 w
+(well.) 3225 5117 w
+(For) 3544 5117 w
+(example,) 3764 5117 w
+10 /LucidaSans-Italic f
+(venti/copy) 4273 5117 w
+10 /LucidaSansUnicode00 f
+(\(see) 4844 5117 w
+10 /LucidaSans-Italic f
+(ventiaux) 720 5237 w
+10 /LucidaSansUnicode00 f
+(\(8\)\)) 1133 5237 w
+(copies) 1341 5237 w
+(a) 1698 5237 w
+(Venti) 1799 5237 w
+(hierarchy) 2094 5237 w
+(from) 2589 5237 w
+(one) 2867 5237 w
+(Venti) 3092 5237 w
+(server) 3388 5237 w
+(to) 3732 5237 w
+(another,) 3877 5237 w
+(given) 4330 5237 w
+(the) 4638 5237 w
+(root) 4840 5237 w
+10 /LucidaTypewriter f
+(VtEntry) 720 5357 w
+10 /LucidaSansUnicode00 f
+(.) 1224 5357 w
+(Because) 1321 5357 w
+(the) 1743 5357 w
+(traditional) 1931 5357 w
+(file) 2462 5357 w
+(system) 2646 5357 w
+(described) 3019 5357 w
+(in) 3525 5357 w
+(later) 3649 5357 w
+(sections) 3900 5357 w
+(is) 4331 5357 w
+(layered) 4444 5357 w
+(on) 4829 5357 w
+(a) 4985 5357 w
+(Venti hierarchy,) 720 5477 w
+10 /LucidaSans-Italic f
+(venti/copy) 1514 5477 w
+10 /LucidaSansUnicode00 f
+(can copy it without fully understanding its structure.) 2053 5477 w
+10 /LucidaSans-Demi f
+(3.) 720 5717 w
+(Vac file system format) 873 5717 w
+10 /LucidaSansUnicode00 f
+(The) 720 5873 w
+(Venti) 952 5873 w
+(archive) 1252 5873 w
+(format) 1649 5873 w
+10 /LucidaSans-Italic f
+(vac) 2024 5873 w
+10 /LucidaSansUnicode00 f
+(builds) 2240 5873 w
+(a) 2588 5873 w
+(traditional) 2694 5873 w
+(file) 3243 5873 w
+(system) 3445 5873 w
+(using) 3836 5873 w
+(a) 4153 5873 w
+(Venti) 4259 5873 w
+(hierarchy.) 4559 5873 w
+(Each) 720 5993 w
+(vac) 976 5993 w
+(file) 1168 5993 w
+(is) 1353 5993 w
+(implemented) 1467 5993 w
+(as) 2138 5993 w
+(a) 2278 5993 w
+(Venti) 2367 5993 w
+(file;) 2650 5993 w
+(each) 2867 5993 w
+(vac) 3125 5993 w
+(directory) 3317 5993 w
+(is) 3781 5993 w
+(implemented) 3894 5993 w
+(as) 4564 5993 w
+(a) 4703 5993 w
+(Venti) 4791 5993 w
+(directory) 720 6113 w
+(and) 1195 6113 w
+(a) 1419 6113 w
+(Venti) 1518 6113 w
+(file) 1811 6113 w
+(to) 2006 6113 w
+(provide) 2148 6113 w
+(traditional) 2557 6113 w
+(file) 3099 6113 w
+(system) 3294 6113 w
+(metadata.) 3678 6113 w
+(The) 4238 6113 w
+(metadata) 4464 6113 w
+(is) 4960 6113 w
+(stored in a structure called a) 720 6233 w
+10 /LucidaTypewriter f
+(DirEntry) 2143 6233 w
+10 /LucidaSansUnicode00 f
+(:) 2719 6233 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 3 3
+%%Page: 4 4
+/saveobj save def
+mark
+4 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 4 \255) 2783 480 w
+9 /LucidaTypewriter f
+(DirEntry:) 1008 830 w
+(magic[4]) 1268 940 w
+(0x1c4d9072) 2048 940 w
+(\(DirMagic\)) 2763 940 w
+(version[2]) 1268 1050 w
+(9) 2048 1050 w
+(elem[s]) 1268 1160 w
+9 /LucidaSansUnicode00 f
+(name) 2048 1160 w
+(\(final) 2317 1160 w
+(path) 2567 1160 w
+(element) 2792 1160 w
+(only\)) 3170 1160 w
+9 /LucidaTypewriter f
+(entry[4]) 1268 1270 w
+9 /LucidaSansUnicode00 f
+(entry) 2048 1270 w
+(number) 2300 1270 w
+(for) 2669 1270 w
+(Venti) 2823 1270 w
+(file) 3076 1270 w
+(or) 3240 1270 w
+(directory) 3361 1270 w
+9 /LucidaTypewriter f
+(gen[4]) 1268 1380 w
+9 /LucidaSansUnicode00 f
+(generation) 2048 1380 w
+(number) 2546 1380 w
+9 /LucidaTypewriter f
+(mentry[4]) 1268 1490 w
+9 /LucidaSansUnicode00 f
+(entry) 2048 1490 w
+(number) 2300 1490 w
+(for) 2669 1490 w
+(Venti) 2823 1490 w
+(file) 3076 1490 w
+(holding) 3240 1490 w
+(metadata) 3601 1490 w
+9 /LucidaTypewriter f
+(mgen[4]) 1268 1600 w
+9 /LucidaSansUnicode00 f
+(generation) 2048 1600 w
+(number) 2546 1600 w
+9 /LucidaTypewriter f
+(qid[8]) 1268 1710 w
+9 /LucidaSansUnicode00 f
+(unique) 2048 1710 w
+(file) 2378 1710 w
+(serial) 2542 1710 w
+(number) 2806 1710 w
+9 /LucidaTypewriter f
+(uid[s]) 1268 1820 w
+9 /LucidaSansUnicode00 f
+(owner) 2048 1820 w
+9 /LucidaTypewriter f
+(gid[s]) 1268 1930 w
+9 /LucidaSansUnicode00 f
+(group) 2048 1930 w
+9 /LucidaTypewriter f
+(mid[s]) 1268 2040 w
+9 /LucidaSansUnicode00 f
+(last) 2048 2040 w
+(modified) 2232 2040 w
+(by) 2649 2040 w
+9 /LucidaTypewriter f
+(mtime[4]) 1268 2150 w
+9 /LucidaSansUnicode00 f
+(last) 2048 2150 w
+(modification) 2232 2150 w
+(time) 2808 2150 w
+9 /LucidaTypewriter f
+(ctime[4]) 1268 2260 w
+9 /LucidaSansUnicode00 f
+(creation) 2048 2260 w
+(time) 2430 2260 w
+9 /LucidaTypewriter f
+(atime[4]) 1268 2370 w
+9 /LucidaSansUnicode00 f
+(last) 2048 2370 w
+(access) 2232 2370 w
+(time) 2545 2370 w
+9 /LucidaTypewriter f
+(mode[4]) 1268 2480 w
+9 /LucidaSansUnicode00 f
+(mode) 2048 2480 w
+(bits) 2323 2480 w
+10 /LucidaSansUnicode00 f
+(The) 720 2660 w
+(notation) 954 2660 w
+10 /LucidaTypewriter f
+(name[s]) 1412 2660 w
+10 /LucidaSansUnicode00 f
+(denotes) 1970 2660 w
+(a) 2410 2660 w
+(string) 2519 2660 w
+(stored) 2855 2660 w
+(as) 3218 2660 w
+(a) 3378 2660 w
+(two-byte) 3487 2660 w
+(length) 3982 2660 w
+(and) 4344 2660 w
+(then) 4578 2660 w
+(that) 4849 2660 w
+(many) 720 2780 w
+(bytes.) 1021 2780 w
+(The) 1383 2780 w
+(above) 1603 2780 w
+(describes) 1929 2780 w
+(Version) 2429 2780 w
+(9) 2833 2780 w
+(of) 2935 2780 w
+(the) 3072 2780 w
+10 /LucidaTypewriter f
+(DirEntry) 3265 2780 w
+10 /LucidaSansUnicode00 f
+(format.) 3879 2780 w
+(Versions) 4305 2780 w
+(7) 4759 2780 w
+(and) 4860 2780 w
+(8) 720 2900 w
+(are) 821 2900 w
+(very) 1011 2900 w
+(similar;) 1250 2900 w
+(they) 1647 2900 w
+(can) 1892 2900 w
+(be) 2098 2900 w
+(read) 2255 2900 w
+(by) 2508 2900 w
+(the) 2661 2900 w
+(current) 2854 2900 w
+10 /LucidaSans-Italic f
+(vac) 3243 2900 w
+10 /LucidaSansUnicode00 f
+(source) 3447 2900 w
+(code) 3808 2900 w
+(but) 4078 2900 w
+(are) 4279 2900 w
+(not) 4470 2900 w
+(written.) 4669 2900 w
+(Earlier) 720 3020 w
+(versions) 1060 3020 w
+(were) 1498 3020 w
+(not) 1762 3020 w
+(widespread.) 1956 3020 w
+(A) 2608 3020 w
+10 /LucidaTypewriter f
+(DirEntry) 2711 3020 w
+10 /LucidaSansUnicode00 f
+(may) 3321 3020 w
+(be) 3555 3020 w
+(followed) 3708 3020 w
+(by) 4155 3020 w
+(optional) 4304 3020 w
+(exten\255) 4735 3020 w
+(sion) 720 3140 w
+(sections,) 967 3140 w
+(though) 1441 3140 w
+(none) 1832 3140 w
+(are) 2118 3140 w
+(currently) 2315 3140 w
+(used.) 2791 3140 w
+(The) 3132 3140 w
+10 /LucidaTypewriter f
+(mode) 3358 3140 w
+10 /LucidaSansUnicode00 f
+(bits) 3691 3140 w
+(include) 3916 3140 w
+(bits) 4313 3140 w
+(commonly) 4538 3140 w
+(used by Unix and Windows, in addition to those used by Plan 9.) 720 3260 w
+(The) 970 3416 w
+10 /LucidaTypewriter f
+(entry) 1190 3416 w
+10 /LucidaSansUnicode00 f
+(field) 1589 3416 w
+(is) 1842 3416 w
+(an) 1961 3416 w
+(index) 2117 3416 w
+(into) 2427 3416 w
+(the) 2655 3416 w
+(parallel) 2849 3416 w
+(Venti) 3245 3416 w
+(directory.) 3533 3416 w
+(The) 4067 3416 w
+10 /LucidaTypewriter f
+(gen) 4287 3416 w
+10 /LucidaSansUnicode00 f
+(field) 4543 3416 w
+(must) 4797 3416 w
+(match) 720 3536 w
+(the) 1052 3536 w
+10 /LucidaTypewriter f
+(gen) 1241 3536 w
+10 /LucidaSansUnicode00 f
+(field) 1491 3536 w
+(in) 1739 3536 w
+(the) 1863 3536 w
+(corresponding) 2051 3536 w
+10 /LucidaTypewriter f
+(VtEntry) 2787 3536 w
+10 /LucidaSansUnicode00 f
+(in) 3324 3536 w
+(the) 3448 3536 w
+(directory;) 3636 3536 w
+(it) 4132 3536 w
+(is) 4231 3536 w
+(used) 4344 3536 w
+(to) 4609 3536 w
+(detect) 4740 3536 w
+(stale) 720 3656 w
+(indices.) 993 3656 w
+(Similarly,) 1443 3656 w
+10 /LucidaTypewriter f
+(mentry) 1931 3656 w
+10 /LucidaSansUnicode00 f
+(and) 2409 3656 w
+10 /LucidaTypewriter f
+(mgen) 2635 3656 w
+10 /LucidaSansUnicode00 f
+(are) 2969 3656 w
+(the) 3167 3656 w
+(index) 3368 3656 w
+(and) 3685 3656 w
+(generation) 3911 3656 w
+(number) 4478 3656 w
+(for) 4901 3656 w
+(the metadata Venti file, if the) 720 3776 w
+10 /LucidaTypewriter f
+(DirEntry) 2171 3776 w
+10 /LucidaSansUnicode00 f
+(describes a vac directory.) 2779 3776 w
+(The) 970 3932 w
+(relation) 1185 3932 w
+(between) 1589 3932 w
+(Venti) 2030 3932 w
+(files) 2313 3932 w
+(and) 2549 3932 w
+(directories) 2763 3932 w
+(and) 3312 3932 w
+(vac) 3526 3932 w
+(files) 3718 3932 w
+(and) 3954 3932 w
+(directories) 4168 3932 w
+(can) 4718 3932 w
+(be) 4921 3932 w
+(seen in this figure:) 720 4052 w
+1260 4266 1260 4194 Dl
+1260 4194 1332 4194 Dl
+1332 4194 1332 4266 Dl
+1332 4266 1260 4266 Dl
+8 /LucidaTypewriter f
+(VtRoot) 1404 4246 w
+gsave
+newpath
+1404 4482 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1404 4482 1404 4410 Dl
+1404 4410 1476 4410 Dl
+1476 4410 1476 4482 Dl
+1476 4482 1404 4482 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+8 /LucidaSansUnicode00 f
+(fs) 1548 4462 w
+(root) 1645 4462 w
+(block) 1832 4462 w
+gsave
+newpath
+1548 4698 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1548 4698 1548 4626 Dl
+1548 4626 1620 4626 Dl
+1620 4626 1620 4698 Dl
+1620 4698 1548 4698 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1620 4698 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1620 4698 1620 4626 Dl
+1620 4626 1692 4626 Dl
+1692 4626 1692 4698 Dl
+1692 4698 1620 4698 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1692 4698 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1692 4698 1692 4626 Dl
+1692 4626 1764 4626 Dl
+1764 4626 1764 4698 Dl
+1764 4698 1692 4698 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+(root) 1836 4678 w
+(directory) 2023 4678 w
+(info) 2395 4678 w
+(block) 2573 4678 w
+1836 4914 1836 4842 Dl
+1836 4842 1951 4842 Dl
+1951 4842 1951 4914 Dl
+1951 4914 1836 4914 Dl
+(root) 2023 4894 w
+(metadata) 2210 4894 w
+1836 5130 1836 5058 Dl
+1836 5058 1951 5058 Dl
+1951 5058 1951 5130 Dl
+1951 5130 1836 5130 Dl
+1951 5130 1951 5058 Dl
+1951 5058 2066 5058 Dl
+2066 5058 2066 5130 Dl
+2066 5130 1951 5130 Dl
+2066 5130 2066 5058 Dl
+2066 5058 2181 5058 Dl
+2181 5058 2181 5130 Dl
+2181 5130 2066 5130 Dl
+2181 5130 2181 5058 Dl
+2181 5058 2296 5058 Dl
+2296 5058 2296 5130 Dl
+2296 5130 2181 5130 Dl
+2296 5130 2296 5058 Dl
+2296 5058 2411 5058 Dl
+2412 5058 2412 5130 Dl
+2412 5130 2297 5130 Dl
+gsave
+newpath
+1836 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1836 5274 1836 5202 Dl
+1836 5202 1908 5202 Dl
+1908 5202 1908 5274 Dl
+1908 5274 1836 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1908 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1908 5274 1908 5202 Dl
+1908 5202 1980 5202 Dl
+1980 5202 1980 5274 Dl
+1980 5274 1908 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+1980 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+1980 5274 1980 5202 Dl
+1980 5202 2052 5202 Dl
+2052 5202 2052 5274 Dl
+2052 5274 1980 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2052 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2052 5274 2052 5202 Dl
+2052 5202 2124 5202 Dl
+2124 5202 2124 5274 Dl
+2124 5274 2052 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2124 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2124 5274 2124 5202 Dl
+2124 5202 2196 5202 Dl
+2196 5202 2196 5274 Dl
+2196 5274 2124 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2196 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2196 5274 2196 5202 Dl
+2196 5202 2268 5202 Dl
+2268 5202 2268 5274 Dl
+2268 5274 2196 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2268 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2268 5274 2268 5202 Dl
+2268 5202 2340 5202 Dl
+2340 5202 2340 5274 Dl
+2340 5274 2268 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2340 5274 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2340 5274 2340 5202 Dl
+2340 5202 2412 5202 Dl
+2412 5202 2412 5274 Dl
+2412 5274 2340 5274 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+2268 5490 2268 5418 Dl
+2268 5418 2383 5418 Dl
+2383 5418 2383 5490 Dl
+2383 5490 2268 5490 Dl
+2383 5490 2383 5418 Dl
+2383 5418 2498 5418 Dl
+2498 5418 2498 5490 Dl
+2498 5490 2383 5490 Dl
+2498 5490 2498 5418 Dl
+2498 5418 2613 5418 Dl
+2613 5418 2613 5490 Dl
+2613 5490 2498 5490 Dl
+2613 5490 2613 5418 Dl
+2613 5418 2728 5418 Dl
+2728 5418 2728 5490 Dl
+2728 5490 2613 5490 Dl
+2728 5490 2728 5418 Dl
+2728 5418 2843 5418 Dl
+2844 5418 2844 5490 Dl
+2844 5490 2729 5490 Dl
+gsave
+newpath
+2268 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2268 5634 2268 5562 Dl
+2268 5562 2340 5562 Dl
+2340 5562 2340 5634 Dl
+2340 5634 2268 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2340 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2340 5634 2340 5562 Dl
+2340 5562 2412 5562 Dl
+2412 5562 2412 5634 Dl
+2412 5634 2340 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2412 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2412 5634 2412 5562 Dl
+2412 5562 2484 5562 Dl
+2484 5562 2484 5634 Dl
+2484 5634 2412 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2484 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2484 5634 2484 5562 Dl
+2484 5562 2556 5562 Dl
+2556 5562 2556 5634 Dl
+2556 5634 2484 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2556 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2556 5634 2556 5562 Dl
+2556 5562 2628 5562 Dl
+2628 5562 2628 5634 Dl
+2628 5634 2556 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2628 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2628 5634 2628 5562 Dl
+2628 5562 2700 5562 Dl
+2700 5562 2700 5634 Dl
+2700 5634 2628 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2700 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2700 5634 2700 5562 Dl
+2700 5562 2772 5562 Dl
+2772 5562 2772 5634 Dl
+2772 5634 2700 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+2772 5634 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+2772 5634 2772 5562 Dl
+2772 5562 2844 5562 Dl
+2844 5562 2844 5634 Dl
+2844 5634 2772 5634 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+2268 5850 2268 5778 Dl
+2268 5778 2844 5778 Dl
+2844 5778 2844 5850 Dl
+2844 5850 2268 5850 Dl
+1296 4230 1296 4446 Dl
+1296 4446 1404 4446 Dl
+1332 4464 1404 4446 Dl
+1332 4428 1404 4446 Dl
+1440 4446 1440 4662 Dl
+1440 4662 1548 4662 Dl
+1476 4680 1548 4662 Dl
+1476 4644 1548 4662 Dl
+1728 4662 1728 4878 Dl
+1728 4878 1836 4878 Dl
+1764 4896 1836 4878 Dl
+1764 4860 1836 4878 Dl
+1656 4662 1656 5094 Dl
+1656 5094 1836 5094 Dl
+1764 5112 1836 5094 Dl
+1764 5076 1836 5094 Dl
+1584 4662 1584 5238 Dl
+1584 5238 1836 5238 Dl
+1764 5256 1836 5238 Dl
+1764 5220 1836 5238 Dl
+2016 5238 2016 5454 Dl
+2016 5454 2268 5454 Dl
+2196 5472 2268 5454 Dl
+2196 5436 2268 5454 Dl
+1944 5238 1944 5598 Dl
+1944 5598 2268 5598 Dl
+2196 5616 2268 5598 Dl
+2196 5580 2268 5598 Dl
+1872 5238 1872 5814 Dl
+1872 5814 2268 5814 Dl
+2196 5832 2268 5814 Dl
+2196 5796 2268 5814 Dl
+1893 5094 1872 5202 Dl
+1870 5165 1871 5201 Dl
+1887 5168 1872 5201 Dl
+2008 5094 1944 5202 Dl
+1954 5166 1944 5201 Dl
+1969 5176 1944 5201 Dl
+2008 5094 2015 5202 Dl
+2004 5166 2015 5201 Dl
+2022 5165 2016 5201 Dl
+2239 5094 2303 5202 Dl
+2278 5176 2303 5201 Dl
+2293 5166 2303 5201 Dl
+2354 5094 2160 5202 Dl
+2187 5176 2160 5201 Dl
+2196 5192 2160 5201 Dl
+1800 5310 1836 5310 Dl
+1876 5310 1912 5310 Dl
+1953 5310 1989 5310 Dl
+2029 5310 2065 5310 Dl
+2106 5310 2142 5310 Dl
+2182 5310 2218 5310 Dl
+2259 5310 2295 5310 Dl
+2335 5310 2371 5310 Dl
+2412 5310 2448 5310 Dl
+2448 5310 2448 5274 Dl
+2448 5247 2448 5211 Dl
+2448 5184 2448 5148 Dl
+2448 5121 2448 5085 Dl
+2448 5058 2448 5022 Dl
+2448 5022 2412 5022 Dl
+2371 5022 2335 5022 Dl
+2295 5022 2259 5022 Dl
+2218 5022 2182 5022 Dl
+2142 5022 2106 5022 Dl
+2065 5022 2029 5022 Dl
+1989 5022 1953 5022 Dl
+1912 5022 1876 5022 Dl
+1836 5022 1800 5022 Dl
+1800 5022 1800 5058 Dl
+1800 5085 1800 5121 Dl
+1800 5148 1800 5184 Dl
+1800 5211 1800 5247 Dl
+1800 5274 1800 5310 Dl
+2232 5670 2268 5670 Dl
+2308 5670 2344 5670 Dl
+2385 5670 2421 5670 Dl
+2461 5670 2497 5670 Dl
+2538 5670 2574 5670 Dl
+2614 5670 2650 5670 Dl
+2691 5670 2727 5670 Dl
+2767 5670 2803 5670 Dl
+2844 5670 2880 5670 Dl
+2880 5670 2880 5634 Dl
+2880 5607 2880 5571 Dl
+2880 5544 2880 5508 Dl
+2880 5481 2880 5445 Dl
+2880 5418 2880 5382 Dl
+2880 5382 2844 5382 Dl
+2803 5382 2767 5382 Dl
+2727 5382 2691 5382 Dl
+2650 5382 2614 5382 Dl
+2574 5382 2538 5382 Dl
+2497 5382 2461 5382 Dl
+2421 5382 2385 5382 Dl
+2344 5382 2308 5382 Dl
+2268 5382 2232 5382 Dl
+2232 5382 2232 5418 Dl
+2232 5445 2232 5481 Dl
+2232 5508 2232 5544 Dl
+2232 5571 2232 5607 Dl
+2232 5634 2232 5670 Dl
+5 /LucidaSansUnicode00 f
+(.) 2224 5888 w
+(.) 2260 5888 w
+(.) 2296 5888 w
+(.) 2332 5888 w
+(.) 2368 5888 w
+(.) 2404 5888 w
+(.) 2440 5888 w
+(.) 2476 5888 w
+(.) 2512 5888 w
+(.) 2548 5888 w
+(.) 2584 5888 w
+(.) 2620 5888 w
+(.) 2656 5888 w
+(.) 2692 5888 w
+(.) 2728 5888 w
+(.) 2764 5888 w
+(.) 2800 5888 w
+(.) 2836 5888 w
+(.) 2872 5888 w
+(.) 2872 5888 w
+(.) 2872 5852 w
+(.) 2872 5816 w
+(.) 2872 5780 w
+(.) 2872 5744 w
+(.) 2872 5744 w
+(.) 2836 5744 w
+(.) 2800 5744 w
+(.) 2764 5744 w
+(.) 2728 5744 w
+(.) 2692 5744 w
+(.) 2656 5744 w
+(.) 2620 5744 w
+(.) 2584 5744 w
+(.) 2548 5744 w
+(.) 2512 5744 w
+(.) 2476 5744 w
+(.) 2440 5744 w
+(.) 2404 5744 w
+(.) 2368 5744 w
+(.) 2332 5744 w
+(.) 2296 5744 w
+(.) 2260 5744 w
+(.) 2224 5744 w
+(.) 2224 5744 w
+(.) 2224 5780 w
+(.) 2224 5816 w
+(.) 2224 5852 w
+(.) 2224 5888 w
+8 /LucidaSansUnicode00 f
+(Key) 3891 4246 w
+3420 4266 4500 4266 Dl
+3420 4482 3420 4410 Dl
+3420 4410 3708 4410 Dl
+3708 4410 3708 4482 Dl
+3708 4482 3420 4482 Dl
+(Venti) 3780 4462 w
+(file) 4006 4462 w
+gsave
+newpath
+3420 4626 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3420 4626 3420 4554 Dl
+3420 4554 3492 4554 Dl
+3492 4554 3492 4626 Dl
+3492 4626 3420 4626 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+(Venti) 3780 4606 w
+(entry) 4006 4606 w
+(\() 4232 4606 w
+8 /LucidaTypewriter f
+(Entry) 4258 4606 w
+8 /LucidaSansUnicode00 f
+(\)) 4548 4606 w
+gsave
+newpath
+3420 4770 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3420 4770 3420 4698 Dl
+3420 4698 3492 4698 Dl
+3492 4698 3492 4770 Dl
+3492 4770 3420 4770 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3492 4770 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3492 4770 3492 4698 Dl
+3492 4698 3564 4698 Dl
+3564 4698 3564 4770 Dl
+3564 4770 3492 4770 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3564 4770 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3564 4770 3564 4698 Dl
+3564 4698 3636 4698 Dl
+3636 4698 3636 4770 Dl
+3636 4770 3564 4770 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+gsave
+newpath
+3636 4770 m
+/inpath true def
+ 0.9 0.9 0.9 setrgbcolor
+3636 4770 3636 4698 Dl
+3636 4698 3708 4698 Dl
+3708 4698 3708 4770 Dl
+3708 4770 3636 4770 Dl
+ gsave eofill grestore 0 setgray stroke
+grestore
+/inpath false def
+(Venti) 3780 4750 w
+(directory) 4006 4750 w
+3420 4842 3636 4842 Dl
+3564 4860 3636 4842 Dl
+3564 4824 3636 4842 Dl
+(Venti) 3780 4858 w
+(pointer) 4006 4858 w
+(\(score\)) 4312 4858 w
+5 /LucidaSansUnicode00 f
+(.) 3412 4988 w
+(.) 3448 4988 w
+(.) 3484 4988 w
+(.) 3520 4988 w
+(.) 3556 4988 w
+(.) 3592 4988 w
+(.) 3628 4988 w
+(.) 3664 4988 w
+(.) 3700 4988 w
+(.) 3700 4988 w
+(.) 3700 4952 w
+(.) 3700 4916 w
+(.) 3700 4916 w
+(.) 3664 4916 w
+(.) 3628 4916 w
+(.) 3592 4916 w
+(.) 3556 4916 w
+(.) 3520 4916 w
+(.) 3484 4916 w
+(.) 3448 4916 w
+(.) 3412 4916 w
+(.) 3412 4916 w
+(.) 3412 4952 w
+(.) 3412 4988 w
+8 /LucidaSansUnicode00 f
+(Vac) 3780 4966 w
+(file) 3943 4966 w
+3420 5130 3420 5058 Dl
+3420 5058 3535 5058 Dl
+3535 5058 3535 5130 Dl
+3535 5130 3420 5130 Dl
+(Vac) 3780 5110 w
+(entry) 3943 5110 w
+(\() 4169 5110 w
+8 /LucidaTypewriter f
+(DirEntry) 4195 5110 w
+8 /LucidaSansUnicode00 f
+(\)) 4659 5110 w
+3420 5274 3456 5274 Dl
+3483 5274 3519 5274 Dl
+3546 5274 3582 5274 Dl
+3609 5274 3645 5274 Dl
+3672 5274 3708 5274 Dl
+3708 5274 3708 5202 Dl
+3708 5202 3672 5202 Dl
+3645 5202 3609 5202 Dl
+3582 5202 3546 5202 Dl
+3519 5202 3483 5202 Dl
+3456 5202 3420 5202 Dl
+3420 5202 3420 5274 Dl
+(Vac) 3780 5254 w
+(directory) 3943 5254 w
+3420 5346 3528 5346 Dl
+3492 5354 3528 5345 Dl
+3492 5336 3528 5345 Dl
+(Vac) 3780 5362 w
+(pointer) 3943 5362 w
+(\(integer) 4249 5362 w
+(index\)) 4577 5362 w
+10 /LucidaSansUnicode00 f
+(In) 720 6107 w
+(reality,) 848 6107 w
+(the) 1216 6107 w
+(story) 1408 6107 w
+(is) 1687 6107 w
+(slightly) 1804 6107 w
+(more) 2192 6107 w
+(complicated.) 2480 6107 w
+(The) 3169 6107 w
+(metadata) 3387 6107 w
+(file) 3875 6107 w
+(in) 4063 6107 w
+(a) 4191 6107 w
+(Vac) 4283 6107 w
+(directory) 4491 6107 w
+(is) 4960 6107 w
+(not) 720 6227 w
+(just) 918 6227 w
+(the) 1136 6227 w
+(concatenation) 1329 6227 w
+(of) 2046 6227 w
+10 /LucidaTypewriter f
+(DirEntry) 2182 6227 w
+10 /LucidaSansUnicode00 f
+(structures.) 2796 6227 w
+(Instead,) 3387 6227 w
+(it) 3810 6227 w
+(is) 3914 6227 w
+(the) 4032 6227 w
+(concatenation) 4225 6227 w
+(of) 4942 6227 w
+10 /LucidaTypewriter f
+(MetaBlocks) 720 6347 w
+10 /LucidaSansUnicode00 f
+(.) 1440 6347 w
+(A) 1541 6347 w
+10 /LucidaTypewriter f
+(MetaBlock) 1648 6347 w
+10 /LucidaSansUnicode00 f
+(contains) 2334 6347 w
+(some) 2780 6347 w
+(number) 3079 6347 w
+(of) 3494 6347 w
+10 /LucidaTypewriter f
+(DirEntry) 3630 6347 w
+10 /LucidaSansUnicode00 f
+(structures) 4244 6347 w
+(along) 4771 6347 w
+(with) 720 6467 w
+(a) 974 6467 w
+(sorted) 1078 6467 w
+(index) 1436 6467 w
+(to) 1756 6467 w
+(make) 1903 6467 w
+(it) 2214 6467 w
+(easy) 2329 6467 w
+(to) 2592 6467 w
+(look) 2739 6467 w
+(for) 2997 6467 w
+(a) 3185 6467 w
+(particular) 3289 6467 w
+10 /LucidaTypewriter f
+(DirEntry) 3800 6467 w
+10 /LucidaSansUnicode00 f
+(by) 4424 6467 w
+(its) 4587 6467 w
+10 /LucidaTypewriter f
+(elem) 4752 6467 w
+10 /LucidaSansUnicode00 f
+(field.) 720 6587 w
+(The details are in the source code.) 1030 6587 w
+(As) 970 6743 w
+(shown) 1135 6743 w
+(in) 1493 6743 w
+(the) 1629 6743 w
+(diagram,) 1829 6743 w
+(the) 2304 6743 w
+(root) 2504 6743 w
+(directory) 2749 6743 w
+(of) 3225 6743 w
+(the) 3369 6743 w
+(file) 3570 6743 w
+(system) 3767 6743 w
+(is) 4153 6743 w
+(summarized) 4279 6743 w
+(by) 4925 6743 w
+(three) 720 6863 w
+10 /LucidaTypewriter f
+(VtEntry) 1019 6863 w
+10 /LucidaSansUnicode00 f
+(structures) 1570 6863 w
+(describing) 2106 6863 w
+(the) 2660 6863 w
+(Venti) 2862 6863 w
+(directory) 3158 6863 w
+(for) 3636 6863 w
+(the) 3822 6863 w
+(children) 4024 6863 w
+(of) 4463 6863 w
+(the) 4607 6863 w
+(root,) 4808 6863 w
+(the) 720 6983 w
+(Venti) 912 6983 w
+(file) 1198 6983 w
+(for) 1386 6983 w
+(the) 1562 6983 w
+(metadata) 1754 6983 w
+(describing) 2242 6983 w
+(the) 2786 6983 w
+(children) 2978 6983 w
+(of) 3408 6983 w
+(the) 3543 6983 w
+(root,) 3735 6983 w
+(and) 4005 6983 w
+(a) 4223 6983 w
+(Venti) 4316 6983 w
+(file) 4603 6983 w
+(hold\255) 4792 6983 w
+(ing) 720 7103 w
+(metadata) 919 7103 w
+(for) 1416 7103 w
+(the) 1601 7103 w
+(root) 1802 7103 w
+(directory) 2048 7103 w
+(itself.) 2525 7103 w
+(These) 2874 7103 w
+10 /LucidaTypewriter f
+(VtEntry) 3207 7103 w
+10 /LucidaSansUnicode00 f
+(structures) 3756 7103 w
+(are) 4290 7103 w
+(placed) 4487 7103 w
+(in) 4849 7103 w
+(a) 4985 7103 w
+(Venti directory of their own, described by the single) 720 7223 w
+10 /LucidaTypewriter f
+(VtEntry) 3275 7223 w
+10 /LucidaSansUnicode00 f
+(in the root block.) 3811 7223 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 4 4
+%%Page: 5 5
+/saveobj save def
+mark
+5 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 5 \255) 2783 480 w
+10 /LucidaSans-Demi f
+(4.) 720 840 w
+(Fossil file system format) 873 840 w
+10 /LucidaSansUnicode00 f
+(Fossil) 720 996 w
+(uses) 1034 996 w
+(the) 1293 996 w
+(vac) 1487 996 w
+(format,) 1684 996 w
+(with) 2079 996 w
+(some) 2323 996 w
+(small) 2623 996 w
+(changes.) 2919 996 w
+(The) 3421 996 w
+(changes) 3641 996 w
+(only) 4079 996 w
+(affect) 4322 996 w
+(the) 4635 996 w
+(data) 4830 996 w
+(on the local disk; the data archived to Venti is exactly in vac format.) 720 1116 w
+(Blocks) 970 1272 w
+(stored) 1321 1272 w
+(on) 1673 1272 w
+(local) 1839 1272 w
+(disk) 2107 1272 w
+(may) 2351 1272 w
+(contain) 2594 1272 w
+(scores) 2994 1272 w
+(pointing) 3348 1272 w
+(at) 3796 1272 w
+(local) 3931 1272 w
+(disk) 4200 1272 w
+(blocks) 4445 1272 w
+(or) 4802 1272 w
+(at) 4948 1272 w
+(Venti) 720 1392 w
+(blocks.) 1013 1392 w
+(Local) 1402 1392 w
+(block) 1695 1392 w
+(addresses) 2001 1392 w
+(are) 2532 1392 w
+(stored) 2728 1392 w
+(as) 3081 1392 w
+(20-byte) 3231 1392 w
+(scores) 3666 1392 w
+(in) 4020 1392 w
+(which) 4154 1392 w
+(the) 4478 1392 w
+(first) 4676 1392 w
+(16) 4914 1392 w
+(bytes are all zero and the last 4 bytes specify a block number) 720 1512 w
+(in) 3738 1512 w
+(the) 3862 1512 w
+(disk.) 4050 1512 w
+(Before) 4348 1512 w
+(a) 4690 1512 w
+(block) 4778 1512 w
+(is) 720 1632 w
+(archived,) 834 1632 w
+(all) 1309 1632 w
+(the) 1456 1632 w
+(blocks) 1645 1632 w
+(it) 1992 1632 w
+(points) 2092 1632 w
+(to) 2429 1632 w
+(must) 2561 1632 w
+(be) 2837 1632 w
+(archived,) 2989 1632 w
+(and) 3463 1632 w
+(the) 3676 1632 w
+(local) 3864 1632 w
+(scores) 4122 1632 w
+(in) 4466 1632 w
+(the) 4590 1632 w
+(block) 4778 1632 w
+(must) 720 1752 w
+(be) 997 1752 w
+(changed) 1150 1752 w
+(to) 1596 1752 w
+(Venti) 1729 1752 w
+(scores.) 2013 1752 w
+(Using) 2423 1752 w
+(block) 2731 1752 w
+(addresses) 3028 1752 w
+(rather) 3550 1752 w
+(than) 3877 1752 w
+(content) 4128 1752 w
+(hashes) 4529 1752 w
+(for) 4901 1752 w
+(local) 720 1872 w
+(data) 1002 1872 w
+(makes) 1269 1872 w
+(the) 1639 1872 w
+(local) 1851 1872 w
+(file) 2133 1872 w
+(system) 2341 1872 w
+(easier) 2738 1872 w
+(to) 3083 1872 w
+(manage:) 3237 1872 w
+(if) 3708 1872 w
+(a) 3830 1872 w
+(local) 3941 1872 w
+(block) 4222 1872 w
+10 /LucidaSansUnicode20 f
+(\031) 4484 1872 w
+10 /LucidaSansUnicode00 f
+(s) 4516 1872 w
+(contents) 4623 1872 w
+(change, the pointer to the block does not need to change.) 720 1992 w
+10 /LucidaSans-Demi f
+(4.1.) 720 2232 w
+(Snapshots) 962 2232 w
+10 /LucidaSansUnicode00 f
+(Fossil) 720 2388 w
+(is) 1031 2388 w
+(an) 1147 2388 w
+(archival) 1300 2388 w
+(file) 1710 2388 w
+(server.) 1897 2388 w
+(It) 2294 2388 w
+(takes) 2396 2388 w
+(periodic) 2689 2388 w
+(snapshots) 3118 2388 w
+(of) 3647 2388 w
+(the) 3781 2388 w
+(file) 3973 2388 w
+(system,) 4161 2388 w
+(which) 4570 2388 w
+(are) 4888 2388 w
+(made) 720 2508 w
+(accessible) 1021 2508 w
+(through) 1547 2508 w
+(the) 1968 2508 w
+(file) 2157 2508 w
+(system.) 2342 2508 w
+(Specifically,) 2780 2508 w
+(the) 3381 2508 w
+(active) 3570 2508 w
+(file) 3884 2508 w
+(system) 4069 2508 w
+(is) 4442 2508 w
+(presented) 4555 2508 w
+(in) 720 2628 w
+10 /LucidaTypewriter f
+(/active) 859 2628 w
+10 /LucidaSansUnicode00 f
+(.) 1363 2628 w
+(Ephemeral) 1475 2628 w
+(snapshots) 2032 2628 w
+(\(those) 2573 2628 w
+(that) 2921 2628 w
+(are) 3161 2628 w
+(kept) 3362 2628 w
+(on) 3625 2628 w
+(local) 3797 2628 w
+(disk) 4071 2628 w
+(and) 4321 2628 w
+(eventually) 4550 2628 w
+(deleted\)) 720 2748 w
+(are) 1171 2748 w
+(presented) 1381 2748 w
+(in) 1924 2748 w
+10 /LucidaTypewriter f
+(/snapshot/) 2113 2748 w
+10 /LucidaSans-Italic f
+(yyyy) 2833 2748 w
+10 /LucidaTypewriter f
+(/) 3053 2748 w
+10 /LucidaSans-Italic f
+(mmdd) 3125 2748 w
+10 /LucidaTypewriter f
+(/) 3433 2748 w
+10 /LucidaSans-Italic f
+(hhmm) 3505 2748 w
+10 /LucidaSansUnicode00 f
+(,) 3815 2748 w
+(where) 3905 2748 w
+10 /LucidaSans-Italic f
+(yyyy) 4255 2748 w
+10 /LucidaSansUnicode00 f
+(is) 4533 2748 w
+(the) 4671 2748 w
+(full) 4883 2748 w
+(year,) 720 2868 w
+10 /LucidaSans-Italic f
+(mm) 999 2868 w
+10 /LucidaSansUnicode00 f
+(is) 1228 2868 w
+(the) 1351 2868 w
+(month) 1549 2868 w
+(number,) 1907 2868 w
+10 /LucidaSans-Italic f
+(dd) 2359 2868 w
+10 /LucidaSansUnicode00 f
+(is) 2524 2868 w
+(the) 2648 2868 w
+(day) 2847 2868 w
+(number,) 3061 2868 w
+10 /LucidaSans-Italic f
+(hh) 3514 2868 w
+10 /LucidaSansUnicode00 f
+(is) 3682 2868 w
+(the) 3806 2868 w
+(hour,) 4005 2868 w
+(and) 4307 2868 w
+10 /LucidaSans-Italic f
+(mm) 4531 2868 w
+10 /LucidaSansUnicode00 f
+(is) 4761 2868 w
+(the) 4885 2868 w
+(minute.) 720 2988 w
+(Archival) 1173 2988 w
+(snapshots) 1611 2988 w
+(\(those) 2154 2988 w
+(that) 2504 2988 w
+(are) 2745 2988 w
+(archived) 2947 2988 w
+(to) 3406 2988 w
+(Venti) 3554 2988 w
+(and) 3853 2988 w
+(persist) 4083 2988 w
+(forever\)) 4461 2988 w
+(are) 4888 2988 w
+(presented) 720 3108 w
+(in) 1241 3108 w
+10 /LucidaTypewriter f
+(/archive/) 1408 3108 w
+10 /LucidaSans-Italic f
+(yyyy) 2056 3108 w
+10 /LucidaTypewriter f
+(/) 2276 3108 w
+10 /LucidaSans-Italic f
+(mmdds) 2348 3108 w
+10 /LucidaSansUnicode00 f
+(,) 2705 3108 w
+(where) 2773 3108 w
+10 /LucidaSans-Italic f
+(yyyy) 3101 3108 w
+10 /LucidaSansUnicode00 f
+(,) 3321 3108 w
+10 /LucidaSans-Italic f
+(mm) 3389 3108 w
+10 /LucidaSansUnicode00 f
+(,) 3575 3108 w
+(and) 3643 3108 w
+10 /LucidaSans-Italic f
+(dd) 3859 3108 w
+10 /LucidaSansUnicode00 f
+(are) 4017 3108 w
+(year,) 4205 3108 w
+(month,) 4477 3108 w
+(and) 4860 3108 w
+(day) 720 3228 w
+(as) 923 3228 w
+(before,) 1062 3228 w
+(and) 1441 3228 w
+10 /LucidaSans-Italic f
+(s) 1654 3228 w
+10 /LucidaSansUnicode00 f
+(is) 1736 3228 w
+(a) 1849 3228 w
+(sequence) 1937 3228 w
+(number) 2427 3228 w
+(if) 2837 3228 w
+(more) 2936 3228 w
+(than) 3220 3228 w
+(one) 3469 3228 w
+(archival) 3681 3228 w
+(snapshot) 4088 3228 w
+(is done in) 4563 3228 w
+(a) 720 3348 w
+(day.) 811 3348 w
+(For) 1081 3348 w
+(the) 1273 3348 w
+(first) 1464 3348 w
+(snapshot,) 1695 3348 w
+10 /LucidaSans-Italic f
+(s) 2205 3348 w
+10 /LucidaSansUnicode00 f
+(is) 2290 3348 w
+(null.) 2406 3348 w
+(For) 2688 3348 w
+(the) 2881 3348 w
+(subsequent) 3073 3348 w
+(snapshots,) 3673 3348 w
+10 /LucidaSans-Italic f
+(s) 4235 3348 w
+10 /LucidaSansUnicode00 f
+(is) 4321 3348 w
+10 /LucidaTypewriter f
+(.1) 4438 3348 w
+10 /LucidaSansUnicode00 f
+(,) 4582 3348 w
+10 /LucidaTypewriter f
+(.2) 4651 3348 w
+10 /LucidaSansUnicode00 f
+(,) 4795 3348 w
+10 /LucidaTypewriter f
+(.3) 4864 3348 w
+10 /LucidaSansUnicode00 f
+(,) 5008 3348 w
+(etc.) 720 3468 w
+(To) 970 3624 w
+(implement) 1151 3624 w
+(the) 1726 3624 w
+(snapshots,) 1938 3624 w
+(the) 2520 3624 w
+(file) 2732 3624 w
+(server) 2940 3624 w
+(maintains) 3295 3624 w
+(a) 3826 3624 w
+(current) 3939 3624 w
+10 /LucidaSans-Italic f
+(epoch) 4347 3624 w
+10 /LucidaSansUnicode00 f
+(for) 4688 3624 w
+(the) 4885 3624 w
+(active) 720 3744 w
+(file) 1047 3744 w
+(system.) 1245 3744 w
+(Each) 1696 3744 w
+(local) 1965 3744 w
+(block) 2237 3744 w
+(has) 2546 3744 w
+(a) 2761 3744 w
+(label) 2863 3744 w
+(that) 3142 3744 w
+(records,) 3380 3744 w
+(among) 3822 3744 w
+(other) 4201 3744 w
+(things,) 4504 3744 w
+(the) 4885 3744 w
+(epoch) 720 3864 w
+(in) 1046 3864 w
+(which) 1170 3864 w
+(the) 1484 3864 w
+(block) 1672 3864 w
+(was) 1967 3864 w
+(allocated.) 2183 3864 w
+(If) 2716 3864 w
+(a) 2815 3864 w
+(block) 2903 3864 w
+(was) 3198 3864 w
+(allocated) 3414 3864 w
+(in) 3883 3864 w
+(an) 4007 3864 w
+(epoch) 4157 3864 w
+(earlier) 4483 3864 w
+(than) 4824 3864 w
+(the) 720 3984 w
+(current) 917 3984 w
+(one,) 1309 3984 w
+(it) 1562 3984 w
+(is) 1670 3984 w
+(immutable) 1792 3984 w
+(and) 2351 3984 w
+(treated) 2573 3984 w
+(as) 2960 3984 w
+(copy-on-write.) 3108 3984 w
+(Taking) 3920 3984 w
+(a) 4291 3984 w
+(snapshot) 4388 3984 w
+(can) 4872 3984 w
+(be) 720 4104 w
+(accomplished) 872 4104 w
+(by) 1569 4104 w
+(recording) 1717 4104 w
+(the) 2217 4104 w
+(address) 2406 4104 w
+(of) 2820 4104 w
+(the) 2952 4104 w
+(current) 3141 4104 w
+(root) 3525 4104 w
+(block) 3759 4104 w
+(and) 4055 4104 w
+(then) 4269 4104 w
+(increment\255) 4520 4104 w
+(ing) 720 4224 w
+(the) 920 4224 w
+(epoch) 1122 4224 w
+(number.) 1462 4224 w
+(Notice) 1950 4224 w
+(that) 2305 4224 w
+(the) 2543 4224 w
+(copy-on-write) 2745 4224 w
+(method) 3498 4224 w
+(makes) 3917 4224 w
+(snapshots) 4277 4224 w
+(both) 4817 4224 w
+(time) 720 4344 w
+(efficient) 976 4344 w
+(and) 1411 4344 w
+(space) 1632 4344 w
+(efficient.) 1949 4344 w
+(The) 2449 4344 w
+(only) 2672 4344 w
+(time) 2918 4344 w
+(cost) 3175 4344 w
+(is) 3417 4344 w
+(waiting) 3539 4344 w
+(for) 3932 4344 w
+(all) 4113 4344 w
+(current) 4268 4344 w
+(file) 4660 4344 w
+(sys\255) 4853 4344 w
+(tem) 720 4464 w
+(requests) 948 4464 w
+(to) 1407 4464 w
+(finish) 1547 4464 w
+(and) 1859 4464 w
+(then) 2080 4464 w
+(incrementing) 2338 4464 w
+(a) 3019 4464 w
+(counter.) 3115 4464 w
+(After) 3590 4464 w
+(a) 3871 4464 w
+(snapshot,) 3967 4464 w
+(blocks) 4482 4464 w
+(only) 4836 4464 w
+(get) 720 4584 w
+(copied) 911 4584 w
+(when) 1270 4584 w
+(they) 1563 4584 w
+(are) 1806 4584 w
+(next) 1994 4584 w
+(modified,) 2246 4584 w
+(so) 2745 4584 w
+(the) 2893 4584 w
+(per-snapshot) 3084 4584 w
+(space) 3780 4584 w
+(requirement) 4092 4584 w
+(is) 4725 4584 w
+(pro\255) 4842 4584 w
+(portional to the amount of new data rather than the total size of the file system.) 720 4704 w
+(The) 970 4860 w
+(blocks) 1203 4860 w
+(in) 1568 4860 w
+(the) 1711 4860 w
+(archival) 1918 4860 w
+(snapshots) 2344 4860 w
+(are) 2889 4860 w
+(moved) 3093 4860 w
+(to) 3470 4860 w
+(Venti,) 3620 4860 w
+(but) 3953 4860 w
+(the) 4167 4860 w
+(blocks) 4375 4860 w
+(in) 4741 4860 w
+(the) 4885 4860 w
+(ephemeral) 720 4980 w
+(snapshots) 1274 4980 w
+(take) 1810 4980 w
+(up) 2059 4980 w
+(space) 2227 4980 w
+(in) 2546 4980 w
+(the) 2680 4980 w
+(local) 2878 4980 w
+(disk) 3146 4980 w
+(file.) 3390 4980 w
+(To) 3648 4980 w
+(allow) 3815 4980 w
+(reclamation) 4109 4980 w
+(of) 4721 4980 w
+(this) 4861 4980 w
+(disk) 720 5100 w
+(space,) 970 5100 w
+(the) 1327 5100 w
+(file) 1531 5100 w
+(system) 1731 5100 w
+(maintains) 2120 5100 w
+(a) 2642 5100 w
+10 /LucidaSans-Italic f
+(low) 2746 5100 w
+(epoch) 2959 5100 w
+10 /LucidaSansUnicode00 f
+(,) 3242 5100 w
+(which) 3324 5100 w
+(is) 3655 5100 w
+(the) 3785 5100 w
+(epoch) 3990 5100 w
+(of) 4333 5100 w
+(the) 4481 5100 w
+(earliest) 4686 5100 w
+(ephemeral) 720 5220 w
+(snapshot) 1279 5220 w
+(still) 1769 5220 w
+(available.) 1992 5220 w
+(Fossil) 2526 5220 w
+(only) 2848 5220 w
+(allows) 3099 5220 w
+(access) 3448 5220 w
+(to) 3810 5220 w
+(snapshots) 3955 5220 w
+(with) 4495 5220 w
+(epoch) 4747 5220 w
+(numbers) 720 5340 w
+(between) 1198 5340 w
+(the) 1655 5340 w
+(low) 1860 5340 w
+(epoch) 2077 5340 w
+(and) 2420 5340 w
+(the) 2650 5340 w
+(current) 2855 5340 w
+(epoch) 3255 5340 w
+(\(also) 3598 5340 w
+(called) 3877 5340 w
+(the) 4210 5340 w
+(high) 4416 5340 w
+(epoch\).) 4682 5340 w
+(Incrementing) 720 5460 w
+(the) 1395 5460 w
+(low) 1585 5460 w
+(epoch) 1787 5460 w
+(thus) 2115 5460 w
+(makes) 2362 5460 w
+(old) 2709 5460 w
+(snapshots) 2896 5460 w
+(inaccessible.) 3423 5460 w
+(The) 4104 5460 w
+(space) 4319 5460 w
+(required) 4629 5460 w
+(to store those snapshots can then be reclaimed, as described below.) 720 5580 w
+10 /LucidaSans-Demi f
+(4.2.) 720 5820 w
+(Local blocks) 962 5820 w
+10 /LucidaSansUnicode00 f
+(The) 720 5976 w
+(bulk) 936 5976 w
+(of) 1183 5976 w
+(the) 1316 5976 w
+(local) 1506 5976 w
+(disk) 1766 5976 w
+(file) 2002 5976 w
+(is) 2188 5976 w
+(the) 2303 5976 w
+(local) 2493 5976 w
+(blocks.) 2753 5976 w
+(Each) 3165 5976 w
+(block) 3422 5976 w
+(has) 3719 5976 w
+(a) 3922 5976 w
+(14-byte) 4013 5976 w
+(label) 4441 5976 w
+(associ\255) 4709 5976 w
+(ated with it, of the format:) 720 6096 w
+9 /LucidaTypewriter f
+(Label:) 1008 6266 w
+(state[1]) 1268 6376 w
+9 /LucidaSansUnicode00 f
+(block) 2308 6376 w
+(state) 2573 6376 w
+9 /LucidaTypewriter f
+(type[1]) 1268 6486 w
+9 /LucidaSansUnicode00 f
+(block) 2308 6486 w
+(type) 2573 6486 w
+9 /LucidaTypewriter f
+(epoch[4]) 1268 6596 w
+9 /LucidaSansUnicode00 f
+(allocation) 2308 6596 w
+(epoch) 2760 6596 w
+9 /LucidaTypewriter f
+(epochClose[4]) 1268 6706 w
+9 /LucidaSansUnicode00 f
+(close) 2308 6706 w
+(epoch) 2560 6706 w
+9 /LucidaTypewriter f
+(tag[4]) 1268 6816 w
+9 /LucidaSansUnicode00 f
+(random) 2308 6816 w
+(tag) 2676 6816 w
+10 /LucidaSansUnicode00 f
+(The) 720 7032 w
+10 /LucidaTypewriter f
+(type) 942 7032 w
+10 /LucidaSansUnicode00 f
+(is) 1272 7032 w
+(an) 1394 7032 w
+(analogue) 1553 7032 w
+(of) 2037 7032 w
+(the) 2177 7032 w
+(block) 2374 7032 w
+(types) 2678 7032 w
+(described) 2979 7032 w
+(earlier,) 3494 7032 w
+(though) 3875 7032 w
+(different) 4263 7032 w
+(names) 4723 7032 w
+(are) 720 7152 w
+(used,) 917 7152 w
+(to) 1226 7152 w
+(distinguish) 1369 7152 w
+(between) 1951 7152 w
+(pointer) 2403 7152 w
+(blocks) 2848 7152 w
+(in) 3205 7152 w
+(a) 3340 7152 w
+(hash) 3439 7152 w
+(tree) 3713 7152 w
+(for) 3947 7152 w
+(a) 4130 7152 w
+(data) 4229 7152 w
+(stream) 4483 7152 w
+(and) 4860 7152 w
+(pointer) 720 7272 w
+(blocks) 1114 7272 w
+(for) 1472 7272 w
+(a) 1657 7272 w
+(directory) 1758 7272 w
+(stream.) 2235 7272 w
+(The) 2678 7272 w
+10 /LucidaTypewriter f
+(epoch) 2905 7272 w
+10 /LucidaSansUnicode00 f
+(was) 3311 7272 w
+(mentioned) 3540 7272 w
+(in) 4105 7272 w
+(the) 4242 7272 w
+(last) 4443 7272 w
+(section.) 4661 7272 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 5 5
+%%Page: 6 6
+/saveobj save def
+mark
+6 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 6 \255) 2783 480 w
+(The other fields are explained below.) 720 840 w
+(There) 970 996 w
+(are) 1297 996 w
+(two) 1498 996 w
+(distinguished) 1722 996 w
+(block) 2428 996 w
+(states) 2791 996 w
+10 /LucidaTypewriter f
+(BsFree) 3128 996 w
+10 /LucidaSansUnicode00 f
+(\() 3610 996 w
+10 /LucidaTypewriter f
+(0x00) 3643 996 w
+10 /LucidaSansUnicode00 f
+(\)) 3931 996 w
+(and) 4014 996 w
+10 /LucidaTypewriter f
+(BsBad) 4244 996 w
+10 /LucidaSansUnicode00 f
+(\() 4654 996 w
+10 /LucidaTypewriter f
+(0xFF) 4687 996 w
+10 /LucidaSansUnicode00 f
+(\),) 4975 996 w
+(which) 720 1116 w
+(mark) 1043 1116 w
+(blocks) 1332 1116 w
+(that) 1687 1116 w
+(are) 1920 1116 w
+(available) 2114 1116 w
+(for) 2579 1116 w
+(allocation) 2759 1116 w
+(and) 3269 1116 w
+(blocks) 3490 1116 w
+(that) 3844 1116 w
+(are) 4076 1116 w
+(bad) 4269 1116 w
+(and) 4491 1116 w
+(should) 4712 1116 w
+(be) 720 1236 w
+(avoided.) 886 1236 w
+(If) 1376 1236 w
+10 /LucidaTypewriter f
+(state) 1489 1236 w
+10 /LucidaSansUnicode00 f
+(is) 1896 1236 w
+(not) 2023 1236 w
+(one) 2230 1236 w
+(of) 2456 1236 w
+(these) 2601 1236 w
+(values,) 2910 1236 w
+(it) 3294 1236 w
+(is) 3407 1236 w
+(a) 3534 1236 w
+(bitwise) 3637 1236 w
+10 /LucidaSansUnicode20 f
+(\030) 4027 1236 w
+10 /LucidaSans-Italic f
+(or) 4059 1236 w
+10 /LucidaSansUnicode20 f
+(\031) 4164 1236 w
+10 /LucidaSansUnicode00 f
+(of) 4244 1236 w
+(the) 4390 1236 w
+(following) 4593 1236 w
+(flags:) 720 1356 w
+9 /LucidaTypewriter f
+(0x01) 1008 1526 w
+(BsAlloc) 1398 1526 w
+9 /LucidaSansUnicode00 f
+(block) 2438 1526 w
+(is) 2703 1526 w
+(in) 2804 1526 w
+(use) 2915 1526 w
+9 /LucidaTypewriter f
+(0x02) 1008 1636 w
+(BsCopied) 1398 1636 w
+9 /LucidaSansUnicode00 f
+(block) 2438 1636 w
+(has) 2703 1636 w
+(been) 2884 1636 w
+(copied) 3126 1636 w
+9 /LucidaTypewriter f
+(0x04) 1008 1746 w
+(BsVenti) 1398 1746 w
+9 /LucidaSansUnicode00 f
+(block) 2438 1746 w
+(has) 2703 1746 w
+(been) 2884 1746 w
+(stored) 3126 1746 w
+(on) 3433 1746 w
+(Venti) 3573 1746 w
+9 /LucidaTypewriter f
+(0x08) 1008 1856 w
+(BsClosed) 1398 1856 w
+9 /LucidaSansUnicode00 f
+(block) 2438 1856 w
+(has) 2703 1856 w
+(been) 2884 1856 w
+(unlinked) 3126 1856 w
+(from) 3534 1856 w
+(active) 3772 1856 w
+(file) 4053 1856 w
+(system) 4217 1856 w
+10 /LucidaSansUnicode00 f
+(The flags are explained as they arise in the discussions below.) 720 2072 w
+(It) 970 2228 w
+(is) 1094 2228 w
+(convenient) 1232 2228 w
+(to) 1819 2228 w
+(store) 1976 2228 w
+(some) 2281 2228 w
+(extra) 2601 2228 w
+(fields) 2910 2228 w
+(in) 3234 2228 w
+(the) 3384 2228 w
+10 /LucidaTypewriter f
+(VtEntry) 3598 2228 w
+10 /LucidaSansUnicode00 f
+(structure) 4161 2228 w
+(when) 4658 2228 w
+(it) 4974 2228 w
+(describes) 720 2348 w
+(a) 1255 2348 w
+(Venti) 1383 2348 w
+(file) 1705 2348 w
+(or) 1929 2348 w
+(directory) 2104 2348 w
+(stored) 2608 2348 w
+(on) 2990 2348 w
+(local) 3186 2348 w
+(disk.) 3484 2348 w
+(Specifically,) 3822 2348 w
+(we) 4462 2348 w
+(set) 4668 2348 w
+(the) 4885 2348 w
+10 /LucidaTypewriter f
+(VtEntryLocal) 720 2468 w
+10 /LucidaSansUnicode00 f
+(flag) 1620 2468 w
+(bit) 1839 2468 w
+(and) 2004 2468 w
+(then) 2220 2468 w
+(use) 2473 2468 w
+(the) 2678 2468 w
+(bytes) 2869 2468 w
+(7-16) 3164 2468 w
+(of) 3447 2468 w
+(the) 3581 2468 w
+(score) 3773 2468 w
+(\(which) 4070 2468 w
+(would) 4421 2468 w
+(other\255) 4750 2468 w
+(wise be zero, since it is a local score\) to hold these fields:) 720 2588 w
+9 /LucidaTypewriter f
+(archive[1]) 1268 2758 w
+9 /LucidaSansUnicode00 f
+(boolean:) 2308 2758 w
+(this) 2715 2758 w
+(is) 2905 2758 w
+(an) 3006 2758 w
+(archival) 3141 2758 w
+(snapshot) 3508 2758 w
+9 /LucidaTypewriter f
+(snap[4]) 1268 2868 w
+9 /LucidaSansUnicode00 f
+(epoch) 2308 2868 w
+(number) 2601 2868 w
+(if) 2970 2868 w
+(root) 3058 2868 w
+(of) 3267 2868 w
+(snapshot) 3384 2868 w
+9 /LucidaTypewriter f
+(tag[4]) 1268 2978 w
+9 /LucidaSansUnicode00 f
+(random) 2308 2978 w
+(tag) 2676 2978 w
+10 /LucidaSansUnicode00 f
+(The) 720 3194 w
+(extended) 948 3194 w
+10 /LucidaTypewriter f
+(VtEntry) 1449 3194 w
+10 /LucidaSansUnicode00 f
+(structure) 2001 3194 w
+(is) 2487 3194 w
+(called) 2615 3194 w
+(an) 2946 3194 w
+10 /LucidaTypewriter f
+(Entry) 3111 3194 w
+10 /LucidaSansUnicode00 f
+(.) 3471 3194 w
+(The) 3583 3194 w
+10 /LucidaTypewriter f
+(tag) 3812 3194 w
+10 /LucidaSansUnicode00 f
+(field) 4076 3194 w
+(in) 4338 3194 w
+(the) 4477 3194 w
+10 /LucidaTypewriter f
+(Label) 4680 3194 w
+10 /LucidaSansUnicode00 f
+(and) 720 3314 w
+(the) 938 3314 w
+10 /LucidaTypewriter f
+(Entry) 1131 3314 w
+10 /LucidaSansUnicode00 f
+(is) 1529 3314 w
+(used) 1647 3314 w
+(to) 1917 3314 w
+(identify) 2052 3314 w
+(dangling) 2454 3314 w
+(pointers) 2915 3314 w
+(or) 3352 3314 w
+(other) 3491 3314 w
+(file) 3785 3314 w
+(system) 3973 3314 w
+(corruption:) 4350 3314 w
+(all) 4927 3314 w
+(the) 720 3434 w
+(local) 916 3434 w
+(blocks) 1183 3434 w
+(in) 1538 3434 w
+(a) 1671 3434 w
+(hash) 1768 3434 w
+(tree) 2040 3434 w
+(must) 2272 3434 w
+(have) 2557 3434 w
+(tags) 2824 3434 w
+(matching) 3071 3434 w
+(the) 3564 3434 w
+(tag) 3761 3434 w
+(in) 3957 3434 w
+(the) 4090 3434 w
+10 /LucidaTypewriter f
+(Entry) 4287 3434 w
+10 /LucidaSansUnicode00 f
+(.) 4647 3434 w
+(If) 4753 3434 w
+(this) 4861 3434 w
+10 /LucidaTypewriter f
+(Entry) 720 3554 w
+10 /LucidaSansUnicode00 f
+(points) 1114 3554 w
+(at) 1451 3554 w
+(the) 1577 3554 w
+(root) 1766 3554 w
+(of) 2000 3554 w
+(a) 2132 3554 w
+(snapshot,) 2221 3554 w
+(the) 2729 3554 w
+10 /LucidaTypewriter f
+(snap) 2918 3554 w
+10 /LucidaSansUnicode00 f
+(field) 3240 3554 w
+(is) 3488 3554 w
+(the) 3602 3554 w
+(epoch) 3790 3554 w
+(of) 4116 3554 w
+(the) 4247 3554 w
+(snapshot.) 4435 3554 w
+(If) 4974 3554 w
+(the snapshot is intended to be archived to Venti, the) 720 3674 w
+10 /LucidaTypewriter f
+(archive) 3305 3674 w
+10 /LucidaSansUnicode00 f
+(field is non-zero.) 3841 3674 w
+10 /LucidaSans-Demi f
+(4.3.) 720 3914 w
+(Block reclamation) 962 3914 w
+10 /LucidaSansUnicode00 f
+(The) 720 4070 w
+(blocks) 935 4070 w
+(in) 1282 4070 w
+(the) 1407 4070 w
+(active) 1596 4070 w
+(file) 1910 4070 w
+(system) 2095 4070 w
+(form) 2469 4070 w
+(a) 2735 4070 w
+(tree:) 2824 4070 w
+(each) 3080 4070 w
+(block) 3338 4070 w
+(has) 3634 4070 w
+(only) 3837 4070 w
+(one) 4076 4070 w
+(parent.) 4290 4070 w
+(Once) 4703 4070 w
+(a) 4985 4070 w
+(copy-on-write) 720 4190 w
+(block) 1468 4190 w
+10 /LucidaSans-Italic f
+(b) 1772 4190 w
+10 /LucidaSansUnicode00 f
+(is) 1875 4190 w
+(replaced) 1997 4190 w
+(by) 2453 4190 w
+(its) 2610 4190 w
+(copy,) 2769 4190 w
+(it) 3070 4190 w
+(is) 3178 4190 w
+(no) 3300 4190 w
+(longer) 3465 4190 w
+(needed) 3818 4190 w
+(by) 4216 4190 w
+(the) 4372 4190 w
+(active) 4568 4190 w
+(file) 4889 4190 w
+(system.) 720 4310 w
+(At) 1167 4310 w
+(this) 1316 4310 w
+(point,) 1538 4310 w
+10 /LucidaSans-Italic f
+(b) 1865 4310 w
+10 /LucidaSansUnicode00 f
+(is) 1969 4310 w
+(unlinked) 2092 4310 w
+(from) 2556 4310 w
+(the) 2831 4310 w
+(active) 3029 4310 w
+(file) 3352 4310 w
+(system.) 3546 4310 w
+(We) 3993 4310 w
+(say) 4178 4310 w
+(that) 4379 4310 w
+10 /LucidaSans-Italic f
+(b) 4613 4310 w
+10 /LucidaSansUnicode00 f
+(is) 4717 4310 w
+(now) 4840 4310 w
+10 /LucidaSans-Italic f
+(closed) 720 4430 w
+10 /LucidaSansUnicode00 f
+(:) 1019 4430 w
+(it) 1094 4430 w
+(is) 1203 4430 w
+(needed) 1326 4430 w
+(only) 1725 4430 w
+(for) 1972 4430 w
+(snapshots.) 2154 4430 w
+(When) 2754 4430 w
+(a) 3063 4430 w
+(block) 3161 4430 w
+(is) 3466 4430 w
+(closed,) 3589 4430 w
+(the) 3974 4430 w
+10 /LucidaTypewriter f
+(BsClosed) 4171 4430 w
+10 /LucidaSansUnicode00 f
+(bit) 4789 4430 w
+(is) 4960 4430 w
+(set) 720 4550 w
+(in) 901 4550 w
+(its) 1029 4550 w
+(state,) 1183 4550 w
+(and) 1488 4550 w
+(the) 1705 4550 w
+(current) 1897 4550 w
+(epoch) 2284 4550 w
+(\(called) 2614 4550 w
+(the) 2968 4550 w
+(block) 3161 4550 w
+10 /LucidaSansUnicode20 f
+(\031) 3423 4550 w
+10 /LucidaSansUnicode00 f
+(s) 3455 4550 w
+(closing) 3544 4550 w
+(epoch\)) 3927 4550 w
+(is) 4291 4550 w
+(stored) 4409 4550 w
+(in) 4756 4550 w
+(the) 4885 4550 w
+10 /LucidaTypewriter f
+(epochClose) 720 4670 w
+10 /LucidaSansUnicode00 f
+(label field.) 1472 4670 w
+(\(Open blocks have an) 2046 4670 w
+10 /LucidaTypewriter f
+(epochClose) 3121 4670 w
+10 /LucidaSansUnicode00 f
+(of) 3873 4670 w
+10 /LucidaTypewriter f
+(~0) 4003 4670 w
+10 /LucidaSansUnicode00 f
+(\).) 4147 4670 w
+(A) 970 4826 w
+(block) 1094 4826 w
+(is) 1411 4826 w
+(referenced) 1546 4826 w
+(by) 2120 4826 w
+(snapshots) 2290 4826 w
+(with) 2838 4826 w
+(epochs) 3098 4826 w
+(between) 3497 4826 w
+(the) 3959 4826 w
+(block) 4170 4826 w
+10 /LucidaSansUnicode20 f
+(\031) 4432 4826 w
+10 /LucidaSansUnicode00 f
+(s) 4464 4826 w
+(allocation) 4571 4826 w
+(epoch) 720 4946 w
+(and) 1048 4946 w
+(its) 1263 4946 w
+(closing) 1415 4946 w
+(epoch.) 1795 4946 w
+(Once) 2187 4946 w
+(the) 2469 4946 w
+(file) 2659 4946 w
+(system) 2845 4946 w
+10 /LucidaSansUnicode20 f
+(\031) 3185 4946 w
+10 /LucidaSansUnicode00 f
+(s) 3217 4946 w
+(low) 3303 4946 w
+(epoch) 3504 4946 w
+(grows) 3831 4946 w
+(to) 4157 4946 w
+(be) 4289 4946 w
+(greater) 4442 4946 w
+(than) 4824 4946 w
+(or) 720 5066 w
+(equal) 861 5066 w
+(to) 1165 5066 w
+(the) 1302 5066 w
+(block) 1497 5066 w
+10 /LucidaSansUnicode20 f
+(\031) 1759 5066 w
+10 /LucidaSansUnicode00 f
+(s) 1791 5066 w
+(closing) 1882 5066 w
+(epoch,) 2267 5066 w
+(the) 2632 5066 w
+(block) 2827 5066 w
+(is) 3129 5066 w
+(no) 3249 5066 w
+(longer) 3412 5066 w
+(needed) 3763 5066 w
+(for) 4159 5066 w
+(any) 4338 5066 w
+(snapshots) 4547 5066 w
+(and can be reused.) 720 5186 w
+(In) 970 5342 w
+(a) 1094 5342 w
+(typical) 1182 5342 w
+(configuration,) 1531 5342 w
+(where) 2245 5342 w
+(nightly) 2570 5342 w
+(archival) 2936 5342 w
+(snapshots) 3343 5342 w
+(are) 3869 5342 w
+(taken) 4054 5342 w
+(and) 4355 5342 w
+(written) 4569 5342 w
+(to) 4942 5342 w
+(Venti,) 720 5462 w
+(it) 1043 5462 w
+(is) 1151 5462 w
+(desirable) 1272 5462 w
+(to) 1756 5462 w
+(reclaim) 1895 5462 w
+(the) 2290 5462 w
+(space) 2486 5462 w
+(occupied) 2803 5462 w
+(by) 3280 5462 w
+(now-archived) 3436 5462 w
+(blocks) 4144 5462 w
+(if) 4498 5462 w
+(possible.) 4605 5462 w
+(To) 720 5582 w
+(do) 885 5582 w
+(this,) 1050 5582 w
+(Fossil) 1302 5582 w
+(keeps) 1618 5582 w
+(track) 1943 5582 w
+(of) 2226 5582 w
+(whether) 2365 5582 w
+(the) 2797 5582 w
+(pointers) 2994 5582 w
+(in) 3436 5582 w
+(each) 3569 5582 w
+(block) 3835 5582 w
+(are) 4139 5582 w
+(unique) 4333 5582 w
+(to) 4709 5582 w
+(that) 4849 5582 w
+(block.) 720 5702 w
+(When) 1091 5702 w
+(a) 1402 5702 w
+(block) 1502 5702 w
+10 /LucidaSans-Italic f
+(bb) 1809 5702 w
+10 /LucidaSansUnicode00 f
+(is) 1976 5702 w
+(allocated,) 2100 5702 w
+(a) 2612 5702 w
+(pointer) 2711 5702 w
+(to) 3104 5702 w
+10 /LucidaSans-Italic f
+(bb) 3246 5702 w
+10 /LucidaSansUnicode00 f
+(is) 3412 5702 w
+(written) 3536 5702 w
+(into) 3919 5702 w
+(exactly) 4152 5702 w
+(one) 4537 5702 w
+(active) 4760 5702 w
+(block \(say,) 720 5822 w
+10 /LucidaSans-Italic f
+(b) 1269 5822 w
+10 /LucidaSansUnicode00 f
+(\).) 1330 5822 w
+(In the absence of snapshots, the) 1459 5822 w
+(pointer) 3070 5822 w
+(to) 3452 5822 w
+10 /LucidaSans-Italic f
+(bb) 3583 5822 w
+10 /LucidaSansUnicode00 f
+(will) 3738 5822 w
+(remain) 3935 5822 w
+(unique) 4304 5822 w
+(to) 4671 5822 w
+10 /LucidaSans-Italic f
+(b) 4802 5822 w
+10 /LucidaSansUnicode00 f
+(,) 4863 5822 w
+(so) 4928 5822 w
+(that) 720 5942 w
+(if) 951 5942 w
+(the) 1057 5942 w
+(pointer) 1252 5942 w
+(is) 1641 5942 w
+(zeroed,) 1761 5942 w
+10 /LucidaSans-Italic f
+(bb) 2167 5942 w
+10 /LucidaSansUnicode00 f
+(can) 2329 5942 w
+(be) 2537 5942 w
+(immediately) 2696 5942 w
+(reused.) 3328 5942 w
+(Snapshots) 3761 5942 w
+(complicate) 4297 5942 w
+(this) 4861 5942 w
+(invariant:) 720 6062 w
+(when) 1209 6062 w
+10 /LucidaSans-Italic f
+(b) 1501 6062 w
+10 /LucidaSansUnicode00 f
+(is) 1597 6062 w
+(copied-on-write,) 1712 6062 w
+(all) 2581 6062 w
+(its) 2730 6062 w
+(pointers) 2883 6062 w
+(are) 3319 6062 w
+(no) 3507 6062 w
+(longer) 3666 6062 w
+(unique) 4013 6062 w
+(to) 4383 6062 w
+(it.) 4517 6062 w
+(At) 4683 6062 w
+(time) 4825 6062 w
+(of) 720 6182 w
+(the) 851 6182 w
+(copy,) 1039 6182 w
+(the) 1331 6182 w
+10 /LucidaTypewriter f
+(BsCopied) 1519 6182 w
+10 /LucidaSansUnicode00 f
+(state) 2128 6182 w
+(bit) 2397 6182 w
+(in) 2559 6182 w
+(the) 2683 6182 w
+(block) 2871 6182 w
+10 /LucidaSansUnicode20 f
+(\031) 3133 6182 w
+10 /LucidaSansUnicode00 f
+(s) 3165 6182 w
+(label) 3249 6182 w
+(is) 3514 6182 w
+(set) 3627 6182 w
+(to note the duplication of) 3804 6182 w
+(the pointers contained within.) 720 6302 w
+10 /LucidaSans-Demi f
+(4.4.) 720 6542 w
+(Disk layout) 962 6542 w
+10 /LucidaSansUnicode00 f
+(The file system header describes the file system layout and has this format:) 720 6698 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 6 6
+%%Page: 7 7
+/saveobj save def
+mark
+7 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 7 \255) 2783 480 w
+9 /LucidaTypewriter f
+(Header:) 1008 830 w
+(magic[4]) 1268 940 w
+9 /LucidaSansUnicode00 f
+(0x3776AE89) 2308 940 w
+(\(HeaderMagic\)) 2902 940 w
+9 /LucidaTypewriter f
+(version[2]) 1268 1050 w
+9 /LucidaSansUnicode00 f
+(1) 2308 1050 w
+(\(HeaderVersion\)) 2394 1050 w
+9 /LucidaTypewriter f
+(blockSize[2]) 1268 1160 w
+9 /LucidaSans-Italic f
+(file) 2308 1160 w
+(system) 2471 1160 w
+(block) 2804 1160 w
+(size) 3062 1160 w
+9 /LucidaTypewriter f
+(super[4]) 1268 1270 w
+9 /LucidaSansUnicode00 f
+(block) 2308 1270 w
+(offset) 2573 1270 w
+(of) 2852 1270 w
+(super) 2969 1270 w
+(block) 3244 1270 w
+9 /LucidaTypewriter f
+(label[4]) 1268 1380 w
+9 /LucidaSansUnicode00 f
+(block) 2308 1380 w
+(offset) 2573 1380 w
+(of) 2852 1380 w
+(labels) 2969 1380 w
+9 /LucidaTypewriter f
+(data[4]) 1268 1490 w
+9 /LucidaSansUnicode00 f
+(data) 2308 1490 w
+(blocks) 2527 1490 w
+9 /LucidaTypewriter f
+(end[4]) 1268 1600 w
+9 /LucidaSansUnicode00 f
+(end) 2308 1600 w
+(of) 2500 1600 w
+(file) 2617 1600 w
+(system) 2781 1600 w
+10 /LucidaSansUnicode00 f
+(The corresponding file system layout is:) 720 1816 w
+1440 2128 1440 1948 Dl
+1440 1948 1980 1948 Dl
+1980 1948 1980 2128 Dl
+1980 2128 1440 2128 Dl
+8 /LucidaSansUnicode00 f
+(empty) 1590 2054 w
+1440 2236 1440 2128 Dl
+1440 2128 1980 2128 Dl
+1980 2128 1980 2236 Dl
+1980 2236 1440 2236 Dl
+(header) 1577 2198 w
+1440 2344 1440 2236 Dl
+1440 2236 1980 2236 Dl
+1980 2236 1980 2344 Dl
+1980 2344 1440 2344 Dl
+(empty) 1590 2306 w
+1440 2452 1440 2344 Dl
+1440 2344 1980 2344 Dl
+1980 2344 1980 2452 Dl
+1980 2452 1440 2452 Dl
+(super) 1483 2414 w
+(block) 1728 2414 w
+1440 2632 1440 2452 Dl
+1440 2452 1980 2452 Dl
+1980 2452 1980 2632 Dl
+1980 2632 1440 2632 Dl
+(label) 1618 2513 w
+(blocks) 1585 2603 w
+1440 2848 1440 2632 Dl
+1440 2632 1980 2632 Dl
+1980 2632 1980 2848 Dl
+1980 2848 1440 2848 Dl
+(data) 1626 2711 w
+(blocks) 1585 2801 w
+(0) 2032 1964 w
+(128kB) 2032 2144 w
+8 /LucidaTypewriter f
+(super) 2032 2360 w
+8 /LucidaSansUnicode00 f
+(\327) 2348 2360 w
+8 /LucidaTypewriter f
+(blockSize) 2438 2360 w
+(label) 2032 2468 w
+8 /LucidaSansUnicode00 f
+(\327) 2348 2468 w
+8 /LucidaTypewriter f
+(blockSize) 2438 2468 w
+(data) 2032 2648 w
+8 /LucidaSansUnicode00 f
+(\327) 2290 2648 w
+8 /LucidaTypewriter f
+(blockSize) 2380 2648 w
+(end) 2032 2864 w
+8 /LucidaSansUnicode00 f
+(\327) 2232 2864 w
+8 /LucidaTypewriter f
+(blockSize) 2322 2864 w
+10 /LucidaSansUnicode00 f
+(The numbers to the right of the blocks are byte offsets of the boundaries.) 720 3062 w
+(The super block describes the file system itself and looks like:) 720 3218 w
+9 /LucidaTypewriter f
+(Super:) 1008 3388 w
+(magic[4]) 1268 3498 w
+9 /LucidaSansUnicode00 f
+(0x2340A3B1) 2308 3498 w
+(\(SuperMagic\)) 2905 3498 w
+9 /LucidaTypewriter f
+(version[2]) 1268 3608 w
+9 /LucidaSansUnicode00 f
+(1) 2308 3608 w
+(\(SuperVersion\)) 2394 3608 w
+9 /LucidaTypewriter f
+(epochLow[4]) 1268 3718 w
+9 /LucidaSansUnicode00 f
+(file) 2308 3718 w
+(system) 2472 3718 w
+(low) 2807 3718 w
+(epoch) 2986 3718 w
+9 /LucidaTypewriter f
+(epochHigh[4]) 1268 3828 w
+9 /LucidaSansUnicode00 f
+(file) 2308 3828 w
+(system) 2472 3828 w
+(high) 2807 3828 w
+(\(active\)) 3030 3828 w
+(epoch) 3371 3828 w
+9 /LucidaTypewriter f
+(qid[8]) 1268 3938 w
+9 /LucidaSansUnicode00 f
+(next) 2308 3938 w
+(qid) 2531 3938 w
+(to) 2700 3938 w
+(allocate) 2817 3938 w
+9 /LucidaTypewriter f
+(active[4]) 1268 4048 w
+9 /LucidaSansUnicode00 f
+(data) 2308 4048 w
+(block) 2527 4048 w
+(number:) 2792 4048 w
+(root) 3190 4048 w
+(of) 3399 4048 w
+(active) 3516 4048 w
+(file) 3797 4048 w
+(system) 3961 4048 w
+9 /LucidaTypewriter f
+(next[4]) 1268 4158 w
+9 /LucidaSansUnicode00 f
+(data) 2308 4158 w
+(block) 2527 4158 w
+(number:) 2792 4158 w
+(root) 3190 4158 w
+(of) 3399 4158 w
+(next) 3516 4158 w
+(file) 3739 4158 w
+(system) 3903 4158 w
+(to) 4238 4158 w
+(archive) 4355 4158 w
+9 /LucidaTypewriter f
+(current[4]) 1268 4268 w
+9 /LucidaSansUnicode00 f
+(data) 2308 4268 w
+(block) 2527 4268 w
+(number:) 2792 4268 w
+(root) 3190 4268 w
+(of) 3399 4268 w
+(file) 3516 4268 w
+(system) 3680 4268 w
+(currently) 4015 4268 w
+(being) 4432 4268 w
+(archived) 4706 4268 w
+9 /LucidaTypewriter f
+(last[20]) 1268 4378 w
+9 /LucidaSansUnicode00 f
+(Venti) 2308 4378 w
+(score) 2561 4378 w
+(of) 2824 4378 w
+(last) 2941 4378 w
+(successful) 3125 4378 w
+(archive) 3605 4378 w
+9 /LucidaTypewriter f
+(name[128]) 1268 4488 w
+9 /LucidaSansUnicode00 f
+(name) 2308 4488 w
+(of) 2577 4488 w
+(file) 2694 4488 w
+(system) 2858 4488 w
+(\(just) 3193 4488 w
+(a) 3414 4488 w
+(comment\)) 3493 4488 w
+10 /LucidaSans-Demi f
+(5.) 720 4824 w
+(Fossil server) 873 4824 w
+10 /LucidaSansUnicode00 f
+(The Fossil server is a user-space program that runs on a standard Plan 9 kernel.) 720 4980 w
+10 /LucidaSans-Demi f
+(5.1.) 720 5220 w
+(Process structure) 962 5220 w
+10 /LucidaSansUnicode00 f
+(The) 970 5376 w
+(file) 1199 5376 w
+(server) 1398 5376 w
+(is) 1743 5376 w
+(structured) 1872 5376 w
+(as) 2422 5376 w
+(a) 2577 5376 w
+(set) 2681 5376 w
+(of) 2874 5376 w
+(processes) 3021 5376 w
+(synchronizing) 3551 5376 w
+(mostly) 4281 5376 w
+(through) 4653 5376 w
+(message) 720 5496 w
+(passing) 1185 5496 w
+(along) 1599 5496 w
+(queues.) 1909 5496 w
+(The) 2364 5496 w
+(processes) 2586 5496 w
+(are) 3108 5496 w
+(given) 3301 5496 w
+(names,) 3603 5496 w
+(which) 3993 5496 w
+(can) 4315 5496 w
+(be) 4524 5496 w
+(seen) 4684 5496 w
+(in) 4949 5496 w
+(the output of) 720 5616 w
+10 /LucidaTypewriter f
+(ps) 1391 5616 w
+(-a) 1567 5616 w
+10 /LucidaSansUnicode00 f
+(.) 1711 5616 w
+10 /LucidaTypewriter f
+(Listen) 970 5772 w
+10 /LucidaSansUnicode00 f
+(processes) 1441 5772 w
+(announce) 1961 5772 w
+(on) 2472 5772 w
+(various) 2635 5772 w
+(network) 3026 5772 w
+(addresses.) 3458 5772 w
+(A) 4049 5772 w
+10 /LucidaTypewriter f
+(con) 4158 5772 w
+10 /LucidaSansUnicode00 f
+(process) 4414 5772 w
+(han\255) 4828 5772 w
+(dles) 720 5892 w
+(each) 958 5892 w
+(incoming) 1221 5892 w
+(connection,) 1709 5892 w
+(reading) 2312 5892 w
+(9P) 2719 5892 w
+(requests) 2876 5892 w
+(and) 3332 5892 w
+(adding) 3551 5892 w
+(them) 3923 5892 w
+(to) 4209 5892 w
+(a) 4345 5892 w
+(central) 4438 5892 w
+(mes\255) 4807 5892 w
+(sage) 720 6012 w
+(queue.) 997 6012 w
+10 /LucidaTypewriter f
+(Msg) 1413 6012 w
+10 /LucidaSansUnicode00 f
+(processes) 1682 6012 w
+(remove) 2216 6012 w
+(9P) 2628 6012 w
+(requests) 2799 6012 w
+(from) 3269 6012 w
+(the) 3554 6012 w
+(queue,) 3762 6012 w
+(handle) 4146 6012 w
+(them,) 4526 6012 w
+(and) 4860 6012 w
+(write the responses to the appropriate file descriptors.) 720 6132 w
+(The) 970 6288 w
+10 /LucidaTypewriter f
+(disk) 1196 6288 w
+10 /LucidaSansUnicode00 f
+(process) 1530 6288 w
+(handles) 1950 6288 w
+(disk) 2374 6288 w
+(I/O) 2621 6288 w
+(requests) 2827 6288 w
+(made) 3290 6288 w
+(by) 3603 6288 w
+(the) 3764 6288 w
+(other) 3965 6288 w
+(processes.) 4268 6288 w
+(The) 4859 6288 w
+10 /LucidaTypewriter f
+(flush) 720 6408 w
+10 /LucidaSansUnicode00 f
+(process) 1143 6408 w
+(writes) 1580 6408 w
+(dirty) 1934 6408 w
+(blocks) 2218 6408 w
+(from) 2593 6408 w
+(the) 2887 6408 w
+(in-memory) 3104 6408 w
+(block) 3711 6408 w
+(cache) 4035 6408 w
+(to) 4372 6408 w
+(disk.) 4532 6408 w
+(The) 4859 6408 w
+10 /LucidaTypewriter f
+(unlink) 720 6528 w
+10 /LucidaSansUnicode00 f
+(process) 1187 6528 w
+(frees) 1596 6528 w
+(previously) 1872 6528 w
+(linked) 2403 6528 w
+(blocks) 2735 6528 w
+(once) 3083 6528 w
+(the) 3348 6528 w
+(blocks) 3539 6528 w
+(that) 3888 6528 w
+(point) 4115 6528 w
+(at) 4403 6528 w
+(them) 4531 6528 w
+(have) 4815 6528 w
+(been written to disk.) 720 6648 w
+(A) 970 6804 w
+10 /LucidaTypewriter f
+(consI) 1084 6804 w
+10 /LucidaSansUnicode00 f
+(reads) 1489 6804 w
+(from) 1800 6804 w
+(each) 2077 6804 w
+(console) 2346 6804 w
+(file) 2762 6804 w
+(\(typically) 2959 6804 w
+(a) 3435 6804 w
+(pipe) 3536 6804 w
+(posted) 3793 6804 w
+(in) 4170 6804 w
+10 /LucidaTypewriter f
+(/srv) 4307 6804 w
+10 /LucidaSansUnicode00 f
+(\),) 4595 6804 w
+(adding) 4706 6804 w
+(the) 720 6924 w
+(typed) 914 6924 w
+(characters) 1224 6924 w
+(to) 1763 6924 w
+(the) 1900 6924 w
+(input) 2094 6924 w
+(queue.) 2386 6924 w
+(The) 2788 6924 w
+10 /LucidaTypewriter f
+(cons) 3008 6924 w
+10 /LucidaSansUnicode00 f
+(process) 3335 6924 w
+(echoes) 3747 6924 w
+(input) 4122 6924 w
+(and) 4413 6924 w
+(runs) 4631 6924 w
+(the) 4885 6924 w
+(commands,) 720 7044 w
+(saving) 1329 7044 w
+(output) 1688 7044 w
+(in) 2058 7044 w
+(a) 2197 7044 w
+(ring) 2300 7044 w
+(buffer.) 2542 7044 w
+(Because) 2950 7044 w
+(there) 3387 7044 w
+(is) 3687 7044 w
+(only) 3816 7044 w
+(one) 4069 7044 w
+10 /LucidaTypewriter f
+(cons) 4297 7044 w
+10 /LucidaSansUnicode00 f
+(process,) 4634 7044 w
+(only) 720 7164 w
+(one) 965 7164 w
+(console) 1185 7164 w
+(command) 1597 7164 w
+(may) 2116 7164 w
+(be) 2357 7164 w
+(executing) 2517 7164 w
+(at) 3034 7164 w
+(a) 3167 7164 w
+(time.) 3263 7164 w
+(A) 3583 7164 w
+10 /LucidaTypewriter f
+(consO) 3693 7164 w
+10 /LucidaSansUnicode00 f
+(process) 4094 7164 w
+(copies) 4509 7164 w
+(this) 4861 7164 w
+(ring buffer to each console file.) 720 7284 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 7 7
+%%Page: 8 8
+/saveobj save def
+mark
+8 pagesetup
+10 /LucidaSansUnicode00 f
+(\255 8 \255) 2783 480 w
+(The) 970 840 w
+10 /LucidaTypewriter f
+(periodic) 1197 840 w
+10 /LucidaSansUnicode00 f
+(process) 1819 840 w
+(runs) 2239 840 w
+(periodic) 2502 840 w
+(events,) 2942 840 w
+(like) 3335 840 w
+(flushing) 3554 840 w
+(the) 3995 840 w
+(root) 4197 840 w
+(metadata) 4444 840 w
+(to) 4942 840 w
+(disk or taking snapshots of the file system.) 720 960 w
+10 /LucidaSans-Demi f
+(5.2.) 720 1200 w
+(Block cache) 962 1200 w
+10 /LucidaSansUnicode00 f
+(Fossil) 720 1356 w
+(maintains) 1028 1356 w
+(an) 1534 1356 w
+(in-memory) 1684 1356 w
+(block) 2262 1356 w
+(cache) 2557 1356 w
+(which) 2865 1356 w
+(holds) 3179 1356 w
+(both) 3479 1356 w
+(local) 3736 1356 w
+(disk) 3995 1356 w
+(blocks) 4230 1356 w
+(and) 4577 1356 w
+(Venti) 4791 1356 w
+(blocks.) 720 1476 w
+(Cache) 1130 1476 w
+(eviction) 1456 1476 w
+(follows) 1866 1476 w
+(a) 2244 1476 w
+(least recently used policy.) 2332 1476 w
+(Dirty blocks are restricted to) 3653 1476 w
+(at) 720 1596 w
+(most) 877 1596 w
+(half) 1184 1596 w
+(the) 1432 1596 w
+(cache.) 1652 1596 w
+(This) 2056 1596 w
+(can) 2326 1596 w
+(be) 2559 1596 w
+(changed) 2743 1596 w
+(by) 3219 1596 w
+(editing) 3399 1596 w
+10 /LucidaTypewriter f
+(DirtyPercentage) 3803 1596 w
+10 /LucidaSansUnicode00 f
+(in) 4949 1596 w
+10 /LucidaTypewriter f
+(dat.h) 720 1716 w
+10 /LucidaSansUnicode00 f
+(.) 1080 1716 w
+(The) 970 1872 w
+(block) 1201 1872 w
+(cache) 1513 1872 w
+(uses) 1838 1872 w
+(soft) 2108 1872 w
+(updates) 2344 1872 w
+([1]) 2781 1872 w
+(to) 2960 1872 w
+(ensure) 3108 1872 w
+(that) 3486 1872 w
+(the) 3728 1872 w
+(on-disk) 3934 1872 w
+(file) 4367 1872 w
+(system) 4569 1872 w
+(is) 4960 1872 w
+(always) 720 1992 w
+(self-consistent.) 1076 1992 w
+(Thus) 1905 1992 w
+(there) 2179 1992 w
+(is) 2467 1992 w
+(no) 2583 1992 w
+10 /LucidaSans-Italic f
+(halt) 2742 1992 w
+10 /LucidaSansUnicode00 f
+(console) 2968 1992 w
+(command) 3375 1992 w
+(and) 3889 1992 w
+(no) 4105 1992 w
+(need) 4264 1992 w
+(to) 4537 1992 w
+(check) 4671 1992 w
+(a) 4985 1992 w
+(file system that was shut down without halting.) 720 2112 w
+10 /LucidaSans-Demi f
+(5.3.) 720 2352 w
+(Archiving) 962 2352 w
+10 /LucidaSansUnicode00 f
+(A) 720 2508 w
+(background) 877 2508 w
+(process) 1543 2508 w
+(writes) 2005 2508 w
+(blocks) 2384 2508 w
+(in) 2785 2508 w
+(archival) 2964 2508 w
+(snapshots) 3426 2508 w
+(to) 4007 2508 w
+(Venti.) 4194 2508 w
+(Although) 4596 2508 w
+10 /LucidaTypewriter f
+(/archive/) 720 2628 w
+10 /LucidaSans-Italic f
+(yyyy) 1368 2628 w
+10 /LucidaTypewriter f
+(/) 1588 2628 w
+10 /LucidaSans-Italic f
+(mmdds) 1660 2628 w
+10 /LucidaSansUnicode00 f
+(is) 2057 2628 w
+(a) 2177 2628 w
+(copy) 2272 2628 w
+(of) 2539 2628 w
+(only) 2676 2628 w
+10 /LucidaTypewriter f
+(/active) 2919 2628 w
+10 /LucidaSansUnicode00 f
+(at) 3462 2628 w
+(the) 3593 2628 w
+(time) 3787 2628 w
+(of) 4041 2628 w
+(the) 4178 2628 w
+(snapshot,) 4372 2628 w
+(the) 4885 2628 w
+(archival) 720 2748 w
+(process) 1153 2748 w
+(archives) 1586 2748 w
+(the) 2043 2748 w
+(entire) 2258 2748 w
+(file) 2599 2748 w
+(tree) 2810 2748 w
+(rather) 3060 2748 w
+(than) 3412 2748 w
+(just) 3688 2748 w
+(the) 3928 2748 w
+(subtree) 4143 2748 w
+(rooted) 4569 2748 w
+(at) 4948 2748 w
+10 /LucidaTypewriter f
+(/active) 720 2868 w
+10 /LucidaSansUnicode00 f
+(.) 1224 2868 w
+(The) 1361 2868 w
+(snapshots) 1615 2868 w
+10 /LucidaTypewriter f
+(/snapshot/) 2181 2868 w
+10 /LucidaSans-Italic f
+(yyyy) 2901 2868 w
+10 /LucidaTypewriter f
+(/) 3121 2868 w
+10 /LucidaSans-Italic f
+(mmdd) 3193 2868 w
+10 /LucidaTypewriter f
+(/) 3501 2868 w
+10 /LucidaSans-Italic f
+(hhmm) 3573 2868 w
+10 /LucidaSansUnicode00 f
+(are) 3956 2868 w
+(stored) 4180 2868 w
+(as) 4561 2868 w
+(empty) 4739 2868 w
+(directories.) 720 2988 w
+(Once) 1338 2988 w
+(all) 1624 2988 w
+(the) 1777 2988 w
+(blocks) 1972 2988 w
+(have) 2325 2988 w
+(been) 2590 2988 w
+(archived,) 2867 2988 w
+(a) 3348 2988 w
+10 /LucidaTypewriter f
+(VtRoot) 3443 2988 w
+10 /LucidaSansUnicode00 f
+(header) 3915 2988 w
+(for) 4288 2988 w
+(the) 4467 2988 w
+(file) 4662 2988 w
+(sys\255) 4853 2988 w
+(tem) 720 3108 w
+(is) 964 3108 w
+(archived.) 1102 3108 w
+(The) 1633 3108 w
+(score) 1872 3108 w
+(of) 2190 3108 w
+(that) 2346 3108 w
+(header) 2594 3108 w
+(is) 2984 3108 w
+(recorded) 3121 3108 w
+(in) 3610 3108 w
+10 /LucidaTypewriter f
+(super.score) 3758 3108 w
+10 /LucidaSansUnicode00 f
+(and) 4607 3108 w
+(also) 4844 3108 w
+(printed) 720 3228 w
+(on) 1110 3228 w
+(the) 1272 3228 w
+(file) 1466 3228 w
+(server) 1656 3228 w
+(console.) 1992 3228 w
+(The) 2466 3228 w
+(score) 2686 3228 w
+(can) 2985 3228 w
+(used) 3192 3228 w
+(by) 3463 3228 w
+10 /LucidaSans-Italic f
+(flfmt) 3617 3228 w
+10 /LucidaSansUnicode00 f
+(to) 3893 3228 w
+(restore) 4031 3228 w
+(a) 4414 3228 w
+(file) 4509 3228 w
+(system) 4700 3228 w
+(\(see) 720 3348 w
+10 /LucidaSans-Italic f
+(fossil) 948 3348 w
+10 /LucidaSansUnicode00 f
+(\(4\)\).) 1199 3348 w
+10 /LucidaSans-Demi f
+(5.4.) 720 3588 w
+(Contrast with the old file server) 962 3588 w
+10 /LucidaSansUnicode00 f
+(The) 720 3744 w
+(most) 935 3744 w
+(obvious) 1211 3744 w
+(difference) 1624 3744 w
+(between) 2146 3744 w
+(Fossil) 2587 3744 w
+(and) 2896 3744 w
+(the) 3110 3744 w
+(old) 3299 3744 w
+(Plan) 3486 3744 w
+(9) 3721 3744 w
+(file) 3818 3744 w
+(server) 4004 3744 w
+([2]) 4336 3744 w
+(is) 4500 3744 w
+(that) 4615 3744 w
+(Fos\255) 4841 3744 w
+(sil) 720 3864 w
+(uses) 867 3864 w
+(a) 1125 3864 w
+(Venti) 1218 3864 w
+(server) 1505 3864 w
+(as) 1840 3864 w
+(its) 1984 3864 w
+(archival) 2139 3864 w
+(storage) 2551 3864 w
+(in) 2952 3864 w
+(place) 3081 3864 w
+(of) 3373 3864 w
+(a) 3509 3864 w
+(WORM) 3602 3864 w
+(juke) 3952 3864 w
+(box.) 4195 3864 w
+(There) 4481 3864 w
+(are) 4796 3864 w
+(a) 4985 3864 w
+(few other architectural differences to be aware of.) 720 3984 w
+(Fossil is a user-level program run on a standard kernel.) 970 4140 w
+(Fossil) 970 4296 w
+(does) 1282 4296 w
+(not) 1550 4296 w
+(have) 1747 4296 w
+(any) 2009 4296 w
+(way) 2216 4296 w
+(to) 2438 4296 w
+(concatenate,) 2574 4296 w
+(stripe,) 3227 4296 w
+(or) 3574 4296 w
+(mirror) 3714 4296 w
+(disk) 4058 4296 w
+(files.) 4297 4296 w
+(For) 4601 4296 w
+(func\255) 4795 4296 w
+(tionality) 720 4416 w
+(similar) 1156 4416 w
+(to) 1528 4416 w
+(the) 1671 4416 w
+(old) 1871 4416 w
+(file) 2069 4416 w
+(server) 2265 4416 w
+10 /LucidaSansUnicode20 f
+(\031) 2562 4416 w
+10 /LucidaSansUnicode00 f
+(s) 2594 4416 w
+(configuration) 2690 4416 w
+(strings,) 3384 4416 w
+(use) 3794 4416 w
+(the) 4008 4416 w
+(experimental) 4207 4416 w
+(file) 4889 4416 w
+(stack device \(see) 720 4536 w
+10 /LucidaSans-Italic f
+(devfs) 1571 4536 w
+10 /LucidaSansUnicode00 f
+(\(3\)\).) 1826 4536 w
+(Fossil speaks only 9P2000.) 970 4692 w
+(Old 9P \(aka 9P1\) is not supported.) 2345 4692 w
+10 /LucidaSans-Demi f
+(6.) 720 4968 w
+(References) 873 4968 w
+10 /LucidaSansUnicode00 f
+([1]) 720 5124 w
+(Gregory) 901 5124 w
+(R.) 1338 5124 w
+(Ganger,) 1485 5124 w
+(Marshall) 1917 5124 w
+(Kirk) 2377 5124 w
+(McKusick,) 2622 5124 w
+(Craig) 3159 5124 w
+(A.) 3467 5124 w
+(N.) 3620 5124 w
+(Soules,) 3779 5124 w
+(and) 4177 5124 w
+(Yale) 4410 5124 w
+(N.) 4665 5124 w
+(Patt.) 4824 5124 w
+10 /LucidaSansUnicode20 f
+(\030\030) 720 5244 w
+10 /LucidaSansUnicode00 f
+(Soft) 784 5244 w
+(Updates:) 1038 5244 w
+(A) 1529 5244 w
+(Solution) 1662 5244 w
+(to) 2121 5244 w
+(the) 2283 5244 w
+(Metadata) 2502 5244 w
+(Update) 3010 5244 w
+(Problem) 3417 5244 w
+(in) 3879 5244 w
+(File) 4034 5244 w
+(Systems,) 4266 5244 w
+10 /LucidaSansUnicode20 f
+(\031\031) 4692 5244 w
+10 /LucidaSans-Italic f
+(ACM) 4820 5244 w
+(Transactions on Computer Systems) 720 5364 w
+10 /LucidaSansUnicode00 f
+(, Vol 18., No. 2, May 2000, pp. 127) 2435 5364 w
+10 /LucidaSansUnicode20 f
+(\023) 4154 5364 w
+10 /LucidaSansUnicode00 f
+(153.) 4204 5364 w
+([2]) 720 5520 w
+(Sean) 898 5520 w
+(Quinlan,) 1174 5520 w
+10 /LucidaSansUnicode20 f
+(\030\030) 1632 5520 w
+10 /LucidaSansUnicode00 f
+(A) 1696 5520 w
+(Cached) 1815 5520 w
+(WORM) 2221 5520 w
+(File) 2584 5520 w
+(System,) 2802 5520 w
+10 /LucidaSansUnicode20 f
+(\031\031) 3177 5520 w
+10 /LucidaSans-Italic f
+(Software) 3291 5520 w
+10 /LucidaSansUnicode20 f
+(\024) 3719 5520 w
+10 /LucidaSans-Italic f
+(Practice) 3819 5520 w
+(and) 4256 5520 w
+(Experience) 4490 5520 w
+10 /LucidaSansUnicode00 f
+(,) 5008 5520 w
+(Vol 21., No 12., December 1991, pp. 1289) 720 5640 w
+10 /LucidaSansUnicode20 f
+(\023) 2799 5640 w
+10 /LucidaSansUnicode00 f
+(1299.) 2849 5640 w
+([3]) 720 5796 w
+(Sean) 907 5796 w
+(Quinlan) 1192 5796 w
+(and) 1627 5796 w
+(Sean) 1865 5796 w
+(Dorward,) 2150 5796 w
+10 /LucidaSansUnicode20 f
+(\030\030) 2653 5796 w
+10 /LucidaSansUnicode00 f
+(Venti:) 2717 5796 w
+(A) 3056 5796 w
+(New) 3184 5796 w
+(Approach) 3450 5796 w
+(to) 3974 5796 w
+(Archival) 4131 5796 w
+(Storage,) 4578 5796 w
+10 /LucidaSansUnicode20 f
+(\031\031) 4976 5796 w
+10 /LucidaSans-Italic f
+(Usenix Conference on File and Storage Technologies) 720 5916 w
+10 /LucidaSansUnicode00 f
+(, 2002.) 3237 5916 w
+cleartomark
+showpage
+saveobj restore
+%%EndPage: 8 8
+%%Trailer
+done
+%%DocumentFonts: LucidaSansUnicode20 LucidaSansUnicode00 LucidaSans-Demi LucidaSans-Italic LucidaTypewriter
+%%Pages: 8

+ 34 - 0
sys/lib/sysconfig/fl/boot

@@ -0,0 +1,34 @@
+#!/boot/rc -m /boot/rcmain
+
+cpuserver=no
+cd /boot
+cp '#r/rtc' '#c/time'
+bind -a '#I' /net
+bind -a '#l0' /net
+bind -a '#S' /dev
+bind '#p' /proc
+bind '#d' /fd
+bind -a /boot /
+ipconfig loopback /dev/null 127.1
+if(~ $cpuserver yes){
+	factotum -sfactotum -S
+}
+if not{
+	factotum -sfactotum -u
+	# add a key so mount and fossil can authenticate each other
+	# remove this key once factotum is initialized with other keys
+	factotum -g 'proto=p9sk1 user=rsc dom=localhost !password=localhost'
+}
+venti -c venti.conf -B 8m -C 8m -h tcp!127.1!8000 -I 8m -w -a tcp!127.1!17034 &
+sleep 10
+venti=tcp!127.0.0.1!17034
+fossil -c '. flproto'
+mount -c /srv/boot /root
+bind -ac /root /
+rootdir=/root
+rootspec=''
+if(~ $cpuserver yes)
+	/386/init -c
+if not
+	/386/init -t
+exec ./rc -m/boot/rcmain -i

+ 7 - 0
sys/lib/sysconfig/fl/flproto

@@ -0,0 +1,7 @@
+srv -p fscons
+srv boot
+fsys main config /dev/sdC0/fossil
+fsys main open
+fsys main
+users /active/adm/users
+snaptime -s 60 -a 0500

+ 8 - 0
sys/lib/sysconfig/fl/venti.conf

@@ -0,0 +1,8 @@
+index main
+isect /dev/sdC0/v.index0
+isect /dev/sdC0/v.index1
+isect /dev/sdC0/v.index2
+isect /dev/sdC0/v.index3
+
+arenas /dev/sdC0/v.arenas
+

+ 428 - 0
sys/man/4/fossil

@@ -0,0 +1,428 @@
+.TH FOSSIL 4
+.SH NAME
+fossil \- archival file server
+.SH SYNOPSIS
+.B fossil/fossil
+[
+.B -Dt
+]
+[
+.B -c
+.I cmd
+]...
+.PP
+.B fossil/flchk
+[
+.B -f
+]
+[
+.B -c
+.I ncache
+]
+[
+.B -h
+.I host
+]
+.I file
+.PP
+.B fossil/flfmt
+[
+.B -y
+]
+[
+.B -b
+.I blocksize
+]
+[
+.B -h
+.I host
+]
+[
+.B -l
+.I label
+]
+[
+.B -v
+.BI vac: score
+]
+.I file
+.SH DESCRIPTION
+Fossil
+will become the main file system for Plan 9.
+Unlike the Plan 9 file servers of old,
+fossil
+is a collection of user-space programs that run on a standard Plan 9 kernel.
+The name of the main fossil file server at Murray Hill is
+.BR ehime .
+The Plan 9 distribution file server,
+.BR sources ,
+is also a fossil server.
+.PP
+Fossil
+is structured as a magnetic disk write buffer
+backed by a Venti server for archival storage.
+It serves the Plan 9 protocol via TCP.
+A fossil file server conventionally presents
+three trees in the root directory of each file system:
+.BR active ,
+.BR archive ,
+and
+.BR snapshot .
+.B /active
+is the root of a conventional file system
+whose blocks are stored in a disk file.
+In a typical configuration, the file server periodically
+marks the entire file system copy-on-write, effectively
+taking a snapshot of the file system at that moment.
+This snapshot is made available in a name
+created from the date and time of the snapshot:
+.BI /snapshot/ yyyy / mmdd / hhmm \fR,
+where
+.I yyyy
+is the full year,
+.I mm
+is the month number,
+.I dd
+is the day number,
+.I hh
+is the hour,
+and
+.I mm
+is the minute.
+The snapshots in
+.B /snapshot
+are ephemeral: eventually they are deleted
+to reclaim the disk space they occupy.
+Long-lasting snapshots stored on a Venti server
+are kept in 
+.B /archive
+and also named from the date (though not the time) of the snapshot:
+.BI /archive/ yyyy / mmdds \fR,
+where
+.IR yyyy ,
+.IR mm ,
+and
+.I dd
+are year, month, and day as before,
+and
+.I s
+is a sequence number if more than one
+archival snapshot is done in a day.
+For the first snapshot,
+.I s
+is null.
+For the subsequent snapshots,
+.I s
+is
+.BR .1 ,
+.BR .2 ,
+.BR .3 ,
+etc.
+The root of the main file system that is frozen
+for the first archival snapshot of December 15, 2002
+will be named
+.BR /archive/2002/1215/ .
+.PP
+The attach name used in
+.IR mount (1)
+(see also
+.IR mount (2)
+and
+.IR attach (5))
+selects a file system to be served
+and optionally a subtree,
+in the format
+.IB fs \fR[\fB/ dir \fR].
+An empty attach name selects
+.BR main/active .
+.PP
+Fossil normally requires all users except
+.L none
+to provide authentication tickets on each
+.IR attach (5).
+To keep just anyone from connecting,
+.L none
+is only allowed to attach after another user
+has successfully attached on the same
+connection.
+The other user effectively acts as a chaperone
+for
+.LR none .
+Authentication can be disabled using the
+.B -A
+flag to
+.B open
+(see
+.IR fossilcons (8)).
+.PP
+The groups called
+.B noworld
+and
+.B write
+are special on the file server.
+Any user belonging to
+.B noworld
+has attenuated access privileges.
+Specifically, when checking such a user's access to files,
+the file's permission bits are first ANDed
+with 0770 for normal files and 0771 for directories.
+The effect is to deny world access permissions to
+.B noworld
+users, except when walking into directories.
+If the
+.B write
+group exists, then the file system appears read-only
+to users not in the group.
+This is used to make the Plan 9 distribution file server
+.RI ( sources.cs.bell-labs.com )
+readable by the world but writable only to the developers.
+.PP
+.I Fossil
+starts a new instance of the fossil file server.
+It is configured mainly through console commands,
+documented in
+.IR fossilcons (8).
+.PP
+The options are:
+.TP
+.B -D
+Toggle the debugging flag, which is initially off.
+When the flag is set, information about authentication
+and all protocol messages are written to standard error.
+.TP
+.B -t
+Start a file server console on
+.BR /dev/cons .
+If this option is given,
+.I fossil
+does not fork itself into the background.
+.TP
+.BI -c " cmd
+Execute the console command
+.IR cmd .
+This option may be repeated to give multiple
+commands.
+Typically the only commands given on the
+command line are
+.RB `` . \fIfile \fR,''
+which executes a file containing commands,
+and
+.RB `` "srv -p" \fIcons \fR,''
+which starts a file server console on
+.BI /srv/ cons \fR.
+See
+.IR fossilcons (8)
+for more information.
+.PD
+.PP
+.I Flchk
+checks the fossil file system stored in
+.I file
+for inconsistencies.
+.I Flchk
+prints fossil console commands that may be
+executed to take care of
+bad pointers
+.RB ( clrp ),
+bad entries
+.RB ( clre ),
+bad directory entries
+.RB ( clri ),
+and unreachable blocks
+.RB ( bfree ).
+Console commands are interspersed with
+more detailed commentary on the file system.
+The commands are distinguished by being prefixed with
+sharp signs.
+Note that all proposed fixes are rather drastic: offending
+pieces of file system are simply chopped off.
+.PP
+.I Flchk
+does
+.I not
+modify the file system, so it is safe to
+run concurrently with
+.IR fossil ,
+though in this case
+the list of unreachable
+blocks and any inconsistencies involving the active file system
+should be taken with a grain of salt.
+.PP
+The options are:
+.TP
+.B -f
+Fast mode.
+By default,
+.I flchk
+checks the entire file system image for consistency,
+which includes all the archives to Venti
+and can take a very long time.
+In fast mode,
+.I flchk
+avoids walking in Venti blocks
+whenever possible.
+.TP
+.BI -c " ncache
+Keep a cache of
+.I ncache
+(by default, 1000)
+file system blocks in memory during the check.
+.TP
+.BI -h " host
+Use
+.I host
+as the Venti server.
+.PD
+.PP
+.I Flfmt
+prepares
+.I file
+as a new fossil file system.
+The file system is initialized with three empty directories
+.BR active ,
+.BR archive ,
+and
+.BR snapshot ,
+as described above.
+The options are:
+.TP
+.B -y
+Yes mode.
+By default,
+.I flfmt
+will prompt for confirmation before formatting
+a file that already contains a fossil file system,
+and before formatting a file that is not served
+directly by a kernel device.
+If the
+.B -y
+flag is given, no such checks are made.
+.TP
+.BI -b " blocksize
+Set the file system block size (by default, 8192).
+.TP
+.BI -h " host
+Use
+.I host
+as the Venti server.
+.TP
+.BI -l " label
+Set the textual label on the file system to
+.IR label .
+The label is only a comment.
+.TP
+.BI "-v vac:" score
+Initialize the file system using the vac file
+system stored on Venti at
+.IR score .
+The score should have been generated by
+.I fossil
+rather than by
+.IR vac (1),
+so that the appropriate snapshot metadata is present.
+.PD
+.SH EXAMPLES
+.PP
+Place the root of the archive file system on
+.B /n/dump
+and show the modified times of the MIPS C compiler
+over all dumps in December 2002:
+.IP
+.EX
+9fs dump
+ls -l /n/dump/2002/12*/mips/bin/vc
+.EE
+.PP
+To get only one line of output for each version of the compiler:
+.IP
+.EX
+ls -lp /n/dump/2002/12*/mips/bin/vc | uniq
+.EE
+.ne 14
+.PP
+Initialize a new file system, start the server with permission
+checking turned off, create a users file, and mount the server:
+.IP
+.EX
+fossil/flfmt /dev/sdC0/fossil
+cat >flproto <<EOF
+fsys main config /dev/sdC0/fossil
+fsys main open -AWP
+fsys main
+create /active/adm adm sys d775
+create /active/adm/users adm sys 664
+users -w /active/adm/users
+srv -p fscons
+srv fossil
+EOF
+fossil/fossil -c '. flproto'
+mount /srv/fossil /n/fossil
+.EE
+.LP
+See the discussion of the
+.B users
+and
+.B uname
+commands in
+.IR fossilcons (8)
+for more about the user table.
+.ne 3
+.PP
+Perhaps because the disk has been corrupted or replaced,
+format a new file system using the last archive score printed
+on the console:
+.IP
+.EX
+fossil/flfmt -v vac:b9b3...5559 /dev/sdC0/fossil
+.EE
+.LP
+Note that while
+.B /snapshot
+will be lost,
+.B /active
+and
+.B /archive
+will be restored to their contents at the time of the
+last archival snapshot.
+.ne 3
+.PP
+Blindly accept the changes prescribed by
+.I flchk
+(not recommended):
+.IP
+.EX
+fossil/flchk /dev/sdC0/fossil | sed -n 's/^# //p' >>/srv/fscons
+.EE
+.LP
+A better strategy is to vet the output,
+filter out any suggestions you're not comfortable with,
+and then use the
+.I sed
+command to prepare the script.
+.SH SOURCE
+.B /sys/src/cmd/fossil
+.SH SEE ALSO
+.IR yesterday (1),
+.IR fs (4),
+.IR srv (4),
+.IR fossilcons (8),
+.IR venti (8)
+.SH BUGS
+It is likely that the disk format (but not the Venti format)
+will change in the future, to make the disk a full cache
+rather than just a write buffer.
+Changing to the new format will require reformatting
+the disk as in the example above,
+but note that this will preserve most of the file system
+(all but
+.BR /snapshot )
+with little effort.
+.LP
+The implementation of
+.IR flush (5)
+has a race that will be fixed shortly.
+It is possible (though, in practice, it happens
+very rarely) that 
+.I fossil
+will respond to a message after responding
+to a flush of that message.

+ 806 - 0
sys/man/8/fossilcons

@@ -0,0 +1,806 @@
+.TH FOSSILCONS 8
+.SH NAME
+fossilcons \- fossil console commands
+.SH SYNOPSIS
+.B
+con /srv/fscons
+.PP
+.PD 0.1
+.B .
+.I file
+.PP
+.B 9p
+.I T-message
+...
+.PP
+.B dflag
+.PP
+.B echo
+[
+.B -n
+]
+[
+.I arg
+...
+]
+.PP
+.B listen
+[
+.B -d
+]
+[
+.I address
+]
+.PP
+.B msg
+[
+.B -m
+.I nmsg
+]
+[
+.B -p
+.I nproc
+]
+.PP
+.B uname
+.I uname
+[
+.I uid
+|
+.BI : uid
+|
+.BI % newname
+|
+.BI = leader
+|
+.BI + member
+|
+.BI - member
+]
+.PP
+.B users
+[
+.B -dw
+|
+.I file
+]
+.PP
+.B srv
+[
+.B -dp
+]
+.I name
+.sp
+.PP
+.B fsys
+.I name
+.PP
+.B fsys
+.I name
+.B config
+.I device
+.PP
+.B fsys
+.I name
+.B venti
+.B -d
+|
+[
+.B -r
+]
+[
+.I host
+]
+.PP
+.B fsys
+.I name
+.B open
+[
+.B -APWr
+]
+[
+.B -c
+.I ncache
+]
+.PP
+[
+.B fsys
+.I name
+]
+.B close
+.PP
+.B fsys
+.I name
+.B unconfig
+.sp
+.PP
+[
+.B fsys
+.I name
+]
+.B bfree
+.I addr
+.PP
+[
+.B fsys
+.I name
+]
+.B block
+.I addr
+.I offset
+[
+.I count
+[
+.I data
+]]
+.PP
+[
+.B fsys
+.I name
+]
+.B clre
+.I addr
+.I offsets
+\&...
+.PP
+[
+.B fsys
+.I name
+]
+.B clri
+.I files
+\&...
+.PP
+[
+.B fsys
+.I name
+]
+.B clrp
+.I addr
+.I offset
+\&...
+.PP
+[
+.B fsys
+.I name
+]
+.B create
+.I path
+.I uid
+.I gid
+.I perm
+.PP
+[
+.B fsys
+.I name
+]
+.B epoch
+[
+.B -y
+]
+.I n
+.PP
+[
+.B fsys
+.I name
+]
+.B label
+.I addr
+[
+.I type
+.I state
+.I epoch
+.I epochclose
+.I tag
+]
+.PP
+[
+.B fsys
+.I name
+]
+.B remove
+.I files
+\&...
+.PP
+[
+.B fsys
+.I name
+]
+.B snap
+[
+.B -a
+]
+.PP
+[
+.B fsys
+.I name
+]
+.B snaptime
+[
+.B -a
+.I hhmm
+]
+[
+.B -s
+.I interval
+]
+.PP
+[
+.B fsys
+.I name
+]
+.B stat
+.IR files ...
+.PP
+[
+.B fsys
+.I name
+]
+.B sync
+.PP
+[
+.B fsys
+.I name
+]
+.B vac
+.I dir
+.PP
+[
+.B fsys
+.I name
+]
+.B wstat
+.I file
+.I elem
+.I uid
+.I gid
+.I perm
+.I length
+.SH DESCRIPTION
+These are configuration and maintenance commands
+executed at the console of a 
+.IR fossil (4)
+file server.
+The commands are split into three groups above:
+file server configuration,
+file system configuration,
+and file system maintenance.
+This manual page is split in the same way.
+.SS File server configuration
+.PP
+The
+dot
+.RI ( . )
+command
+reads
+.IR file ,
+treating each line as a command to be executed.
+Blank lines and lines beginning with a 
+.L #
+character are ignored.
+Note that
+.I file
+is a file in the name space in which
+.I fossil
+was started,
+.I not
+a file in any file system served by
+.IR fossil .
+.PP
+.I 9p
+executes a 9P transaction; the arguments
+are in the same format used by
+.IR 9pcon (8).
+.PP
+.I Dflag
+toggles the debug flag and prints the new setting.
+When the debug flag is set, all protocol messages
+and information about authentication are printed to
+standard error.
+.PP
+.I Echo
+behaves identically to
+.IR echo (1),
+writing to the console.
+.PP
+.I Listen
+manages the network addresses at which
+fossil is listening.
+With no arguments,
+.I listen
+prints the current list of addresses and their network directories.
+With one argument, listen
+.I address
+starts a new listener at
+.IR address ;
+the
+.B -d
+flag causes 
+.I listen
+to remove the listener
+at the given address.
+.PP
+.I Msg
+prints the maximum internal 9P message queue size
+and the maximum number of 9P processes to
+allocate for serving the queue.
+The
+.B -m
+and
+.B -p
+options set the two variables.
+.PP
+.I Uname
+displays or edits one entry in the in-memory user table.
+If the in-memory table was read from a file
+(see the
+.I users
+command below),
+the new table is written to
+.IR /active/adm/users .
+There is no distinction between users and groups:
+a user is a group with one member.
+The user table records a mapping between
+uids and unames, as well as recording the
+leader and members of each group.
+A
+.I uid
+is a string naming a user or group
+and stored in the on-disk data structures.
+A
+.I uname
+is the string naming a user or group and
+used in 9P protocol messages.
+There is a distinction so that unames can be
+safely reused, even though uids cannot.
+The first argument to
+.I uname
+is (confusingly) a
+uname
+.RI ` uname .'
+The second argument is an optional verb, one of:
+.TP
+.I uid
+add the user with uname
+.RI ` uname '
+and uid
+.RI ` uid ,'
+creating a home directory
+.BI /active/usr/ uname \fR.
+.TP
+.BI : uid
+add the user but do not create the home directory
+.TP
+.BI % newname
+rename
+.RI ` uname '
+to
+.RI ` newname ,'
+throughout the user table
+.TP
+.BI = leader
+set the group leader to the uname
+.IR leader .
+.TP
+.BI + member
+add the uname
+.RI ` member '
+to the group
+.TP
+.BI - member
+remove the uname
+.RI ` member '
+from the group
+.LP
+If the verb is omitted, the entire entry for
+.I uname
+is printed, in the form:
+.IP
+.IB uid : uname : leader :\fImembers
+.LP
+where
+.I members
+is a comma-separated list of unames.
+.PP
+.I Users
+reads
+.IR file
+from the file system named
+.B main
+and uses it to initialize the user table.
+With no arguments,
+.I users
+prints the file name of the current user table.
+.PP
+The user table is a list of lines in the form printed
+by the
+.I uname
+command.
+The
+.B -d
+flag resets the user table to the default:
+.IP
+.EX
+adm:adm:adm:sys
+none:none::
+noworld:noworld::
+sys:sys::
+.EE
+.PP
+These users are mandatory and must appear in all user files read.
+It is not possible to rename these unames.
+.PP
+The
+.B -w
+flag causes the file
+.I /active/adm/users
+to be written with the current in-memory table.
+The path
+.I /active/adm
+must exist.
+.PP
+.I Srv
+behaves like listen but uses
+.BI /srv/ name
+rather than a network address.
+With the
+.B -p
+flag, 
+.I srv 
+edits a list of console services rather than 9P services.
+.SS File system configuration
+.I Fsys
+sets the current file system to
+.IR name ,
+which must be configured and open (q.v.).
+The current file system name is
+displayed as the file server prompt.
+.PP
+.I Fsys
+takes as an optional argument
+(after
+.BR name )
+a command to execute on the named file system.
+Most commands require that the named file system
+be configured and open; these commands can be invoked
+without the
+.BI fsys " name
+prefix, in which case the current file system is used.
+A few commands
+.RB ( config ,
+.BR open ,
+and
+.BR unconfig )
+operate on unopened file systems; they require the prefix.
+.PP
+.I Config
+creates a new file system named
+.I name
+using disk file
+.IR device .
+This just adds an entry to fossil's internal table.
+.PP
+.I Venti
+establishes a connection to the Venti server
+.I host
+(by default, the environment variable
+.B $venti
+or the network variable
+.BR $venti )
+for use by the named file system.
+If no
+.I venti
+command is issued before
+.IR open ,
+the default Venti server will be used.
+The
+.B -d
+flag closes the connection to the Venti server
+and can only be used when the file system is closed.
+The
+.B -r
+flag redials the Venti server and can only
+be used when the file system is open;
+.I host
+must be given again.
+.PP
+.I Open
+opens the file system, reading the
+root and super blocks and allocating an in-memory
+cache for disk and Venti blocks.
+The options are:
+.TP
+.B -A
+run with no authentication
+.TP
+.B -P
+run with no permission checking
+.TP
+.B -W
+allow wstat to make arbitrary changes to the user and group fields
+.TP
+.B -r
+open the file system read-only
+.TP
+.BI -c " ncache
+allocate an in-memory cache of 
+.I ncache
+(by default, 1000)
+blocks
+.PP
+.I Close
+flushes all dirty file system blocks to disk
+and then closes the device file.
+.PP
+.I Unconfig
+removes the named file system (which must be closed)
+from fossil's internal table.
+.SS File system maintenance
+.I Bfree
+marks the block at disk address
+.I addr
+as available for allocation.
+Before doing so, it prints a
+.I label
+command (q.v.)
+that can be used to restore the block to its previous state.
+.PP
+.I Block
+displays (in hexadecimal)
+the contents of the block at disk address
+.IR addr ,
+starting at
+.I offset
+and continuing for
+.I count
+bytes or until the end of the block.
+If 
+.I data
+(also hexadecimal)
+is given, the contents in that range are
+replaced with data.
+When writing to a block,
+.I block
+prints the old and new contents,
+so that the change is easily undone.
+Editing blocks is discouraged.
+.PP
+.I Clre
+zeros an entry from a disk block.
+Before doing so, it prints a
+.I block
+command that can be used 
+to restore the entry.
+.PP
+.I Clri
+removes the internal directory entry
+and abandons storage associated with
+.IR files .
+It ignores the usual rules for sanity, such as checking against
+removing a non-empty directory.
+A subsequent
+.I flchk
+(see
+.IR fossil (4))
+will identify the abandoned storage so it can be reclaimed with
+.I bfree
+commands.
+The
+.I perm
+is formatted as described in the
+.I stat
+command;
+creating files or directories with the
+snapshot
+.RB ( s )
+bit set is not allowed.
+.PP
+.I Clrp
+zeros a pointer in a disk block.
+Before doing so, it prints a 
+.I block
+command that can be used to restore the entry.
+.PP
+.I Create
+creates a file on the current file system.
+.I Uid
+and
+.I gid
+are uids
+.RI ( not
+unames;
+see the discussion above, in the description
+of the 
+.I uname
+command).
+.I Perm
+is the low 9 bits of the permission mode of the file,
+in octal.
+The 
+.BR -a ,
+.BR -d ,
+and
+.B -l
+flags set the append-only, directory, and lock bits.
+.PP
+.I Epoch
+sets the low file system epoch.
+Snapshots in the file system are given increasing epoch numbers.
+The file system maintains a low and a high epoch number,
+and only allows access to snapshots in that range.
+The low epoch number can be moved forward to discard old snapshots
+and reclaim the disk space they occupy.
+(The high epoch number is always the epoch of the currently
+active file system.)
+.PP
+The command
+``\fLepoch\fI n''\fR
+is used to propose changing the low epoch to
+.IR n .
+In response, 
+.I fossil
+scans
+.B /archive
+and
+.B /snapshot
+for snapshots that would be discarded, printing their
+epoch numbers and the
+.I clri
+commands necessary to remove them.
+The epoch is changed only if no such paths are found.
+The usual sequence of commands is (1) run epoch to
+print the snapshots and their epochs, (2) clri some snapshots,
+(3) run epoch again.
+If the file system is completely full (there are no free blocks),
+.I clri
+may fail because it needs to allocate blocks.
+For this situation,
+the
+.B -y
+flag to epoch forces the epoch change even when
+it means discarding currently accessible snapshots.
+Note that when there are still snapshots in
+.BR /archive ,
+the archiver should take care
+of those snapshots (moving the blocks from disk to Venti)
+if you give it more time.
+.PP
+.I Label
+displays and edits the label associated with a block.
+When editing, a parameter of
+.B -
+means leave that field unchanged.
+Editing labels is discouraged.
+.PP
+.I Remove
+removes
+.IR files .
+.PP
+.I Snap
+takes a temporary snapshot of the current file system,
+recording it in 
+.BI /snapshot/ yyyy / mmdd / hhmm \fR,
+as described in 
+.IR fossil (4).
+The
+.B -a
+flag causes 
+.I snap
+to take an archival snapshot, recording it in
+.BI /archive/ yyyy / mmdds \fR,
+also described in
+.IR fossil (4).
+.PP
+.I Snaptime
+displays and edits the times at which snapshots are automatically
+taken.
+An archival snapshot is taken once a day, at
+.IR hhmm ,
+while temporary snapshots are taken at multiples of
+.I interval
+minutes.
+With no arguments,
+.I snaptime
+prints the current snapshot times.
+The
+.B -a
+and
+.B -s
+options set the archive and snapshot times.
+An
+.I hhmm
+or
+.I interval
+of
+.L none
+can be used to disable that kind of automatic snapshot.
+By default, both are disabled.
+.PP
+.I Stat
+displays metadata for each of the named
+.IR files ,
+in the form:
+.IP
+.EX
+stat \fIfile elem uid gid perm length
+.EE
+.LP
+(Replacing
+.B stat
+with
+.B wstat
+yields a valid command.)
+The
+.I perm
+is an octal number less than or equal to 777,
+prefixed with any of the following letters
+to indicate additional bits.
+.IP
+.EX
+.ta +4n
+a	\fRappend only
+d	\fRdirectory
+l	\fRexclusive use
+s	\fRis the root of a snapshot
+A	\fRMS-DOS archive bit
+G	\fRsetgid
+H	\fRMS-DOS hidden bit
+L	\fRsymbolic link
+S	\fRMS-DOS system bit
+T	\fRMS-DOS temporary bit
+U	\fRsetuid
+Y	\fRsticky
+.EE
+The bits denoted by capital letters are included
+to support non-Plan 9 systems.
+They are not made visible by the 9P protocol.
+.PP
+.I Sync
+writes dirty blocks in memory to the disk.
+.PP
+.I Vac
+prints the Venti score for a
+.IR vac (1)
+archive containing the tree rooted
+at
+.IR dir ,
+which must already be archived to Venti
+(typically
+.IR dir
+is a directory in the
+.B /archive
+tree).
+.PP
+.I Wstat
+changes the metadata of the named
+.IR file .
+Specifying
+.B -
+for any of the fields means ``don't change.''
+Attempts to change the
+.B d
+or
+.B s
+bits in the
+.I perm
+are silently ignored.
+.SH EXAMPLE
+.IR Sources ,
+the Plan 9 distribution file server,
+uses the following configuration file:
+.IP
+.EX
+srv -p fscons.sources
+srv -p fscons.sources.adduserd
+srv sources
+fsys main config /dev/sdC0/fossil.outside
+fsys main open -c 25600
+fsys main
+users /active/adm/users
+listen tcp!*!564
+msg -m 40 -p 10
+snaptime -a 0000 -s 15
+.EE
+.LP
+The second console is used by the daemon
+that creates new accounts.

+ 2 - 1
sys/src/9/pc/devether.c

@@ -228,7 +228,8 @@ etheroq(Ether* ether, Block* bp)
 
 	if(!loopback){
 		qbwrite(ether->oq, bp);
-		ether->transmit(ether);
+		if(ether->transmit != nil)
+			ether->transmit(ether);
 	} else
 		freeb(bp);
 

+ 122 - 0
sys/src/9/pc/pcfl

@@ -0,0 +1,122 @@
+dev
+	root
+	cons
+	arch
+	pnp		pci
+	env
+	pipe
+	proc
+	mnt
+	srv
+	dup
+	rtc
+	ssl
+	tls
+	cap
+	kprof
+
+	ether		netif
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum386 inferno
+
+	draw		screen vga vgax
+	mouse		mouse
+	vga
+
+	sd
+	floppy		dma
+	lpt
+
+	audio		dma
+	pccard
+	i82365		cis
+	uart
+	usb
+
+link
+	devpccard
+	devi82365
+	apm		apmjump
+	ether2000	ether8390
+	ether2114x	pci
+	ether589	etherelnk3
+	ether79c970	pci
+	ether8003	ether8390
+	ether8139
+	ether82557	pci
+	ether82543gc	pci
+	ether83815	pci
+	etherec2t	ether8390
+	etherga620	pci
+	etherelnk3	pci
+	ethersink
+	ethersmc	devi82365 cis
+	etherwavelan	wavelan devi82365 cis pci
+	ethermedium
+	pcmciamodem
+	netdevmedium
+	loopbackmedium
+	usbuhci
+
+misc
+	archmp		mp apic
+
+	sdata		pci sdscsi
+	sd53c8xx		pci sdscsi
+	sdmylex		pci sdscsi
+
+	uarti8250
+
+	vga3dfx		+cur
+	vgaark2000pv	+cur
+	vgabt485	=cur
+	vgaclgd542x	+cur
+	vgaclgd546x	+cur
+	vgact65545	+cur
+	vgacyber938x	+cur
+	vgaet4000	+cur
+	vgahiqvideo	+cur
+	vgai81x	+cur
+	vgamach64xx	+cur
+	vgamga2164w	+cur
+	vgamga4xx	+cur
+	vganeomagic	+cur
+	vganvidia	+cur
+	vgargb524	=cur
+	vgas3		+cur vgasavage
+	vgat2r4		+cur
+	vgatvp3020	=cur
+	vgatvp3026	=cur
+	vgavmware	+cur
+
+ip
+	il
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+
+port
+	int cpuserver = 0;
+
+boot boot #S/sdC0/
+	il
+	local
+
+bootdir
+	/386/bin/rc
+	/rc/lib/rcmain
+	/386/bin/bind
+	/386/bin/cat
+	/386/bin/cp
+	/386/bin/echo
+	/386/bin/mount
+	/386/bin/sleep
+	/386/bin/auth/factotum
+	/386/bin/fossil/fossil
+	/386/bin/ip/ipconfig
+	/386/bin/venti/venti
+	/sys/lib/sysconfig/fl/boot
+	/sys/lib/sysconfig/fl/flproto
+	/sys/lib/sysconfig/fl/venti.conf
+

+ 30 - 0
sys/src/9/port/dev.c

@@ -49,6 +49,36 @@ devdir(Chan *c, Qid qid, char *n, vlong length, char *user, long perm, Dir *db)
 	db->muid = user;
 }
 
+/*
+ * (here, Devgen is the prototype; devgen is the function in dev.c.)
+ * 
+ * a Devgen is expected to return the directory entry for ".."
+ * if you pass it s==DEVDOTDOT (-1).  otherwise...
+ * 
+ * there are two contradictory rules.
+ * 
+ * (i) if c is a directory, a Devgen is expected to list its children
+ * as you iterate s.
+ * 
+ * (ii) whether or not c is a directory, a Devgen is expected to list
+ * its siblings as you iterate s.
+ * 
+ * devgen always returns the list of children in the root
+ * directory.  thus it follows (i) when c is the root and (ii) otherwise.
+ * many other Devgens follow (i) when c is a directory and (ii) otherwise.
+ * 
+ * devwalk assumes (i).  it knows that devgen breaks (i)
+ * for children that are themselves directories, and explicitly catches them.
+ * 
+ * devstat assumes (ii).  if the Devgen in question follows (i)
+ * for this particular c, devstat will not find the necessary info.
+ * with our particular Devgen functions, this happens only for
+ * directories, so devstat makes something up, assuming
+ * c->name, c->qid, eve, DMDIR|0555.
+ * 
+ * devdirread assumes (i).  the callers have to make sure
+ * that the Devgen satisfies (i) for the chan being read.
+ */
 /*
  * the zeroth element of the table MUST be the directory itself for ..
 */

+ 2 - 2
sys/src/9/port/devfs.c

@@ -173,6 +173,8 @@ mconfig(char* a, long n)	// "name idev0 idev1"
 	char	*c;
 	vlong	size, start;
 
+	size = 0;
+	start = 0;
 	if (confstr[0] == 0)
 		seprint(confstr, confstr+sizeof(confstr), Cfgstr);
 	mp = nil;
@@ -395,7 +397,6 @@ catio(Fsdev *mp, int isread, void *a, long n, vlong off)
 	int	i;
 	Chan*	mc;
 	long	l, wl, res;
-	char*	s;
 	//print("catio %d %p %ld %lld\n", isread, a, n, off);
 	res = n;
 	for (i = 0; n >= 0 && i < mp->ndevs ; i++){
@@ -409,7 +410,6 @@ catio(Fsdev *mp, int isread, void *a, long n, vlong off)
 		else
 			l = n;
 		//print("\tdev %d %p %ld %lld\n", i, a, l, off);
-		s = a;
 
 		if (isread)
 			wl = devtab[mc->type]->read(mc, a, l, off);

+ 209 - 0
sys/src/cmd/fossil/9.h

@@ -0,0 +1,209 @@
+#include <auth.h>
+#include <fcall.h>
+
+enum {
+	NFidHash	= 503,
+};
+
+typedef struct Con Con;
+typedef struct DirBuf DirBuf;
+typedef struct Excl Excl;
+typedef struct Fid Fid;
+typedef struct Fsys Fsys;
+typedef struct Msg Msg;
+
+struct Msg {
+	uchar*	data;
+	u32int	msize;			/* actual size of data */
+	Fcall	t;
+	Fcall	r;
+	Con*	con;
+	int	flush;
+
+	Msg*	next;
+	Msg*	prev;
+};
+
+struct Con {
+	VtLock*	lock;
+	int	fd;
+	char*	name;
+	uchar*	data;			/* max, not negotiated */
+	u32int	msize;			/* negotiated with Tversion */
+	int	state;
+	int	aok;
+	Msg*	version;
+	int	isconsole;
+
+	Msg*	mhead;			/* active queue */
+	Msg*	mtail;
+	VtRendez* active;
+	int	nmsg;
+
+	VtLock*	fidlock;		/* */
+	Fid*	fidhash[NFidHash];
+	Fid*	fhead;
+	Fid*	ftail;
+	int	nfid;
+};
+
+enum {
+	CsDead,
+	CsNew,
+	CsDown,
+	CsInit,
+	CsUp,
+	CsMoribund,
+};
+
+struct Fid {
+	VtLock*	lock;
+	Con*	con;
+	u32int	fidno;
+	int	ref;			/* inc/dec under Con.fidlock */
+	int	flags;
+
+	int	open;
+	File*	file;
+	Qid	qid;
+	char*	uid;
+	char*	uname;
+	DirBuf*	db;
+	Excl*	excl;
+
+	VtLock*	alock;			/* Tauth/Tattach */
+	AuthRpc* rpc;
+	Fsys*	fsys;
+	char*	cuname;
+
+	Fid*	hash;			/* lookup by fidno */
+	Fid*	next;			/* clunk session with Tversion */
+	Fid*	prev;
+};
+
+enum {					/* Fid.flags and fidGet(..., flags) */
+	FidFCreate	= 0x01,
+	FidFWlock	= 0x02,
+};
+
+enum {					/* Fid.open */
+	FidOCreate	= 0x01,
+	FidORead	= 0x02,
+	FidOWrite	= 0x04,
+	FidORclose	= 0x08,
+};
+
+/*
+ * 9p.c
+ */
+extern int (*rFcall[Tmax])(Msg*);
+extern int validFileName(char*);
+
+/*
+ * 9auth.c
+ */
+extern int authCheck(Fcall*, Fid*, Fsys*);
+extern int authRead(Fid*, void*, int);
+extern int authWrite(Fid*, void*, int);
+
+/*
+ * 9dir.c
+ */
+extern void dirBufFree(DirBuf*);
+extern int dirDe2M(DirEntry*, uchar*, int);
+extern int dirRead(Fid*, uchar*, int, vlong);
+
+/*
+ * 9excl.c
+ */
+extern int exclAlloc(Fid*);
+extern void exclFree(Fid*);
+extern void exclInit(void);
+extern int exclUpdate(Fid*);
+
+/*
+ * 9fid.c
+ */
+extern void fidClunk(Fid*);
+extern Fid* fidGet(Con*, u32int, int);
+extern void fidInit(void);
+extern void fidPut(Fid*);
+
+/*
+ * 9fsys.c
+ */
+extern Fsys* fsysGet(char*);
+extern Fs* fsysGetFs(Fsys*);
+extern void fsysFsRlock(Fsys*);
+extern void fsysFsRUnlock(Fsys*);
+extern File* fsysGetRoot(Fsys*, char*);
+extern Fsys* fsysIncRef(Fsys*);
+extern int fsysInit(void);
+extern int fsysNoAuthCheck(Fsys*);
+extern int fsysNoPermCheck(Fsys*);
+extern void fsysPut(Fsys*);
+extern int fsysWstatAllow(Fsys*);
+
+/*
+ * 9lstn.c
+ */
+extern int lstnInit(void);
+
+/*
+ * 9proc.c
+ */
+extern Con* conAlloc(int, char*);
+extern void procInit(void);
+
+/*
+ * 9srv.c
+ */
+extern int srvInit(void);
+
+/*
+ * 9user.c
+ */
+extern int groupLeader(char*, char*);
+extern int groupMember(char*, char*);
+extern int groupWriteMember(char*);
+extern char* unameByUid(char*);
+extern char* uidByUname(char*);
+extern int usersInit(void);
+extern int validUserName(char*);
+
+extern char* uidadm;
+extern char* unamenone;
+extern char* uidnoworld;
+
+/*
+ * Ccli.c
+ */
+extern int cliAddCmd(char*, int (*)(int, char*[]));
+extern int cliError(char*, ...);
+extern int cliInit(void);
+extern int cliExec(char*);
+
+/*
+ * Ccmd.c
+ */
+extern int cmdInit(void);
+
+/*
+ * Ccons.c
+ */
+extern int consPrompt(char*);
+extern int consInit(void);
+extern int consOpen(int, int, int);
+extern int consTTY(void);
+extern int consWrite(char*, int);
+
+/*
+ * Clog.c
+ */
+extern int consPrint(char*, ...);
+extern int consVPrint(char*, va_list);
+
+/*
+ * fossil.c
+ */
+extern int Dflag;

+ 126 - 0
sys/src/cmd/fossil/9auth.c

@@ -0,0 +1,126 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * Service a read on the auth fid afid by asking the auth RPC
+ * conversation for its next output.
+ *
+ * Returns the number of bytes copied into data when the RPC has
+ * protocol data to hand back (ARok), 0 when the conversation is
+ * finished (ARdone) and afid->cuname/afid->uid have been filled in,
+ * and -1 on any failure (no RPC, phase error, buffer too small,
+ * missing or empty client uid, unknown user).
+ */
+int
+authRead(Fid* afid, void* data, int count)
+{
+	AuthInfo *info;
+	AuthRpc *rpc;
+	int status;
+
+	rpc = afid->rpc;
+	if(rpc == nil)
+		return -1;
+
+	status = auth_rpc(rpc, "read", nil, 0);
+	if(status == ARok){
+		/* protocol data for the client; it must fit in one read */
+		if(count < rpc->narg)
+			return -1;
+		memmove(data, rpc->arg, rpc->narg);
+		return rpc->narg;
+	}
+	if(status != ARdone)
+		return -1;
+
+	/* conversation complete: record who the client turned out to be */
+	info = auth_getinfo(rpc);
+	if(info == nil)
+		return -1;
+	if(info->cuid == nil || *info->cuid == '\0'){
+		auth_freeAI(info);
+		return -1;
+	}
+	assert(afid->cuname == nil);
+	afid->cuname = vtStrDup(info->cuid);
+	auth_freeAI(info);
+	if(Dflag)
+		fprint(2, "authRead cuname %s\n", afid->cuname);
+	assert(afid->uid == nil);
+	afid->uid = uidByUname(afid->cuname);
+	if(afid->uid == nil)
+		return -1;
+	return 0;
+}
+
+/*
+ * Feed count bytes written by the client on auth fid afid into the
+ * auth RPC conversation.  Returns count if the RPC accepted the data
+ * (ARok), -1 otherwise.  The fid must already have an RPC attached.
+ */
+int
+authWrite(Fid* afid, void* data, int count)
+{
+	int status;
+
+	assert(afid->rpc != nil);
+	status = auth_rpc(afid->rpc, "write", data, count);
+	return status == ARok ? count : -1;
+}
+
+/*
+ * Decide whether the attach described by t may proceed on fid for
+ * the file system fsys.
+ *
+ * Returns 1 and sets fid->uid if the attach is allowed, 0 otherwise.
+ * When a valid auth fid is supplied, fid->uname is replaced by the
+ * user name established by the authentication conversation and the
+ * connection is marked as authenticated (con->aok).
+ *
+ * The third parameter is an Fsys*, matching the prototype in 9.h and
+ * the type of afid->fsys it is compared against.
+ */
+int
+authCheck(Fcall* t, Fid* fid, Fsys* fsys)
+{
+	Fid *afid;
+	uchar buf[1];
+
+	/*
+	 * Can't lookup with FidWlock here as there may be
+	 * protocol to do. Use a separate lock to protect altering
+	 * the auth information inside afid.
+	 */
+	if((afid = fidGet(fid->con, t->afid, 0)) == nil){
+		/*
+		 * If no authentication is asked for, allow
+		 * "none" provided the connection has already
+		 * been authenticated.
+		 */
+		if(strcmp(fid->uname, unamenone) == 0 && fid->con->aok){
+			if((fid->uid = uidByUname(fid->uname)) == nil)
+				return 0;
+			return 1;
+		}
+
+		/*
+		 * The console is allowed to attach without
+		 * authentication.
+		 */
+		if(!fid->con->isconsole)
+			return 0;
+		if((fid->uid = uidByUname(fid->uname)) == nil)
+			return 0;
+		return 1;
+	}
+
+	/*
+	 * Check valid afid;
+	 * check uname and aname match.
+	 */
+	if(!(afid->qid.type & QTAUTH)){
+		fidPut(afid);
+		return 0;
+	}
+	if(strcmp(afid->uname, fid->uname) != 0 || afid->fsys != fsys){
+		fidPut(afid);
+		return 0;
+	}
+
+	/*
+	 * If the conversation has not yet yielded a client name,
+	 * give it one more (zero-length) read to let it finish.
+	 */
+	vtLock(afid->alock);
+	if(afid->cuname == nil){
+		if(authRead(afid, buf, 0) != 0 || afid->cuname == nil){
+			vtUnlock(afid->alock);
+			fidPut(afid);
+			return 0;
+		}
+	}
+	vtUnlock(afid->alock);
+
+	assert(fid->uid == nil);
+	if((fid->uid = uidByUname(afid->cuname)) == nil){
+		fidPut(afid);
+		return 0;
+	}
+
+	vtMemFree(fid->uname);
+	fid->uname = vtStrDup(afid->cuname);
+	fidPut(afid);
+
+	/*
+	 * Allow "none" once the connection has been authenticated.
+	 */
+	fid->con->aok = 1;
+
+	return 1;
+}

+ 121 - 0
sys/src/cmd/fossil/9dir.c

@@ -0,0 +1,121 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * one entry buffer for reading directories:
+ * de holds an entry that has been read from dee but not yet
+ * delivered to the client; valid is non-zero while de is in use
+ * and must eventually be released with deCleanup.
+ */
+struct DirBuf {
+	DirEntryEnum*	dee;
+	int		valid;
+	DirEntry	de;
+};
+
+/*
+ * Allocate a zeroed DirBuf and open a directory enumerator on file.
+ * The result of deeOpen is stored unchecked in the buffer.
+ */
+static DirBuf*
+dirBufAlloc(File* file)
+{
+	DirBuf *buf;
+
+	buf = vtMemAllocZ(sizeof *buf);
+	buf->dee = deeOpen(file);
+	return buf;
+}
+
+/*
+ * Release a DirBuf: clean up any pending entry, close the
+ * enumerator and free the buffer.  A nil db is ignored.
+ */
+void
+dirBufFree(DirBuf* db)
+{
+	if(db != nil){
+		if(db->valid)
+			deCleanup(&db->de);
+		deeClose(db->dee);
+		vtMemFree(db);
+	}
+}
+
+/*
+ * Convert the fossil directory entry de into a 9P stat record,
+ * marshalled into the np bytes at p.  Returns whatever convD2M
+ * returns for the constructed Dir.
+ *
+ * Ids with no known user name are rendered as "(id)".
+ */
+int
+dirDe2M(DirEntry* de, uchar* p, int np)
+{
+	Dir d;
+	int len;
+
+	memset(&d, 0, sizeof d);
+
+	d.name = de->elem;
+	d.length = de->size;
+	d.atime = de->atime;
+	d.mtime = de->mtime;
+
+	d.qid.path = de->qid;
+	d.qid.vers = de->mcount;
+
+	/* permission bits, then the mode flags mirrored into qid.type */
+	d.mode = de->mode & 0777;
+	if(de->mode & ModeDir){
+		d.mode |= DMDIR;
+		d.qid.type |= QTDIR;
+	}
+	if(de->mode & ModeAppend){
+		d.mode |= DMAPPEND;
+		d.qid.type |= QTAPPEND;
+	}
+	if(de->mode & ModeExclusive){
+		d.mode |= DMEXCL;
+		d.qid.type |= QTEXCL;
+	}
+	if(de->mode & ModeSnapshot){
+		d.mode |= DMMOUNT;
+		d.qid.type |= QTMOUNT;	/* just for debugging */
+	}
+
+	d.uid = unameByUid(de->uid);
+	if(d.uid == nil)
+		d.uid = smprint("(%s)", de->uid);
+	d.gid = unameByUid(de->gid);
+	if(d.gid == nil)
+		d.gid = smprint("(%s)", de->gid);
+	d.muid = unameByUid(de->mid);
+	if(d.muid == nil)
+		d.muid = smprint("(%s)", de->mid);
+
+	len = convD2M(&d, p, np);
+
+	/* the three names were allocated above; name still belongs to de */
+	vtMemFree(d.uid);
+	vtMemFree(d.gid);
+	vtMemFree(d.muid);
+
+	return len;
+}
+
+/*
+ * Read directory entries for fid into the count bytes at p.
+ * Returns the number of bytes filled (possibly 0 at end of
+ * directory) or -1 on error.
+ *
+ * Only whole entries are returned; an entry that does not fit is
+ * kept in the fid's DirBuf and delivered by the next call.
+ */
+int
+dirRead(Fid* fid, uchar* p, int count, vlong offset)
+{
+	int n, nb;
+	DirBuf *db;
+
+	/*
+	 * special case of rewinding a directory
+	 * otherwise ignore the offset
+	 */
+	if(offset == 0 && fid->db){
+		dirBufFree(fid->db);
+		fid->db = nil;
+	}
+
+	if(fid->db == nil){
+		/*
+		 * deeOpen can fail and leave a nil enumerator in the
+		 * buffer; don't hand that to deeRead.  The buffer holds
+		 * nothing else yet, so it can simply be freed.
+		 * (presumably deeOpen has set the error string - confirm)
+		 */
+		db = dirBufAlloc(fid->file);
+		if(db != nil && db->dee == nil){
+			vtMemFree(db);
+			db = nil;
+		}
+		if(db == nil)
+			return -1;
+		fid->db = db;
+	}
+
+	db = fid->db;
+
+	for(nb = 0; nb < count; nb += n){
+		if(!db->valid){
+			n = deeRead(db->dee, &db->de);
+			if(n < 0)
+				return -1;
+			if(n == 0)
+				break;
+			db->valid = 1;
+		}
+		/* a result of BIT16SZ or less means the entry did not fit */
+		n = dirDe2M(&db->de, p+nb, count-nb);
+		if(n <= BIT16SZ)
+			break;
+		db->valid = 0;
+		deCleanup(&db->de);
+	}
+
+	return nb;
+}

+ 126 - 0
sys/src/cmd/fossil/9excl.c

@@ -0,0 +1,126 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * All exclusive-use locks currently held, as a doubly-linked
+ * list protected by ebox.lock.
+ */
+static struct {
+	VtLock*	lock;
+
+	Excl*	head;
+	Excl*	tail;
+} ebox;
+
+/*
+ * One exclusive-use lock: identifies a file by (fsys, path) and
+ * records the time at which the lock expires.  fsys is set to nil
+ * when a timed-out lock is superseded by a new one (see exclAlloc).
+ */
+typedef struct Excl {
+	Fsys*	fsys;
+	uvlong	path;
+	ulong	time;
+
+	Excl*	next;
+	Excl*	prev;
+} Excl;
+
+/* seconds added to the current time whenever a lock is taken or refreshed */
+enum {
+	LifeTime	= (5*60),
+};
+
+/*
+ * Take the exclusive-use lock on the file referred to by fid.
+ * Returns 1 on success with fid->excl set, or 0 with the error
+ * "exclusive lock" if an unexpired lock on the same file exists.
+ * fid must not already hold a lock.
+ */
+int
+exclAlloc(Fid* fid)
+{
+	ulong now;
+	Excl *e;
+
+	assert(fid->excl == nil);
+
+	now = time(0L);
+	vtLock(ebox.lock);
+	for(e = ebox.head; e != nil; e = e->next){
+		if(e->fsys == fid->fsys && e->path == fid->qid.path){
+			/*
+			 * A lock on this file exists.  If it is still
+			 * live the file is busy; if it has expired,
+			 * disown it (clear fsys) so its holder's next
+			 * exclUpdate fails, and carry on to allocate
+			 * a new one.
+			 */
+			if(e->time >= now){
+				vtUnlock(ebox.lock);
+				vtSetError("exclusive lock");
+				return 0;
+			}
+			e->fsys = nil;
+		}
+	}
+
+	/*
+	 * No live lock: make a fresh one and append it to the list.
+	 */
+	e = vtMemAllocZ(sizeof(Excl));
+	e->fsys = fid->fsys;
+	e->path = fid->qid.path;
+	e->time = now+LifeTime;
+	e->next = nil;
+	e->prev = ebox.tail;
+	if(ebox.tail == nil)
+		ebox.head = e;
+	else
+		ebox.tail->next = e;
+	ebox.tail = e;
+	vtUnlock(ebox.lock);
+
+	fid->excl = e;
+	return 1;
+}
+
+/*
+ * Refresh the exclusive-use lock held by fid, extending it by
+ * LifeTime from now.  Returns 1 on success, or 0 with the error
+ * "exclusive lock broken" if the lock has expired or no longer
+ * belongs to fid's file system.
+ */
+int
+exclUpdate(Fid* fid)
+{
+	ulong now;
+	Excl *e;
+
+	e = fid->excl;
+	now = time(0L);
+
+	vtLock(ebox.lock);
+	if(e->time >= now && e->fsys == fid->fsys){
+		e->time = now+LifeTime;
+		vtUnlock(ebox.lock);
+		return 1;
+	}
+	vtUnlock(ebox.lock);
+
+	vtSetError("exclusive lock broken");
+	return 0;
+}
+
+/*
+ * Drop the exclusive-use lock held by fid, if any: detach it from
+ * the fid, unlink it from the global list and free it.
+ */
+void
+exclFree(Fid* fid)
+{
+	Excl *e;
+
+	e = fid->excl;
+	if(e == nil)
+		return;
+	fid->excl = nil;
+
+	vtLock(ebox.lock);
+	if(e->next == nil)
+		ebox.tail = e->prev;
+	else
+		e->next->prev = e->prev;
+	if(e->prev == nil)
+		ebox.head = e->next;
+	else
+		e->prev->next = e->next;
+	vtUnlock(ebox.lock);
+
+	vtMemFree(e);
+}
+
+/*
+ * Allocate the lock protecting the exclusive-use list.
+ * Must run before any other excl* routine is used.
+ */
+void
+exclInit(void)
+{
+	ebox.lock = vtLockAlloc();
+}

+ 286 - 0
sys/src/cmd/fossil/9fid.c

@@ -0,0 +1,286 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * Fid allocator state, protected by fbox.lock.
+ * free is a list of recycled Fids chained through Fid.hash,
+ * nfree its length (fidFree keeps at most 10), and inuse the
+ * number of Fids currently handed out by fidAlloc.
+ */
+static struct {
+	VtLock*	lock;
+
+	Fid*	free;
+	int	nfree;
+	int	inuse;
+} fbox;
+
+/*
+ * Lock fid for use by a 9P request: a write lock (recording flags
+ * in fid->flags) if FidFWlock is set, otherwise a read lock.
+ * Unless FidFCreate is set, also take a read lock on the fid's
+ * file system epoch lock.
+ */
+static void
+fidLock(Fid* fid, int flags)
+{
+	if(flags & FidFWlock){
+		vtLock(fid->lock);
+		fid->flags = flags;
+	}
+	else
+		vtRLock(fid->lock);
+
+	/*
+	 * Callers of file* routines are expected to lock fsys->fs->elk
+	 * before making any calls in order to make sure the epoch doesn't
+	 * change underfoot. With the exception of Tversion and Tattach,
+	 * that implies all 9P functions need to lock on entry and unlock
+	 * on exit. Fortunately, the general case is the 9P functions do
+	 * fidGet on entry and fidPut on exit, so this is a convenient place
+	 * to do the locking.
+	 * No fsys->fs->elk lock is required if the fid is being created
+	 * (Tauth, Tattach and Twalk). FidFCreate is always accompanied by
+	 * FidFWlock so the setting and testing of FidFCreate here and in
+	 * fidUnlock below is always done under fid->lock.
+	 * A side effect is that fidFree is called with the fid locked, and
+	 * must call fidUnlock only after it has disposed of any File
+	 * resources still held.
+	 */
+	if(!(flags & FidFCreate))
+		fsysFsRlock(fid->fsys);
+}
+
+/*
+ * Undo fidLock.  fid->flags tells which locks are held: it is
+ * non-zero only while the fid is write-locked, so a read-locked
+ * fid always releases the epoch lock and then the read lock.
+ */
+static void
+fidUnlock(Fid* fid)
+{
+	if(!(fid->flags & FidFCreate))
+		fsysFsRUnlock(fid->fsys);
+	if(fid->flags & FidFWlock){
+		/* clear the flags while still holding the write lock */
+		fid->flags = 0;
+		vtUnlock(fid->lock);
+		return;
+	}
+	vtRUnlock(fid->lock);
+}
+
+/*
+ * Obtain a Fid, either recycled from the free list or freshly
+ * allocated with its two locks, and reset its bookkeeping fields.
+ * The new Fid has fidno NOFID, ref 0 and open set to FidOCreate;
+ * all resource pointers are asserted to be nil.
+ */
+static Fid*
+fidAlloc(void)
+{
+	Fid *f;
+
+	vtLock(fbox.lock);
+	if(fbox.nfree <= 0){
+		f = vtMemAllocZ(sizeof(Fid));
+		f->lock = vtLockAlloc();
+		f->alock = vtLockAlloc();
+	}
+	else{
+		/* the free list is chained through the hash field */
+		f = fbox.free;
+		fbox.free = f->hash;
+		fbox.nfree--;
+	}
+	fbox.inuse++;
+	vtUnlock(fbox.lock);
+
+	/* fidFree must have left every resource pointer cleared */
+	assert(f->file == nil);
+	assert(f->uid == nil);
+	assert(f->uname == nil);
+	assert(f->db == nil);
+	assert(f->excl == nil);
+	assert(f->rpc == nil);
+	assert(f->fsys == nil);
+	assert(f->cuname == nil);
+
+	f->con = nil;
+	f->fidno = NOFID;
+	f->ref = 0;
+	f->flags = 0;
+	f->open = FidOCreate;
+	f->qid = (Qid){0, 0, 0};
+	f->hash = nil;
+	f->next = nil;
+	f->prev = nil;
+
+	return f;
+}
+
+/*
+ * Release everything a Fid holds and either put it on the free
+ * list (at most 10 are kept) or free it outright.
+ * Called with the fid still locked: the File and DirBuf are
+ * disposed of first, then the fid is unlocked, then the rest of
+ * the resources are released.
+ */
+static void
+fidFree(Fid* fid)
+{
+	if(fid->file != nil){
+		fileDecRef(fid->file);
+		fid->file = nil;
+	}
+	if(fid->db != nil){
+		dirBufFree(fid->db);
+		fid->db = nil;
+	}
+	/* must come before fsysPut below: fidUnlock uses fid->fsys */
+	fidUnlock(fid);
+
+	if(fid->uid != nil){
+		vtMemFree(fid->uid);
+		fid->uid = nil;
+	}
+	if(fid->uname != nil){
+		vtMemFree(fid->uname);
+		fid->uname = nil;
+	}
+	if(fid->excl != nil)
+		exclFree(fid);
+	if(fid->rpc != nil){
+		close(fid->rpc->afd);
+		auth_freerpc(fid->rpc);
+		fid->rpc = nil;
+	}
+	if(fid->fsys != nil){
+		fsysPut(fid->fsys);
+		fid->fsys = nil;
+	}
+	if(fid->cuname != nil){
+		vtMemFree(fid->cuname);
+		fid->cuname = nil;
+	}
+
+	vtLock(fbox.lock);
+	fbox.inuse--;
+	if(fbox.nfree < 10){
+		/* recycle: the free list is chained through hash */
+		fid->hash = fbox.free;
+		fbox.free = fid;
+		fbox.nfree++;
+	}
+	else{
+		vtLockFree(fid->alock);
+		vtLockFree(fid->lock);
+		vtMemFree(fid);
+	}
+	vtUnlock(fbox.lock);
+}
+
+/*
+ * Remove fid from its connection's hash table and fid list and
+ * decrement the connection's fid count.  The fid must have no
+ * references and must be present in the hash chain.
+ */
+static void
+fidUnHash(Fid* fid)
+{
+	Con *con;
+	Fid *cur, **link;
+
+	assert(fid->ref == 0);
+
+	con = fid->con;
+
+	/* walk the chain keeping a pointer to the link to patch */
+	link = &con->fidhash[fid->fidno % NFidHash];
+	while((cur = *link) != nil){
+		if(cur == fid){
+			*link = cur->hash;
+			break;
+		}
+		link = &cur->hash;
+	}
+	assert(cur == fid);
+
+	/* unlink from the per-connection doubly-linked list */
+	if(fid->next == nil)
+		con->ftail = fid->prev;
+	else
+		fid->next->prev = fid->prev;
+	if(fid->prev == nil)
+		con->fhead = fid->next;
+	else
+		fid->prev->next = fid->next;
+	fid->next = nil;
+	fid->prev = nil;
+
+	con->nfid--;
+}
+
+/*
+ * Look up fidno on connection con and return the Fid locked
+ * according to flags (see fidLock), with its reference count
+ * incremented.  With FidFCreate (which requires FidFWlock) the
+ * fid must not exist yet and a new one is created and hashed.
+ *
+ * Returns nil with the error string set if fidno is NOFID
+ * (no error set in that case), the fid is already in use on
+ * create, the fid is found but invalid, or it is not found.
+ */
+Fid*
+fidGet(Con* con, u32int fidno, int flags)
+{
+	Fid *fid, **hash;
+
+	if(fidno == NOFID)
+		return nil;
+
+	hash = &con->fidhash[fidno % NFidHash];
+	vtLock(con->fidlock);
+	for(fid = *hash; fid != nil; fid = fid->hash){
+		if(fid->fidno != fidno)
+			continue;
+
+		/*
+		 * Already in use is an error
+		 * when called from attach, clone or walk.
+		 */
+		if(flags & FidFCreate){
+			vtUnlock(con->fidlock);
+			vtSetError("fid in use");
+			return nil;
+		}
+		fid->ref++;
+		vtUnlock(con->fidlock);
+
+		/*
+		 * The fid may still be under construction (FidOCreate)
+		 * or have been clunked (fidno reset to NOFID) by the
+		 * time its lock is acquired.
+		 */
+		fidLock(fid, flags);
+		if((fid->open & FidOCreate) || fid->fidno == NOFID){
+			fidPut(fid);
+			vtSetError("fid invalid");
+			return nil;
+		}
+		return fid;
+	}
+
+	if((flags & FidFCreate) && (fid = fidAlloc()) != nil){
+		assert(flags & FidFWlock);
+		fid->con = con;
+		fid->fidno = fidno;
+		fid->ref = 1;
+
+		/* push on the hash chain, append to the connection's list */
+		fid->hash = *hash;
+		*hash = fid;
+		if(con->ftail != nil){
+			fid->prev = con->ftail;
+			con->ftail->next = fid;
+		}
+		else{
+			con->fhead = fid;
+			fid->prev = nil;
+		}
+		con->ftail = fid;
+		fid->next = nil;
+
+		con->nfid++;
+		vtUnlock(con->fidlock);
+
+		/*
+		 * The FidOCreate flag is used to prevent any
+		 * accidental access to the Fid between unlocking the
+		 * hash and acquiring the Fid lock for return.
+		 */
+		fidLock(fid, flags);
+		fid->open &= ~FidOCreate;
+		return fid;
+	}
+	vtUnlock(con->fidlock);
+
+	vtSetError("fid not found");
+	return nil;
+}
+
+/*
+ * Release a reference obtained with fidGet and unlock the fid.
+ * If this was the last reference to a fid that has already been
+ * clunked (fidno == NOFID), the fid is freed instead.
+ */
+void
+fidPut(Fid* fid)
+{
+	vtLock(fid->con->fidlock);
+	assert(fid->ref > 0);
+	fid->ref--;
+	vtUnlock(fid->con->fidlock);
+
+	/*
+	 * NOTE(review): ref and fidno are examined after fidlock
+	 * has been dropped; confirm that holding fid->lock is
+	 * enough to make this test safe.
+	 */
+	if(fid->ref == 0 && fid->fidno == NOFID){
+		fidFree(fid);
+		return;
+	}
+	fidUnlock(fid);
+}
+
+/*
+ * Retire a write-locked fid: drop the caller's reference, remove
+ * the fid from its connection's tables and mark it NOFID so later
+ * lookups cannot find it.  If other references remain the fid is
+ * only unlocked, and the last fidPut frees it; otherwise it is
+ * freed here.
+ */
+void
+fidClunk(Fid* fid)
+{
+	assert(fid->flags & FidFWlock);
+
+	vtLock(fid->con->fidlock);
+	assert(fid->ref > 0);
+	fid->ref--;
+	fidUnHash(fid);
+	fid->fidno = NOFID;
+	vtUnlock(fid->con->fidlock);
+
+	if(fid->ref > 0){
+		fidUnlock(fid);
+		return;
+	}
+	fidFree(fid);
+}
+
+/*
+ * Allocate the lock protecting the Fid allocator (fbox).
+ * Must run before any Fid is allocated.
+ */
+void
+fidInit(void)
+{
+	fbox.lock = vtLockAlloc();
+}

+ 1460 - 0
sys/src/cmd/fossil/9fsys.c

@@ -0,0 +1,1460 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+#include "9.h"
+
+typedef struct Fsys Fsys;
+
+/*
+ * A configured file system.  Entries live on the list headed by
+ * sbox.head and are looked up by name.  fs is nil until the file
+ * system has been opened (see fsysGet).
+ */
+typedef struct Fsys {
+	VtLock* lock;		/* held while testing fs in fsysGet */
+
+	char*	name;		/* lookup key */
+	char*	dev;		/* printed as "config" by printconfig */
+	char*	venti;		/* printed as "venti" by printconfig, if non-empty */
+
+	Fs*	fs;
+	VtSession* session;
+	int	ref;		/* starts at 1; see fsysIncRef, fsysPut */
+
+	int	noauth;		/* returned by fsysNoAuthCheck */
+	int	noperm;		/* returned by fsysNoPermCheck */
+	int	wstatallow;	/* returned by fsysWstatAllow */
+
+	Fsys*	next;
+} Fsys;
+
+/*
+ * The list of all configured file systems, protected by
+ * sbox.lock, plus the name of the console's current fsys.
+ */
+static struct {
+	VtLock*	lock;
+	Fsys*	head;
+	Fsys*	tail;
+
+	char*	curfsys;
+} sbox;
+
+static char *_argv0;
+#define argv0 _argv0
+
+static char EFsysBusy[] = "fsys: '%s' busy";
+static char EFsysExists[] = "fsys: '%s' already exists";
+static char EFsysNoCurrent[] = "fsys: no current fsys";
+static char EFsysNotFound[] = "fsys: '%s' not found";
+static char EFsysNotOpen[] = "fsys: '%s' not open";
+
+/*
+ * Find the file system called name ("main" if name is nil or
+ * empty) and return it with its reference count incremented.
+ * Returns nil and sets the error string if there is none.
+ * Unlike fsysGet, the file system need not be open.
+ */
+static Fsys*
+_fsysGet(char* name)
+{
+	Fsys *f;
+
+	if(name == nil || name[0] == '\0')
+		name = "main";
+
+	vtRLock(sbox.lock);
+	f = sbox.head;
+	while(f != nil && strcmp(name, f->name) != 0)
+		f = f->next;
+	if(f != nil)
+		f->ref++;
+	else
+		vtSetError(EFsysNotFound, name);
+	vtRUnlock(sbox.lock);
+
+	return f;
+}
+
+/*
+ * Console command "printconfig": print, for every configured
+ * file system, the commands that describe its device and (if
+ * set) its venti server.  Takes no options or arguments.
+ */
+static int
+cmdPrintConfig(int argc, char* argv[])
+{
+	Fsys *f;
+	char *usage = "usage: printconfig";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	if(argc)
+		return cliError(usage);
+
+	vtRLock(sbox.lock);
+	f = sbox.head;
+	while(f != nil){
+		consPrint("\tfsys %s config %s\n", f->name, f->dev);
+		if(f->venti && f->venti[0])
+			consPrint("\tfsys %s venti %q\n", f->name, f->venti);
+		f = f->next;
+	}
+	vtRUnlock(sbox.lock);
+	return 1;
+}
+
+/*
+ * Find the open file system called name and return it with a
+ * reference held; release it with fsysPut.  Returns nil and sets
+ * the error string if it does not exist or is not open.
+ */
+Fsys*
+fsysGet(char* name)
+{
+	Fsys *fsys;
+	int isopen;
+
+	fsys = _fsysGet(name);
+	if(fsys == nil)
+		return nil;
+
+	vtLock(fsys->lock);
+	isopen = fsys->fs != nil;
+	if(!isopen)
+		vtSetError(EFsysNotOpen, fsys->name);
+	vtUnlock(fsys->lock);
+
+	if(!isopen){
+		fsysPut(fsys);
+		return nil;
+	}
+	return fsys;
+}
+
+/*
+ * Take an additional reference to fsys; returns fsys for
+ * convenience.  Balance with fsysPut.
+ */
+Fsys*
+fsysIncRef(Fsys* fsys)
+{
+	vtLock(sbox.lock);
+	fsys->ref++;
+	vtUnlock(sbox.lock);
+
+	return fsys;
+}
+
+/*
+ * Release a reference to fsys.  Nothing is freed here, even
+ * when the count reaches zero.
+ */
+void
+fsysPut(Fsys* fsys)
+{
+	vtLock(sbox.lock);
+	assert(fsys->ref > 0);
+	fsys->ref--;
+	vtUnlock(sbox.lock);
+}
+
+/*
+ * Return the Fs underlying an open fsys.
+ * fsys must be non-nil and open.
+ */
+Fs*
+fsysGetFs(Fsys* fsys)
+{
+	assert(fsys != nil && fsys->fs != nil);
+
+	return fsys->fs;
+}
+
+/*
+ * Take a read lock on the file system's epoch lock (fs->elk).
+ * fsys->fs is dereferenced without a check.
+ */
+void
+fsysFsRlock(Fsys* fsys)
+{
+	vtRLock(fsys->fs->elk);
+}
+
+/*
+ * Release the read lock taken by fsysFsRlock.
+ */
+void
+fsysFsRUnlock(Fsys* fsys)
+{
+	vtRUnlock(fsys->fs->elk);
+}
+
+/* Return the file system's noauth setting. */
+int
+fsysNoAuthCheck(Fsys* fsys)
+{
+	return fsys->noauth;
+}
+
+/* Return the file system's noperm setting. */
+int
+fsysNoPermCheck(Fsys* fsys)
+{
+	return fsys->noperm;
+}
+
+/* Return the file system's wstatallow setting. */
+int
+fsysWstatAllow(Fsys* fsys)
+{
+	return fsys->wstatallow;
+}
+
+/*
+ * Parallel tables used by fsysModeString and fsysParseMode:
+ * modechars[i] is the letter for the mode flag modebits[i].
+ * modebits is terminated by 0.
+ */
+static char modechars[] = "YUGalLdHSATs";
+static ulong modebits[] = {
+	ModeSticky,
+	ModeSetUid,
+	ModeSetGid,
+	ModeAppend,
+	ModeExclusive,
+	ModeLink,
+	ModeDir,
+	ModeHidden,
+	ModeSystem,
+	ModeArchive,
+	ModeTemporary,
+	ModeSnapshot,
+	0
+};
+	
+/*
+ * Format mode into buf as the letters of the flags that are set
+ * (in modebits order) followed by the low nine permission bits
+ * in octal.  Returns buf, which the caller must make big enough.
+ */
+char*
+fsysModeString(ulong mode, char *buf)
+{
+	char *out;
+	ulong *bit;
+
+	out = buf;
+	for(bit = modebits; *bit != 0; bit++)
+		if(mode & *bit)
+			*out++ = modechars[bit - modebits];
+	sprint(out, "%luo", mode&0777);
+	return buf;
+}
+
+/*
+ * Parse a mode string of the form produced by fsysModeString:
+ * zero or more flag letters followed by an octal number no
+ * larger than 0777.  On success stores the result in *mode and
+ * returns 1; returns 0 (leaving *mode alone) on an unknown
+ * letter, a missing number, trailing text or too large a number.
+ */
+int
+fsysParseMode(char *s, ulong *mode)
+{
+	ulong flags, perm;
+	char *p;
+
+	/* leading flag letters, up to the first digit */
+	flags = 0;
+	while(*s < '0' || *s > '9'){
+		if(*s == 0)
+			return 0;
+		p = strchr(modechars, *s);
+		if(p == nil)
+			return 0;
+		flags |= modebits[p-modechars];
+		s++;
+	}
+
+	/* octal permissions, which must consume the rest of the string */
+	perm = strtoul(s, &p, 8);
+	if(*p != '\0' || perm > 0777)
+		return 0;
+
+	*mode = flags|perm;
+	return 1;
+}
+
+/*
+ * Return the root of an open fsys, or, if name is non-empty, the
+ * result of walking from the root to name (the reference to the
+ * root itself is dropped in that case).  fsys must be open.
+ */
+File*
+fsysGetRoot(Fsys* fsys, char* name)
+{
+	File *root, *sub;
+
+	assert(fsys != nil && fsys->fs != nil);
+
+	root = fsGetRoot(fsys->fs);
+	if(name != nil && strcmp(name, "") != 0){
+		sub = fileWalk(root, name);
+		fileDecRef(root);
+		return sub;
+	}
+	return root;
+}
+
+/*
+ * Create a new, unopened file system entry called name on device
+ * dev and append it to the global list.  The entry starts with a
+ * reference count of 1.  Returns nil and sets the error string if
+ * the name is already taken.
+ */
+static Fsys*
+fsysAlloc(char* name, char* dev)
+{
+	Fsys *f;
+
+	vtLock(sbox.lock);
+
+	/* names must be unique */
+	for(f = sbox.head; f != nil; f = f->next)
+		if(strcmp(f->name, name) == 0)
+			break;
+	if(f != nil){
+		vtSetError(EFsysExists, name);
+		vtUnlock(sbox.lock);
+		return nil;
+	}
+
+	f = vtMemAllocZ(sizeof(Fsys));
+	f->lock = vtLockAlloc();
+	f->name = vtStrDup(name);
+	f->dev = vtStrDup(dev);
+	f->ref = 1;
+
+	if(sbox.tail == nil)
+		sbox.head = f;
+	else
+		sbox.tail->next = f;
+	sbox.tail = f;
+
+	vtUnlock(sbox.lock);
+
+	return f;
+}
+
+/*
+ * Console command "[fsys name] close".
+ * Currently disabled: after argument checking it always reports
+ * an error, and the code following that return is unreachable.
+ */
+static int
+fsysClose(Fsys* fsys, int argc, char* argv[])
+{
+	char *usage = "usage: [fsys name] close";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc)
+		return cliError(usage);
+
+	return cliError("close isn't working yet; sync and then kill fossil");
+
+	/*
+	 * Oooh. This could be hard. What if fsys->ref != 1?
+	 * Also, fsClose() either does the job or panics, can we
+	 * gracefully detect it's still busy?
+	 *
+	 * More thought and care needed here.
+	 */
+	fsClose(fsys->fs);
+	fsys->fs = nil;
+	vtClose(fsys->session);
+	fsys->session = nil;
+
+	if(sbox.curfsys != nil && strcmp(fsys->name, sbox.curfsys) == 0){
+		sbox.curfsys = nil;
+		consPrompt(nil);
+	}
+
+	return 1;
+}
+
+/*
+ * Console command "[fsys name] vac path": run fsVac on the single
+ * path given and print the resulting score.  Returns 1 on
+ * success, 0 if fsVac fails.
+ */
+static int
+fsysVac(Fsys* fsys, int argc, char* argv[])
+{
+	uchar score[VtScoreSize];
+	char *usage = "usage: [fsys name] vac path";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 1)
+		return cliError(usage);
+
+	if(fsVac(fsys->fs, argv[0], score)){
+		consPrint("vac:%V\n", score);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Console command "[fsys name] snap [-a]": take a snapshot of the
+ * file system; -a requests an archival snapshot.  Returns 1 on
+ * success, 0 if fsSnapshot fails.
+ */
+static int
+fsysSnap(Fsys* fsys, int argc, char* argv[])
+{
+	int archive;
+	char *usage = "usage: [fsys name] snap [-a]";
+
+	archive = 0;
+	ARGBEGIN{
+	case 'a':
+		archive = 1;
+		break;
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc)
+		return cliError(usage);
+
+	if(fsSnapshot(fsys->fs, archive))
+		return 1;
+	return 0;
+}
+
+/*
+ * Console command "[fsys name] snaptime [-a hhmm] [-s minutes]".
+ * Starts from the current settings, applies any options, stores
+ * the result and prints the settings now in force.
+ *
+ * -a takes a four-digit hhmm, converted to minutes (hh*60+mm);
+ * -s takes a number of minutes.  Either accepts "none", which is
+ * stored as ~(u32int)0.
+ */
+static int
+fsysSnapTime(Fsys* fsys, int argc, char* argv[])
+{
+	char buf[40], *x;
+	int hh, mm;
+	u32int arch, snap;
+	char *usage = "usage: [fsys name] snaptime [-a hhmm] [-s minutes]";
+
+	snapGetTimes(fsys->fs->snap, &arch, &snap);
+	ARGBEGIN{
+	case 'a':
+		x = ARGF();
+		if(x == nil)
+			return cliError(usage);
+		if(strcmp(x, "none") == 0){
+			arch = ~(u32int)0;
+			break;
+		}
+		if(strlen(x) != 4 || strspn(x, "0123456789") != 4)
+			return cliError(usage);
+		hh = (x[0]-'0')*10 + x[1]-'0';
+		mm = (x[2]-'0')*10 + x[3]-'0';
+		if(hh >= 24 || mm >= 60)
+			return cliError(usage);
+		arch = hh*60+mm;
+		break;
+	case 's':
+		x = ARGF();
+		if(x == nil)
+			return cliError(usage);
+		if(strcmp(x, "none") == 0){
+			snap = ~(u32int)0;
+			break;
+		}
+		/* NOTE(review): atoi accepts non-numeric and negative input silently */
+		snap = atoi(x);
+		break;
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc > 0)
+		return cliError(usage);
+
+	/* read the values back so that what is printed is what is in force */
+	snapSetTimes(fsys->fs->snap, arch, snap);
+	snapGetTimes(fsys->fs->snap, &arch, &snap);
+	if(arch != ~(u32int)0)
+		sprint(buf, "-a %02d%02d", arch/60, arch%60);
+	else
+		sprint(buf, "-a none");
+	if(snap != ~(u32int)0)
+		sprint(buf+strlen(buf), " -s %d", snap);
+	else
+		sprint(buf+strlen(buf), " -s none");
+	consPrint("\tsnaptime %s\n", buf);
+	return 1;
+}
+
+/*
+ * Console command "[fsys name] sync": call fsSync on the file
+ * system.  Takes no options or arguments; always returns 1 once
+ * the arguments have been accepted.
+ */
+static int
+fsysSync(Fsys* fsys, int argc, char* argv[])
+{
+	char *usage = "usage: [fsys name] sync";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc > 0)
+		return cliError(usage);
+
+	fsSync(fsys->fs);
+
+	return 1;
+}
+
+/*
+ * Console command "[fsys name] remove path ...": remove each
+ * named file as user uidadm.  Failures are reported per path on
+ * the console and do not stop the command, which returns 1 once
+ * the arguments have been accepted.
+ */
+static int
+fsysRemove(Fsys* fsys, int argc, char* argv[])
+{
+	File *f;
+	char *usage = "usage: [fsys name] remove path ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc == 0)
+		return cliError(usage);
+
+	vtRLock(fsys->fs->elk);
+	for(; argc > 0; argc--, argv++){
+		f = fileOpen(fsys->fs, argv[0]);
+		if(f == nil){
+			consPrint("%s: %R\n", argv[0]);
+			continue;
+		}
+		if(!fileRemove(f, uidadm))
+			consPrint("%s: %R\n", argv[0]);
+		fileDecRef(f);
+	}
+	vtRUnlock(fsys->fs->elk);
+
+	return 1;
+}
+
+/*
+ * Console command "[fsys name] clri path ...": call fileClri on
+ * each named path as user uidadm.  Failures are reported per path
+ * on the console and do not stop the command, which returns 1
+ * once the arguments have been accepted.
+ */
+static int
+fsysClri(Fsys* fsys, int argc, char* argv[])
+{
+	char *usage = "usage: [fsys name] clri path ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc == 0)
+		return cliError(usage);
+
+	vtRLock(fsys->fs->elk);
+	for(; argc > 0; argc--, argv++){
+		if(fileClri(fsys->fs, argv[0], uidadm))
+			continue;
+		consPrint("clri %s: %R\n", argv[0]);
+	}
+	vtRUnlock(fsys->fs->elk);
+
+	return 1;
+}
+
+/*
+ * Inspect and edit the labels for blocks on disk.
+ *
+ * Console command
+ *	[fsys name] label addr [type state epoch epochClose tag]
+ * With one argument the label of the block at addr is printed.
+ * With six, each field given as something other than "-" is
+ * replaced, the new label is printed and written out, retrying
+ * the write a few times at 5 second intervals before giving up.
+ * Returns 1 on success, 0 if the block or its label block could
+ * not be obtained.
+ */
+static int
+fsysLabel(Fsys* fsys, int argc, char* argv[])
+{
+	Fs *fs;
+	Label l;
+	int n, r;
+	u32int addr;
+	Block *b, *bb;
+	char *usage = "usage: [fsys name] label addr [type state epoch epochClose tag]";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 1 && argc != 6)
+		return cliError(usage);
+
+	r = 0;
+	vtRLock(fsys->fs->elk);
+
+	fs = fsys->fs;
+	addr = strtoul(argv[0], 0, 0);
+	b = cacheLocal(fs->cache, PartData, addr, OReadOnly);
+	if(b == nil)
+		goto Out0;
+
+	l = b->l;
+	consPrint("%slabel %#ux %ud %ud %ud %ud %#x\n",
+		argc==6 ? "old: " : "", addr, l.type, l.state,
+		l.epoch, l.epochClose, l.tag);
+
+	if(argc == 6){
+		/* "-" leaves the corresponding field unchanged */
+		if(strcmp(argv[1], "-") != 0)
+			l.type = atoi(argv[1]);
+		if(strcmp(argv[2], "-") != 0)
+			l.state = atoi(argv[2]);
+		if(strcmp(argv[3], "-") != 0)
+			l.epoch = strtoul(argv[3], 0, 0);
+		if(strcmp(argv[4], "-") != 0)
+			l.epochClose = strtoul(argv[4], 0, 0);
+		if(strcmp(argv[5], "-") != 0)
+			l.tag = strtoul(argv[5], 0, 0);
+
+		consPrint("new: label %#ux %ud %ud %ud %ud %#x\n",
+			addr, l.type, l.state, l.epoch, l.epochClose, l.tag);
+		bb = _blockSetLabel(b, &l);
+		if(bb == nil)
+			goto Out1;
+		n = 0;
+		for(;;){
+			if(blockWrite(bb)){
+				/* wait for the write that was just started to finish */
+				while(bb->iostate != BioClean){
+					assert(bb->iostate == BioWriting);
+					vtSleep(bb->ioready);
+				}
+				break;
+			}
+			consPrint("blockWrite: %R\n");
+			if(n++ >= 5){
+				/* NOTE(review): the command still returns 1 after giving up */
+				consPrint("giving up\n");
+				break;
+			}
+			sleep(5*1000);
+		}
+		blockPut(bb);
+	}
+	r = 1;
+Out1:
+	blockPut(b);
+Out0:
+	vtRUnlock(fs->elk);
+
+	return r;
+}
+
+/*
+ * Inspect and edit the blocks on disk.
+ *
+ * Console command
+ *	[fsys name] block addr offset [count [data]]
+ * Prints count bytes (by default, everything to the end of the
+ * block) starting at offset in the block at addr.  If data is
+ * given it must be exactly 2*count hex digits, which replace the
+ * bytes printed; the block is then marked dirty.
+ *
+ * Returns 1 on success and 0, with the error string set, on a
+ * bad offset, an unreadable block, or malformed data.
+ */
+static int
+fsysBlock(Fsys* fsys, int argc, char* argv[])
+{
+	Fs *fs;
+	char *s;
+	Block *b;
+	uchar *buf;
+	u32int addr;
+	int c, count, i, offset, r;
+	char *usage = "usage: [fsys name] block addr offset [count [data]]";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc < 2 || argc > 4)
+		return cliError(usage);
+
+	fs = fsys->fs;
+	addr = strtoul(argv[0], 0, 0);
+	offset = strtoul(argv[1], 0, 0);
+	if(offset < 0 || offset >= fs->blockSize){
+		vtSetError("bad offset");
+		return 0;
+	}
+	if(argc > 2)
+		count = strtoul(argv[2], 0, 0);
+	else
+		count = 100000000;
+	/*
+	 * Clamp to what remains of the block after offset.
+	 * Comparing against blockSize-offset rather than adding
+	 * offset+count avoids integer overflow; a count that has
+	 * wrapped negative is treated as too large.
+	 */
+	if(count < 0 || count > fs->blockSize - offset)
+		count = fs->blockSize - offset;
+
+	vtRLock(fs->elk);
+
+	b = cacheLocal(fs->cache, PartData, addr, argc==4 ? OReadWrite : OReadOnly);
+	if(b == nil){
+		vtSetError("cacheLocal %#ux: %R", addr);
+		vtRUnlock(fs->elk);
+		return 0;
+	}
+
+	consPrint("\t%sblock %#ux %ud %ud %.*H\n",
+		argc==4 ? "old: " : "", addr, offset, count, count, b->data+offset);
+
+	r = 0;
+	if(argc == 4){
+		s = argv[3];
+		if(strlen(s) != 2*count){
+			vtSetError("bad data count");
+			goto Out;
+		}
+		/* decode into a scratch buffer so bad hex leaves the block untouched */
+		buf = vtMemAllocZ(count);
+		for(i = 0; i < count*2; i++){
+			if(s[i] >= '0' && s[i] <= '9')
+				c = s[i] - '0';
+			else if(s[i] >= 'a' && s[i] <= 'f')
+				c = s[i] - 'a' + 10;
+			else if(s[i] >= 'A' && s[i] <= 'F')
+				c = s[i] - 'A' + 10;
+			else{
+				vtSetError("bad hex");
+				vtMemFree(buf);
+				goto Out;
+			}
+			/* even positions are the high nibble */
+			if((i & 1) == 0)
+				c <<= 4;
+			buf[i>>1] |= c;
+		}
+		memmove(b->data+offset, buf, count);
+		vtMemFree(buf);
+		consPrint("\tnew: block %#ux %ud %ud %.*H\n",
+			addr, offset, count, count, b->data+offset);
+		blockDirty(b);
+	}
+	r = 1;
+
+Out:
+	blockPut(b);
+	vtRUnlock(fs->elk);
+
+	return r;
+}
+
+/*
+ * Free a disk block.
+ *
+ * Console command "[fsys name] bfree addr ...": for each address,
+ * print the block's current label and then relabel it as free.
+ * A malformed address stops the command with a return of 0;
+ * blocks that cannot be loaded or relabelled are reported and
+ * skipped.  Otherwise returns 1.
+ */
+static int
+fsysBfree(Fsys* fsys, int argc, char* argv[])
+{
+	Fs *fs;
+	Label l;
+	char *p;
+	Block *b;
+	u32int addr;
+	char *usage = "usage: [fsys name] bfree addr ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc == 0)
+		return cliError(usage);
+
+	fs = fsys->fs;
+	vtRLock(fs->elk);
+	/*
+	 * Advancing in the loop header means the "continue" below
+	 * moves on to the next address instead of retrying the
+	 * same one forever.
+	 */
+	for(; argc > 0; argc--, argv++){
+		addr = strtoul(argv[0], &p, 0);
+		if(*p != '\0'){
+			/* print the offending text, not the parsed number */
+			consPrint("bad address - '%s'\n", argv[0]);
+			/* syntax error; let's stop */
+			vtRUnlock(fs->elk);
+			return 0;
+		}
+		b = cacheLocal(fs->cache, PartData, addr, OReadOnly);
+		if(b == nil){
+			consPrint("loading %#ux: %R\n", addr);
+			continue;
+		}
+		l = b->l;
+		consPrint("label %#ux %ud %ud %ud %ud %#x\n",
+			addr, l.type, l.state, l.epoch, l.epochClose, l.tag);
+		l.state = BsFree;
+		l.type = BtMax;
+		l.tag = 0;
+		l.epoch = 0;
+		l.epochClose = 0;
+		if(!blockSetLabel(b, &l))
+			consPrint("freeing %#ux: %R\n", addr);
+		blockPut(b);
+	}
+	vtRUnlock(fs->elk);
+
+	return 1;
+}
+
+/*
+ * Zero an entry or a pointer.
+ *
+ * Shared body of the console commands
+ *	[fsys name] clre addr offset ...	(ch == 'e')
+ *	[fsys name] clrp addr offset ...	(ch == 'p')
+ * For 'e' the block at addr must be a directory block and each
+ * offset names a VtEntrySize slot to overwrite with a packed
+ * zeroed Entry; for 'p' the block must be a pointer block
+ * (neither BtDir nor BtData) and each offset names a VtScoreSize
+ * slot to overwrite with vtZeroScore.  The old contents of each
+ * slot are printed first.  Offsets outside the block are reported
+ * and skipped.
+ *
+ * Returns 1 on success, 0 with the error string set if the block
+ * cannot be loaded or has the wrong type.
+ */
+static int
+fsysClrep(Fsys* fsys, int argc, char* argv[], int ch)
+{
+	Fs *fs;
+	Entry e;
+	Block *b;
+	u32int addr;
+	int i, max, offset, sz;
+	uchar zero[VtEntrySize];
+	char *usage = "usage: [fsys name] clr%c addr offset ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage, ch);
+	}ARGEND
+	if(argc < 2)
+		return cliError(usage, ch);
+
+	fs = fsys->fs;
+	vtRLock(fs->elk);
+
+	/* the block is always modified below, so open it for writing */
+	addr = strtoul(argv[0], 0, 0);
+	b = cacheLocal(fs->cache, PartData, addr, OReadWrite);
+	if(b == nil){
+		vtSetError("cacheLocal %#ux: %R", addr);
+		vtRUnlock(fs->elk);
+		return 0;
+	}
+
+	switch(ch){
+	default:
+		vtSetError("clrep");
+		goto Err;
+	case 'e':
+		if(b->l.type != BtDir){
+			vtSetError("wrong block type");
+			goto Err;
+		}
+		sz = VtEntrySize;
+		memset(&e, 0, sizeof e);
+		entryPack(&e, zero, 0);
+		break;
+	case 'p':
+		if(b->l.type == BtDir || b->l.type == BtData){
+			vtSetError("wrong block type");
+			goto Err;
+		}
+		sz = VtScoreSize;
+		memmove(zero, vtZeroScore, VtScoreSize);
+		break;
+	}
+	max = fs->blockSize/sz;
+
+	for(i = 1; i < argc; i++){
+		offset = atoi(argv[i]);
+		if(offset < 0){
+			consPrint("\toffset %d is negative\n", offset);
+			continue;
+		}
+		if(offset >= max){
+			consPrint("\toffset %d too large (>= %d)\n", offset, max);
+			continue;
+		}
+		consPrint("\tblock %#ux %d %d %.*H\n", addr, offset*sz, sz, sz, b->data+offset*sz);
+		memmove(b->data+offset*sz, zero, sz);
+	}
+	blockDirty(b);
+	blockPut(b);
+	vtRUnlock(fs->elk);
+
+	return 1;
+
+Err:
+	/* failures after the block was obtained must release it too */
+	blockPut(b);
+	vtRUnlock(fs->elk);
+	return 0;
+}
+
+/*
+ * Console command "clre": fsysClrep in entry mode ('e').
+ */
+static int
+fsysClre(Fsys* fsys, int argc, char* argv[])
+{
+	return fsysClrep(fsys, argc, argv, 'e');
+}
+
+/*
+ * Console command "clrp": fsysClrep in pointer mode ('p').
+ */
+static int
+fsysClrp(Fsys* fsys, int argc, char* argv[])
+{
+	return fsysClrep(fsys, argc, argv, 'p');
+}
+
+/*
+ * Walk the directory f, recursing into subdirectories, and print a
+ * suggested "clri" line for every entry with ModeSnapshot set whose
+ * source Entry has a non-zero snap field below elo.
+ * s is the path of f and is used only to build messages.
+ * Returns the number of such snapshots found; problems with individual
+ * entries are reported on the console and otherwise ignored.
+ */
+static int
+fsysEsearch1(File* f, char* s, u32int elo)
+{
+	int n, r;
+	DirEntry de;
+	DirEntryEnum *dee;
+	File *ff;
+	Entry e, ee;
+	char *t;
+
+	dee = deeOpen(f);
+	if(dee == nil)
+		return 0;
+
+	n = 0;
+	for(;;){
+		r = deeRead(dee, &de);
+		if(r < 0){
+			/* de is not known to be filled in when the read fails */
+			consPrint("\tdeeRead %s: %R\n", s);
+			break;
+		}
+		if(r == 0)
+			break;
+		if(de.mode & ModeSnapshot){
+			if((ff = fileWalk(f, de.elem)) == nil)
+				consPrint("\tcannot walk %s/%s: %R\n", s, de.elem);
+			else{
+				if(!fileGetSources(ff, &e, &ee, 0))
+					consPrint("\tcannot get sources for %s/%s: %R\n", s, de.elem);
+				else if(e.snap != 0 && e.snap < elo){
+					consPrint("\t%ud\tclri %s/%s\n", e.snap, s, de.elem);
+					n++;
+				}
+				fileDecRef(ff);
+			}
+		}
+		else if(de.mode & ModeDir){
+			if((ff = fileWalk(f, de.elem)) == nil)
+				consPrint("\tcannot walk %s/%s: %R\n", s, de.elem);
+			else{
+				/* build "s/elem" for the recursive call's messages */
+				t = vtMemAlloc(strlen(s)+1+strlen(de.elem)+1);
+				strcpy(t, s);
+				strcat(t, "/");
+				strcat(t, de.elem);
+				n += fsysEsearch1(ff, t, elo);
+				vtMemFree(t);
+				fileDecRef(ff);
+			}
+		}
+		deCleanup(&de);
+	}
+	deeClose(dee);
+
+	return n;
+}
+			
+/*
+ * Open path in fs and, if it names a directory, search it with
+ * fsysEsearch1 for snapshots with epoch below elo.
+ * Returns the number found; 0 if path cannot be opened or stat'ed,
+ * or is not a directory.
+ */
+static int
+fsysEsearch(Fs* fs, char* path, u32int elo)
+{
+	int found, isdir;
+	File *root;
+	DirEntry de;
+
+	if((root = fileOpen(fs, path)) == nil)
+		return 0;
+
+	found = 0;
+	if(!fileGetDir(root, &de))
+		consPrint("\tfileGetDir %s failed: %R\n", path);
+	else{
+		/* only the mode is needed; release the entry straight away */
+		isdir = (de.mode & ModeDir) != 0;
+		deCleanup(&de);
+		if(isdir)
+			found = fsysEsearch1(root, path, elo);
+	}
+	fileDecRef(root);
+
+	return found;
+}
+
+/*
+ * Console command "epoch [[-y] low]".
+ * Prints the current low and high epochs, counts the snapshots under
+ * /archive and /snapshot whose epoch is below low (all of them when
+ * low is omitted), and then sets the file system's low epoch to low
+ * if none were found or -y was given.
+ * Always returns 1 once the arguments have been accepted.
+ */
+static int
+fsysEpoch(Fsys* fsys, int argc, char* argv[])
+{
+	Fs *fs;
+	int force, n;
+	u32int low, old;
+	char *flag;
+	char *usage = "usage: [fsys name] epoch [[-y] low]";
+
+	force = 0;
+	ARGBEGIN{
+	case 'y':
+		force = 1;
+		break;
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	switch(argc){
+	case 0:
+		low = ~(u32int)0;
+		break;
+	case 1:
+		low = strtoul(argv[0], 0, 0);
+		break;
+	default:
+		return cliError(usage);
+	}
+
+	fs = fsys->fs;
+
+	vtRLock(fs->elk);
+	consPrint("\tlow %ud hi %ud\n", fs->elo, fs->ehi);
+	n = fsysEsearch(fs, "/archive", low);
+	n += fsysEsearch(fs, "/snapshot", low);
+	consPrint("\t%d snapshot%s found with epoch < %ud\n", n, n==1 ? "" : "s", low);
+	vtRUnlock(fs->elk);
+
+	/*
+	 * There's a small race here -- a new snapshot with epoch < low might
+	 * get introduced now that we unlocked fs->elk.  Low has to
+	 * be <= fs->ehi.  Of course, in order for this to happen low has
+	 * to be equal to the current fs->ehi _and_ a snapshot has to
+	 * run right now.  This is a small enough window that I don't care.
+	 */
+	if(n != 0 && !force){
+		consPrint("\tnot setting low epoch\n");
+		return 1;
+	}
+
+	old = fs->elo;
+	if(!fsEpochLow(fs, low)){
+		consPrint("\tfsEpochLow: %R\n");
+		return 1;
+	}
+
+	flag = force ? " -y" : "";
+	consPrint("\told: epoch%s %ud\n", flag, old);
+	consPrint("\tnew: epoch%s %ud\n", flag, fs->elo);
+	/*
+	 * NOTE(review): the test compares against the requested low,
+	 * while the message talks about the old low epoch -- confirm.
+	 */
+	if(fs->elo < low)
+		consPrint("\twarning: new low epoch < old low epoch\n");
+
+	return 1;
+}
+
+/*
+ * Console command "create path uid gid perm".
+ * Creates the last element of path in its parent directory, owned by
+ * uid with mode perm, then changes the group to gid if the new file
+ * did not already have it.  Modes with the snapshot bit and the
+ * uid uidnoworld are refused.
+ * Returns non-zero on success, 0 on failure.
+ */
+static int
+fsysCreate(Fsys* fsys, int argc, char* argv[])
+{
+	int r;
+	ulong mode;
+	char *dir, *elem, *path, *slash;
+	char *usage = "usage: [fsys name] create path uid gid perm";
+	DirEntry de;
+	File *file, *parent;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 4)
+		return cliError(usage);
+
+	if(!fsysParseMode(argv[3], &mode))
+		return cliError(usage);
+	if(mode&ModeSnapshot)
+		return cliError("create - cannot create with snapshot bit set");
+
+	if(strcmp(argv[1], uidnoworld) == 0)
+		return cliError("permission denied");
+
+	vtRLock(fsys->fs->elk);
+
+	/* split a private copy of the path into directory and element */
+	path = vtStrDup(argv[0]);
+	slash = strrchr(path, '/');
+	if(slash == nil){
+		dir = "/";
+		elem = path;
+	}
+	else{
+		*slash = '\0';
+		elem = slash+1;
+		dir = path;
+		if(*dir == '\0')
+			dir = "/";
+	}
+
+	r = 0;
+	parent = fileOpen(fsys->fs, dir);
+	if(parent == nil)
+		goto Out;
+	file = fileCreate(parent, elem, mode, argv[1]);
+	fileDecRef(parent);
+	if(file == nil)
+		goto Out;
+
+	if(fileGetDir(file, &de)){
+		r = 1;
+		if(strcmp(de.gid, argv[2]) != 0){
+			/* replace the group and write the entry back */
+			vtMemFree(de.gid);
+			de.gid = vtStrDup(argv[2]);
+			r = fileSetDir(file, &de, argv[1]);
+		}
+		deCleanup(&de);
+	}
+	fileDecRef(file);
+
+Out:
+	vtMemFree(path);
+	vtRUnlock(fsys->fs->elk);
+
+	return r;
+}
+
+/*
+ * Print one line describing de on the console, in the argument order
+ * of the wstat command (file elem uid gid mode length), preceded by
+ * prefix and the word "stat".  A nil prefix is treated as empty.
+ */
+static void
+fsysPrintStat(char *prefix, char *file, DirEntry *de)
+{
+	char modebuf[64];
+	char *mode;
+
+	mode = fsysModeString(de->mode, modebuf);
+	consPrint("%sstat %q %q %q %q %s %llud\n",
+		prefix != nil ? prefix : "",
+		file, de->elem, de->uid, de->gid, mode, de->size);
+}
+
+/*
+ * Console command "stat files...".
+ * Prints a stat line for each named file.  A file that cannot be
+ * opened or stat'ed is reported by name and skipped.
+ * Returns 1 once the arguments have been accepted.
+ */
+static int
+fsysStat(Fsys* fsys, int argc, char* argv[])
+{
+	int i;
+	File *f;
+	DirEntry de;
+	char *usage = "usage: [fsys name] stat files...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	if(argc == 0)
+		return cliError(usage);
+
+	vtRLock(fsys->fs->elk);
+	for(i=0; i<argc; i++){
+		if((f = fileOpen(fsys->fs, argv[i])) == nil){
+			/* %s needs the file name; it was missing before */
+			consPrint("%s: %R\n", argv[i]);
+			continue;
+		}
+		if(!fileGetDir(f, &de)){
+			consPrint("%s: %R\n", argv[i]);
+			fileDecRef(f);
+			continue;
+		}
+		fsysPrintStat("\t", argv[i], &de);
+		deCleanup(&de);
+		fileDecRef(f);
+	}
+	vtRUnlock(fsys->fs->elk);
+	return 1;
+}
+
+/*
+ * Console command "wstat file elem uid gid mode length".
+ * Reads the file's directory entry, replaces each field whose
+ * argument is not "-", writes the entry back as uidadm and prints
+ * the entry before and after the change.
+ * Returns 1 on success, 0 on failure (with the error string set).
+ */
+static int
+fsysWstat(Fsys *fsys, int argc, char* argv[])
+{
+	int r;
+	File *f;
+	char *end;
+	DirEntry de;
+	char *usage = "usage: [fsys name] wstat file elem uid gid mode length\n"
+		"\tuse - for any field to mean don't change";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	if(argc != 6)
+		return cliError(usage);
+
+	vtRLock(fsys->fs->elk);
+	f = fileOpen(fsys->fs, argv[0]);
+	if(f == nil){
+		vtSetError("console wstat - walk - %R");
+		vtRUnlock(fsys->fs->elk);
+		return 0;
+	}
+	if(!fileGetDir(f, &de)){
+		vtSetError("console wstat - stat - %R");
+		fileDecRef(f);
+		vtRUnlock(fsys->fs->elk);
+		return 0;
+	}
+	fsysPrintStat("\told: w", argv[0], &de);
+
+	/* from here on every exit goes through Out */
+	r = 0;
+
+	/* elem */
+	if(strcmp(argv[1], "-") != 0){
+		if(!validFileName(argv[1])){
+			vtSetError("console wstat - bad elem");
+			goto Out;
+		}
+		vtMemFree(de.elem);
+		de.elem = vtStrDup(argv[1]);
+	}
+	/* uid */
+	if(strcmp(argv[2], "-") != 0){
+		if(!validUserName(argv[2])){
+			vtSetError("console wstat - bad uid");
+			goto Out;
+		}
+		vtMemFree(de.uid);
+		de.uid = vtStrDup(argv[2]);
+	}
+	/* gid */
+	if(strcmp(argv[3], "-") != 0){
+		if(!validUserName(argv[3])){
+			vtSetError("console wstat - bad gid");
+			goto Out;
+		}
+		vtMemFree(de.gid);
+		de.gid = vtStrDup(argv[3]);
+	}
+	/* mode */
+	if(strcmp(argv[4], "-") != 0 && !fsysParseMode(argv[4], &de.mode)){
+		vtSetError("console wstat - bad mode");
+		goto Out;
+	}
+	/* length: must be a complete, non-empty number */
+	if(strcmp(argv[5], "-") != 0){
+		de.size = strtoull(argv[5], &end, 0);
+		if(argv[5][0] == '\0' || *end != '\0' || de.size < 0){
+			vtSetError("console wstat - bad length");
+			goto Out;
+		}
+	}
+
+	if(!fileSetDir(f, &de, uidadm)){
+		vtSetError("console wstat - %R");
+		goto Out;
+	}
+	deCleanup(&de);
+
+	/* read the entry back to show what was actually stored */
+	if(!fileGetDir(f, &de)){
+		vtSetError("console wstat - stat2 - %R");
+		goto Out;
+	}
+	fsysPrintStat("\tnew: w", argv[0], &de);
+	r = 1;
+
+Out:
+	deCleanup(&de);	/* okay to do this twice */
+	fileDecRef(f);
+	vtRUnlock(fsys->fs->elk);
+	return r;
+}
+
+/*
+ * Console command "venti [address]".
+ * With an address, records it as the file system's venti server (an
+ * empty string clears it); without one, uses the recorded address.
+ * If the file system is open the existing session is redialled,
+ * otherwise any old session is closed and a new one is dialled.
+ * Returns 1 if the session is connected afterwards, 0 otherwise.
+ */
+static int
+fsysVenti(char* name, int argc, char* argv[])
+{
+	int r;
+	char *host;
+	char *usage = "usage: [fsys name] venti [address]";
+	Fsys *fsys;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	switch(argc){
+	case 0:
+		host = nil;
+		break;
+	case 1:
+		host = argv[0];
+		break;
+	default:
+		return cliError(usage);
+	}
+
+	if((fsys = _fsysGet(name)) == nil)
+		return 0;
+
+	vtLock(fsys->lock);
+	if(host != nil){
+		/* remember the new address; "" means none */
+		vtMemFree(fsys->venti);
+		if(host[0])
+			fsys->venti = vtStrDup(host);
+		else{
+			host = nil;
+			fsys->venti = nil;
+		}
+	}
+	else
+		host = fsys->venti;
+
+	r = 1;
+	if(fsys->fs != nil){
+		/* already open: do a redial */
+		if(!vtRedial(fsys->session, host)
+		|| !vtConnect(fsys->session, 0))
+			r = 0;
+	}
+	else{
+		/* not yet open: try to dial */
+		if(fsys->session)
+			vtClose(fsys->session);
+		if((fsys->session = vtDial(host, 0)) == nil
+		|| !vtConnect(fsys->session, 0))
+			r = 0;
+	}
+	vtUnlock(fsys->lock);
+	fsysPut(fsys);
+	return r;
+}
+
+/*
+ * Console command "fsys name open [-APWr] [-c ncache]".
+ * Opens the configured file system: dials venti if there is no
+ * session yet, then opens the device with ncache cache blocks
+ * (default 1000), read-only if -r was given.
+ * -A, -P and -W set the noauth, noperm and wstatallow flags.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+fsysOpen(char* name, int argc, char* argv[])
+{
+	char *arg, *p, *host;
+	Fsys *fsys;
+	long ncache;
+	int noauth, noperm, rflag, wstatallow;
+	char *usage = "usage: fsys name open [-APWr] [-c ncache]";
+
+	ncache = 1000;
+	noauth = noperm = wstatallow = 0;
+	rflag = OReadWrite;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'A':
+		noauth = 1;
+		break;
+	case 'P':
+		noperm = 1;
+		break;
+	case 'W':
+		wstatallow = 1;
+		break;
+	case 'c':
+		/*
+		 * Parse the string ARGF() returns rather than argv[0]:
+		 * with the attached form (-c100) argv[0] is still the
+		 * whole option and would never parse as a number.
+		 */
+		arg = ARGF();
+		if(arg == nil)
+			return cliError(usage);
+		ncache = strtol(arg, &p, 0);
+		if(ncache <= 0 || p == arg || *p != '\0')
+			return cliError(usage);
+		break;
+	case 'r':
+		rflag = OReadOnly;
+		break;
+	}ARGEND
+	if(argc)
+		return cliError(usage);
+
+	if((fsys = _fsysGet(name)) == nil)
+		return 0;
+
+	vtLock(fsys->lock);
+	if(fsys->fs != nil){
+		vtSetError(EFsysBusy, fsys->name);
+		vtUnlock(fsys->lock);
+		fsysPut(fsys);
+		return 0;
+	}
+
+	if(fsys->session == nil){
+		if(fsys->venti && fsys->venti[0])
+			host = fsys->venti;
+		else
+			host = nil;
+		fsys->session = vtDial(host, 1);
+		/* a failed connect is reported but does not stop the open */
+		if(!vtConnect(fsys->session, nil))
+			fprint(2, "vtConnect: %R\n");
+	}
+	if((fsys->fs = fsOpen(fsys->dev, fsys->session, ncache, rflag)) == nil){
+		vtUnlock(fsys->lock);
+		fsysPut(fsys);
+		return 0;
+	}
+	fsys->noauth = noauth;
+	fsys->noperm = noperm;
+	fsys->wstatallow = wstatallow;
+	vtUnlock(fsys->lock);
+
+	fsysPut(fsys);
+
+	return 1;
+}
+
+/*
+ * Console command "fsys name unconfig".
+ * Unlinks the named file system from the sbox list and frees it.
+ * Fails if the name is unknown, or if the file system is still
+ * referenced or open.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+fsysUnconfig(char* name, int argc, char* argv[])
+{
+	Fsys *fsys, **link;
+	char *usage = "usage: fsys name unconfig";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc)
+		return cliError(usage);
+
+	vtLock(sbox.lock);
+	/* link always addresses the pointer that leads to fsys */
+	for(link = &sbox.head; (fsys = *link) != nil; link = &fsys->next){
+		if(strcmp(fsys->name, name) == 0)
+			break;
+	}
+	if(fsys == nil){
+		vtSetError(EFsysNotFound, name);
+		vtUnlock(sbox.lock);
+		return 0;
+	}
+	if(fsys->ref != 0 || fsys->fs != nil){
+		vtSetError(EFsysBusy, fsys->name);
+		vtUnlock(sbox.lock);
+		return 0;
+	}
+	*link = fsys->next;
+	vtUnlock(sbox.lock);
+
+	/* off the list now, so it can be torn down without the lock */
+	if(fsys->session != nil){
+		vtClose(fsys->session);
+		vtFree(fsys->session);
+	}
+	if(fsys->venti != nil)
+		vtMemFree(fsys->venti);
+	if(fsys->dev != nil)
+		vtMemFree(fsys->dev);
+	if(fsys->name != nil)
+		vtMemFree(fsys->name);
+	vtMemFree(fsys);
+
+	return 1;
+}
+
+/*
+ * Console command "fsys name config dev".
+ * If a file system called name already exists and is not open, its
+ * device is replaced by dev; otherwise a new file system is
+ * allocated with that name and device.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+fsysConfig(char* name, int argc, char* argv[])
+{
+	Fsys *fsys;
+	char *usage = "usage: fsys name config dev";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 1)
+		return cliError(usage);
+
+	/* look the file system up by its name; argv[0] is the device */
+	if((fsys = _fsysGet(name)) != nil){
+		vtLock(fsys->lock);
+		if(fsys->fs != nil){
+			vtSetError(EFsysBusy, fsys->name);
+			vtUnlock(fsys->lock);
+			fsysPut(fsys);
+			return 0;
+		}
+		vtMemFree(fsys->dev);
+		fsys->dev = vtStrDup(argv[0]);
+		vtUnlock(fsys->lock);
+	}
+	else if((fsys = fsysAlloc(name, argv[0])) == nil)
+		return 0;
+
+	fsysPut(fsys);
+
+	return 1;
+}
+
+/*
+ * Table of per-file-system console commands, terminated by an entry
+ * with a nil cmd.  Each entry sets exactly one handler:
+ * f is called with the Fsys (fsysXXX first checks that it is open
+ * and holds its lock across the call); f1 is called with just the
+ * file system's name.
+ */
+static struct {
+	char*	cmd;
+	int	(*f)(Fsys*, int, char**);
+	int	(*f1)(char*, int, char**);
+} fsyscmd[] = {
+	/* commands handed the name of the file system */
+	{ "close",	fsysClose, },
+	{ "config",	nil, fsysConfig, },
+	{ "open",	nil, fsysOpen, },
+	{ "unconfig",	nil, fsysUnconfig, },
+	{ "venti",	nil, fsysVenti, },
+
+	/* commands handed an open Fsys */
+	{ "bfree",	fsysBfree, },
+	{ "block",	fsysBlock, },
+	{ "clre",	fsysClre, },
+	{ "clri",	fsysClri, },
+	{ "clrp",	fsysClrp, },
+	{ "create",	fsysCreate, },
+	{ "epoch",	fsysEpoch, },
+	{ "label",	fsysLabel, },
+	{ "remove",	fsysRemove, },
+	{ "snap",	fsysSnap, },
+	{ "snaptime",	fsysSnapTime, },
+	{ "stat",	fsysStat, },
+	{ "sync",	fsysSync, },
+	{ "wstat",	fsysWstat, },
+	{ "vac",	fsysVac, },
+
+	{ nil,		nil, },
+};
+
+/*
+ * Run the fsyscmd entry named by argv[0] on the file system name.
+ * Handlers that take a name (f1) are called directly.  The others
+ * are called with the Fsys, which must be open, while its lock is
+ * held.
+ * Returns the handler's result, or 0 if the command is unknown or
+ * the file system is missing or not open.
+ */
+static int
+fsysXXX(char* name, int argc, char* argv[])
+{
+	int i, r;
+	Fsys *fsys;
+
+	/* stop at the matching entry or at the nil terminator */
+	i = 0;
+	while(fsyscmd[i].cmd != nil && strcmp(fsyscmd[i].cmd, argv[0]) != 0)
+		i++;
+
+	if(fsyscmd[i].cmd == nil){
+		vtSetError("unknown command - '%s'", argv[0]);
+		return 0;
+	}
+
+	/* some commands want the name... */
+	if(fsyscmd[i].f1 != nil)
+		return (*fsyscmd[i].f1)(name, argc, argv);
+
+	/* ... but most commands want the Fsys */
+	fsys = _fsysGet(name);
+	if(fsys == nil)
+		return 0;
+
+	vtLock(fsys->lock);
+	if(fsys->fs != nil){
+		r = (*fsyscmd[i].f)(fsys, argc, argv);
+		vtUnlock(fsys->lock);
+	}
+	else{
+		vtUnlock(fsys->lock);
+		vtSetError(EFsysNotOpen, name);
+		r = 0;
+	}
+	fsysPut(fsys);
+	return r;
+}
+
+/*
+ * Console entry point for the per-file-system commands when they are
+ * typed without a "fsys name" prefix: applies the command to the
+ * current file system, sbox.curfsys.
+ * Fails with EFsysNoCurrent if no current file system has been set.
+ */
+static int
+cmdFsysXXX(int argc, char* argv[])
+{
+	char *name;
+
+	if((name = sbox.curfsys) == nil){
+		vtSetError(EFsysNoCurrent, argv[0]);
+		return 0;
+	}
+
+	return fsysXXX(name, argc, argv);
+}
+
+/*
+ * Console command "fsys [name ...]".
+ * With no arguments, lists the configured file systems.
+ * With just a name, makes that file system the current one and
+ * updates the console prompt.
+ * With a name and a command, runs the command on that file system.
+ */
+static int
+cmdFsys(int argc, char* argv[])
+{
+	Fsys *fsys;
+	char *usage = "usage: fsys [name ...]";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+
+	switch(argc){
+	case 0:
+		vtRLock(sbox.lock);
+		for(fsys = sbox.head; fsys != nil; fsys = fsys->next)
+			consPrint("\t%s\n", fsys->name);
+		vtRUnlock(sbox.lock);
+		return 1;
+	case 1:
+		fsys = fsysGet(argv[0]);
+		if(fsys == nil)
+			return 0;
+		/* NOTE(review): any previous curfsys string is not freed here */
+		sbox.curfsys = vtStrDup(fsys->name);
+		consPrompt(sbox.curfsys);
+		fsysPut(fsys);
+		return 1;
+	default:
+		break;
+	}
+
+	/* argv[0] is the file system, the rest is the command */
+	return fsysXXX(argv[0], argc-1, argv+1);
+}
+
+/*
+ * Module initialisation: installs the print formats used in this
+ * file, allocates the sbox lock and registers the console commands
+ * ("fsys", every fsyscmd entry with an Fsys handler, "venti" and
+ * "printconfig").
+ * Always returns 1.
+ */
+int
+fsysInit(void)
+{
+	int i;
+
+	fmtinstall('H', encodefmt);
+	fmtinstall('V', scoreFmt);
+	fmtinstall('R', vtErrFmt);
+	fmtinstall('L', labelFmt);
+
+	sbox.lock = vtLockAlloc();
+
+	cliAddCmd("fsys", cmdFsys);
+	/* name-based (f1) commands are only reachable via "fsys name cmd" */
+	i = 0;
+	while(fsyscmd[i].cmd != nil){
+		if(fsyscmd[i].f != nil)
+			cliAddCmd(fsyscmd[i].cmd, cmdFsysXXX);
+		i++;
+	}
+	/* the venti cmd is special: the fs can be either open or closed */
+	cliAddCmd("venti", cmdFsysXXX);
+	cliAddCmd("printconfig", cmdPrintConfig);
+
+	return 1;
+}

+ 177 - 0
sys/src/cmd/fossil/9lstn.c

@@ -0,0 +1,177 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * One announced network address being listened on.
+ * Lstn structures are kept on the doubly linked list in lbox.
+ */
+typedef struct Lstn Lstn;
+struct Lstn {
+	int	afd;			/* fd from announce; -1 once closed */
+	char*	address;		/* copy of the announced address */
+	char	dir[NETPATHLEN];	/* directory filled in by announce */
+
+	Lstn*	next;
+	Lstn*	prev;
+};
+
+/*
+ * The list of active listeners.
+ * lock is taken around every traversal and update of head/tail
+ * in this file.
+ */
+static struct {
+	VtLock*	lock;
+
+	Lstn*	head;
+	Lstn*	tail;
+} lbox;
+
+/*
+ * Unlink lstn from the lbox list, close its announce fd if it is
+ * still open, and free it.
+ */
+static void
+lstnFree(Lstn* lstn)
+{
+	Lstn *before, *after;
+
+	vtLock(lbox.lock);
+	before = lstn->prev;
+	after = lstn->next;
+	if(before == nil)
+		lbox.head = after;
+	else
+		before->next = after;
+	if(after == nil)
+		lbox.tail = before;
+	else
+		after->prev = before;
+	vtUnlock(lbox.lock);
+
+	if(lstn->afd != -1)
+		close(lstn->afd);
+	vtMemFree(lstn->address);
+	vtMemFree(lstn);
+}
+
+/*
+ * Thread body: accept connections on the announced address of the
+ * Lstn passed as a, handing each accepted connection to conAlloc.
+ * The loop ends when listen fails; the Lstn is then freed.
+ */
+static void
+lstnListen(void* a)
+{
+	Lstn *lstn;
+	int dfd, lfd;
+	char newdir[NETPATHLEN];
+
+ 	vtThreadSetName("listen");
+
+	lstn = a;
+	for(;;){
+		if((lfd = listen(lstn->dir, newdir)) < 0){
+			/* diagnostics end in a newline so they don't run together */
+			fprint(2, "listen: listen '%s': %r\n", lstn->dir);
+			break;
+		}
+
+		if((dfd = accept(lfd, newdir)) >= 0)
+			conAlloc(dfd, newdir);
+		else
+			fprint(2, "listen: accept '%s': %r\n", newdir);
+		close(lfd);
+	}
+	lstnFree(lstn);
+}
+
+/*
+ * Announce address, append a new Lstn for it to the lbox list and
+ * start a lstnListen thread to serve it.
+ * Returns the new Lstn, or nil (with the error string set) if the
+ * address is already being served, the announce fails or the thread
+ * cannot be started.
+ */
+static Lstn*
+lstnAlloc(char* address)
+{
+	int afd;
+	Lstn *lstn;
+	char dir[NETPATHLEN];
+
+	vtLock(lbox.lock);
+	for(lstn = lbox.head; lstn != nil; lstn = lstn->next){
+		if(strcmp(lstn->address, address) == 0){
+			vtSetError("listen: already serving '%s'", address);
+			vtUnlock(lbox.lock);
+			return nil;
+		}
+	}
+
+	afd = announce(address, dir);
+	if(afd < 0){
+		vtSetError("listen: announce '%s': %r", address);
+		vtUnlock(lbox.lock);
+		return nil;
+	}
+
+	lstn = vtMemAllocZ(sizeof(Lstn));
+	lstn->afd = afd;
+	lstn->address = vtStrDup(address);
+	memmove(lstn->dir, dir, NETPATHLEN);
+
+	/* append to the tail; an empty list also gets a new head */
+	lstn->prev = lbox.tail;
+	if(lbox.tail != nil)
+		lbox.tail->next = lstn;
+	else
+		lbox.head = lstn;
+	lbox.tail = lstn;
+	vtUnlock(lbox.lock);
+
+	if(vtThread(lstnListen, lstn) < 0){
+		vtSetError("listen: thread '%s': %r", lstn->address);
+		lstnFree(lstn);
+		return nil;
+	}
+
+	return lstn;
+}
+
+/*
+ * Console command "listen [-d] [address]".
+ * With no address, lists the active listeners.
+ * With an address, starts listening on it; with -d it instead closes
+ * the announce fd of the existing listener for that address.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+cmdLstn(int argc, char* argv[])
+{
+	int dflag;
+	Lstn *lstn;
+	char *usage = "usage: listen [-d] [address]";
+
+	dflag = 0;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'd':
+		dflag = 1;
+		break;
+	}ARGEND
+
+	if(argc != 0 && argc != 1)
+		return cliError(usage);
+
+	if(argc == 0){
+		vtRLock(lbox.lock);
+		for(lstn = lbox.head; lstn != nil; lstn = lstn->next)
+			consPrint("\t%s\t%s\n", lstn->address, lstn->dir);
+		vtRUnlock(lbox.lock);
+		return 1;
+	}
+
+	if(!dflag)
+		return lstnAlloc(argv[0]) != nil;
+
+	/* -d: find the listener and close its announce fd */
+	vtLock(lbox.lock);
+	lstn = lbox.head;
+	while(lstn != nil && strcmp(lstn->address, argv[0]) != 0)
+		lstn = lstn->next;
+	if(lstn != nil && lstn->afd != -1){
+		close(lstn->afd);
+		lstn->afd = -1;
+	}
+	vtUnlock(lbox.lock);
+
+	if(lstn == nil){
+		vtSetError("listen: '%s' not found", argv[0]);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Module initialisation: allocates the lbox lock and registers the
+ * "listen" console command.  Always returns 1.
+ */
+int
+lstnInit(void)
+{
+	lbox.lock = vtLockAlloc();
+
+	cliAddCmd("listen", cmdLstn);
+
+	return 1;
+}

+ 1118 - 0
sys/src/cmd/fossil/9p.c

@@ -0,0 +1,1118 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+enum {
+	OMODE		= 0x7,		/* Topen/Tcreate mode */
+};
+
+/*
+ * Permission bits as passed to permFile; it tests them against the
+ * file mode unshifted (other), shifted left by 3 (group) and shifted
+ * left by 6 (owner).
+ */
+enum {
+	PermX		= 1,
+	PermW		= 2,
+	PermR		= 4,
+};
+
+/* error string shared by the permission checks in this file */
+static char EPermission[] = "permission denied";
+
+/*
+ * Decide whether the user of fid has permission perm (PermR, PermW
+ * or PermX) on file.  Access is granted if any of these holds:
+ * the user owns the file and the owner bits allow perm; the user is
+ * in the file's group and the group bits allow perm; the other bits
+ * allow perm and either this is execute on a directory or the user
+ * is not a member of uidnoworld; permission checking is turned off
+ * for the file system.
+ * Returns 1 if access is granted; otherwise sets the error string
+ * to EPermission and returns 0.  Also returns 0 if the directory
+ * entry cannot be read.
+ */
+static int
+permFile(File* file, Fid* fid, int perm)
+{
+	char *owner;
+	int match;
+	DirEntry de;
+
+	if(!fileGetDir(file, &de))
+		return 0;
+
+	/*
+	 * User none only gets other permissions.
+	 */
+	if(strcmp(fid->uname, unamenone) != 0){
+		/*
+		 * There is only one uid<->uname mapping
+		 * and it's already cached in the Fid, but
+		 * it might have changed during the lifetime
+		 * of this Fid.
+		 */
+		owner = unameByUid(de.uid);
+		if(owner != nil){
+			match = strcmp(fid->uname, owner) == 0 && ((perm<<6) & de.mode);
+			vtMemFree(owner);
+			if(match)
+				goto Allow;
+		}
+		if(groupMember(de.gid, fid->uname) && ((perm<<3) & de.mode))
+			goto Allow;
+	}
+	if(perm & de.mode){
+		if(perm == PermX && (de.mode & ModeDir))
+			goto Allow;
+		if(!groupMember(uidnoworld, fid->uname))
+			goto Allow;
+	}
+	if(fsysNoPermCheck(fid->fsys))
+		goto Allow;
+
+	vtSetError(EPermission);
+	deCleanup(&de);
+	return 0;
+
+Allow:
+	deCleanup(&de);
+	return 1;
+}
+
+/*
+ * Check permission p for fid's user on the file fid refers to.
+ */
+static int
+permFid(Fid* fid, int p)
+{
+	return permFile(fid->file, fid, p);
+}
+
+/*
+ * Check permission p for fid's user on the parent directory of the
+ * file fid refers to.
+ * NOTE(review): the result of fileGetParent is used without a nil
+ * check -- confirm it cannot fail here.
+ */
+static int
+permParent(Fid* fid, int p)
+{
+	int r;
+	File *parent;
+
+	parent = fileGetParent(fid->file);
+	r = permFile(parent, fid, p);
+	fileDecRef(parent);
+
+	return r;
+}
+
+/*
+ * Check that name is acceptable as a file name: it must not be nil
+ * or empty, must not be "." or "..", and must not contain a byte
+ * below 040 (a control character).
+ * Returns 1 if valid; otherwise sets the error string and returns 0.
+ */
+int
+validFileName(char* name)
+{
+	int i;
+
+	if(name == nil || name[0] == '\0'){
+		vtSetError("no file name");
+		return 0;
+	}
+	if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0){
+		vtSetError(". and .. illegal as file name");
+		return 0;
+	}
+
+	/* mask to a byte so bytes >= 0x80 are not seen as negative */
+	for(i = 0; name[i] != '\0'; i++){
+		if((name[i] & 0xFF) < 040){
+			vtSetError("bad character in file name");
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Handle a Twstat message: validate the Dir supplied by the client
+ * against the file's current directory entry, apply the permitted
+ * changes to a copy of the entry and write it back with fileSetDir.
+ * User none and auth fids are refused, as are read-only file
+ * systems and users for whom groupWriteMember fails.
+ * Returns non-zero on success, 0 on failure (error string set).
+ */
+static int
+rTwstat(Msg* m)
+{
+	Dir dir;
+	Fid *fid;
+	ulong mode;
+	DirEntry de;
+	char *gid, *strs, *uid;
+	int gl, op, retval, tsync;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock)) == nil)
+		return 0;
+
+	gid = uid = nil;
+	retval = 0;
+
+	if(strcmp(fid->uname, unamenone) == 0 || (fid->qid.type & QTAUTH)){
+		vtSetError(EPermission);
+		goto error0;
+	}
+	if(fileIsRoFs(fid->file) || !groupWriteMember(fid->uname)){
+		vtSetError("read-only filesystem");
+		goto error0;
+	}
+
+	if(!fileGetDir(fid->file, &de))
+		goto error0;
+
+	/* strs is the string buffer handed to convM2D for dir's fields */
+	strs = vtMemAlloc(m->t.nstat);
+	if(convM2D(m->t.stat, m->t.nstat, &dir, strs) == 0){
+		vtSetError("wstat -- protocol botch");
+		goto error;
+	}
+
+	/*
+	 * Run through each of the (sub-)fields in the provided Dir
+	 * checking for validity and whether it's a default:
+	 * .type, .dev and .atime are completely ignored and not checked;
+	 * .qid.path, .qid.vers and .muid are checked for validity but
+	 * any attempt to change them is an error.
+	 * .qid.type/.mode, .mtime, .name, .length, .uid and .gid can
+	 * possibly be changed.
+	 *
+	 * 'Op' flags there are changed fields, i.e. it's not a no-op.
+	 * 'Tsync' flags all fields are defaulted.
+	 */
+	tsync = 1;
+	if(dir.qid.path != ~0){
+		if(dir.qid.path != de.qid){
+			vtSetError("wstat -- attempt to change qid.path");
+			goto error;
+		}
+		tsync = 0;
+	}
+	if(dir.qid.vers != ~0){
+		if(dir.qid.vers != de.mcount){
+			vtSetError("wstat -- attempt to change qid.vers");
+			goto error;
+		}
+		tsync = 0;
+	}
+	if(dir.muid != nil && *dir.muid != '\0'){
+		if((uid = uidByUname(dir.muid)) == nil){
+			vtSetError("wstat -- unknown muid");
+			goto error;
+		}
+		if(strcmp(uid, de.mid) != 0){
+			vtSetError("wstat -- attempt to change muid");
+			goto error;
+		}
+		/* uid is reused below for the owner check */
+		vtMemFree(uid);
+		uid = nil;
+		tsync = 0;
+	}
+
+	/*
+	 * Check .qid.type and .mode agree if neither is defaulted.
+	 */
+	if(dir.qid.type != (uchar)~0 && dir.mode != ~0){
+		if(dir.qid.type != ((dir.mode>>24) & 0xFF)){
+			vtSetError("wstat -- qid.type/mode mismatch");
+			goto error;
+		}
+	}
+
+	op = 0;
+
+	if(dir.qid.type != (uchar)~0 || dir.mode != ~0){
+		/*
+		 * .qid.type or .mode isn't defaulted, check for unknown bits.
+		 */
+		if(dir.mode == ~0)
+			dir.mode = (dir.qid.type<<24)|(de.mode & 0777);
+		if(dir.mode & ~(DMDIR|DMAPPEND|DMEXCL|0777)){
+			vtSetError("wstat -- unknown bits in qid.type/mode");
+			goto error;
+		}
+
+		/*
+		 * Synthesise a mode to check against the current settings.
+		 */
+		mode = dir.mode & 0777;
+		if(dir.mode & DMEXCL)
+			mode |= ModeExclusive;
+		if(dir.mode & DMAPPEND)
+			mode |= ModeAppend;
+		if(dir.mode & DMDIR)
+			mode |= ModeDir;
+
+		if((de.mode^mode) & ModeDir){
+			vtSetError("wstat -- attempt to change directory bit");
+			goto error;
+		}
+
+		if((de.mode & (ModeAppend|ModeExclusive|0777)) != mode){
+			de.mode &= ~(ModeAppend|ModeExclusive|0777);
+			de.mode |= mode;
+			op = 1;
+		}
+		tsync = 0;
+	}
+
+	if(dir.mtime != ~0){
+		if(dir.mtime != de.mtime){
+			de.mtime = dir.mtime;
+			op = 1;
+		}
+		tsync = 0;
+	}
+
+	if(dir.length != ~0){
+		if(de.mode & ModeDir){
+			vtSetError("wstat -- attempt to change length of directory");
+			goto error;
+		}
+		if(dir.length != de.size){
+			de.size = dir.length;
+			op = 1;
+		}
+		tsync = 0;
+	}
+
+	/*
+	 * Check for permission to change .mode, .mtime or .length,
+	 * must be owner or leader of either group, for which test gid
+	 * is needed; permission checks on gid will be done later.
+	 */
+	if(dir.gid != nil && *dir.gid != '\0'){
+		if((gid = uidByUname(dir.gid)) == nil){
+			vtSetError("wstat -- unknown gid");
+			goto error;
+		}
+		tsync = 0;
+	}
+	else
+		gid = vtStrDup(de.gid);
+
+	/*
+	 * 'Gl' counts whether neither, one or both groups are led.
+	 */
+	gl = groupLeader(gid, fid->uname) != 0;
+	gl += groupLeader(de.gid, fid->uname) != 0;
+
+	if(op && !fsysWstatAllow(fid->fsys)){
+		if(strcmp(fid->uid, de.uid) != 0 && !gl){
+			vtSetError("wstat -- not owner or group leader");
+			goto error;
+		}
+	}
+
+	/*
+	 * Check for permission to change group, must be
+	 * either owner and in new group or leader of both groups.
+	 * Gid is never nil here: it is either the uid looked up
+	 * from dir.gid or a copy of de.gid (presumably vtStrDup
+	 * does not return nil -- TODO confirm).
+	 */
+	if(strcmp(gid, de.gid) != 0){
+		if(!fsysWstatAllow(fid->fsys)
+		&& !(strcmp(fid->uid, de.uid) == 0 && groupMember(gid, fid->uname))
+		&& !(gl == 2)){
+			vtSetError("wstat -- not owner and not group leaders");
+			goto error;
+		}
+		/* de takes ownership of gid; clear it so it isn't freed twice */
+		vtMemFree(de.gid);
+		de.gid = gid;
+		gid = nil;
+		op = 1;
+	}
+
+	/*
+	 * Rename.
+	 * Check .name is valid and different to the current.
+	 * If so, check write permission in parent.
+	 */
+	if(dir.name != nil && *dir.name != '\0'){
+		if(!validFileName(dir.name))
+			goto error;
+		if(strcmp(dir.name, de.elem) != 0){
+			if(!permParent(fid, PermW))
+				goto error;
+			vtMemFree(de.elem);
+			de.elem = vtStrDup(dir.name);
+			op = 1;
+		}
+		tsync = 0;
+	}
+
+	/*
+	 * Check for permission to change owner - must be god.
+	 */
+	if(dir.uid != nil && *dir.uid != '\0'){
+		if((uid = uidByUname(dir.uid)) == nil){
+			vtSetError("wstat -- unknown uid");
+			goto error;
+		}
+		if(strcmp(uid, de.uid) != 0){
+			if(!fsysWstatAllow(fid->fsys)){
+				vtSetError("wstat -- not owner");
+				goto error;
+			}
+			if(strcmp(uid, uidnoworld) == 0){
+				vtSetError(EPermission);
+				goto error;
+			}
+			/* de takes ownership of uid */
+			vtMemFree(de.uid);
+			de.uid = uid;
+			uid = nil;
+			op = 1;
+		}
+		tsync = 0;
+	}
+
+	/* only write the entry back if something actually changed */
+	if(op)
+		retval = fileSetDir(fid->file, &de, fid->uid);
+	else
+		retval = 1;
+
+	if(tsync){
+		/*
+		 * All values were defaulted,
+		 * make the state of the file exactly what it
+		 * claims to be before returning...
+		 */
+		USED(tsync);
+	}
+
+error:
+	deCleanup(&de);
+	vtMemFree(strs);
+	if(gid != nil)
+		vtMemFree(gid);
+	if(uid != nil)
+		vtMemFree(uid);
+error0:
+	fidPut(fid);
+	return retval;
+};
+
+/*
+ * Handle a Tstat message: pack the stat information for the fid
+ * into m->data and point the reply at it.
+ * For an auth fid a synthetic Dir is built, named "#¿" and owned
+ * by the fid's user; otherwise the file's directory entry is
+ * converted with dirDe2M.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+rTstat(Msg* m)
+{
+	Dir dir;
+	Fid *fid;
+	int ok;
+	DirEntry de;
+
+	fid = fidGet(m->con, m->t.fid, 0);
+	if(fid == nil)
+		return 0;
+
+	if(!(fid->qid.type & QTAUTH)){
+		ok = fileGetDir(fid->file, &de);
+		fidPut(fid);
+		if(!ok)
+			return 0;
+
+		/*
+		 * TODO: optimise this copy (in convS2M) away somehow.
+		 * This pettifoggery with m->data will do for the moment.
+		 */
+		m->r.nstat = dirDe2M(&de, m->data, m->con->msize);
+		m->r.stat = m->data;
+		deCleanup(&de);
+
+		return 1;
+	}
+
+	/* auth fid: there is no file behind it, so make the Dir up */
+	memset(&dir, 0, sizeof(Dir));
+	dir.qid = fid->qid;
+	dir.mode = DMAUTH;
+	dir.atime = time(0L);
+	dir.mtime = dir.atime;
+	dir.length = 0;
+	dir.name = "#¿";
+	dir.uid = fid->uname;
+	dir.gid = fid->uname;
+	dir.muid = fid->uname;
+
+	m->r.nstat = convD2M(&dir, m->data, m->con->msize);
+	if(m->r.nstat == 0){
+		vtSetError("stat QTAUTH botch");
+		fidPut(fid);
+		return 0;
+	}
+	m->r.stat = m->data;
+
+	fidPut(fid);
+	return 1;
+}
+
+/*
+ * Common tail of Tclunk and Tremove: release any exclusive-use
+ * state held by fid, optionally remove the file, and clunk the fid.
+ * The file is only removed if remove is set, the fid is not an auth
+ * fid and the user has write permission in the parent directory.
+ * The fid is clunked whether or not the removal succeeds.
+ * Returns 1 if nothing had to be removed, else the removal's result.
+ */
+static int
+_rTclunk(Fid* fid, int remove)
+{
+	int rok;
+
+	if(fid->excl)
+		exclFree(fid);
+
+	rok = 1;
+	if(remove && !(fid->qid.type & QTAUTH)){
+		rok = permParent(fid, PermW);
+		if(rok != 0)
+			rok = fileRemove(fid->file, fid->uid);
+	}
+	fidClunk(fid);
+
+	return rok;
+}
+
+/*
+ * Handle a Tremove message: remove the file and clunk the fid.
+ * Returns the result of the removal, or 0 if the fid is unknown.
+ */
+static int
+rTremove(Msg* m)
+{
+	Fid *fid;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock)) == nil)
+		return 0;
+	return _rTclunk(fid, 1);
+}
+
+/*
+ * Handle a Tclunk message: clunk the fid, removing the file as well
+ * if the fid was opened with the remove-on-close flag (FidORclose).
+ * The result of that removal is deliberately ignored: the clunk
+ * succeeds (returns 1) whenever the fid exists.
+ */
+static int
+rTclunk(Msg* m)
+{
+	Fid *fid;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock)) == nil)
+		return 0;
+	_rTclunk(fid, (fid->open & FidORclose));
+
+	return 1;
+}
+
+/*
+ * Handle a Twrite message.
+ * The fid must be open for writing, the count must fit in a message
+ * and the offset must not be negative.  Directories cannot be
+ * written; auth fids go to authWrite and ordinary files to
+ * fileWrite.  The number of bytes written is stored in m->r.count.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+rTwrite(Msg* m)
+{
+	Fid *fid;
+	int count, n, ok;
+
+	fid = fidGet(m->con, m->t.fid, 0);
+	if(fid == nil)
+		return 0;
+
+	ok = 0;
+	if(!(fid->open & FidOWrite)){
+		vtSetError("fid not open for write");
+		goto Out;
+	}
+
+	count = m->t.count;
+	if(count < 0 || count > m->con->msize-IOHDRSZ){
+		vtSetError("write count too big");
+		goto Out;
+	}
+	if(m->t.offset < 0){
+		vtSetError("write offset negative");
+		goto Out;
+	}
+	if(fid->excl != nil && !exclUpdate(fid))
+		goto Out;
+
+	if(fid->qid.type & QTDIR){
+		vtSetError("is a directory");
+		goto Out;
+	}
+	if(fid->qid.type & QTAUTH)
+		n = authWrite(fid, m->t.data, count);
+	else
+		n = fileWrite(fid->file, m->t.data, count, m->t.offset, fid->uid);
+	if(n >= 0){
+		m->r.count = n;
+		ok = 1;
+	}
+
+Out:
+	fidPut(fid);
+	return ok;
+}
+
+/*
+ * Handle a Tread message.
+ * The fid must be open for reading, the count must fit in a message
+ * and the offset must not be negative.  Directories are read with
+ * dirRead, auth fids with authRead and ordinary files with fileRead,
+ * into m->data just past the I/O header.  The reply's count and data
+ * are set from the result.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+rTread(Msg* m)
+{
+	Fid *fid;
+	uchar *buf;
+	int count, n, ok;
+
+	fid = fidGet(m->con, m->t.fid, 0);
+	if(fid == nil)
+		return 0;
+
+	ok = 0;
+	if(!(fid->open & FidORead)){
+		vtSetError("fid not open for read");
+		goto Out;
+	}
+
+	count = m->t.count;
+	if(count < 0 || count > m->con->msize-IOHDRSZ){
+		vtSetError("read count too big");
+		goto Out;
+	}
+	if(m->t.offset < 0){
+		vtSetError("read offset negative");
+		goto Out;
+	}
+	if(fid->excl != nil && !exclUpdate(fid))
+		goto Out;
+
+	/*
+	 * TODO: optimise this copy (in convS2M) away somehow.
+	 * This pettifoggery with m->data will do for the moment.
+	 */
+	buf = m->data+IOHDRSZ;
+	if(fid->qid.type & QTDIR)
+		n = dirRead(fid, buf, count, m->t.offset);
+	else if(fid->qid.type & QTAUTH)
+		n = authRead(fid, buf, count);
+	else
+		n = fileRead(fid->file, buf, count, m->t.offset);
+	if(n >= 0){
+		m->r.count = n;
+		m->r.data = (char*)buf;
+		ok = 1;
+	}
+
+Out:
+	fidPut(fid);
+	return ok;
+}
+
+/*
+ * Handle a Tcreate message: create m->t.name in the directory the
+ * fid refers to and leave the fid open on the new file.
+ * The fid must not be open, must refer to a directory the user can
+ * write, and the file system must be writable for that user.
+ * The requested permissions are masked by the parent directory's
+ * (0777 for directories, 0666 for files).
+ * Returns 1 on success, 0 on failure (error string set).
+ */
+static int
+rTcreate(Msg* m)
+{
+	Fid *fid;
+	File *file;
+	ulong mode;
+	int excl, omode, open, perm;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock)) == nil)
+		return 0;
+	if(fid->open){
+		vtSetError("fid open for I/O");
+		goto error;
+	}
+	if(fileIsRoFs(fid->file) || !groupWriteMember(fid->uname)){
+		vtSetError("read-only filesystem");
+		goto error;
+	}
+	if(!fileIsDir(fid->file)){
+		vtSetError("not a directory");
+		goto error;
+	}
+	if(!permFid(fid, PermW))
+		goto error;
+	if(!validFileName(m->t.name))
+		goto error;
+	if(strcmp(fid->uid, uidnoworld) == 0){
+		vtSetError(EPermission);
+		goto error;
+	}
+
+	/* translate the 9P open mode into Fid open flags */
+	omode = m->t.mode & OMODE;
+	open = 0;
+
+	if(omode == OREAD || omode == ORDWR || omode == OEXEC)
+		open |= FidORead;
+	if(omode == OWRITE || omode == ORDWR)
+		open |= FidOWrite;
+	if((open & (FidOWrite|FidORead)) == 0){
+		vtSetError("unknown mode");
+		goto error;
+	}
+	if(m->t.perm & DMDIR){
+		if((m->t.mode & (ORCLOSE|OTRUNC)) || (open & FidOWrite)){
+			vtSetError("illegal mode");
+			goto error;
+		}
+		if(m->t.perm & DMAPPEND){
+			vtSetError("illegal perm");
+			goto error;
+		}
+	}
+
+	/* mask the requested permissions with the parent directory's */
+	mode = fileGetMode(fid->file);
+	perm = m->t.perm;
+	if(m->t.perm & DMDIR)
+		perm &= ~0777|(mode & 0777);
+	else
+		perm &= ~0666|(mode & 0666);
+	mode = perm & 0777;
+	if(m->t.perm & DMDIR)
+		mode |= ModeDir;
+	if(m->t.perm & DMAPPEND)
+		mode |= ModeAppend;
+	if(m->t.perm & DMEXCL)
+		mode |= ModeExclusive;
+
+	if((file = fileCreate(fid->file, m->t.name, mode, fid->uid)) == nil){
+		fidPut(fid);
+		return 0;
+	}
+	fileDecRef(fid->file);
+
+	/* the fid now refers to the new file; rebuild its qid */
+	fid->file = file;
+	mode = fileGetMode(fid->file);
+	if(mode & ModeDir)
+		fid->qid.type = QTDIR;
+	else
+		fid->qid.type = QTFILE;
+	if(mode & ModeAppend)
+		fid->qid.type |= QTAPPEND;
+	if(mode & ModeExclusive){
+		fid->qid.type |= QTEXCL;
+		/*
+		 * Call exclAlloc outside assert() so the exclusive-use
+		 * state is still set up if assertions are compiled out.
+		 */
+		excl = exclAlloc(fid);
+		assert(excl != 0);
+	}
+	fid->qid.vers = fileGetMcount(file);
+	fid->qid.path = fileGetId(file);
+	if(m->t.mode & ORCLOSE)
+		open |= FidORclose;
+	fid->open = open;
+
+	m->r.qid = fid->qid;
+	m->r.iounit = m->con->msize-IOHDRSZ;
+
+	fidPut(fid);
+	return 1;
+
+error:
+	fidPut(fid);
+	return 0;
+}
+
+/*
+ * Handle a Topen message: check that the requested mode is allowed
+ * for the fid's user and file, then mark the fid open.
+ * The fid must not already be open.  Write, truncate and
+ * remove-on-close are refused on directories and on read-only file
+ * systems; exec is refused on directories.
+ * On success the reply's qid and iounit are filled in.
+ * Returns 1 on success, 0 on failure (error string set).
+ */
+static int
+rTopen(Msg* m)
+{
+	Fid *fid;
+	int isdir, mode, omode, open, rofs;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock)) == nil)
+		return 0;
+	if(fid->open){
+		vtSetError("fid open for I/O");
+		goto error;
+	}
+
+	isdir = fileIsDir(fid->file);
+	open = 0;
+	/* read-only for this user, not necessarily for the whole fs */
+	rofs = fileIsRoFs(fid->file) || !groupWriteMember(fid->uname);
+
+	if(m->t.mode & ORCLOSE){
+		if(isdir){
+			vtSetError("is a directory");
+			goto error;
+		}
+		if(rofs){
+			vtSetError("read-only filesystem");
+			goto error;
+		}
+		/* removing on close needs write permission in the parent */
+		if(!permParent(fid, PermW))
+			goto error;
+
+		open |= FidORclose;
+	}
+
+	omode = m->t.mode & OMODE;
+	if(omode == OREAD || omode == ORDWR){
+		if(!permFid(fid, PermR))
+			goto error;
+		open |= FidORead;
+	}
+	/* truncation is treated as a write, whatever the base mode */
+	if(omode == OWRITE || omode == ORDWR || (m->t.mode & OTRUNC)){
+		if(isdir){
+			vtSetError("is a directory");
+			goto error;
+		}
+		if(rofs){
+			vtSetError("read-only filesystem");
+			goto error;
+		}
+		if(!permFid(fid, PermW))
+			goto error;
+		open |= FidOWrite;
+	}
+	if(omode == OEXEC){
+		if(isdir){
+			vtSetError("is a directory");
+			goto error;
+		}
+		if(!permFid(fid, PermX))
+			goto error;
+		/* an exec open is served as a read */
+		open |= FidORead;
+	}
+	if((open & (FidOWrite|FidORead)) == 0){
+		vtSetError("unknown mode");
+		goto error;
+	}
+
+	mode = fileGetMode(fid->file);
+	if((mode & ModeExclusive) && exclAlloc(fid) == 0)
+		goto error;
+
+	/*
+	 * Everything checks out, try to commit any changes.
+	 */
+	if((m->t.mode & OTRUNC) && !(mode & ModeAppend)){
+		if(!fileTruncate(fid->file, fid->uid))
+			goto error;
+		fid->qid.vers = fileGetMcount(fid->file);
+	}
+	if(isdir && fid->db != nil){
+		dirBufFree(fid->db);
+		fid->db = nil;
+	}
+
+	m->r.qid = fid->qid;
+	m->r.iounit = m->con->msize-IOHDRSZ;
+
+	fid->open = open;
+
+	fidPut(fid);
+	return 1;
+
+error:
+	/* undo an exclusive-use allocation made before the failure */
+	if(fid->excl != nil)
+		exclFree(fid);
+	fidPut(fid);
+	return 0;
+}
+
+/*
+ * Handle a Twalk request.
+ * Walks t->nwname elements starting at t->fid, optionally into a
+ * freshly created t->newfid.  Returns 1 if all elements, or at least
+ * the first one, were walked (r->nwqid says how many); returns 0 if
+ * the initial checks fail or the very first element cannot be walked.
+ * The target fid is only updated when every element was walked.
+ */
+static int
+rTwalk(Msg* m)
+{
+	Qid qid;
+	Fcall *r, *t;
+	int nwname, wlock;
+	File *file, *nfile;
+	Fid *fid, *ofid, *nfid;
+
+	t = &m->t;
+	/* walking a fid onto itself modifies it, so take the write lock */
+	if(t->fid == t->newfid)
+		wlock = FidFWlock;
+	else
+		wlock = 0;
+
+	/*
+	 * The file identified by t->fid must be valid in the
+	 * current session and must not have been opened for I/O
+	 * by an open or create message.
+	 */
+	if((ofid = fidGet(m->con, t->fid, wlock)) == nil)
+		return 0;
+	if(ofid->open){
+		vtSetError("file open for I/O");
+		fidPut(ofid);
+		return 0;
+	}
+
+	/*
+	 * If newfid is not the same as fid, allocate a new file;
+	 * a side effect is checking newfid is not already in use (error);
+	 * if there are no names to walk this will be equivalent to a
+	 * simple 'clone' operation.
+	 * It's a no-op if newfid is the same as fid and t->nwname is 0.
+	 */
+	nfid = nil;
+	if(t->fid != t->newfid){
+		nfid = fidGet(m->con, t->newfid, FidFWlock|FidFCreate);
+		if(nfid == nil){
+			vtSetError("fid in use");
+			fidPut(ofid);
+			return 0;
+		}
+		/* the new fid takes its own references and copies of everything */
+		nfid->open = ofid->open & ~FidORclose;
+		nfid->file = fileIncRef(ofid->file);
+		nfid->qid = ofid->qid;
+		nfid->uid = vtStrDup(ofid->uid);
+		nfid->uname = vtStrDup(ofid->uname);
+		nfid->fsys = fsysIncRef(ofid->fsys);
+		fid = nfid;
+	}
+	else
+		fid = ofid;
+
+	r = &m->r;
+	r->nwqid = 0;
+
+	if(t->nwname == 0){
+		if(nfid != nil)
+			fidPut(nfid);
+		fidPut(ofid);
+
+		return 1;
+	}
+
+	/* walk with a private reference so fid->file stays valid on failure */
+	file = fid->file;
+	fileIncRef(file);
+	qid = fid->qid;
+
+	for(nwname = 0; nwname < t->nwname; nwname++){
+		/*
+		 * Walked elements must represent a directory and
+		 * the implied user must have permission to search
+		 * the directory.  Walking .. is always allowed, so that
+		 * you can't walk into a directory and then not be able
+		 * to walk out of it.
+		 */
+		if(!(qid.type & QTDIR)){
+			vtSetError("not a directory");
+			break;
+		}
+		if(!permFile(file, fid, PermX) && strcmp(t->wname[nwname], "..") != 0)
+			break;
+		if((nfile = fileWalk(file, t->wname[nwname])) == nil)
+			break;
+		fileDecRef(file);
+		file = nfile;
+		qid.type = QTFILE;
+		if(fileIsDir(file))
+			qid.type = QTDIR;
+		qid.vers = fileGetMcount(file);
+		qid.path = fileGetId(file);
+		r->wqid[r->nwqid++] = qid;
+	}
+
+	if(nwname == t->nwname){
+		/*
+		 * Walked all elements. Update the target fid
+		 * from the temporary qid used during the walk,
+		 * and tidy up.
+		 */
+		fid->qid = r->wqid[r->nwqid-1];
+		fileDecRef(fid->file);
+		fid->file = file;
+
+		if(nfid != nil)
+			fidPut(nfid);
+
+		fidPut(ofid);
+		return 1;
+	}
+
+	/*
+	 * Didn't walk all elements, 'clunk' nfid if it exists
+	 * and leave fid untouched.
+	 * It's not an error if some of the elements were walked OK.
+	 */
+	fileDecRef(file);
+	if(nfid != nil)
+		fidClunk(nfid);
+
+	fidPut(ofid);
+	if(nwname == 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Handle a Tflush request: mark the first active message on the
+ * connection whose tag equals oldtag as flushed.
+ * Always returns 1; a flush of NOTAG or of an unknown tag is
+ * silently accepted.
+ */
+static int
+rTflush(Msg* m)
+{
+	Con *c;
+	Msg *old;
+	u32int tag;
+
+	tag = m->t.oldtag;
+	if(tag != NOTAG){
+		c = m->con;
+		vtLock(c->lock);
+		old = c->mhead;
+		while(old != nil && old->t.tag != tag)
+			old = old->next;
+		if(old != nil)
+			old->flush = 1;
+		vtUnlock(c->lock);
+	}
+
+	return 1;
+}
+
+/*
+ * Split an attach name of the form "fsname/path" in two.
+ * A nil or empty aname is taken to be "main/active".
+ * *fsname receives a freshly duplicated string which the caller
+ * must free; *path points into that copy just past the first '/',
+ * or at a constant empty string when there is no '/'.
+ */
+static void
+parseAname(char *aname, char **fsname, char **path)
+{
+	char *copy, *slash;
+
+	if(aname == nil || aname[0] == '\0')
+		aname = "main/active";
+	copy = vtStrDup(aname);
+	*fsname = copy;
+
+	slash = strchr(copy, '/');
+	if(slash == nil){
+		*path = "";
+		return;
+	}
+	*slash = '\0';
+	*path = slash+1;
+}
+
+/*
+ * Handle a Tattach request: create the fid, resolve the file
+ * system and root named by aname, establish uname and uid, and
+ * return the root qid.  Returns 1 on success, 0 on failure; on
+ * every failure path the newly created fid is clunked.
+ */
+static int
+rTattach(Msg* m)
+{
+	Fid *fid;
+	Fsys *fsys;
+	char *fsname, *path;
+
+	if((fid = fidGet(m->con, m->t.fid, FidFWlock|FidFCreate)) == nil)
+		return 0;
+
+	/* path points into fsname, so fsname is freed only when path is done with */
+	parseAname(m->t.aname, &fsname, &path);
+	if((fsys = fsysGet(fsname)) == nil){
+		fidClunk(fid);
+		vtMemFree(fsname);
+		return 0;
+	}
+	fid->fsys = fsys;
+
+	/* an empty uname attaches as 'none' */
+	if(m->t.uname[0] != '\0')
+		fid->uname = vtStrDup(m->t.uname);
+	else
+		fid->uname = vtStrDup(unamenone);
+
+	if(fsysNoAuthCheck(fsys)){
+		/* no authentication: unknown unames fall back to 'none' */
+		if((fid->uid = uidByUname(fid->uname)) == nil)
+			fid->uid = vtStrDup(unamenone);
+	}
+	else if(!authCheck(&m->t, fid, fsys)){
+		fidClunk(fid);
+		vtMemFree(fsname);
+		vtSetError("authentication failed");
+		return 0;
+	}
+
+	fsysFsRlock(fsys);
+	if((fid->file = fsysGetRoot(fsys, path)) == nil){
+		fsysFsRUnlock(fsys);
+		fidClunk(fid);
+		vtMemFree(fsname);
+		return 0;
+	}
+	fsysFsRUnlock(fsys);
+	vtMemFree(fsname);
+
+	/* the attach point is always reported as a directory, version 0 */
+	fid->qid = (Qid){fileGetId(fid->file), 0, QTDIR};
+	m->r.qid = fid->qid;
+
+	fidPut(fid);
+	return 1;
+}
+
+/*
+ * Handle a Tauth request: create an authentication fid bound to a
+ * factotum RPC conversation for the file system named by aname.
+ * Fails (returns 0) if the file system does not require
+ * authentication or if the user is 'none'.
+ * Returns 1 with the auth qid in the reply on success.
+ */
+static int
+rTauth(Msg* m)
+{
+	int afd;
+	Con *con;
+	Fid *afid;
+	Fsys *fsys;
+	char *fsname, *path;
+
+	/* only the file system name is needed here; path is unused */
+	parseAname(m->t.aname, &fsname, &path);
+	if((fsys = fsysGet(fsname)) == nil){
+		vtMemFree(fsname);
+		return 0;
+	}
+	vtMemFree(fsname);
+
+	if(fsysNoAuthCheck(fsys)){
+		/* remember on the connection that no authentication is needed */
+		m->con->aok = 1;
+		vtSetError("authentication disabled");
+		fsysPut(fsys);
+		return 0;
+	}
+	if(strcmp(m->t.uname, unamenone) == 0){
+		vtSetError("user 'none' requires no authentication");
+		fsysPut(fsys);
+		return 0;
+	}
+
+	con = m->con;
+	if((afid = fidGet(con, m->t.afid, FidFWlock|FidFCreate)) == nil){
+		fsysPut(fsys);
+		return 0;
+	}
+	/*
+	 * From here on the fsys reference is held by afid; failure
+	 * paths clunk afid instead of calling fsysPut directly.
+	 */
+	afid->fsys = fsys;
+
+	if((afd = open("/mnt/factotum/rpc", ORDWR)) < 0){
+		vtSetError("can't open \"/mnt/factotum/rpc\"");
+		fidClunk(afid);
+		return 0;
+	}
+	if((afid->rpc = auth_allocrpc(afd)) == nil){
+		close(afd);
+		vtSetError("can't auth_allocrpc");
+		fidClunk(afid);
+		return 0;
+	}
+	/* 23 is the length of the "proto=p9any role=server" string */
+	if(auth_rpc(afid->rpc, "start", "proto=p9any role=server", 23) != ARok){
+		vtSetError("can't auth_rpc");
+		fidClunk(afid);
+		return 0;
+	}
+
+	afid->open = FidOWrite|FidORead;
+	afid->qid.type = QTAUTH;
+	afid->qid.path = m->t.afid;
+	afid->uname = vtStrDup(m->t.uname);
+
+	m->r.qid = afid->qid;
+
+	fidPut(afid);
+	return 1;
+}
+
+/*
+ * Handle a Tversion request: reset the connection, clunk every
+ * fid, negotiate msize and pick the protocol version.
+ * The connection must be in state CsInit on entry.
+ * A version of "9P" followed by a number >= 2000 brings the
+ * connection up; the private version "9PEoF" marks the connection
+ * moribund; anything else is answered with version "unknown" and
+ * leaves the state at CsNew.
+ * Returns 1 on success, 0 with an error set otherwise.
+ */
+static int
+rTversion(Msg* m)
+{
+	int v;
+	Con *con;
+	Fid *fid;
+	Fcall *r, *t;
+
+	t = &m->t;
+	r = &m->r;
+	con = m->con;
+
+	vtLock(con->lock);
+	if(con->state != CsInit){
+		vtUnlock(con->lock);
+		vtSetError("Tversion: down");
+		return 0;
+	}
+	con->state = CsNew;
+
+	/*
+	 * Release the karma of past lives and suffering.
+	 * (Clunk every fid still attached to the connection.)
+	 */
+	while(con->fhead != nil){
+		fid = fidGet(con, con->fhead->fidno, FidFWlock);
+		assert(fid == con->fhead);
+		fidClunk(fid);
+	}
+
+	if(t->tag != NOTAG){
+		vtUnlock(con->lock);
+		vtSetError("Tversion: invalid tag");
+		return 0;
+	}
+
+	if(t->msize < 256){
+		vtUnlock(con->lock);
+		vtSetError("Tversion: message size too small");
+		return 0;
+	}
+	/* reply with the smaller of the two message sizes */
+	if(t->msize < con->msize)
+		r->msize = t->msize;
+	else
+		r->msize = con->msize;
+
+	r->version = "unknown";
+	if(t->version[0] == '9' && t->version[1] == 'P'){
+		/*
+		 * Currently, the only defined version
+		 * is "9P2000"; ignore any later versions.
+		 */
+		v = strtol(&t->version[2], 0, 10);
+		if(v >= 2000){
+			r->version = VERSION9P;
+			con->msize = r->msize;
+			con->state = CsUp;
+		}
+		else if(strcmp(t->version, "9PEoF") == 0){
+			r->version = "9PEoF";
+			con->msize = r->msize;
+			con->state = CsMoribund;
+		}
+	}
+	vtUnlock(con->lock);
+
+	return 1;
+}
+
+/*
+ * Dispatch table of request handlers, indexed by 9P message type.
+ * Only the T-message types listed here have a handler; every other
+ * slot of the Tmax-sized array is left nil by the initialiser.
+ */
+int (*rFcall[Tmax])(Msg*) = {
+	[Tversion]	= rTversion,
+	[Tauth]		= rTauth,
+	[Tattach]	= rTattach,
+	[Tflush]	= rTflush,
+	[Twalk]		= rTwalk,
+	[Topen]		= rTopen,
+	[Tcreate]	= rTcreate,
+	[Tread]		= rTread,
+	[Twrite]	= rTwrite,
+	[Tclunk]	= rTclunk,
+	[Tremove]	= rTremove,
+	[Tstat]		= rTstat,
+	[Twstat]	= rTwstat,
+};

+ 109 - 0
sys/src/cmd/fossil/9ping.c

@@ -0,0 +1,109 @@
+#include <u.h>
+#include <libc.h>
+
+typedef uvlong u64int;
+
+#define TWID64	((u64int)~(u64int)0)
+
+
+/*
+ * Convert a number with an optional single-letter unit suffix
+ * (k, m or g, either case, meaning powers of 1024) to an integer.
+ * Returns TWID64 if s is nil or if anything follows the number
+ * and its optional suffix.
+ */
+u64int
+unittoull(char *s)
+{
+	char *end;
+	u64int val;
+
+	if(s == nil)
+		return TWID64;
+
+	val = strtoul(s, &end, 0);
+	switch(*end){
+	case 'k':
+	case 'K':
+		val *= 1024;
+		end++;
+		break;
+	case 'm':
+	case 'M':
+		val *= 1024*1024;
+		end++;
+		break;
+	case 'g':
+	case 'G':
+		val *= 1024*1024*1024;
+		end++;
+		break;
+	}
+	if(*end == '\0')
+		return val;
+	return TWID64;
+}
+
+/*
+ * 9ping: time n reads (or writes) of s bytes at offset 0 of a file
+ * (standard input if no file is given) and print the average,
+ * minimum, maximum and standard deviation in microseconds.
+ *
+ *	-n count	number of operations (default 1000)
+ *	-s size		bytes per operation, 1 to 1M, k/m/g suffix allowed
+ *	-r		read (default)
+ *	-w		write
+ */
+void
+main(int argc, char *argv[])
+{
+	int fd, i;
+	int n = 1000, m;
+	int s = 1;
+	char *arg;
+	u64int size;
+	double *t, t0, t1;
+	uchar *buf;
+	double a, d, max, min;
+
+	m = OREAD;
+	ARGBEGIN{
+	case 'n':
+		/* ARGF returns nil when the option has no argument */
+		arg = ARGF();
+		if(arg == nil)
+			sysfatal("missing count");
+		n = atoi(arg);
+		/* n must be positive and n*sizeof(double) must not overflow */
+		if(n < 1 || n > 0x7fffffff/(int)sizeof(double))
+			sysfatal("bad count");
+		break;
+	case 's':
+		/*
+		 * Range-check the full 64-bit value before narrowing
+		 * to int, so that e.g. 4g+1 is not accepted as 1.
+		 * unittoull maps a missing argument to TWID64.
+		 */
+		size = unittoull(ARGF());
+		if(size < 1 || size > 1024*1024)
+			sysfatal("bad size");
+		s = size;
+		break;
+	case 'r':
+		m = OREAD;
+		break;
+	case 'w':
+		m = OWRITE;
+		break;
+	}ARGEND
+
+	fd = 0;
+	if(argc == 1){
+		fd = open(argv[0], m);
+		if(fd < 0)
+			sysfatal("could not open file: %s: %r", argv[0]);
+	}
+
+	buf = malloc(s);
+	t = malloc(n*sizeof(double));
+	if(buf == nil || t == nil)
+		sysfatal("malloc: %r");
+	/* give writes defined contents rather than whatever malloc returned */
+	memset(buf, 0, s);
+
+	t0 = nsec();
+	for(i=0; i<n; i++){
+		if(m == OREAD){
+			if(pread(fd, buf, s, 0) < s)
+				sysfatal("bad read: %r");
+		}else{
+			if(pwrite(fd, buf, s, 0) < s)
+				sysfatal("bad write: %r");
+		}
+		t1 = nsec();
+		/* nanoseconds to microseconds */
+		t[i] = (t1 - t0)*1e-3;
+		t0 = t1;
+	}
+
+	a = 0.;
+	d = 0.;
+	max = 0.;
+	min = 1e12;
+
+	for(i=0; i<n; i++){
+		a += t[i];
+		if(max < t[i])
+			max = t[i];
+		if(min > t[i])
+			min = t[i];
+	}
+
+	a /= n;
+
+	/* population standard deviation */
+	for(i=0; i<n; i++)
+		d += (a - t[i]) * (a - t[i]);
+	d /= n;
+	d = sqrt(d);
+
+	print("avg = %.0fµs min = %.0fµs max = %.0fµs dev = %.0fµs\n", a, min, max, d);
+
+	exits(0);
+}
+

+ 415 - 0
sys/src/cmd/fossil/9proc.c

@@ -0,0 +1,415 @@
+#include "stdinc.h"
+
+#include "9.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Initial sizes and limits.
+ */
+enum {
+	NConInit	= 128,		/* growth increment of the connection arena */
+	NMsgInit	= 20,		/* initial mbox.maxmsg */
+	NMsgProcInit	= 4,		/* initial mbox.maxproc */
+	NMsizeInit	= 8192+IOHDRSZ,	/* initial message buffer size */
+};
+
+/*
+ * Arena of connections, protected by cbox.lock.
+ */
+static struct {
+	VtLock*	lock;
+	Con**	con;			/* arena */
+	int	ncon;			/* how many in arena */
+	int	hi;			/* high watermark */
+	int	cur;			/* hint for allocation */
+
+	u32int	msize;
+} cbox;
+
+/*
+ * Pool of messages and queue of work, protected by mbox.lock.
+ */
+static struct {
+	VtLock*	lock;
+
+	Msg*	free;			/* free list of messages */
+	VtRendez* alloc;		/* conProc sleeps here waiting for a free message */
+
+	Msg*	head;			/* queue of messages waiting for a msgProc */
+	Msg*	tail;
+	VtRendez* work;			/* msgProc sleeps here waiting for work */
+
+	int	maxmsg;			/* limit on allocated messages */
+	int	nmsg;			/* messages currently allocated */
+	int	maxproc;		/* limit on msgProc threads */
+	int	nproc;			/* msgProc threads started */
+
+	u32int	msize;			/* immutable */
+} mbox;
+
+/*
+ * Return a message to the pool.  If more messages are allocated
+ * than the current limit allows, the message is released instead.
+ * Otherwise it goes on the free list, waking a waiter if the list
+ * was empty.
+ */
+static void
+msgFree(Msg* m)
+{
+	int wasempty;
+
+	vtLock(mbox.lock);
+	if(mbox.nmsg <= mbox.maxmsg){
+		wasempty = (mbox.free == nil);
+		m->next = mbox.free;
+		mbox.free = m;
+		if(wasempty)
+			vtWakeup(mbox.alloc);
+	}
+	else{
+		vtMemFree(m->data);
+		vtMemFree(m);
+		mbox.nmsg--;
+	}
+	vtUnlock(mbox.lock);
+}
+
+/*
+ * Retire a connection: close its descriptor if still open and mark
+ * the arena slot CsDead so that conAlloc may reuse it.
+ * The connection must be idle and already in state CsMoribund.
+ */
+static void
+conFree(Con* con)
+{
+	int fd;
+
+	fd = con->fd;
+	if(fd >= 0){
+		close(fd);
+		con->fd = -1;
+	}
+
+	assert(con->version == nil);
+	assert(con->mhead == nil);
+	assert(con->nmsg == 0);
+	assert(con->nfid == 0);
+	assert(con->state == CsMoribund);
+
+	con->state = CsDead;
+}
+
+/*
+ * Worker thread: take messages off the mbox work queue, dispatch
+ * them through rFcall[] and write the reply to the connection.
+ * A Tversion waits for all other active messages on its connection
+ * to drain before it is dispatched.
+ * Message types without a handler (R-messages or undefined
+ * T-messages sent by a client) are answered with Rerror instead of
+ * being dispatched through a nil function pointer.
+ */
+static void
+msgProc(void*)
+{
+	int n;
+	Msg *m;
+	char *e;
+	Con *con;
+
+	vtThreadSetName("msg");
+
+	vtLock(mbox.lock);
+	while(mbox.nproc <= mbox.maxproc){
+		while(mbox.head == nil)
+			vtSleep(mbox.work);
+		m = mbox.head;
+		mbox.head = m->next;
+		m->next = nil;
+
+		e = nil;
+
+		con = m->con;
+		vtLock(con->lock);
+		assert(con->state != CsDead);
+		con->nmsg++;
+
+		if(m->t.type == Tversion){
+			/*
+			 * Take the connection down and wait for the
+			 * active list to empty; a later Tversion may
+			 * replace this one while it sleeps.
+			 */
+			con->version = m;
+			con->state = CsDown;
+			while(con->mhead != nil)
+				vtSleep(con->active);
+			assert(con->state == CsDown);
+			if(con->version == m){
+				con->version = nil;
+				con->state = CsInit;
+			}
+			else
+				e = "Tversion aborted";
+		}
+		else if(con->state != CsUp)
+			e = "connection not ready";
+
+		/*
+		 * Add Msg to end of active list.
+		 */
+		if(con->mtail != nil){
+			m->prev = con->mtail;
+			con->mtail->next = m;
+		}
+		else{
+			con->mhead = m;
+			m->prev = nil;
+		}
+		con->mtail = m;
+		m->next = nil;
+
+		vtUnlock(con->lock);
+		vtUnlock(mbox.lock);
+
+		/*
+		 * Dispatch if not error already.
+		 * The type comes from the client, so make sure there
+		 * really is a handler before calling through the table.
+		 */
+		m->r.tag = m->t.tag;
+		if(e == nil){
+			if(m->t.type >= Tmax || rFcall[m->t.type] == nil)
+				e = "unknown 9P message type";
+			else if(!(*rFcall[m->t.type])(m))
+				e = vtGetError();
+		}
+		if(e != nil){
+			m->r.type = Rerror;
+			m->r.ename = e;
+		}
+		else
+			m->r.type = m->t.type+1;
+
+		vtLock(con->lock);
+		/*
+		 * Remove Msg from active list.
+		 */
+		if(m->prev != nil)
+			m->prev->next = m->next;
+		else
+			con->mhead = m->next;
+		if(m->next != nil)
+			m->next->prev = m->prev;
+		else
+			con->mtail = m->prev;
+		m->prev = m->next = nil;
+		if(con->mhead == nil)
+			vtWakeup(con->active);
+
+		if(Dflag)
+			fprint(2, "msgProc: r %F\n", &m->r);
+
+		/* flushed messages and dead connections get no reply */
+		if((con->state == CsNew || con->state == CsUp) && !m->flush){
+			/*
+			 * TODO: optimise this copy away somehow for
+			 * read, stat, etc.
+			 */
+			n = convS2M(&m->r, con->data, con->msize);
+			assert(n != 0);
+			if(write(con->fd, con->data, n) != n){
+				if(con->fd >= 0){
+					close(con->fd);
+					con->fd = -1;
+				}
+			}
+		}
+
+		/* the last message of a moribund connection retires it */
+		con->nmsg--;
+		if(con->state == CsMoribund && con->nmsg == 0){
+			vtUnlock(con->lock);
+			conFree(con);
+		}
+		else
+			vtUnlock(con->lock);
+
+		/* put the message back on the free list */
+		vtLock(mbox.lock);
+		m->next = mbox.free;
+		mbox.free = m;
+		if(m->next == nil)
+			vtWakeup(mbox.alloc);
+	}
+	mbox.nproc--;
+	vtUnlock(mbox.lock);
+}
+
+/*
+ * Per-connection reader thread: obtain a Msg from the pool, read
+ * one 9P message from the connection into it, and queue it for a
+ * msgProc.  When reading fails, a Tversion with version "9PEoF" is
+ * queued in its place and the thread exits.
+ */
+static void
+conProc(void* v)
+{
+	Msg *m;
+	Con *con;
+	int eof, fd, n;
+
+	vtThreadSetName("con");
+
+	con = v;
+	if(Dflag)
+		fprint(2, "conProc: con->fd %d\n", con->fd);
+	fd = con->fd;
+	eof = 0;
+
+	vtLock(mbox.lock);
+	while(!eof){
+		/*
+		 * Get a message: take one from the free list, allocate
+		 * a new one if under the limit, or else sleep until
+		 * one is freed.
+		 */
+		while(mbox.free == nil){
+			if(mbox.nmsg >= mbox.maxmsg){
+				vtSleep(mbox.alloc);
+				continue;
+			}
+			m = vtMemAllocZ(sizeof(Msg));
+			m->data = vtMemAlloc(mbox.msize);
+			m->msize = mbox.msize;
+			mbox.nmsg++;
+			mbox.free = m;
+			break;
+		}
+		m = mbox.free;
+		mbox.free = m->next;
+		m->next = nil;
+		vtUnlock(mbox.lock);
+
+		m->con = con;
+		m->flush = 0;
+
+		/* a return of 0 from read9pmsg is ignored and the read retried */
+		while((n = read9pmsg(fd, m->data, con->msize)) == 0)
+			;
+		if(n < 0){
+			/* fake a Tversion that will mark the connection moribund */
+			m->t.type = Tversion;
+			m->t.fid = NOFID;
+			m->t.tag = NOTAG;
+			m->t.msize = con->msize;
+			m->t.version = "9PEoF";
+			eof = 1;
+		}
+		else if(convM2S(m->data, n, &m->t) != n){
+			/* unparseable message: drop it and read the next */
+			if(Dflag)
+				fprint(2, "conProc: convM2S error: %s\n",
+					con->name);
+			msgFree(m);
+			vtLock(mbox.lock);
+			continue;
+		}
+		if(Dflag)
+			fprint(2, "conProc: t %F\n", &m->t);
+
+		/*
+		 * Append to the work queue.  If the queue was empty,
+		 * wake a worker; if none was waiting and the limit
+		 * allows, start another msgProc.
+		 */
+		vtLock(mbox.lock);
+		if(mbox.head == nil){
+			mbox.head = m;
+			if(!vtWakeup(mbox.work) && mbox.nproc < mbox.maxproc){
+				if(vtThread(msgProc, nil) > 0)
+					mbox.nproc++;
+			}
+			vtWakeup(mbox.work);
+		}
+		else
+			mbox.tail->next = m;
+		mbox.tail = m;
+	}
+	vtUnlock(mbox.lock);
+}
+
+/*
+ * Allocate a connection for descriptor fd, reusing a dead arena
+ * slot when possible, and start its conProc reader thread.
+ * name, if not nil, is duplicated and kept for diagnostics.
+ * Returns the connection, or nil if the thread could not be
+ * started; in that case the slot is retired through conFree,
+ * which also closes fd.
+ */
+Con*
+conAlloc(int fd, char* name)
+{
+	Con *con;
+	int cur, i;
+
+	vtLock(cbox.lock);
+	cur = cbox.cur;
+	for(i = 0; i < cbox.hi; i++){
+		/*
+		 * Look for any unallocated or CsDead up to the
+		 * high watermark; cur is a hint where to start.
+		 * Wrap around the whole arena.
+		 */
+		if(cbox.con[cur] == nil || cbox.con[cur]->state == CsDead)
+			break;
+		if(++cur >= cbox.hi)
+			cur = 0;
+	}
+	if(i >= cbox.hi){
+		/*
+		 * None found.
+		 * If the high watermark is up to the limit of those
+		 * allocated, increase the size of the arena.
+		 * Bump up the watermark and take the next.
+		 */
+		if(cbox.hi >= cbox.ncon){
+			cbox.con = vtMemRealloc(cbox.con,
+					(cbox.ncon+NConInit)*sizeof(Con*));
+			memset(&cbox.con[cbox.ncon], 0, NConInit*sizeof(Con*));
+			cbox.ncon += NConInit;
+		}
+		cur = cbox.hi++;
+	}
+
+	/*
+	 * Do one-time initialisation if necessary.
+	 * Put back a new hint.
+	 * Do specific initialisation and start the proc.
+	 */
+	con = cbox.con[cur];
+	if(con == nil){
+		con = vtMemAllocZ(sizeof(Con));
+		con->lock = vtLockAlloc();
+		con->data = vtMemAlloc(cbox.msize);
+		con->msize = cbox.msize;
+		con->active = vtRendezAlloc(con->lock);
+		con->fidlock = vtLockAlloc();
+		cbox.con[cur] = con;
+	}
+	assert(con->mhead == nil);
+	assert(con->nmsg == 0);
+	assert(con->fhead == nil);
+	assert(con->nfid == 0);
+
+	con->state = CsNew;
+
+	if(++cur >= cbox.hi)
+		cur = 0;
+	cbox.cur = cur;
+
+	con->fd = fd;
+	if(con->name != nil){
+		vtMemFree(con->name);
+		con->name = nil;
+	}
+	if(name != nil)
+		con->name = vtStrDup(name);
+	con->aok = 0;
+	vtUnlock(cbox.lock);
+
+	if(vtThread(conProc, con) < 0){
+		/*
+		 * conFree insists on a moribund connection; no thread
+		 * is using this one, so it can be marked directly.
+		 * Without this the CsNew state trips conFree's assert.
+		 */
+		con->state = CsMoribund;
+		conFree(con);
+		return nil;
+	}
+
+	return con;
+}
+
+/*
+ * Console command "msg": optionally set the limits on allocated
+ * messages (-m) and msgProc threads (-p), then print the values
+ * in force.  Both limits must be positive decimal, octal or hex
+ * integers.  Returns 1 on success, or the result of cliError on a
+ * usage error.
+ */
+static int
+cmdMsg(int argc, char* argv[])
+{
+	char *arg, *p;
+	int maxmsg, maxproc;
+	char *usage = "usage: msg [-m nmsg] [-p nproc]";
+
+	maxmsg = maxproc = 0;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'm':
+		/*
+		 * Parse the string ARGF returns, not argv[0]: for the
+		 * attached form (-m20) argv[0] is still "-m20".
+		 */
+		arg = ARGF();
+		if(arg == nil)
+			return cliError(usage);
+		maxmsg = strtol(arg, &p, 0);
+		if(maxmsg <= 0 || p == arg || *p != '\0')
+			return cliError(usage);
+		break;
+	case 'p':
+		arg = ARGF();
+		if(arg == nil)
+			return cliError(usage);
+		maxproc = strtol(arg, &p, 0);
+		if(maxproc <= 0 || p == arg || *p != '\0')
+			return cliError(usage);
+		break;
+	}ARGEND
+	if(argc)
+		return cliError(usage);
+
+	/* 0 means "not given": leave that limit alone and just report it */
+	vtLock(mbox.lock);
+	if(maxmsg)
+		mbox.maxmsg = maxmsg;
+	maxmsg = mbox.maxmsg;
+	if(maxproc)
+		mbox.maxproc = maxproc;
+	maxproc = mbox.maxproc;
+	vtUnlock(mbox.lock);
+
+	consPrint("\tmsg -m %d -p %d\n", maxmsg, maxproc);
+
+	return 1;
+}
+
+/*
+ * One-time initialisation of the connection arena and the message
+ * pool, and registration of the "msg" console command.
+ */
+void
+procInit(void)
+{
+	/* connection arena: empty until the first conAlloc */
+	cbox.lock = vtLockAlloc();
+	cbox.con = nil;
+	cbox.ncon = 0;
+	cbox.msize = NMsizeInit;
+
+	/* message pool and work queue share one lock */
+	mbox.lock = vtLockAlloc();
+	mbox.alloc = vtRendezAlloc(mbox.lock);
+	mbox.work = vtRendezAlloc(mbox.lock);
+	mbox.msize = NMsizeInit;
+	mbox.maxmsg = NMsgInit;
+	mbox.maxproc = NMsgProcInit;
+
+	cliAddCmd("msg", cmdMsg);
+}

+ 195 - 0
sys/src/cmd/fossil/9srv.c

@@ -0,0 +1,195 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * One posted service: an entry created in /srv by the "srv"
+ * console command.
+ */
+typedef struct Srv Srv;
+typedef struct Srv {
+	int	fd;
+	int	srvfd;			/* open descriptor of the /srv file, or -1 */
+	char*	service;		/* service name */
+	char*	mntpnt;
+
+	Srv*	next;
+	Srv*	prev;
+} Srv;
+
+/*
+ * Doubly-linked list of posted services, protected by sbox.lock.
+ */
+static struct {
+	VtLock*	lock;
+
+	Srv*	head;
+	Srv*	tail;
+} sbox;
+
+/*
+ * Post descriptor fd under the given name and mode: create the
+ * file in /srv (falling back to #s) and write the descriptor
+ * number into it.
+ * The file is created ORCLOSE and the returned descriptor is left
+ * open, so the entry goes away when the descriptor is closed or
+ * the process exits.
+ * Returns the open srv descriptor, or -1 with the error set.
+ */
+static int
+srvFd(char* name, int mode, int fd)
+{
+	int len, sfd;
+	char num[10], path[VtMaxStringSize];
+
+	snprint(path, sizeof(path), "/srv/%s", name);
+	sfd = create(path, ORCLOSE|OWRITE, mode);
+	if(sfd < 0){
+		/* /srv may not be bound; try the device directly */
+		snprint(path, sizeof(path), "#s/%s", name);
+		sfd = create(path, ORCLOSE|OWRITE, mode);
+	}
+	if(sfd < 0){
+		vtSetError("create %s: %r", path);
+		return -1;
+	}
+
+	len = snprint(num, sizeof(num), "%d", fd);
+	if(write(sfd, num, len) < 0){
+		close(sfd);
+		vtSetError("write %s: %r", path);
+		return -1;
+	}
+
+	return sfd;
+}
+
+/*
+ * Unlink srv from the service list, close its srv descriptor if
+ * there is one, and free it.  The caller holds sbox.lock.
+ */
+static void
+srvFree(Srv* srv)
+{
+	Srv *before, *after;
+
+	before = srv->prev;
+	after = srv->next;
+
+	if(before == nil)
+		sbox.head = after;
+	else
+		before->next = after;
+
+	if(after == nil)
+		sbox.tail = before;
+	else
+		after->prev = before;
+
+	if(srv->srvfd != -1)
+		close(srv->srvfd);
+	vtMemFree(srv->service);
+	vtMemFree(srv->mntpnt);
+	vtMemFree(srv);
+}
+
+/*
+ * Post fd as a new service and append it to the service list.
+ * Fails if a service of the same name is already being served or
+ * if the srv file cannot be created.  On success fd has been
+ * closed here; on failure it is left open for the caller.
+ * Returns the new Srv, or nil with the error set.
+ */
+static Srv*
+srvAlloc(char* service, int mode, int fd)
+{
+	Srv *srv;
+	int srvfd;
+
+	vtLock(sbox.lock);
+
+	srv = sbox.head;
+	while(srv != nil && strcmp(srv->service, service) != 0)
+		srv = srv->next;
+	if(srv != nil){
+		vtSetError("srv: already serving '%s'", service);
+		vtUnlock(sbox.lock);
+		return nil;
+	}
+
+	srvfd = srvFd(service, mode, fd);
+	if(srvfd < 0){
+		vtUnlock(sbox.lock);
+		return nil;
+	}
+	close(fd);
+
+	srv = vtMemAllocZ(sizeof(Srv));
+	srv->srvfd = srvfd;
+	srv->service = vtStrDup(service);
+
+	/* append to the tail of the list */
+	srv->prev = sbox.tail;
+	if(sbox.tail == nil)
+		sbox.head = srv;
+	else
+		sbox.tail->next = srv;
+	sbox.tail = srv;
+
+	vtUnlock(sbox.lock);
+
+	return srv;
+}
+
+/*
+ * Console command "srv".
+ *	srv			list posted services
+ *	srv -d service		remove a posted service
+ *	srv [-p] service	post a new service; with -p the service
+ *				is created mode 0600 and attached to a
+ *				console via consOpen, otherwise it is
+ *				mode 0666 and served as a 9P connection
+ * Returns non-zero on success, 0 with the error set on failure.
+ */
+static int
+cmdSrv(int argc, char* argv[])
+{
+	Srv *srv;
+	int dflag, fd[2], mode, pflag, r;
+	char *usage = "usage: srv [-dp] [service]";
+
+	dflag = pflag = 0;
+	mode = 0666;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'd':
+		dflag = 1;
+		break;
+	case 'p':
+		pflag = 1;
+		mode = 0600;
+		break;
+	}ARGEND
+
+	switch(argc){
+	default:
+		return cliError(usage);
+	case 0:
+		vtRLock(sbox.lock);
+		for(srv = sbox.head; srv != nil; srv = srv->next)
+			consPrint("\t%s\t%d\n", srv->service, srv->srvfd);
+		vtRUnlock(sbox.lock);
+
+		return 1;
+	case 1:
+		/* without -d fall out of the switch and post the service */
+		if(!dflag)
+			break;
+
+		vtLock(sbox.lock);
+		for(srv = sbox.head; srv != nil; srv = srv->next){
+			if(strcmp(srv->service, argv[0]) != 0)
+				continue;
+			srvFree(srv);
+			break;
+		}
+		vtUnlock(sbox.lock);
+
+		/* srv is only compared against nil here, never dereferenced */
+		if(srv == nil){
+			vtSetError("srv: '%s' not found", argv[0]);
+			return 0;
+		}
+
+		return 1;
+	}
+
+	/* one end of the pipe is posted in /srv, the other is served here */
+	if(pipe(fd) < 0){
+		vtSetError("srv pipe: %r");
+		return 0;
+	}
+	if((srv = srvAlloc(argv[0], mode, fd[0])) == nil){
+		close(fd[0]); close(fd[1]);
+		return 0;
+	}
+
+	if(pflag)
+		r = consOpen(fd[1], srv->srvfd, -1);
+	else
+		r = (conAlloc(fd[1], argv[0]) != nil);
+	if(r == 0){
+		close(fd[1]);
+		vtLock(sbox.lock);
+		srvFree(srv);
+		vtUnlock(sbox.lock);
+	}
+
+	return r;
+}
+
+/*
+ * Allocate the service list lock and register the "srv" console
+ * command.  Always returns 1.
+ */
+int
+srvInit(void)
+{
+	sbox.lock = vtLockAlloc();
+
+	cliAddCmd("srv", cmdSrv);
+
+	return 1;
+}

+ 960 - 0
sys/src/cmd/fossil/9user.c

@@ -0,0 +1,960 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+enum {
+	NUserHash	= 1009,		/* number of buckets in each hash table */
+};
+
+typedef struct Ubox Ubox;
+typedef struct User User;
+
+/*
+ * One line of the users file: uid:uname:leader:member,member,...
+ */
+typedef struct User {
+	char*	uid;
+	char*	uname;
+	char*	leader;			/* uname of group leader, or nil */
+	char**	group;			/* unames of members, ngroup entries */
+	int	ngroup;
+
+	User*	next;			/* list of all users, kept sorted by .uid */
+	User*	ihash;			/* lookup by .uid */
+	User*	nhash;			/* lookup by .uname */
+} User;
+
+#pragma varargck type "U"   User*
+
+/*
+ * A complete user database.
+ */
+typedef struct Ubox {
+	User*	head;
+	User*	tail;
+	char*	name;
+	int	nuser;
+	int	len;			/* size in bytes of the database in file form */
+
+	User*	ihash[NUserHash];	/* lookup by .uid */
+	User*	nhash[NUserHash];	/* lookup by .uname */
+} Ubox;
+
+/*
+ * The database currently in force, protected by ubox.lock.
+ */
+static struct {
+	VtLock*	lock;
+
+	Ubox*	box;
+} ubox;
+
+/*
+ * Built-in users file, in the same format as the file on disk.
+ */
+static char usersDefault[] = {
+	"adm:adm:adm:sys\n"
+	"none:none::\n"
+	"noworld:noworld::\n"
+	"sys:sys::\n"
+};
+
+/*
+ * Uids that uboxInit requires every database to contain,
+ * each with uname equal to uid.
+ */
+static char* usersMandatory[] = {
+	"adm",
+	"none",
+	"noworld",
+	"sys",
+	nil,
+};
+
+char* uidadm = "adm";
+char* unamenone = "none";
+char* uidnoworld = "noworld";
+
+/*
+ * Hash a uid or uname into a bucket index in [0, NUserHash).
+ */
+static u32int
+userHash(char* s)
+{
+	u32int h;
+	uchar *c;
+
+	h = 0;
+	c = (uchar*)s;
+	while(*c != '\0'){
+		h = h*7 + *c;
+		c++;
+	}
+
+	return h % NUserHash;
+}
+
+/*
+ * Find the user with the given uid in box.
+ * Returns nil with the error set if box is nil or the uid is
+ * not present.  The caller is responsible for locking.
+ */
+static User*
+_userByUid(Ubox* box, char* uid)
+{
+	User *u;
+
+	u = nil;
+	if(box != nil)
+		u = box->ihash[userHash(uid)];
+	while(u != nil && strcmp(u->uid, uid) != 0)
+		u = u->ihash;
+
+	if(u == nil)
+		vtSetError("uname: uid '%s' not found", uid);
+	return u;
+}
+
+/*
+ * Map a uid to its uname.
+ * Returns a freshly duplicated string owned by the caller, or nil
+ * if the uid is unknown.
+ */
+char*
+unameByUid(char* uid)
+{
+	User *u;
+	char *uname;
+
+	uname = nil;
+	vtRLock(ubox.lock);
+	u = _userByUid(ubox.box, uid);
+	if(u != nil)
+		uname = vtStrDup(u->uname);
+	vtRUnlock(ubox.lock);
+
+	return uname;
+}
+
+/*
+ * Find the user with the given uname in box.
+ * Returns nil with the error set if box is nil or the uname is
+ * not present.  The caller is responsible for locking.
+ */
+static User*
+_userByUname(Ubox* box, char* uname)
+{
+	User *u;
+
+	u = nil;
+	if(box != nil)
+		u = box->nhash[userHash(uname)];
+	while(u != nil && strcmp(u->uname, uname) != 0)
+		u = u->nhash;
+
+	if(u == nil)
+		vtSetError("uname: uname '%s' not found", uname);
+	return u;
+}
+
+/*
+ * Map a uname to its uid.
+ * Returns a freshly duplicated string owned by the caller, or nil
+ * if the uname is unknown.
+ */
+char*
+uidByUname(char* uname)
+{
+	User *u;
+	char *uid;
+
+	uid = nil;
+	vtRLock(ubox.lock);
+	u = _userByUname(ubox.box, uname);
+	if(u != nil)
+		uid = vtStrDup(u->uid);
+	vtRUnlock(ubox.lock);
+
+	return uid;
+}
+
+/*
+ * Is 'member' (a uname) a member of 'group' (a uid)?
+ * Every user is a member of their own group.
+ * If the group does not exist the answer is whenNoGroup;
+ * if the member does not exist the answer is 0.
+ * The caller is responsible for locking.
+ */
+static int
+_groupMember(Ubox* box, char* group, char* member, int whenNoGroup)
+{
+	int i;
+	User *g, *m;
+
+	g = _userByUid(box, group);
+	if(g == nil)
+		return whenNoGroup;
+
+	m = _userByUname(box, member);
+	if(m == nil)
+		return 0;
+	if(m == g)
+		return 1;
+
+	i = 0;
+	while(i < g->ngroup && strcmp(g->group[i], member) != 0)
+		i++;
+	return i < g->ngroup;
+}
+
+/*
+ * May 'uname' write to the file system at all?
+ *
+ * When a group called ``write'' exists, only its members may
+ * write, whatever the permission bits say; to everyone else the
+ * file system looks read-only.  This is how
+ * sources.cs.bell-labs.com is served to the world.
+ *
+ * When there is no ``write'' group everyone counts as a member,
+ * which is the opposite of what groupMember answers for a
+ * missing group.
+ *
+ * Should the lookup prove too slow on systems that do not use the
+ * feature, the result of the write group lookup could be cached.
+ */
+int
+groupWriteMember(char* uname)
+{
+	int ismember;
+
+	vtRLock(ubox.lock);
+	ismember = _groupMember(ubox.box, "write", uname, 1);
+	vtRUnlock(ubox.lock);
+
+	return ismember;
+}
+
+/*
+ * Remove 'member' (a uname) from the member list of group g and
+ * adjust box->len by the bytes the entry occupied in file form
+ * (the name plus a separating comma when it was not the only one).
+ * Returns 1 on success; returns 0 with the error set if the
+ * member does not exist or is not listed in the group.
+ * The caller is responsible for locking.
+ */
+static int
+_groupRemMember(Ubox* box, User* g, char* member)
+{
+	int i;
+
+	if(_userByUname(box, member) == nil)
+		return 0;
+
+	for(i = 0; i < g->ngroup; i++){
+		if(strcmp(g->group[i], member) == 0)
+			break;
+	}
+	if(i >= g->ngroup){
+		if(strcmp(g->uname, member) == 0)
+			vtSetError("uname: '%s' always in own group", member);
+		else
+			vtSetError("uname: '%s' not in group '%s'",
+				member, g->uname);
+		return 0;
+	}
+
+	vtMemFree(g->group[i]);
+
+	box->len -= strlen(member);
+	if(g->ngroup > 1)
+		box->len--;
+	g->ngroup--;
+
+	if(g->ngroup == 0){
+		vtMemFree(g->group);
+		g->group = nil;
+		return 1;
+	}
+
+	/*
+	 * Close the gap before shrinking the array.  This must also
+	 * happen when only one member remains, otherwise removing
+	 * the first of two members would leave group[0] pointing at
+	 * the freed name and lose the surviving one.
+	 */
+	for(; i < g->ngroup; i++)
+		g->group[i] = g->group[i+1];
+	g->group = vtMemRealloc(g->group, (g->ngroup)*sizeof(char*));
+
+	return 1;
+}
+
+/*
+ * Append 'member' (a uname) to the member list of group g and
+ * add to box->len the bytes the entry takes in file form (the
+ * name plus a separating comma when it is not the first).
+ * Returns 1 on success; returns 0 with the error set if the
+ * member does not exist or already belongs to the group.
+ * The caller is responsible for locking.
+ */
+static int
+_groupAddMember(Ubox* box, User* g, char* member)
+{
+	int n;
+	User *u;
+
+	u = _userByUname(box, member);
+	if(u == nil)
+		return 0;
+
+	if(_groupMember(box, g->uid, u->uname, 0)){
+		if(strcmp(g->uname, member) != 0)
+			vtSetError("uname: '%s' already in group '%s'",
+				member, g->uname);
+		else
+			vtSetError("uname: '%s' always in own group", member);
+		return 0;
+	}
+
+	n = g->ngroup;
+	g->group = vtMemRealloc(g->group, (n+1)*sizeof(char*));
+	g->group[n] = vtStrDup(member);
+	g->ngroup = n+1;
+
+	box->len += strlen(member);
+	if(n > 0)
+		box->len++;
+
+	return 1;
+}
+
+/*
+ * Is 'member' (a uname) a member of 'group' (a uid)?
+ * A nil or unknown group has no members.
+ */
+int
+groupMember(char* group, char* member)
+{
+	int r;
+
+	r = 0;
+	if(group != nil){
+		vtRLock(ubox.lock);
+		r = _groupMember(ubox.box, group, member, 0);
+		vtRUnlock(ubox.lock);
+	}
+
+	return r;
+}
+
+/*
+ * Is 'member' (a uname) the leader of 'group' (a uid)?
+ * Uname 'none' is never a leader.  A group with an explicit
+ * leader is led by that user alone; a group without one is led
+ * by every one of its members.
+ */
+int
+groupLeader(char* group, char* member)
+{
+	int r;
+	User *g;
+
+	if(strcmp(member, unamenone) == 0 || group == nil)
+		return 0;
+
+	r = 0;
+	vtRLock(ubox.lock);
+	g = _userByUid(ubox.box, group);
+	if(g != nil){
+		if(g->leader == nil)
+			r = _groupMember(ubox.box, group, member, 0);
+		else if(strcmp(g->leader, member) == 0)
+			r = 1;
+	}
+	vtRUnlock(ubox.lock);
+
+	return r;
+}
+
+/*
+ * Free a User together with every string it owns.
+ */
+static void
+userFree(User* u)
+{
+	int i;
+
+	vtMemFree(u->uid);
+	vtMemFree(u->uname);
+	if(u->leader != nil)
+		vtMemFree(u->leader);
+
+	if(u->ngroup != 0){
+		i = 0;
+		while(i < u->ngroup){
+			vtMemFree(u->group[i]);
+			i++;
+		}
+		vtMemFree(u->group);
+	}
+
+	vtMemFree(u);
+}
+
+/*
+ * Allocate a zeroed User holding private copies of uid and uname.
+ * The leader and the member list start out empty.
+ */
+static User*
+userAlloc(char* uid, char* uname)
+{
+	User *nu;
+
+	nu = vtMemAllocZ(sizeof(User));
+	nu->uname = vtStrDup(uname);
+	nu->uid = vtStrDup(uid);
+
+	return nu;
+}
+
+/*
+ * A user name is valid if it contains none of the characters
+ * that are significant in the users file: # : , ( )
+ * Returns 1 if valid, 0 otherwise.
+ */
+int
+validUserName(char* name)
+{
+	int i;
+	static Rune invalid[] = L"#:,()";
+
+	for(i = 0; invalid[i] != '\0'; i++){
+		if(utfrune(name, invalid[i]) != nil)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Format routine for a User*: prints the user in users-file
+ * form, uid:uname:leader:member,member,... without a newline.
+ * Names are always printed through %s and never used as the
+ * format itself, since validUserName does not exclude '%'.
+ * Returns the sum of the fmtprint results.
+ */
+static int
+userFmt(Fmt* fmt)
+{
+	User *u;
+	int i, r;
+
+	u = va_arg(fmt->args, User*);
+
+	r = fmtprint(fmt, "%s:%s:", u->uid, u->uname);
+	if(u->leader != nil)
+		r += fmtprint(fmt, "%s", u->leader);
+	r += fmtprint(fmt, ":");
+	if(u->ngroup){
+		r += fmtprint(fmt, "%s", u->group[0]);
+		for(i = 1; i < u->ngroup; i++)
+			r += fmtprint(fmt, ",%s", u->group[i]);
+	}
+
+	return r;
+}
+
+/*
+ * Write the database in box to /active/adm/users on the "main"
+ * file system, creating the adm directory and the users file if
+ * they do not exist and truncating the file first.
+ * Returns the result of fileWrite, or 0 if the file system or the
+ * file could not be obtained.
+ */
+static int
+usersFileWrite(Ubox* box)
+{
+	Fs *fs;
+	User *u;
+	int i, r;
+	Fsys *fsys;
+	char *p, *q, *s;
+	File *dir, *file;
+
+	if((fsys = fsysGet("main")) == nil)
+		return 0;
+	fsysFsRlock(fsys);
+	fs = fsysGetFs(fsys);
+
+	/*
+	 * BUG:
+	 * 	the owner/group/permissions need to be thought out.
+	 */
+	r = 0;
+	if((dir = fileOpen(fs, "/active")) == nil)
+		goto tidy0;
+	if((file = fileWalk(dir, "adm")) == nil)
+		file = fileCreate(dir, "adm", ModeDir|0775, uidadm);
+	fileDecRef(dir);
+	if(file == nil)
+		goto tidy;
+	dir = file;
+	if((file = fileWalk(dir, "users")) == nil)
+		file = fileCreate(dir, "users", 0664, uidadm);
+	fileDecRef(dir);
+	if(file == nil)
+		goto tidy;
+	if(!fileTruncate(file, uidadm))
+		goto tidy;
+
+	/*
+	 * box->len is the exact size of the text; the extra byte
+	 * is for the terminating NUL written by snprint.
+	 * Names go through %s and are never used as the format
+	 * itself, since validUserName does not exclude '%'.
+	 */
+	p = s = vtMemAlloc(box->len+1);
+	q = p + box->len+1;
+	for(u = box->head; u != nil; u = u->next){
+		p += snprint(p, q-p, "%s:%s:", u->uid, u->uname);
+		if(u->leader != nil)
+			p += snprint(p, q-p, "%s", u->leader);
+		p += snprint(p, q-p, ":");
+		if(u->ngroup){
+			p += snprint(p, q-p, "%s", u->group[0]);
+			for(i = 1; i < u->ngroup; i++)
+				p += snprint(p, q-p, ",%s", u->group[i]);
+		}
+		p += snprint(p, q-p, "\n");
+	}
+	r = fileWrite(file, s, box->len, 0, uidadm);
+	vtMemFree(s);
+
+tidy:
+	if(file != nil)
+		fileDecRef(file);
+tidy0:
+	fsysFsRUnlock(fsys);
+	fsysPut(fsys);
+
+	return r;
+}
+
+/*
+ * Unlink user u from both hash tables and from the sorted list of
+ * box, and subtract from box->len the bytes of its uid, uname and
+ * the four fixed characters of its line.  The user must be in the
+ * box.  The User itself is not freed, and the bytes accounted for
+ * its leader and members are not subtracted here.
+ */
+static void
+uboxRemUser(Ubox* box, User *u)
+{
+	User **h, *up;
+
+	/* h trails behind up and always points at the link to rewrite */
+	h = &box->ihash[userHash(u->uid)];
+	for(up = *h; up != nil && up != u; up = up->ihash)
+		h = &up->ihash;
+	assert(up == u);
+	*h = up->ihash;
+	box->len -= strlen(u->uid);
+
+	h = &box->nhash[userHash(u->uname)];
+	for(up = *h; up != nil && up != u; up = up->nhash)
+		h = &up->nhash;
+	assert(up == u);
+	*h = up->nhash;
+	box->len -= strlen(u->uname);
+
+	/* the list is searched by uid rather than by pointer */
+	h = &box->head;
+	for(up = *h; up != nil && strcmp(up->uid, u->uid) != 0; up = up->next)
+		h = &up->next;
+	assert(up == u);
+	*h = u->next;
+	u->next = nil;
+
+	/* three ':' separators and the newline */
+	box->len -= 4;
+	box->nuser--;
+}
+
+/*
+ * Enter user u in both hash tables of box and insert it into the
+ * list of users, which is kept sorted by uid.  box->len grows by
+ * the bytes of the uid, the uname and the four fixed characters
+ * of the user's line (three ':' and the newline).
+ */
+static void
+uboxAddUser(Ubox* box, User* u)
+{
+	User **link;
+
+	/* push on the front of the uid hash chain */
+	link = &box->ihash[userHash(u->uid)];
+	u->ihash = *link;
+	*link = u;
+
+	/* push on the front of the uname hash chain */
+	link = &box->nhash[userHash(u->uname)];
+	u->nhash = *link;
+	*link = u;
+
+	/* insert before the first user whose uid does not sort lower */
+	link = &box->head;
+	while(*link != nil && strcmp((*link)->uid, u->uid) < 0)
+		link = &(*link)->next;
+	u->next = *link;
+	*link = u;
+
+	box->len += strlen(u->uid);
+	box->len += strlen(u->uname);
+	box->len += 4;
+	box->nuser++;
+}
+
+/*
+ * Print the user count, the file-form length and every user of
+ * box on the console.
+ */
+static void
+uboxDump(Ubox* box)
+{
+	User *cur;
+
+	consPrint("nuser %d len = %d\n", box->nuser, box->len);
+
+	cur = box->head;
+	while(cur != nil){
+		consPrint("%U\n", cur);
+		cur = cur->next;
+	}
+}
+
+/*
+ * Free a Ubox together with every user on its list and its name.
+ */
+static void
+uboxFree(Ubox* box)
+{
+	User *cur, *following;
+
+	cur = box->head;
+	while(cur != nil){
+		/* fetch the link before the user is freed */
+		following = cur->next;
+		userFree(cur);
+		cur = following;
+	}
+
+	if(box->name != nil)
+		vtMemFree(box->name);
+	vtMemFree(box);
+}
+
+/*
+ * Build a new user database from the text of a users file and,
+ * if it passes all checks, install it as the database in force.
+ *
+ * name is recorded in the new Ubox; when it is not nil the new
+ * database is also written back through usersFileWrite.
+ * users is the NUL-terminated text; len sizes the working copy
+ * (presumably the length of users - confirm against callers).
+ *
+ * Lines that are malformed, use invalid names or duplicate an
+ * earlier uid or uname are reported on standard error and
+ * ignored.  Returns 1 on success; returns 0 if the text cannot be
+ * split into lines or a mandatory user is missing or has
+ * differing uid and uname.
+ */
+static int
+uboxInit(char* name, char* users, int len)
+{
+	User *g, *u;
+	Ubox *box, *obox;
+	int blank, comment, i, nline, nuser;
+	char *buf, *f[5], **line, *p, *q, *s;
+
+	/*
+	 * Strip out whitespace and comments.
+	 * Note that comments are pointless, they disappear
+	 * when the server writes the database back out.
+	 */
+	blank = 1;
+	comment = nuser = 0;
+
+	s = p = buf = vtMemAlloc(len+1);
+	for(q = users; *q != '\0'; q++){
+		if(*q == '\r' || *q == '\t' || *q == ' ')
+			continue;
+		if(*q == '\n'){
+			if(!blank){
+				/* p != s: something other than a comment was copied */
+				if(p != s){
+					*p++ = '\n';
+					nuser++;
+					s = p;
+				}
+				blank = 1;
+			}
+			comment = 0;
+			continue;
+		}
+		if(*q == '#')
+			comment = 1;
+		blank = 0;
+		if(!comment)
+			*p++ = *q;
+	}
+	*p = '\0';
+
+	line = vtMemAllocZ((nuser+2)*sizeof(char*));
+	if((i = gettokens(buf, line, nuser+2, "\n")) != nuser){
+		fprint(2, "nuser %d (%d) botch\n", nuser, i);
+		vtMemFree(line);
+		vtMemFree(buf);
+		return 0;
+	}
+
+	fprint(2, "nuser %d\n", nuser);
+
+	/*
+	 * Everything is updated in a local Ubox until verified.
+	 */
+	box = vtMemAllocZ(sizeof(Ubox));
+	if(name != nil)
+		box->name = vtStrDup(name);
+
+	/*
+	 * First pass - check format, check for duplicates
+	 * and enter in hash buckets.
+	 * Accepted lines are compacted to the front of line[] and
+	 * counted in nuser, so that rejected lines are really
+	 * skipped and the second pass only sees users that exist.
+	 */
+	nline = nuser;
+	nuser = 0;
+	for(i = 0; i < nline; i++){
+		s = vtStrDup(line[i]);
+		if(getfields(s, f, nelem(f), 0, ":") != 4){
+			fprint(2, "bad line '%s'\n", line[i]);
+			vtMemFree(s);
+			continue;
+		}
+		if(*f[0] == '\0' || *f[1] == '\0'){
+			fprint(2, "bad line '%s'\n", line[i]);
+			vtMemFree(s);
+			continue;
+		}
+		if(!validUserName(f[0])){
+			fprint(2, "invalid uid '%s'\n", f[0]);
+			vtMemFree(s);
+			continue;
+		}
+		if(_userByUid(box, f[0]) != nil){
+			fprint(2, "duplicate uid '%s'\n", f[0]);
+			vtMemFree(s);
+			continue;
+		}
+		if(!validUserName(f[1])){
+			fprint(2, "invalid uname '%s'\n", f[1]);
+			vtMemFree(s);
+			continue;
+		}
+		if(_userByUname(box, f[1]) != nil){
+			fprint(2, "duplicate uname '%s'\n", f[1]);
+			vtMemFree(s);
+			continue;
+		}
+
+		u = userAlloc(f[0], f[1]);
+		uboxAddUser(box, u);
+		line[nuser++] = line[i];
+
+		vtMemFree(s);
+	}
+	assert(box->nuser == nuser);
+
+	/*
+	 * Second pass - fill in leader and group information.
+	 */
+	for(i = 0; i < nuser; i++){
+		s = vtStrDup(line[i]);
+		getfields(s, f, nelem(f), 0, ":");
+
+		g = _userByUname(box, f[1]);
+		assert(g != nil);
+		if(*f[2] != '\0'){
+			/* an unknown leader is replaced by the group itself */
+			if((u = _userByUname(box, f[2])) == nil)
+				g->leader = vtStrDup(g->uname);
+			else
+				g->leader = vtStrDup(u->uname);
+			box->len += strlen(g->leader);
+		}
+		for(p = f[3]; p != nil; p = q){
+			if((q = utfrune(p, L',')) != nil)
+				*q++ = '\0';
+			if(!_groupAddMember(box, g, p)){
+				// print/log error here
+			}
+		}
+
+		vtMemFree(s);
+	}
+
+	vtMemFree(line);
+	vtMemFree(buf);
+
+	for(i = 0; usersMandatory[i] != nil; i++){
+		if((u = _userByUid(box, usersMandatory[i])) == nil){
+			vtSetError("user '%s' is mandatory", usersMandatory[i]);
+			uboxFree(box);
+			return 0;
+		}
+		if(strcmp(u->uid, u->uname) != 0){
+			vtSetError("uid/uname for user '%s' must match",
+				usersMandatory[i]);
+			uboxFree(box);
+			return 0;
+		}
+	}
+
+	/* swap in the new database; the old one is freed outside the lock */
+	vtLock(ubox.lock);
+	if(name != nil && usersFileWrite(box) == 0){
+		/*
+		 * What to do here? How much whining?
+		 */
+	}
+	obox = ubox.box;
+	ubox.box = box;
+	vtUnlock(ubox.lock);
+
+	if(obox != nil)
+		uboxFree(obox);
+
+	return 1;
+}
+
+/*
+ * Read the file at path from the "main" file system and hand its
+ * contents to uboxInit as the new user table.
+ * Returns the result of uboxInit, or 0 if the file system, the file,
+ * its size or its contents could not be obtained.
+ */
+static int
+usersFileRead(char* path)
+{
+	char *p;
+	File *file;
+	Fsys *fsys;
+	int len, r;
+	uvlong size;
+
+	if((fsys = fsysGet("main")) == nil)
+		return 0;
+	fsysFsRlock(fsys);
+
+	r = 0;
+	if((file = fileOpen(fsysGetFs(fsys), path)) != nil){
+		if(fileGetSize(file, &size)){
+			len = size;
+			p = vtMemAlloc(size+1);
+			if(fileRead(file, p, len, 0) == len){
+				p[len] = '\0';
+				r = uboxInit(path, p, len);
+			}
+			/*
+			 * uboxInit works on its own copy of the text,
+			 * so the read buffer is released here on every path.
+			 */
+			vtMemFree(p);
+		}
+		fileDecRef(file);
+	}
+
+	fsysFsRUnlock(fsys);
+	fsysPut(fsys);
+
+	return r;
+}
+
+/*
+ * Console command "uname": display or modify an entry in the user table.
+ * With only -d, dump the whole table.  With just a uname, print that
+ * user.  Otherwise each remaining argument is applied in turn:
+ *	%newname	rename the user (refused for mandatory users)
+ *	=leader		set the leader; a bare "=" clears it
+ *	+member		add a member to the group
+ *	-member		remove a member from the group
+ *	uid or :uid	create the user; without the ':' a directory
+ *			is also created with "fsys main create"
+ * After all arguments are processed the table is written out with
+ * usersFileWrite.  Returns 1 on success and 0 on failure.
+ */
+static int
+cmdUname(int argc, char* argv[])
+{
+	User *u, *up;
+	int d, dflag, i, r;
+	char *p, *uid, *uname;
+	char *createfmt = "fsys main create -d /active/usr/%s %s %s 0775";
+	char *usage = "usage: uname uname [uid|:uid|%%newname|=leader|+member|-member]";
+
+	dflag = 0;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'd':
+		dflag = 1;
+		break;
+	}ARGEND
+
+	if(argc < 1){
+		if(dflag){
+			vtRLock(ubox.lock);
+			uboxDump(ubox.box);
+			vtRUnlock(ubox.lock);
+			return 1;
+		}
+		return cliError(usage);
+	}
+
+	uname = argv[0];
+	argc--; argv++;
+
+	if(argc == 0){
+		/* lookup only: a read lock is enough */
+		vtRLock(ubox.lock);
+		if((u = _userByUname(ubox.box, uname)) == nil){
+			vtRUnlock(ubox.lock);
+			return 0;
+		}
+		consPrint("\t%U\n", u);
+		vtRUnlock(ubox.lock);
+		return 1;
+	}
+
+	vtLock(ubox.lock);
+	u = _userByUname(ubox.box, uname);
+	while(argc--){
+		if(argv[0][0] == '%'){
+			if(u == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			p = &argv[0][1];
+			if((up = _userByUname(ubox.box, p)) != nil){
+				vtSetError("uname: uname '%s' already exists",
+					up->uname);
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			for(i = 0; usersMandatory[i] != nil; i++){
+				if(strcmp(usersMandatory[i], uname) != 0)
+					continue;
+				vtSetError("uname: uname '%s' is mandatory",
+					uname);
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+
+			/*
+			 * Rewrite every leader and group reference to the
+			 * old name, adjusting the box length by the
+			 * difference in name lengths each time.
+			 */
+			d = strlen(p) - strlen(u->uname);
+			for(up = ubox.box->head; up != nil; up = up->next){
+				if(up->leader != nil){
+					if(strcmp(up->leader, u->uname) == 0){
+						vtMemFree(up->leader);
+						up->leader = vtStrDup(p);
+						ubox.box->len += d;
+					}
+				}
+				for(i = 0; i < up->ngroup; i++){
+					if(strcmp(up->group[i], u->uname) != 0)
+						continue;
+					vtMemFree(up->group[i]);
+					up->group[i] = vtStrDup(p);
+					ubox.box->len += d;
+					break;
+				}
+			}
+
+			/* take the user out, rename, and put it back */
+			uboxRemUser(ubox.box, u);
+			vtMemFree(u->uname);
+			u->uname = vtStrDup(p);
+			uboxAddUser(ubox.box, u);
+		}
+		else if(argv[0][0] == '='){
+			if(u == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			/* an unknown leader is an error unless the name is empty */
+			if((up = _userByUname(ubox.box, &argv[0][1])) == nil){
+				if(argv[0][1] != '\0'){
+					vtUnlock(ubox.lock);
+					return 0;
+				}
+			}
+			if(u->leader != nil){
+				ubox.box->len -= strlen(u->leader);
+				vtMemFree(u->leader);
+				u->leader = nil;
+			}
+			if(up != nil){
+				u->leader = vtStrDup(up->uname);
+				ubox.box->len += strlen(u->leader);
+			}
+		}
+		else if(argv[0][0] == '+'){
+			if(u == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			if((up = _userByUname(ubox.box, &argv[0][1])) == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			if(!_groupAddMember(ubox.box, u, up->uname)){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+		}
+		else if(argv[0][0] == '-'){
+			if(u == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			if((up = _userByUname(ubox.box, &argv[0][1])) == nil){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+			if(!_groupRemMember(ubox.box, u, up->uname)){
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+		}
+		else{
+			if(u != nil){
+				vtSetError("uname: uname '%s' already exists",
+					u->uname);
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+
+			uid = argv[0];
+			if(*uid == ':')
+				uid++;
+			if((u = _userByUid(ubox.box, uid)) != nil){
+				vtSetError("uname: uid '%s' already exists",
+					u->uid);
+				vtUnlock(ubox.lock);
+				return 0;
+			}
+
+			u = userAlloc(uid, uname);
+			uboxAddUser(ubox.box, u);
+			if(argv[0][0] != ':'){
+				// should have an option for the mode and gid
+				p = smprint(createfmt, uname, uname, uname);
+				r = cliExec(p);
+				vtMemFree(p);
+				/* cliExec returns 0 when the command fails */
+				if(r == 0){
+					vtUnlock(ubox.lock);
+					return 0;
+				}
+			}
+		}
+		argv++;
+	}
+
+	if(usersFileWrite(ubox.box) == 0){
+		vtUnlock(ubox.lock);
+		return 0;
+	}
+	if(dflag)
+		uboxDump(ubox.box);
+	vtUnlock(ubox.lock);
+
+	return 1;
+}
+
+/*
+ * Console command "users": with no file argument, report the name,
+ * user count and length of the current table (after reloading the
+ * built-in default table when -d is given); with a file argument,
+ * load the table from that file.  With -w the table is then written
+ * out and the result of that write is returned.
+ */
+static int
+cmdUsers(int argc, char* argv[])
+{
+	Ubox *cur;
+	int dflag, wflag, wrote;
+	char *usage = "usage: users [-dw] [file]";
+
+	wflag = 0;
+	dflag = 0;
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'd':
+		dflag = 1;
+		break;
+	case 'w':
+		wflag = 1;
+		break;
+	}ARGEND
+
+	if(argc != 0 && argc != 1)
+		return cliError(usage);
+
+	if(argc == 1){
+		/* -d makes no sense together with an explicit file */
+		if(dflag)
+			return cliError(usage);
+		if(usersFileRead(argv[0]) == 0)
+			return 0;
+	}
+	else{
+		if(dflag)
+			uboxInit(nil, usersDefault, sizeof(usersDefault));
+		vtRLock(ubox.lock);
+		cur = ubox.box;
+		if(cur->name == nil)
+			consPrint("\tno file\n");
+		else
+			consPrint("\tfile %s\n", cur->name);
+		consPrint("\tnuser %d len %d\n", cur->nuser, cur->len);
+		vtRUnlock(ubox.lock);
+	}
+
+	if(!wflag)
+		return 1;
+
+	vtRLock(ubox.lock);
+	wrote = usersFileWrite(ubox.box);
+	vtRUnlock(ubox.lock);
+	return wrote;
+}
+
+/*
+ * Set up the user table: install the %U print verb, allocate the
+ * table lock, load the built-in default users and register the
+ * "users" and "uname" console commands.  Always returns 1; the
+ * results of uboxInit and cliAddCmd are not checked.
+ */
+int
+usersInit(void)
+{
+	fmtinstall('U', userFmt);
+
+	/* the lock must exist before uboxInit, which takes it */
+	ubox.lock = vtLockAlloc();
+	uboxInit(nil, usersDefault, sizeof(usersDefault));
+
+	cliAddCmd("users", cmdUsers);
+	cliAddCmd("uname", cmdUname);
+
+	return 1;
+}

+ 112 - 0
sys/src/cmd/fossil/Ccli.c

@@ -0,0 +1,112 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * One registered console command: the name it is invoked by and
+ * the function that runs it with the tokenized command line.
+ */
+typedef struct {
+	char*	argv0;
+	int	(*cmd)(int, char*[]);
+} Cmd;
+
+/*
+ * Table of registered commands, guarded by lock.
+ */
+static struct {
+	VtLock*	lock;
+	Cmd*	cmd;		/* array of ncmd entries */
+	int	ncmd;		/* allocated size of cmd */
+	int	hi;		/* number of entries in use */
+} cbox;
+
+enum {
+	NCmdIncr	= 20,	/* initial size and growth step of cbox.cmd */
+};
+
+/*
+ * Format the arguments according to fmt, record the result as the
+ * current error string with vtSetError, and return 0 so that callers
+ * can write "return cliError(...)".
+ */
+int
+cliError(char* fmt, ...)
+{
+	va_list ap;
+	char *msg;
+
+	va_start(ap, fmt);
+	msg = vsmprint(fmt, ap);
+	va_end(ap);
+
+	vtSetError("%s", msg);
+	free(msg);
+
+	return 0;
+}
+
+/*
+ * Tokenize a copy of buf and run the registered command named by the
+ * first token.  Empty lines and lines whose first token starts with
+ * '#' succeed without doing anything.
+ * Returns the command's result (0 means failure, in which case the
+ * current error string is printed on the console), or 0 if no
+ * command of that name is registered.
+ */
+int
+cliExec(char* buf)
+{
+	int argc, i, r;
+	char *argv[20], *p;
+	int (*cmd)(int, char*[]);
+
+	p = vtStrDup(buf);
+	/* leave room in argv for the terminating nil */
+	if((argc = tokenize(p, argv, nelem(argv)-1)) == 0){
+		vtMemFree(p);
+		return 1;
+	}
+	argv[argc] = 0;
+
+	if(argv[0][0] == '#'){
+		vtMemFree(p);
+		return 1;
+	}
+
+	/*
+	 * Copy the function pointer while the table is locked:
+	 * cliAddCmd may reallocate cbox.cmd once the lock is dropped.
+	 */
+	cmd = nil;
+	vtLock(cbox.lock);
+	for(i = 0; i < cbox.hi; i++){
+		if(strcmp(cbox.cmd[i].argv0, argv[0]) == 0){
+			cmd = cbox.cmd[i].cmd;
+			break;
+		}
+	}
+	vtUnlock(cbox.lock);
+
+	if(cmd == nil){
+		consPrint("%s: - eh?\n", argv[0]);
+		vtMemFree(p);
+		return 0;
+	}
+
+	/* run the command unlocked so it may itself call cliExec */
+	if(!(r = cmd(argc, argv)))
+		consPrint("%s\n", vtGetError());
+	vtMemFree(p);
+
+	return r;
+}
+
+/*
+ * Register cmd under the name argv0.  The name pointer is stored,
+ * not copied.  Returns 0 if a command of that name already exists,
+ * otherwise 1.
+ */
+int
+cliAddCmd(char* argv0, int (*cmd)(int, char*[]))
+{
+	int n;
+	Cmd *slot;
+
+	vtLock(cbox.lock);
+
+	/* refuse a second registration under the same name */
+	for(n = 0; n < cbox.hi; n++){
+		if(strcmp(argv0, cbox.cmd[n].argv0) != 0)
+			continue;
+		vtUnlock(cbox.lock);
+		return 0;
+	}
+
+	/* table full: grow it by NCmdIncr zeroed slots */
+	if(cbox.hi >= cbox.ncmd){
+		cbox.cmd = vtMemRealloc(cbox.cmd,
+				(cbox.ncmd+NCmdIncr)*sizeof(Cmd));
+		memset(&cbox.cmd[cbox.ncmd], 0, NCmdIncr*sizeof(Cmd));
+		cbox.ncmd += NCmdIncr;
+	}
+
+	slot = &cbox.cmd[cbox.hi++];
+	slot->argv0 = argv0;
+	slot->cmd = cmd;
+
+	vtUnlock(cbox.lock);
+
+	return 1;
+}
+
+/*
+ * Create an empty command table with room for NCmdIncr entries
+ * and the lock that guards it.  Always returns 1.
+ */
+int
+cliInit(void)
+{
+	cbox.hi = 0;
+	cbox.ncmd = NCmdIncr;
+	cbox.cmd = vtMemAllocZ(cbox.ncmd*sizeof(Cmd));
+	cbox.lock = vtLockAlloc();
+
+	return 1;
+}

+ 417 - 0
sys/src/cmd/fossil/Ccmd.c

@@ -0,0 +1,417 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * State for the "9p" console command: an internal connection fed
+ * through a pipe, and the tag of the last message sent.
+ */
+static struct {
+	VtLock*	lock;		/* allocated in cmdInit; not taken in the code shown here */
+
+	Con*	con;		/* connection serving confd[1] */
+	int	confd[2];	/* pipe: commands use [0], con uses [1] */
+	ushort	tag;		/* incremented for each message except Tversion */
+} cbox;
+
+/*
+ * Convert s to an unsigned long with strtoul (base chosen by
+ * prefix), treating the literal string "~0" as all bits set.
+ */
+static ulong
+cmd9pStrtoul(char* s)
+{
+	ulong v;
+
+	if(strcmp(s, "~0") != 0)
+		v = strtoul(s, 0, 0);
+	else
+		v = ~0UL;
+	return v;
+}
+
+/*
+ * Convert s to an unsigned vlong with strtoull (base chosen by
+ * prefix), treating the literal string "~0" as all bits set.
+ */
+static uvlong
+cmd9pStrtoull(char* s)
+{
+	uvlong v;
+
+	if(strcmp(s, "~0") != 0)
+		v = strtoull(s, 0, 0);
+	else
+		v = ~0ULL;
+	return v;
+}
+
+/*
+ * "nexttag" pseudo-message: arrange for the next message sent to
+ * carry the tag given in argv[0].  cbox.tag is incremented before
+ * each send, hence the -1.
+ * Returns 0 with the error string set if the tag is missing,
+ * otherwise 1.
+ */
+static int
+cmd9pTag(Fcall*, int argc, char **argv)
+{
+	/* the table entry has argc 0, so cmd9p has not checked the count */
+	if(argc < 1){
+		vtSetError("usage: nexttag tag");
+		return 0;
+	}
+	cbox.tag = strtoul(argv[0], 0, 0)-1;
+
+	return 1;
+}
+
+/*
+ * Fill in a Twstat from "fid name uid gid mode mtime length".
+ * The numeric fields accept "~0" for all bits set.  The encoded
+ * Dir lives in a static buffer that f->stat points into.
+ * Returns 0 with the error string set if the Dir cannot be encoded.
+ */
+static int
+cmd9pTwstat(Fcall* f, int, char **argv)
+{
+	Dir d;
+	static uchar buf[DIRMAX];
+
+	memset(&d, 0, sizeof d);
+	nulldir(&d);
+	d.name = argv[1];
+	d.uid = argv[2];
+	d.gid = argv[3];
+	d.mode = cmd9pStrtoul(argv[4]);
+	d.mtime = cmd9pStrtoul(argv[5]);
+	d.length = cmd9pStrtoull(argv[6]);
+
+	f->fid = strtol(argv[0], 0, 0);
+	f->stat = buf;
+	f->nstat = convD2M(&d, buf, sizeof buf);
+	/*
+	 * convD2M reports a too-small buffer by returning BIT16SZ,
+	 * so that value is a failure too.
+	 */
+	if(f->nstat <= BIT16SZ){
+		vtSetError("Twstat: convD2M failed (internal error)");
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Fill in a Tstat: the single argument is the fid.
+ */
+static int
+cmd9pTstat(Fcall* f, int, char** argv)
+{
+	char *fidarg;
+
+	fidarg = argv[0];
+	f->fid = strtol(fidarg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Tremove: the single argument is the fid.
+ */
+static int
+cmd9pTremove(Fcall* f, int, char** argv)
+{
+	char *fidarg;
+
+	fidarg = argv[0];
+	f->fid = strtol(fidarg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Tclunk: the single argument is the fid.
+ */
+static int
+cmd9pTclunk(Fcall* f, int, char** argv)
+{
+	char *fidarg;
+
+	fidarg = argv[0];
+	f->fid = strtol(fidarg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Twrite from "fid offset data"; the data argument is
+ * sent as is and its string length becomes the count.
+ */
+static int
+cmd9pTwrite(Fcall* f, int, char** argv)
+{
+	char *text;
+
+	text = argv[2];
+
+	f->fid = strtol(argv[0], 0, 0);
+	f->offset = strtoll(argv[1], 0, 0);
+	f->data = text;
+	f->count = strlen(text);
+
+	return 1;
+}
+
+/*
+ * Fill in a Tread from "fid offset count".
+ */
+static int
+cmd9pTread(Fcall* f, int, char** argv)
+{
+	char *fidarg, *offarg, *cntarg;
+
+	fidarg = argv[0];
+	offarg = argv[1];
+	cntarg = argv[2];
+
+	f->fid = strtol(fidarg, 0, 0);
+	f->offset = strtoll(offarg, 0, 0);
+	f->count = strtol(cntarg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Tcreate from "fid name perm mode".
+ * perm is always read as octal; mode takes its base from its prefix.
+ */
+static int
+cmd9pTcreate(Fcall* f, int, char** argv)
+{
+	char *fidarg, *permarg, *modearg;
+
+	fidarg = argv[0];
+	permarg = argv[2];
+	modearg = argv[3];
+
+	f->fid = strtol(fidarg, 0, 0);
+	f->name = argv[1];
+	f->perm = strtol(permarg, 0, 8);
+	f->mode = strtol(modearg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Topen from "fid mode".
+ */
+static int
+cmd9pTopen(Fcall* f, int, char** argv)
+{
+	char *fidarg, *modearg;
+
+	fidarg = argv[0];
+	modearg = argv[1];
+
+	f->fid = strtol(fidarg, 0, 0);
+	f->mode = strtol(modearg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Twalk from "fid newfid [name...]".  The argument count
+ * is checked here because the table entry leaves it unchecked.
+ * Returns 0 with the error string set if fid or newfid is missing
+ * or there are more than MAXWELEM names.
+ */
+static int
+cmd9pTwalk(Fcall* f, int argc, char** argv)
+{
+	int k;
+	char **names;
+
+	if(argc < 2){
+		vtSetError("usage: Twalk tag fid newfid [name...]");
+		return 0;
+	}
+	f->fid = strtol(argv[0], 0, 0);
+	f->newfid = strtol(argv[1], 0, 0);
+	f->nwname = argc-2;
+	if(f->nwname > MAXWELEM){
+		vtSetError("Twalk: too many names");
+		return 0;
+	}
+
+	/* the walk elements are whatever follows fid and newfid */
+	names = argv+2;
+	k = 0;
+	while(k < argc-2){
+		f->wname[k] = names[k];
+		k++;
+	}
+
+	return 1;
+}
+
+/*
+ * Fill in a Tflush: the single argument is the tag to flush.
+ */
+static int
+cmd9pTflush(Fcall* f, int, char** argv)
+{
+	char *tagarg;
+
+	tagarg = argv[0];
+	f->oldtag = strtol(tagarg, 0, 0);
+
+	return 1;
+}
+
+/*
+ * Fill in a Tattach from "fid afid uname aname".
+ * The name strings are referenced, not copied.
+ */
+static int
+cmd9pTattach(Fcall* f, int, char** argv)
+{
+	char *fidarg, *afidarg;
+
+	fidarg = argv[0];
+	afidarg = argv[1];
+
+	f->fid = strtol(fidarg, 0, 0);
+	f->afid = strtol(afidarg, 0, 0);
+	f->uname = argv[2];
+	f->aname = argv[3];
+
+	return 1;
+}
+
+/*
+ * Fill in a Tauth from "afid uname aname".
+ * The name strings are referenced, not copied.
+ */
+static int
+cmd9pTauth(Fcall* f, int, char** argv)
+{
+	char *afidarg;
+
+	afidarg = argv[0];
+
+	f->afid = strtol(afidarg, 0, 0);
+	f->uname = argv[1];
+	f->aname = argv[2];
+
+	return 1;
+}
+
+/*
+ * Fill in a Tversion from "msize version".
+ * Returns 0 with the error string set if the requested msize
+ * exceeds the msize of the internal connection.
+ */
+static int
+cmd9pTversion(Fcall* f, int, char** argv)
+{
+	char *sizearg;
+
+	sizearg = argv[0];
+	f->msize = strtoul(sizearg, 0, 0);
+	if(f->msize <= cbox.con->msize){
+		f->version = argv[1];
+		return 1;
+	}
+
+	vtSetError("msize too big");
+	return 0;
+}
+
+/*
+ * Description of one message the "9p" command can send.
+ */
+typedef struct Cmd9p Cmd9p;
+struct Cmd9p {
+	char*	name;		/* name typed on the command line */
+	int	type;		/* Fcall type placed in the message */
+	int	argc;		/* required argument count; 0 means cmd9p does not check */
+	char*	usage;		/* argument summary for the usage message */
+	int	(*f)(Fcall*, int, char**);	/* fills in the Fcall from the arguments */
+};
+
+/*
+ * Messages understood by the "9p" command; cmd9p searches this table
+ * by name.  Entries with argc 0 (Twalk, nexttag) are not count-checked
+ * by cmd9p.  "nexttag" has type 0 and is used to set the next tag.
+ */
+static Cmd9p cmd9pTmsg[] = {
+	"Tversion", Tversion, 2, "msize version", cmd9pTversion,
+	"Tauth", Tauth, 3, "afid uname aname", cmd9pTauth,
+	"Tflush", Tflush, 1, "oldtag", cmd9pTflush,
+	"Tattach", Tattach, 4, "fid afid uname aname", cmd9pTattach,
+	"Twalk", Twalk, 0, "fid newfid [name...]", cmd9pTwalk,
+	"Topen", Topen, 2, "fid mode", cmd9pTopen,
+	"Tcreate", Tcreate, 4, "fid name perm mode", cmd9pTcreate,
+	"Tread", Tread, 3, "fid offset count", cmd9pTread,
+	"Twrite", Twrite, 3, "fid offset data", cmd9pTwrite,
+	"Tclunk", Tclunk, 1, "fid", cmd9pTclunk,
+	"Tremove", Tremove, 1, "fid", cmd9pTremove,
+	"Tstat", Tstat, 1, "fid", cmd9pTstat,
+	"Twstat", Twstat, 7, "fid name uid gid mode mtime length", cmd9pTwstat,
+	"nexttag", 0, 0, "", cmd9pTag,
+};
+
+/*
+ * Console command "9p": build the named T-message from the command
+ * line, write it to the internal connection, read back one reply
+ * and print both on the console.
+ * Returns 1 on success and 0, with the error string set, on failure.
+ */
+static int
+cmd9p(int argc, char* argv[])
+{
+	int k, len;
+	Fcall reply, req;
+	uchar *mbuf;
+	char *usage;
+	u32int msize;
+	Cmd9p *c;
+
+	usage = "usage: 9p T-message ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc < 1)
+		return cliError(usage);
+
+	/* find the table entry for the message name */
+	c = nil;
+	for(k = 0; k < nelem(cmd9pTmsg); k++){
+		if(strcmp(cmd9pTmsg[k].name, argv[0]) == 0){
+			c = &cmd9pTmsg[k];
+			break;
+		}
+	}
+	if(c == nil)
+		return cliError(usage);
+	argc--;
+	argv++;
+	if(c->argc != 0 && argc != c->argc){
+		vtSetError("usage: %s %s", c->name, c->usage);
+		return 0;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.type = c->type;
+	if(req.type != Tversion)
+		req.tag = ++cbox.tag;
+	else
+		req.tag = NOTAG;
+	msize = cbox.con->msize;
+	if(!c->f(&req, argc, argv))
+		return 0;
+
+	/* one buffer serves for both the request and the reply */
+	mbuf = vtMemAlloc(msize);
+	len = convS2M(&req, mbuf, msize);
+	if(len <= BIT16SZ){
+		vtSetError("%s: convS2M error", c->name);
+		goto Fail;
+	}
+	if(write(cbox.confd[0], mbuf, len) != len){
+		vtSetError("%s: write error: %r", c->name);
+		goto Fail;
+	}
+	consPrint("\t-> %F\n", &req);
+
+	len = read9pmsg(cbox.confd[0], mbuf, msize);
+	if(len <= 0){
+		vtSetError("%s: read error: %r", c->name);
+		goto Fail;
+	}
+	if(convM2S(mbuf, len, &reply) == 0){
+		vtSetError("%s: convM2S error", c->name);
+		goto Fail;
+	}
+	consPrint("\t<- %F\n", &reply);
+
+	vtMemFree(mbuf);
+	return 1;
+
+Fail:
+	vtMemFree(mbuf);
+	return 0;
+}
+
+/*
+ * Console command ".": read the named file and pass each
+ * newline-terminated line to cliExec, stopping at the first line
+ * that fails.  Returns the result of the last cliExec call.
+ * NOTE(review): r starts at 0, so an empty file or one without any
+ * newline returns 0 without setting an error - confirm intent.
+ */
+static int
+cmdDot(int argc, char* argv[])
+{
+	long l;
+	Dir *dir;
+	int fd, r;
+	vlong length;
+	char *f, *p, *s, *usage;
+
+	usage = "usage: . file";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 1)
+		return cliError(usage);
+
+	if((dir = dirstat(argv[0])) == nil)
+		return cliError(". dirstat %s: %r", argv[0]);
+	length = dir->length;
+	free(dir);
+
+	r = 0;
+	if(length != 0){
+		/*
+		 * Read the whole file in.
+		 */
+		if((fd = open(argv[0], OREAD)) < 0)
+			return cliError(". open %s: %r", argv[0]);
+		/* dir has been freed; use the saved length */
+		f = vtMemAlloc(length+1);
+		if((l = read(fd, f, length)) < 0){
+			vtMemFree(f);
+			close(fd);
+			return cliError(". read %s: %r", argv[0]);
+		}
+		close(fd);
+		f[l] = '\0';
+
+		/*
+		 * Call cliExec() for each line.
+		 */
+		for(p = s = f; *p != '\0'; p++){
+			if(*p == '\n'){
+				*p = '\0';
+				if((r = cliExec(s)) == 0)
+					break;
+				s = p+1;
+			}
+		}
+		vtMemFree(f);
+	}
+
+	return r;
+}
+
+/*
+ * Console command "dflag": toggle the global Dflag and print its
+ * new value.  Takes no options or arguments.
+ */
+static int
+cmdDflag(int argc, char* argv[])
+{
+	char *usage;
+
+	usage = "usage: dflag";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	}ARGEND
+	if(argc != 0)
+		return cliError(usage);
+
+	Dflag = Dflag ^ 1;
+	consPrint("dflag %d\n", Dflag);
+
+	return 1;
+}
+
+/*
+ * Console command "echo": print the arguments on the console
+ * separated by single spaces, followed by a newline unless -n
+ * is given.  Always returns 1 once the options are accepted.
+ */
+static int
+cmdEcho(int argc, char* argv[])
+{
+	char *usage;
+	int i, nflag;
+
+	nflag = 0;
+	usage = "usage: echo [-n] ...";
+
+	ARGBEGIN{
+	default:
+		return cliError(usage);
+	case 'n':
+		nflag = 1;
+		break;
+	}ARGEND
+
+	for(i = 0; i < argc; i++){
+		if(i != 0)
+			consPrint(" %s", argv[i]);
+		else{
+			/* never use the user's text as the format string */
+			consPrint("%s", argv[i]);
+		}
+	}
+	if(!nflag)
+		consPrint("\n");
+
+	return 1;
+}
+
+/*
+ * Register the ".", "9p", "dflag" and "echo" console commands and
+ * set up the pipe and internal connection used by "9p".
+ * Returns 0 if the pipe or the connection cannot be created (the
+ * commands stay registered in that case), otherwise 1.
+ */
+int
+cmdInit(void)
+{
+	cbox.lock = vtLockAlloc();
+	cbox.confd[0] = cbox.confd[1] = -1;
+
+	cliAddCmd(".", cmdDot);
+	cliAddCmd("9p", cmd9p);
+	cliAddCmd("dflag", cmdDflag);
+	cliAddCmd("echo", cmdEcho);
+
+	if(pipe(cbox.confd) < 0)
+		return 0;
+	/* the connection serves one end of the pipe; cmd9p talks to the other */
+	if((cbox.con = conAlloc(cbox.confd[1], "internal-command")) == nil){
+		close(cbox.confd[0]);
+		close(cbox.confd[1]);
+		cbox.confd[0] = cbox.confd[1] = -1;
+		return 0;
+		
+	}
+	cbox.con->isconsole = 1;
+
+	return 1;
+}

+ 390 - 0
sys/src/cmd/fossil/Ccons.c

@@ -0,0 +1,390 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * Sizes of the console line buffer and of each I/O queue.
+ */
+enum {
+	Nl	= 256,			/* max. command line length */
+	Nq	= 8*1024,		/* amount of I/O buffered */
+};
+
+/*
+ * Circular byte queue of Nq bytes protected by lock.
+ */
+typedef struct Q {
+	VtLock*	lock;
+	VtRendez* full;		/* woken by consProc after it removes data */
+	VtRendez* empty;	/* woken after data is added */
+
+	char	q[Nq];
+	int	n;		/* byte count: consProc drains it on the input queue; qWrite only ever adds to it */
+	int	r;		/* read index (used by consProc) */
+	int	w;		/* write index */
+} Q;
+
+/*
+ * One open console: its descriptors and the shared queues it
+ * feeds and drains.
+ */
+typedef struct Cons {
+	VtLock*	lock;
+	int	ref;		/* set to 2 in consOpen; each consClose drops one */
+	int	closed;		/* set by consClose; tested by the I/O procs */
+	int	fd;		/* descriptor read by consIProc and written by consOProc */
+	int	srvfd;		/* closed by consClose if not -1 */
+	int	ctlfd;		/* closed by consClose if not -1 */
+	Q*	iq;		/* points to console.iq */
+	Q*	oq;		/* points to console.oq */
+} Cons;
+
+/*
+ * Global console state shared by every Cons.
+ */
+static struct {
+	Q*	iq;		/* input */
+	Q*	oq;		/* output */
+	char	l[Nl];		/* command line assembly */
+	int	nl;		/* current line length */
+
+	char*	prompt;		/* prompt text, set by consPrompt */
+	int	np;		/* length of prompt */
+} console;
+
+/*
+ * Drop one reference to cons and mark it closed.  While other
+ * references remain, wake anything sleeping on the input queue's
+ * full rendezvous and the output queue's empty rendezvous so the
+ * remaining holder can notice the closed flag.  On the last
+ * reference, close the descriptors and free cons.
+ */
+static void
+consClose(Cons* cons)
+{
+	vtLock(cons->lock);
+	cons->closed = 1;
+
+	cons->ref--;
+	if(cons->ref > 0){
+		vtLock(cons->iq->lock);
+		vtWakeup(cons->iq->full);
+		vtUnlock(cons->iq->lock);
+		vtLock(cons->oq->lock);
+		vtWakeup(cons->oq->empty);
+		vtUnlock(cons->oq->lock);
+		vtUnlock(cons->lock);
+		return;
+	}
+
+	if(cons->ctlfd != -1){
+		close(cons->ctlfd);
+		cons->ctlfd = -1;
+	}
+	if(cons->srvfd != -1){
+		/*
+		 * consTTY passes the same descriptor as fd and srvfd;
+		 * leave that one to the close of fd below.
+		 */
+		if(cons->srvfd != cons->fd)
+			close(cons->srvfd);
+		cons->srvfd = -1;
+	}
+	if(cons->fd != -1){
+		close(cons->fd);
+		cons->fd = -1;
+	}
+
+	vtUnlock(cons->lock);
+	vtLockFree(cons->lock);
+	vtMemFree(cons);
+}
+
+/*
+ * Input side of a console: read from cons->fd in chunks of up to
+ * Nq/4 bytes and append them to the input queue, waiting while the
+ * queue lacks room.  Runs until the console is marked closed or a
+ * read fails, then drops its reference with consClose.
+ */
+static void
+consIProc(void* v)
+{
+	Q *q;
+	Cons *cons;
+	int n, w;
+	char buf[Nq/4];
+
+	vtThreadSetName("consI");
+
+	cons = v;
+	q = cons->iq;
+	for(;;){
+		/*
+		 * Can't tell the difference between zero-length read
+		 * and eof, so keep calling read until we get an error.
+		 */
+		if(cons->closed || (n = read(cons->fd, buf, Nq/4)) < 0)
+			break;
+		vtLock(q->lock);
+		/*
+		 * NOTE(review): if the console is closed while waiting,
+		 * the copy below still happens even without room.
+		 */
+		while(Nq - q->n < n && !cons->closed)
+			vtSleep(q->full);
+		/* w is the space up to the end of the buffer; wrap if needed */
+		w = Nq - q->w;
+		if(w < n){
+			memmove(&q->q[q->w], buf, w);
+			memmove(&q->q[0], buf + w, n - w);
+		}
+		else
+			memmove(&q->q[q->w], buf, n);
+		q->w = (q->w + n) % Nq;
+		q->n += n;
+		vtWakeup(q->empty);
+		vtUnlock(q->lock);
+	}
+	consClose(cons);
+}
+
+/*
+ * Output side of a console: copy bytes newly added to the output
+ * queue to cons->fd.  The queue is shared, so nothing is removed
+ * from it; lastn remembers the queue's byte count at the previous
+ * copy and the difference is the amount of new data, taken from
+ * just behind the write index.  Runs until the console is marked
+ * closed or a write fails, then drops its reference with consClose.
+ */
+static void
+consOProc(void* v)
+{
+	Q *q;
+	Cons *cons;
+	char buf[Nq];
+	int lastn, n, r;
+
+	vtThreadSetName("consO");
+
+	cons = v;
+	q = cons->oq;
+	vtLock(q->lock);
+	lastn = 0;
+	for(;;){
+		while(lastn == q->n && !cons->closed)
+			vtSleep(q->empty);
+		/* at most one full buffer can still be present */
+		if((n = q->n - lastn) > Nq)
+			n = Nq;
+		/* the last n bytes written may wrap around the buffer start */
+		if(n > q->w){
+			r = n - q->w;
+			memmove(buf, &q->q[Nq - r], r);
+			memmove(buf+r, &q->q[0], n - r);
+		}
+		else
+			memmove(buf, &q->q[q->w - n], n);
+		lastn = q->n;
+		/* write without holding the queue lock */
+		vtUnlock(q->lock);
+		if(cons->closed || write(cons->fd, buf, n) < 0)
+			break;
+		vtLock(q->lock);
+		/* NOTE(review): presumably passes the wakeup on to other output procs - confirm */
+		vtWakeup(q->empty);
+	}
+	consClose(cons);
+}
+
+/*
+ * Attach a console on fd to the shared queues.  An output thread is
+ * always started.  When ctlfd >= 0 the input loop runs in the
+ * calling thread, so consOpen does not return until that loop ends;
+ * otherwise an input thread is started.
+ * Returns 0 if a thread could not be started, otherwise 1.
+ */
+int
+consOpen(int fd, int srvfd, int ctlfd)
+{
+	Cons *cons;
+
+	cons = vtMemAllocZ(sizeof(Cons));
+	cons->lock = vtLockAlloc();
+	cons->fd = fd;
+	cons->srvfd = srvfd;
+	cons->ctlfd = ctlfd;
+	cons->iq = console.iq;
+	cons->oq = console.oq;
+
+	vtLock(cons->lock);
+	/* one reference each for the input and output sides */
+	cons->ref = 2;
+	cons->closed = 0;
+	if(vtThread(consOProc, cons) < 0){
+		/* no output thread: drop its reference, then ours */
+		cons->ref--;
+		vtUnlock(cons->lock);
+		consClose(cons);
+		return 0;
+	}
+	vtUnlock(cons->lock);
+
+	if(ctlfd >= 0)
+		consIProc(cons);
+	else if(vtThread(consIProc, cons) < 0){
+		consClose(cons);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Append n bytes from p to queue q, wrapping at the end of the
+ * buffer, and wake sleepers on q->empty.  No check is made that
+ * the data fits.  Returns n.
+ */
+static int
+qWrite(Q* q, char* p, int n)
+{
+	int room;
+
+	vtLock(q->lock);
+
+	/* room is the space left before the end of the buffer */
+	room = Nq - q->w;
+	if(n <= room){
+		memmove(&q->q[q->w], p, n);
+		q->w += n;
+	}
+	else{
+		memmove(&q->q[q->w], p, room);
+		memmove(&q->q[0], p + room, n - room);
+		q->w = n - room;
+	}
+	q->n += n;
+
+	vtWakeup(q->empty);
+	vtUnlock(q->lock);
+
+	return n;
+}
+
+/*
+ * Allocate an empty queue together with its lock and the two
+ * rendezvous points tied to that lock.
+ */
+static Q*
+qAlloc(void)
+{
+	Q *nq;
+
+	nq = vtMemAllocZ(sizeof(Q));
+	nq->n = 0;
+	nq->r = 0;
+	nq->w = 0;
+	nq->lock = vtLockAlloc();
+	nq->full = vtRendezAlloc(nq->lock);
+	nq->empty = vtRendezAlloc(nq->lock);
+
+	return nq;
+}
+
+/*
+ * Console command loop: drain the input queue, echo and assemble
+ * characters into console.l, and hand each completed line to
+ * cliExec.  Handles backspace, ^D, ^U (kill line), ^W (erase last
+ * word) and DEL (abandon line).  Never returns.
+ */
+static void
+consProc(void*)
+{
+	Q *q;
+	int argc, i, j, n, r;
+	char *argv[20], buf[Nq], *lp, *wbuf;
+
+	vtThreadSetName("cons");
+
+	q = console.iq;
+	qWrite(console.oq, console.prompt, console.np);
+	vtLock(q->lock);
+	for(;;){
+		/* take everything currently queued, unwrapping if needed */
+		while((n = q->n) == 0)
+			vtSleep(q->empty);
+		r = Nq - q->r;
+		if(r < n){
+			memmove(buf, &q->q[q->r], r);
+			memmove(buf + r, &q->q[0], n - r);
+		}
+		else
+			memmove(buf, &q->q[q->r], n);
+		q->r = (q->r + n) % Nq;
+		q->n -= n;
+		vtWakeup(q->full);
+		vtUnlock(q->lock);
+
+		/*
+		 * In the switch, "continue" goes on to the next input
+		 * character; "break" falls out to execute the line
+		 * assembled so far and print a new prompt.
+		 */
+		for(i = 0; i < n; i++){
+			switch(buf[i]){
+			case '\004':				/* ^D */
+				if(console.nl == 0){
+					qWrite(console.oq, "\n", 1);
+					break;
+				}
+				/*FALLTHROUGH*/
+			default:
+				if(console.nl < Nl-1){
+					qWrite(console.oq, &buf[i], 1);
+					console.l[console.nl++] = buf[i];
+				}
+				continue;
+			case '\b':
+				if(console.nl != 0){
+					qWrite(console.oq, &buf[i], 1);
+					console.nl--;
+				}
+				continue;
+			case '\n':
+				qWrite(console.oq, &buf[i], 1);
+				break;
+			case '\025':				/* ^U */
+				qWrite(console.oq, "^U\n", 3);
+				console.nl = 0;
+				break;
+			case '\027':				/* ^W */
+				console.l[console.nl] = '\0';
+				wbuf = vtMemAlloc(console.nl+1);
+				memmove(wbuf, console.l, console.nl+1);
+				argc = tokenize(wbuf, argv, nelem(argv));
+				if(argc > 0)
+					argc--;
+				console.nl = 0;
+				lp = console.l;
+				/*
+				 * Rebuild the line without its last word.
+				 * Use a separate index so the position in
+				 * buf is kept, and bound the output since
+				 * quoting can lengthen a word.
+				 */
+				for(j = 0; j < argc; j++)
+					lp = seprint(lp, console.l+Nl, "%q ", argv[j]);
+				console.nl = lp - console.l;
+				vtMemFree(wbuf);
+				qWrite(console.oq, "^W\n", 3);
+				if(console.nl == 0)
+					break;
+				qWrite(console.oq, console.l, console.nl);
+				continue;
+			case '\177':
+				qWrite(console.oq, "\n", 1);
+				console.nl = 0;
+				break;
+			}
+
+			console.l[console.nl] = '\0';
+			if(console.nl != 0)
+				cliExec(console.l);
+
+			console.nl = 0;
+			qWrite(console.oq, console.prompt, console.np);
+		}
+
+		vtLock(q->lock);
+	}
+}
+
+/*
+ * Queue len bytes from buf for console output.  Before the output
+ * queue exists the bytes are written straight to file descriptor 2.
+ * Returns the number of bytes queued or written.
+ */
+int
+consWrite(char* buf, int len)
+{
+	/* buf is data, not a format: write it verbatim */
+	if(console.oq == nil)
+		return write(2, buf, len);
+	return qWrite(console.oq, buf, len);
+}
+
+/*
+ * Replace the console prompt with "<prompt>: ", using "prompt" when
+ * the argument is nil.  Returns the length of the new prompt.
+ */
+int
+consPrompt(char* prompt)
+{
+	char text[ERRMAX];
+	char *name;
+
+	name = prompt;
+	if(name == nil)
+		name = "prompt";
+
+	vtMemFree(console.prompt);
+	console.np = snprint(text, sizeof(text), "%s: ", name);
+	console.prompt = vtStrDup(text);
+
+	return console.np;
+}
+
+/*
+ * Open the system console (/dev/cons, falling back to #c/cons),
+ * write "rawon" to its ctl file and attach it with consOpen.
+ * Returns 0 with the error string set if an open or the ctl write
+ * fails, 0 if consOpen fails, otherwise 1.
+ */
+int
+consTTY(void)
+{
+	int cfd, ctlfd;
+	char *ctlname, *dev;
+
+	dev = "/dev/cons";
+	cfd = open(dev, ORDWR);
+	if(cfd < 0){
+		dev = "#c/cons";
+		cfd = open(dev, ORDWR);
+	}
+	if(cfd < 0){
+		vtSetError("consTTY: open %s: %r", dev);
+		return 0;
+	}
+
+	/* the ctl file sits beside the console file */
+	ctlname = smprint("%sctl", dev);
+	ctlfd = open(ctlname, OWRITE);
+	if(ctlfd < 0){
+		close(cfd);
+		vtSetError("consTTY: open %s: %r", ctlname);
+		free(ctlname);
+		return 0;
+	}
+	if(write(ctlfd, "rawon", 5) < 0){
+		close(ctlfd);
+		close(cfd);
+		vtSetError("consTTY: write %s: %r", ctlname);
+		free(ctlname);
+		return 0;
+	}
+	free(ctlname);
+
+	if(consOpen(cfd, cfd, ctlfd) != 0)
+		return 1;
+
+	close(ctlfd);
+	close(cfd);
+	return 0;
+}
+
+/*
+ * Create the shared input and output queues, set the default
+ * prompt and start the command loop thread.
+ * Returns 1 on success; calls vtFatal and returns 0 if the thread
+ * cannot be started.
+ */
+int
+consInit(void)
+{
+	console.nl = 0;
+	console.iq = qAlloc();
+	console.oq = qAlloc();
+
+	consPrompt(nil);
+
+	if(vtThread(consProc, nil) >= 0)
+		return 1;
+
+	vtFatal("can't start console proc");
+	return 0;
+}

+ 41 - 0
sys/src/cmd/fossil/Clog.c

@@ -0,0 +1,41 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+/*
+ * Dummy for now.
+ */
+
+/*
+ * Format into a buffer of ERRMAX bytes and pass the result to
+ * consWrite, returning what consWrite returns.
+ */
+int
+consPrint(char* fmt, ...)
+{
+	va_list ap;
+	char text[ERRMAX];
+	int ntext;
+
+	/*
+	 * To do:
+	 * This will be integrated with logging and become 'print'.
+	 */
+	va_start(ap, fmt);
+	ntext = vsnprint(text, sizeof(text), fmt, ap);
+	va_end(ap);
+
+	return consWrite(text, ntext);
+}
+
+/*
+ * va_list form of consPrint: format into a buffer of ERRMAX bytes
+ * and pass the result to consWrite.
+ */
+int
+consVPrint(char* fmt, va_list args)
+{
+	char text[ERRMAX];
+	int ntext;
+
+	/*
+	 * To do:
+	 * This will be integrated with logging and become
+	 * something else ('vprint'?).
+	 */
+	ntext = vsnprint(text, sizeof(text), fmt, args);
+	return consWrite(text, ntext);
+}

+ 441 - 0
sys/src/cmd/fossil/archive.c

@@ -0,0 +1,441 @@
+/*
+ * Archiver.  In charge of sending blocks to Venti.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+#include "9.h"	/* for consPrint */
+
+#define DEBUG 0
+
+static void archThread(void*);
+
+/*
+ * Archiver state, shared between its creator and the archive thread.
+ */
+struct Arch
+{
+	int ref;		/* set to 2 by archInit; the thread drops one on exit */
+	uint blockSize;		/* from diskBlockSize at init */
+	uint diskSize;		/* not set in the code shown here */
+	Cache *c;
+	Fs *fs;
+	VtSession *z;		/* Venti session blocks are written to */
+
+	VtLock *lk;
+	VtRendez *starve;	/* the thread sleeps here when there is no work */
+	VtRendez *die;		/* allocated by archFree; non-nil tells the thread to exit */
+};
+
+/*
+ * Allocate an archiver for the given cache, disk, file system and
+ * Venti session and start its thread.  The result of vtThread is
+ * not checked.
+ */
+Arch *
+archInit(Cache *c, Disk *disk, Fs *fs, VtSession *z)
+{
+	Arch *arch;
+
+	arch = vtMemAllocZ(sizeof(Arch));
+
+	arch->lk = vtLockAlloc();
+	arch->starve = vtRendezAlloc(arch->lk);
+	arch->blockSize = diskBlockSize(disk);
+	arch->fs = fs;
+	arch->z = z;
+	arch->c = c;
+
+	/* one reference for the caller, one for the thread */
+	arch->ref = 2;
+	vtThread(archThread, arch);
+
+	return arch;
+}
+
+/*
+ * Stop the archive thread and free the archiver.  Allocating
+ * a->die is the signal to exit: the thread checks it after waking
+ * from a->starve, drops its reference and wakes a->die.
+ */
+void
+archFree(Arch *a)
+{
+	/* kill slave */
+	vtLock(a->lk);
+	a->die = vtRendezAlloc(a->lk);
+	vtWakeup(a->starve);
+	/* wait for the thread to give up its reference */
+	while(a->ref > 1)
+		vtSleep(a->die);
+	vtUnlock(a->lk);
+	vtRendezFree(a->starve);
+	vtRendezFree(a->die);
+	vtLockFree(a->lk);
+	vtMemFree(a);
+}
+
+/*
+ * Write the contents of block b, taken from data, to Venti: the
+ * data is zero-truncated, written, checked against the returned
+ * score and the session is synced.
+ * Returns 1 on success, 0 if any step fails.
+ */
+static int
+ventiSend(Arch *a, Block *b, uchar *data)
+{
+	uint len;
+	uchar score[VtScoreSize], check[VtScoreSize];
+
+	if(DEBUG > 1)
+		fprint(2, "ventiSend: sending %#ux %L to venti\n", b->addr, &b->l);
+	len = vtZeroTruncate(vtType[b->l.type], data, a->blockSize);
+	if(DEBUG > 1)
+		fprint(2, "ventiSend: truncate %d to %d\n", a->blockSize, len);
+
+	if(!vtWrite(a->z, score, vtType[b->l.type], data, len)){
+		fprint(2, "ventiSend: vtWrite block %#ux failed: %R\n", b->addr);
+		return 0;
+	}
+
+	/* make sure the score returned matches what was sent */
+	if(!vtSha1Check(score, data, len)){
+		vtSha1(check, data, len);
+		fprint(2, "ventiSend: vtWrite block %#ux failed vtSha1Check %V %V\n",
+			b->addr, score, check);
+		return 0;
+	}
+
+	if(vtSync(a->z))
+		return 1;
+	return 0;
+}
+
+/*
+ * parameters for recursion; there are so many,
+ * and some only change occasionally.  this is 
+ * easier than spelling things out at each call.
+ */
+/* State threaded through the recursive archWalk calls. */
+typedef struct Param Param;
+struct Param
+{
+	/* these never change */
+	uint snapEpoch;	/* epoch for snapshot being archived */
+	uint blockSize;
+	Cache *c;
+	Arch *a;
+
+	/* changes on every call */
+	uint depth;
+
+	/* statistics */
+	uint nfixed;
+	uint nsend;	/* blocks sent with ventiSend */
+	uint nvisit;	/* archWalk calls */
+	uint nfailsend;	/* ventiSend failures */
+	uint maxdepth;	/* largest depth reached */
+	uint nreclaim;
+	uint nfake;	/* blocks sent from a modified copy */
+	uint nreal;	/* blocks sent unmodified and marked BsVenti */
+
+	/* these occasionally change (must save old values and put back) */
+	uint dsize;
+	uint psize;
+
+	/* return value; avoids using stack space */
+	Label l;
+	uchar score[VtScoreSize];
+};
+
+/*
+ * Compute into score the SHA1 of data after zero-truncating it
+ * according to the type of block b.
+ */
+static void
+shaBlock(uchar score[VtScoreSize], Block *b, uchar *data, uint bsize)
+{
+	uint len;
+
+	len = vtZeroTruncate(vtType[b->l.type], data, bsize);
+	vtSha1(score, data, len);
+}
+
+/*
+ * Block type for the tree described by entry e: BtDir or BtData
+ * according to the VtEntryDir flag, plus the entry's depth.
+ */
+static uint
+etype(Entry *e)
+{
+	uint base;
+
+	base = (e->flags&VtEntryDir) ? BtDir : BtData;
+	return base+e->depth;
+}
+
+/*
+ * Return a freshly allocated copy of the first blockSize bytes of
+ * b's data, or nil if the allocation returns nil.
+ */
+static uchar*
+copyBlock(Block *b, u32int blockSize)
+{
+	uchar *dup;
+
+	dup = vtMemAlloc(blockSize);
+	if(dup != nil)
+		memmove(dup, b->data, blockSize);
+	return dup;
+}
+
+/*
+ * Walk over the block tree, archiving it to Venti.
+ *
+ * We don't archive the snapshots. Instead we zero the
+ * entries in a temporary copy and archive that.
+ *
+ * Return value is:
+ *
+ *	ArchFailure	some error occurred
+ *	ArchSuccess	block and all children archived
+ * 	ArchFaked	success, but block or children got copied
+ */
+/* archWalk return values */
+enum
+{
+	ArchFailure,	/* some error occurred */
+	ArchSuccess,	/* block and all children archived */
+	ArchFaked,	/* success, but block or children got copied */
+};
+/*
+ * Archive the local block at addr (expected type and tag given)
+ * and, recursively, the local blocks it points to.  On success the
+ * block's score is left in p->score and its label in p->l.
+ * Returns ArchFailure, ArchSuccess or ArchFaked.
+ */
+static int
+archWalk(Param *p, u32int addr, uchar type, u32int tag)
+{
+	int ret, i, x, psize, dsize;
+	uchar *data, score[VtScoreSize];
+	Block *b;
+	Label l;
+	Entry *e;
+	WalkPtr w;
+
+	p->nvisit++;
+
+	b = cacheLocalData(p->c, addr, type, tag, OReadWrite,0);
+	if(b == nil){
+		fprint(2, "archive(%ud, %#ux): cannot find block: %R\n", p->snapEpoch, addr);
+		if(strcmp(vtGetError(), ELabelMismatch) == 0){
+			/* might as well plod on so we write _something_ to Venti */
+			memmove(p->score, vtZeroScore, VtScoreSize);
+			return ArchFaked;
+		}
+		return ArchFailure;
+	}
+
+	if(DEBUG) fprint(2, "%*sarchive(%ud, %#ux): block label %L\n",
+		p->depth*2, "",  p->snapEpoch, b->addr, &b->l);
+	p->depth++;
+	if(p->depth > p->maxdepth)
+		p->maxdepth = p->depth;
+
+	/*
+	 * data aliases b->data until a private copy is needed;
+	 * data != b->data from then on means this block is being faked.
+	 */
+	data = b->data;
+	if((b->l.state&BsVenti) == 0){
+		initWalk(&w, b, b->l.type==BtDir ? p->dsize : p->psize);
+		for(i=0; nextWalk(&w, score, &type, &tag, &e); i++){
+			if(e){
+				if(!(e->flags&VtEntryActive))
+					continue;
+				/* snapshot entries are zeroed in a copy rather than archived */
+				if(e->snap != 0 && e->archive == 0){
+				//	fprint(2, "snap; faking %#ux\n", b->addr);
+					if(data == b->data){
+						data = copyBlock(b, p->blockSize);
+						if(data == nil){
+							ret = ArchFailure;
+							goto Out;
+						}
+						w.data = data;
+					}
+					memmove(e->score, vtZeroScore, VtScoreSize);
+					e->depth = 0;
+					e->size = 0;
+					e->tag = 0;
+					e->flags &= ~VtEntryLocal;
+					entryPack(e, data, w.n-1);
+					continue;
+				}
+			}
+			/* only pointers to local blocks need walking */
+			addr = globalToLocal(score);
+			if(addr == NilBlock)
+				continue;
+			/* save the sizes; an entry supplies its own for the subtree */
+			dsize = p->dsize;
+			psize = p->psize;
+			if(e){
+				p->dsize= e->dsize;
+				p->psize = e->psize;
+			}
+			/* the block's lock is released across the recursive call */
+			vtUnlock(b->lk);
+			x = archWalk(p, addr, type, tag);
+			vtLock(b->lk);
+			if(e){
+				p->dsize = dsize;
+				p->psize = psize;
+			}
+			/* wait for any I/O on the block to settle */
+			while(b->iostate != BioClean && b->iostate != BioDirty)
+				vtSleep(b->ioready);
+			switch(x){
+			case ArchFailure:
+				fprint(2, "archWalk %#ux failed; ptr is in %#ux offset %d\n",
+					addr, b->addr, i);
+				ret = ArchFailure;
+				goto Out;
+			case ArchFaked:
+if(0) fprint(2, "faked %#ux, faking %#ux (%V)\n", addr, b->addr, p->score);
+				/* a faked child forces this block to be faked too */
+				if(data == b->data){
+					data = copyBlock(b, p->blockSize);
+					if(data == nil){
+						ret = ArchFailure;
+						goto Out;
+					}
+					w.data = data;
+				}
+				/* fall through */
+if(0) fprint(2, "falling\n");
+			case ArchSuccess:
+				/* record the child's score in place of the local pointer */
+				if(e){
+					memmove(e->score, p->score, VtScoreSize);
+					e->flags &= ~VtEntryLocal;
+					entryPack(e, data, w.n-1);
+				}else
+					memmove(data+(w.n-1)*VtScoreSize, p->score, VtScoreSize);
+				if(data == b->data){
+					blockDirty(b);
+					if(!(b->l.state & BsCopied))
+						blockRemoveLink(b, addr, p->l.type, p->l.tag);
+				}
+				break;
+			}
+		}
+
+		if(!ventiSend(p->a, b, data)){
+			p->nfailsend++;
+			ret = ArchFailure;
+			goto Out;
+		}
+		p->nsend++;
+		if(data != b->data)
+			p->nfake++;
+		if(data == b->data){	/* not faking it, so update state */
+			p->nreal++;
+			l = b->l;
+			l.state |= BsVenti;
+			if(!blockSetLabel(b, &l)){
+				ret = ArchFailure;
+				goto Out;
+			}
+		}
+	}
+
+	shaBlock(p->score, b, data, p->blockSize);
+if(0) fprint(2, "ventisend %V %p %p %p\n", p->score, data, b->data, w.data);
+	ret = data!=b->data ? ArchFaked : ArchSuccess;
+	p->l = b->l;
+Out:
+	/* free the private copy, if one was made */
+	if(data != b->data)
+		vtMemFree(data);
+	p->depth--;
+	blockPut(b);
+	return ret;
+}
+
+/*
+ * Archive thread.  Repeatedly: pick up the snapshot recorded in the
+ * super block (promoting super.next to super.current when nothing
+ * is in progress), archive it to Venti with archWalk, write a vac
+ * root pointing at it, and record the root score in super.last.
+ * Sleeps on a->starve when there is no work and exits once a->die
+ * has been set by archFree.  Failures are logged and retried after
+ * a minute.
+ */
+static void
+archThread(void *v)
+{
+	Arch *a = v;
+	Block *b;
+	Param p;
+	Super super;
+	int ret;
+	u32int addr;
+	uchar rbuf[VtRootSize];
+	VtRoot root;
+
+	vtThreadSetName("arch");
+
+	for(;;){
+		/* look for work */
+		vtLock(a->fs->elk);
+		b = superGet(a->c, &super);
+		if(b == nil){
+			vtUnlock(a->fs->elk);
+			fprint(2, "archThread: superGet: %R\n");
+			sleep(60*1000);
+			continue;
+		}
+		addr = super.next;
+		if(addr != NilBlock && super.current == NilBlock){
+			/* nothing in progress: start on the pending snapshot */
+			super.current = addr;
+			super.next = NilBlock;
+			superPack(&super, b->data);
+			blockDirty(b);
+		}else
+			addr = super.current;
+		blockPut(b);
+		vtUnlock(a->fs->elk);
+
+		if(addr == NilBlock){
+			/* wait for work */
+			vtLock(a->lk);
+			vtSleep(a->starve);
+			if(a->die != nil)
+				goto Done;
+			vtUnlock(a->lk);
+			continue;
+		}
+
+sleep(10*1000);	/* window of opportunity to provoke races */
+
+		/* do work */
+		memset(&p, 0, sizeof p);
+		p.blockSize = a->blockSize;
+		p.dsize = 3*VtEntrySize;	/* root has three Entries */
+		p.c = a->c;
+		p.a = a;
+
+		ret = archWalk(&p, addr, BtDir, RootTag);
+		switch(ret){
+		default:
+			abort();
+		case ArchFailure:
+			fprint(2, "archiveBlock %#ux: %R\n", addr);
+			sleep(60*1000);
+			continue;
+		case ArchSuccess:
+		case ArchFaked:
+			break;
+		}
+
+		if(0) fprint(2, "archiveSnapshot %#ux: maxdepth %ud nfixed %ud"
+			" send %ud nfailsend %ud nvisit %ud"
+			" nreclaim %ud nfake %ud nreal %ud\n",
+			addr, p.maxdepth, p.nfixed,
+			p.nsend, p.nfailsend, p.nvisit,
+			p.nreclaim, p.nfake, p.nreal);
+		if(0) fprint(2, "archiveBlock %V (%ud)\n", p.score, p.blockSize);
+
+		/* tie up vac root */
+		memset(&root, 0, sizeof root);
+		root.version = VtRootVersion;
+		strcpy(root.type, "vac");
+		strecpy(root.name, root.name+sizeof root.name, "fossil");
+		memmove(root.score, p.score, VtScoreSize);
+		memmove(root.prev, super.last, VtScoreSize);
+		root.blockSize = a->blockSize;
+		vtRootPack(&root, rbuf);
+		/* p.score is reused to hold the score of the root block */
+		if(!vtWrite(a->z, p.score, VtRootType, rbuf, VtRootSize)
+		|| !vtSha1Check(p.score, rbuf, VtRootSize)){
+			fprint(2, "vtWriteBlock %#ux: %R\n", addr);
+			sleep(60*1000);
+			continue;
+		}
+
+		/* record success */
+		vtLock(a->fs->elk);
+		b = superGet(a->c, &super);
+		if(b == nil){
+			vtUnlock(a->fs->elk);
+			fprint(2, "archThread: superGet: %R\n");
+			sleep(60*1000);
+			continue;
+		}
+		super.current = NilBlock;
+		memmove(super.last, p.score, VtScoreSize);
+		superPack(&super, b->data);
+		blockDirty(b);
+		blockPut(b);
+		vtUnlock(a->fs->elk);
+
+		consPrint("archive vac:%V\n", p.score);
+	}
+
+Done:
+	/* reached with a->lk held: drop our reference and tell archFree */
+	a->ref--;
+	vtWakeup(a->die);
+	vtUnlock(a->lk);
+}
+
+/*
+ * Wake the archive thread so it looks for work again.
+ * A nil archiver only produces a warning.
+ */
+void
+archKick(Arch *a)
+{
+	if(a != nil){
+		vtLock(a->lk);
+		vtWakeup(a->starve);
+		vtUnlock(a->lk);
+		return;
+	}
+	fprint(2, "warning: archKick nil\n");
+}

+ 19 - 0
sys/src/cmd/fossil/build

@@ -0,0 +1,19 @@
+# once that works, this script from /usr/rob/dist/buildnotes
+# should build.  note it cross-builds for a different arch
+# because you can't overwrite running binaries safely.
+
+# settings used by the mk runs below
+NPROC=8
+fileserver=emelie
+
+# stage 1: with objtype=386, install ape and then the
+# vc, vl and va commands
+objtype=386
+cd /sys/src/ape
+mk install # so awk can be cross-compiled (needs to run pcc for maketab)
+cd /sys/src/cmd/vc
+mk install
+cd /sys/src/cmd/vl
+mk install
+cd /sys/src/cmd/va
+mk install
+
+# stage 2: switch objtype to mips and install everything under /sys/src
+mkdir /mips/bin/usb
+objtype=mips
+cd /sys/src
+mk install

+ 40 - 0
sys/src/cmd/fossil/buildsh

@@ -0,0 +1,40 @@
+#!/bin/rc
+
+# Build a private name space rooted at $root and start an
+# interactive rc inside it.
+
+# fork the name space and environment so the binds below
+# stay private to this script, then attach the file server
+rfork en
+9fs ehime
+
+# adapted from /lib/namespace
+
+root = /n/ehime/testplan9
+#root = /n/emelieother/seanq/testplan9
+echo setting up $root
+
+# always run the bind binary by its full /$cputype/bin path
+fn bind{
+	/$cputype/bin/bind $*
+}
+
+# pass terminal through
+bind /mnt/term $root/mnt/term
+# root
+bind  $root /
+bind -b '#/' /
+
+# kernel devices
+bind '#c' /dev
+bind '#d' /fd
+bind -c '#e' /env
+bind '#p' /proc
+bind -c '#s' /srv
+bind -a /mnt/term/dev/ /dev/
+bind /mnt/term/dev/draw /dev/draw
+
+# standard bin
+bind /$cputype/bin /bin
+bind -a /rc/bin /bin
+
+# no ramfs is started here: go to the source tree, set a
+# distinctive prompt, drop any cd function definition
+# ("fn cd" with no body) and start an interactive shell
+cd /sys/src
+prompt=('test-ehime=; ' '	')
+fn cd
+rc -i
+
+

+ 421 - 0
sys/src/cmd/fossil/bwatch.c

@@ -0,0 +1,421 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+/*
+ * Lock watcher.  Check that blocks are always locked going down the
+ * tree: a thread must not lock an ancestor of a block it already holds.
+ *
+ * This is REALLY slow, and it won't work when the blocks aren't
+ * arranged in a tree (e.g., after the first snapshot).  But it's great
+ * for debugging.
+ */
+enum
+{
+	MaxLock = 16,		/* size of WThread.b: most blocks one thread may hold */
+	HashSize = 1009,	/* buckets in each of WMap's hash tables */
+};
+
+/*
+ * Thread-specific watch state.
+ * One WThread is reached through the private slot wp (see getWThread).
+ */
+typedef struct WThread WThread;
+struct WThread
+{
+	Block *b[MaxLock];	/* blocks currently held */
+	uint nb;		/* number of valid entries in b */
+	uint pid;		/* getpid() of the owning thread */
+};
+
+typedef struct WMap WMap;
+typedef struct WEntry WEntry;
+
+/*
+ * One parent->child dependency.  Each entry sits on two doubly
+ * linked hash chains at once: one keyed by the child score and
+ * one keyed by the parent score.
+ */
+struct WEntry
+{
+	uchar c[VtScoreSize];	/* child score */
+	uchar p[VtScoreSize];	/* parent score */
+	int off;		/* index of the child within the parent */
+
+	WEntry *cprev;		/* chain in map.hchild[hash(c)] */
+	WEntry *cnext;
+	WEntry *pprev;		/* chain in map.hparent[hash(p)] */
+	WEntry *pnext;		/* also links free entries in pool */
+};
+
+/*
+ * The dependency map; lk guards both tables.
+ */
+struct WMap
+{
+	VtLock *lk;
+
+	WEntry *hchild[HashSize];	/* entries hashed by child score */
+	WEntry *hparent[HashSize];	/* entries hashed by parent score */
+};
+
+static WMap map;		/* all recorded parent/child dependencies */
+static void **wp;		/* privalloc'ed slot holding this thread's WThread* */
+static uint blockSize;		/* set by bwatchSetBlockSize; used to size block scans */
+static WEntry *pool;		/* free list of WEntry, linked through pnext */
+uint bwatchDisabled;		/* non-zero: bwatchDependency/Lock/Unlock do nothing */
+
+/*
+ * Fold the VtScoreSize bytes of score into a bucket
+ * index in the range [0, HashSize).
+ */
+static uint
+hash(uchar score[VtScoreSize])
+{
+	uchar *p, *ep;
+	uint sum;
+
+	sum = 0;
+	ep = score + VtScoreSize;
+	for(p=score; p<ep; p++)
+		sum = sum*37 + *p;
+	return sum % HashSize;
+}
+
+#include <pool.h>
+/*
+ * Clear e and push it on the front of the free pool.
+ */
+static void
+freeWEntry(WEntry *e)
+{
+	memset(e, 0, sizeof *e);
+	e->pnext = pool;
+	pool = e;
+}
+
+/*
+ * Take a zeroed WEntry off the free pool, refilling the
+ * pool with a batch of 1024 entries when it is empty.
+ */
+static WEntry*
+allocWEntry(void)
+{
+	WEntry *e, *batch;
+	int n;
+
+	if(pool == nil){
+		batch = vtMemAllocZ(1024*sizeof(WEntry));
+		for(n=0; n<1024; n++)
+			freeWEntry(&batch[n]);
+	}
+	e = pool;
+	pool = e->pnext;
+	memset(e, 0, sizeof *e);
+	return e;
+}
+
+/*
+ * remove all dependencies with score as a parent.
+ * each matching entry is unhooked from both its parent
+ * chain and its child chain before being freed.
+ */
+static void
+_bwatchResetParent(uchar *score)
+{
+	WEntry *e, *following;
+	uint ph;
+
+	ph = hash(score);
+	for(e=map.hparent[ph]; e != nil; e=following){
+		following = e->pnext;	/* e is cleared by freeWEntry */
+		if(memcmp(e->p, score, VtScoreSize) != 0)
+			continue;
+
+		/* unhook from the parent chain */
+		if(e->pprev != nil)
+			e->pprev->pnext = e->pnext;
+		else
+			map.hparent[ph] = e->pnext;
+		if(e->pnext != nil)
+			e->pnext->pprev = e->pprev;
+
+		/* unhook from the child chain */
+		if(e->cprev != nil)
+			e->cprev->cnext = e->cnext;
+		else
+			map.hchild[hash(e->c)] = e->cnext;
+		if(e->cnext != nil)
+			e->cnext->cprev = e->cprev;
+
+		freeWEntry(e);
+	}
+}
+/*
+ * remove all dependencies with score as a child.
+ * each matching entry is unhooked from both its parent
+ * chain and its child chain before being freed.
+ */
+static void
+_bwatchResetChild(uchar *score)
+{
+	WEntry *e, *following;
+	uint ch;
+
+	ch = hash(score);
+	for(e=map.hchild[ch]; e != nil; e=following){
+		following = e->cnext;	/* e is cleared by freeWEntry */
+		if(memcmp(e->c, score, VtScoreSize) != 0)
+			continue;
+
+		/* unhook from the parent chain */
+		if(e->pprev != nil)
+			e->pprev->pnext = e->pnext;
+		else
+			map.hparent[hash(e->p)] = e->pnext;
+		if(e->pnext != nil)
+			e->pnext->pprev = e->pprev;
+
+		/* unhook from the child chain */
+		if(e->cprev != nil)
+			e->cprev->cnext = e->cnext;
+		else
+			map.hchild[ch] = e->cnext;
+		if(e->cnext != nil)
+			e->cnext->cprev = e->cprev;
+
+		freeWEntry(e);
+	}
+}
+
+/*
+ * Find the first recorded dependency whose child is c.
+ * On success store the child's index in *off and return
+ * the parent score; return nil (leaving *off alone) if
+ * no dependency is recorded.
+ */
+static uchar*
+parent(uchar c[VtScoreSize], int *off)
+{
+	WEntry *e;
+
+	e = map.hchild[hash(c)];
+	while(e != nil && memcmp(e->c, c, VtScoreSize) != 0)
+		e = e->cnext;
+	if(e == nil)
+		return nil;
+	*off = e->off;
+	return e->p;
+}
+
+/*
+ * Record that the block with score c is child number off of
+ * the block with score p.  The new entry is pushed on the front
+ * of both the parent and the child hash chains.
+ *
+ * Both arguments are scores: exactly VtScoreSize bytes are copied
+ * from each, so the parameters are declared with that bound
+ * (they used to say VtEntrySize, which was misleading).
+ */
+static void
+addChild(uchar p[VtScoreSize], uchar c[VtScoreSize], int off)
+{
+	uint h;
+	WEntry *w;
+
+	w = allocWEntry();	/* zeroed, so pprev and cprev start nil */
+	memmove(w->p, p, VtScoreSize);
+	memmove(w->c, c, VtScoreSize);
+	w->off = off;
+
+	h = hash(p);
+	w->pnext = map.hparent[h];
+	if(w->pnext)
+		w->pnext->pprev = w;
+	map.hparent[h] = w;
+
+	h = hash(c);
+	w->cnext = map.hchild[h];
+	if(w->cnext)
+		w->cnext->cprev = w;
+	map.hchild[h] = w;
+}
+
+/*
+ * Forget every dependency in which score appears, either as
+ * parent or as child.  Takes map.lk for the duration.
+ */
+void
+bwatchReset(uchar score[VtScoreSize])
+{
+	vtLock(map.lk);
+	_bwatchResetParent(score);
+	_bwatchResetChild(score);
+	vtUnlock(map.lk);
+}
+
+/*
+ * Allocate the map lock and the private slot that will hold
+ * the calling thread's WThread pointer (initially nil).
+ */
+void
+bwatchInit(void)
+{
+	map.lk = vtLockAlloc();
+	wp = privalloc();
+	*wp = nil;
+}
+
+/*
+ * Record the block size that bwatchDependency uses to work out
+ * how many entries or scores a block contains.
+ */
+void
+bwatchSetBlockSize(uint bs)
+{
+	blockSize = bs;
+}
+
+/*
+ * Return the WThread stored in the private slot, allocating a
+ * fresh zeroed one when the slot is empty or the stored state
+ * carries a different pid than the caller.
+ */
+static WThread*
+getWThread(void)
+{
+	WThread *t;
+
+	t = *wp;
+	if(t != nil && t->pid == getpid())
+		return t;
+
+	t = vtMemAllocZ(sizeof(WThread));
+	*wp = t;
+	t->pid = getpid();
+	return t;
+}
+
+/*
+ * Derive dependencies from the contents of b.
+ * Any dependencies previously recorded with b as parent are
+ * discarded and rebuilt from b->data.  Does nothing when
+ * bwatchDisabled is set.
+ */
+void
+bwatchDependency(Block *b)
+{
+	int i, epb, ppb;
+	Entry e;
+
+	if(bwatchDisabled)
+		return;
+
+	vtLock(map.lk);
+	_bwatchResetParent(b->score);
+
+	switch(b->l.type){
+	case BtData:
+		/* data blocks have no children */
+		break;
+
+	case BtDir:
+		/* one child per active directory entry */
+		epb = blockSize / VtEntrySize;
+		for(i=0; i<epb; i++){
+			entryUnpack(&e, b->data, i);
+			if(!(e.flags & VtEntryActive))
+				continue;
+			addChild(b->score, e.score, i);
+		}
+		break;
+
+	default:
+		/* any other type: treat the block as an array of scores; every slot is recorded */
+		ppb = blockSize / VtScoreSize;
+		for(i=0; i<ppb; i++)
+			addChild(b->score, b->data+i*VtScoreSize, i);
+		break;
+	}
+	vtUnlock(map.lk);
+}
+
+/*
+ * Count how many times parent() can be followed starting
+ * from s: 0 for a score with no recorded parent, -1 for nil.
+ */
+static int
+depth(uchar *s)
+{
+	int n, ignored;
+
+	for(n=-1; s != nil; n++)
+		s = parent(s, &ignored);
+	return n;
+}
+
+/*
+ * Decide whether a thread that already holds the block with score
+ * xhave may lock the block with score xwant.  Returns 1 on a
+ * conflict, 0 if the lock is acceptable.  Uses the dependency map,
+ * so the caller must hold map.lk.
+ */
+static int
+lockConflicts(uchar xhave[VtScoreSize], uchar xwant[VtScoreSize])
+{
+	uchar *have, *want;
+	int havedepth, wantdepth, havepos, wantpos;
+
+	have = xhave;
+	want = xwant;
+
+	havedepth = depth(have);
+	wantdepth = depth(want);
+
+	/*
+	 * walk one or the other up until they're both
+	 * at the same level.  havepos/wantpos stay -1 unless
+	 * the corresponding side is walked at least once.
+	 */
+	havepos = -1;
+	wantpos = -1;
+	have = xhave;
+	want = xwant;
+	while(wantdepth > havedepth){
+		wantdepth--;
+		want = parent(want, &wantpos);
+	}
+	while(havedepth > wantdepth){
+		havedepth--;
+		have = parent(have, &havepos);
+	}
+
+	/*
+	 * walk them up simultaneously until we reach
+	 * a common ancestor.
+	 */
+	while(have && want && memcmp(have, want, VtScoreSize) != 0){
+		have = parent(have, &havepos);
+		want = parent(want, &wantpos);
+	}
+
+	/*
+	 * not part of same tree.  happens mainly with
+	 * newly allocated blocks.
+	 */
+	if(!have || !want)
+		return 0;
+
+	/*
+	 * never walked want: means we want to lock
+	 * an ancestor of have.  no no.
+	 * (this also covers xwant and xhave being the same score.)
+	 */
+	if(wantpos == -1)
+		return 1;
+
+	/*
+	 * never walked have: means we want to lock a
+	 * child of have.  that's okay.
+	 */
+	if(havepos == -1)
+		return 0;
+
+	/*
+	 * walked both: they're from different places in the tree.
+	 * require that the left one be locked before the right one.
+	 * (this is questionable, but it puts a total order on the block tree).
+	 *
+	 * NOTE(review): the expression below reports a conflict when the
+	 * held block's branch has the smaller index, which reads as the
+	 * opposite of the sentence above -- confirm which is intended.
+	 */
+	return havepos < wantpos;
+}
+
+/*
+ * Stop the calling process by writing "stop" to its own
+ * #p/<pid>/ctl file, so it can be inspected with a debugger.
+ * If the ctl file cannot be opened, complain and carry on
+ * instead of writing to and closing an invalid descriptor.
+ */
+static void
+stop(void)
+{
+	int fd;
+	char buf[32];
+
+	snprint(buf, sizeof buf, "#p/%d/ctl", getpid());
+	fd = open(buf, OWRITE);
+	if(fd < 0){
+		fprint(2, "bwatch: can't open %s: %r\n", buf);
+		return;
+	}
+	write(fd, "stop", 4);
+	close(fd);
+}
+
+/*
+ * Check whether the calling thread can validly lock b.
+ * That is, check that the calling thread doesn't hold
+ * locks for any of b's children.
+ *
+ * Only blocks in PartData are tracked.  Each conflict is reported
+ * on fd 2 and followed by stop(); b is then added to the thread's
+ * held list unless the list is already full.
+ */
+void
+bwatchLock(Block *b)
+{
+	int i;
+	WThread *w;
+
+	if(bwatchDisabled)
+		return;
+
+	if(b->part != PartData)
+		return;
+
+	vtLock(map.lk);
+	w = getWThread();
+	for(i=0; i<w->nb; i++){
+		if(lockConflicts(w->b[i]->score, b->score)){
+			fprint(2, "%d: have block %V; shouldn't lock %V\n",
+				w->pid, w->b[i]->score, b->score);
+			stop();		/* note: called with map.lk held */
+		}
+	}
+	vtUnlock(map.lk);
+	/* w is the caller's own state, so it is updated without map.lk */
+	if(w->nb >= MaxLock){
+		fprint(2, "%d: too many blocks held\n", w->pid);
+		stop();
+	}else
+		w->b[w->nb++] = b;
+}
+
+/*
+ * Note that the calling thread is about to unlock b.
+ * b is dropped from the thread's held list by moving the last
+ * entry into its slot; unlocking a block that is not on the
+ * list is reported and followed by stop().
+ */
+void
+bwatchUnlock(Block *b)
+{
+	int i;
+	WThread *w;
+
+	if(bwatchDisabled || b->part != PartData)
+		return;
+
+	w = getWThread();
+	for(i=0; i<w->nb && w->b[i] != b; i++)
+		;
+	if(i < w->nb){
+		w->b[i] = w->b[--w->nb];
+		return;
+	}
+	fprint(2, "%d: unlock of unlocked block %V\n", w->pid, b->score);
+	stop();
+}
+

+ 2000 - 0
sys/src/cmd/fossil/cache.c

@@ -0,0 +1,2000 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+#include "9.h"	/* for cacheFlush */
+
+typedef struct FreeList FreeList;
+typedef struct BAddr BAddr;
+
+enum {
+	BadHeap = ~0,	/* Block.heap value of a block that is not in the victim heap */
+};
+
+/*
+ * Store data to the memory cache in c->size blocks
+ * with the block zero extended to fill it out.  When writing to
+ * Venti, the block will be zero truncated.  The walker will also check
+ * that the block fits within psize or dsize as the case may be.
+ */
+
+struct Cache
+{
+	VtLock	*lk;			/* guards the hash chains, heap, BList free list and counters below */
+	int 	ref;			/* 1 for the owner plus one per daemon thread */
+	int	mode;			/* mode passed to cacheAlloc; daemons run only for OReadWrite */
+
+	Disk 	*disk;
+	int	size;			/* block size */
+	VtSession *z;			/* Venti session; not closed by cacheFree */
+	u32int	now;			/* ticks for usage timestamps */
+	Block	**heads;		/* hash table for finding address */
+	int	nheap;			/* number of available victims */
+	Block	**heap;			/* heap for locating victims */
+	long	nblocks;		/* number of blocks allocated */
+	Block	*blocks;		/* array of block descriptors */
+	u8int	*mem;			/* memory for all block data & blists */
+
+	BList	*blfree;		/* free BList elements */
+	VtRendez *blrend;		/* woken by blistFree when a BList is returned */
+
+	int 	ndirty;			/* number of dirty blocks in the cache */
+	int 	maxdirty;		/* max number of dirty blocks */
+	u32int	vers;
+
+	long hashSize;			/* number of buckets in heads */
+
+	FreeList *fl;			/* disk block allocation state */
+
+	VtRendez *die;			/* daemon threads should die when != nil */
+
+	VtRendez *flush;		/* woken to start the flush thread working */
+	VtRendez *flushwait;
+	BAddr *baddr;			/* nblocks entries */
+	int bw, br, be;
+	int nflush;
+
+	/* unlink daemon */
+	BList *uhead;
+	BList *utail;
+	VtRendez *unlink;
+};
+
+/*
+ * One element of a block's prior list: identifies a block
+ * (by partition, address and version) that must be written first.
+ */
+struct BList {
+	int part;
+	u32int addr;
+	uchar type;
+	u32int tag;
+	u32int epoch;
+	u32int vers;
+	
+	/* for roll back */
+	int index;			/* -1 indicates not valid */
+	uchar score[VtScoreSize];	/* old score at index, saved by blockDependency */
+
+	BList *next;
+};
+
+/*
+ * Identity of a cached block: partition, address and version.
+ */
+struct BAddr {
+	int part;
+	u32int addr;
+	u32int vers;
+};
+
+/*
+ * Allocation state for the data partition; lk serializes cacheAllocBlock.
+ */
+struct FreeList {
+	VtLock *lk;
+	u32int last;	/* last block allocated */
+	u32int end;	/* end of data partition */
+};
+
+static FreeList *flAlloc(u32int end);
+static void flFree(FreeList *fl);
+
+static Block *cacheBumpBlock(Cache *c);
+static void heapDel(Block*);
+static void heapIns(Block*);
+static void cacheCheck(Cache*);
+static int readLabel(Cache*, Label*, u32int addr);
+static void unlinkThread(void *a);
+static void flushThread(void *a);
+static void flushBody(Cache *c);
+static void unlinkBody(Cache *c);
+static int cacheFlushBlock(Cache *c);
+
+/*
+ * Mapping from local block type to Venti type.
+ * Indexed by the local type; the low values are the data tree
+ * (BtData plus pointer depth) and the rest the directory tree
+ * (BtDir plus pointer depth).
+ */
+int vtType[BtMax] = {
+	VtDataType,		/* BtData | 0  */
+	VtPointerType0,		/* BtData | 1  */
+	VtPointerType1,		/* BtData | 2  */
+	VtPointerType2,		/* BtData | 3  */
+	VtPointerType3,		/* BtData | 4  */
+	VtPointerType4,		/* BtData | 5  */
+	VtPointerType5,		/* BtData | 6  */
+	VtPointerType6,		/* BtData | 7  */
+	VtDirType,		/* BtDir | 0  */
+	VtPointerType0,		/* BtDir | 1  */
+	VtPointerType1,		/* BtDir | 2  */
+	VtPointerType2,		/* BtDir | 3  */
+	VtPointerType3,		/* BtDir | 4  */
+	VtPointerType4,		/* BtDir | 5  */
+	VtPointerType5,		/* BtDir | 6  */
+	VtPointerType6,		/* BtDir | 7  */
+};
+
+/*
+ * Allocate the memory cache. 
+ *
+ * disk is the backing disk, z the Venti session, nblocks the
+ * number of block descriptors to create and mode the access mode.
+ * When mode is OReadWrite the unlink and flush daemon threads are
+ * started, each holding a reference on the cache.
+ */
+Cache *
+cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
+{
+	int i;
+	Cache *c;
+	Block *b;
+	BList *bl;
+	u8int *p;
+	int nbl;
+
+	c = vtMemAllocZ(sizeof(Cache));
+
+	/* reasonable number of BList elements */
+	nbl = nblocks * 4;
+	
+	c->lk = vtLockAlloc();
+	c->ref = 1;
+	c->disk = disk;
+	c->z = z;
+	c->size = diskBlockSize(disk);
+	/* the lock watcher gets the disk's block size, before rounding */
+bwatchSetBlockSize(c->size);
+	/* round c->size up to be a nice multiple */
+	c->size = (c->size + 127) & ~127;
+	c->nblocks = nblocks;
+	c->hashSize = nblocks;
+	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
+	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
+	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
+	/* one allocation: nblocks data buffers followed by nbl BLists */
+	c->mem = vtMemAllocZ(nblocks * c->size + nbl * sizeof(BList));
+	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
+	c->mode = mode;
+	c->vers++;
+	p = c->mem;
+	/* every block starts out unused and in the victim heap */
+	for(i = 0; i < nblocks; i++){
+		b = &c->blocks[i];
+		b->lk = vtLockAlloc();
+		b->c = c;
+		b->data = p;
+		b->heap = i;
+		b->ioready = vtRendezAlloc(b->lk);
+		c->heap[i] = b;
+		p += c->size;
+	}
+	c->nheap = nblocks;
+
+	/* thread the BLists that follow the block data onto the free list */
+	for(i=0; i<nbl; i++){
+		bl = (BList*)p;
+		bl->next = c->blfree;
+		c->blfree = bl;
+		p += sizeof(BList);
+	}
+	c->blrend = vtRendezAlloc(c->lk);
+
+	c->maxdirty = nblocks*(DirtyPercentage*0.01);
+
+	c->fl = flAlloc(diskSize(disk, PartData));
+	
+	c->unlink = vtRendezAlloc(c->lk);
+	c->flush = vtRendezAlloc(c->lk);
+	c->flushwait = vtRendezAlloc(c->lk);
+
+	if(mode == OReadWrite){
+		/* one reference for each daemon thread */
+		c->ref += 2;
+		vtThread(unlinkThread, c);
+		vtThread(flushThread, c);
+	}
+	cacheCheck(c);
+
+	return c;
+}
+
+/*
+ * Free the whole memory cache, flushing all dirty blocks to the disk.
+ *
+ * The daemon threads are told to exit by setting c->die and are
+ * waited for; then unlink work and dirty blocks are drained until
+ * none remain.  Every block must be unreferenced by then.
+ * The disk is freed; the Venti session is left open.
+ */
+void
+cacheFree(Cache *c)
+{
+	int i;
+
+	/* kill off daemon threads */
+	vtLock(c->lk);
+	c->die = vtRendezAlloc(c->lk);
+	vtWakeup(c->flush);
+	vtWakeup(c->unlink);
+	while(c->ref > 1)
+		vtSleep(c->die);
+
+	/* flush everything out */
+	do {
+		unlinkBody(c);
+		vtUnlock(c->lk);
+		while(cacheFlushBlock(c))
+			;
+		diskFlush(c->disk);
+		vtLock(c->lk);
+	} while(c->uhead || c->ndirty);
+	vtUnlock(c->lk);
+
+	cacheCheck(c);
+
+	for(i = 0; i < c->nblocks; i++){
+		assert(c->blocks[i].ref == 0);
+		vtRendezFree(c->blocks[i].ioready);
+		vtLockFree(c->blocks[i].lk);
+	}
+	flFree(c->fl);
+	vtMemFree(c->baddr);
+	vtMemFree(c->heads);
+	vtMemFree(c->heap);	/* allocated in cacheAlloc; used to be leaked */
+	vtMemFree(c->blocks);
+	vtMemFree(c->mem);
+	vtLockFree(c->lk);
+	diskFree(c->disk);
+	vtRendezFree(c->blrend);
+	/* the remaining rendezvous used to be leaked as well */
+	vtRendezFree(c->unlink);
+	vtRendezFree(c->flush);
+	vtRendezFree(c->flushwait);
+	vtRendezFree(c->die);
+	/* don't close vtSession */
+	vtMemFree(c);
+}
+
+/*
+ * Print one line per block descriptor on fd 2: partition,
+ * address, score, type, reference count and both states.
+ */
+static void
+cacheDump(Cache *c)
+{
+	Block *b, *eb;
+
+	eb = c->blocks + c->nblocks;
+	for(b = c->blocks; b < eb; b++)
+		fprint(2, "p=%d a=%ud %V t=%d ref=%d state=%s io=%s\n",
+			b->part, b->addr, b->score, b->l.type, b->ref,
+			bsStr(b->l.state), bioStr(b->iostate));
+}
+
+/*
+ * Consistency check of the cache: the heap index stored in each
+ * heap member, the heap ordering, the data pointer of each block,
+ * and that every block is either in the heap or referenced.
+ * Violations are fatal.  Also prints every block that is still
+ * referenced.
+ */
+static void
+cacheCheck(Cache *c)
+{
+	u32int size, now;
+	int i, k, refed;
+	static uchar zero[VtScoreSize];
+	Block *b;
+
+	size = c->size;
+	now = c->now;
+
+	/* ages are compared as used-now, in u32int arithmetic */
+	for(i = 0; i < c->nheap; i++){
+		if(c->heap[i]->heap != i)
+			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
+		/* parent must not be younger than this node */
+		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
+			vtFatal("bad heap ordering");
+		/* nor this node younger than either child */
+		k = (i << 1) + 1;
+		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
+			vtFatal("bad heap ordering");
+		k++;
+		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
+			vtFatal("bad heap ordering");
+	}
+
+	/* count referenced blocks that are outside the heap */
+	refed = 0;
+	for(i = 0; i < c->nblocks; i++){
+		b = &c->blocks[i];
+		if(b->data != &c->mem[i * size])
+			vtFatal("mis-blocked at %d", i);
+		if(b->ref && b->heap == BadHeap){
+			refed++;
+		}
+	}
+	/* dump the whole cache before the assertion below fires */
+if(c->nheap + refed != c->nblocks){
+fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
+cacheDump(c);
+}
+	assert(c->nheap + refed == c->nblocks);
+	/* report blocks that are still referenced */
+	refed = 0;
+	for(i = 0; i < c->nblocks; i++){
+		b = &c->blocks[i];
+		if(b->ref){
+if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
+			refed++;
+		}
+	}
+if(refed > 0)fprint(2, "cacheCheck: in used %d\n", refed);
+}
+
+
+/*
+ * Evict a victim: take the block at the top of the heap, unchain
+ * it from its hash chain and return it reset to an empty state
+ * with one reference.  Fatal if the heap is empty.
+ */
+/* called with c->lk held */
+static Block *
+cacheBumpBlock(Cache *c)
+{
+	Block *b;
+
+	/*
+	 * locate the block with the oldest second to last use.
+	 * remove it from the heap, and fix up the heap.
+	 */
+	if(c->nheap == 0)
+		vtFatal("cacheBumpBlock: no free blocks in cache");
+	b = c->heap[0];
+	heapDel(b);
+
+	/* a victim must be idle: unreferenced, not dirty, no pending constraints */
+	assert(b->heap == BadHeap);
+	assert(b->ref == 0);
+	assert(b->iostate == BioEmpty || b->iostate == BioLabel || b->iostate == BioClean);
+	assert(b->prior == nil);
+	assert(b->uhead == nil);
+
+	/*
+	 * unchain the block from hash chain
+	 * (b->prev points at whatever pointer refers to b)
+	 */
+	if(b->prev){
+		*(b->prev) = b->next;
+		if(b->next)
+			b->next->prev = b->prev;
+		b->prev = nil;
+	}
+
+ 	
+if(0)fprint(2, "droping %d:%x:%V\n", b->part, b->addr, b->score);
+	/* set block to a reasonable state */
+	b->ref = 1;
+	b->part = PartError;
+	memset(&b->l, 0, sizeof(b->l));
+	b->iostate = BioEmpty;
+
+	return b;
+}
+
+/*
+ * look for a particular version of the block in the memory cache.
+ *
+ * Returns the block locked and referenced, or nil if it is not
+ * cached, has a different version, or is in an error state
+ * (in which case the error is set to EIO).
+ *
+ * If waitlock is zero the block lock is only tried, not waited
+ * for; when the try fails nil is returned and, if lockfailure is
+ * non-nil, *lockfailure is set to 1.  lockfailure may be nil.
+ */
+static Block *
+_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
+	int waitlock, int *lockfailure)
+{
+	Block *b;
+	ulong h;
+
+	h = addr % c->hashSize;
+
+	if(lockfailure)
+		*lockfailure = 0;
+
+	/*
+	 * look for the block in the cache
+	 */
+	vtLock(c->lk);
+	for(b = c->heads[h]; b != nil; b = b->next){
+		if(b->part == part && b->addr == addr)
+			break;
+	}
+	if(b == nil || b->vers != vers){
+		vtUnlock(c->lk);
+		return nil;
+	}
+	if(!waitlock && !vtCanLock(b->lk)){
+		/* lockfailure is optional: don't write through nil */
+		if(lockfailure)
+			*lockfailure = 1;
+		vtUnlock(c->lk);
+		return nil;
+	}
+	heapDel(b);
+	b->ref++;
+	vtUnlock(c->lk);
+
+	bwatchLock(b);
+	if(waitlock)
+		vtLock(b->lk);
+	b->nlock = 1;
+
+	for(;;){
+		switch(b->iostate){
+		default:
+			abort();
+		case BioEmpty:
+		case BioLabel:
+		case BioClean:	
+		case BioDirty:
+			/* recheck: the version was read before b->lk was held */
+			if(b->vers != vers){
+				blockPut(b);
+				return nil;
+			}
+			return b;
+		case BioReading:
+		case BioWriting:
+			/* i/o in progress: wait and look again */
+			vtSleep(b->ioready);
+			break;
+		case BioVentiError:
+		case BioReadError:
+			blockSetIOState(b, BioEmpty);
+			blockPut(b);
+			vtSetError(EIO);
+			return nil;
+		}
+	}
+	/* NOT REACHED */
+}
+/*
+ * Blocking form of _cacheLocalLookup: always waits for the
+ * block lock and does not report lock failures.
+ */
+static Block*
+cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
+{
+	Block *b;
+
+	b = _cacheLocalLookup(c, part, addr, vers, 1, nil);
+	return b;
+}
+
+
+/*
+ * fetch a local (on-disk) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ *
+ * part must not be PartVenti.  If epoch is non-zero the block's
+ * label epoch must match it, otherwise the error is set to
+ * ELabelMismatch and nil is returned.  The block is returned
+ * locked; nil is returned on error.
+ */
+Block *
+_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
+{
+	Block *b;
+	ulong h;
+
+	assert(part != PartVenti);
+
+	h = addr % c->hashSize;
+
+	/*
+	 * look for the block in the cache
+	 */
+	vtLock(c->lk);
+	for(b = c->heads[h]; b != nil; b = b->next){
+		if(b->part != part || b->addr != addr)
+			continue;
+		if(epoch && b->l.epoch != epoch){
+fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
+			vtUnlock(c->lk);
+			vtSetError(ELabelMismatch);
+			return nil;
+		}
+		heapDel(b);
+		b->ref++;
+		break;
+	}
+			
+	if(b == nil){
+		/* not cached: recycle a victim and give it this identity */
+		b = cacheBumpBlock(c);
+
+		b->part = part;
+		b->addr = addr;
+		localToGlobal(addr, b->score);
+
+		/* chain onto correct hash */
+		b->next = c->heads[h];
+		c->heads[h] = b;
+		if(b->next != nil)
+			b->next->prev = &b->next;
+		b->prev = &c->heads[h];
+	}
+
+	vtUnlock(c->lk);
+
+	/*
+	 * BUG: what if the epoch changes right here?
+	 * In the worst case, we could end up in some weird
+	 * lock loop, because the block we want no longer exists,
+	 * and instead we're trying to lock a block we have no 
+	 * business grabbing.
+	 *
+	 * For now, I'm not going to worry about it.
+	 */
+
+if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
+	bwatchLock(b);
+	vtLock(b->lk);
+	b->nlock = 1;
+
+	/* data blocks need their label loaded before anything else */
+	if(part == PartData && b->iostate == BioEmpty){
+		if(!readLabel(c, &b->l, addr)){
+			blockPut(b);
+			return nil;
+		}
+		blockSetIOState(b, BioLabel);
+	}
+	/* check the epoch again now that the label is known to be loaded */
+	if(epoch && b->l.epoch != epoch){
+		blockPut(b);
+fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
+		vtSetError(ELabelMismatch);
+		return nil;
+	}
+
+	b->pc = getcallerpc(&c);
+	for(;;){
+		switch(b->iostate){
+		default:
+			abort();
+		case BioEmpty:
+		case BioLabel:
+			/* OOverWrite skips the disk read -- presumably the caller replaces the contents */
+			if(mode == OOverWrite){
+				blockSetIOState(b, BioClean);
+				return b;
+			}
+			diskRead(c->disk, b);
+			vtSleep(b->ioready);
+			break;
+		case BioClean:
+		case BioDirty:
+			return b;
+		case BioReading:
+		case BioWriting:
+			/* i/o in progress: wait and look again */
+			vtSleep(b->ioready);
+			break;
+		case BioReadError:
+			blockSetIOState(b, BioEmpty);
+			blockPut(b);
+			vtSetError(EIO);
+			return nil;
+		}
+	}
+	/* NOT REACHED */
+}
+
+/*
+ * _cacheLocal without an epoch check (epoch 0).
+ */
+Block *
+cacheLocal(Cache *c, int part, u32int addr, int mode)
+{
+	return _cacheLocal(c, part, addr, mode, 0);
+}
+
+/*
+ * fetch a local (on-disk) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ * check tag and type.
+ *
+ * On a type or tag mismatch the block is released, the error is
+ * set to ELabelMismatch and nil is returned.  (A leftover debugging
+ * abort() used to make that error return unreachable.)
+ */
+Block *
+cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
+{
+	Block *b;
+
+	b = _cacheLocal(c, PartData, addr, mode, epoch);
+	if(b == nil)
+		return nil;
+	if(b->l.type != type || b->l.tag != tag){
+		/* addr is unsigned: print it with %ud */
+		fprint(2, "cacheLocalData: addr=%ud type got %d exp %d: tag got %x exp %x\n",
+			addr, b->l.type, type, b->l.tag, tag);
+		vtSetError(ELabelMismatch);
+		blockPut(b);
+		return nil;
+	}
+	b->pc = getcallerpc(&c);
+	return b;
+}
+
+/*
+ * fetch a global (Venti) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ * check tag and type if it's really a local block in disguise.
+ *
+ * Returns the block locked, or nil on error.  A block left in an
+ * error state by an earlier failed read is reset to BioEmpty, so
+ * that a later call tries the read again, and EIO is reported.
+ */
+Block *
+cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
+{
+	int n;
+	Block *b;
+	ulong h;
+	u32int addr;
+
+	/* a score that encodes a local address is handled as a local block */
+	addr = globalToLocal(score);
+	if(addr != NilBlock){
+		b = cacheLocalData(c, addr, type, tag, mode, 0);
+		if(b)
+			b->pc = getcallerpc(&c);
+		return b;
+	}
+
+	/* widen the top byte before shifting so it never lands in the sign bit of an int */
+	h = (score[0]|(score[1]<<8)|(score[2]<<16)|((u32int)score[3]<<24)) % c->hashSize;
+
+	/*
+	 * look for the block in the cache
+	 */
+	vtLock(c->lk);
+	for(b = c->heads[h]; b != nil; b = b->next){
+		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
+			continue;
+		heapDel(b);
+		b->ref++;
+		break;
+	}
+
+	if(b == nil){
+if(0)fprint(2, "cacheGlobal %V %d\n", score, type);
+
+		b = cacheBumpBlock(c);
+
+		b->part = PartVenti;
+		b->addr = NilBlock;
+		b->l.type = type;
+		memmove(b->score, score, VtScoreSize);
+
+		/* chain onto correct hash */
+		b->next = c->heads[h];
+		c->heads[h] = b;
+		if(b->next != nil)
+			b->next->prev = &b->next;
+		b->prev = &c->heads[h];
+	}
+	vtUnlock(c->lk);
+
+	bwatchLock(b);
+	vtLock(b->lk);
+	b->nlock = 1;
+	b->pc = getcallerpc(&c);
+
+	switch(b->iostate){
+	default:
+		abort();
+	case BioEmpty:
+		n = vtRead(c->z, score, vtType[type], b->data, c->size);
+		if(n < 0 || !vtSha1Check(score, b->data, n)){
+			blockSetIOState(b, BioVentiError);
+			blockPut(b);
+			return nil;
+		}
+		vtZeroExtend(vtType[type], b->data, n, c->size);
+		blockSetIOState(b, BioClean);
+		return b;	
+	case BioClean:
+		return b;
+	case BioVentiError:
+	case BioReadError:
+		/*
+		 * Reset the state while we still hold the block;
+		 * blockPut unlocks it, so it must come afterwards
+		 * (same order as _cacheLocal and _cacheLocalLookup).
+		 */
+		blockSetIOState(b, BioEmpty);
+		blockPut(b);
+		vtSetError(EIO);
+		return nil;
+	}
+	/* NOT REACHED */
+}
+
+/*
+ * allocate a new on-disk block and load it into the memory cache.
+ * BUG: if the disk is full, should we flush some of it to Venti?
+ *
+ * Scans the label partition for a block that is free, or closed
+ * with epochClose <= epochLow, and relabels it as an allocated
+ * block with the given type, tag and epoch.  Returns the zeroed
+ * block locked, or nil on error (including "disk is full").
+ * fl->lk is held for the whole search and released on every
+ * return path.
+ */
+static u32int lastAlloc;
+
+Block *
+cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
+{
+	FreeList *fl;
+	u32int addr;
+	Block *b;
+	int n, nwrap;
+	Label lab;
+
+	n = c->size / LabelSize;	/* labels per label block */
+	fl = c->fl;
+
+	vtLock(fl->lk);
+	/* NOTE(review): this makes every search start at block 0; looks like debugging residue -- confirm */
+fl->last = 0;
+	addr = fl->last;
+	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
+	if(b == nil){
+		fprint(2, "cacheAllocBlock: xxx %R\n");
+		vtUnlock(fl->lk);
+		return nil;
+	}
+	nwrap = 0;
+	for(;;){
+		if(++addr >= fl->end){
+			addr = 0;
+			fprint(2, "cacheAllocBlock wrap %d\n", fl->end);
+			/* second wrap: the whole partition has been scanned */
+			if(++nwrap >= 2){
+				blockPut(b);
+				fl->last = 0;
+				vtSetError("disk is full");
+				fprint(2, "cacheAllocBlock: xxx1 %R\n");
+				vtUnlock(fl->lk);
+				return nil;
+			}
+		}
+		/* crossed into the next label block: swap it in */
+		if(addr%n == 0){
+			blockPut(b);
+			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
+			if(b == nil){
+				fl->last = addr;
+				fprint(2, "cacheAllocBlock: xxx2 %R\n");
+				vtUnlock(fl->lk);
+				return nil;
+			}
+		}
+		if(!labelUnpack(&lab, b->data, addr%n))
+			continue;
+		if(lab.state == BsFree)
+			goto Found;
+		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
+			goto Found;
+	}
+Found:
+	blockPut(b);
+	b = cacheLocal(c, PartData, addr, OOverWrite);
+	if(b == nil){
+		fprint(2, "cacheAllocBlock: xxx3 %R\n");
+		/* this path used to return with fl->lk still held */
+		vtUnlock(fl->lk);
+		return nil;
+	}
+assert(b->iostate == BioLabel || b->iostate == BioClean);
+	fl->last = addr;
+	lab.type = type;
+	lab.tag = tag;
+	lab.state = BsAlloc;
+	lab.epoch = epoch;
+	lab.epochClose = ~(u32int)0;
+	if(!blockSetLabel(b, &lab)){
+		fprint(2, "cacheAllocBlock: xxx4 %R\n");
+		blockPut(b);
+		/* this path used to return with fl->lk still held */
+		vtUnlock(fl->lk);
+		return nil;
+	}
+	vtZeroExtend(vtType[type], b->data, 0, c->size);
+if(0)diskWrite(c->disk, b);
+
+if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
+	lastAlloc = addr;
+	vtUnlock(fl->lk);
+	b->pc = getcallerpc(&c);
+	return b;
+}
+
+/*
+ * Create the allocation state for a data partition of
+ * end blocks; the search position starts out at end.
+ */
+static FreeList *
+flAlloc(u32int end)
+{
+	FreeList *list;
+
+	list = vtMemAllocZ(sizeof(FreeList));
+	list->end = end;
+	list->last = end;
+	list->lk = vtLockAlloc();
+	return list;
+}
+
+/*
+ * Release a FreeList and its lock.
+ */
+static void
+flFree(FreeList *fl)
+{
+	vtLockFree(fl->lk);
+	vtMemFree(fl);
+}
+
+/*
+ * Size of partition part on the cache's disk, as reported by diskSize.
+ */
+u32int
+cacheLocalSize(Cache *c, int part)
+{
+	return diskSize(c->disk, part);
+}
+
+/*
+ * Copy on write.  Copied blocks have to be marked as copied
+ * (the BsCopied bit in the label state).
+ * See the big comment near blockRemoveLink.
+ *
+ * Allocates a new block with tag and epoch ehi, copies b's data
+ * into it and returns it dirty.  b is always released (blockPut),
+ * on success and on failure; nil is returned on failure.
+ */
+Block*
+blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
+{
+	Block *bb, *lb;
+	Label l;
+
+	assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi);
+	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
+	if(bb == nil){
+		blockPut(b);
+		return nil;
+	}
+
+	/*
+	 * Change label on b to mark that we've copied it.
+	 * This has to come after cacheAllocBlock, since we
+	 * can't hold any labels blocks (lb) while we try to
+	 * fetch others (in cacheAllocBlock).
+	 */
+	if(!(b->l.state&BsCopied) && b->part==PartData){
+		l = b->l;
+		l.state |= BsCopied;
+		lb = _blockSetLabel(b, &l);
+		if(lb == nil){
+			/* can't set label => can't copy block */
+			blockPut(b);
+			/* hand the freshly allocated block back by labelling it free */
+			l.type = BtMax;
+			l.state = BsFree;
+			l.epoch = 0;
+			l.epochClose = 0;
+			l.tag = 0;
+			/* ignore error: block gets lost on error */
+			blockSetLabel(bb, &l);
+			blockPut(bb);
+			return nil;
+		}
+		/* the label block lb must be written before the copy bb */
+		blockDependency(bb, lb, -1, nil);
+		blockPut(lb);
+	}
+
+	if(0){
+		if(b->addr != NilBlock)
+			fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n",
+				b->addr, b->l.epoch, bb->addr, bb->l.epoch);
+		else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0)
+			fprint(2, "blockCopy %V => %#ux/%ud\n",
+				b->score, bb->addr, bb->l.epoch);
+	}
+
+	memmove(bb->data, b->data, b->c->size);
+	blockDirty(bb);
+	blockPut(b);
+	return bb;
+}
+
+/*
+ * The thread that has locked b may refer to it by
+ * multiple names.  Nlock counts the number of
+ * references the locking thread holds.  It will call
+ * blockPut once per reference.
+ *
+ * b must already be locked by the caller (nlock > 0).
+ */
+void
+blockDupLock(Block *b)
+{
+	assert(b->nlock > 0);
+	b->nlock++;
+}
+
+/*
+ * we're done with the block.
+ * unlock it.  can't use it after calling this.
+ *
+ * A nil b is ignored.  Only the last of the locking thread's
+ * aliases (see blockDupLock) actually unlocks the block, and only
+ * the last reference returns it to the victim heap; dirty blocks
+ * stay out of the heap.
+ */
+void
+blockPut(Block* b)
+{
+	Cache *c;
+
+	if(b == nil)
+		return;
+
+	/* set c first so the debugging print below never reads it uninitialized */
+	c = b->c;
+
+if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));
+
+	if(b->iostate == BioDirty)
+		bwatchDependency(b);
+
+	if(--b->nlock > 0)
+		return;
+
+	/*
+	 * b->nlock should probably stay at zero while
+	 * the block is unlocked, but diskThread and vtSleep
+	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
+	 * so we have to keep b->nlock set to 1 even 
+	 * when the block is unlocked.
+	 */
+	assert(b->nlock == 0);
+	b->nlock = 1;
+//	b->pc = 0;
+
+	bwatchUnlock(b);
+	vtUnlock(b->lk);
+	vtLock(c->lk);
+
+	if(--b->ref > 0){
+		vtUnlock(c->lk);
+		return;
+	}
+
+	assert(b->ref == 0);
+	switch(b->iostate){
+	default:
+		/* useful contents: stamp with the current tick */
+		b->used = c->now++;
+		heapIns(b);
+		break;
+	case BioEmpty:
+	case BioLabel:
+		/* no data loaded: reuse the top-of-heap timestamp rather than a fresh tick */
+		if(c->nheap == 0)
+			b->used = c->now++;
+		else
+			b->used = c->heap[0]->used;
+		heapIns(b);
+		break;
+	case BioDirty:
+		/* dirty blocks are not eligible victims */
+		break;
+	}
+	vtUnlock(c->lk);
+}
+
+/*
+ * we're deleting a block; delete all the blocks it points to
+ * that are still active (i.e., not needed by snapshots).
+ *
+ * b is marked clean and its lock-watcher dependencies are
+ * forgotten.  For pointer blocks, every child that is open and
+ * belongs to epoch is cleaned up recursively and labelled free.
+ */
+static void
+blockCleanup(Block *b, u32int epoch)
+{
+	Cache *c;
+	Block *bb;
+	int i, n;
+	Label l;
+	u32int a;
+	int type;
+	int mode;
+
+	type = b->l.type;
+	c = b->c;
+
+	bwatchReset(b->score);
+
+	blockSetIOState(b, BioClean);
+	
+	/* do not recursively free directories */
+	if(type == BtData || type == BtDir)
+		return;
+
+	n = c->size / VtScoreSize;
+	mode = OReadWrite;
+	/* children of the lowest pointer level are fetched OOverWrite, i.e. without reading them */
+	if(type-1 == BtData || type-1 == BtDir)
+		mode = OOverWrite;
+	for(i=0; i<n; i++){
+		a = globalToLocal(b->data + i*VtScoreSize);
+		if(a == NilBlock || !readLabel(c, &l, a))
+			continue;
+		/* skip children that are closed or belong to another epoch */
+		if((l.state&BsClosed) || l.epoch != epoch)
+			continue;
+		bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0);
+		if(bb == nil)
+			continue;
+		/* the label may have changed between readLabel and the fetch */
+		if((bb->l.state&BsClosed) || bb->l.epoch != epoch){
+			fprint(2, "cleanupBlock: block %ud changed underfoot! expected %L got %L\n",
+				a, &l, &bb->l);
+			blockPut(bb);
+			continue;
+		}
+		blockCleanup(bb, epoch);
+		l.type = BtMax;
+		l.epoch = epoch;
+		l.epochClose = 0;
+		l.state = BsFree;
+		l.tag = 0;
+		/* the result of blockSetLabel is not checked */
+		blockSetLabel(bb, &l);
+		blockPut(bb);
+	}
+}
+
+/*
+ * We don't need the block at addr anymore for the active file system.
+ * If we don't need it for the snapshots, remove it completely.
+ * Epoch is the epoch during which we got rid of the block.
+ * See blockRemoveLink for more.
+ *
+ * Returns 1 on success (including addr == NilBlock) and 0 if the
+ * block could not be fetched or has an epoch later than epoch.
+ */
+static int
+unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
+{
+	Block *b;
+	Label l;
+
+	if(addr == NilBlock)
+		return 1;
+
+//fprint(2, "unlinkBlock %#ux\n", addr);
+	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
+	if(b == nil)
+		return 0;
+	if(b->l.epoch > epoch){
+fprint(2, "unlinkBlock: strange epoch :%ud %ud\n", b->l.epoch, epoch);
+		blockPut(b);
+		return 0;
+	}
+
+	l = b->l;
+	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
+		/* open and created in this epoch: free it and what it points to */
+		l.state = BsFree;
+		l.type = BtMax;
+		l.tag = 0;
+		l.epoch = 0;
+		l.epochClose = 0;
+		blockCleanup(b, epoch);
+	}else{
+		/* otherwise just close it as of this epoch */
+		l.state |= BsClosed;
+		l.epochClose = epoch;
+	}
+	/* the result of blockSetLabel is not checked */
+	blockSetLabel(b, &l);
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * try to allocate a BList so we can record that b must
+ * be written out before some other block.
+ * if can't find a BList, write b out instead and return nil.
+ *
+ * Also returns nil when b is already clean.  If b has priors of
+ * its own and no BList is free, sleeps until blistFree returns one.
+ */
+static BList *
+blistAlloc(Block *b)
+{
+	Cache *c;
+	BList *p;
+
+	/*
+	 * It's possible that when we marked b dirty, there were
+	 * too many dirty blocks so we just wrote b there and then.
+	 * So b might not be dirty.  If it's not, no need to worry
+	 * about recording the write constraint.
+	 *
+	 * BlockRemoveLink depends on the fact that if blistAlloc
+	 * returns non-nil, b really is dirty.
+	 */
+	if(b->iostate != BioDirty){
+		assert(b->iostate == BioClean);
+		return nil;
+	}
+		
+	/*
+	 * Easy: maybe there's a free list left.
+	 */
+	c = b->c;
+	vtLock(c->lk);
+	if(c->blfree){
+	HaveBlFree:
+		/* reached here with c->lk held, also via the goto at the end */
+		p = c->blfree;
+		c->blfree = p->next;
+		vtUnlock(c->lk);
+		return p;
+	}
+	vtUnlock(c->lk);
+
+	/*
+	 * No free BLists.  What are our options?
+	 */
+
+	/* Block has no priors? Just write it. */
+	if(b->prior == nil){
+		diskWrite(c->disk, b);
+		while(b->iostate != BioClean)
+			vtSleep(b->ioready);
+		return nil;
+	}
+
+	/*
+	 * Wake the flush thread, which will hopefully free up
+	 * some BLists for us.  We used to flush a block from
+	 * our own prior list and reclaim that BList, but this is
+	 * a no-no: some of the blocks on our prior list may
+	 * be locked by our caller.  Or maybe their label blocks
+	 * are locked by our caller.  In any event, it's too hard
+	 * to make sure we can do I/O for ourselves.  Instead,
+	 * we assume the flush thread will find something.
+	 * (The flush thread never blocks waiting for a block,
+	 * so it won't deadlock like we will.)
+	 */
+	vtLock(c->lk);
+	while(c->blfree == nil){
+		vtWakeup(c->flush);
+		vtSleep(c->blrend);
+	}
+	goto HaveBlFree;
+}
+
+/*
+ * Return bl to the cache's free list and wake a thread
+ * sleeping in blistAlloc for one.
+ */
+void
+blistFree(Cache *c, BList *bl)
+{
+	vtLock(c->lk);
+	bl->next = c->blfree;
+	c->blfree = bl;
+	vtWakeup(c->blrend);
+	vtUnlock(c->lk);
+}
+
+/*
+ * Flush b or one of the blocks it depends on.
+ *
+ * Follows prior lists down from b to a block with no cached
+ * priors left, dropping entries whose block is no longer cached
+ * at that version, writes that block and waits for the write.
+ * The caller's own b stays locked; any prior locked along the way
+ * is released before returning.
+ */
+void
+blockFlush(Block *b)
+{
+	int first, nlock;
+	BList *p, **pp;
+	Block *bb;
+	Cache *c;
+
+//fprint(2, "blockFlush %p\n", b);
+
+	c = b->c;
+	
+	first = 1;	/* true while b is still the caller's block */
+	pp = &b->prior;
+	for(p=*pp; p; p=*pp){
+		bb = cacheLocalLookup(c, p->part, p->addr, p->vers);
+		if(bb == nil){
+			/* prior is gone from the cache: drop the constraint */
+			*pp = p->next;
+			blistFree(c, p);
+			continue;
+		}
+		/* descend into the prior, releasing any intermediate block */
+		if(!first)
+			blockPut(b);
+		first = 0;
+		b = bb;
+		pp = &b->prior;
+	}
+
+	/*
+	 * If b->nlock > 1, the block is aliased within
+	 * a single thread.  That thread is us, and it's
+	 * the block that was passed in (rather than a prior).
+	 * DiskWrite does some funny stuff with VtLock
+	 * and blockPut that basically assumes b->nlock==1.
+	 * We humor diskWrite by temporarily setting
+	 * nlock to 1.  This needs to be revisited.  (TODO)
+	 */
+	nlock = b->nlock;
+	if(nlock > 1){
+		assert(first);
+		b->nlock = 1;
+	}
+	diskWrite(c->disk, b);
+	while(b->iostate != BioClean)
+		vtSleep(b->ioready);
+	b->nlock = nlock;
+	if(!first)
+		blockPut(b);
+}
+
+/*
+ * Note that block bb has to reach the disk before block b.
+ *
+ * An index >= 0 means the score at that index in b is about to
+ * be overwritten; score holds the old value, which is saved so
+ * that a safer ``old'' version of b can be written if b has to
+ * go out first.  Nothing is recorded when bb is already clean
+ * or when blistAlloc returns nil.
+ */
+void
+blockDependency(Block *b, Block *bb, int index, uchar *score)
+{
+	BList *dep;
+
+	if(bb->iostate == BioClean)
+		return;
+
+	assert(bb->iostate == BioDirty);
+
+	if((dep = blistAlloc(bb)) == nil)
+		return;
+
+if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
+
+	/* identify bb by partition, address and version */
+	dep->part = bb->part;
+	dep->addr = bb->addr;
+	dep->type = bb->l.type;
+	dep->vers = bb->vers;
+
+	/* remember what to put back when rolling b back */
+	dep->index = index;
+	if(dep->index >= 0)
+		memmove(dep->score, score, VtScoreSize);
+
+	/* link at the head of b's prior list */
+	dep->next = b->prior;
+	b->prior = dep;
+}
+
+/*
+ * Mark the in-memory block b dirty.  Always returns 1.
+ *
+ * The cache's dirty count is updated under c->lk.  When more
+ * than half of maxdirty blocks are dirty, the flush thread is
+ * woken; when more than maxdirty are dirty, blockFlush(b) is
+ * called as well, so this caller does some writing itself.
+ *
+ * Because blockFlush walks the prior list, blockDirty must not
+ * be called while holding a lock on any of b's priors.  This can
+ * be tested by recompiling with flush always set to 1 below.
+ */
+int
+blockDirty(Block *b)
+{
+	Cache *c;
+	int flush;
+
+	assert(b->part != PartVenti);
+
+	if(b->iostate == BioDirty)
+		return 1;	/* already accounted for */
+	assert(b->iostate == BioClean);
+	b->iostate = BioDirty;
+
+	c = b->c;
+	vtLock(c->lk);
+	c->ndirty++;
+	flush = c->ndirty > c->maxdirty;
+	if(c->ndirty > (c->maxdirty>>1))
+		vtWakeup(c->flush);
+	vtUnlock(c->lk);
+
+	if(!flush)
+		return 1;
+
+	blockFlush(b);
+	return 1;
+}
+
+/*
+ * Block b once pointed at the block bb at addr/type/tag, but no longer does.
+ * 
+ * The file system maintains the following invariants (i-iv checked by flchk):
+ *
+ * (i) b.e in [bb.e, bb.eClose)
+ * (ii) if b.e==bb.e,  then no other b' in e points at bb.
+ * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
+ * (iv) if b is active then no other active b' points at bb.
+ * (v) if b is a past life of b' then only one of b and b' is active (too hard to check)
+ *
+ * The file system initially satisfies these invariants, and we can verify that
+ * the various file system operations maintain them.  See fossil.invariants.
+ *
+ * Condition (i) lets us reclaim blocks once the low epoch is greater
+ * than epochClose.
+ * 
+ * If the condition in (iii) is satisfied, then this is the only pointer to bb,
+ * so bb can be reclaimed once b has been written to disk.  blockRemoveLink
+ * checks !(b.state&Copied) as an optimization.  UnlinkBlock and blockCleanup
+ * will check the conditions again for each block they consider.
+ *
+ * 12/28/2002 01:11 RSC BUG
+ * When Entry structures are changed, most code does (for example):
+ *
+ *	oe = e;
+ *	memset(&e, 0, sizeof e);
+ *	entryPack(&e, b->data, index);
+ *	blockDirty(b);
+ *	blockDependency(b, block referenced by new e);
+ *	addr = globalToLocal(oe.score);
+ *	if(addr != NilBlock)
+ *		blockRemoveLink(b, addr, entryType(&oe), oe.tag);
+ *
+ * This is wrong if there is already a different dependency for that entry
+ * and the entries have different types (different heights of the hash tree).
+ * Because the dependency only records the block address and not the
+ * entry type, putting the old block address into the new entry results in
+ * a bogus entry structure.  blockRollback catches this in an assert failure.
+ * I think the solution is to record the entry type and tag in the BList structure,
+ * but I want to mull it over a bit longer.
+ *
+ * In two and a half months running the system I have seen exactly one
+ * crash due to this bug.
+ */
+int
+blockRemoveLink(Block *b, u32int addr, int type, u32int tag)
+{
+	BList *ul, *p, **pp;
+	Cache *c;
+
+	c = b->c;
+
+	/* drop every prior entry that names the unlinked data block */
+	pp = &b->prior;
+	while((p = *pp) != nil){
+		if(p->part == PartData && p->addr == addr){
+			*pp = p->next;
+			blistFree(c, p);
+		}else
+			pp = &p->next;
+	}
+
+	/* if b has been copied, the blocks it points at can't be reclaimed */
+	if(b->l.state & BsCopied)
+		return 0;
+
+	/* no BList means b is clean (or was just written): unlink right away */
+	ul = blistAlloc(b);
+	if(ul == nil)
+		return unlinkBlock(c, addr, type, tag, b->l.epoch);
+
+	/*
+	 * blistAlloc returned non-nil, so b is dirty.
+	 * (Queueing on a clean block's uhead would be
+	 * counterproductive: b->uhead is only looked at
+	 * when a block goes from dirty to clean.)
+	 */
+	assert(b->iostate == BioDirty);
+
+	ul->part = PartData;
+	ul->addr = addr;
+	ul->type = type;
+	ul->tag = tag;
+	ul->epoch = b->l.epoch;
+	ul->next = nil;
+
+	/* append to b's unlink queue */
+	if(b->uhead != nil)
+		b->utail->next = ul;
+	else
+		b->uhead = ul;
+	b->utail = ul;
+	return 1;
+}
+
+/*
+ * Set the label associated with data block b to *l.
+ *
+ * The label is copied into b->l and packed into the label block
+ * covering b's address, which is then marked dirty.  That label
+ * block is returned; blockSetLabel releases it with blockPut.
+ * If the label block cannot be fetched, b itself is released
+ * with blockPut and nil is returned.
+ */
+Block*
+_blockSetLabel(Block *b, Label *l)
+{
+	Cache *c;
+	Block *lb;
+	u32int laddr;
+	int lpb;
+
+	c = b->c;
+
+	assert(b->part == PartData);
+	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
+
+	lpb = c->size / LabelSize;	/* labels per block */
+	laddr = b->addr / lpb;
+	lb = cacheLocal(c, PartLabel, laddr, OReadWrite);
+	if(lb == nil){
+		blockPut(b);
+		return nil;
+	}
+
+	b->l = *l;
+	labelPack(l, lb->data, b->addr%lpb);
+	blockDirty(lb);
+	return lb;
+}
+
+int
+blockSetLabel(Block *b, Label *l)
+{
+	Block *lb;
+	int wasfree;
+
+	/* sample the old state first: _blockSetLabel overwrites b->l */
+	wasfree = b->l.state == BsFree;
+
+	lb = _blockSetLabel(b, l);
+	if(lb == nil)
+		return 0;
+
+	/*
+	 * When the block is being allocated, the label block (lb)
+	 * must go to disk before the data block (b) itself.  This
+	 * helps the blocks that in turn depend on b.
+	 *
+	 * Suppose bx depends on (must be written out after) b.
+	 * Once b has been written we'll consider bx safe to write,
+	 * but bx can't get at b unless b has a valid label.
+	 *
+	 * Allocation is the only case in which a current label
+	 * is vital because:
+	 *
+	 *	- l.type is set at allocation and never changes.
+	 *	- l.tag is set at allocation and never changes.
+	 *	- l.state is not checked when we load blocks.
+	 *	- the archiver cares deeply about l.state (active
+	 *		vs. copied), but that's handled by direct
+	 *		calls to _blockSetLabel.
+	 */
+	if(wasfree)
+		blockDependency(b, lb, -1, nil);
+
+	blockPut(lb);
+	return 1;
+}
+
+/*
+ * b is about to be written.  If it still has entries on its
+ * prior list, it holds pointers to blocks that are not on disk
+ * yet.  In that case build, in buf, a slightly out-of-date copy
+ * of b in which each such pointer is replaced by the old score
+ * saved on the prior list, and return buf.  If the prior list
+ * is empty, return b->data itself.
+ * (diskThread keeps the block dirty when buf is returned.)
+ */
+uchar *
+blockRollback(Block *b, uchar *buf)
+{
+	BList *p;
+	Entry e;
+	Super super;
+	u32int addr;
+
+	/* nothing pending: the live data can be written as is */
+	if(b->prior == nil)
+		return b->data;
+
+	memmove(buf, b->data, b->c->size);
+	for(p=b->prior; p; p=p->next){
+		/*
+		 * blockWrite has vetted this block for us,
+		 * so every remaining prior has index >= 0.
+		 */
+		assert(p->index >= 0);
+		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
+
+		if(b->part == PartSuper){
+			/* super block: restore the old active root */
+			assert(p->index == 0);
+			superUnpack(&super, buf);
+			addr = globalToLocal(p->score);
+			if(addr == NilBlock){
+				fprint(2, "rolling back super block: bad replacement addr %V\n", p->score);
+				abort();
+			}
+			super.active = addr;
+			superPack(&super, buf);
+		}else if(b->l.type != BtDir){
+			/* pointer block: put the old score back in its slot */
+			memmove(buf+p->index*VtScoreSize, p->score, VtScoreSize);
+		}else{
+			/* directory block: restore the old score in the entry */
+			entryUnpack(&e, buf, p->index);
+			assert(entryType(&e) == p->type);
+			memmove(e.score, p->score, VtScoreSize);
+			if(globalToLocal(p->score) == NilBlock){
+				/* old score is not a local address */
+				e.flags &= ~VtEntryLocal;
+				e.tag = 0;
+				e.snap = 0;
+			}
+			entryPack(&e, buf, p->index);
+		}
+	}
+	return buf;
+}
+
+/*
+ * Try to write block b. 
+ * If b depends on other blocks:
+ * 
+ *	If the block has been written out, remove the dependency.
+ *	If we know how to write out an old version of b that doesn't
+ *		depend on it, do that.
+ *
+ *	Otherwise, bail.
+ *
+ * Returns 1 if b is not dirty or has been handed to diskWrite,
+ * 0 if the write has to wait: either a lookup of a prior
+ * reported lockfail, or a prior with index < 0 is still dirty
+ * in the cache.
+ */
+int
+blockWrite(Block *b)
+{
+	Cache *c;
+	BList *p, **pp;
+	Block *bb;
+	int lockfail;
+
+	c = b->c;
+
+	if(b->iostate != BioDirty)
+		return 1;
+
+	pp = &b->prior;
+	for(p=*pp; p; p=*pp){
+		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
+		if(bb == nil){
+			if(lockfail)
+				return 0;
+			/* block must have been written already */
+			*pp = p->next;
+			blistFree(c, p);
+			continue;
+		}
+
+		/*
+		 * same version of block is still in cache.
+		 * 
+		 * the assertion is true because the block still has version p->vers,
+		 * which means it hasn't been written out since we last saw it.
+		 */
+		assert(bb->iostate == BioDirty);
+		blockPut(bb);
+
+		if(p->index < 0){
+			/*
+			 * We don't know how to temporarily undo
+			 * b's dependency on bb, so just don't write b yet.
+			 *
+			 * NOTE(review): the disabled debug print below reads
+			 * bb->vers after blockPut(bb); fix before enabling it.
+			 */
+			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
+				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
+			return 0;
+		}
+		/* keep walking down the list */
+		pp = &p->next;
+	}
+
+	diskWrite(c->disk, b);	/* any priors left all have index >= 0; blockRollback undoes them */
+	return 1;
+}
+
+/*
+ * Change the I/O state of block b.
+ * Just an assignment except for magic in
+ * switch statement (read comments there).
+ *
+ * The side effects depend on the new state and, in some cases,
+ * on the old one.  b->iostate is assigned only after the switch,
+ * so inside the switch b->iostate is still the old state.
+ * An unknown iostate aborts.
+ */
+void
+blockSetIOState(Block *b, int iostate)
+{
+	int dowakeup;
+	Cache *c;
+	BList *p, *q;
+
+if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
+	
+	c = b->c;
+
+	dowakeup = 0;
+	switch(iostate){
+	default:
+		abort();
+	case BioEmpty:
+		assert(!b->uhead);
+		break;
+	case BioLabel:
+		assert(!b->uhead);
+		break;
+	case BioClean:
+		bwatchDependency(b);
+		/*
+		 * If b->prior is set, it means a write just finished.
+		 * The prior list isn't needed anymore.
+		 */
+		for(p=b->prior; p; p=q){
+			q = p->next;
+			blistFree(c, p);
+		}
+		b->prior = nil;
+		/*
+		 * Freeing a block or just finished a write.
+		 * Move the blocks from the per-block unlink
+		 * queue to the cache unlink queue.
+		 * The dirty count drops and b gets a new version.
+		 */
+		if(b->iostate == BioDirty || b->iostate == BioWriting){
+			vtLock(c->lk);
+			c->ndirty--;
+			b->vers = c->vers++;
+			if(b->uhead){
+				/* add unlink blocks to unlink queue */
+				if(c->uhead == nil){
+					c->uhead = b->uhead;
+					vtWakeup(c->unlink);	/* queue was empty: wake unlinkThread */
+				}else
+					c->utail->next = b->uhead;
+				c->utail = b->utail;
+				b->uhead = nil;
+			}
+			vtUnlock(c->lk);
+		}
+		assert(!b->uhead);
+		dowakeup = 1;
+		break;
+	case BioDirty:
+		/*
+		 * Wrote out an old version of the block (see blockRollback).
+		 * Bump a version count, leave it dirty.
+		 */
+		if(b->iostate == BioWriting){
+			vtLock(c->lk);
+			b->vers = c->vers++;
+			vtUnlock(c->lk);
+			dowakeup = 1;
+		}
+		break;
+	case BioReading:
+	case BioWriting:
+		/*
+		 * Adding block to disk queue.  Bump reference count.
+		 * diskThread decs the count later by calling blockPut.
+		 * This is here because we need to lock c->lk to
+		 * manipulate the ref count.
+		 */
+		vtLock(c->lk);
+		b->ref++;
+		vtUnlock(c->lk);
+		break;
+	case BioReadError:
+	case BioVentiError:
+		/*
+		 * Oops.
+		 */
+		dowakeup = 1;
+		break;
+	}
+	b->iostate = iostate;
+	/*
+	 * Now that the state has changed, we can wake the waiters.
+	 */
+	if(dowakeup)
+		vtWakeupAll(b->ioready);
+}
+
+/*
+ * Return a printable form of a block state (Bs*).
+ * Free and Bad are returned as constant strings; anything else
+ * is formatted as the hex value followed by the names of the
+ * flags that are set, in a static buffer that the next call
+ * overwrites.
+ */
+char*
+bsStr(int state)
+{
+	static char buf[100];
+	static struct {
+		int	bit;
+		char	*name;
+	} flag[] = {
+		{ BsCopied,	",Copied" },
+		{ BsVenti,	",Venti" },
+		{ BsClosed,	",Closed" },
+	};
+	int i;
+
+	if(state == BsFree)
+		return "Free";
+	if(state == BsBad)
+		return "Bad";
+
+	sprint(buf, "%x", state);
+	if(!(state&BsAlloc))
+		strcat(buf, ",Free");	/* should not happen */
+	for(i=0; i<nelem(flag); i++)
+		if(state & flag[i].bit)
+			strcat(buf, flag[i].name);
+	return buf;
+}
+
+/*
+ * Return the name of an I/O state (Bio*) as a constant string,
+ * or "Unknown!!" for a value that is not one of them.
+ */
+char *
+bioStr(int iostate)
+{
+	char *name;
+
+	name = "Unknown!!";
+	switch(iostate){
+	case BioEmpty:
+		name = "Empty";
+		break;
+	case BioLabel:
+		name = "Label";
+		break;
+	case BioClean:
+		name = "Clean";
+		break;
+	case BioDirty:
+		name = "Dirty";
+		break;
+	case BioReading:
+		name = "Reading";
+		break;
+	case BioWriting:
+		name = "Writing";
+		break;
+	case BioReadError:
+		name = "ReadError";
+		break;
+	case BioVentiError:
+		name = "VentiError";
+		break;
+	case BioMax:
+		name = "Max";
+		break;
+	}
+	return name;
+}
+
+/* names of the block types, indexed by type value */
+static char *bttab[] = {
+	"BtData",
+	"BtData+1",
+	"BtData+2",
+	"BtData+3",
+	"BtData+4",
+	"BtData+5",
+	"BtData+6",
+	"BtData+7",
+	"BtDir",
+	"BtDir+1",
+	"BtDir+2",
+	"BtDir+3",
+	"BtDir+4",
+	"BtDir+5",
+	"BtDir+6",
+	"BtDir+7",
+};
+
+/*
+ * Return the name of block type type, or "unknown" if type is
+ * outside the table.  The lower bound is checked explicitly so
+ * that a negative type is rejected whether or not nelem()
+ * yields an unsigned value.
+ */
+char*
+btStr(int type)
+{
+	if(type >= 0 && type < nelem(bttab))
+		return bttab[type];
+	return "unknown";
+}
+
+int
+labelFmt(Fmt *f)	/* format routine: takes a Label* from the argument list and prints it */
+{
+	Label *l;
+
+	l = va_arg(f->args, Label*);
+	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",	/* type,state,e=epoch,epochClose,tag=tag */
+		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
+}
+
+/*
+ * Format routine for a score (uchar*) argument.
+ * A nil score prints as "*"; a score that globalToLocal maps
+ * to a local address prints as that address in hex; any other
+ * score prints as VtScoreSize bytes of hex.  Always returns 0.
+ */
+int
+scoreFmt(Fmt *f)
+{
+	uchar *score;
+	u32int addr;
+	int i;
+
+	score = va_arg(f->args, uchar*);
+	if(score == nil){
+		fmtprint(f, "*");
+		return 0;
+	}
+
+	addr = globalToLocal(score);
+	if(addr != NilBlock){
+		fmtprint(f, "0x%.8ux", addr);
+		return 0;
+	}
+
+	for(i = 0; i < VtScoreSize; i++)
+		fmtprint(f, "%2.2ux", score[i]);
+	return 0;
+}
+
+/*
+ * Sift block b up the heap starting from slot i.
+ * Blocks are ordered by (used - now), computed in unsigned
+ * 32-bit arithmetic; b moves up while its key is smaller than
+ * its parent's.  Returns the slot b ends up in.
+ */
+static int
+upHeap(int i, Block *b)
+{
+	Cache *c;
+	Block *parent;
+	u32int now;
+	int pi;
+
+	c = b->c;
+	now = c->now;
+	while(i != 0){
+		pi = (i - 1) >> 1;
+		parent = c->heap[pi];
+		if(b->used - now >= parent->used - now)
+			break;
+		/* pull the parent down into the hole */
+		c->heap[i] = parent;
+		parent->heap = i;
+		i = pi;
+	}
+	c->heap[i] = b;
+	b->heap = i;
+
+	return i;
+}
+
+/*
+ * Sift block b down the heap starting from slot i.
+ * At each level the child with the smaller (used - now) key is
+ * chosen; b moves down while its key is larger than that
+ * child's.  Returns the slot b ends up in.
+ */
+static int
+downHeap(int i, Block *b)
+{
+	Cache *c;
+	Block *child;
+	u32int now;
+	int k;
+
+	c = b->c;
+	now = c->now;
+	for(;;){
+		k = (i << 1) + 1;	/* left child */
+		if(k >= c->nheap)
+			break;
+		/* prefer the right child when its key is smaller */
+		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
+			k++;
+		child = c->heap[k];
+		if(b->used - now <= child->used - now)
+			break;
+		/* pull the child up into the hole */
+		c->heap[i] = child;
+		child->heap = i;
+		i = k;
+	}
+	c->heap[i] = b;
+	b->heap = i;
+	return i;
+}
+
+/*
+ * Remove block b from the heap; a block that is not in the
+ * heap (b->heap == BadHeap) is left alone.
+ * Called with c->lk held.
+ */
+static void
+heapDel(Block *b)
+{
+	Cache *c;
+	Block *last;
+	int slot;
+
+	slot = b->heap;
+	if(slot == BadHeap)
+		return;
+
+	c = b->c;
+	b->heap = BadHeap;
+	c->nheap--;
+	if(slot == c->nheap)
+		return;		/* b was the last element */
+
+	/* move the last element into the vacated slot and restore order */
+	last = c->heap[c->nheap];
+	if(upHeap(slot, last) == slot)
+		downHeap(slot, last);
+}
+
+/*
+ * Insert a block into the heap.
+ * Called with c->lk held.
+ */
+static void
+heapIns(Block *b)
+{
+	assert(b->heap == BadHeap);	/* b must not already be in the heap */
+	upHeap(b->c->nheap++, b);	/* place at the end, then sift up */
+}
+
+/*
+ * Get just the label for a block.
+ *
+ * Unpacks the label of data block addr into *l.
+ * Returns 1 on success, 0 if the label block cannot be
+ * fetched or the label fails to unpack.
+ */
+static int
+readLabel(Cache *c, Label *l, u32int addr)
+{
+	int lpb, ok;
+	Block *b;
+	u32int a;
+
+	lpb = c->size / LabelSize;	/* labels per block */
+	a = addr / lpb;
+	b = cacheLocal(c, PartLabel, a, OReadOnly);
+	if(b == nil)
+		return 0;	/* no block was obtained, so there is nothing to put */
+
+	ok = labelUnpack(l, b->data, addr%lpb);
+	blockPut(b);
+	return ok != 0;
+}
+
+/*
+ * Process unlink queue.
+ * Called with c->lk held.
+ *
+ * c->lk is released around each unlinkBlock call and taken
+ * again afterwards, so it is held again on return.  Each
+ * processed BList goes back on the free list; as in blistFree,
+ * c->blrend is woken so that a thread sleeping in blistAlloc
+ * notices the free entry.
+ */
+static void
+unlinkBody(Cache *c)
+{
+	BList *p;
+
+	while(c->uhead != nil){
+		p = c->uhead;
+		c->uhead = p->next;
+		vtUnlock(c->lk);
+
+		if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch))
+			fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n",
+				p->addr, p->type, p->tag);
+
+		vtLock(c->lk);
+		p->next = c->blfree;
+		c->blfree = p;
+		vtWakeup(c->blrend);	/* blistAlloc may be waiting for a free BList */
+	}
+}
+
+/*
+ * Thread that works off the cache unlink queue.
+ * It sleeps on c->unlink until the queue is non-empty or c->die
+ * is set.  Once c->die is set it stops, without processing what
+ * is left on the queue, drops its reference on the cache and
+ * wakes c->die.
+ */
+static void
+unlinkThread(void *a)
+{
+	Cache *c;
+
+	c = a;
+	vtThreadSetName("unlink");
+
+	vtLock(c->lk);
+	for(;;){
+		while(c->die == nil && c->uhead == nil)
+			vtSleep(c->unlink);
+		if(c->die != nil)
+			break;
+		unlinkBody(c);
+	}
+	c->ref--;
+	vtWakeup(c->die);
+	vtUnlock(c->lk);
+}
+
+/*
+ * qsort comparison for BAddr: order by part, then by addr.
+ */
+static int
+baddrCmp(void *a0, void *a1)
+{
+	BAddr *x, *y;
+
+	x = a0;
+	y = a1;
+	if(x->part != y->part)
+		return x->part < y->part ? -1 : 1;
+	if(x->addr != y->addr)
+		return x->addr < y->addr ? -1 : 1;
+	return 0;
+}
+
+/*
+ * Scan the block table for dirty blocks and record the part,
+ * addr and vers of each in c->baddr.  c->bw is set to the number
+ * recorded and the list is sorted with baddrCmp.  If the cache
+ * has no dirty blocks, nothing is touched.
+ */
+static void
+flushFill(Cache *c)
+{
+	Block *b, *eb;
+	BAddr *w;
+
+	vtLock(c->lk);
+	if(c->ndirty == 0){
+		vtUnlock(c->lk);
+		return;
+	}
+
+	w = c->baddr;
+	eb = c->blocks + c->nblocks;
+	for(b = c->blocks; b < eb; b++){
+		if(b->part != PartError && b->iostate == BioDirty){
+			w->part = b->part;
+			w->addr = b->addr;
+			w->vers = b->vers;
+			w++;
+		}
+	}
+	vtUnlock(c->lk);
+
+	c->bw = w - c->baddr;
+	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
+}
+	
+/*
+ * This is not thread safe, i.e. it can't be called from multiple threads.
+ * 
+ * It's okay how we use it, because it only gets called in
+ * the flushThread.  And cacheFree, but only after
+ * cacheFree has killed off the flushThread.
+ *
+ * Tries to get one dirty block written.  Returns 1 once
+ * blockWrite has accepted a block, 0 when there is nothing
+ * left to try.
+ *
+ * c->baddr[c->br .. c->be) is the batch being worked through;
+ * entries that could not be written are copied back to
+ * c->baddr[0 .. c->bw) and become the next batch.  flushFill
+ * is called for a fresh list when the batch is exhausted and
+ * either nothing was requeued or everything was.
+ */
+static int
+cacheFlushBlock(Cache *c)
+{
+	Block *b;
+	BAddr *p;
+	int lockfail, nfail;
+
+	nfail = 0;
+	for(;;){
+		if(c->br == c->be){
+			if(c->bw == 0 || c->bw == c->be)
+				flushFill(c);
+			c->br = 0;
+			c->be = c->bw;
+			c->bw = 0;
+			c->nflush = 0;
+		}
+
+		if(c->br == c->be)
+			return 0;
+		p = c->baddr + c->br;
+		c->br++;
+		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
+
+		if(b && blockWrite(b)){
+			c->nflush++;
+			blockPut(b);
+			return 1;
+		}
+		if(b)
+			blockPut(b);
+
+		/*
+		 * Why didn't we write the block?
+		 */
+
+		/* Block already written out */
+		if(b == nil && !lockfail)
+			continue;
+
+		/* Failed to acquire lock; sleep if happens a lot. */
+		if(lockfail && ++nfail > 100)
+			sleep(500);
+
+		/* Requeue block. */
+		if(c->bw < c->be)
+			c->baddr[c->bw++] = *p;
+	}
+	return 0;
+}
+
+/*
+ * Thread that writes dirty blocks to disk.
+ * Each time it is woken on c->flush it calls cacheFlushBlock
+ * up to FlushSize times, stopping early when that returns 0,
+ * and then wakes everyone sleeping on c->flushwait.  When c->die
+ * is set it drops its reference on the cache and wakes c->die.
+ */
+static void
+flushThread(void *a)
+{
+	Cache *c;
+	int n;
+
+	c = a;
+	vtThreadSetName("flush");
+	vtLock(c->lk);
+	while(c->die == nil){
+		vtSleep(c->flush);
+
+		/* do the writing without holding c->lk */
+		vtUnlock(c->lk);
+		n = FlushSize;
+		while(n-- > 0 && cacheFlushBlock(c))
+			;
+		vtLock(c->lk);
+
+		vtWakeupAll(c->flushwait);
+	}
+	c->ref--;
+	vtWakeup(c->die);
+	vtUnlock(c->lk);
+}
+
+/*
+ * Wake the flush thread.  If wait is non-zero, keep waking it
+ * and sleeping on c->flushwait until no dirty blocks remain,
+ * reporting progress on the console.
+ */
+void
+cacheFlush(Cache *c, int wait)
+{
+	vtLock(c->lk);
+	if(wait){
+		while(c->ndirty){
+			consPrint("cacheFlush: %d dirty blocks\n", c->ndirty);
+			vtWakeup(c->flush);
+			vtSleep(c->flushwait);
+		}
+		/* no format verb here, so no argument is passed */
+		consPrint("cacheFlush: done\n");
+	}else
+		vtWakeup(c->flush);
+	vtUnlock(c->lk);
+}

+ 298 - 0
sys/src/cmd/fossil/dat.h

@@ -0,0 +1,298 @@
+typedef struct Arch Arch;
+typedef struct BList BList;
+typedef struct Block Block;
+typedef struct Cache Cache;
+typedef struct Disk Disk;
+typedef struct Entry Entry;
+typedef struct Header Header;
+typedef struct Label Label;
+typedef struct Periodic Periodic;
+typedef struct Snap Snap;
+typedef struct Source Source;
+typedef struct Super Super;
+typedef struct WalkPtr WalkPtr;
+
+/* tuneable parameters - probably should not be constants */
+enum {
+	BytesPerEntry = 100,	/* estimate of bytes per dir entries - determines number of index entries in the block */
+	FullPercentage = 80,	/* don't allocate in block if more than this percentage full */
+	FlushSize = 200,	/* number of blocks to flush */
+	DirtyPercentage = 50,	/* maximum percentage of dirty blocks */
+};
+
+enum {
+	NilBlock	= (~0UL),	/* "no local block"; callers compare globalToLocal's result with it */
+	MaxBlock	= (1UL<<31),
+};
+
+enum {
+	HeaderMagic = 0x3776ae89,
+	HeaderVersion = 1,
+	HeaderOffset = 128*1024,
+	HeaderSize = 512,
+	SuperMagic = 0x2340a3b1,
+	SuperSize = 512,
+	SuperVersion = 1,
+	LabelSize = 14,
+};
+
+/* well known tags */
+enum {
+	BadTag = 0,		/* this tag should not be used */
+	RootTag = 1,		/* root of fs */
+	EnumTag,		/* root of a dir listing */
+	UserTag = 32,		/* all other tags should be >= UserTag */
+};
+
+struct Super {
+	u16int version;
+	u32int epochLow;
+	u32int epochHigh;
+	u64int qid;			/* next qid */
+	u32int active;			/* root of active file system */
+	u32int next;			/* root of next snapshot to archive */
+	u32int current;			/* root of snapshot currently archiving */
+	uchar last[VtScoreSize];	/* last snapshot successfully archived */
+	char name[128];			/* label */
+};
+
+
+struct Fs {
+	Arch *arch;		/* immutable */
+	Cache *cache;		/* immutable */
+	int mode;		/* immutable */
+	int blockSize;		/* immutable */
+	VtSession *z;		/* immutable */
+	Snap *snap;	/* immutable */
+
+	Periodic *metaFlush;	/* periodically flushes meta data cached in files */
+
+	
+
+	/*
+	 * epoch lock.
+	 * Most operations on the fs require a read lock of elk, ensuring that
+	 * the current high and low epochs do not change under foot.
+	 * This lock is mostly acquired via a call to fileLock or fileRlock.
+	 * Deletion and creation of snapshots occurs under a write lock of elk,
+	 * ensuring no file operations are occurring concurrently.
+	 */
+	VtLock *elk;		/* epoch lock */
+	u32int ehi;		/* epoch high */
+	u32int elo;		/* epoch low */
+
+	Source *source;		/* immutable: root of sources */
+	File *file;		/* immutable: root of files */
+};
+
+/*
+ * variant on VtEntry
+ * there are extra fields when stored locally
+ */
+struct Entry {
+	u32int gen;			/* generation number */
+	ushort psize;			/* pointer block size */
+	ushort dsize;			/* data block size */
+	uchar depth;			/* unpacked from flags */
+	uchar flags;
+	uvlong size;
+	uchar score[VtScoreSize];
+	u32int tag;			/* tag for local blocks: zero if stored on Venti */
+	u32int snap;			/* non zero -> entering snapshot of given epoch */
+	uchar archive;			/* archive this snapshot: only valid for snap != 0 */
+};
+
+struct Source {
+	Fs *fs;		/* immutable */
+	int mode;	/* immutable */
+	u32int gen;	/* immutable */
+	int dsize;	/* immutable */
+	int dir;	/* immutable */
+
+	Source *parent;	/* immutable */
+
+	VtLock *lk;
+	int ref;
+	/*
+	 * epoch for the source 
+	 * for ReadWrite sources, epoch is used to lazily notice
+	 * sources that must be split from the snapshots.
+	 * for ReadOnly sources, the epoch represents the minimum epoch
+	 * along the chain from the root, and is used to lazily notice
+	 * sources that have become invalid because they belong to an old
+	 * snapshot.
+	 */
+	u32int epoch;
+	Block *b;			/* block containing this source */
+	uchar score[VtScoreSize];	/* score of block containing this source */
+	u32int scoreEpoch;	/* epoch of block containing this source */
+	int epb;			/* immutable: entries per block in parent */
+	u32int tag;			/* immutable: tag of parent */
+	u32int offset; 			/* immutable: entry offset in parent */
+};
+
+
+struct Header {
+	ushort version;
+	ushort blockSize;
+	ulong super;	/* super blocks */
+	ulong label;	/* start of labels */
+	ulong data;	/* end of labels - start of data blocks */
+	ulong end;	/* end of data blocks */
+};
+
+/*
+ * contains a one block buffer
+ * to avoid problems of the block changing underfoot
+ * and to enable an interface that supports unget.
+ */
+struct DirEntryEnum {
+	File *file;
+
+	u32int boff; 	/* block offset */
+
+	int i, n;
+	DirEntry *buf;
+};
+
+/* Block states; two orthogonal fields, Bv* and Ba* */
+enum {
+	BsFree = 0,		/* available for allocation */
+	BsBad = 0xFF,		/* something is wrong with this block */
+
+	/* bit fields */
+	BsAlloc = 1<<0,	/* block is in use */
+	BsCopied = 1<<1,	/* block has been copied */
+	BsVenti = 1<<2,	/* block has been stored on Venti */
+	BsClosed = 1<<3,	/* block has been unlinked from active file system */
+	BsMask = BsAlloc|BsCopied|BsVenti|BsClosed,
+};
+
+/*
+ * Each block has a state and generation
+ * The following invariants are maintained
+ * 	Each block has no more than one parent per generation
+ * 	For Active*, no child has a parent of a greater generation
+ *	For Snap*, there is a snap parent of given generation and there are
+ *		no parents of greater gen - implies no children snaps
+ *		of a lesser gen
+ *	For *RO, the block is fixed - no change can be made - all pointers
+ *		are valid venti addresses
+ *	For *A, the block is on the venti server
+ *	There are no pointers to Zombie blocks
+ *
+ * Transitions
+ *	Archiver at generation g
+ *	Mutator at generation h
+ *	
+ *	Want to modify a block
+ *		Venti: create new Active(h)
+ *		Active(x): x == h: do nothing
+ *		Active(x): x < h: change to Snap(h-1) + add Active(h)
+ *		ActiveRO(x): change to SnapRO(h-1) + add Active(h)
+ *		ActiveA(x): add Active(h)
+ *		Snap*(x): should not occur
+ *		Zombie(x): should not occur
+ *	Want to archive
+ *		Active(x): x != g: should never happen
+ *		Active(x): x == g fix children and free them: move to ActiveRO(g);
+ *		ActiveRO(x): x != g: should never happen
+ *		ActiveRO(x): x == g: wait until it hits ActiveA or SnapA
+ *		ActiveA(x): done
+ *		Snap(x): x < g: should never happen
+ *		Snap(x): x >= g: fix children, freeing all SnapA(y) x == y;
+ *		SnapRO(x): wait until it hits SnapA
+ *
+ */
+
+/* 
+ * block types
+ * more regular than Venti block types
+ * bit 3 -> directory block (BtDir) if set, data block (BtData) if clear
+ * bits 2-0 -> level of block
+ */
+enum {
+	BtData,
+	BtDir = 1<<3,
+	BtLevelMask = 7,	/* mask for the level bits 2-0 */
+	BtMax = 1<<4,	/* one past the largest type value */
+};
+
+/* io states */
+enum {
+	BioEmpty,	/* label & data are not valid */
+	BioLabel,	/* label is good */
+	BioClean,	/* data is on the disk */
+	BioDirty,	/* data is not yet on the disk */
+	BioReading,	/* in process of reading data */
+	BioWriting,	/* in process of writing data */
+	BioReadError,	/* error reading: assume disk always handles write errors */
+	BioVentiError,	/* error reading from venti (probably disconnected) */
+	BioMax
+};
+
+struct Label {
+	uchar type;	/* block type (Bt*) */
+	uchar state;	/* block state (Bs*) */
+	u32int tag;
+	u32int epoch;
+	u32int epochClose;
+};
+
+struct Block {
+	Cache *c;	/* cache this block belongs to */
+	int ref;	/* reference count; changed under c->lk */
+	int nlock;	/* more than 1 when aliased within one thread (see blockFlush) */
+	ulong	pc;		/* pc that fetched this block from the cache */
+
+	VtLock *lk;	/* diskThread holds this while doing the block's I/O */
+	
+	int 	part;	/* disk partition (Part*) */
+	u32int	addr;	/* block address within part */
+	uchar	score[VtScoreSize];	/* score */
+	Label l;	/* in-memory copy of the block's label */
+
+	uchar 	*data;	/* block contents */
+
+	/* the following is private; used by cache */
+
+	Block	*next;			/* doubly linked hash chains */
+	Block	**prev;
+	u32int	heap;			/* index in heap table */
+	u32int	used;			/* last reference times */
+
+	u32int	vers;			/* version of dirty flag */
+
+	BList	*uhead;			/* blocks to unlink when this block is written */
+	BList	*utail;
+
+	/* block ordering for cache -> disk */
+	BList	*prior;			/* list of blocks before this one */
+
+	Block	*ionext;	/* next block on a disk queue (see diskQueue) */
+	int	iostate;	/* I/O state (Bio*) */
+	VtRendez *ioready;	/* blockSetIOState wakes this after some state changes */
+};
+
+/* tree walker, for gc and archiver */
+struct WalkPtr
+{
+	uchar *data;
+	int isEntry;
+	int n;
+	int m;
+	Entry e;
+	uchar type;
+	u32int tag;
+};
+
+/* disk partitions */
+enum {
+	PartError,
+	PartSuper,
+	PartLabel,
+	PartData,
+	PartVenti,	/* fake partition */
+};
+
+extern vtType[BtMax];

+ 25 - 0
sys/src/cmd/fossil/deadlock

@@ -0,0 +1,25 @@
+#!/bin/rc
+
+rfork e
+
+x=($*)	# process ids to examine, if given as arguments
+if(~ $#x 0){	# none given: look for processes named 8.fossil
+	x=`{ps |awk '$NF=="8.fossil" {print $2}'}
+	ps | awk '$7=="8.fossil"'
+}
+if(~ $#x 0){	# still none: look for processes named fossil
+	x=`{ps | awk '$NF=="fossil" {print $2}'}
+	ps -a | awk '$7 == "fossil"'
+}
+
+y=$x^', '	# append ", " to every pid
+y=$"y	# flatten the list into one string
+echo 'include("/sys/src/cmd/fossil/fossil-acid");
+print("--XXX\n");
+deadlocklist({' ^ $y ^ '});
+print("--YYY\n");' |
+	acid $x(1) |
+	sed -n '/--XXX/,/--YYY/p' |
+	sed 's/acid: //g' |
+	grep -v '^--'
+

+ 332 - 0
sys/src/cmd/fossil/disk.c

@@ -0,0 +1,332 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+static void diskThread(void *a);
+
+enum {
+	QueueSize = 100,		/* maximum block to queue */
+};
+
+struct Disk {
+	VtLock *lk;	/* held while ref, nqueue and the queues below are changed */
+	int ref;	/* set to 2 by diskAlloc; diskThread drops one when it exits */
+
+	int fd;	/* file descriptor used for pread/pwrite */
+	Header h;	/* header unpacked by diskAlloc */
+
+	VtRendez *flow;	/* diskQueue sleeps here while nqueue >= QueueSize */
+	VtRendez *starve;	/* diskThread sleeps here while nqueue == 0 */
+	VtRendez *flush;	/* diskFlush sleeps here until nqueue is 0 */
+	VtRendez *die;	/* allocated by diskFree; non-nil tells diskThread to exit */
+
+	int nqueue;	/* blocks queued or being processed */
+
+	Block *cur;		/* block to do on current scan */
+	Block *next;		/* blocks to do next scan */
+};
+
+
+/*
+ * Create a Disk for the file open on fd.
+ * The header is read from HeaderOffset and unpacked; on a
+ * short read or a header that does not unpack, nil is
+ * returned.  Otherwise the locks and rendezvous are set up and
+ * diskThread is started.  ref starts at 2; diskThread drops
+ * one when it exits and diskFree waits for that.
+ */
+Disk *
+diskAlloc(int fd)
+{
+	u8int hbuf[HeaderSize];
+	Header hdr;
+	Disk *d;
+
+	if(pread(fd, hbuf, HeaderSize, HeaderOffset) < HeaderSize){
+		vtOSError();
+		return nil;
+	}
+	if(!headerUnpack(&hdr, hbuf))
+		return nil;
+
+	d = vtMemAllocZ(sizeof(Disk));
+	d->fd = fd;
+	d->h = hdr;
+	d->lk = vtLockAlloc();
+	d->starve = vtRendezAlloc(d->lk);
+	d->flow = vtRendezAlloc(d->lk);
+	d->flush = vtRendezAlloc(d->lk);
+
+	d->ref = 2;
+	vtThread(diskThread, d);
+
+	return d;
+}
+
+/*
+ * Shut down and free a Disk.
+ * Waits for queued I/O to finish (diskFlush), asks diskThread
+ * to exit by allocating disk->die and waking it, waits for the
+ * thread to drop its reference, then releases every rendezvous
+ * allocated for the disk (including flush, which diskAlloc
+ * allocates), the lock, the file descriptor and the structure.
+ */
+void
+diskFree(Disk *disk)
+{
+	diskFlush(disk);
+
+	/* kill slave */
+	vtLock(disk->lk);
+	disk->die = vtRendezAlloc(disk->lk);
+	vtWakeup(disk->starve);
+	while(disk->ref > 1)
+		vtSleep(disk->die);
+	vtUnlock(disk->lk);
+	vtRendezFree(disk->flow);
+	vtRendezFree(disk->starve);
+	vtRendezFree(disk->flush);
+	vtRendezFree(disk->die);
+	vtLockFree(disk->lk);
+	close(disk->fd);
+	vtMemFree(disk);
+}
+
+/*
+ * First block of partition part, taken from the disk header.
+ * Any partition other than super, label or data trips an assert.
+ */
+static u32int
+partStart(Disk *disk, int part)
+{
+	if(part == PartLabel)
+		return disk->h.label;
+	if(part == PartData)
+		return disk->h.data;
+	if(part != PartSuper)
+		assert(0);
+	return disk->h.super;
+}
+
+
+/*
+ * Block just past the end of partition part: the super
+ * partition is one block long, labels end where data starts,
+ * and data ends at h.end.  Any other partition trips an assert.
+ */
+static u32int
+partEnd(Disk *disk, int part)
+{
+	if(part == PartLabel)
+		return disk->h.data;
+	if(part == PartData)
+		return disk->h.end;
+	if(part != PartSuper)
+		assert(0);
+	return disk->h.super+1;
+}
+
+/*
+ * Read block addr of partition part into buf, synchronously.
+ * buf must have room for one block (h.blockSize bytes).
+ * Short reads are continued until the whole block has arrived.
+ * Returns 1 on success; returns 0 with the error set if addr
+ * is outside the partition, pread fails, or end of file is hit.
+ */
+int
+diskReadRaw(Disk *disk, int part, u32int addr, uchar *buf)
+{
+	ulong start, end;
+	u64int offset;
+	int left, got;
+
+	start = partStart(disk, part);
+	end = partEnd(disk, part);
+
+	if(addr >= end-start){
+		vtSetError(EBadAddr);
+		return 0;
+	}
+
+	offset = ((u64int)(addr + start))*disk->h.blockSize;
+	for(left = disk->h.blockSize; left > 0; left -= got){
+		got = pread(disk->fd, buf, left, offset);
+		if(got < 0){
+			vtOSError();
+			return 0;
+		}
+		if(got == 0){
+			vtSetError(EIO);
+			return 0;
+		}
+		offset += got;
+		buf += got;
+	}
+	return 1;
+}
+
+/*
+ * Write one block from buf to block addr of partition part,
+ * synchronously, with a single pwrite.
+ * Returns 1 on success; returns 0 with the error set if addr
+ * is outside the partition or fewer than h.blockSize bytes
+ * were written.
+ */
+int
+diskWriteRaw(Disk *disk, int part, u32int addr, uchar *buf)
+{
+	ulong start, end;
+	u64int offset;
+	int bsize;
+
+	start = partStart(disk, part);
+	end = partEnd(disk, part);
+
+	if(addr >= end-start){
+		vtSetError(EBadAddr);
+		return 0;
+	}
+
+	bsize = disk->h.blockSize;
+	offset = ((u64int)(addr + start))*disk->h.blockSize;
+	if(pwrite(disk->fd, buf, bsize, offset) < bsize){
+		vtOSError();
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Hand block b to the disk thread.
+ * Sleeps on disk->flow while QueueSize blocks are already
+ * queued.  The block goes on the current scan if that scan is
+ * empty or b lies beyond its head, otherwise on the next
+ * scan; either list is kept sorted by ascending address.
+ * The disk thread is woken if the queue was empty.
+ */
+static void
+diskQueue(Disk *disk, Block *b)
+{
+	Block **l, *bb;
+
+	vtLock(disk->lk);
+	while(disk->nqueue >= QueueSize)
+		vtSleep(disk->flow);
+
+	/* choose the scan, then find b's place in address order */
+	l = &disk->next;
+	if(disk->cur == nil || b->addr > disk->cur->addr)
+		l = &disk->cur;
+	while((bb = *l) != nil && b->addr >= bb->addr)
+		l = &bb->ionext;
+	b->ionext = bb;
+	*l = b;
+
+	if(disk->nqueue == 0)
+		vtWakeup(disk->starve);
+	disk->nqueue++;
+	vtUnlock(disk->lk);
+}
+
+
+void
+diskRead(Disk *disk, Block *b)	/* queue b to be read by the disk thread */
+{
+	assert(b->iostate == BioEmpty || b->iostate == BioLabel);	/* only blocks without valid data are read */
+	blockSetIOState(b, BioReading);	/* also takes the reference that diskThread later puts */
+	diskQueue(disk, b);	/* may sleep while the queue is full */
+}
+
+void
+diskWrite(Disk *disk, Block *b)	/* queue b to be written by the disk thread */
+{
+	assert(b->iostate == BioDirty);	/* only dirty blocks are written */
+	blockSetIOState(b, BioWriting);	/* also takes the reference that diskThread later puts */
+	diskQueue(disk, b);	/* may sleep while the queue is full */
+}
+
+int
+diskBlockSize(Disk *disk)
+{
+	return disk->h.blockSize;	/* immutable */
+}
+
+int
+diskFlush(Disk *disk)	/* wait for queued I/O, then flush the fd; returns 1 on success, 0 on error */
+{
+	Dir dir;
+
+	vtLock(disk->lk);
+	while(disk->nqueue > 0)
+		vtSleep(disk->flush);	/* diskThread wakes flush when nqueue reaches 0 */
+	vtUnlock(disk->lk);
+
+	/*
+	 * there really should be a cleaner interface to flush an fd.
+	 * a wstat with a nulldir'ed Dir changes no fields; it is used
+	 * here as the flush request.
+	 */
+	nulldir(&dir);
+	if(dirfwstat(disk->fd, &dir) < 0){
+		vtOSError();
+		return 0;
+	}
+	return 1;
+}
+
+u32int
+diskSize(Disk *disk, int part)	/* number of blocks in partition part */
+{
+	return partEnd(disk, part) - partStart(disk, part);
+}
+
+static void
+diskThread(void *a)	/* thread that performs the queued reads and writes, one block at a time */
+{
+	Disk *disk = a;
+	Block *b;
+	uchar *buf, *p;
+	double t;
+	int nio;
+
+	vtThreadSetName("disk");
+
+fprint(2, "diskThread %d\n", getpid());
+
+	buf = vtMemAlloc(disk->h.blockSize);	/* scratch block for blockRollback */
+
+	vtLock(disk->lk);
+	nio = 0;	/* I/Os done since the last statistics line */
+	t = -nsec();	/* accumulates busy time: -start on waking, +now before sleeping */
+	for(;;){
+		while(disk->nqueue == 0){
+			t += nsec();
+if(nio >= 10000){
+fprint(2, "disk: io=%d at %.3fms\n", nio, t*1e-6/nio);
+nio = 0;
+t = 0.;
+}
+			if(disk->die != nil)	/* diskFree wants us gone; only checked when the queue is empty */
+				goto Done;
+			vtSleep(disk->starve);
+			t -= nsec();
+		}
+		assert(disk->cur != nil || disk->next != nil);
+
+		if(disk->cur == nil){	/* current scan finished: start the next one */
+			disk->cur = disk->next;
+			disk->next = nil;
+		}
+		b = disk->cur;
+		disk->cur = b->ionext;
+		vtUnlock(disk->lk);
+
+		/*
+		 * no one should hold onto blocks in the
+		 * reading or writing state, so this lock should
+		 * not cause deadlock.
+		 */
+if(0)fprint(2, "diskThread: %d:%d %x\n", getpid(), b->part, b->addr);
+		bwatchLock(b);
+		vtLock(b->lk);
+		assert(b->nlock == 1);
+
+		switch(b->iostate){
+		default:
+			abort();
+		case BioReading:
+			if(!diskReadRaw(disk, b->part, b->addr, b->data)){
+fprint(2, "diskReadRaw failed: part=%d addr=%ux: %r", b->part, b->addr);
+				blockSetIOState(b, BioReadError);
+			}else
+				blockSetIOState(b, BioClean);
+			break;
+		case BioWriting:
+			p = blockRollback(b, buf);	/* b->data, or an older safe copy built in buf */
+			if(!diskWriteRaw(disk, b->part, b->addr, p)){
+fprint(2, "diskWriteRaw failed: part=%d addr=%ux: %r", b->part, b->addr);
+				break;	/* NOTE(review): b stays BioWriting and b->ioready is not woken on this path */
+			}
+			if(p != buf)
+				blockSetIOState(b, BioClean);	/* the current contents went out */
+			else
+				blockSetIOState(b, BioDirty);	/* a rolled-back copy went out; b is still dirty */
+			break;
+		}
+		
+		blockPut(b);		/* remove extra reference, unlock */
+		vtLock(disk->lk);
+		disk->nqueue--;
+		if(disk->nqueue == QueueSize-1)
+			vtWakeup(disk->flow);	/* the queue was full and now has room */
+		if(disk->nqueue == 0)
+			vtWakeup(disk->flush);	/* the queue has drained: wake diskFlush */
+		nio++;
+	}
+Done:
+fprint(2, "diskThread done\n");
+	disk->ref--;
+	vtWakeup(disk->die);
+	vtUnlock(disk->lk);
+	vtMemFree(buf);
+}

+ 86 - 0
sys/src/cmd/fossil/dump.c

@@ -0,0 +1,86 @@
+/*
+ * Clumsy hack to take snapshots and dumps.
+ */
+#include <u.h>
+#include <libc.h>
+
+/*
+ * Print the command synopsis on standard error and exit with
+ * status "usage".  Lists every option main accepts, including -s.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: fossil/dump [-i snap-interval] [-n name] [-s pause-secs] fscons /n/fossil\n");
+	exits("usage");
+}
+
+/*
+ * Return the name "archive/yyyy/mmdd" for the local date as it was
+ * five hours ago, so that the date changes at 5:00 am rather than
+ * at midnight.  The result lives in a static buffer that the next
+ * call overwrites.
+ */
+char*
+snapnow(void)
+{
+	static char path[100];
+	Tm when;
+
+	/* take dumps at 5:00 am */
+	when = *localtime(time(0)-5*60*60);
+	sprint(path, "archive/%d/%02d%02d", when.year+1900, when.mon+1, when.mday);
+	return path;
+}
+
+/*
+ * fossil/dump: a background process that writes snapshot commands
+ * to a fossil console.
+ *
+ *	-i secs	interval between snapshots (default one hour);
+ *		0 means archival snapshots only
+ *	-n name	file system name used in the commands (default "main")
+ *	-s secs	pause this long before starting
+ *
+ * argv[0] is the console file to write to; argv[1] is the directory
+ * in which the archive/yyyy/mmdd names are looked up.
+ */
+void
+main(int argc, char **argv)
+{
+	char *fsname;
+	int archiveonly, consfd, pause, pid;
+	ulong now, interval;
+
+	fsname = "main";
+	pause = 0;
+	archiveonly = 0;
+	interval = 60*60;		/* one hour */
+	ARGBEGIN{
+	case 'i':
+		interval = atoi(EARGF(usage()));
+		if(interval == 0){
+			archiveonly = 1;
+			interval = 60*60;
+		}
+		break;
+	case 'n':
+		fsname = EARGF(usage());
+		break;
+	case 's':
+		pause = atoi(EARGF(usage()));
+		break;
+	}ARGEND
+
+	if(argc != 2)
+		usage();
+
+	consfd = open(argv[0], OWRITE);
+	if(consfd < 0)
+		sysfatal("open %s: %r", argv[0]);
+
+	if(chdir(argv[1]) < 0)
+		sysfatal("chdir %s: %r", argv[1]);
+
+	/* detach: the parent exits, the child carries on */
+	rfork(RFNOTEG);
+	pid = fork();
+	if(pid == -1)
+		sysfatal("fork: %r");
+	if(pid != 0)
+		exits(0);
+
+	/*
+	 * pause at boot time to let clock stabilize.
+	 */
+	if(pause != 0)
+		sleep(pause*1000);
+
+	for(;;){
+		/* ask for an archival snapshot if today's is not there yet */
+		if(access(snapnow(), AEXIST) < 0)
+			fprint(consfd, "\nfsys %s snap -a\n", fsname);
+
+		/* sleep until just past the next multiple of interval */
+		now = time(0);
+		sleep((interval - now%interval)*1000+200);
+		if(!archiveonly)
+			fprint(consfd, "\nfsys %s snap\n", fsname);
+	}
+}

+ 36 - 0
sys/src/cmd/fossil/error.c

@@ -0,0 +1,36 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+char EBadAddr[] = "illegal block address";
+char EBadDir[] = "corrupted directory entry";
+char EBadEntry[] = "corrupted file entry";
+char EBadLabel[] = "corrupted block label";
+char EBadMeta[] = "corrupted meta data";
+char EBadMode[] = "illegal mode";
+char EBadOffset[] = "illegal offset";
+char EBadPath[] = "illegal path element";
+char EBadRoot[] = "root of file system is corrupted";
+char EBadSuper[] = "corrupted super block";
+char EBlockTooBig[] = "block too big";
+char ECacheFull[] = "no free blocks in memory cache";
+char EConvert[] = "protocol botch";
+char EExists[] = "file already exists";
+char EFsFill[] = "file system is full";
+char EIO[] = "i/o error";
+char EInUse[] = "file is in use";
+char ELabelMismatch[] = "block label mismatch";
+char ENilBlock[] = "illegal block address";	/* NOTE(review): same text as EBadAddr -- confirm intended */
+char ENoDir[] = "directory entry is not allocated";
+char ENoFile[] = "file does not exist";
+char ENotDir[] = "not a directory";
+char ENotEmpty[] = "directory not empty";
+char ENotFile[] = "not a file";
+char EReadOnly[] = "file is read only";
+char ERemoved[] = "file has been removed";
+char EResize[] = "only support truncation to zero length";
+char ERoot[] = "cannot remove root";
+char ESnapOld[] = "snapshot has been deleted";
+char ESnapRO[] = "snapshot is read only";
+char ETooBig[] = "file too big";

+ 31 - 0
sys/src/cmd/fossil/error.h

@@ -0,0 +1,31 @@
+extern char EBadAddr[];	/* error message strings; the text of each is defined in error.c */
+extern char EBadDir[];
+extern char EBadEntry[];
+extern char EBadLabel[];
+extern char EBadMeta[];
+extern char EBadMode[];
+extern char EBadOffset[];
+extern char EBadPath[];
+extern char EBadRoot[];
+extern char EBadSuper[];
+extern char EBlockTooBig[];
+extern char ECacheFull[];
+extern char EConvert[];
+extern char EExists[];
+extern char EFsFill[];
+extern char EIO[];
+extern char EInUse[];
+extern char ELabelMismatch[];
+extern char ENilBlock[];
+extern char ENoDir[];
+extern char ENoFile[];
+extern char ENotDir[];
+extern char ENotEmpty[];
+extern char ENotFile[];
+extern char EReadOnly[];
+extern char ERemoved[];
+extern char EResize[];
+extern char ERoot[];
+extern char ESnapOld[];
+extern char ESnapRO[];
+extern char ETooBig[];

+ 1648 - 0
sys/src/cmd/fossil/file.c

@@ -0,0 +1,1648 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+/*
+ * locking order is upwards.  A thread can hold the lock for a File
+ * and then acquire the lock of its parent
+ */
+
+struct File {
+	Fs	*fs;		/* immutable */
+
+	/* meta data for file: protected by the lk in the parent */
+	int	ref;		/* holds this data structure up */
+
+	int	partial;	/* file was never really open */
+	int	removed;	/* file has been removed */
+	int	dirty;		/* dir is dirty with respect to meta data in block */
+	u32int	boff;		/* block offset within msource for this file's meta data */
+
+	DirEntry dir;		/* meta data for this file */
+
+	File	*up;		/* parent file */
+	File	*next;		/* sibling */
+
+	/* data for file */
+	VtLock	*lk;		/* lock for the following */
+	Source	*source;	/* file contents; for directories, the entries of the children */
+	Source	*msource;	/* for directories: meta data for children */
+	File	*down;		/* children */
+
+	int	mode;		/* starts as fs->mode, copied from the parent on lookup, OReadOnly for snapshots */
+};
+
+static int fileMetaFlush2(File*, char*);
+static u32int fileMetaAlloc(File*, DirEntry*, u32int);
+static int fileRLock(File*);
+static void fileRUnlock(File*);
+static int fileLock(File*);
+static void fileUnlock(File*);
+static void fileMetaLock(File*);
+static void fileMetaUnlock(File*);
+static void fileRAccess(File*);
+static void fileWAccess(File*, char*);
+
+/*
+ * Allocate a zeroed File for fs holding one reference, with its own
+ * lock, no meta data block assigned (boff is NilBlock) and the
+ * file system's mode.  The file is not linked to any parent.
+ */
+static File *
+fileAlloc(Fs *fs)
+{
+	File *nf;
+
+	nf = vtMemAllocZ(sizeof *nf);
+	nf->fs = fs;
+	nf->mode = fs->mode;
+	nf->ref = 1;
+	nf->boff = NilBlock;
+	nf->lk = vtLockAlloc();
+	return nf;
+}
+
+/*
+ * Release everything f owns: both sources, its lock and the
+ * contents of its directory entry.  The structure is filled with
+ * ~0 bytes before being freed so that stale pointers to it are
+ * conspicuous.
+ */
+static void
+fileFree(File *f)
+{
+	sourceClose(f->source);
+	sourceClose(f->msource);
+	vtLockFree(f->lk);
+	deCleanup(&f->dir);
+
+	memset(f, ~0, sizeof *f);
+	vtMemFree(f);
+}
+
+/*
+ * Search the meta data blocks of directory f for an entry named
+ * elem.  On success returns a freshly allocated File, not yet
+ * linked into the tree, carrying the unpacked entry, the index of
+ * the meta block it was found in, and f's mode.  Returns nil with
+ * the error string set otherwise (ENoFile if no block has it).
+ *
+ * the file is locked already
+ * f->msource is unlocked
+ */
+static File *
+dirLookup(File *f, char *elem)
+{
+	Block *blk;
+	File *found;
+	MetaBlock mb;
+	MetaEntry me;
+	Source *meta;
+	int idx;
+	u32int bo, nblock;
+
+	meta = f->msource;
+	if(!sourceLock(meta, -1))
+		return nil;
+
+	blk = nil;
+	nblock = (sourceGetSize(meta)+meta->dsize-1)/meta->dsize;
+	for(bo=0; bo<nblock; bo++){
+		blk = sourceBlock(meta, bo, OReadOnly);
+		if(blk == nil || !mbUnpack(&mb, blk->data, meta->dsize))
+			goto Err;
+		if(!mbSearch(&mb, elem, &idx, &me)){
+			/* not in this block; try the next */
+			blockPut(blk);
+			blk = nil;
+			continue;
+		}
+
+		found = fileAlloc(f->fs);
+		if(!deUnpack(&found->dir, &me)){
+			fileFree(found);
+			goto Err;
+		}
+		sourceUnlock(meta);
+		blockPut(blk);
+		found->boff = bo;
+		found->mode = f->mode;
+		return found;
+	}
+	vtSetError(ENoFile);
+	/* fall through */
+Err:
+	sourceUnlock(meta);
+	blockPut(blk);
+	return nil;
+}
+
+/*
+ * Build the in-memory File for the root directory of the tree held
+ * in source r.  Entries 0, 1 and 2 of r are opened as the root's
+ * data source, the root's meta source, and the meta source of a
+ * placeholder parent whose block 0, entry 0 holds the root's own
+ * directory entry.  Returns the root, or nil on error.
+ */
+File *
+fileRoot(Source *r)
+{
+	Block *blk;
+	Source *r0, *r1, *r2;
+	MetaBlock mb;
+	MetaEntry me;
+	File *root, *mr;
+	Fs *fs;
+
+	blk = nil;
+	root = nil;
+	mr = nil;
+	r1 = nil;
+	r2 = nil;
+
+	fs = r->fs;
+	if(!sourceLock(r, -1))
+		return nil;
+
+	if((r0 = sourceOpen(r, 0, fs->mode)) == nil)
+		goto Err;
+	if((r1 = sourceOpen(r, 1, fs->mode)) == nil)
+		goto Err;
+	if((r2 = sourceOpen(r, 2, fs->mode)) == nil)
+		goto Err;
+
+	/* placeholder parent: owns r2 from here on */
+	mr = fileAlloc(fs);
+	mr->msource = r2;
+	r2 = nil;
+
+	/* the root proper: owns r0 and r1 from here on */
+	root = fileAlloc(fs);
+	root->boff = 0;
+	root->up = mr;
+	root->source = r0;
+	r0 = nil;
+	root->msource = r1;
+	r1 = nil;
+
+	mr->down = root;
+
+	/* the root's directory entry is entry 0 of block 0 of the parent's meta source */
+	if(!sourceLock(mr->msource, -1))
+		goto Err;
+	blk = sourceBlock(mr->msource, 0, OReadOnly);
+	sourceUnlock(mr->msource);
+	if(blk == nil)
+		goto Err;
+	if(!mbUnpack(&mb, blk->data, mr->msource->dsize))
+		goto Err;
+	meUnpack(&me, &mb, 0);
+	if(!deUnpack(&root->dir, &me))
+		goto Err;
+
+	blockPut(blk);
+	sourceUnlock(r);
+	fileRAccess(root);
+	return root;
+
+Err:
+	/* whatever was handed over to mr or root is released by fileFree */
+	blockPut(blk);
+	if(r0)
+		sourceClose(r0);
+	if(r1)
+		sourceClose(r1);
+	if(r2)
+		sourceClose(r2);
+	if(mr)
+		fileFree(mr);
+	if(root)
+		fileFree(root);
+	sourceUnlock(r);
+	return nil;
+}
+
+/*
+ * Open entry number offset of directory f's source and check that
+ * it is the one expected: its generation must equal gen (otherwise
+ * ERemoved) and, unless its mode is -1, its directory flag must
+ * equal dir (otherwise EBadMeta).  Returns the open source or nil.
+ */
+static Source *
+fileOpenSource(File *f, u32int offset, u32int gen, int dir, uint mode)
+{
+	Source *src;
+
+	if(!sourceLock(f->source, mode))
+		return nil;
+	src = sourceOpen(f->source, offset, mode);
+	sourceUnlock(f->source);
+	if(src == nil)
+		return nil;
+
+	if(src->gen != gen){
+		vtSetError(ERemoved);
+		sourceClose(src);
+		return nil;
+	}
+	if(src->dir != dir && src->mode != -1){
+fprint(2, "fileOpenSource: dir mismatch %d %d\n", src->dir, dir);
+		vtSetError(EBadMeta);
+		sourceClose(src);
+		return nil;
+	}
+	return src;
+}
+
+/*
+ * Walk from directory f to the child named elem and return it with
+ * a new reference, or nil with the error string set.  "." yields f
+ * itself and ".." its parent (the root is its own parent).  A child
+ * already in memory is reused; otherwise it is looked up in f's
+ * meta data, its sources are opened, and it is linked under f.
+ * With partial set the sources are not opened at all.
+ */
+File *
+_fileWalk(File *f, char *elem, int partial)
+{
+	File *kid;
+
+	fileRAccess(f);
+
+	if(elem[0] == 0){
+		vtSetError(EBadPath);
+		return nil;
+	}
+	if(!fileIsDir(f)){
+		vtSetError(ENotDir);
+		return nil;
+	}
+	if(strcmp(elem, ".") == 0)
+		return fileIncRef(f);
+	if(strcmp(elem, "..") == 0){
+		if(fileIsRoot(f))
+			return fileIncRef(f);
+		return fileIncRef(f->up);
+	}
+
+	if(!fileLock(f))
+		return nil;
+
+	/* first try the children already in memory */
+	for(kid = f->down; kid; kid = kid->next){
+		if(strcmp(elem, kid->dir.elem) == 0 && !kid->removed){
+			kid->ref++;
+			goto Exit;
+		}
+	}
+
+	kid = dirLookup(f, elem);
+	if(kid == nil)
+		goto Err;
+
+	/* everything below a snapshot is read only */
+	if(kid->dir.mode & ModeSnapshot)
+		kid->mode = OReadOnly;
+
+	if(partial){
+		/*
+		 * Do nothing.  We're opening this file only so we can clri it.
+		 * Usually the sources can't be opened, hence we won't even bother.
+		 * Be VERY careful with the returned file.  If you hand it to a routine
+		 * expecting kid->source and/or kid->msource to be non-nil, we're
+		 * likely to dereference nil.  FileClri should be the only routine
+		 * setting partial.
+		 */
+		kid->partial = 1;
+	}else if(kid->dir.mode & ModeDir){
+		kid->source = fileOpenSource(f, kid->dir.entry, kid->dir.gen, 1, kid->mode);
+		kid->msource = fileOpenSource(f, kid->dir.mentry, kid->dir.mgen, 0, kid->mode);
+		if(kid->source == nil || kid->msource == nil)
+			goto Err;
+	}else{
+		kid->source = fileOpenSource(f, kid->dir.entry, kid->dir.gen, 0, kid->mode);
+		if(kid->source == nil)
+			goto Err;
+	}
+
+	/* link in and up parent ref count */
+	kid->next = f->down;
+	f->down = kid;
+	kid->up = f;
+	fileIncRef(f);
+Exit:
+	fileUnlock(f);
+	return kid;
+Err:
+	fileUnlock(f);
+	/* kid was never linked in, so dropping its reference frees it */
+	if(kid != nil)
+		fileDecRef(kid);
+	return nil;
+}
+
+/*
+ * Ordinary (non-partial) walk from directory f to its child elem;
+ * see _fileWalk.
+ */
+File *
+fileWalk(File *f, char *elem)
+{
+	File *kid;
+
+	kid = _fileWalk(f, elem, 0);
+	return kid;
+}
+
+/*
+ * Walk path, a slash-separated name, down from the root of fs and
+ * return the File reached with a reference held for the caller.
+ * Empty elements (leading, trailing or doubled slashes) are skipped.
+ * If partial is set, the element that ends the path string is walked
+ * in partial mode (see _fileWalk).  Elements longer than
+ * VtMaxStringSize are rejected with EBadPath.  Returns nil with the
+ * error string set on failure.
+ */
+File *
+_fileOpen(Fs *fs, char *path, int partial)
+{
+	File *f, *ff;
+	char *p, elem[VtMaxStringSize+1];	/* +1: room for the terminating NUL */
+	int n;
+
+	f = fs->file;
+	fileIncRef(f);
+	while(*path != 0){
+		for(p = path; *p && *p != '/'; p++)
+			;
+		n = p - path;
+		if(n > 0){
+			if(n > VtMaxStringSize){
+				vtSetError(EBadPath);
+				goto Err;
+			}
+			memmove(elem, path, n);
+			elem[n] = 0;
+			ff = _fileWalk(f, elem, partial && *p=='\0');
+			if(ff == nil)
+				goto Err;
+			/* trade the reference on the directory for the one on the child */
+			fileDecRef(f);
+			f = ff;
+		}
+		if(*p == '/')
+			p++;
+		path = p;
+	}
+	return f;
+Err:
+	fileDecRef(f);
+	return nil;
+}
+
+/*
+ * Open path, interpreted from the root of fs, with an ordinary
+ * (non-partial) walk; see _fileOpen.
+ */
+File*
+fileOpen(Fs *fs, char *path)
+{
+	File *f;
+
+	f = _fileOpen(fs, path, 0);
+	return f;
+}
+
+/*
+ * Create a file (or, if mode has ModeDir, a directory) named elem
+ * in directory f, owned and last modified by uid, with f's group.
+ * Fails with EExists if the name is already present in memory or on
+ * disk and with EReadOnly if f's source is not open read-write.
+ * Returns the new File, linked under f with one reference for the
+ * caller, or nil with the error string set.
+ */
+File *
+fileCreate(File *f, char *elem, ulong mode, char *uid)
+{
+	File *ff;
+	DirEntry *dir;
+	Source *pr, *r, *mr;
+	int isdir;
+
+	if(!fileLock(f))
+		return nil;
+
+	r = nil;
+	mr = nil;
+	for(ff = f->down; ff; ff=ff->next){
+		if(strcmp(elem, ff->dir.elem) == 0 && !ff->removed){
+			ff = nil;
+			vtSetError(EExists);
+			goto Err1;
+		}
+	}
+
+	ff = dirLookup(f, elem);
+	if(ff != nil){
+		vtSetError(EExists);
+		goto Err1;
+	}
+
+	pr = f->source;
+	if(pr->mode != OReadWrite){
+		vtSetError(EReadOnly);
+		goto Err1;
+	}
+
+	if(!sourceLock2(f->source, f->msource, -1))
+		goto Err1;
+
+	ff = fileAlloc(f->fs);
+	isdir = mode & ModeDir;
+
+	/* one entry for the data and, for a directory, one for the meta data */
+	r = sourceCreate(pr, pr->dsize, isdir, 0);
+	if(r == nil)
+		goto Err;
+	if(isdir){
+		mr = sourceCreate(pr, pr->dsize, 0, r->offset);
+		if(mr == nil)
+			goto Err;
+	}
+
+	dir = &ff->dir;
+	dir->elem = vtStrDup(elem);
+	dir->entry = r->offset;
+	dir->gen = r->gen;
+	if(isdir){
+		dir->mentry = mr->offset;
+		dir->mgen = mr->gen;
+	}
+	dir->size = 0;
+	if(!fsNextQid(f->fs, &dir->qid))
+		goto Err;
+	dir->uid = vtStrDup(uid);
+	dir->gid = vtStrDup(f->dir.gid);
+	dir->mid = vtStrDup(uid);
+	dir->mtime = time(0L);
+	dir->mcount = 0;
+	dir->ctime = dir->mtime;
+	dir->atime = dir->mtime;
+	dir->mode = mode;
+
+	ff->boff = fileMetaAlloc(f, dir, 0);
+	if(ff->boff == NilBlock)
+		goto Err;
+
+	sourceUnlock(f->source);
+	sourceUnlock(f->msource);
+
+	ff->source = r;
+	ff->msource = mr;
+
+	/* link in and up parent ref count */
+	ff->next = f->down;
+	f->down = ff;
+	ff->up = f;
+	fileIncRef(f);
+
+	fileWAccess(f, uid);
+
+	fileUnlock(f);
+	return ff;
+
+Err:
+	sourceUnlock(f->source);
+	sourceUnlock(f->msource);
+Err1:
+	/*
+	 * Undo the entries created above.  fileRemove takes the source
+	 * lock before calling sourceRemove and never unlocks afterwards,
+	 * so do the same here.
+	 * NOTE(review): assumes sourceCreate returns its source unlocked.
+	 */
+	if(r){
+		sourceLock(r, -1);
+		sourceRemove(r);
+	}
+	if(mr){
+		sourceLock(mr, -1);
+		sourceRemove(mr);
+	}
+	/* ff was never linked in, so dropping its reference frees it */
+	if(ff)
+		fileDecRef(ff);
+	fileUnlock(f);
+	return nil;
+}
+
+/*
+ * Read up to cnt bytes of f starting at offset into buf.  The
+ * request is clipped to the current size of the file, so a read at
+ * or beyond the end returns 0.  Returns the number of bytes copied,
+ * or -1 with the error string set (EBadOffset for a negative
+ * offset).
+ */
+int
+fileRead(File *f, void *buf, int cnt, vlong offset)
+{
+	Source *s;
+	uvlong size;
+	u32int bn;
+	int off, dsize, n;
+	Block *b;
+	uchar *dst;
+
+if(0)fprint(2, "fileRead: %s %d, %lld\n", f->dir.elem, cnt, offset);
+
+	if(!fileRLock(f))
+		return -1;
+
+	if(offset < 0){
+		vtSetError(EBadOffset);
+		goto Err1;
+	}
+
+	fileRAccess(f);
+
+	if(!sourceLock(f->source, OReadOnly))
+		goto Err1;
+
+	s = f->source;
+	dsize = s->dsize;
+	size = sourceGetSize(s);
+
+	/* clip the request to what the file holds */
+	if(offset >= size)
+		offset = size;
+	if(cnt > size-offset)
+		cnt = size-offset;
+
+	dst = buf;
+	off = offset%dsize;
+	for(bn = offset/dsize; cnt > 0; bn++){
+		b = sourceBlock(s, bn, OReadOnly);
+		if(b == nil)
+			goto Err;
+		/* take what is left of this block, or less if that ends the request */
+		n = dsize-off;
+		if(n > cnt)
+			n = cnt;
+		memmove(dst, b->data+off, n);
+		blockPut(b);
+		dst += n;
+		cnt -= n;
+		off = 0;
+	}
+	sourceUnlock(s);
+	fileRUnlock(f);
+	return dst-(uchar*)buf;
+
+Err:
+	sourceUnlock(s);
+Err1:
+	fileRUnlock(f);
+	return -1;
+}
+
+/*
+ * Write cnt bytes from buf to f at offset on behalf of uid; for a
+ * file with ModeAppend the data goes at the current end instead.
+ * The file grows if the write ends beyond its current size.
+ * Returns the number of bytes written, or -1 with the error string
+ * set (ENotFile for a directory, EReadOnly, EBadOffset).
+ */
+int
+fileWrite(File *f, void *buf, int cnt, vlong offset, char *uid)
+{
+	Source *s;
+	ulong bn;
+	int off, dsize, n, bmode;
+	Block *b;
+	uchar *src;
+	vlong eof;
+
+if(0)fprint(2, "fileWrite: %s %d, %lld\n", f->dir.elem, cnt, offset);
+
+	if(!fileLock(f))
+		return -1;
+
+	s = nil;	/* stays nil until the source is locked; see Err */
+	if(f->dir.mode & ModeDir){
+		vtSetError(ENotFile);
+		goto Err;
+	}
+	if(f->source->mode != OReadWrite){
+		vtSetError(EReadOnly);
+		goto Err;
+	}
+	if(offset < 0){
+		vtSetError(EBadOffset);
+		goto Err;
+	}
+
+	fileWAccess(f, uid);
+
+	if(!sourceLock(f->source, -1))
+		goto Err;
+	s = f->source;
+	dsize = s->dsize;
+
+	eof = sourceGetSize(s);
+	if(f->dir.mode & ModeAppend)
+		offset = eof;
+
+	src = buf;
+	off = offset%dsize;
+	for(bn = offset/dsize; cnt > 0; bn++){
+		n = dsize-off;
+		if(n > cnt)
+			n = cnt;
+		/* a whole block is simply overwritten; a partial one must be read first */
+		if(n < dsize)
+			bmode = OReadWrite;
+		else
+			bmode = OOverWrite;
+		b = sourceBlock(s, bn, bmode);
+		if(b == nil){
+			/* keep what was written so far inside the file */
+			if(offset > eof)
+				sourceSetSize(s, offset);
+			goto Err;
+		}
+		memmove(b->data+off, src, n);
+		blockDirty(b);
+		blockPut(b);
+		src += n;
+		offset += n;
+		cnt -= n;
+		off = 0;
+	}
+	if(offset > eof && !sourceSetSize(s, offset))
+		goto Err;
+	sourceUnlock(s);
+	fileUnlock(f);
+	return src-(uchar*)buf;
+Err:
+	if(s)
+		sourceUnlock(s);
+	fileUnlock(f);
+	return -1;
+}
+
+/*
+ * Copy f's directory entry into *dir.  For a plain file the size
+ * is taken from the source rather than from the cached entry.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+fileGetDir(File *f, DirEntry *dir)
+{
+	int ok;
+
+	if(!fileRLock(f))
+		return 0;
+
+	fileMetaLock(f);
+	deCopy(dir, &f->dir);
+	fileMetaUnlock(f);
+
+	ok = 1;
+	if(!fileIsDir(f)){
+		if(sourceLock(f->source, OReadOnly)){
+			dir->size = sourceGetSize(f->source);
+			sourceUnlock(f->source);
+		}else
+			ok = 0;
+	}
+	fileRUnlock(f);
+	return ok;
+}
+
+/*
+ * Truncate plain file f on behalf of uid.  Fails with ENotFile for
+ * a directory and EReadOnly if the source is not open read-write.
+ * On success the write-access bookkeeping of f's parent is updated.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+fileTruncate(File *f, char *uid)
+{
+	int ok;
+
+	if(fileIsDir(f)){
+		vtSetError(ENotFile);
+		return 0;
+	}
+	if(!fileLock(f))
+		return 0;
+
+	ok = 0;
+	if(f->source->mode != OReadWrite){
+		vtSetError(EReadOnly);
+		goto Out;
+	}
+	if(!sourceLock(f->source, -1))
+		goto Out;
+	if(sourceTruncate(f->source))
+		ok = 1;
+	sourceUnlock(f->source);
+Out:
+	fileUnlock(f);
+	if(!ok)
+		return 0;
+
+	fileWAccess(f->up, uid);
+	return 1;
+}
+
+/*
+ * Change f's meta data to match *dir on behalf of uid: name, owner,
+ * group, times, mode (the ModeDir and ModeSnapshot bits are kept as
+ * they were) and, for a plain file, the length.  The root cannot be
+ * changed (ERoot); a rename onto an existing name fails with
+ * EExists.  Returns 1 on success, 0 on failure.
+ */
+int
+fileSetDir(File *f, DirEntry *dir, char *uid)
+{
+	File *ff;
+	char *oelem;
+	int renamed;
+	u32int mask;
+	u64int size;
+
+	/* can not set permissions for the root */
+	if(fileIsRoot(f)){
+		vtSetError(ERoot);
+		return 0;
+	}
+
+	if(!fileLock(f))
+		return 0;
+
+	if(f->source->mode != OReadWrite){
+		vtSetError(EReadOnly);
+		fileUnlock(f);
+		return 0;
+	}
+
+	fileMetaLock(f);
+
+	/* check new name does not already exist */
+	renamed = strcmp(f->dir.elem, dir->elem) != 0;
+	if(renamed){
+		for(ff = f->up->down; ff; ff=ff->next){
+			if(strcmp(dir->elem, ff->dir.elem) == 0 && !ff->removed){
+				vtSetError(EExists);
+				goto Err;
+			}
+		}
+
+		ff = dirLookup(f->up, dir->elem);
+		if(ff != nil){
+			fileDecRef(ff);
+			vtSetError(EExists);
+			goto Err;
+		}
+	}
+
+	if(!fileIsDir(f)){
+		if(!sourceLock(f->source, -1))
+			goto Err;
+		size = sourceGetSize(f->source);
+		if(size != dir->size && !sourceSetSize(f->source, dir->size)){
+			sourceUnlock(f->source);
+			goto Err;
+		}
+		sourceUnlock(f->source);
+	}
+
+	/* committed to changing it now */
+	oelem = nil;
+	if(renamed){
+		/* the old name is still needed to find the entry on disk */
+		oelem = f->dir.elem;
+		f->dir.elem = vtStrDup(dir->elem);
+	}
+
+	if(strcmp(f->dir.uid, dir->uid) != 0){
+		vtMemFree(f->dir.uid);
+		f->dir.uid = vtStrDup(dir->uid);
+	}
+
+	if(strcmp(f->dir.gid, dir->gid) != 0){
+		vtMemFree(f->dir.gid);
+		f->dir.gid = vtStrDup(dir->gid);
+	}
+
+	f->dir.mtime = dir->mtime;
+	f->dir.atime = dir->atime;
+
+	/* take every mode bit from dir except ModeDir and ModeSnapshot */
+	mask = ~(ModeDir|ModeSnapshot);
+	f->dir.mode &= ~mask;
+	f->dir.mode |= mask & dir->mode;
+	f->dirty = 1;
+
+	fileMetaFlush2(f, oelem);
+	vtMemFree(oelem);
+
+	fileMetaUnlock(f);
+	fileUnlock(f);
+
+	fileWAccess(f->up, uid);
+
+	return 1;
+Err:
+	fileMetaUnlock(f);
+	fileUnlock(f);
+	return 0;
+}
+
+/*
+ * Record a qid space (offset, max) in f's directory entry and write
+ * the entry out.  The entry is marked dirty first: fileMetaFlush2
+ * returns at once, writing nothing, for a file that is not dirty.
+ * Returns the result of the flush (1 on success, 0 on failure).
+ */
+int
+fileSetQidSpace(File *f, u64int offset, u64int max)
+{
+	int ret;
+
+	if(!fileLock(f))
+		return 0;
+	fileMetaLock(f);
+	f->dir.qidSpace = 1;
+	f->dir.qidOffset = offset;
+	f->dir.qidMax = max;
+	f->dirty = 1;
+	ret = fileMetaFlush2(f, nil);
+	fileMetaUnlock(f);
+	fileUnlock(f);
+	return ret;
+}
+
+
+/*
+ * Return the qid stored in f's directory entry.  No lock is taken:
+ * the field is treated as immutable.
+ */
+uvlong
+fileGetId(File *f)
+{
+	uvlong qid;
+
+	qid = f->dir.qid;
+	return qid;
+}
+
+/*
+ * Return f's modification count, read under the meta data lock.
+ */
+ulong
+fileGetMcount(File *f)
+{
+	ulong n;
+
+	fileMetaLock(f);
+	n = f->dir.mcount;
+	fileMetaUnlock(f);
+
+	return n;
+}
+
+/*
+ * Return the mode bits of f's directory entry, read under the meta
+ * data lock.
+ */
+ulong
+fileGetMode(File *f)
+{
+	ulong bits;
+
+	fileMetaLock(f);
+	bits = f->dir.mode;
+	fileMetaUnlock(f);
+
+	return bits;
+}
+
+/*
+ * Report (1 or 0) whether f is a directory.  No lock is taken: the
+ * ModeDir bit is treated as immutable.
+ */
+int
+fileIsDir(File *f)
+{
+	if(f->dir.mode & ModeDir)
+		return 1;
+	return 0;
+}
+
+/*
+ * Report whether f is the root file of its file system.
+ */
+int
+fileIsRoot(File *f)
+{
+	return f->fs->file == f;
+}
+
+/*
+ * Report whether the file system f belongs to is open read-only.
+ */
+int
+fileIsRoFs(File *f)
+{
+	Fs *fs;
+
+	fs = f->fs;
+	return fs->mode == OReadOnly;
+}
+
+/*
+ * Store the current size of f's source in *size.  Returns 1 on
+ * success; returns 0, leaving *size alone, if a lock cannot be
+ * taken.
+ */
+int
+fileGetSize(File *f, uvlong *size)
+{
+	int ok;
+
+	if(!fileRLock(f))
+		return 0;
+
+	ok = 0;
+	if(sourceLock(f->source, OReadOnly)){
+		*size = sourceGetSize(f->source);
+		sourceUnlock(f->source);
+		ok = 1;
+	}
+	fileRUnlock(f);
+	return ok;
+}
+
+/*
+ * Write f's directory entry back to its meta block if it is dirty.
+ * If rec is set and f is a directory, do the same, recursively, for
+ * every child currently in memory.  The children are collected, and
+ * given an extra reference, while f is locked and are flushed after
+ * f has been unlocked.
+ */
+void
+fileMetaFlush(File *f, int rec)
+{
+	File **kids, *kid;
+	int i, nkids;
+
+	fileMetaLock(f);
+	fileMetaFlush2(f, nil);
+	fileMetaUnlock(f);
+
+	if(!rec || !fileIsDir(f))
+		return;
+	if(!fileLock(f))
+		return;
+
+	nkids = 0;
+	for(kid=f->down; kid; kid=kid->next)
+		nkids++;
+	kids = vtMemAlloc(nkids*sizeof(File*));
+	for(i=0, kid=f->down; kid; kid=kid->next, i++){
+		kid->ref++;
+		kids[i] = kid;
+	}
+	fileUnlock(f);
+
+	for(i=0; i<nkids; i++){
+		fileMetaFlush(kids[i], 1);
+		fileDecRef(kids[i]);
+	}
+	vtMemFree(kids);
+}
+
+/*
+ * Write f's directory entry into its parent's meta data if f is
+ * dirty.  oelem is the name under which the entry is currently
+ * stored (nil means f->dir.elem, i.e. no rename is in progress).
+ * If the packed entry no longer fits in its block it is moved to
+ * another block found by fileMetaAlloc.  Returns 1 if there was
+ * nothing to do or the entry was written, 0 on error.
+ *
+ * assumes metaLock is held
+ */
+static int
+fileMetaFlush2(File *f, char *oelem)
+{
+	File *parent;
+	Block *b, *nb;
+	MetaBlock mb;
+	MetaEntry me, me2;
+	int i, n;
+	u32int boff;
+
+	if(!f->dirty)
+		return 1;
+
+	if(oelem == nil)
+		oelem = f->dir.elem;
+
+	parent = f->up;
+
+	if(!sourceLock(parent->msource, -1))
+		return 0;
+	b = sourceBlock(parent->msource, f->boff, OReadWrite);
+	if(b == nil)
+		goto Err1;
+
+	if(!mbUnpack(&mb, b->data, parent->msource->dsize))
+		goto Err;
+	if(!mbSearch(&mb, oelem, &i, &me))
+		goto Err;
+
+	n = deSize(&f->dir);
+if(0)fprint(2, "old size %d new size %d\n", me.size, n);
+
+	if(mbResize(&mb, &me, n)){
+		/* fits in the block */
+		mbDelete(&mb, i);
+		/* after a rename the entry belongs at the index of the new name */
+		if(strcmp(f->dir.elem, oelem) != 0)
+			mbSearch(&mb, f->dir.elem, &i, &me2);
+		dePack(&f->dir, &me);
+		mbInsert(&mb, i, &me);
+		mbPack(&mb);
+		blockDirty(b);
+		blockPut(b);
+		sourceUnlock(parent->msource);
+		f->dirty = 0;
+
+		return 1;
+	}
+
+	/*
+	 * moving entry to another block
+	 * it is feasible for the fs to crash leaving two copies
+	 * of the directory entry.  This is just too much work to
+	 * fix.  Given that entries are only allocated in a block that
+	 * is less than PercentageFull, most modifications of meta data
+	 * will fit within the block.  i.e. this code should almost
+	 * never be executed.
+	 */
+	boff = fileMetaAlloc(parent, &f->dir, f->boff+1);
+	if(boff == NilBlock){
+		/* mbResize might have modified block */
+		mbPack(&mb);
+		blockDirty(b);
+		goto Err;
+	}
+fprint(2, "fileMetaFlush moving entry from %ud -> %ud\n", f->boff, boff);
+	f->boff = boff;
+
+	/* make sure deletion goes to disk after new entry */
+	nb = sourceBlock(parent->msource, f->boff, OReadWrite);
+	mbDelete(&mb, i);
+	mbPack(&mb);
+	blockDependency(b, nb, -1, nil);
+	blockPut(nb);
+	blockDirty(b);
+	blockPut(b);
+	sourceUnlock(parent->msource);
+
+	f->dirty = 0;
+
+	return 1;
+
+Err:
+	blockPut(b);
+Err1:
+	sourceUnlock(parent->msource);
+	return 0;
+}
+
+/*
+ * Delete f's directory entry from its parent's meta data on behalf
+ * of uid and mark f as removed.  The parent's write-access
+ * bookkeeping is updated first.  Returns 1 on success, 0 if the
+ * parent's meta source cannot be locked or the entry cannot be
+ * found in its block.
+ */
+static int
+fileMetaRemove(File *f, char *uid)
+{
+	Block *b;
+	MetaBlock mb;
+	MetaEntry me;
+	int i;
+	File *up;
+
+	up = f->up;
+
+	fileWAccess(up, uid);
+
+	fileMetaLock(f);
+
+	/* do not touch, or unlock, a source whose lock was never obtained */
+	if(!sourceLock(up->msource, OReadWrite)){
+		fileMetaUnlock(f);
+		return 0;
+	}
+	b = sourceBlock(up->msource, f->boff, OReadWrite);
+	if(b == nil)
+		goto Err;
+
+	if(!mbUnpack(&mb, b->data, up->msource->dsize))
+{
+fprint(2, "U\n");
+		goto Err;
+}
+	if(!mbSearch(&mb, f->dir.elem, &i, &me))
+{
+fprint(2, "S\n");
+		goto Err;
+}
+	mbDelete(&mb, i);
+	mbPack(&mb);
+	sourceUnlock(up->msource);
+
+	blockDirty(b);
+	blockPut(b);
+
+	f->removed = 1;
+	f->boff = NilBlock;
+	f->dirty = 0;
+
+	fileMetaUnlock(f);
+	return 1;
+
+Err:
+	sourceUnlock(up->msource);
+	blockPut(b);
+	fileMetaUnlock(f);
+	return 0;
+}
+
+/*
+ * Check that directory f has no entries: every block of its meta
+ * source must unpack to a meta block with an empty index.  Returns
+ * 1 if so, otherwise 0 with the error string set (ENotEmpty when an
+ * entry is found).
+ *
+ * assume file is locked, assume f->msource is locked
+ */
+static int
+fileCheckEmpty(File *f)
+{
+	Block *b;
+	MetaBlock mb;
+	Source *meta;
+	u32int bo, nblock;
+
+	meta = f->msource;
+	nblock = (sourceGetSize(meta)+meta->dsize-1)/meta->dsize;
+	for(bo=0; bo<nblock; bo++){
+		b = sourceBlock(meta, bo, OReadOnly);
+		if(b == nil || !mbUnpack(&mb, b->data, meta->dsize))
+			goto Err;
+		if(mb.nindex > 0){
+			vtSetError(ENotEmpty);
+			goto Err;
+		}
+		blockPut(b);
+	}
+	return 1;
+Err:
+	blockPut(b);
+	return 0;
+}
+
+/*
+ * Remove f on behalf of uid: remove its source (and meta source, if
+ * it has one), then delete its entry from the parent directory.
+ * The root cannot be removed (ERoot), the source must be open
+ * read-write (EReadOnly), and a directory must be empty.  Returns 1
+ * on success, 0 on failure.
+ */
+int
+fileRemove(File *f, char *uid)
+{
+	File *kid;
+
+	/* can not remove the root */
+	if(fileIsRoot(f)){
+		vtSetError(ERoot);
+		return 0;
+	}
+
+	if(!fileLock(f))
+		return 0;
+
+	if(f->source->mode != OReadWrite){
+		vtSetError(EReadOnly);
+		goto Err1;
+	}
+	if(!sourceLock2(f->source, f->msource, -1))
+		goto Err1;
+	if(fileIsDir(f) && !fileCheckEmpty(f))
+		goto Err;
+
+	/* any child still in memory must itself have been removed already */
+	for(kid=f->down; kid; kid=kid->next)
+		assert(kid->removed);
+
+	/* the sources are not unlocked again once handed to sourceRemove */
+	sourceRemove(f->source);
+	f->source = nil;
+	if(f->msource){
+		sourceRemove(f->msource);
+		f->msource = nil;
+	}
+
+	fileUnlock(f);
+
+	if(fileMetaRemove(f, uid))
+		return 1;
+	return 0;
+
+Err:
+	sourceUnlock(f->source);
+	if(f->msource)
+		sourceUnlock(f->msource);
+Err1:
+	fileUnlock(f);
+	return 0;
+}
+
+/*
+ * Delete the directory entry for path in fs on behalf of uid
+ * without opening the file's sources: the file is opened in partial
+ * mode and only its entry in the parent's meta data is removed.
+ * Fails with EReadOnly if the parent's source is not open
+ * read-write.  Returns 1 on success, 0 on failure.
+ */
+int
+fileClri(Fs *fs, char *path, char *uid)
+{
+	File *f;
+	int ok;
+
+	f = _fileOpen(fs, path, 1);
+	if(f == nil)
+		return 0;
+
+	if(f->up->source->mode == OReadWrite)
+		ok = fileMetaRemove(f, uid);
+	else{
+		vtSetError(EReadOnly);
+		ok = 0;
+	}
+	fileDecRef(f);
+	return ok;
+}
+
+/*
+ * Add a reference to vf, which must already hold at least one, and
+ * return vf.  The count is changed under the meta data lock.
+ */
+File *
+fileIncRef(File *vf)
+{
+	fileMetaLock(vf);
+	assert(vf->ref > 0);
+	++vf->ref;
+	fileMetaUnlock(vf);
+
+	return vf;
+}
+
+/*
+ * Drop a reference to f.  When the last one goes, f's meta data is
+ * flushed, f is unlinked from its parent's list of children and
+ * freed, and the reference f held on its parent is dropped in turn.
+ * A file that was never linked under a parent is freed directly.
+ * Returns 1 if f was freed, 0 if references remain.
+ */
+int
+fileDecRef(File *f)
+{
+	File *parent, *q, **link;
+
+	if(f->up == nil){
+		/* never linked in */
+		assert(f->ref == 1);
+		fileFree(f);
+		return 1;
+	}
+
+	fileMetaLock(f);
+	f->ref--;
+	if(f->ref > 0){
+		fileMetaUnlock(f);
+		return 0;
+	}
+	assert(f->ref == 0);
+	assert(f->down == nil);
+
+	fileMetaFlush2(f, nil);
+
+	/* find the link that points at f and make it skip f */
+	parent = f->up;
+	for(link = &parent->down; (q = *link) != nil; link = &q->next)
+		if(q == f)
+			break;
+	assert(q != nil);
+	*link = f->next;
+
+	fileMetaUnlock(f);
+	fileFree(f);
+
+	fileDecRef(parent);
+	return 1;
+}
+
+/*
+ * Return f's parent with a new reference; the root is its own
+ * parent.
+ */
+File *
+fileGetParent(File *f)
+{
+	File *parent;
+
+	parent = f;
+	if(!fileIsRoot(f))
+		parent = f->up;
+	return fileIncRef(parent);
+}
+
+/*
+ * Start an enumeration of directory f.  The dirty meta data of the
+ * children in memory is flushed first, so that the blocks read
+ * later are current.  The enumeration holds its own reference to f.
+ * Returns nil with the error string set on failure.
+ * NOTE(review): the ENotDir path drops a reference on f while the
+ * lock-failure path does not -- confirm against the callers.
+ */
+DirEntryEnum *
+deeOpen(File *f)
+{
+	DirEntryEnum *dee;
+	File *kid;
+
+	if(!fileIsDir(f)){
+		vtSetError(ENotDir);
+		fileDecRef(f);
+		return nil;
+	}
+
+	/* flush out meta data */
+	if(!fileLock(f))
+		return nil;
+	kid = f->down;
+	while(kid != nil){
+		fileMetaFlush2(kid, nil);
+		kid = kid->next;
+	}
+	fileUnlock(f);
+
+	dee = vtMemAllocZ(sizeof *dee);
+	dee->file = fileIncRef(f);
+
+	return dee;
+}
+
+/*
+ * Look up entry number elem in directory source s and store its
+ * size in *size.  An entry that is not active, or whose generation
+ * is not gen, is reported as size 0.  Returns 1 on success, 0 if
+ * the block cannot be read or the entry cannot be unpacked.
+ */
+static int
+dirEntrySize(Source *s, ulong elem, ulong gen, uvlong *size)
+{
+	Block *b;
+	Entry e;
+	int epb;
+	ulong bn;
+
+	/* split elem into a block number and an index within the block */
+	epb = s->dsize/VtEntrySize;
+	bn = elem/epb;
+	elem -= bn*epb;
+
+	b = sourceBlock(s, bn, OReadOnly);
+	if(b == nil || !entryUnpack(&e, b->data, elem)){
+		blockPut(b);
+		return 0;
+	}
+
+	/* hanging entries are returned as zero size */
+	if((e.flags & VtEntryActive) && e.gen == gen)
+		*size = e.size;
+	else
+		*size = 0;
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * Refill the enumeration's buffer from meta block dee->boff of the
+ * directory: discard what is left of the old buffer, unpack every
+ * entry of the block, and look up the size of each plain file in
+ * the directory's data source.  On success dee->boff moves on to
+ * the next block and 1 is returned; on failure 0 is returned and
+ * dee->n counts the entries that were unpacked.
+ */
+static int
+deeFill(DirEntryEnum *dee)
+{
+	int i, nent;
+	Source *meta, *source;
+	MetaBlock mb;
+	MetaEntry me;
+	Block *b;
+	DirEntry *de;
+
+	/* clean up first */
+	for(i=dee->i; i<dee->n; i++)
+		deCleanup(dee->buf+i);
+	vtMemFree(dee->buf);
+	dee->buf = nil;
+	dee->i = 0;
+	dee->n = 0;
+
+	source = dee->file->source;
+	meta = dee->file->msource;
+
+	b = sourceBlock(meta, dee->boff, OReadOnly);
+	if(b == nil || !mbUnpack(&mb, b->data, meta->dsize))
+		goto Err;
+
+	nent = mb.nindex;
+	dee->buf = vtMemAlloc(nent * sizeof(DirEntry));
+
+	for(i=0; i<nent; i++){
+		de = &dee->buf[i];
+		meUnpack(&me, &mb, i);
+		if(!deUnpack(de, &me))
+			goto Err;
+		dee->n++;
+		if(de->mode & ModeDir)
+			continue;
+		if(!dirEntrySize(source, de->entry, de->gen, &de->size))
+			goto Err;
+	}
+	dee->boff++;
+	blockPut(b);
+	return 1;
+Err:
+	blockPut(b);
+	return 0;
+}
+
+/*
+ * Fetch the next directory entry of the enumeration into *de; the
+ * entry is copied bytewise out of the buffer, which then no longer
+ * cleans it up.  Returns 1 if an entry was stored, 0 at the end of
+ * the directory, -1 on error.  The directory's access time is
+ * updated whenever a meta block had to be read.
+ */
+int
+deeRead(DirEntryEnum *dee, DirEntry *de)
+{
+	File *f;
+	int ret, didread;
+	u32int nblock;
+
+	f = dee->file;
+	if(!fileRLock(f))
+		return -1;
+
+	if(!sourceLock2(f->source, f->msource, OReadOnly)){
+		fileRUnlock(f);
+		return -1;
+	}
+
+	nblock = (sourceGetSize(f->msource)+f->msource->dsize-1)/f->msource->dsize;
+
+	/* refill until there is an entry to hand out; blocks may be empty */
+	didread = 0;
+	ret = 1;
+	while(dee->i >= dee->n){
+		if(dee->boff >= nblock){
+			ret = 0;
+			break;
+		}
+		didread = 1;
+		if(!deeFill(dee)){
+			ret = -1;
+			break;
+		}
+	}
+	if(ret == 1){
+		memmove(de, dee->buf + dee->i, sizeof(DirEntry));
+		dee->i++;
+	}
+
+	sourceUnlock(f->source);
+	sourceUnlock(f->msource);
+	fileRUnlock(f);
+
+	if(didread)
+		fileRAccess(f);
+	return ret;
+}
+
+/*
+ * Finish an enumeration: clean up the entries not yet handed out,
+ * free the buffer, drop the reference on the directory and free the
+ * enumeration itself.  A nil dee is ignored.
+ */
+void
+deeClose(DirEntryEnum *dee)
+{
+	int i;
+
+	if(dee == nil)
+		return;
+
+	i = dee->i;
+	while(i < dee->n){
+		deCleanup(&dee->buf[i]);
+		i++;
+	}
+	vtMemFree(dee->buf);
+	fileDecRef(dee->file);
+	vtMemFree(dee);
+}
+
+/*
+ * Store the packed form of dir in a meta block of directory f and
+ * return the index of that block, or NilBlock on failure.  Blocks
+ * from start on are tried first; one is chosen if the entry fits
+ * while keeping the block below FullPercentage and an index slot is
+ * free.  If none qualifies a new block is added at the end of the
+ * meta source.  The chosen block is made to depend on the super
+ * block and on the entry block(s) that dir refers to.
+ *
+ * caller must lock f->source and f->msource
+ * caller must NOT lock the source and msource
+ * referenced by dir.
+ */
+static u32int
+fileMetaAlloc(File *f, DirEntry *dir, u32int start)
+{
+	u32int nblock, bo;
+	Block *b, *dep;
+	MetaBlock mb;
+	int room;
+	uchar *p;
+	int i, n, epb;
+	MetaEntry me;
+	Source *s, *ms;
+
+	s = f->source;
+	ms = f->msource;
+
+	n = deSize(dir);
+	nblock = (sourceGetSize(ms)+ms->dsize-1)/ms->dsize;
+	b = nil;
+	if(start > nblock)
+		start = nblock;
+	for(bo=start; bo<nblock; bo++){
+		b = sourceBlock(ms, bo, OReadWrite);
+		if(b == nil)
+			goto Err;
+		if(!mbUnpack(&mb, b->data, ms->dsize))
+			goto Err;
+		room = (mb.maxsize*FullPercentage/100) - mb.size + mb.free;
+		if(n <= room && mb.nindex < mb.maxindex)
+			break;
+		blockPut(b);
+		b = nil;
+	}
+
+	/* add block to meta file */
+	if(b == nil){
+		/* here bo == nblock: the block just past the current end */
+		b = sourceBlock(ms, bo, OReadWrite);
+		if(b == nil)
+			goto Err;
+		sourceSetSize(ms, (nblock+1)*ms->dsize);
+		mbInit(&mb, b->data, ms->dsize, ms->dsize/BytesPerEntry);
+	}
+
+	p = mbAlloc(&mb, n);
+	if(p == nil){
+		/* mbAlloc might have changed block */
+		mbPack(&mb);
+		blockDirty(b);
+		vtSetError(EBadMeta);
+		goto Err;
+	}
+
+	/* the search must miss; it yields the index at which to insert */
+	mbSearch(&mb, dir->elem, &i, &me);
+	assert(me.p == nil);
+	me.p = p;
+	me.size = n;
+	dePack(dir, &me);
+	mbInsert(&mb, i, &me);
+	mbPack(&mb);
+
+	/* meta block depends on super block for qid ... */
+	dep = cacheLocal(b->c, PartSuper, 0, OReadOnly);
+	blockDependency(b, dep, -1, nil);
+	blockPut(dep);
+
+	/* ... and one or two dir entries */
+	epb = s->dsize/VtEntrySize;
+	dep = sourceBlock(s, dir->entry/epb, OReadOnly);
+	blockDependency(b, dep, -1, nil);
+	blockPut(dep);
+	if(dir->mode & ModeDir){
+		dep = sourceBlock(s, dir->mentry/epb, OReadOnly);
+		blockDependency(b, dep, -1, nil);
+		blockPut(dep);
+	}
+
+	blockDirty(b);
+	blockPut(b);
+	return bo;
+Err:
+	blockPut(b);
+	return NilBlock;
+}
+
+/*
+ * Check that f still has the sources it needs: a data source and,
+ * for a directory, a meta source.  A partial file always passes.
+ * Returns 1 if usable; otherwise sets ERemoved and returns 0.
+ */
+static int
+chkSource(File *f)
+{
+	if(f->partial)
+		return 1;
+	if(f->source != nil && (!(f->dir.mode & ModeDir) || f->msource != nil))
+		return 1;
+
+	vtSetError(ERemoved);
+	return 0;
+}
+
+/*
+ * Take f's lock for reading.  Asserts that the file system's elk
+ * lock cannot be acquired at this point.  Returns 1 with the lock
+ * held, or 0 with the lock released if chkSource rejects f.
+ */
+static int
+fileRLock(File *f)
+{
+	assert(!vtCanLock(f->fs->elk));
+	vtRLock(f->lk);
+	if(chkSource(f))
+		return 1;
+	fileRUnlock(f);
+	return 0;
+}
+
+/*
+ * Release the read lock taken by fileRLock.
+ */
+static void
+fileRUnlock(File *f)
+{
+	VtLock *lk;
+
+	lk = f->lk;
+	vtRUnlock(lk);
+}
+
+/*
+ * Take f's lock exclusively.  Asserts that the file system's elk
+ * lock cannot be acquired at this point.  Returns 1 with the lock
+ * held, or 0 with the lock released if chkSource rejects f.
+ */
+static int
+fileLock(File *f)
+{
+	assert(!vtCanLock(f->fs->elk));
+	vtLock(f->lk);
+	if(chkSource(f))
+		return 1;
+	fileUnlock(f);
+	return 0;
+}
+
+/*
+ * Release the exclusive lock taken by fileLock.
+ */
+static void
+fileUnlock(File *f)
+{
+	VtLock *lk;
+
+	lk = f->lk;
+	vtUnlock(lk);
+}
+
+/*
+ * Lock f's meta data, which is protected by the lock of f's parent.
+ * f must have a parent; if it does not, its name is printed before
+ * the assertion fails.
+ *
+ * f->source and f->msource must NOT be locked.
+ * fileMetaFlush locks the fileMeta and then the source (in fileMetaFlush2).
+ * We have to respect that ordering.
+ */
+static void
+fileMetaLock(File *f)
+{
+	File *parent;
+
+	parent = f->up;
+if(parent == nil)
+fprint(2, "f->elem = %s\n", f->dir.elem);
+	assert(parent != nil);
+	assert(!vtCanLock(f->fs->elk));
+	vtLock(parent->lk);
+}
+
+/*
+ * Release the lock taken by fileMetaLock, i.e. the lock of f's
+ * parent.
+ */
+static void
+fileMetaUnlock(File *f)
+{
+	File *parent;
+
+	parent = f->up;
+	vtUnlock(parent->lk);
+}
+
+/*
+ * Note a read access: set f's access time to now and mark its meta
+ * data dirty.  Does nothing for a file whose mode is OReadOnly.
+ *
+ * f->source and f->msource must NOT be locked.
+ * see fileMetaLock.
+ */
+static void
+fileRAccess(File* f)
+{
+	if(f->mode != OReadOnly){
+		fileMetaLock(f);
+		f->dir.atime = time(0L);
+		f->dirty = 1;
+		fileMetaUnlock(f);
+	}
+}
+
+/*
+ * Note a modification by mid: set f's access and modification
+ * times to now, record mid as the last modifier, bump the
+ * modification count and mark the meta data dirty.  Does nothing
+ * for a file whose mode is OReadOnly.
+ *
+ * f->source and f->msource must NOT be locked.
+ * see fileMetaLock.
+ */
+static void
+fileWAccess(File* f, char *mid)
+{
+	ulong now;
+
+	if(f->mode == OReadOnly)
+		return;
+
+	fileMetaLock(f);
+	now = time(0L);
+	f->dir.mtime = now;
+	f->dir.atime = now;
+	if(strcmp(f->dir.mid, mid) != 0){
+		/* a different user than last time: replace the stored name */
+		vtMemFree(f->dir.mid);
+		f->dir.mid = vtStrDup(mid);
+	}
+	f->dir.mcount++;
+	f->dirty = 1;
+	fileMetaUnlock(f);
+}
+
+/*
+ * Set BsCopied in the label of block b, unless b's score does not
+ * map to a local block or the bit is already set.  The label block
+ * is written out before returning; a failed write is retried every
+ * ten seconds, and the write is repeated for as long as the label
+ * block turns out dirty again afterwards.
+ */
+static void
+markCopied(Block *b)
+{
+	Block *lb;
+	Label l;
+
+	if(globalToLocal(b->score) == NilBlock)
+		return;
+	if(b->l.state & BsCopied)
+		return;
+
+	/*
+	 * We need to record that there are now pointers in
+	 * b that are not unique to b.  We do this by marking
+	 * b as copied.  Since we don't return the label block,
+	 * the caller can't get the dependencies right.  So we have
+	 * to flush the block ourselves.  This is a rare occurrence.
+	 */
+	l = b->l;
+	l.state |= BsCopied;
+	lb = _blockSetLabel(b, &l);
+	do{
+		while(!blockWrite(lb)){
+			fprint(2, "getEntry: could not write label block\n");
+			sleep(10*1000);
+		}
+		/* wait for the write to finish one way or the other */
+		while(lb->iostate != BioClean && lb->iostate != BioDirty){
+			assert(lb->iostate == BioWriting);
+			vtSleep(lb->ioready);
+		}
+	}while(lb->iostate == BioDirty);
+	blockPut(lb);
+}
+
+/*
+ * Fetch into *e the directory entry that describes source r, read
+ * from the block holding it.  A nil r yields an all-zero entry.
+ * If mark is set the block is also marked as copied (see
+ * markCopied).  Returns 1 on success, 0 if the block cannot be read
+ * or the entry cannot be unpacked.
+ */
+static int
+getEntry(Source *r, Entry *e, int mark)
+{
+	Block *b;
+
+	if(r == nil){
+		/* clear the caller's Entry, not the local pointer variable */
+		memset(e, 0, sizeof *e);
+		return 1;
+	}
+
+	b = cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, OReadOnly);
+	if(b == nil)
+		return 0;
+	if(!entryUnpack(e, b->data, r->offset % r->epb)){
+		blockPut(b);
+		return 0;
+	}
+
+	if(mark)
+		markCopied(b);
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * Overwrite the directory entry that describes source r with *e,
+ * except that the generation already stored there is kept (and
+ * copied back into e->gen).  The block is marked as copied and
+ * dirty.  Returns 1 on success, 0 if the block cannot be read or
+ * the old entry cannot be unpacked.
+ */
+static int
+setEntry(Source *r, Entry *e)
+{
+	Block *b;
+	Entry old;
+	int slot;
+
+	slot = r->offset % r->epb;
+	b = cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, OReadWrite);
+	if(b == nil)
+		return 0;
+	if(0) fprint(2, "setEntry: b %#ux %d score=%V\n", b->addr, r->offset % r->epb, e->score);
+	if(!entryUnpack(&old, b->data, slot)){
+		blockPut(b);
+		return 0;
+	}
+	e->gen = old.gen;
+	entryPack(e, b->data, slot);
+
+	/* BUG b should depend on the entry pointer */
+
+	markCopied(b);
+	blockDirty(b);
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * Make dst a snapshot of src: copy the entries of src's source and
+ * meta source, stamped with epoch and the archive flag, over the
+ * entries of dst's source and meta source.  The blocks holding
+ * src's entries are marked as copied.  Returns 1 on success, 0 on
+ * failure.
+ *
+ * assumes hold elk
+ */
+int
+fileSnapshot(File *dst, File *src, u32int epoch, int doarchive)
+{
+	Entry e, ee;
+
+	/* add link to snapshot */
+	if(!getEntry(src->source, &e, 1))
+		return 0;
+	if(!getEntry(src->msource, &ee, 1))
+		return 0;
+
+	e.snap = epoch;
+	ee.snap = epoch;
+	e.archive = doarchive;
+	ee.archive = doarchive;
+
+	if(!setEntry(dst->source, &e))
+		return 0;
+	if(!setEntry(dst->msource, &ee))
+		return 0;
+	return 1;
+}
+
+/*
+ * Fetch the entries describing f's source and meta source into *e
+ * and *ee; mark is passed on to getEntry.  Returns 1 on success, 0
+ * as soon as either fetch fails.
+ */
+int
+fileGetSources(File *f, Entry *e, Entry *ee, int mark)
+{
+	if(!getEntry(f->source, e, mark))
+		return 0;
+	if(!getEntry(f->msource, ee, mark))
+		return 0;
+	return 1;
+}
+
+/*
+ * Lock f's source and meta source read-write and release them
+ * again straight away; nothing is done for a file whose mode is
+ * OReadOnly.  Returns 1 on success, 0 if the locks cannot be taken.
+ * NOTE(review): presumably done for a side effect of taking the
+ * locks -- confirm against sourceLock2.
+ */
+int
+fileWalkSources(File *f)
+{
+	if(f->mode != OReadOnly){
+		if(!sourceLock2(f->source, f->msource, OReadWrite))
+			return 0;
+		sourceUnlock(f->source);
+		sourceUnlock(f->msource);
+	}
+	return 1;
+}

+ 657 - 0
sys/src/cmd/fossil/flchk.c

@@ -0,0 +1,657 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct MetaChunk MetaChunk;
+
+struct MetaChunk {
+	ushort offset;	/* big-endian 16-bit value from bytes 0-1 of the index slot */
+	ushort size;	/* big-endian 16-bit value from bytes 2-3 of the index slot */
+	ushort index;	/* position of the slot in the index before sorting */
+};
+
+static void usage(void);
+static void setBit(uchar *bmap, ulong addr);
+static int getBit(uchar *bmap, ulong addr);
+static int readLabel(Label *l, u32int addr);
+static void error(char*, ...);
+static void warn(char *fmt, ...);
+static void chkEpoch(u32int epoch);
+static void chkFree(void);
+static int readLabel(Label *l, u32int addr);
+static void chkDir(char *name, Source *source, Source *meta);
+
+#pragma	varargck	argpos	error	1
+#pragma	varargck	argpos	warn	1
+
+uchar *amap;		/* bitmap: has been visited at all */
+uchar *emap;		/* bitmap: see condition (ii) below */
+uchar *vmap;		/* bitmap: has been visited in this epoch */
+uchar *xmap;		/* bitmap: see condition (iii) below */
+
+Fs *fs;
+Cache *cache;
+int nblocks;
+int bsize;
+int badactive;
+int fast;		/* don't check that all the venti blocks are there */
+u32int hint;	/* a guess at where chkEpoch might look to find the next root */
+
+/*
+ * flchk: open the fossil file named by the single argument
+ * read-only, run chkEpoch for every epoch from fs->ehi down to
+ * fs->elo, report unreachable blocks with chkFree, then check the
+ * directory tree from the root with chkDir.
+ *
+ * Options: -c cachesize, -f (set fast), -h venti host.
+ */
+void
+main(int argc, char *argv[])
+{
+	int csize = 1000;
+	VtSession *z;
+	char *host = nil;
+	u32int e;
+	Source *r, *mr;
+	Block *b;
+	Super super;
+
+	ARGBEGIN{
+	default:
+		usage();
+	case 'c':
+		/* EARGF reports usage instead of handing nil to atoi */
+		csize = atoi(EARGF(usage()));
+		break;
+	case 'f':
+		fast = 1;
+		break;
+	case 'h':
+		host = EARGF(usage());
+		break;
+	}ARGEND;
+
+	if(argc != 1)
+		usage();
+
+	vtAttach();
+
+	fmtinstall('L', labelFmt);
+	fmtinstall('V', scoreFmt);
+	fmtinstall('R', vtErrFmt);
+
+	/*
+	 * Connect to Venti.
+	 */
+	z = vtDial(host, 0);
+	if(z == nil)
+		vtFatal("could not connect to server: %s", vtGetError());
+	if(!vtConnect(z, 0))
+		vtFatal("vtConnect: %s", vtGetError());
+
+	/*
+	 * Initialize file system.
+	 */
+	fs = fsOpen(argv[0], z, csize, OReadOnly);
+	if(fs == nil)
+		vtFatal("could not open file system: %R");
+	cache = fs->cache;
+	nblocks = cacheLocalSize(cache, PartData);
+	bsize = fs->blockSize;
+
+	b = superGet(cache, &super);
+	if(b == nil)
+		vtFatal("could not load super block: %R");
+	blockPut(b);
+
+	/* start the search for the newest root at the active block */
+	hint = super.active;
+
+	/*
+	 * Run checks.
+	 * amap and vmap accumulate across epochs;
+	 * emap and xmap are cleared before each epoch.
+	 */
+	amap = vtMemAllocZ(nblocks/8 + 1);
+	emap = vtMemAllocZ(nblocks/8 + 1);
+	vmap = vtMemAllocZ(nblocks/8 + 1);
+	xmap = vtMemAllocZ(nblocks/8 + 1);
+	for(e=fs->ehi; e >= fs->elo; e--){
+		memset(emap, 0, nblocks/8+1);
+		memset(xmap, 0, nblocks/8+1);
+		chkEpoch(e);
+	}
+	chkFree();
+	vtMemFree(amap);
+	vtMemFree(emap);
+	vtMemFree(vmap);
+	vtMemFree(xmap);
+
+	sourceLock(fs->source, OReadOnly);
+	r = sourceOpen(fs->source, 0, OReadOnly);
+	mr = sourceOpen(fs->source, 1, OReadOnly);
+	sourceUnlock(fs->source);
+	chkDir("", r, mr);
+
+	sourceClose(r);
+	sourceClose(mr);
+
+	fsClose(fs);
+
+	exits(0);
+}
+
+/*
+ * Print the command synopsis on standard error and exit with
+ * status "usage".  Lists every option main accepts, including -f.
+ */
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-f] [-c cachesize] [-h host] file\n", argv0);
+	exits("usage");
+}
+
+/*
+ * When b points at bb, need to check:
+ *
+ * (i) b.e in [bb.e, bb.eClose)
+ * (ii) if b.e==bb.e,  then no other b' in e points at bb.
+ * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
+ * (iv) if b is active then no other active b' points at bb.
+ * (v) if b is a past life of b' then only one of b and b' is active (too hard to check)
+ *
+ * Does not walk onto Venti.
+ *
+ * walk loads the block named by score (bb), applies checks (i)-(iv)
+ * against the parent b, and, the first time a local block is seen,
+ * recurses into its children: every score of a pointer block, every
+ * active non-snapshot entry of a BtDir block.  BtData blocks have no
+ * children.
+ *
+ * Returns 1 if bb passed (or score is not a local address), 0 if bb
+ * could not be loaded or one of the checks failed.  When a child
+ * fails, a "# clrp" or "# clre" line naming the parent slot is printed.
+ * With fast set, non-local scores are accepted without being loaded.
+ *
+ * Bitmaps used: emap for (ii), xmap for (iii), amap for (iv),
+ * vmap to avoid descending into a block more than once.
+ */
+
+static int
+walk(Block *b, uchar score[VtScoreSize], int type, u32int tag, u32int epoch)
+{
+	Block *bb;
+	u32int addr;
+	int i, ret;
+	u32int ep;
+	Entry e;
+
+	if(fast && globalToLocal(score) == NilBlock)
+		return 1;
+
+	bb = cacheGlobal(cache, score, type, tag, OReadOnly);
+	if(bb == nil){
+		error("could not load block %V type %d tag %ux: %R", score, type, tag);
+		return 0;
+	}
+
+	ret = 0;
+	addr = globalToLocal(score);
+	if(addr == NilBlock){
+		ret = 1;
+		goto Exit;
+	}
+
+	/* (i) */
+	if(b->l.epoch < bb->l.epoch || bb->l.epochClose <= b->l.epoch){
+		error("walk: block %#ux [%ud, %ud) points at %#ux [%ud, %ud)\n",
+			b->addr, b->l.epoch, b->l.epochClose,
+			bb->addr, bb->l.epoch, bb->l.epochClose);
+		goto Exit;
+	}
+
+	/* (ii) */
+	if(b->l.epoch == epoch && bb->l.epoch == epoch){
+		if(getBit(emap, addr)){
+			error("walk: epoch join detected: addr %#ux %L\n", bb->addr, &bb->l);
+			goto Exit;
+		}
+		setBit(emap, addr);
+	}
+
+	/* (iii) */
+	if(!(b->l.state&BsCopied) && b->l.epoch == bb->l.epoch){
+		if(getBit(xmap, addr)){
+			error("walk: copy join detected; addr %#ux %L\n", bb->addr, &bb->l);
+			goto Exit;
+		}
+		setBit(xmap, addr);
+	}
+
+	/* (iv) */
+	if(epoch == fs->ehi){
+		/* since epoch==fs->ehi is first, amap is same as ``have seen active'' */
+		if(getBit(amap, addr)){
+			error("walk: active join detected: addr %#ux %L\n", bb->addr, &bb->l);
+			goto Exit;
+		}
+	}
+
+	if(getBit(vmap, addr)){
+		ret = 1;
+		goto Exit;
+	}
+
+	setBit(vmap, addr);
+	setBit(amap, addr);
+
+	b = nil;		/* make sure no more refs to parent */
+	USED(b);
+
+	switch(type){
+	default:
+		/* pointer block */
+		for(i=0; i<bsize/VtScoreSize; i++)
+			if(!walk(bb, bb->data + i*VtScoreSize, type-1, tag, epoch))
+				print("# clrp %#ux %d\n", bb->addr, i);
+		break;
+	case BtData:
+		break;
+	case BtDir:
+		for(i=0; i<bsize/VtEntrySize; i++){
+			if(!entryUnpack(&e, bb->data, i)){
+				error("walk: could not unpack entry: %ux[%d]: %R", addr, i);
+				print("# clre %#ux %d\n", bb->addr, i);
+				continue;
+			}
+			if(!(e.flags & VtEntryActive))
+				continue;
+//fprint(2, "%x[%d] tag=%x snap=%d score=%V\n", addr, i, e.tag, e.snap, e.score);
+			ep = epoch;
+			if(e.snap != 0){
+				if(e.snap >= epoch){
+					error("bad snap in entry: %ux[%d] snap = %ud: epoch = %ud",
+						addr, i, e.snap, epoch);
+					print("# clre %#ux %d\n", bb->addr, i);
+					continue;
+				}
+				continue;
+			}
+			if(e.flags & VtEntryLocal){
+				if(e.tag < UserTag)
+				if(e.tag != RootTag || tag != RootTag || i != 1){
+					error("bad tag in entry: %ux[%d] tag = %ux", addr, i, e.tag);
+					print("# clre %#ux %d\n", bb->addr, i);
+					continue;
+				}
+			}else{
+				if(e.tag != 0){
+					error("bad tag in entry: %ux[%d] tag = %ux", addr, i, e.tag);
+					print("# clre %#ux %d\n", bb->addr, i);
+					continue;
+				}
+			}
+			if(!walk(bb, e.score, entryType(&e), e.tag, ep))
+				print("# clre %#ux %d\n", bb->addr, i);
+		}
+		break;
+	}
+
+	ret = 1;
+
+Exit:
+	blockPut(bb);
+	return ret;
+}
+
+/*
+ * Check one epoch.  Scans the block labels, starting at hint, for a
+ * block tagged RootTag whose label epoch equals epoch, marks that
+ * root in all four bitmaps, and walks the tree below its first entry.
+ * The second entry of the root then becomes the hint for the next call.
+ */
+static void
+chkEpoch(u32int epoch)
+{
+	u32int a, i, start;
+	Label l;
+	Entry e;
+	Block *b;
+
+	print("chkEpoch %ud\n", epoch);
+
+	/*
+	 * find root block.  The hint is reduced modulo nblocks first so
+	 * that start+i cannot wrap around 32 bits (hint may be NilBlock),
+	 * which would make the scan skip or repeat addresses.
+	 */
+	start = 0;
+	if(nblocks > 0)
+		start = hint % nblocks;
+	a = 0;
+	for(i=0; i<nblocks; i++){
+		a = (start+i) % nblocks;
+		if(!readLabel(&l, a)){
+			/* l holds nothing useful here; report the address actually probed */
+			error("could not read label: addr %ux: %R", a);
+			continue;
+		}
+		if(l.tag == RootTag && l.epoch == epoch)
+			break;
+	}
+
+	if(i == nblocks){
+		print("chkEpoch: could not find root block for epoch: %ud\n", epoch);
+		return;
+	}
+
+	b = cacheLocalData(cache, a, BtDir, RootTag, OReadOnly, 0);
+	if(b == nil){
+		/* error() appends the newline itself */
+		error("could not read root block %ux: %R", a);
+		return;
+	}
+
+	/* no one should point at the root blocks */
+	setBit(amap, a);
+	setBit(emap, a);
+	setBit(vmap, a);
+	setBit(xmap, a);
+
+	/*
+	 * First entry is the rest of the file system.
+	 * Second entry is link to previous epoch root,
+	 * just a convenience to help the search.
+	 */
+	if(!entryUnpack(&e, b->data, 0)){
+		error("could not unpack root block %ux: %R", a);
+		blockPut(b);
+		return;
+	}
+	walk(b, e.score, BtDir, e.tag, epoch);
+	if(entryUnpack(&e, b->data, 1))
+		hint = globalToLocal(e.score);
+	blockPut(b);
+}
+
+/*
+ * We've just walked the whole write buffer.  Notice blocks that
+ * aren't marked available but that we didn't visit.  They are lost.
+ *
+ * A block not marked in amap counts as free when its label state is
+ * BsFree or its epochClose is at or below fs->elo; otherwise it is
+ * reported as unreachable together with a "# bfree" suggestion.
+ * A usage summary is written to standard error at the end.
+ */
+static void
+chkFree(void)
+{
+	u32int a;
+	Label l;
+	u32int nfree;
+	u32int nlost;
+
+	nfree = 0;
+	nlost = 0;
+	/* examine the label of every data block */
+	for(a=0; a<nblocks; a++){
+		if(!readLabel(&l, a)){
+			/* l was not filled in, so only the address can be reported */
+			error("could not read label: addr %ux: %R", a);
+			continue;
+		}
+		if(getBit(amap, a))
+			continue;
+		if(l.state == BsFree || l.epochClose <= fs->elo){
+			nfree++;
+			setBit(amap, a);
+			continue;
+		}
+		nlost++;
+		warn("unreachable block: addr %ux type %d tag %ux state %s epoch %ud",
+			a, l.type, l.tag, bsStr(l.state), l.epoch);
+		print("# bfree %#ux\n", a);
+		setBit(amap, a);
+	}
+	/* "free space" is the free count; the percentage is relative to nblocks */
+	fprint(2, "\tused=%ud free space = %ud(%f%%) lost=%ud\n",
+		nblocks-nfree-nlost, nfree, 100.*nfree/nblocks, nlost);
+}
+
+/*
+ * Open entry number offset of directory source s on behalf of the
+ * directory entry called name, recording the reference in bitmap bm.
+ * The open is rejected, with a warning and a "# clri" suggestion,
+ * when the entry is already referenced, cannot be opened, or has a
+ * generation or directory flag different from gen and dir.
+ * Returns the open source, or nil on rejection.
+ */
+static Source *
+openSource(Source *s, char *name, uchar *bm, u32int offset, u32int gen, int dir)
+{
+	Source *src;
+
+	src = nil;
+	if(getBit(bm, offset))
+		warn("multiple references to source: %s -> %d", name, offset);
+	else{
+		setBit(bm, offset);
+		src = sourceOpen(s, offset, OReadOnly);
+		if(src == nil)
+			warn("could not open source: %s -> %d: %R", name, offset);
+		else if(src->gen != gen)
+			warn("source has been removed: %s -> %d", name, offset);
+		else if(src->dir != dir)
+			warn("dir mismatch: %s -> %d", name, offset);
+		else
+			return src;
+	}
+
+	/* every rejection ends the same way */
+	print("# clri %s\n", name);
+	if(src != nil)
+		sourceClose(src);
+	return nil;
+}
+
+/*
+ * qsort comparison: order MetaChunks by ascending offset.
+ */
+static int
+offsetCmp(void *s0, void *s1)
+{
+	ushort a, b;
+
+	a = ((MetaChunk*)s0)->offset;
+	b = ((MetaChunk*)s1)->offset;
+	return (a > b) - (a < b);
+}
+
+/*
+ * Check the layout of a MetaBlock.  The index slots are decoded
+ * into MetaChunks and sorted by offset; then each chunk must start
+ * at or beyond the header-plus-index area grown by the sizes of the
+ * chunks before it, the last chunk must end within mb->size, and the
+ * space left over must equal mb->free.
+ * Returns 1 if the block passes; otherwise dumps the chunk table to
+ * standard error and returns 0.
+ */
+int
+chkMetaBlock(MetaBlock *mb)
+{
+	MetaChunk *chunks;
+	uchar *ip;
+	int i, ok, base, used, last, lastsize;
+
+	chunks = vtMemAlloc(mb->nindex*sizeof(MetaChunk));
+	ip = mb->buf + MetaHeaderSize;
+	for(i = 0; i < mb->nindex; i++, ip += MetaIndexSize){
+		chunks[i].offset = (ip[0]<<8) | ip[1];
+		chunks[i].size = (ip[2]<<8) | ip[3];
+		chunks[i].index = i;
+	}
+
+	qsort(chunks, mb->nindex, sizeof(MetaChunk), offsetCmp);
+
+	/* check block looks ok */
+	base = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+	used = base;
+	last = base;
+	lastsize = 0;
+	ok = 1;
+	for(i = 0; ok && i < mb->nindex; i++){
+		last = chunks[i].offset;
+		lastsize = chunks[i].size;
+		if(last < used)
+			ok = 0;
+		else
+			used += lastsize;
+	}
+	if(ok && last+lastsize > mb->size)
+		ok = 0;
+	if(ok && mb->size - used != mb->free)
+		ok = 0;
+
+	if(!ok){
+		fprint(2, "metaChunks failed!\n");
+		used = base;
+		for(i = 0; i < mb->nindex; i++){
+			fprint(2, "\t%d: %d %d\n", i, chunks[i].offset, chunks[i].offset + chunks[i].size);
+			used += chunks[i].size;
+		}
+		fprint(2, "\tused=%d size=%d free=%d free2=%d\n", used, mb->size, mb->free, mb->size - used);
+	}
+
+	vtMemFree(chunks);
+	return ok;
+}
+
+/*
+ * Walk the source tree making sure that the BtData
+ * sources containing directory entries are okay.
+ *
+ * Walks onto Venti, so takes a long time.
+ *
+ * name is the path used in messages; source holds the directory's
+ * entries and meta its meta blocks.  Every meta block is unpacked and
+ * checked, its directory entries must be in strictly increasing name
+ * order, and each entry's sources are opened through openSource,
+ * recursing into subdirectories.  Afterwards any entry of source that
+ * can be opened but was never referenced is reported.
+ * Both sources stay locked for the duration of the call.
+ */
+static void
+chkDir(char *name, Source *source, Source *meta)
+{
+	uchar *bm;
+	Block *b, *bb;
+	u32int nb, o;
+	MetaBlock mb;
+	DirEntry de;
+	MetaEntry me;
+	int i;
+	char *s, *nn;
+	Source *r, *mr;
+
+	if(fast && globalToLocal(source->score)==NilBlock && globalToLocal(meta->score)==NilBlock)
+		return;
+
+	if(!sourceLock2(source, meta, OReadOnly)){
+		warn("could not lock sources for %s: %R", name);
+		return;
+	}
+
+	/* one bit per entry of source: set once a directory entry refers to it */
+	bm = vtMemAllocZ(sourceGetDirSize(source)/8 + 1);
+
+	nb = (sourceGetSize(meta) + meta->dsize - 1)/meta->dsize;
+	for(o=0; o<nb; o++){
+		b = sourceBlock(meta, o, OReadOnly);
+		if(b == nil){
+			error("could not read block in meta file: %s[%ud]: %R", name, o);
+			continue;
+		}
+		/* disabled debugging aid; placed after the nil check so enabling it is safe */
+		if(0)fprint(2, "source %V:%d block %d addr %d\n", source->score, source->offset, o, b->addr);
+		if(!mbUnpack(&mb, b->data, meta->dsize)){
+			error("could not unpack meta block: %s[%ud]: %R", name, o);
+			blockPut(b);
+			continue;
+		}
+		if(!chkMetaBlock(&mb)){
+			error("bad meta block: %s[%ud]: %R", name, o);
+			blockPut(b);
+			continue;
+		}
+		/* s is the previous element name within this meta block */
+		s = vtStrDup("");
+		for(i=0; i<mb.nindex; i++){
+			meUnpack(&me, &mb, i);
+			if(!deUnpack(&de, &me)){
+				error("cound not unpack dir entry: %s[%ud][%d]: %R", name, o, i);
+				continue;
+			}
+			if(strcmp(s, de.elem) >= 0)
+				error("dir entry out of order: %s[%ud][%d] = %s last = %s", name, o, i,
+					de.elem, s);
+			vtMemFree(s);
+			s = vtStrDup(de.elem);
+			nn = smprint("%s/%s", name, de.elem);
+			if(!(de.mode & ModeDir)){
+				/* plain file: just verify the data source opens */
+				r = openSource(source, nn, bm, de.entry, de.gen, 0);
+				if(r != nil)
+					sourceClose(r);
+				deCleanup(&de);
+				free(nn);
+				continue;
+			}
+
+			r = openSource(source, nn, bm, de.entry, de.gen, 1);
+			if(r == nil){
+				deCleanup(&de);
+				free(nn);
+				continue;
+			}
+
+			mr = openSource(source, nn, bm, de.mentry, de.mgen, 0);
+			if(mr == nil){
+				sourceClose(r);
+				deCleanup(&de);
+				free(nn);
+				continue;
+			}
+
+			chkDir(nn, r, mr);
+
+			sourceClose(mr);
+			sourceClose(r);
+			/* release the entry exactly once */
+			deCleanup(&de);
+			free(nn);
+		}
+		vtMemFree(s);
+		blockPut(b);
+	}
+
+	/* report entries that open but that no directory entry referred to */
+	nb = sourceGetDirSize(source);
+	for(o=0; o<nb; o++){
+		if(getBit(bm, o))
+			continue;
+		r = sourceOpen(source, o, OReadOnly);
+		if(r == nil)
+			continue;
+		warn("non referenced entry in source %s[%d]", name, o);
+		if((bb = sourceBlock(source, o/(source->dsize/VtEntrySize), OReadOnly)) != nil){
+			if(bb->addr != NilBlock)
+				print("# clre %#ux %d\n", bb->addr, o%(source->dsize/VtEntrySize));
+			blockPut(bb);
+		}
+		sourceClose(r);
+	}
+
+	sourceUnlock(source);
+	sourceUnlock(meta);
+	vtMemFree(bm);
+}
+
+
+/*
+ * Set bit number addr in bitmap bmap (eight bits per byte,
+ * least significant bit first).
+ */
+static void
+setBit(uchar *bmap, ulong addr)
+{
+	ulong byte, bit;
+
+	byte = addr / 8;
+	bit = addr % 8;
+	bmap[byte] |= 1 << bit;
+}
+
+/*
+ * Return 1 if bit number addr is set in bitmap bmap, else 0.
+ */
+static int
+getBit(uchar *bmap, ulong addr)
+{
+	uchar byte;
+
+	byte = bmap[addr / 8];
+	return (byte >> (addr % 8)) & 1;
+}
+
+/*
+ * Read the label of data block addr into *l.
+ * Labels are packed bsize/LabelSize to a block in the label
+ * partition.  Returns 1 on success; returns 0, leaving *l
+ * unspecified, if the label block cannot be loaded or unpacked.
+ */
+static int
+readLabel(Label *l, u32int addr)
+{
+	int lpb;
+	Block *b;
+	u32int a;
+
+	lpb = bsize / LabelSize;
+	a = addr / lpb;
+	b = cacheLocal(cache, PartLabel, a, OReadOnly);
+	if(b == nil)
+		return 0;	/* nothing was acquired, so nothing to put back */
+
+	if(!labelUnpack(l, b->data, addr%lpb)){
+		print("labelUnpack %ux failed\n", addr);
+		blockPut(b);
+		return 0;
+	}
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * Format a message and print it on standard output as
+ * "error: <message>" followed by a newline.  The formatted
+ * message is limited to the 128-byte buffer.
+ * (A cap on the number of errors used to be sketched here in
+ * commented-out code; it was never enabled.)
+ */
+static void
+error(char *fmt, ...)
+{
+	va_list arg;
+	char buf[128];
+
+	va_start(arg, fmt);
+	vseprint(buf, buf+sizeof(buf), fmt, arg);
+	va_end(arg);
+
+	print("error: %s\n", buf);
+}
+
+/*
+ * Format a message and print it on standard output as
+ * "warn: <message>" followed by a newline.  The formatted
+ * message is limited to the 128-byte buffer.
+ */
+static void
+warn(char *fmt, ...)
+{
+	va_list arg;
+	char buf[128];
+
+	va_start(arg, fmt);
+	vseprint(buf, buf+sizeof(buf), fmt, arg);
+	va_end(arg);
+
+	print("warn: %s\n", buf);
+}

+ 553 - 0
sys/src/cmd/fossil/flfmt.c

@@ -0,0 +1,553 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+#define blockWrite _blockWrite	/* hack */
+
+static void usage(void);
+static u64int fdsize(int fd);
+static void partition(int fd, int bsize, Header *h);
+static void writeBlock(int fd, uchar *buf, int bsize, ulong bn);
+static u64int unittoull(char *s);
+static u32int blockAlloc(int type, u32int tag);
+static void blockRead(int part, u32int addr);
+static void blockWrite(int part, u32int addr);
+static void superInit(char *label, u32int root);
+static void rootMetaInit(Entry *e);
+static u32int rootInit(Entry *e);
+static void topLevel(char *name);
+static int parseScore(uchar[VtScoreSize], char*);
+static u32int ventiRoot(char*, char*);
+static VtSession *z;
+
+#define TWID64	((u64int)~(u64int)0)
+
+Disk *disk;
+Fs *fs;
+uchar *buf;
+int bsize = 8*1024;
+u64int qid = 1;
+
+/*
+ * Ask a yes/no question on standard error and read the reply from
+ * standard input.  Returns 1 only if the reply begins with 'y';
+ * a failed or empty read counts as "no".
+ */
+int
+confirm(char *msg)
+{
+	char reply[100];
+
+	fprint(2, "%s [y/n]: ", msg);
+	if(read(0, reply, sizeof reply - 1) <= 0)
+		return 0;
+	return reply[0] == 'y';
+}
+
+/*
+ * flfmt: write a fresh fossil file system onto the file named by
+ * the single argument.  Writes the header, zeroes the label
+ * partition, builds the root (either empty or from a venti score
+ * given with -v), writes the super block, and finally reopens the
+ * result as a file system to create the top-level directories.
+ *
+ * Options: -b blocksize, -h venti host, -l label, -v score, -y.
+ */
+void
+main(int argc, char *argv[])
+{
+	int fd, force, mounted;
+	Header h;
+	ulong bn;
+	Entry e;
+	char *label = "vfs";
+	char *host = nil;
+	char *score = nil;
+	u32int root;
+	Dir *d;
+
+	force = 0;
+	ARGBEGIN{
+	default:
+		usage();
+	case 'b':
+		bsize = unittoull(EARGF(usage()));
+		if(bsize == ~0)
+			usage();
+		break;
+	case 'h':
+		host = EARGF(usage());
+		break;
+	case 'l':
+		label = EARGF(usage());
+		break;
+	case 'v':
+		score = EARGF(usage());
+		break;
+
+	/*
+	 * This is -y instead of -f because flchk has a
+	 * (frequently used) -f option.  I type flfmt instead
+	 * of flchk all the time, and want to make it hard
+	 * to reformat my file system accidentally.
+	 */
+	case 'y':
+		force = 1;
+		break;
+	}ARGEND
+
+	if(argc != 1)
+		usage();
+
+	vtAttach();
+
+	fmtinstall('V', scoreFmt);
+	fmtinstall('R', vtErrFmt);
+	fmtinstall('L', labelFmt);
+
+	fd = open(argv[0], ORDWR);
+	if(fd < 0)
+		vtFatal("could not open file: %s: %r", argv[0]);
+
+	buf = vtMemAllocZ(bsize);
+	if(pread(fd, buf, bsize, HeaderOffset) != bsize)
+		vtFatal("could not read fs header block: %r");
+
+	if(headerUnpack(&h, buf) && !force
+	&& !confirm("fs header block already exists; are you sure?"))
+		goto Out;
+
+	if((d = dirfstat(fd)) == nil)
+		vtFatal("dirfstat: %r");
+	/* only the device type is needed; release the Dir right away */
+	mounted = d->type == 'M';
+	free(d);
+
+	if(mounted && !force
+	&& !confirm("fs file is mounted via devmnt (is not a kernel device); are you sure?"))
+		goto Out;
+
+	partition(fd, bsize, &h);
+	headerPack(&h, buf);
+	if(pwrite(fd, buf, bsize, HeaderOffset) < bsize)
+		vtFatal("could not write fs header: %r");
+
+	disk = diskAlloc(fd);
+	if(disk == nil)
+		vtFatal("could not open disk: %r");
+
+	/* zero labels */
+	memset(buf, 0, bsize);
+	for(bn = 0; bn < diskSize(disk, PartLabel); bn++)
+		blockWrite(PartLabel, bn);
+
+	if(score)
+		root = ventiRoot(host, score);
+	else{
+		rootMetaInit(&e);
+		root = rootInit(&e);
+	}
+
+	superInit(label, root);
+	diskFree(disk);
+
+	/* argc is known to be 1 here */
+	topLevel(argv[0]);
+
+Out:
+	vtDetach();
+	exits(0);
+}
+
+/*
+ * Return the length of the file open on fd, as reported by
+ * dirfstat.  Exits via vtFatal if the stat fails.
+ */
+static u64int
+fdsize(int fd)
+{
+	Dir *st;
+	u64int len;
+
+	if((st = dirfstat(fd)) == nil)
+		vtFatal("could not stat file: %r");
+	len = st->length;
+	free(st);
+	return len;
+}
+
+/*
+ * Print the command synopsis on standard error and exit with
+ * status "usage".  The synopsis lists the options main parses.
+ */
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-b blocksize] [-h host] [-l label] [-v score] [-y] file\n", argv0);
+	exits("usage");
+}
+
+/*
+ * Lay out the partitions for a file of fd's size using blocks of
+ * bsize bytes and record the layout in *h: the super block, then
+ * the label blocks, then the data blocks, with one label block for
+ * every bsize/LabelSize data blocks.
+ * Exits if the block size is unusable or the file is too small.
+ */
+static void
+partition(int fd, int bsize, Header *h)
+{
+	ulong nblock, ndata, nlabel;
+	ulong lpb;
+
+	/* zero would pass the %512 test and then divide by zero below */
+	if(bsize <= 0 || bsize % 512 != 0)
+		sysfatal("block size must be a multiple of 512 bytes");
+	if(bsize > VtMaxLumpSize)
+		sysfatal("block size must be no larger than %d", VtMaxLumpSize);
+
+	memset(h, 0, sizeof(*h));
+	h->blockSize = bsize;
+
+	lpb = bsize/LabelSize;
+
+	nblock = fdsize(fd)/bsize;
+
+	/* sanity check */
+	if(nblock < (HeaderOffset*10)/bsize)
+		vtFatal("file too small");
+
+	h->super = (HeaderOffset + 2*bsize)/bsize;
+	h->label = h->super + 1;
+	ndata = ((u64int)lpb)*(nblock - h->label)/(lpb+1);
+	nlabel = (ndata + lpb - 1)/lpb;
+	h->data = h->label + nlabel;
+	h->end = h->data + ndata;
+
+	assert(h->end == nblock);
+}
+
+/*
+ * Return a random tag strictly greater than RootTag.
+ */
+static u32int
+tagGen(void)
+{
+	u32int tag;
+
+	do
+		tag = lrand();
+	while(tag <= RootTag);
+	return tag;
+}
+
+/*
+ * Fill *e with the defaults for a new, empty, active entry:
+ * zero score and size, depth 0, block sizes derived from the global
+ * bsize, a freshly generated tag, and no snapshot or archive mark.
+ */
+static void
+entryInit(Entry *e)
+{
+	/* identity */
+	e->gen = 0;
+	e->flags = VtEntryActive;
+	e->tag = tagGen();
+
+	/* geometry: pointer blocks hold a whole number of entries */
+	e->dsize = bsize;
+	e->psize = bsize/VtEntrySize*VtEntrySize;
+	e->depth = 0;
+
+	/* contents: empty */
+	e->size = 0;
+	memmove(e->score, vtZeroScore, VtScoreSize);
+
+	/* history */
+	e->snap = 0;
+	e->archive = 0;
+}
+
+/*
+ * Create the meta data block for the root directory: a single
+ * DirEntry named "root", owned by adm, mode ModeDir|0555, written
+ * to a newly allocated BtData block.  On return *e is the local
+ * entry that points at that block.
+ */
+static void
+rootMetaInit(Entry *e)
+{
+	u32int addr;
+	u32int tag;
+	long now;
+	DirEntry de;
+	MetaBlock mb;
+	MetaEntry me;
+
+	/* one reading of the clock so the three times always agree */
+	now = time(0);
+
+	memset(&de, 0, sizeof(de));
+	de.elem = vtStrDup("root");
+	de.entry = 0;
+	de.gen = 0;
+	de.mentry = 1;
+	de.mgen = 0;
+	de.size = 0;
+	de.qid = qid++;
+	de.uid = vtStrDup("adm");
+	de.gid = vtStrDup("adm");
+	de.mid = vtStrDup("adm");
+	de.mtime = now;
+	de.mcount = 0;
+	de.ctime = now;
+	de.atime = now;
+	de.mode = ModeDir | 0555;
+
+	tag = tagGen();
+	addr = blockAlloc(BtData, tag);
+
+	/* build up meta block */
+	memset(buf, 0, bsize);
+	mbInit(&mb, buf, bsize, bsize/100);
+	me.size = deSize(&de);
+	me.p = mbAlloc(&mb, me.size);
+	assert(me.p != nil);
+	dePack(&de, &me);
+	mbInsert(&mb, 0, &me);
+	mbPack(&mb);
+	blockWrite(PartData, addr);
+	deCleanup(&de);
+
+	/* build up entry for meta block; the tag must match the block's label */
+	entryInit(e);
+	e->flags |= VtEntryLocal;
+	e->size = bsize;
+	e->tag = tag;
+	localToGlobal(addr, e->score);
+}
+
+/*
+ * Build the two directory blocks at the top of a new file system.
+ * On entry *e is the root meta data entry (see rootMetaInit).
+ * The first block holds three entries: an empty directory (0), an
+ * empty source (1) and the supplied meta entry (2).  The second
+ * block, labelled RootTag, holds one entry pointing at the first.
+ * Returns the address of the RootTag block; *e is overwritten.
+ */
+static u32int
+rootInit(Entry *e)
+{
+	ulong dirblock, rootblock;
+	u32int dirtag;
+
+	dirtag = tagGen();
+	dirblock = blockAlloc(BtDir, dirtag);
+	memset(buf, 0, bsize);
+
+	/* root meta data is in the third entry */
+	entryPack(e, buf, 2);
+
+	/* first entry: empty directory */
+	entryInit(e);
+	e->flags |= VtEntryDir;
+	entryPack(e, buf, 0);
+
+	/* second entry: empty source */
+	entryInit(e);
+	entryPack(e, buf, 1);
+
+	blockWrite(PartData, dirblock);
+
+	/* entry describing the three-entry block just written */
+	entryInit(e);
+	e->flags |= VtEntryLocal|VtEntryDir;
+	e->size = VtEntrySize*3;
+	e->tag = dirtag;
+	localToGlobal(dirblock, e->score);
+
+	rootblock = blockAlloc(BtDir, RootTag);
+	memset(buf, 0, bsize);
+	entryPack(e, buf, 0);
+	blockWrite(PartData, rootblock);
+
+	return rootblock;
+}
+
+	
+/*
+ * Allocate the next data block in sequence (addresses are handed
+ * out from 0 upward) and write its label: epoch 1, never closed,
+ * state BsAlloc, with the given type and tag.  The label must
+ * currently be free.  Uses the global buf as scratch space.
+ * Returns the address of the allocated block.
+ */
+static u32int
+blockAlloc(int type, u32int tag)
+{
+	static u32int next;
+	Label lab;
+	int perblock, slot;
+	u32int lblock, got;
+
+	perblock = bsize/LabelSize;
+	lblock = next/perblock;
+	slot = next % perblock;
+
+	blockRead(PartLabel, lblock);
+	if(!labelUnpack(&lab, buf, slot) || lab.state != BsFree)
+		vtFatal("bad label: %r");
+
+	lab.type = type;
+	lab.tag = tag;
+	lab.state = BsAlloc;
+	lab.epoch = 1;
+	lab.epochClose = ~(u32int)0;
+
+	labelPack(&lab, buf, slot);
+	blockWrite(PartLabel, lblock);
+
+	got = next;
+	next++;
+	return got;
+}
+
+/*
+ * Write the initial super block: both epochs set to 1, the next
+ * qid to hand out, root as the active block, no next or current
+ * block, the given label as name, and the zero score as last.
+ */
+static void
+superInit(char *label, u32int root)
+{
+	Super sb;
+
+	memset(&sb, 0, sizeof(sb));
+	sb.version = SuperVersion;
+	sb.epochLow = 1;
+	sb.epochHigh = 1;
+	sb.qid = qid;
+	sb.active = root;
+	sb.next = NilBlock;
+	sb.current = NilBlock;
+	strecpy(sb.name, sb.name+sizeof(sb.name), label);
+	memmove(sb.last, vtZeroScore, VtScoreSize);
+
+	memset(buf, 0, bsize);
+	superPack(&sb, buf);
+	blockWrite(PartSuper, 0);
+}
+
+/*
+ * Parse a size such as "8192", "8k", "1M" or "2g": a number in any
+ * base strtoul accepts, optionally followed by one k, m or g suffix
+ * (either case) multiplying by 2^10, 2^20 or 2^30.
+ * Returns TWID64 for a nil string, a string with no digits, or
+ * trailing garbage.
+ */
+static u64int
+unittoull(char *s)
+{
+	char *es;
+	u64int n;
+
+	if(s == nil)
+		return TWID64;
+	n = strtoul(s, &es, 0);
+	/* no digits consumed: "" or a bare suffix is not a size */
+	if(es == s)
+		return TWID64;
+	if(*es == 'k' || *es == 'K'){
+		n *= 1024;
+		es++;
+	}else if(*es == 'm' || *es == 'M'){
+		n *= 1024*1024;
+		es++;
+	}else if(*es == 'g' || *es == 'G'){
+		n *= 1024*1024*1024;
+		es++;
+	}
+	if(*es != '\0')
+		return TWID64;
+	return n;
+}
+
+/*
+ * Read block addr of partition part into the global buf;
+ * exits via vtFatal on failure.
+ */
+static void
+blockRead(int part, u32int addr)
+{
+	if(diskReadRaw(disk, part, addr, buf))
+		return;
+	vtFatal("read failed: %r");
+}
+
+/*
+ * Write the global buf to block addr of partition part;
+ * exits via vtFatal on failure.
+ */
+static void
+blockWrite(int part, u32int addr)
+{
+	if(diskWriteRaw(disk, part, addr, buf))
+		return;
+	vtFatal("write failed: %r");
+}
+
+/*
+ * Create directory name under root with the given permission
+ * bits, owned by adm.  Exits via vtFatal if creation fails.
+ */
+static void
+addFile(File *root, char *name, uint mode)
+{
+	File *dir;
+
+	dir = fileCreate(root, name, mode | ModeDir, "adm");
+	if(dir == nil)
+		vtFatal("could not create file: %s: %r", name);
+	fileDecRef(dir);
+}
+
+/*
+ * Open the freshly formatted file as a file system and create the
+ * three top-level directories: active, archive and snapshot.
+ */
+static void
+topLevel(char *name)
+{
+	Fs *fsys;
+	File *root;
+
+	/* ok, now we can open as a fs */
+	if((fsys = fsOpen(name, z, 100, OReadWrite)) == nil)
+		vtFatal("could not open file system: %r");
+
+	vtRLock(fsys->elk);
+	if((root = fsGetRoot(fsys)) == nil)
+		vtFatal("could not open root: %r");
+
+	/* BUG: add create command to Ccmd instead */
+	addFile(root, "active", 0777);
+	addFile(root, "archive", 0555);
+	addFile(root, "snapshot", 0555);
+
+	fileDecRef(root);
+	vtRUnlock(fsys->elk);
+	fsClose(fsys);
+}
+
+/*
+ * Read the venti block with the given score and type into the
+ * global buf and zero-extend it to bsize bytes.
+ * Returns the number of bytes venti supplied; exits on failure.
+ */
+static int
+ventiRead(uchar score[VtScoreSize], int type)
+{
+	int got;
+
+	if((got = vtRead(z, score, type, buf, bsize)) < 0)
+		vtFatal("ventiRead %V (%d) failed: %R", score, type);
+	vtZeroExtend(type, buf, got, bsize);
+	return got;
+}
+
+/*
+ * Initialise the file system from a venti archive.  s is the root
+ * score as hex text and host the venti server to dial.  The block
+ * holding the three root sources is copied from venti into a newly
+ * allocated local block, the global qid is set from the qidMax
+ * recorded in the root's meta data, and a RootTag block pointing at
+ * the copy is written.  Returns the address of the RootTag block.
+ * Exits via vtFatal on any error.
+ */
+static u32int
+ventiRoot(char *host, char *s)
+{
+	int i, n;
+	uchar score[VtScoreSize];
+	u32int addr, tag;
+	DirEntry de;
+	MetaBlock mb;
+	MetaEntry me;
+	Entry e;
+	VtRoot root;
+
+	if(!parseScore(score, s))
+		vtFatal("bad score '%s'", s);
+
+	if((z = vtDial(host, 0)) == nil
+	|| !vtConnect(z, nil))
+		vtFatal("connect to venti: %R");
+
+	tag = tagGen();
+	addr = blockAlloc(BtDir, tag);
+
+	ventiRead(score, VtRootType);
+	if(!vtRootUnpack(&root, buf))
+		vtFatal("corrupted root: vtRootUnpack");
+	n = ventiRead(root.score, VtDirType);
+
+	/*
+	 * Fossil's vac archives start with an extra layer of source,
+	 * but vac's don't.
+	 */
+	if(n <= 2*VtEntrySize){
+		if(!entryUnpack(&e, buf, 0))
+			vtFatal("bad root: top entry");
+		n = ventiRead(e.score, VtDirType);
+	}
+
+	/*
+	 * There should be three root sources (and nothing else) here.
+	 */
+	for(i=0; i<3; i++){
+		if(!entryUnpack(&e, buf, i)
+		|| !(e.flags&VtEntryActive)
+		|| e.psize < 256
+		|| e.dsize < 256)
+			vtFatal("bad root: entry %d", i);
+		fprint(2, "%V\n", e.score);
+	}
+	if(n > 3*VtEntrySize)
+		vtFatal("bad root: entry count");
+
+	/* buf still holds the three-entry block; store it locally */
+	blockWrite(PartData, addr);
+
+	/*
+	 * Maximum qid is recorded in root's msource, entry #2 (conveniently in e).
+	 */
+	ventiRead(e.score, VtDataType);
+	if(!mbUnpack(&mb, buf, bsize))
+		vtFatal("bad root: mbUnpack");
+	meUnpack(&me, &mb, 0);
+	if(!deUnpack(&de, &me))
+		vtFatal("bad root: dirUnpack");
+	if(!de.qidSpace)
+		vtFatal("bad root: no qidSpace");
+	qid = de.qidMax;
+	/* release what deUnpack filled in; only qidMax was needed */
+	deCleanup(&de);
+
+	/*
+	 * Recreate the top layer of source.
+	 */
+	entryInit(&e);
+	e.flags |= VtEntryLocal|VtEntryDir;
+	e.size = VtEntrySize*3;
+	e.tag = tag;
+	localToGlobal(addr, e.score);
+
+	addr = blockAlloc(BtDir, RootTag);
+	memset(buf, 0, bsize);
+	entryPack(&e, buf, 0);
+	blockWrite(PartData, addr);
+
+	return addr;
+}
+
+/*
+ * Convert the first VtScoreSize*2 characters of buf, which must be
+ * hexadecimal digits in either case, into the VtScoreSize bytes of
+ * score.  Returns 1 on success; returns 0 if buf is too short or
+ * contains a non-hex character in that range (score is then
+ * partially filled).
+ */
+static int
+parseScore(uchar *score, char *buf)
+{
+	int i, nib;
+	char ch;
+
+	memset(score, 0, VtScoreSize);
+
+	if(strlen(buf) < VtScoreSize*2)
+		return 0;
+	for(i=0; i<VtScoreSize*2; i++){
+		ch = buf[i];
+		if('0' <= ch && ch <= '9')
+			nib = ch - '0';
+		else if('a' <= ch && ch <= 'f')
+			nib = ch - 'a' + 10;
+		else if('A' <= ch && ch <= 'F')
+			nib = ch - 'A' + 10;
+		else
+			return 0;
+
+		/* even positions are the high nibble of the byte */
+		if(i%2 == 0)
+			nib <<= 4;
+		score[i/2] |= nib;
+	}
+	return 1;
+}
+

+ 13 - 0
sys/src/cmd/fossil/flproto

@@ -0,0 +1,13 @@
+#
+# Test filesystem.
+#
+fsys main config /tmp/fossil
+fsys main open -AWP
+fsys main 
+uname rsc :rsc
+uname sys +rsc
+uname jmk :jmk
+uname sys +jmk
+srv -p test.fscons
+srv test.fossil
+create /active/tmp sys sys d777

+ 98 - 0
sys/src/cmd/fossil/fns.h

@@ -0,0 +1,98 @@
+Source* sourceRoot(Fs*, u32int, int);
+Source* sourceOpen(Source*, ulong, int);
+Source* sourceCreate(Source*, int, int, u32int);
+Block* sourceBlock(Source*, ulong, int);
+int sourceGetEntry(Source*, Entry*);
+int sourceSetSize(Source*, uvlong);
+uvlong sourceGetSize(Source*);
+int sourceSetDirSize(Source*, ulong);
+ulong sourceGetDirSize(Source*);
+int sourceTruncate(Source*);
+int sourceRemove(Source*);
+void sourceClose(Source*);
+int sourceLock(Source*, int);
+void sourceUnlock(Source*);
+int sourceLock2(Source*, Source*, int);
+
+Cache* cacheAlloc(Disk*, VtSession*, ulong, int);
+void cacheFree(Cache*);
+Block* cacheLocal(Cache*, int, u32int, int);
+Block* cacheLocalData(Cache*, u32int, int, u32int, int, u32int);
+Block* cacheGlobal(Cache*, uchar[VtScoreSize], int, u32int, int);
+Block* cacheAllocBlock(Cache*, int, u32int, u32int, u32int);
+void cacheFlush(Cache*, int);
+u32int cacheLocalSize(Cache*, int);
+
+Block* blockCopy(Block*, u32int, u32int, u32int);
+void blockDupLock(Block*);
+void blockPut(Block*);
+void blockDependency(Block*, Block*, int, uchar*);
+int blockDirty(Block*);
+int blockRemoveLink(Block*, u32int, int, u32int);
+int blockSetLabel(Block*, Label*);
+Block* _blockSetLabel(Block*, Label*);
+void blockSetIOState(Block*, int);
+int blockWrite(Block*);
+uchar* blockRollback(Block*, uchar*);
+
+Disk* diskAlloc(int);
+void diskFree(Disk*);
+int diskReadRaw(Disk*, int, u32int, uchar*);
+int diskWriteRaw(Disk*, int, u32int, uchar*);
+void diskRead(Disk*, Block*);
+void diskWrite(Disk*, Block*);
+int diskFlush(Disk*);
+u32int diskSize(Disk*, int);
+int diskBlockSize(Disk*);
+
+char* bsStr(int);
+char* bioStr(int);
+char* btStr(int);
+u32int globalToLocal(uchar[VtScoreSize]);
+void localToGlobal(u32int, uchar[VtScoreSize]);
+
+int headerUnpack(Header*, uchar*);
+void headerPack(Header*, uchar*);
+
+int labelFmt(Fmt*);
+int labelUnpack(Label*, uchar*, int);
+void labelPack(Label*, uchar*, int);
+
+int scoreFmt(Fmt*);
+
+int superUnpack(Super*, uchar*);
+void superPack(Super*, uchar*);
+
+int entryUnpack(Entry*, uchar*, int);
+void entryPack(Entry*, uchar*, int);
+int entryType(Entry*);
+
+Periodic* periodicAlloc(void (*)(void*), void*, int);
+void periodicKill(Periodic*);
+
+File* fileRoot(Source*);
+int fileSnapshot(File*, File*, u32int, int);
+int fileGetSources(File*, Entry*, Entry*, int);
+int mkVac(VtSession*, uint, Entry*, Entry*, DirEntry*, uchar[VtScoreSize]);
+int fsNextQid(Fs*, u64int*);
+Block* superGet(Cache*, Super*);
+void superPut(Block*, Super*, int);
+
+Arch* archInit(Cache*, Disk*, Fs*, VtSession*);
+void archFree(Arch*);
+void archKick(Arch*);
+
+void bwatchLock(Block*);
+void bwatchUnlock(Block*);
+void bwatchInit(void);
+void bwatchSetBlockSize(uint);
+void bwatchDependency(Block*);
+void bwatchReset(uchar[VtScoreSize]);
+
+void initWalk(WalkPtr*, Block*, uint);
+int nextWalk(WalkPtr*, uchar[VtScoreSize], uchar*, u32int*, Entry**);
+
+void snapGetTimes(Snap*, u32int*, u32int*);
+void snapSetTimes(Snap*, u32int, u32int);
+
+#pragma varargck type "L" Label*

+ 186 - 0
sys/src/cmd/fossil/fossil-acid

@@ -0,0 +1,186 @@
+// pick up the common data structures
+
+rc("cd /sys/src/cmd/fossil; mk 9fsys.acid");
+include("/sys/src/cmd/fossil/9fsys.acid");
+rc("cd /sys/src/cmd/fossil; mk cache.acid");
+include("/sys/src/cmd/fossil/cache.acid");
+rc("cd /sys/src/cmd/fossil; mk disk.acid");
+include("/sys/src/cmd/fossil/disk.acid");
+rc("cd /sys/src/cmd/fossil; mk fs.acid");
+include("/sys/src/cmd/fossil/fs.acid");
+rc("cd /sys/src/libventi; mk plan9.acid");
+include("/sys/src/libventi/plan9.acid");
+
+// make a list of pids from a list of Thread structures:
+// follows the next links starting at t and appends each pid in turn
+defn _threadlist(t)
+{
+	local l;
+
+	l = {};
+	while t do {
+		t = (Thread)t;
+		l = append l, t.pid;
+		t = t.next;
+	}
+	return l;
+}
+
+// print info about a VtRendez:
+// whether the current pid is on its wait list (wfirst) or on the
+// queue of its lock (lk.qfirst), the addresses of both structures,
+// and the pid of the lock's writer if there is one
+defn vtrendez(r)
+{
+	local l, t, w, q;
+
+	r = (VtRendez)r;
+	w = _threadlist(r.wfirst);
+	if match(pid, w) >= 0 then
+		print("\twaiting for wakeup\n");
+
+	l = (VtLock)r.lk;
+	q = _threadlist(l.qfirst);
+	if match(pid, q) >= 0 then
+		print("\tawakened; waiting for lock\n");
+
+	print("\tr=(VtRendez)", r\X, "\n");
+	print("\tl=(VtLock)", l\X, "\n");
+	if l.writer != 0 then {
+		t = (Thread)l.writer;
+		print("\tvtLock is held by ", t.pid\D, "\n");
+	}
+}
+
+// print info about a VtLock:
+// its address and who holds it -- the writer's pid, the number of
+// readers, or a note that nobody holds it
+defn vtlock(l)
+{
+	local t;
+
+	l = (VtLock)l;
+	print("\tl=(VtLock)", l\X, "\n");
+	if l.writer then {
+		t = (Thread)l.writer;
+		print("\tvtLock is held by ", t.pid\D, "\n");
+	} else if l.readers then
+		print("\tvtLock is held by ", l.readers\D, " readers\n");
+	else 
+		print("\tvtLock is not held!\n");
+}
+
+// try to say something intelligent about why a process is stuck.
+// _pauses lists the functions deadlock() treats as places where a
+// process can be stalled; it looks for the first stack frame whose
+// function is in this list.
+_pauses = {
+	open,
+	pread,
+	pwrite,
+	sleep,
+	vtSleep,
+	vtLock,
+};
+
+// run deadlock() on every process in the list l, in order
+defn deadlocklist(l)
+{
+	while l do {
+		setproc(head l);
+		deadlock();
+		l = tail l;
+	}
+}
+
+// describe where the current process is stalled.
+// Walks the stack once, reporting the first frame whose function is in
+// _pauses (the stall) and the first frame whose source file lies under
+// /sys/src/cmd/fossil (the fossil caller).  For a few known
+// caller/stall pairs the block involved is printed too.  Afterwards the
+// rendezvous or lock being waited on is described; "confused" is
+// printed if either frame was not found.
+defn deadlock()
+{
+	local stk, frame, name, stallframe, fossilframe, stallname;
+
+	stk = strace(*PC, *SP, linkreg(0));
+
+	print("setproc(", pid, ") // ", readfile("/proc/"+itoa(pid)+"/args"), "\n");
+	stallframe = 0;
+	stallname = "";
+	fossilframe = 0;
+	while stk do {
+		frame = head stk;
+		name = fmt(frame[0], 'a');
+		if !stallframe && match(name, _pauses) >= 0 then {
+			stallframe = frame;
+			stallname = name;
+			print("\t", fmt(frame[0], 'a'), "(");
+			params(frame[2]);
+			print(") ", pcfile(frame[0]), ":", pcline(frame[0]));
+			print("\n\t\tcalled from ", fmt(frame[1], 'a'), " ");
+			pfl(frame[1]);
+		}
+		if !fossilframe && regexp("^/sys/src/cmd/fossil/.*", pcfile(frame[0])) then {
+			fossilframe = frame;
+			print("\t", fmt(frame[0], 'a'), "(");
+			params(frame[2]);
+			print(") ", pcfile(frame[0]), ":", pcline(frame[0]));
+			print("\n\t\tcalled from ", fmt(frame[1], 'a'), " ");
+			pfl(frame[1]);
+
+			if name == cacheLocalLookup && stallname == vtLock then
+				print("\twaiting to lock block b=(Block)", *cacheLocalLookup:b\X, "\n");
+			if name == cacheLocal && stallname == vtSleep then
+				print("\tsleeping on block b=(Block)", *cacheLocal:b\X, "\n");
+			if name == blockFlush && stallname == vtSleep then
+				print("\tsleeping on block b=(Block)", *blockFlush:b\X, "\n");
+		}
+		stk = tail stk;
+	}
+
+	if stallname == vtSleep then
+		vtrendez(*vtSleep:q);
+	if stallname == vtLock then
+		vtlock(*vtLock:p);
+	if !stallframe || !fossilframe then 
+		print("\tconfused\n");
+	print("\n");
+}
+
+// fetch fsys: always returns fsysmain; the name argument is ignored
+defn
+fsysGet(name)
+{
+	return fsysmain;
+}
+
+// dump information about the cache:
+// for each of the c.nblocks Block structures starting at c.blocks,
+// print its address, pc and reference count
+defn
+cacheDump(c)
+{
+	local i, b, x;
+
+	c = (Cache)c;
+	x = c.blocks;
+	i=0;
+	loop 1,c.nblocks do {
+		b = (Block)(x+i);
+		print(b\X, " ", b.pc\X, " ", b.ref\D, "\n");
+		i = i+sizeofBlock;
+	}
+}
+
+// print a BList on one line as comma-separated [part addr vers] triples
+defn
+printblist(bl)
+{
+	bl = (BList)bl;
+	while bl != 0 do {
+		print("[", bl.part\D, " ", bl.addr\X, " ", bl.vers\D, "]");
+		bl = bl.next;
+		if bl != 0 then
+			print(", ");
+	}
+}
+
+// print a Block: address, ref and nlock counts, its
+// [part addr vers] triple, and its prior and uhead lists
+defn
+block(b)
+{
+	local i;
+	
+	b = (Block)b;
+	print("b=(Block)", b\X, "\n");
+	print("\tref ", b.ref\D, " nlock ", b.nlock\D, "\n");
+	print("\tpav=[", b.part\D, " ", b.addr\X, " ", b.vers\D, "]\n");
+	print("\tprior=");
+	printblist(b.prior);
+	print("\n");
+	print("\tunlink=");
+	printblist(b.uhead);
+	print("\n");
+}

+ 94 - 0
sys/src/cmd/fossil/fossil.c

@@ -0,0 +1,94 @@
+#include "stdinc.h"
+
+#include "9.h"
+
+int Dflag;
+char* none = "none";
+int stdfd[2];
+
+static char* myname = "numpty";
+
+/*
+ * Report the command-line synopsis via sysfatal,
+ * using the invoked program name as argv0.
+ */
+static void
+usage(void)
+{
+	argv0 = myname;
+	sysfatal("usage: %s [-Dt] [-c cmd]", myname);
+}
+
+/*
+ * Entry point: parse the flags, initialise the subsystems, then run
+ * each -c command through cliExec in the order given.
+ *	-D	toggle Dflag
+ *	-c cmd	queue cmd for execution (may be repeated)
+ *	-t	set tflag (see the consTTY call below)
+ * Any other flag, or any leftover argument, calls usage.
+ */
+void
+main(int argc, char* argv[])
+{
+	char **cmd, *p;
+	int i, ncmd, tflag;
+
+	myname = argv[0];
+	fmtinstall('D', dirfmt);
+	fmtinstall('F', fcallfmt);
+	fmtinstall('M', dirmodefmt);
+	quotefmtinstall();
+
+	/*
+	 * Insulate from the invoker's environment.
+	 */
+	if(rfork(RFREND|RFNOTEG|RFNAMEG) < 0)
+		sysfatal("rfork: %r");
+
+	/* replace descriptors 0 and 1 with /dev/null; the results are not checked */
+	close(0);
+	open("/dev/null", OREAD);
+	close(1);
+	open("/dev/null", OWRITE);
+
+	cmd = nil;
+	ncmd = tflag = 0;
+
+	vtAttach();
+
+	ARGBEGIN{
+	case '?':
+	default:
+		usage();
+		break;
+	case 'D':
+		Dflag ^= 1;
+		break;
+	case 'c':
+		p = ARGF();
+		if(p == nil)
+			usage();
+		/* grow the command vector by one slot and append */
+		cmd = vtMemRealloc(cmd, (ncmd+1)*sizeof(char*));
+		cmd[ncmd++] = p;
+		break;
+	case 't':
+		tflag = 1;
+		break;
+	}ARGEND
+	if(argc != 0)
+		usage();
+
+	consInit();
+	cliInit();
+	procInit();
+	cmdInit();
+	fsysInit();
+	exclInit();
+	fidInit();
+
+	srvInit();
+	lstnInit();
+	usersInit();
+
+	/* run the queued commands in order; stop at the first one cliExec rejects */
+	for(i = 0; i < ncmd; i++){
+		if(cliExec(cmd[i]) == 0)
+			break;
+	}
+	vtMemFree(cmd);
+
+	/* with -t, a zero return from consTTY is reported on the console */
+	if(tflag && consTTY() == 0)
+		consPrint("%s\n", vtGetError());
+
+	vtDetach();
+	exits(0);
+}

+ 819 - 0
sys/src/cmd/fossil/fs.c

@@ -0,0 +1,819 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+static void fsMetaFlush(void *a);
+static Snap *snapInit(Fs*);
+static void snapClose(Snap*);
+
+/*
+ * Open the file system stored in file.
+ *
+ * mode must be OReadOnly or OReadWrite; any other value sets
+ * EBadMode and returns nil.  The super block supplies the epoch
+ * range and the address of the active root.  In OReadWrite mode
+ * the archiver, the periodic metadata flusher (every 1000 ms)
+ * and the snapshot timer are started as well.
+ *
+ * Returns the new Fs, or nil on failure.  Once the Fs has been
+ * allocated, failures are cleaned up by calling fsClose on it.
+ */
+Fs *
+fsOpen(char *file, VtSession *z, long ncache, int mode)
+{
+	Fs *fs;
+	Disk *disk;
+	int fd;
+	Block *b, *bs;
+	Super super;
+	int m;
+
+	/* translate the fossil mode into an open(2) mode */
+	switch(mode){
+	default:
+		vtSetError(EBadMode);
+		return nil;
+	case OReadOnly:
+		m = OREAD;
+		break;
+	case OReadWrite:
+		m = ORDWR;
+		break;
+	}
+	fd = open(file, m);
+	if(fd < 0){
+		vtOSError();
+		return nil;
+	}
+
+	bwatchInit();	
+	disk = diskAlloc(fd);
+	if(disk == nil){
+		close(fd);
+		return nil;
+	}
+
+	fs = vtMemAllocZ(sizeof(Fs));
+	fs->mode = mode;
+	fs->blockSize = diskBlockSize(disk);
+	fs->elk = vtLockAlloc();
+	fs->cache = cacheAlloc(disk, z, ncache, mode);
+	if(mode == OReadWrite)
+		fs->arch = archInit(fs->cache, disk, fs, z);
+	fs->z = z;
+
+	/* read the super block to learn the epochs and the active root */
+	b = cacheLocal(fs->cache, PartSuper, 0, mode);
+	if(b == nil)
+		goto Err;
+	if(!superUnpack(&super, b->data)){
+		blockPut(b);
+		goto Err;
+	}
+	blockPut(b);
+
+	fs->ehi = super.epochHigh;
+	fs->elo = super.epochLow;
+
+fprint(2, "fs->ehi %d fs->elo %d active=%d\n", fs->ehi, fs->elo, super.active);
+
+	fs->source = sourceRoot(fs, super.active, mode);
+	if(fs->source == nil){
+		/*
+		 * Perhaps it failed because the block is copy-on-write.
+		 * Do the copy and try again.
+		 */
+		if(mode == OReadOnly || strcmp(vtGetError(), EBadRoot) != 0)
+			goto Err;
+		b = cacheLocalData(fs->cache, super.active, BtDir, RootTag, OReadWrite, 0);
+		if(b == nil)
+			goto Err;
+		/* an open root already in the current epoch needs no copy, so give up */
+		if(!(b->l.state&BsClosed) && b->l.epoch == fs->ehi){
+			blockPut(b);
+			goto Err;
+		}
+		b = blockCopy(b, RootTag, fs->ehi, fs->elo);
+		if(b == nil)
+			goto Err;
+		/* point the super block at the copy; bs is made to depend on b */
+		super.active = b->addr;
+		bs = cacheLocal(fs->cache, PartSuper, 0, OReadWrite);
+		if(bs == nil){
+			blockPut(b);
+			goto Err;
+		}
+		superPack(&super, bs->data);
+		blockDependency(bs, b, -1, nil);
+		blockDirty(bs);
+		blockPut(bs);
+		blockPut(b);
+		fs->source = sourceRoot(fs, super.active, mode);
+		if(fs->source == nil)
+			goto Err;
+	}
+
+fprint(2, "got fs source\n");
+
+	vtRLock(fs->elk);
+	fs->file = fileRoot(fs->source);
+	vtRUnlock(fs->elk);
+	if(fs->file == nil)
+		goto Err;
+
+fprint(2, "got file root\n");
+
+	if(mode == OReadWrite){
+		fs->metaFlush = periodicAlloc(fsMetaFlush, fs, 1000);
+		fs->snap = snapInit(fs);
+	}
+	return fs;
+
+Err:
+	fsClose(fs);
+	return nil;
+}
+
+/*
+ * Shut down a file system: stop the periodic metadata flusher and
+ * the snapshot timer, flush and release the root file, then free
+ * the source, the cache, the archiver (if any), the epoch lock and
+ * the Fs itself.  Also used by fsOpen to clean up a partly built Fs.
+ */
+void
+fsClose(Fs *fs)
+{
+	vtRLock(fs->elk);
+	periodicKill(fs->metaFlush);
+	snapClose(fs->snap);
+	if(fs->file){
+		fileMetaFlush(fs->file, 0);
+		/* a zero return from fileDecRef is treated as fatal */
+		if(!fileDecRef(fs->file))
+			vtFatal("fsClose: files still in use: %r\n");
+	}
+	fs->file = nil;
+	sourceClose(fs->source);
+	cacheFree(fs->cache);
+	if(fs->arch)
+		archFree(fs->arch);
+	vtRUnlock(fs->elk);
+	vtLockFree(fs->elk);
+	/* overwrite with all-ones before freeing; presumably to expose stale uses */
+	memset(fs, ~0, sizeof(Fs));
+	vtMemFree(fs);
+}
+
+/*
+ * Redial the file system's Venti session to host and reconnect.
+ * Returns 1 if both steps succeed, 0 as soon as one fails.
+ */
+int
+fsRedial(Fs *fs, char *host)
+{
+	return vtRedial(fs->z, host) && vtConnect(fs->z, 0);
+}
+
+/*
+ * Return the root file of fs after passing it through fileIncRef.
+ */
+File *
+fsGetRoot(Fs *fs)
+{
+	File *root;
+
+	root = fs->file;
+	return fileIncRef(root);
+}
+
+/*
+ * Return the block size recorded in fs when it was opened.
+ */
+int
+fsGetBlockSize(Fs *fs)
+{
+	int size;
+
+	size = fs->blockSize;
+	return size;
+}
+
+/*
+ * Fetch the super block read-write from cache c and unpack it
+ * into *super.  Returns the block, which the caller must release
+ * (superPut or blockPut), or nil on failure after printing a
+ * diagnostic.
+ */
+Block*
+superGet(Cache *c, Super* super)
+{
+	Block *b;
+
+	if((b = cacheLocal(c, PartSuper, 0, OReadWrite)) == nil){
+		/* terminate the line, like the other %R diagnostics in this file */
+		fprint(2, "superGet: cacheLocal failed: %R\n");
+		return nil;
+	}
+	if(!superUnpack(super, b->data)){
+		fprint(2, "superGet: superUnpack failed: %R\n");
+		blockPut(b);
+		return nil;
+	}
+
+	return b;
+}
+
+/*
+ * Pack super into the super block b, mark b dirty and release it.
+ * If forceWrite is set, retry blockWrite every 10 seconds until it
+ * succeeds and then wait for the write to finish before releasing
+ * the block.
+ */
+void
+superPut(Block* b, Super* super, int forceWrite)
+{
+	superPack(super, b->data);
+	blockDirty(b);
+	if(forceWrite){
+		while(!blockWrite(b)){
+			/* BUG: what should really happen here? */
+			fprint(2, "could not write super block; waiting 10 seconds\n");
+			/*
+			 * sleep takes milliseconds.  This used to read 10*000,
+			 * which is ten times octal zero: no delay at all.
+			 */
+			sleep(10*1000);
+		}
+		while(b->iostate != BioClean && b->iostate != BioDirty){
+			assert(b->iostate == BioWriting);
+			vtSleep(b->ioready);
+		}
+		/*
+		 * it's okay that b might still be dirty.
+		 * that means it got written out but with an old root pointer,
+		 * but the other fields went out, and those are the ones
+		 * we really care about.  (specifically, epochHigh; see fsSnapshot).
+		 */
+	}
+	blockPut(b);
+}
+
+/*
+ * Prepare the directory to store a snapshot.
+ * Temporary snapshots go into /snapshot/yyyy/mmdd/hhmm.
+ * Archival snapshots go into /archive/yyyy/mmdd[.#].
+ *
+ * Returns the newly created directory, or nil on failure.
+ * A temporary snapshot fails if its hhmm directory already exists;
+ * an archival snapshot tries .1, .2, ... until it finds a free name.
+ *
+ * TODO This should be rewritten to eliminate most of the duplication.
+ */
+static File*
+fileOpenSnapshot(Fs *fs, int doarchive)
+{
+	int n;
+	char buf[30], *s;
+	File *dir, *f;
+	Tm now;
+
+	if(doarchive){
+		/*
+		 * a snapshot intended to be archived to venti.
+		 */
+		dir = fileOpen(fs, "/archive");
+		if(dir == nil)
+			return nil;
+		now = *localtime(time(0));
+
+		/* yyyy */
+		snprint(buf, sizeof(buf), "%d", now.year+1900);
+		f = fileWalk(dir, buf);
+		if(f == nil)
+			f = fileCreate(dir, buf, ModeDir|0555, "adm");
+		fileDecRef(dir);
+		if(f == nil)
+			return nil;
+		dir = f;
+
+		/* mmdd[.#]: s marks where the optional .# suffix is appended */
+		snprint(buf, sizeof(buf), "%02d%02d", now.mon+1, now.mday);
+		s = buf+strlen(buf);
+		for(n=0;; n++){
+			if(n)
+				seprint(s, buf+sizeof(buf), ".%d", n);
+			f = fileWalk(dir, buf);
+			if(f != nil){
+				/* name taken; try the next suffix */
+				fileDecRef(f);
+				continue;
+			}
+			f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm");
+			break;
+		}
+		fileDecRef(dir);
+		return f;
+	}else{
+		/*
+		 * Just a temporary snapshot
+		 * We'll use /snapshot/yyyy/mmdd/hhmm.
+		 * There may well be a better naming scheme.
+		 * (I'd have used hh:mm but ':' is reserved in Microsoft file systems.)
+		 */
+		dir = fileOpen(fs, "/snapshot");
+		if(dir == nil)
+			return nil;
+
+/*
+ * used to do /snapshot/#
+ * 
+		for(n=0;; n++){
+			if(n)
+				seprint(s, buf+sizeof(buf), ".%d", n);
+			f = fileWalk(dir, buf);
+			if(f != nil){
+				fileDecRef(f);
+				continue;
+			}
+			f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm");
+			break;
+		}
+		dir = fileOpen(fs, "/snapshot");
+		if(dir == nil)
+			return nil;
+		snprint(buf, sizeof(buf), "%d", fs->ehi);
+		f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm");
+		fileDecRef(dir);
+		return f;
+*/
+
+		now = *localtime(time(0));
+
+		/* yyyy */
+		snprint(buf, sizeof(buf), "%d", now.year+1900);
+		f = fileWalk(dir, buf);
+		if(f == nil)
+			f = fileCreate(dir, buf, ModeDir|0555, "adm");
+		fileDecRef(dir);
+		if(f == nil)
+			return nil;
+		dir = f;
+
+		/* mmdd */
+		snprint(buf, sizeof(buf), "%02d%02d", now.mon+1, now.mday);
+		f = fileWalk(dir, buf);
+		if(f == nil)
+			f = fileCreate(dir, buf, ModeDir|0555, "adm");
+		fileDecRef(dir);
+		if(f == nil)
+			return nil;
+		dir = f;
+
+		/* hhmm: must not exist yet */
+		snprint(buf, sizeof buf, "%02d%02d", now.hour, now.min);
+		f = fileWalk(dir, buf);
+		if(f != nil){
+			fileDecRef(f);
+			fileDecRef(dir);
+			fprint(2, "/snapshot/%d/%02d%02d/%s already exists!\n",
+				now.year+1900, now.mon+1, now.mday, buf);
+			return nil;
+		}
+		f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm");
+		fileDecRef(dir);
+		return f;
+	}
+}
+
+/*
+ * Set the file system's low epoch to low and force the updated
+ * super block to disk.  low may not exceed the current high epoch.
+ * Returns 1 on success, 0 on failure.  Runs with fs->elk held.
+ */
+int
+fsEpochLow(Fs *fs, u32int low)
+{
+	Block *sb;
+	Super super;
+	int ok;
+
+	ok = 0;
+	vtLock(fs->elk);
+	if(low > fs->ehi)
+		vtSetError("bad low epoch (must be <= %ud)", fs->ehi);
+	else if((sb = superGet(fs->cache, &super)) != nil){
+		super.epochLow = low;
+		fs->elo = low;
+		superPut(sb, &super, 1);
+		ok = 1;
+	}
+	vtUnlock(fs->elk);
+
+	return ok;
+}
+
+/*
+ * Advance the file system's high epoch by one: copy the root block
+ * into the new epoch, point the super block at the copy, and force
+ * the super block to disk.  If doarchive is set, the previous active
+ * root is also recorded in super.next.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+bumpEpoch(Fs *fs, int doarchive)
+{
+	uchar score[VtScoreSize];
+	u32int oldaddr;
+	Block *b, *bs;
+	Entry e;
+	Source *r;
+	Super super;
+
+	/*
+	 * Duplicate the root block.
+	 *
+	 * As a hint to flchk, the garbage collector,
+	 * and any (human) debuggers, store a pointer
+	 * to the old root block in entry 1 of the new root block.
+	 */
+	r = fs->source;
+	b = cacheGlobal(fs->cache, r->score, BtDir, RootTag, OReadOnly);
+	if(b == nil)
+		return 0;
+
+	memset(&e, 0, sizeof e);
+	e.flags = VtEntryActive | VtEntryLocal | VtEntryDir;
+	memmove(e.score, b->score, VtScoreSize);
+	e.tag = RootTag;
+	e.snap = b->l.epoch;
+	oldaddr = b->addr;
+
+	b = blockCopy(b, RootTag, fs->ehi+1, fs->elo);
+	if(b == nil){
+		fprint(2, "bumpEpoch: blockCopy: %R\n");
+		return 0;
+	}
+
+	if(0) fprint(2, "snapshot root from %d to %d\n", oldaddr, b->addr);
+	entryPack(&e, b->data, 1);
+	blockDirty(b);
+
+	/*
+	 * Update the superblock with the new root and epoch.
+	 */
+	if((bs = superGet(fs->cache, &super)) == nil){
+		/* don't leak the freshly copied root block */
+		blockPut(b);
+		return 0;
+	}
+
+	fs->ehi++;
+	memmove(r->score, b->score, VtScoreSize);
+	r->epoch = fs->ehi;
+
+	super.epochHigh = fs->ehi;
+	oldaddr = super.active;
+	super.active = b->addr;
+	if(doarchive)
+		super.next = oldaddr;
+
+	/*
+	 * Record that the new super.active can't get written out until
+	 * the new b gets written out.  Until then, use the old value.
+	 */
+	localToGlobal(oldaddr, score);
+	blockDependency(bs, b, 0, score);
+	blockPut(b);
+
+	/*
+	 * We force the super block to disk so that super.epochHigh gets updated.
+	 * Otherwise, if we crash and come back, we might incorrectly treat as active
+	 * some of the blocks that make up the snapshot we just created.
+	 * Basically every block in the active file system and all the blocks in
+	 * the recently-created snapshot depend on the super block now.
+	 * Rather than record all those dependencies, we just force the block to disk.
+	 *
+	 * Note that blockWrite might actually (will probably) send a slightly outdated
+	 * super.active to disk.  It will be the address of the most recent root that has
+	 * gone to disk.
+	 */
+	superPut(bs, &super, 1);
+
+	return 1;
+}
+
+/*
+ * Read the current qid counter from the super block and pass it
+ * to fileSetQidSpace on the root file as the upper bound (the
+ * lower bound given is 0).  Returns 1 on success, 0 on failure.
+ */
+int
+saveQid(Fs *fs)
+{
+	Block *sb;
+	Super super;
+	u64int maxqid;
+
+	sb = superGet(fs->cache, &super);
+	if(sb == nil)
+		return 0;
+	maxqid = super.qid;
+	blockPut(sb);
+
+	return fileSetQidSpace(fs->file, 0, maxqid) ? 1 : 0;
+}
+
+/*
+ * Take a snapshot of /active.  With doarchive set the snapshot is
+ * stored under /archive and the archiver is kicked afterwards;
+ * otherwise it is stored under /snapshot.
+ * The file system must have been opened OReadWrite.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+fsSnapshot(Fs *fs, int doarchive)
+{
+	File *src, *dst;
+
+	assert(fs->mode == OReadWrite);
+
+	dst = nil;
+
+	/*
+	 * Freeze file system activity.
+	 */
+	vtLock(fs->elk);
+
+	/*
+	 * Get the root of the directory we're going to save.
+	 */
+	src = fileOpen(fs, "/active");
+	if(src == nil)
+		goto Err;
+
+	/*
+	 * It is important that we maintain the invariant that:
+	 *	if both b and bb are marked as Active with epoch e
+	 *	and b points at bb, then no other pointers to bb exist.
+	 *
+	 * The archiver uses this property to aggressively reclaim
+	 * such blocks once they have been stored on Venti, and
+	 * blockCleanup knows about this property as well.
+	 *
+	 * Let's say src->source is block sb, and src->msource is block
+	 * mb.  Let's also say that block b holds the Entry structures for
+	 * both src->source and src->msource (their Entry structures might
+	 * be in different blocks, but the argument is the same).
+	 * That is, right now we have:
+	 *
+	 *	b	Active w/ epoch e, holds ptrs to sb and mb.
+	 *	sb	Active w/ epoch e.
+	 *	mb	Active w/ epoch e.
+	 *
+	 * With things as they are now, the invariant requires that
+	 * b holds the only pointers to sb and mb.  We want to record
+	 * pointers to sb and mb in new Entries corresponding to dst,
+	 * which breaks the invariant.  Thus we need to do something
+	 * about b.  Specifically, we bump the file system's epoch and
+	 * then rewalk the path from the root down to and including b.
+	 * This will copy-on-write as we walk, so now the state will be:
+	 *
+	 *	b	Snap w/ epoch e, holds ptrs to sb and mb.
+	 *	new-b	Active w/ epoch e+1, holds ptrs to sb and mb.
+	 *	sb	Active w/ epoch e.
+	 *	mb	Active w/ epoch e.
+	 *
+	 * In this state, it's perfectly okay to add pointers to dst, which
+	 * will live in a block marked Active with epoch e+1.
+	 *
+	 * Of course, we need to make sure that the copied path makes
+	 * it out to disk before the new dst block; if the dst block goes out
+	 * first and then we crash, the invariant is violated.  Rather than
+	 * deal with the dependencies, we just sync the file system to disk
+	 * right now.
+	 */
+	if(!bumpEpoch(fs, 0) || !fileWalkSources(src))
+		goto Err;
+
+	/*
+	 * Sync to disk.
+	 */
+	cacheFlush(fs->cache, 1);
+
+	/*
+	 * Create the directory where we will store the copy of src.
+	 */
+	dst = fileOpenSnapshot(fs, doarchive);
+	if(dst == nil)
+		goto Err;
+
+	/*
+	 * Actually make the copy by setting dst's source and msource
+	 * to be src's.
+	 */
+	if(!fileSnapshot(dst, src, fs->ehi-1, doarchive))
+		goto Err;
+
+	/*
+	 * Done with both files.  Clear the pointers so that the Err
+	 * path below cannot release them a second time.
+	 */
+	fileDecRef(src);
+	fileDecRef(dst);
+	src = nil;
+	dst = nil;
+
+	/*
+	 * Make another copy of the file system.  This one is for the
+	 * archiver, so that the file system we archive has the recently
+	 * added snapshot both in /active and in /archive/yyyy/mmdd[.#].
+	 */
+	if(doarchive){
+		if(!saveQid(fs))
+			goto Err;
+		if(!bumpEpoch(fs, 1))
+			goto Err;
+	}
+
+	vtUnlock(fs->elk);
+
+	/* BUG? can fs->arch fall out from under us here? */
+	if(doarchive && fs->arch)
+		archKick(fs->arch);
+
+	return 1;
+
+Err:
+	fprint(2, "fsSnapshot: %R\n");
+	if(src)
+		fileDecRef(src);
+	if(dst)
+		fileDecRef(dst);
+	vtUnlock(fs->elk);
+	return 0;
+}
+
+/*
+ * Build a vac root for the file called name and store its score
+ * in score.  Runs with fs->elk read-locked.
+ * Returns mkVac's result, or 0 if the file or its sources and
+ * directory entry cannot be fetched.
+ */
+int
+fsVac(Fs *fs, char *name, uchar score[VtScoreSize])
+{
+	int ok;
+	DirEntry de;
+	Entry e, ee;
+	File *f;
+
+	ok = 0;
+	vtRLock(fs->elk);
+	f = fileOpen(fs, name);
+	if(f != nil){
+		ok = fileGetSources(f, &e, &ee, 0) && fileGetDir(f, &de);
+		fileDecRef(f);
+		if(ok)
+			ok = mkVac(fs->z, fs->blockSize, &e, &ee, &de, score);
+	}
+	vtRUnlock(fs->elk);
+	return ok;
+}
+
+/*
+ * Write the n bytes at buf to Venti as a block of the given type,
+ * then check the returned score against the data with vtSha1Check.
+ * Returns 1 if both steps succeed, 0 otherwise.
+ */
+static int
+vtWriteBlock(VtSession *z, uchar *buf, uint n, uint type, uchar score[VtScoreSize])
+{
+	return vtWrite(z, score, type, buf, n) && vtSha1Check(score, buf, n);
+}
+
+/*
+ * Write a vac root to Venti session z for the tree described by
+ * the source entry *pe, the metadata-source entry *pee and the
+ * directory entry *pde, and return the root's score in score.
+ * Both sources must already live on Venti (no local scores).
+ * Returns 1 on success, 0 on failure with the error string set.
+ */
+int
+mkVac(VtSession *z, uint blockSize, Entry *pe, Entry *pee, DirEntry *pde, uchar score[VtScoreSize])
+{
+	uchar buf[8192];
+	int i;
+	uchar *p;
+	uint n;
+	DirEntry de;
+	Entry e, ee, eee;
+	MetaBlock mb;
+	MetaEntry me;
+	VtRoot root;
+
+	e = *pe;
+	ee = *pee;
+	de = *pde;
+
+	if(globalToLocal(e.score) != NilBlock
+	|| (ee.flags&VtEntryActive && globalToLocal(ee.score) != NilBlock)){
+		vtSetError("can only vac paths already stored on venti");
+		return 0;
+	}
+
+	/*
+	 * Build metadata source for root.
+	 */
+	n = deSize(&de);
+	if(n+MetaHeaderSize+MetaIndexSize > sizeof buf){
+		vtSetError("DirEntry too big");
+		return 0;
+	}
+	memset(buf, 0, sizeof buf);
+	mbInit(&mb, buf, n+MetaHeaderSize+MetaIndexSize, 1);
+	p = mbAlloc(&mb, n);
+	if(p == nil)
+		abort();
+	mbSearch(&mb, de.elem, &i, &me);
+	assert(me.p == nil);
+	me.p = p;
+	me.size = n;
+	dePack(&de, &me);
+	mbInsert(&mb, i, &me);
+	mbPack(&mb);
+
+	/*
+	 * Start from a zeroed entry: entryPack writes eee.gen, which
+	 * would otherwise be whatever happened to be on the stack.
+	 */
+	memset(&eee, 0, sizeof eee);
+	eee.size = n+MetaHeaderSize+MetaIndexSize;
+	if(!vtWriteBlock(z, buf, eee.size, VtDataType, eee.score))
+		return 0;
+	eee.psize = 8192;
+	eee.dsize = 8192;
+	eee.depth = 0;
+	eee.flags = VtEntryActive;
+
+	/*
+	 * Build root source with three entries in it.
+	 */
+	entryPack(&e, buf, 0);
+	entryPack(&ee, buf, 1);
+	entryPack(&eee, buf, 2);
+
+	n = VtEntrySize*3;
+	memset(&root, 0, sizeof root);
+	if(!vtWriteBlock(z, buf, n, VtDirType, root.score))
+		return 0;
+
+	/*
+	 * Save root.
+	 */
+	root.version = VtRootVersion;
+	strcpy(root.type, "vac");
+	strecpy(root.name, root.name+sizeof root.name, de.elem);
+	root.blockSize = blockSize;
+	vtRootPack(&root, buf);
+	if(!vtWriteBlock(z, buf, VtRootSize, VtRootType, score))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Flush the cache (cacheFlush with its second argument set)
+ * while holding fs->elk.  Always returns 1.
+ */
+int
+fsSync(Fs *fs)
+{
+	VtLock *elk;
+
+	elk = fs->elk;
+	vtLock(elk);
+	cacheFlush(fs->cache, 1);
+	vtUnlock(elk);
+	return 1;
+}
+
+/*
+ * Hand out the next qid from the super block's counter.
+ * Stores the qid in *qid and returns 1, or returns 0 if the
+ * super block cannot be fetched.
+ */
+int
+fsNextQid(Fs *fs, u64int *qid)
+{
+	Block *sb;
+	Super super;
+
+	sb = superGet(fs->cache, &super);
+	if(sb == nil)
+		return 0;
+
+	*qid = super.qid;
+	super.qid++;
+
+	/*
+	 * The super block need not reach disk right away:
+	 * fileMetaAlloc records a dependency between the block
+	 * holding this qid and the super block.  See file.c:/^fileMetaAlloc.
+	 */
+	superPut(sb, &super, 0);
+	return 1;
+}
+
+/*
+ * Periodic callback installed by fsOpen: flush the root file's
+ * metadata under a read lock on fs->elk, then call cacheFlush
+ * with its second argument clear.
+ */
+static void
+fsMetaFlush(void *a)
+{
+	Fs *fs;
+
+	fs = a;
+	vtRLock(fs->elk);
+	fileMetaFlush(fs->file, 1);
+	vtRUnlock(fs->elk);
+
+	cacheFlush(fs->cache, 0);
+}
+
+/*
+ * State for the timer that takes scheduled snapshots of one Fs.
+ */
+struct Snap
+{
+	Fs *fs;			/* file system to snapshot */
+	Periodic *tick;		/* timer that calls snapEvent */
+	VtLock *lk;		/* protects the fields below */
+	uint snapMinutes;	/* temporary snapshot interval in minutes; ~0 or 0 disables */
+	uint archMinute;	/* minute of the day for archival snapshots; ~0 disables */
+	u32int lastSnap;	/* minute (time/60) of the last temporary snapshot attempt */
+	u32int lastArch;	/* minute (time/60) of the last archival snapshot attempt */
+	uint ignore;		/* set by snapInit; not read anywhere in this file */
+};
+
+/*
+ * Periodic callback: decide whether a temporary or an archival
+ * snapshot is due in the current minute and, if so, take it.
+ * lastSnap and lastArch ensure at most one attempt of each kind
+ * per minute even though the timer fires more often than that.
+ */
+static void
+snapEvent(void *v)
+{
+	Snap *s;
+	u32int now, min;
+	Tm tm;
+
+	s = v;
+
+	now = time(0)/60;	/* minutes since the epoch */
+	vtLock(s->lk);
+
+	/*
+	 * Snapshots happen every snapMinutes minutes.
+	 * If we miss a snapshot (for example, because we
+	 * were down), we wait for the next one.
+	 */
+	if(s->snapMinutes != ~0 && s->snapMinutes != 0
+	&& now%s->snapMinutes==0 && now != s->lastSnap){
+if(0)fprint(2, "snapshot %02d%02d\n", now/60, now%60);
+		if(!fsSnapshot(s->fs, 0))
+			fprint(2, "fsSnapshot snap: %R\n");
+		s->lastSnap = now;
+	}
+
+	/*
+	 * Archival snapshots happen at archMinute.
+	 */
+	tm = *localtime(now*60);
+	min = tm.hour*60+tm.min;	/* minute of the local day */
+	if(s->archMinute != ~0 && min == s->archMinute && now != s->lastArch){
+if(0)fprint(2, "archive %02d%02d\n", now/60, now%60);
+		if(!fsSnapshot(s->fs, 1))
+			fprint(2, "fsSnapshot arch: %R\n");
+		s->lastArch = now;
+	}
+	vtUnlock(s->lk);
+}
+
+/*
+ * Allocate the snapshot timer state for fs, with both schedules
+ * disabled (-1), and start a timer that calls snapEvent every
+ * 10 seconds.
+ */
+static Snap*
+snapInit(Fs *fs)
+{
+	Snap *s;
+
+	s = vtMemAllocZ(sizeof(Snap));
+	s->fs = fs;
+	s->lk = vtLockAlloc();
+	s->snapMinutes = -1;
+	s->archMinute = -1;
+	s->ignore = 5*2;	/* wait five minutes for clock to stabilize */
+
+	/*
+	 * Start the timer last, so that snapEvent can never
+	 * run against a Snap whose lock has not been allocated.
+	 */
+	s->tick = periodicAlloc(snapEvent, s, 10*1000);
+	return s;
+}
+
+/*
+ * Report the current schedule under s->lk: *snap receives
+ * snapMinutes and *arch receives archMinute.
+ */
+void
+snapGetTimes(Snap *s, u32int *arch, u32int *snap)
+{
+	vtLock(s->lk);
+	*snap = s->snapMinutes;
+	*arch = s->archMinute;
+	vtUnlock(s->lk);
+}
+
+/*
+ * Replace the schedule under s->lk: snap becomes snapMinutes
+ * and arch becomes archMinute.
+ */
+void
+snapSetTimes(Snap *s, u32int arch, u32int snap)
+{
+	vtLock(s->lk);
+	s->snapMinutes = snap;
+	s->archMinute = arch;
+	vtUnlock(s->lk);
+}
+
+/*
+ * Stop the snapshot timer and free its state.  A nil s is ignored,
+ * which lets fsClose call this for read-only file systems and for
+ * a partly built Fs.
+ */
+static void
+snapClose(Snap *s)
+{
+	if(s == nil)
+		return;
+
+	/*
+	 * periodicKill takes the timer's lock, which the timer thread
+	 * holds while it is inside snapEvent; once it returns no
+	 * callback is running or will run, so s->lk can be freed too.
+	 */
+	periodicKill(s->tick);
+	vtLockFree(s->lk);
+	vtMemFree(s);
+}
+

+ 48 - 0
sys/src/cmd/fossil/fs.h

@@ -0,0 +1,48 @@
+typedef struct Fs Fs;
+typedef struct File File;
+typedef struct DirEntryEnum DirEntryEnum;
+
+/*
+ * modes: values for the mode argument of fsOpen.
+ * fsOpen accepts only OReadOnly and OReadWrite and rejects
+ * anything else with EBadMode.
+ */
+
+enum {
+	OReadOnly,
+	OReadWrite,
+	OOverWrite,
+};
+
+Fs *fsOpen(char*, VtSession*, long, int);
+void fsClose(Fs*);
+File *fsGetRoot(Fs*);
+int fsSnapshot(Fs*, int);
+int fsSync(Fs*);
+int fsVac(Fs*, char*, uchar[VtScoreSize]);
+int fsRedial(Fs*, char*);
+int fsEpochLow(Fs*, u32int);
+
+File *fileOpen(Fs*, char*);
+File *fileCreate(File*, char*, ulong, char*);
+File *fileWalk(File*, char*);
+int fileRemove(File*, char*);
+int fileClri(Fs*, char*, char*);
+int fileRead(File*, void *, int, vlong);
+int fileWrite(File*, void *, int, vlong, char*);
+uvlong fileGetId(File*);
+ulong fileGetMcount(File*);
+int fileIsDir(File*);
+int fileGetSize(File*, uvlong*);
+int fileGetDir(File*, DirEntry*);
+int fileSetDir(File*, DirEntry*, char*);
+File *fileGetParent(File*);
+int fileSync(File*);
+File *fileIncRef(File*);
+int fileDecRef(File*);
+int fileIsRoot(File*);
+void fileMetaFlush(File*, int);
+int fileSetQidSpace(File*, u64int, u64int);
+int fileTruncate(File*, char*);
+int fileIsRoFs(File*);
+ulong fileGetMode(File*);
+DirEntryEnum *deeOpen(File*);
+int deeRead(DirEntryEnum*, DirEntry*);
+void deeClose(DirEntryEnum*);
+int fileWalkSources(File*);

+ 17 - 0
sys/src/cmd/fossil/history

@@ -0,0 +1,17 @@
+changes since initial alpha release
+
+5 jan 2003
+	add -v flag to flfmt as documented
+	add "con /srv/fscons" to fossilcons(8) synopsis
+	add -AWP to the initialization example in fossil(4).
+	change users to print "no file" if the user table is 
+		not backed by a file.
+	change snapClose not to die when s==nil
+	correct handling of file truncation to specific size
+	disable the close command for now
+
+7 jan 2003
+	make fossil chatter a bit less to stderr.  errors
+	still go to stderr.
+
+

+ 121 - 0
sys/src/cmd/fossil/invariants

@@ -0,0 +1,121 @@
+.EQ
+delim $#
+.EN
+.NH 3
+Invariants
+.HP
+Reclamation is tricky enough to warrant explicit statement
+of the invariants that are needed and the reasons they are true.
+This section will use the notation
+$b.e#
+and
+$b.e sub 1#
+to denote the allocation and
+closing epochs of block
+$b#.
+The invariants are:
+.IP (i)
+If $b# points at $bb#, then $bb.e <= b.e < bb.e sub 1#.
+.IP (ii)
+If $b# points at $bb#, then no other block $b'# with $b'.e = b.e# points at $bb#.
+.IP (iii)
+If $b# is not marked
+.CW BsCopied
+and points at $bb# such that $b.e = bb.e#, then no other block $b'# points at $bb#.
+.IP (iv)
+If $b# is in the active file system and points at $bb# then no other block $b'# in the
+active file system points at $bb#.
+.IP (v)
+If $b'# is a (possibly indirect) copy of $b#, then only one of $b# and $b'# is in the active file system.
+.LP
+Invariant (i) lets us reclaim blocks using the file system low epoch.
+Invariant (iii) lets us reclaim some blocks immediately once they are unlinked.
+Invariants (ii), (iv), and (v) are helpful in proving (i) and (iii); collectively they
+say that taking snapshots doesn't break the active file system.
+.PP
+Freshly allocated blocks start filled with nil pointers,
+and thus satisfy all the invariants.  We need to check that
+copying a block, zeroing a pointer, and setting a pointer
+preserve the invariants.
+.LP
+$"BlockCopy" (b)#
+allocates a new block
+$b'# and copies the active and open block $b# into $b'#.
+.IP (i)
+Since $b# is open, all the blocks $bb# it points to are also
+active, and thus they have $bb.e sub 1# set to positive infinity
+(well,
+.CW ~0 ).
+Thus (i) is satisfied.
+.IP (ii)
+Since $b'.e# will be set to the current epoch, and $b.e# is less
+than the current epoch (it's copy-on-write), $b.e < b'.e# so (ii)
+is vacuously satisfied.
+.IP (iii)
+Since $b.e < b'.e#, all the pointers in $b# are to blocks with epochs less than $b'.e#.
+Thus (iii) is vacuously satisfied for $b'#.
+Since $"blockCopy"# sets the
+.CW BsCopied
+flag, (iii) is vacuously satisfied for $b#.
+.IP (iv),(v)
+Since no pointers to $b# or $b'# were modified,
+(iv) and (v) are unchanged.
+.LP
+$"BlockRemoveLink" (b -> bb)# removes from block $b# the pointer to $bb#.
+.IP
+Zeroing a pointer only restricts the preconditions on the 
+invariants, so it's always okay.
+By (iii), if $b# is not
+.CW BsCopied
+and $b.e = bb.e#, then no other $b'# anywhere
+points at $bb#, so $bb# can be freed.
+.LP
+$"BlockSetLink" (b->bb sub 0 , bb sub 1)# changes the pointer in block $b# from $bb sub 0# to $bb sub 1#.
+We derive sufficient conditions on $bb sub 1#, and then
+examine the possible values of $bb sub 0# and $bb sub 1#.
+.IP (i)
+Since we're changing $b#, $b.e# is the current epoch.
+If $bb sub 1# is open, then (i) is satisfied.
+.IP (ii)
+If either $b.e != bb sub 1 .e# or $bb sub 1# is an orphan, then (ii) is satisfied.
+.IP (iii)
+If either $b.e != bb sub 1 .e# or $b# is marked
+.CW BsCopied
+or $bb sub 1# is an orphan, then (iii) is satisfied.
+.IP (iv)
+If $bb sub 1# is not currently active or $bb sub 1# is an orphan, then (iv) is satisfied.
+.IP (v)
+If $bb sub 1# is a copy of $bb sub 0# or $bb sub 1# is empty, then (v) is satisfied.
+.LP
+$"BlockSetLink" (b -> bb sub 0 , "blockAlloc" ())# allocates a new block and points $b# at it.
+.IP
+Since $bb sub 1# in this case is newly allocated, it is open, an orphan, and empty, and thus
+the invariants are satisfied.
+.LP
+$"BlockSetLink" (b -> bb sub 0 , "blockCopy" (bb sub 0 ))# copies $bb sub 0# and points
+$b# at the copy.
+.IP
+Since $bb sub 1# is newly allocated, it is open and an orphan.  Thus (i)-(iv) are satisfied.
+Since $bb sub 1# is a copy of $bb sub 0#, (v) is satisfied.
+.LP
+$"BlockSetLink" (b -> "nil" , "oldRoot" )# changes a nil pointer to point
+at a snapshot root.
+.IP (i)
+Invariant (i) is broken, but the 
+.CW snap
+field in the entry will be used to make sure
+we don't access the snapshot after it has been reclaimed.
+.IP (ii)
+Since the epoch of $"oldRoot"# is less than the current epoch but $b.e# is equal
+to the current epoch, (ii) is vacuously true.
+.IP (iii)
+XXX
+.IP (iv)
+XXX
+.IP (v)
+XXX
+.PP
+Ta da!
+xxx
+yyyy
+zzz

+ 96 - 0
sys/src/cmd/fossil/mkfile

@@ -0,0 +1,96 @@
+</$objtype/mkfile
+BIN=/$objtype/bin/fossil
+
+TARG=fossil flchk flfmt
+
+LIBFILES=\
+	9p\
+	9auth\
+	9dir\
+	9excl\
+	9fid\
+	9fsys\
+	9lstn\
+	9proc\
+	9srv\
+	9user\
+	Ccmd\
+	Ccli\
+	Ccons\
+	Clog\
+	archive\
+	nobwatch\
+	cache\
+	disk\
+	error\
+	file\
+	fs\
+	pack\
+	periodic\
+	source\
+	vac\
+	walk\
+
+LIBCFILES=${LIBFILES:%=%.c}
+LIBOFILES=${LIBFILES:%=%.$O}
+LIB=libfs.a.$O
+
+HFILES=\
+	/sys/include/venti.h\
+	stdinc.h\
+	vac.h\
+	dat.h\
+	fns.h\
+	fs.h\
+	error.h\
+	9.h\
+
+# CFILES must be assigned before UPDATE: mk expands the variable
+# references in an assignment when the assignment is read, so a
+# later definition of CFILES would leave $CFILES empty here.
+CFILES=${TARG:%=%.c} $LIBCFILES
+
+UPDATE=\
+	mkfile\
+	$CFILES\
+	$HFILES\
+	${TARG:%=/386/bin/%}
+
+default:V: all
+
+test:V: all
+	rm -f /srv/test.fossil /srv/test.fscons
+	slay 8.flfmt | rc
+	slay 8.fossil | rc
+	unmount /n/fossil || status=''
+	8.flfmt -y /tmp/fossil
+	8.fossil -c '. flproto' && mount -c /srv/test.fossil /n/fossil
+#	cp /env/timezone /n/fossil/tmp
+#	cp /lib/words /n/fossil/tmp
+	dircp /sys/src/cmd/aux /n/fossil/tmp
+#	@{cd /n/fossil/tmp && time tar xTf /sys/src/cmd/fossil/test.tar}
+#	unmount /n/fossil
+#	rm /srv/fossil
+
+</sys/src/cmd/mkmany
+
+$LIB(%.$O):N: %.$O
+$LIB:	${LIBOFILES:%=$LIB(%)}
+	names = `{echo $newprereq |sed 's/ /\n/g' |sed -n 's/'$LIB'\(([^)]+)\)/\1/gp'}
+	ar vu $LIB $names
+#	rm $names
+
+%.page:V: %.ps
+	page -w $stem.ps
+
+%.ps:D: %.ms
+	tbl $stem.ms | pic | eqn | troff -ms | lp -dstdout >$target
+
+bundle:V:
+	rfork n
+	ramfs -m /n/kremvax >[2]/dev/null
+	bind -a /n/kremvax .
+	cp /sys/doc/fossil.ms /sys/doc/fossil.ps /n/kremvax
+	cp /sys/man/4/fossil /n/kremvax/fossil.4.man
+	cp /sys/man/8/fossilcons /n/kremvax/fossilcons.8.man
+	x=`{ls |grep -v 'TODO|test.tar|fossil.tar.gz'}
+	tar c $x | gzip > fossil.tar.gz

+ 39 - 0
sys/src/cmd/fossil/nobwatch.c

@@ -0,0 +1,39 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+/* Stub used when block watching is compiled out: does nothing. */
+void
+bwatchReset(uchar score[VtScoreSize])
+{
+	USED(score);
+}
+
+/* Stub used when block watching is compiled out: does nothing. */
+void
+bwatchInit(void)
+{
+}
+
+/*
+ * Stub used when block watching is compiled out: does nothing.
+ * The parameter is left unnamed because it is unused.
+ */
+void
+bwatchSetBlockSize(uint)
+{
+}
+
+/* Stub used when block watching is compiled out: does nothing. */
+void
+bwatchDependency(Block *b)
+{
+	USED(b);
+}
+
+/* Stub used when block watching is compiled out: does nothing. */
+void
+bwatchLock(Block *b)
+{
+	USED(b);
+}
+
+/* Stub used when block watching is compiled out: does nothing. */
+void
+bwatchUnlock(Block *b)
+{
+	USED(b);
+}
+

+ 226 - 0
sys/src/cmd/fossil/pack.c

@@ -0,0 +1,226 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+/*
+ * integer conversion routines
+ *
+ * All multi-byte values are stored most significant byte first.
+ * The GET macros are expressions.  The PUT macros, apart from
+ * U8PUT, expand to several statements, so they must not be used
+ * as the unbraced body of an if, else or loop.  U48PUT and U64PUT
+ * also need a 32-bit scratch variable, passed as t32.
+ * Macro arguments are evaluated more than once.
+ */
+#define	U8GET(p)	((p)[0])
+#define	U16GET(p)	(((p)[0]<<8)|(p)[1])
+#define	U32GET(p)	(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3])
+#define	U48GET(p)	(((uvlong)U16GET(p)<<32)|(uvlong)U32GET((p)+2))
+#define	U64GET(p)	(((uvlong)U32GET(p)<<32)|(uvlong)U32GET((p)+4))
+
+#define	U8PUT(p,v)	(p)[0]=(v)
+#define	U16PUT(p,v)	(p)[0]=(v)>>8;(p)[1]=(v)
+#define	U32PUT(p,v)	(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v)
+#define	U48PUT(p,v,t32)	t32=(v)>>32;U16PUT(p,t32);t32=(v);U32PUT((p)+2,t32)
+#define	U64PUT(p,v,t32)	t32=(v)>>32;U32PUT(p,t32);t32=(v);U32PUT((p)+4,t32)
+
+/*
+ * Serialise h into the HeaderSize bytes at p.
+ * Layout: magic at 0, version at 4, blockSize at 6 (two bytes each
+ * for the latter two), then super, label, data and end as 32-bit
+ * values at 8, 12, 16 and 20.  The magic and version written are
+ * the HeaderMagic and HeaderVersion constants, not fields of h.
+ */
+void
+headerPack(Header *h, uchar *p)
+{
+	memset(p, 0, HeaderSize);
+	U32PUT(p, HeaderMagic);
+	U16PUT(p+4, HeaderVersion);
+	U16PUT(p+6, h->blockSize);
+	U32PUT(p+8, h->super);
+	U32PUT(p+12, h->label);
+	U32PUT(p+16, h->data);
+	U32PUT(p+20, h->end);
+}
+
+/*
+ * Parse the header at p into h (inverse of headerPack).
+ * Returns 1 on success.  Returns 0 with the error string set if
+ * the magic number or the version does not match; in the version
+ * case h->version has already been overwritten.
+ */
+int
+headerUnpack(Header *h, uchar *p)
+{
+	if(U32GET(p) != HeaderMagic){
+		vtSetError("vac header bad magic");
+		return 0;
+	}
+	h->version = U16GET(p+4);
+	if(h->version != HeaderVersion){
+		vtSetError("vac header bad version");
+		return 0;
+	}
+	h->blockSize = U16GET(p+6);
+	h->super = U32GET(p+8);
+	h->label = U32GET(p+12);
+	h->data = U32GET(p+16);
+	h->end = U32GET(p+20);
+	return 1;
+}
+
+/*
+ * Serialise l into the i'th LabelSize-byte slot of p.
+ * Layout: state at 0, type at 1 (one byte each), then epoch,
+ * epochClose and tag as 32-bit values at 2, 6 and 10.
+ */
+void
+labelPack(Label *l, uchar *p, int i)
+{
+	p += i*LabelSize;
+	U8PUT(p, l->state);
+	U8PUT(p+1, l->type);
+	U32PUT(p+2, l->epoch);
+	U32PUT(p+6, l->epochClose);
+	U32PUT(p+10, l->tag);
+}
+
+/*
+ * Parse the i'th label in p into l (inverse of labelPack) and
+ * check it for consistency.  Returns 1 if the label is valid.
+ * Otherwise sets EBadLabel, prints the raw fields to standard
+ * error and returns 0; l has been filled in either way.
+ */
+int
+labelUnpack(Label *l, uchar *p, int i)
+{
+	p += i*LabelSize;
+	l->state = p[0];
+	l->type = p[1];
+	l->epoch = U32GET(p+2);
+	l->epochClose = U32GET(p+6);
+	l->tag = U32GET(p+10);
+
+	if(l->type > BtMax){
+	/* shared failure exit; the checks below jump here */
+	Bad:
+		vtSetError(EBadLabel);
+fprint(2, "labelUnpack %.2ux %.2ux %.8ux %.8ux %.8ux\n",
+	l->state, l->type, l->epoch, l->epochClose, l->tag);
+		return 0;
+	}
+	/*
+	 * Any state other than BsBad or BsFree must have BsAlloc set,
+	 * no bits outside BsMask, and an epochClose of ~0 exactly
+	 * when BsClosed is clear.
+	 */
+	if(l->state != BsBad && l->state != BsFree){
+		if(!(l->state&BsAlloc))
+			goto Bad;
+		if(l->state&~BsMask)
+			goto Bad;
+		if(l->state&BsClosed){
+			if(l->epochClose == ~(u32int)0)
+				goto Bad;
+		}else{
+			if(l->epochClose != ~(u32int)0)
+				goto Bad;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Convert a score to a local block address.  A score names a
+ * local block when all but its last four bytes are zero; the
+ * address is then those four bytes, most significant first.
+ * Any other score yields NilBlock.
+ */
+u32int
+globalToLocal(uchar score[VtScoreSize])
+{
+	uchar *p, *addr;
+
+	addr = score+VtScoreSize-4;
+	for(p=score; p<addr; p++){
+		if(*p != 0)
+			return NilBlock;
+	}
+
+	return U32GET(addr);
+}
+
+/*
+ * Build the score that names local block addr: zeros followed
+ * by addr in the last four bytes, most significant byte first.
+ */
+void
+localToGlobal(u32int addr, uchar score[VtScoreSize])
+{
+	memset(score, 0, VtScoreSize-4);
+	U32PUT(score+VtScoreSize-4, addr);
+}
+	
+/*
+ * Serialise e into the index'th VtEntrySize-byte slot of p.
+ * Layout: gen at 0 (4 bytes), psize at 4 and dsize at 6 (2 bytes
+ * each), flags combined with the depth at 8, zeros at 9-13, and
+ * size at 14 (6 bytes).  For an entry with VtEntryLocal set the
+ * rest is: zeros at 20-26, archive at 27, snap at 28, tag at 32,
+ * and the last four bytes of the score at 36.  Otherwise the full
+ * score is stored at 20.
+ * Aborts if a local entry's score is not a local address.
+ */
+void
+entryPack(Entry *e, uchar *p, int index)
+{
+	ulong t32;
+	int flags;
+
+	p += index * VtEntrySize;
+
+	U32PUT(p, e->gen);
+	U16PUT(p+4, e->psize);
+	U16PUT(p+6, e->dsize);
+	flags = e->flags | ((e->depth << VtEntryDepthShift) & VtEntryDepthMask);
+	U8PUT(p+8, flags);
+	memset(p+9, 0, 5);
+	U48PUT(p+14, e->size, t32);
+
+	if(flags & VtEntryLocal){
+		if(globalToLocal(e->score) == NilBlock)
+			abort();
+		memset(p+20, 0, 7);
+		U8PUT(p+27, e->archive);
+		U32PUT(p+28, e->snap);
+		U32PUT(p+32, e->tag);
+		/* offset 16 assumes VtScoreSize is 20 -- TODO confirm */
+		memmove(p+36, e->score+16, 4);
+	}else
+		memmove(p+20, e->score, VtScoreSize);
+}
+
+/*
+ * Parse the index'th entry in p into e (inverse of entryPack).
+ * The depth bits are moved out of flags into e->depth.  For a
+ * non-local entry archive, snap and tag are set to zero.
+ * Always returns 1.
+ */
+int
+entryUnpack(Entry *e, uchar *p, int index)
+{
+	p += index * VtEntrySize;
+
+	e->gen = U32GET(p);
+	e->psize = U16GET(p+4);
+	e->dsize = U16GET(p+6);
+	e->flags = U8GET(p+8);
+	e->depth = (e->flags & VtEntryDepthMask) >> VtEntryDepthShift;
+	e->flags &= ~VtEntryDepthMask;
+	e->size = U48GET(p+14);
+
+	if(e->flags & VtEntryLocal){
+		e->archive = p[27];
+		e->snap = U32GET(p+28);
+		e->tag = U32GET(p+32);
+		/* rebuild the score of a local block: zeros, then the address */
+		memset(e->score, 0, 16);
+		memmove(e->score+16, p+36, 4);
+	}else{
+		e->archive = 0;
+		e->snap = 0;
+		e->tag = 0;
+		memmove(e->score, p+20, VtScoreSize);
+	}
+
+	return 1;
+}
+
+/*
+ * Return the entry's depth with bit 3 (value 8) set in addition
+ * when the entry has VtEntryDir set.
+ */
+int
+entryType(Entry *e)
+{
+	int isdir;
+
+	isdir = (e->flags & VtEntryDir) != 0;
+	return (isdir << 3) | e->depth;
+}
+
+
+/*
+ * Serialise s into the SuperSize bytes at p.
+ * Layout: magic at 0, version at 4 (2 bytes), epochLow at 6,
+ * epochHigh at 10, qid at 14 (8 bytes), active at 22, next at 26,
+ * current at 30, last (a score) at 34, and name at 54.
+ * Asserts that s->version is SuperVersion.
+ */
+void
+superPack(Super *s, uchar *p)
+{
+	u32int t32;
+
+	memset(p, 0, SuperSize);
+	U32PUT(p, SuperMagic);
+	assert(s->version == SuperVersion);
+	U16PUT(p+4, s->version);
+	U32PUT(p+6, s->epochLow);
+	U32PUT(p+10, s->epochHigh);
+	U64PUT(p+14, s->qid, t32);
+	U32PUT(p+22, s->active);
+	U32PUT(p+26, s->next);
+	U32PUT(p+30, s->current);
+	memmove(p+34, s->last, VtScoreSize);
+	memmove(p+54, s->name, sizeof(s->name));
+}
+
+/*
+ * Parse the super block at p into s (inverse of superPack).
+ * Rejects a wrong magic number or version, a zero epochLow, an
+ * epochLow greater than epochHigh, and a zero qid.
+ * Returns 1 on success.  On failure s is zeroed, EBadSuper is
+ * set and 0 is returned.
+ */
+int
+superUnpack(Super *s, uchar *p)
+{
+	memset(s, 0, sizeof(*s));
+	if(U32GET(p) != SuperMagic)
+		goto Err;
+	s->version = U16GET(p+4);
+	if(s->version != SuperVersion)
+		goto Err;
+	s->epochLow = U32GET(p+6);
+	s->epochHigh = U32GET(p+10);
+	s->qid = U64GET(p+14);
+	if(s->epochLow == 0 || s->epochLow > s->epochHigh || s->qid == 0)
+		goto Err;
+	s->active = U32GET(p+22);
+	s->next = U32GET(p+26);
+	s->current = U32GET(p+30);
+	memmove(s->last, p+34, VtScoreSize);
+	memmove(s->name, p+54, sizeof(s->name));
+	/* force termination in case the on-disk name fills the field */
+	s->name[sizeof(s->name)-1] = 0;
+	return 1;
+Err:
+	memset(s, 0, sizeof(*s));
+	vtSetError(EBadSuper);
+	return 0;
+}
+

+ 84 - 0
sys/src/cmd/fossil/periodic.c

@@ -0,0 +1,84 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+/* State for one periodically invoked callback; see periodicThread. */
+struct Periodic {
+	VtLock *lk;		/* held while checking die and while f runs */
+	int die;		/* set by periodicKill; the thread exits when it sees it */
+	void (*f)(void*);	/* function called each period */
+	void *a;		/* argument passed to f */
+	int msec;		/* period in milliseconds; periodicAlloc enforces at least 10 */
+};
+
+static void periodicThread(void *a);
+
+/*
+ * Allocate a Periodic that calls f(a) every msec milliseconds
+ * (never more often than every 10) and start the thread that drives it.
+ */
+Periodic *
+periodicAlloc(void (*f)(void*), void *a, int msec)
+{
+	Periodic *per;
+
+	if(msec < 10)
+		msec = 10;
+
+	per = vtMemAllocZ(sizeof(Periodic));
+	per->lk = vtLockAlloc();
+	per->f = f;
+	per->a = a;
+	per->msec = msec;
+
+	vtThread(periodicThread, per);
+	return per;
+}
+	
+/*
+ * Ask the periodic thread for p to stop.  Only sets the die flag
+ * (under the lock); the thread frees p itself when it notices.
+ * A nil p is ignored.
+ */
+void
+periodicKill(Periodic *p)
+{
+	if(p != nil){
+		vtLock(p->lk);
+		p->die = 1;
+		vtUnlock(p->lk);
+	}
+}
+
+/* Release p's lock and then p itself; called by periodicThread on exit. */
+static void
+periodicFree(Periodic *p)
+{
+	vtLockFree(p->lk);
+	vtMemFree(p);
+}
+
+/*
+ * Thread body started by periodicAlloc; a is the Periodic.
+ * t is the next deadline and ct the current time, both in milliseconds
+ * derived from nsec().  Each time the deadline has passed, p->f(p->a) is
+ * called with p->lk held and the deadline advances by p->msec.
+ * The thread exits and frees p once p->die is set.
+ */
+static void
+periodicThread(void *a)
+{
+	Periodic *p = a;
+	double t, ct, ts;
+
+	vtThreadSetName("periodic");
+
+	ct = nsec()*1e-6;
+	t = ct + p->msec;
+
+	for(;;){
+		/* skip missed */
+		while(t <= ct)
+			t += p->msec;
+		
+		ts = t - ct;
+		if(ts > 1000)	/* sleep at most 1000ms so die is rechecked at least that often */
+			ts = 1000;
+		sleep(ts);
+		ct = nsec()*1e-6;
+		vtLock(p->lk);
+		if(p->die){
+			vtUnlock(p->lk);
+			break;
+		}
+		if(t <= ct){
+			p->f(p->a);
+			t += p->msec;
+		}
+		vtUnlock(p->lk);
+	}
+	periodicFree(p);
+}
+

+ 958 - 0
sys/src/cmd/fossil/source.c

@@ -0,0 +1,958 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+static int	sizeToDepth(uvlong s, int psize, int dsize);
+static u32int 	tagGen(void);
+static Block 	*sourceLoad(Source *r, Entry *e);
+static Block	*sourceLoadUnlocked(Source *r, Entry *e);
+static int	sourceShrinkDepth(Source*, Block*, Entry*, int);
+static int	sourceShrinkSize(Source*, Entry*, uvlong);
+static int	sourceGrowDepth(Source*, Block*, Entry*, int);
+
+#define sourceIsLocked(r)	((r)->b != nil)
+
+/*
+ * Build a Source for the entry at index offset%epb of directory block b.
+ * p is the locked parent source, or nil for the root, in which case
+ * offset must be 0.  The entry is validated first; on failure nil is
+ * returned with the error set to EBadEntry, ESnapRO or ESnapOld.
+ * On success the new source has ref 1 and holds a reference on p.
+ * The caller keeps ownership of b.
+ */
+static Source *
+sourceAlloc(Fs *fs, Block *b, Source *p, u32int offset, int mode)
+{
+	Source *r;
+	int epb;
+	u32int epoch;
+	Entry e;
+
+	assert(p==nil || sourceIsLocked(p));
+
+	if(p == nil){
+		assert(offset == 0);
+		epb = 1;
+	}else
+		epb = p->dsize / VtEntrySize;
+
+	if(b->l.type != BtDir)
+		goto Bad;
+
+	/*
+	 * a non-active entry is the only thing that
+	 * can legitimately happen here. all the others
+	 * get prints.
+	 */
+	if(!entryUnpack(&e, b->data, offset % epb)){
+		fprint(2, "entryUnpack failed\n");
+		goto Bad;
+	}
+	if(!(e.flags & VtEntryActive)){
+		if(0)fprint(2, "not active\n");
+		goto Bad;
+	}
+	if(e.psize < 256 || e.dsize < 256){
+		fprint(2, "psize %ud dsize %ud\n", e.psize, e.dsize);
+		goto Bad;
+	}
+
+	/* the recorded depth must be enough to address size bytes */
+	if(e.depth < sizeToDepth(e.size, e.psize, e.dsize)){
+		fprint(2, "depth %ud size %llud psize %ud dsize %ud\n", e.depth, e.size, e.psize, e.dsize);
+		goto Bad;
+	}
+
+	if((e.flags & VtEntryLocal) && e.tag == 0){
+		fprint(2, "flags %#ux tag %#ux\n", e.flags, e.tag);
+		goto Bad;
+	}
+
+	if(e.dsize > fs->blockSize || e.psize > fs->blockSize){
+		fprint(2, "psize %ud dsize %ud blocksize %ud\n", e.psize, e.dsize, fs->blockSize);
+		goto Bad;
+	}
+
+	/*
+	 * Entries with a non-zero snap cannot be opened OReadWrite.
+	 * When opened read-only, snap must lie in [fs->elo, fs->ehi)
+	 * and becomes the source's epoch.
+	 */
+	epoch = b->l.epoch;
+	if(mode == OReadWrite){
+		if(e.snap != 0){
+			vtSetError(ESnapRO);
+			return nil;
+		}
+	}else if(e.snap != 0){
+		if(e.snap < fs->elo){
+			vtSetError(ESnapOld);
+			return nil;
+		}
+		if(e.snap >= fs->ehi)
+			goto Bad;
+		epoch = e.snap;
+	}
+
+	r = vtMemAllocZ(sizeof(Source));
+	r->fs = fs;
+	r->mode = mode;
+	r->dsize = e.dsize;
+	r->gen = e.gen;
+	r->dir = (e.flags & VtEntryDir) != 0;
+	r->lk = vtLockAlloc();
+	r->ref = 1;
+	r->parent = p;
+	if(p){
+		vtLock(p->lk);
+		assert(mode == OReadOnly || p->mode == OReadWrite);
+		p->ref++;
+		vtUnlock(p->lk);
+	}
+	r->epoch = epoch;
+//fprint(2, "sourceAlloc have %V be.%d fse.%d %s\n", b->score, b->l.epoch, r->fs->ehi, mode==OReadWrite ? "rw" : "ro");
+	/* remember where the entry lives so sourceLoadBlock can find it again */
+	memmove(r->score, b->score, VtScoreSize);
+	r->scoreEpoch = b->l.epoch;
+	r->offset = offset;
+	r->epb = epb;
+	r->tag = b->l.tag;
+
+//fprint(2, "sourceAlloc: %p -> %V %d\n", r, r->score, r->offset);
+
+	return r;
+Bad:
+	vtSetError(EBadEntry);
+	return nil;
+	
+}
+
+/*
+ * Open the root source whose entry is in the local BtDir block at addr.
+ * In OReadWrite mode the block must not be closed and must belong to
+ * the current epoch (fs->ehi); otherwise the error is EBadRoot.
+ * Returns nil on error.
+ */
+Source *
+sourceRoot(Fs *fs, u32int addr, int mode)
+{
+	Source *r;
+	Block *b;
+
+	b = cacheLocalData(fs->cache, addr, BtDir, RootTag, mode, 0);
+	if(b == nil)
+		return nil;
+
+	if(mode == OReadWrite)
+	if((b->l.state&BsClosed) || b->l.epoch != fs->ehi){
+		fprint(2, "sourceRoot: fs->ehi = %ud, b->l = %L\n", fs->ehi, &b->l);
+		blockPut(b);
+		vtSetError(EBadRoot);
+		return nil;
+	}
+
+	r = sourceAlloc(fs, b, nil, 0, mode);
+	blockPut(b);
+	return r;
+}
+
+/*
+ * Open the source whose entry is number offset in directory source r.
+ * r must be locked and must be a directory (else ENotDir).
+ * Returns the new source, or nil on error.
+ */
+Source *
+sourceOpen(Source *r, ulong offset, int mode)
+{
+	ulong bn;
+	Block *b;
+
+	assert(sourceIsLocked(r));
+	if(r->mode == OReadWrite)
+		assert(r->epoch == r->b->l.epoch);
+	if(!r->dir){
+		vtSetError(ENotDir);
+		return nil;
+	}
+
+	/* block of r's data that holds entry number offset */
+	bn = offset/(r->dsize/VtEntrySize);
+
+	b = sourceBlock(r, bn, mode);
+	if(b == nil)
+		return nil;
+	r = sourceAlloc(r->fs, b, r, offset, mode);
+	blockPut(b);
+	return r;
+}
+
+/*
+ * Create a new source in directory r (which must be locked) with data
+ * block size dsize; dir is non-zero to make the new source a directory.
+ * If offset is 0, a random block of the directory is searched for a free
+ * entry; otherwise the search starts at entry number offset.  If that
+ * block has no free entry, the search is retried at the current end of
+ * the directory.  A free entry is one that is inactive and whose gen
+ * is not ~0.
+ * Returns the new source opened OReadWrite, or nil on error.
+ */
+Source *
+sourceCreate(Source *r, int dsize, int dir, u32int offset)
+{
+	int i;
+	Block *b;
+	u32int bn, size;
+	Entry e;
+	int epb;
+	int psize;
+	Source *rr;
+ 
+	assert(sourceIsLocked(r));
+
+	if(!r->dir){
+		vtSetError(ENotDir);
+		return nil;
+	}
+
+	epb = r->dsize/VtEntrySize;
+	psize = (dsize/VtScoreSize)*VtScoreSize;
+
+	size = sourceGetDirSize(r);
+	if(offset == 0){
+		/*
+		 * look at a random block to see if we can find an empty entry
+		 */
+		offset = lnrand(size+1);
+		offset -= offset % epb;
+	}
+
+	/* try the given block and then try the last block */
+	for(;;){
+		bn = offset/epb;
+		b = sourceBlock(r, bn, OReadWrite);
+		if(b == nil)
+			return nil;
+		/*
+		 * The index within the block must use the same entries-per-block
+		 * as bn above, i.e. that of r's own data blocks.  r->epb describes
+		 * the block holding r's entry (it is 1 for the root), not r's data.
+		 */
+		for(i=offset%epb; i<epb; i++){
+			entryUnpack(&e, b->data, i);
+			if((e.flags&VtEntryActive) == 0 && e.gen != ~0)
+				goto Found;
+		}
+		blockPut(b);
+		if(offset == size){
+			fprint(2, "sourceCreate: cannot happen\n");
+			vtSetError("sourceCreate: cannot happen");
+			return nil;
+		}
+		offset = size;
+	}
+
+Found:
+	/* found an entry - gen already set */
+	e.psize = psize;
+	e.dsize = dsize;
+	e.flags = VtEntryActive;
+	if(dir)
+		e.flags |= VtEntryDir;
+	e.depth = 0;
+	e.size = 0;
+	memmove(e.score, vtZeroScore, VtScoreSize);
+	e.tag = 0;
+	e.snap = 0;
+	e.archive = 0;
+	entryPack(&e, b->data, i);
+	blockDirty(b);
+
+	/* grow the directory if the new entry lies past its current end */
+	offset = bn*epb + i;
+	if(offset+1 > size){
+		if(!sourceSetDirSize(r, offset+1)){
+			blockPut(b);
+			return nil;
+		}
+	}
+
+	rr = sourceAlloc(r->fs, b, r, offset, OReadWrite);
+	blockPut(b);
+	return rr;
+}
+
+/*
+ * Common code for sourceRemove (doremove != 0) and sourceTruncate
+ * (doremove == 0).  r must be locked.  The entry is reset to an empty
+ * tree (zero score, size and depth 0) and the old root block, if local,
+ * is unlinked.  When removing, gen is incremented (unless already ~0),
+ * the entry is made inactive, and r is unlocked and closed on the
+ * caller's behalf.
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+sourceKill(Source *r, int doremove)
+{
+	Entry e;
+	Block *b;
+	u32int addr;
+	u32int tag;
+	int type;
+
+	assert(sourceIsLocked(r));
+	b = sourceLoad(r, &e);
+	if(b == nil)
+		return 0;
+
+	assert(b->l.epoch == r->fs->ehi);
+
+	if(doremove==0 && e.size == 0){
+		/* already truncated */
+		blockPut(b);
+		return 1;
+	}
+
+	/* remember info on link we are removing */
+	addr = globalToLocal(e.score);
+	type = entryType(&e);
+	tag = e.tag;
+
+	if(doremove){
+		if(e.gen != ~0)
+			e.gen++;
+		e.dsize = 0;
+		e.psize = 0;
+		e.flags = 0;	
+	}else{
+		e.flags &= ~VtEntryLocal;
+	}
+	e.depth = 0;
+	e.size = 0;
+	e.tag = 0;
+	memmove(e.score, vtZeroScore, VtScoreSize);
+	entryPack(&e, b->data, r->offset % r->epb);
+	blockDirty(b);
+	if(addr != NilBlock)
+		blockRemoveLink(b, addr, type, tag);
+	blockPut(b);
+
+	if(doremove){
+		sourceUnlock(r);
+		sourceClose(r);
+	}
+
+	return 1;
+}
+
+/*
+ * Remove the locked source r.  On success r has been unlocked and
+ * closed by sourceKill.  Returns 1 on success, 0 on failure.
+ */
+int
+sourceRemove(Source *r)
+{
+	return sourceKill(r, 1);
+}
+
+/*
+ * Truncate the locked source r to size 0; r stays locked and open.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+sourceTruncate(Source *r)
+{
+	return sourceKill(r, 0);
+}
+
+/*
+ * Return the size recorded in the entry of the locked source r.
+ * Returns 0 if the entry cannot be loaded, which is indistinguishable
+ * from an empty source.
+ */
+uvlong
+sourceGetSize(Source *r)
+{
+	Entry ent;
+	Block *blk;
+
+	assert(sourceIsLocked(r));
+	blk = sourceLoad(r, &ent);
+	if(blk != nil){
+		blockPut(blk);
+		return ent.size;
+	}
+	return 0;
+}
+
+/*
+ * Zero the parts of the tree for entry e that lie beyond size bytes.
+ * At each pointer level, pointers to blocks that are wholly unneeded are
+ * cleared (and local ones unlinked); the walk then continues into the
+ * one partially needed block.  At the bottom, the tail of a BtData block
+ * past size is zeroed.
+ * Gives up and returns 0 if a block on the path cannot be modified in
+ * place (not local, closed, or not in the current epoch) or cannot be
+ * fetched; returns 1 on success.
+ */
+static int
+sourceShrinkSize(Source *r, Entry *e, uvlong size)
+{
+	int i, type, ppb;
+	uvlong ptrsz;
+	u32int addr;
+	uchar score[VtScoreSize];
+	Block *b;
+
+	type = entryType(e);
+	b = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
+	if(b == nil)
+		return 0;
+
+	/* ptrsz: bytes covered by one pointer in the root block */
+	ptrsz = e->dsize;
+	ppb = e->psize/VtScoreSize;
+	for(i=0; i+1<e->depth; i++)
+		ptrsz *= ppb;
+
+	while(type&BtLevelMask){
+		if(b->addr == NilBlock
+		|| (b->l.state&BsClosed)
+		|| b->l.epoch != r->fs->ehi){
+			/* not worth copying the block just so we can zero some of it */
+			blockPut(b);
+			return 0;
+		}
+
+		/*
+		 * invariant: each pointer in the tree rooted at b accounts for ptrsz bytes
+		 */
+
+		/* zero the pointers to unnecessary blocks */
+		i = (size+ptrsz-1)/ptrsz;
+		for(; i<ppb; i++){
+			addr = globalToLocal(b->data+i*VtScoreSize);
+			memmove(b->data+i*VtScoreSize, vtZeroScore, VtScoreSize);
+			blockDirty(b);
+			if(addr != NilBlock)
+				blockRemoveLink(b, addr, type-1, e->tag);
+		}
+
+		/* recurse (go around again) on the partially necessary block */
+		i = size/ptrsz;
+		size = size%ptrsz;
+		if(size == 0){
+			blockPut(b);
+			return 1;
+		}
+		ptrsz /= ppb;
+		type--;
+		memmove(score, b->data+i*VtScoreSize, VtScoreSize);
+		blockPut(b);
+		b = cacheGlobal(r->fs->cache, score, type, e->tag, OReadWrite);
+		if(b == nil)
+			return 0;
+	}
+
+	if(b->addr == NilBlock
+	|| (b->l.state&BsClosed)
+	|| b->l.epoch != r->fs->ehi){
+		blockPut(b);
+		return 0;
+	}
+
+	/*
+	 * No one ever truncates BtDir blocks.
+	 */
+	if(type == BtData && e->dsize > size){
+		memset(b->data+size, 0, e->dsize-size);
+		blockDirty(b);
+	}
+	blockPut(b);
+	return 1;
+}
+
+/*
+ * Set the size of the locked source r to size bytes, growing or
+ * shrinking the pointer tree depth as needed.  A size of 0 is handled
+ * by sourceTruncate.  Fails with ETooBig if size exceeds VtMaxFileSize
+ * or MaxBlock data blocks.  Returns 1 on success, 0 on failure.
+ */
+int
+sourceSetSize(Source *r, uvlong size)
+{
+	int depth;
+	Entry e;
+	Block *b;
+
+	assert(sourceIsLocked(r));
+	if(size == 0)
+		return sourceTruncate(r);
+
+	if(size > VtMaxFileSize || size > ((uvlong)MaxBlock)*r->dsize){
+		vtSetError(ETooBig);
+		return 0;
+	}
+
+	b = sourceLoad(r, &e);
+	if(b == nil)
+		return 0;
+
+	/* quick out */
+	if(e.size == size){
+		blockPut(b);
+		return 1;
+	}
+
+	depth = sizeToDepth(size, e.psize, e.dsize);
+
+	if(depth < e.depth){
+		if(!sourceShrinkDepth(r, b, &e, depth)){
+			blockPut(b);
+			return 0;
+		}
+	}else if(depth > e.depth){
+		if(!sourceGrowDepth(r, b, &e, depth)){
+			blockPut(b);
+			return 0;
+		}
+	}
+
+	/* result ignored: the new size is recorded even if the old tail was not zeroed */
+	if(size < e.size)
+		sourceShrinkSize(r, &e, size);
+
+	e.size = size;
+	entryPack(&e, b->data, r->offset % r->epb);
+	blockDirty(b);
+	blockPut(b);
+
+	return 1;
+}
+
+/*
+ * Set the size of the locked directory source r so that it holds ds
+ * entries.  Entries do not straddle blocks: each full block of
+ * r->dsize bytes holds r->dsize/VtEntrySize entries.
+ * Returns what sourceSetSize returns.
+ */
+int
+sourceSetDirSize(Source *r, ulong ds)
+{
+	uvlong nbytes;
+	int perblock;
+
+	assert(sourceIsLocked(r));
+	perblock = r->dsize/VtEntrySize;
+
+	nbytes = (uvlong)r->dsize*(ds/perblock) + VtEntrySize*(ds%perblock);
+	return sourceSetSize(r, nbytes);
+}
+
+/*
+ * Return the number of entries in the locked directory source r,
+ * computed from its byte size: full blocks contribute
+ * r->dsize/VtEntrySize entries each, plus the entries in the final
+ * partial block.
+ */
+ulong
+sourceGetDirSize(Source *r)
+{
+	ulong nent;
+	uvlong nbytes;
+	int perblock;
+
+	assert(sourceIsLocked(r));
+	perblock = r->dsize/VtEntrySize;
+
+	nbytes = sourceGetSize(r);
+	nent = perblock*(nbytes/r->dsize);
+	nent += (nbytes%r->dsize)/VtEntrySize;
+	return nent;
+}
+
+/*
+ * Copy the current entry of the locked source r into *e.
+ * Returns 1 on success, 0 if the entry cannot be loaded.
+ */
+int
+sourceGetEntry(Source *r, Entry *e)
+{
+	Block *blk;
+
+	assert(sourceIsLocked(r));
+	blk = sourceLoad(r, e);
+	if(blk != nil){
+		blockPut(blk);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Fetch the child of p selected by index.  If p is a BtDir block the
+ * child is the root block of entry e's tree; otherwise it is the block
+ * named by the index'th score in p.
+ * In any mode other than OReadOnly, a child that is closed or not in
+ * the current epoch is copied (copy on write): p is updated to point at
+ * the copy, a write dependency of p on the copy is recorded, and the old
+ * local block is unlinked.  e may gain a tag and VtEntryLocal.
+ * Returns the child block, or nil on error.
+ */
+static Block *
+blockWalk(Block *p, int index, int mode, Fs *fs, Entry *e)
+{
+	Block *b;
+	Cache *c;
+	u32int addr;
+	int type;
+
+	c = fs->cache;
+
+	if((p->l.type & BtLevelMask) == 0){
+		assert(p->l.type == BtDir);
+		type = entryType(e);
+		b = cacheGlobal(c, e->score, type, e->tag, mode);
+	}else{
+		type = p->l.type - 1;
+		b = cacheGlobal(c, p->data + index*VtScoreSize, type, e->tag, mode);
+	}
+
+	if(b == nil || mode == OReadOnly)
+		return b;
+
+	/* writing: the parent must already be writable in place */
+	assert(!(p->l.state&BsClosed) && p->l.epoch == fs->ehi);
+	if(!(b->l.state&BsClosed) && b->l.epoch == fs->ehi)
+		return b;
+
+	/*
+	 * Copy on write.
+	 */
+	if(e->tag == 0){
+		assert(p->l.type == BtDir);
+		e->tag = tagGen();
+		e->flags |= VtEntryLocal;
+	}
+
+	addr = b->addr;
+	b = blockCopy(b, e->tag, fs->ehi, fs->elo);
+	if(b == nil)
+		return nil;
+
+	assert(b->l.epoch == fs->ehi);
+
+	if(p->l.type == BtDir){
+		memmove(e->score, b->score, VtScoreSize);
+		entryPack(e, p->data, index);
+	}else{	
+		memmove(p->data+index*VtScoreSize, b->score, VtScoreSize);
+	}
+	blockDirty(b);
+	blockDependency(p, b, index, b->score);
+	blockDirty(p);
+
+	if(addr != NilBlock)
+		blockRemoveLink(p, addr, type, e->tag);
+
+	return b;
+}
+
+/*
+ * Increase the depth of the source r to depth by stacking new pointer
+ * blocks on top of the current root.
+ * The entry e for r is contained in block p; e is updated and repacked
+ * into p even if only some of the layers could be added.
+ * Returns 1 if the requested depth was reached, 0 otherwise.
+ */
+static int
+sourceGrowDepth(Source *r, Block *p, Entry *e, int depth)
+{
+	Block *b, *bb;
+	u32int tag;
+	int type;
+
+	assert(sourceIsLocked(r));
+	assert(depth <= VtPointerDepth);
+
+	type = entryType(e);
+	b = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
+	if(b == nil)
+		return 0;
+
+	tag = e->tag;
+	if(tag == 0)
+		tag = tagGen();
+
+	/*
+	 * Keep adding layers until we get to the right depth
+	 * or an error occurs.
+	 */
+	while(e->depth < depth){
+		bb = cacheAllocBlock(r->fs->cache, type+1, tag, r->fs->ehi, r->fs->elo);
+		if(bb == nil)
+			break;
+		/* the new root's first pointer is the old root */
+		memmove(bb->data, b->score, VtScoreSize);
+		memmove(e->score, bb->score, VtScoreSize);	
+		e->depth++;
+		type++;
+		e->tag = tag;
+		e->flags |= VtEntryLocal;
+		blockDependency(bb, b, -1, nil);
+		blockPut(b);
+		blockDirty(bb);
+		b = bb;
+	}
+
+	entryPack(e, p->data, r->offset % r->epb);
+	blockDependency(p, b, -1, nil);
+	blockPut(b);
+	blockDirty(p);
+
+	return e->depth == depth;
+}
+
+/*
+ * Decrease the depth of the source r to depth by making a block lower
+ * in the tree (reached through pointer 0 at each level) the new root.
+ * The entry e for r is contained in block p.
+ * The walk may stop early if a block cannot be fetched; in that case
+ * the depth is reduced as far as the walk got.
+ * Returns 1 if the requested depth was reached, 0 otherwise.
+ */
+static int
+sourceShrinkDepth(Source *r, Block *p, Entry *e, int depth)
+{
+	Block *b, *nb, *ob, *rb;
+	u32int tag;
+	int type, d;
+
+	assert(sourceIsLocked(r));
+	assert(depth <= VtPointerDepth);
+
+	type = entryType(e);
+	rb = cacheGlobal(r->fs->cache, e->score, type, e->tag, OReadWrite);
+	if(rb == nil)
+		return 0;
+
+	tag = e->tag;
+	if(tag == 0)
+		tag = tagGen();
+
+	/*
+	 * Walk down to the new root block.
+	 * We may stop early, but something is better than nothing.
+	 *
+	 * b has block type `type'; its children have type-1 (as in
+	 * blockWalk), so type must decrease by one per level descended.
+	 */
+	ob = nil;
+	b = rb;
+	for(d=e->depth; d > depth; d--, type--){
+		nb = cacheGlobal(r->fs->cache, b->data, type-1, tag, OReadWrite);
+		if(nb == nil)
+			break;
+		if(ob!=nil && ob!=rb)
+			blockPut(ob);
+		ob = b;
+		b = nb;
+	}
+
+	/* could not descend even one level */
+	if(b == rb){
+		blockPut(rb);
+		return 0;
+	}
+
+	/*
+	 * Right now, e points at the root block rb, b is the new root block,
+	 * and ob points at b.  To update:
+	 *
+	 *	(i) change e to point at b
+	 *	(ii) zero the pointer ob -> b
+	 *	(iii) free the root block
+	 *
+	 * p (the block containing e) must be written before
+	 * anything else.
+ 	 */
+
+	/* (i) */
+	e->depth = d;
+	memmove(e->score, b->score, VtScoreSize);
+	entryPack(e, p->data, r->offset % r->epb);
+	blockDirty(p);
+
+	/* (ii) */
+	memmove(ob->data, vtZeroScore, VtScoreSize);
+	blockDependency(ob, p, -1, nil);
+	blockDirty(ob);
+
+	/* (iii) */
+	if(rb->addr != NilBlock)
+		blockRemoveLink(p, rb->addr, rb->l.type, rb->l.tag);
+
+	blockPut(rb);
+	if(ob!=nil && ob!=rb)
+		blockPut(ob);
+	if(b!=rb)
+		blockPut(b);
+
+	return d == depth;
+}
+
+/*
+ * Return block number bn of the locked source r, opened in mode.
+ * If bn needs a deeper tree than the entry has, the tree is grown,
+ * except in OReadOnly mode, where that is an EBadAddr error.
+ * Intermediate blocks are opened OReadWrite when mode is OOverWrite.
+ * Returns nil on error.
+ */
+Block *
+sourceBlock(Source *r, ulong bn, int mode)
+{
+	Block *b, *bb;
+	int index[VtPointerDepth+1];
+	Entry e;
+	int i, np;
+	int m;
+
+	assert(sourceIsLocked(r));
+	assert(bn != NilBlock);
+
+	/* mode for intermediate block */
+	m = mode;
+	if(m == OOverWrite)
+		m = OReadWrite;
+
+	b = sourceLoad(r, &e);
+	if(b == nil)
+		return nil;
+
+	/* split bn into one pointer index per tree level, lowest level first */
+	np = e.psize/VtScoreSize;
+	memset(index, 0, sizeof(index));
+	for(i=0; bn > 0; i++){
+		if(i >= VtPointerDepth){
+			vtSetError(EBadAddr);
+			goto Err;
+		}
+		index[i] = bn % np;
+		bn /= np;
+	}
+
+	if(i > e.depth){
+		if(mode == OReadOnly){
+			vtSetError(EBadAddr);
+			goto Err;
+		}
+		if(!sourceGrowDepth(r, b, &e, i)) 
+			goto Err;
+	}
+
+	/* the first step goes from the directory block to the entry's root */
+	index[e.depth] = r->offset % r->epb;
+
+	for(i=e.depth; i>=0; i--){
+		bb = blockWalk(b, index[i], m, r->fs, &e);
+		if(bb == nil)
+			goto Err;
+		blockPut(b);
+		b = bb;
+	}
+	return b;
+Err:
+	blockPut(b);
+	return nil;
+}
+
+/*
+ * Drop one reference to r; a nil r is ignored.  When the last
+ * reference goes away, the reference held on the parent is dropped
+ * too and r is freed.
+ */
+void
+sourceClose(Source *r)
+{
+	if(r == nil)
+		return;
+	vtLock(r->lk);
+	r->ref--;
+	if(r->ref){
+		vtUnlock(r->lk);
+		return;
+	}
+	assert(r->ref == 0);
+	vtUnlock(r->lk);
+	if(r->parent)
+		sourceClose(r->parent);
+	vtLockFree(r->lk);
+	memset(r, ~0, sizeof(*r));	/* scribble over r before freeing; presumably to expose use after free */
+	vtMemFree(r);
+}
+
+/*
+ * Retrieve the block containing the entry for r.
+ * If a snapshot has happened, we might need
+ * to get a new copy of the block.  We avoid this
+ * in the common case by caching the score for
+ * the block and the last epoch in which it was valid.
+ *
+ * We use r->mode to tell the difference between active
+ * file system sources (OReadWrite) and sources for the
+ * snapshot file system (OReadOnly).
+ *
+ * Returns the block, or nil on error.
+ */
+static Block*
+sourceLoadBlock(Source *r, int mode)
+{
+	u32int addr;
+	Block *b;
+
+	switch(r->mode){
+	default:
+		assert(0);
+	case OReadWrite:
+		assert(r->mode == OReadWrite);
+		assert(r->epoch >= r->fs->elo);
+		if(r->epoch == r->fs->ehi){
+			b = cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, OReadWrite);
+			/* check for failure before looking inside b */
+			if(b == nil)
+				return nil;
+			assert(r->epoch == b->l.epoch);
+			return b;
+		}
+		/*
+		 * The cached score is from an older epoch: walk from the
+		 * parent in OReadWrite mode to get a block in the current
+		 * epoch, and cache its score.
+		 */
+		assert(r->parent != nil);
+		if(!sourceLock(r->parent, OReadWrite))
+			return nil;
+		b = sourceBlock(r->parent, r->offset/r->epb, OReadWrite);
+		sourceUnlock(r->parent);
+		if(b == nil)
+			return nil;
+		assert(b->l.epoch == r->fs->ehi);
+		memmove(r->score, b->score, VtScoreSize);
+		r->scoreEpoch = b->l.epoch;
+		r->tag = b->l.tag;
+		r->epoch = r->fs->ehi;
+		return b;
+
+	case OReadOnly:
+		addr = globalToLocal(r->score);
+		if(addr == NilBlock)
+			return cacheGlobal(r->fs->cache, r->score, BtDir, r->tag, mode);
+
+		b = cacheLocalData(r->fs->cache, addr, BtDir, r->tag, mode, r->scoreEpoch);
+		if(b)
+			return b;
+
+		/*
+		 * If it failed because the epochs don't match, the block has been
+		 * archived and reclaimed.  Rewalk from the parent and get the
+		 * new pointer.  This can't happen in the OReadWrite case
+		 * above because blocks in the current epoch don't get
+		 * reclaimed.  The fact that we're OReadOnly means we're
+		 * a snapshot.  (Or else the file system is read-only, but then
+		 * the archiver isn't going around deleting blocks.)
+		 */
+		if(strcmp(vtGetError(), ELabelMismatch) == 0){
+			if(!sourceLock(r->parent, OReadOnly))
+				return nil;
+			b = sourceBlock(r->parent, r->offset/r->epb, OReadOnly);
+			sourceUnlock(r->parent);
+			if(b){
+				fprint(2, "sourceAlloc: lost %V found %V\n",
+					r->score, b->score);
+				memmove(r->score, b->score, VtScoreSize);
+				r->scoreEpoch = b->l.epoch;
+				return b;
+			}
+		}
+		return nil;
+	}
+}
+
+/*
+ * Lock source r by loading and holding the block that contains its
+ * entry.  A mode of -1 means use r->mode.
+ * Returns 1 on success, 0 if the block cannot be loaded.
+ */
+int
+sourceLock(Source *r, int mode)
+{
+	Block *b;
+
+	if(mode == -1)
+		mode = r->mode;
+
+	b = sourceLoadBlock(r, mode);
+	if(b == nil)
+		return 0;
+	/*
+	 * The fact that we are holding b serves as the
+	 * lock entitling us to write to r->b.
+	 */
+	assert(r->b == nil);
+	r->b = b;
+	if(r->mode == OReadWrite)
+		assert(r->epoch == r->b->l.epoch);
+	return 1;
+}
+
+/*
+ * Lock two (usually sibling) sources.  This needs special care
+ * because the Entries for both sources might be in the same block.
+ * We also try to lock blocks in left-to-right order within the tree.
+ *
+ * If rr is nil this is just sourceLock(r, mode).  A mode of -1 means
+ * use r->mode (for both sources).
+ * Returns 1 with both sources locked, or 0 with neither locked.
+ */
+int
+sourceLock2(Source *r, Source *rr, int mode)
+{
+	Block *b, *bb;
+
+	if(rr == nil)
+		return sourceLock(r, mode);
+
+	if(mode == -1)
+		mode = r->mode;
+
+	if(r->parent==rr->parent && r->offset/r->epb == rr->offset/rr->epb){
+		/* both entries are in the same block: load it once, hold it twice */
+		b = sourceLoadBlock(r, mode);
+		if(b == nil)
+			return 0;
+		blockDupLock(b);
+		bb = b;
+	/*
+	 * NOTE(review): with ||, siblings always load rr's block first
+	 * regardless of offset; confirm this matches the intended
+	 * left-to-right order.
+	 */
+	}else if(r->parent==rr->parent || r->offset > rr->offset){
+		bb = sourceLoadBlock(rr, mode);
+		b = sourceLoadBlock(r, mode);
+	}else{
+		b = sourceLoadBlock(r, mode);
+		bb = sourceLoadBlock(rr, mode);
+	}
+	if(b == nil || bb == nil){
+		if(b)
+			blockPut(b);
+		if(bb)
+			blockPut(bb);
+		return 0;
+	}
+
+	/*
+	 * The fact that we are holding b and bb serves
+	 * as the lock entitling us to write to r->b and rr->b.
+	 */
+	r->b = b;
+	rr->b = bb;
+	return 1;
+}
+
+/*
+ * Unlock r by releasing the entry block held in r->b.
+ * Unlocking a source that is not locked prints a message and aborts.
+ */
+void
+sourceUnlock(Source *r)
+{
+	Block *held;
+
+	held = r->b;
+	if(held == nil){
+		fprint(2, "sourceUnlock: already unlocked\n");
+		abort();
+	}
+	r->b = nil;
+	blockPut(held);
+}
+
+/*
+ * Unpack the entry for the locked source r into *e and return the
+ * block holding it with an additional lock reference (the caller must
+ * blockPut it).  Fails with ERemoved if the entry's gen no longer
+ * matches r->gen.  Returns nil on error.
+ */
+static Block*
+sourceLoad(Source *r, Entry *e)
+{
+	Block *b;
+
+	assert(sourceIsLocked(r));
+	b = r->b;
+	if(!entryUnpack(e, b->data, r->offset % r->epb))
+		return nil;
+	if(e->gen != r->gen){
+		vtSetError(ERemoved);
+		return nil;
+	}
+	blockDupLock(b);
+	return b;
+}
+
+/*
+ * Return the pointer-tree depth needed to hold s bytes when data
+ * blocks are dsize bytes and pointer blocks are psize bytes:
+ * 0 if at most one data block is needed, otherwise the number of
+ * pointer levels required to fan out to all the data blocks.
+ */
+static int
+sizeToDepth(uvlong s, int psize, int dsize)
+{
+	int ptrs, depth;
+	uvlong nblock;
+
+	ptrs = psize/VtScoreSize;
+	nblock = (s + dsize - 1)/dsize;
+	depth = 0;
+	while(nblock > 1){
+		nblock = (nblock + ptrs - 1)/ptrs;
+		depth++;
+	}
+	return depth;
+}
+
+/*
+ * Return a random tag that is at least UserTag,
+ * retrying lrand() until it produces one.
+ */
+static u32int
+tagGen(void)
+{
+	u32int tag;
+
+	do
+		tag = lrand();
+	while(tag < UserTag);
+	return tag;
+}

+ 271 - 0
sys/src/cmd/fossil/srcload.c

@@ -0,0 +1,271 @@
+#include "stdinc.h"
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+int num = 100;
+int length = 20*1024;
+int block= 1024;
+int bush = 4;
+int iter = 100;
+Biobuf *bout;
+int maxdepth;
+
+Source *mkroot(Cache*);
+void new(Source*, int trace, int);
+int delete(Source*);
+int count(Source *s, int);
+void stats(Source *s);
+void dump(Source *s, int ident, ulong entry);
+static void bench(Source *r);
+
+/*
+ * Stress test for the source layer.  Opens the file system named by the
+ * first non-option argument, creates num sources, runs iter rounds of
+ * create-one/delete-one, then deletes num sources, printing entry counts
+ * to standard error along the way.
+ * Options set the globals: -i iter, -n num, -l length, -b block,
+ * -u bush, and -c csize (passed to fsOpen).
+ * NOTE(review): option arguments and argv[0] are used without checking
+ * that they were supplied.
+ */
+void
+main(int argc, char *argv[])
+{
+	int i;
+	Fs *fs;
+	int csize = 1000;
+	ulong t;
+	Source *r;
+
+	ARGBEGIN{
+	case 'i':
+		iter = atoi(ARGF());
+		break;
+	case 'n':
+		num = atoi(ARGF());
+		break;
+	case 'l':
+		length = atoi(ARGF());
+		break;
+	case 'b':	
+		block = atoi(ARGF());
+		break;
+	case 'u':
+		bush = atoi(ARGF());
+		break;
+	case 'c':
+		csize = atoi(ARGF());
+		break;
+	}ARGEND;
+
+	vtAttach();
+
+	bout = vtMemAllocZ(sizeof(Biobuf));
+	Binit(bout, 1, OWRITE);
+
+	fmtinstall('V', vtScoreFmt);
+	fmtinstall('R', vtErrFmt);
+
+	fs = fsOpen(argv[0], nil, csize, OReadWrite);
+	if(fs == nil)
+		sysfatal("could not open fs: %r");
+
+	t = time(0);
+
+	srand(0);	/* fixed seed: the same random sequence every run */
+
+	r = fs->source;
+dump(r, 0, 0);
+
+	fprint(2, "count = %d\n", count(r, 1));
+	for(i=0; i<num; i++)
+		new(r, 0, 0);
+
+	for(i=0; i<iter; i++){
+if(i % 10000 == 0)
+stats(r);
+		new(r, 0, 0);
+		delete(r);
+	}
+
+//	dump(r, 0, 0);
+
+	fprint(2, "count = %d\n", count(r, 1));
+//	cacheCheck(c);
+
+fprint(2, "deleting\n");
+	for(i=0; i<num; i++)
+		delete(r);
+//	dump(r, 0, 0);
+
+	fprint(2, "count = %d\n", count(r, 1));
+	fprint(2, "total time = %ld\n", time(0)-t);
+	
+	fsClose(fs);
+
+	vtDetach();
+
+	exits(0);
+}
+
+/*
+ * Time one million sourceGetEntry calls on r and print the
+ * elapsed time in seconds to standard error.
+ */
+static void
+bench(Source *r)
+{
+	vlong start;
+	Entry e;
+	int k;
+
+	start = nsec();
+
+	for(k=1000000; k>0; k--)
+		sourceGetEntry(r, &e);
+
+	fprint(2, "%f\n", 1e-9*(nsec() - start));
+}
+
+/*
+ * Create one new source somewhere under directory s.
+ * Randomly chosen entries of s are probed; each one that is a directory
+ * is descended into with probability 1/bush, and the creation happens
+ * there instead.  If no descent happens, the new source is created in s
+ * itself.  depth is the current recursion depth (tracked in maxdepth);
+ * a non-zero trace prints indented progress to bout.
+ */
+void
+new(Source *s, int trace, int depth)
+{
+	int i, n;
+	Source *ss;
+	Entry e;
+	
+	if(depth > maxdepth)
+		maxdepth = depth;
+
+	Bflush(bout);
+
+	n = sourceGetDirSize(s);
+	for(i=0; i<n; i++){
+		ss = sourceOpen(s, nrand(n), OReadWrite);
+		/* NOTE(review): ss is not closed here when sourceGetEntry fails */
+		if(ss == nil || !sourceGetEntry(ss, &e))
+			continue;
+		if((e.flags & VtEntryDir) && frand() < 1./bush){
+			if(trace){
+				int j;
+				for(j=0; j<trace; j++)
+					Bprint(bout, " ");
+				Bprint(bout, "decend %d\n", i);
+			}
+			new(ss, trace?trace+1:0, depth+1);
+			sourceClose(ss);
+			return;
+		}
+		sourceClose(ss);
+	}
+	/*
+	 * NOTE(review): 1+frand()>.5 parses as (1+frand()) > .5, so the dir
+	 * argument is always 1; confirm whether frand()>.5 was intended.
+	 */
+	ss = sourceCreate(s, s->dsize, 1+frand()>.5, 0);
+	if(ss == nil){
+		Bprint(bout, "could not create directory: %R\n");
+		return;
+	}
+	if(trace){
+		int j;
+		for(j=1; j<trace; j++)
+			Bprint(bout, " ");
+		Bprint(bout, "create %d\n", ss->offset);
+	}
+	sourceClose(ss);
+}
+
+/*
+ * Delete one source from somewhere under directory s.
+ * Returns 0 if s has no entry that can be opened, otherwise 1 after
+ * removing either a randomly chosen entry of s or, recursively,
+ * something beneath it.
+ */
+int
+delete(Source *s)
+{
+	int i, n;
+	Source *ss;
+
+	n = sourceGetDirSize(s);
+	/* check if empty */
+	for(i=0; i<n; i++){
+		ss = sourceOpen(s, i, OReadWrite);
+		if(ss != nil){
+			sourceClose(ss);
+			break;
+		}
+	}
+	if(i == n)
+		return 0;
+		
+	/* pick random entries until one opens */
+	for(;;){
+		ss = sourceOpen(s, nrand(n), OReadWrite);
+		if(ss == nil)
+			continue;
+		/* NOTE(review): tests s->dir, not ss->dir; confirm which was meant */
+		if(s->dir && delete(ss)){
+			sourceClose(ss);
+			return 1;
+		}
+		if(1)
+			break;
+		sourceClose(ss);	/* never reached because of the if(1) above */
+	}
+
+
+	sourceRemove(ss);
+	return 1;
+}
+
+/*
+ * Print the entry of source s to bout, indented by ident spaces and
+ * labelled with its entry number, then recurse into every entry that
+ * can be opened if s is a directory.
+ */
+void
+dump(Source *s, int ident, ulong entry)
+{
+	ulong i, n;
+	Source *ss;
+	Entry e;
+
+	for(i=0; i<ident; i++)
+		Bprint(bout, " ");
+
+	if(!sourceGetEntry(s, &e)){
+		fprint(2, "sourceGetEntry failed: %r\n");
+		return;
+	}
+
+	Bprint(bout, "%4lud: gen %4ud depth %d tag=%x score=%V", entry, e.gen, e.depth, e.tag, e.score);
+	if(!s->dir){
+		Bprint(bout, " data size: %llud\n", e.size);
+		return;
+	}
+	n = sourceGetDirSize(s);
+	Bprint(bout, " dir size: %lud\n", n);
+	for(i=0; i<n; i++){
+		ss = sourceOpen(s, i, 1);	/* mode given as a literal 1 rather than a named constant */
+		if(ss == nil)
+			continue;
+		dump(ss, ident+1, i);
+		sourceClose(ss);
+	}
+	return;
+}
+
+/*
+ * Count the entries of directory s that can be opened read-only.
+ * If rec is non-zero, entries beneath them are counted too.
+ */
+int
+count(Source *s, int rec)
+{
+	ulong i, n;
+	int total;
+	Source *kid;
+
+	total = 0;
+	n = sourceGetDirSize(s);
+	for(i=0; i<n; i++){
+		kid = sourceOpen(s, i, OReadOnly);
+		if(kid != nil){
+			if(rec)
+				total += count(kid, rec);
+			total++;
+			sourceClose(kid);
+		}
+	}
+	return total;
+}
+
+/*
+ * Print summary statistics for directory s to standard error: the
+ * number of entries that open (count), the directory size (top), the
+ * deepest recursion seen by new() (depth), and the largest recursive
+ * count beneath any single entry (maxcount).
+ */
+void
+stats(Source *s)
+{
+	int n, i, c, cc, max;
+	Source *ss;
+
+	cc = 0;
+	max = 0;
+	n = sourceGetDirSize(s);
+	for(i=0; i<n; i++){
+		ss = sourceOpen(s, i, 1);
+		if(ss == nil)
+			continue;
+		cc++;
+		c = count(ss, 1);
+		if(c > max)
+			max = c;
+		sourceClose(ss);
+	}
+fprint(2, "count = %d top = %d depth=%d maxcount %d\n", cc, n, maxdepth, max);
+}

+ 11 - 0
sys/src/cmd/fossil/stdinc.h

@@ -0,0 +1,11 @@
+#include <u.h>
+#include <libc.h>
+
+typedef uvlong	u64int;
+typedef	uchar	u8int;
+typedef ushort	u16int;
+
+#include "venti.h"
+#include "vac.h"
+#include "fs.h"
+

+ 19 - 0
sys/src/cmd/fossil/trunc.c

@@ -0,0 +1,19 @@
+#include <u.h>
+#include <libc.h>
+
+/*
+ * trunc file size: set the length of file to size with a wstat.
+ * size is parsed by strtoull with base 0.
+ */
+void
+main(int argc, char **argv)
+{
+	Dir d;
+
+	if(argc != 3){
+		fprint(2, "usage: trunc file size\n");
+		exits("usage");
+	}
+
+	nulldir(&d);	/* start from a null Dir so only length is set below */
+	d.length = strtoull(argv[2], 0, 0);
+	if(dirwstat(argv[1], &d) < 0)
+		sysfatal("dirwstat: %r");
+	exits(0);
+}

+ 13 - 0
sys/src/cmd/fossil/unpack

@@ -0,0 +1,13 @@
+#!/bin/rc
+# Copy the compressed Plan 9 ISO image to /n/ehime, decompress it,
+# mount it with 9660srv on /n/sid, and copy its whole tree into a
+# freshly recreated $D, timing each of the slow steps.
+
+D=/n/ehime/testplan9
+
+time cp /sys/lib/dist/web.protect/plan9.iso.bz2 /n/ehime
+time bunzip2 -c /n/ehime/plan9.iso.bz2 > /n/ehime/plan9.iso
+# remove any previous /srv/9660 before starting 9660srv
+rm /srv/9660
+9660srv
+mount /srv/9660 /n/sid /n/ehime/plan9.iso
+# start from an empty destination
+rm -rf $D
+mkdir $D
+time dircp /n/sid $D
+mkdir $D/n/emelieother  # for lp

+ 746 - 0
sys/src/cmd/fossil/vac.c

@@ -0,0 +1,746 @@
+#include "stdinc.h"
+
+typedef struct MetaChunk MetaChunk;
+
+/* One meta entry as recorded in a MetaBlock's index table; see metaChunks. */
+struct MetaChunk {
+	ushort offset;	/* byte offset of the entry within the block buffer */
+	ushort size;	/* size of the entry in bytes */
+	ushort index;	/* position of the entry in the index table */
+};
+
+static int stringUnpack(char **s, uchar **p, int *n);
+static int meCmp(MetaEntry*, char *s);
+static int meCmpOld(MetaEntry*, char *s);
+
+
+
+static char EBadMeta[] = "corrupted meta data";
+static char ENoFile[] = "file does not exist";
+
+/*
+ * integer conversion routines
+ *
+ * Values are stored most significant byte first.
+ *
+ * U32GET casts its result to u32int.  The byte shifts are done in int,
+ * so without the cast a value with the top bit set is negative and
+ * sign-extends when U48GET and U64GET widen it to uvlong, setting
+ * every bit of the high half.
+ *
+ * The 48- and 64-bit PUT macros take a 32-bit scratch variable t32.
+ * The PUT macros expand to several statements, so they must not be
+ * used as the unbraced body of an if, else or loop.
+ */
+#define	U8GET(p)	((p)[0])
+#define	U16GET(p)	(((p)[0]<<8)|(p)[1])
+#define	U32GET(p)	((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3]))
+#define	U48GET(p)	(((uvlong)U16GET(p)<<32)|(uvlong)U32GET((p)+2))
+#define	U64GET(p)	(((uvlong)U32GET(p)<<32)|(uvlong)U32GET((p)+4))
+
+#define	U8PUT(p,v)	(p)[0]=(v)
+#define	U16PUT(p,v)	(p)[0]=(v)>>8;(p)[1]=(v)
+#define	U32PUT(p,v)	(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v)
+#define	U48PUT(p,v,t32)	t32=(v)>>32;U16PUT(p,t32);t32=(v);U32PUT((p)+2,t32)
+#define	U64PUT(p,v,t32)	t32=(v)>>32;U32PUT(p,t32);t32=(v);U32PUT((p)+4,t32)
+
+/*
+ * Unpack a string stored as a 2-byte length followed by that many
+ * bytes from the buffer at *p, which has *n bytes left.
+ * On success *s is a newly allocated NUL-terminated copy (the caller
+ * owns it), *p and *n are advanced past the string, and 1 is returned.
+ * Returns 0, allocating nothing, if the buffer is too short.
+ */
+static int
+stringUnpack(char **s, uchar **p, int *n)
+{
+	int nn;
+
+	if(*n < 2)
+		return 0;
+	
+	nn = U16GET(*p);
+	*p += 2;
+	*n -= 2;
+	if(nn > *n)
+		return 0;
+	*s = vtMemAlloc(nn+1);
+	memmove(*s, *p, nn);
+	(*s)[nn] = 0;
+	*p += nn;
+	*n -= nn;
+	return 1;
+}
+
+/*
+ * Pack s at p as a 2-byte length followed by the bytes of s, without
+ * the terminating NUL.  Returns the number of bytes written.
+ * No bounds are checked; the caller must provide enough room.
+ */
+static int
+stringPack(char *s, uchar *p)
+{
+	int len;
+
+	len = strlen(s);
+	U16PUT(p, len);
+	memmove(p+2, s, len);
+	return 2+len;
+}
+
+/*
+ * Binary search the index of mb for the entry named elem.
+ * If found, *ri is its index, *me is the unpacked entry, and 1 is
+ * returned.  Otherwise *ri is the index at which elem would be
+ * inserted, *me is zeroed, the error is set to ENoFile, and 0 is
+ * returned.  Blocks marked botch are compared with meCmpOld.
+ */
+int
+mbSearch(MetaBlock *mb, char *elem, int *ri, MetaEntry *me)
+{
+	int i;
+	int b, t, x;
+if(0)fprint(2, "mbSearch %s\n", elem);
+
+	/* binary search within block */
+	b = 0;
+	t = mb->nindex;
+	while(b < t){
+		i = (b+t)>>1;
+		meUnpack(me, mb, i);
+
+		if(mb->botch)
+			x = meCmpOld(me, elem);
+		else
+			x = meCmp(me, elem);
+
+		if(x == 0){
+			*ri = i;
+			return 1;
+		}
+	
+		if(x < 0)
+			b = i+1;
+		else /* x > 0 */
+			t = i;
+	}
+
+	assert(b == t);
+	
+	*ri = b;	/* b is the index to insert this entry */
+	memset(me, 0, sizeof(*me));
+
+	vtSetError(ENoFile);
+	return 0;
+}
+
+/*
+ * Initialize mb as an empty meta block over the n-byte buffer p, with
+ * room for ne index entries.  The buffer is zeroed; the used size
+ * starts just past the header and the index table.
+ */
+void
+mbInit(MetaBlock *mb, uchar *p, int n, int ne)
+{
+	memset(p, 0, n);
+
+	mb->buf = p;
+	mb->maxsize = n;
+	mb->size = MetaHeaderSize + ne*MetaIndexSize;
+	mb->free = 0;
+	mb->maxindex = ne;
+	mb->nindex = 0;
+	mb->botch = 0;
+}
+
+/*
+ * Decode the header of the n-byte meta block at p into mb and validate
+ * it.  An empty buffer (n == 0) yields a zeroed mb.
+ * A magic of MetaMagic-1 is accepted and marks the block botch, which
+ * makes mbSearch compare with meCmpOld.
+ * The index table is checked so that meUnpack and meCmp can rely on
+ * it: the index count must fit in the table, and every entry must lie
+ * between the end of the table and mb->size, be at least 8 bytes long,
+ * and start with DirMagic.
+ * Returns 1 on success; on failure returns 0 with error EBadMeta.
+ */
+int
+mbUnpack(MetaBlock *mb, uchar *p, int n)
+{	
+	u32int magic;
+	int i;
+	int eo, en, omin;
+	uchar *q;
+
+	mb->maxsize = n;
+	mb->buf = p;
+
+	if(n == 0){
+		memset(mb, 0, sizeof(MetaBlock));
+		return 1;
+	}
+
+	magic = U32GET(p);
+	if(magic != MetaMagic && magic != MetaMagic-1)
+		goto Err;
+	mb->size = U16GET(p+4);
+	mb->free = U16GET(p+6);
+	mb->maxindex = U16GET(p+8);
+	mb->nindex = U16GET(p+10);
+	mb->botch = magic != MetaMagic;
+	if(mb->size > n)
+		goto Err;
+
+	omin = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+	if(n < omin)
+		goto Err;
+
+	/*
+	 * A corrupt block could claim more index entries than the table
+	 * holds; reject it so the scan below stays inside the table.
+	 */
+	if(mb->nindex > mb->maxindex)
+		goto Err;
+
+	p += MetaHeaderSize;
+
+	/* check the index table - ensures that meUnpack and meCmp never fail */
+	for(i=0; i<mb->nindex; i++){
+		eo = U16GET(p);
+		en = U16GET(p+2);
+		if(eo < omin || eo+en > mb->size || en < 8)
+			goto Err;
+		q = mb->buf + eo;
+		if(U32GET(q) != DirMagic)
+			goto Err;
+		p += MetaIndexSize;
+	}
+
+	return 1;
+Err:
+	vtSetError(EBadMeta);
+	return 0;
+}
+
+
+/*
+ * Write mb's header fields (magic, size, free, maxindex, nindex) back
+ * into the start of its buffer.  Blocks marked botch must not be
+ * packed (asserted).
+ */
+void
+mbPack(MetaBlock *mb)
+{
+	uchar *p;
+
+	p = mb->buf;
+
+	assert(!mb->botch);
+
+	U32PUT(p, MetaMagic);
+	U16PUT(p+4, mb->size);
+	U16PUT(p+6, mb->free);
+	U16PUT(p+8, mb->maxindex);
+	U16PUT(p+10, mb->nindex);
+}
+
+
+/*
+ * Delete index entry i from mb.  The entry's bytes are zeroed; if the
+ * entry was the last thing in the used area the area shrinks,
+ * otherwise the bytes are added to the free count.  The index table is
+ * closed up over the removed slot.
+ */
+void
+mbDelete(MetaBlock *mb, int i)
+{
+	uchar *p;
+	int n;
+	MetaEntry me;
+
+	assert(i < mb->nindex);
+	meUnpack(&me, mb, i);
+	memset(me.p, 0, me.size);
+
+	if(me.p - mb->buf + me.size == mb->size)
+		mb->size -= me.size;
+	else
+		mb->free += me.size;
+
+	/* slide the following index slots down and clear the vacated last one */
+	p = mb->buf + MetaHeaderSize + i*MetaIndexSize;
+	n = (mb->nindex-i-1)*MetaIndexSize;
+	memmove(p, p+MetaIndexSize, n);
+	memset(p+n, 0, MetaIndexSize);
+	mb->nindex--;
+}
+
+/*
+ * Insert me into mb's index table at position i.  me->p must already
+ * point at the entry's space inside mb's buffer.  The size/free
+ * accounting is updated depending on whether the entry extends the
+ * used area or reuses space inside it.  The table must have a spare
+ * slot (asserted).
+ */
+void
+mbInsert(MetaBlock *mb, int i, MetaEntry *me)
+{
+	uchar *p;
+	int o, n;
+
+	assert(mb->nindex < mb->maxindex);
+
+	o = me->p - mb->buf;
+	n = me->size;
+	if(o+n > mb->size){
+		/* entry runs past the used area: only the part inside it came out of free */
+		mb->free -= mb->size - o;
+		mb->size = o + n;
+	}else
+		mb->free -= n;
+
+	/* open up slot i in the index table and fill it in */
+	p = mb->buf + MetaHeaderSize + i*MetaIndexSize;
+	n = (mb->nindex-i)*MetaIndexSize;
+	memmove(p+MetaIndexSize, p, n);
+	U16PUT(p, me->p - mb->buf);
+	U16PUT(p+2, me->size);
+	mb->nindex++;
+}
+
+/*
+ * Change the size of entry me to n bytes.  Shrinking always succeeds.
+ * Growing first tries to extend in place over the zero bytes that
+ * follow the entry, then falls back to mbAlloc for new space (me->p
+ * changes; the old contents are not copied here).
+ * Returns 1 on success, 0 if no space could be found.
+ */
+int
+mbResize(MetaBlock *mb, MetaEntry *me, int n)
+{
+	uchar *p, *ep;
+
+	/* easy case */
+	if(n <= me->size){
+		me->size = n;
+		return 1;
+	}
+
+	/* try and expand entry */
+
+	p = me->p + me->size;
+	ep = mb->buf + mb->maxsize;
+	while(p < ep && *p == 0)
+		p++;
+	if(n <= p - me->p){
+		me->size = n;
+		return 1;
+	}
+
+	p = mbAlloc(mb, n);
+	if(p != nil){
+		me->p = p;
+		me->size = n;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in me with the location and size of index entry i of mb,
+ * as recorded in the index table.  i must be a valid index (asserted).
+ */
+void
+meUnpack(MetaEntry *me, MetaBlock *mb, int i)
+{
+	uchar *p;
+	int eo, en;
+
+	assert(i >= 0 && i < mb->nindex);
+
+	p = mb->buf + MetaHeaderSize + i*MetaIndexSize;
+	eo = U16GET(p);
+	en = U16GET(p+2);
+
+	me->p = mb->buf + eo;
+	me->size = en;
+
+	/* checked by mbUnpack */
+	assert(me->size >= 8);
+}
+
+/*
+ * Compare the name stored in me with s.  Returns a negative value, 0,
+ * or a positive value as the entry's name sorts before, equal to, or
+ * after s; a name that is a prefix of the other sorts first.
+ * The name is a 2-byte length followed by that many bytes, starting 6
+ * bytes into the entry.
+ * Assumes a small amount of checking has already been done on the
+ * entry (mbUnpack checks that each entry is at least 8 bytes long).
+ */
+static int
+meCmp(MetaEntry *me, char *s)
+{
+	int n;
+	uchar *p;
+
+	p = me->p;
+
+	/* skip magic & version */
+	p += 6;
+	n = U16GET(p);
+	p += 2;
+	
+	/* never read past the end of the entry */
+	if(n > me->size - 8)
+		n = me->size - 8;
+
+	while(n > 0){
+		if(*s == 0)
+			return 1;
+		if(*p < (uchar)*s)
+			return -1;
+		if(*p > (uchar)*s)
+			return 1;
+		p++;
+		s++;
+		n--;
+	}
+	return -(*s != 0);
+}
+
+/*
+ * This is the old and broken meCmp.
+ * This cmp routine reverses the sense of the comparison
+ * when one string is a prefix of the other.
+ * In other words, it puts "ab" after "abc" rather
+ * than before.  This behaviour is ok; binary search
+ * and sort still work.  However, it goes against
+ * the usual convention.
+ */
+static int
+meCmpOld(MetaEntry *me, char *s)
+{
+	int n;
+	uchar *p;
+
+	p = me->p;
+
+	/* skip magic & version */
+	p += 6;
+	n = U16GET(p);
+	p += 2;
+	
+	/* never read past the end of the entry */
+	if(n > me->size - 8)
+		n = me->size - 8;
+
+	while(n > 0){
+		if(*s == 0)
+			return -1;
+		if(*p < (uchar)*s)
+			return -1;
+		if(*p > (uchar)*s)
+			return 1;
+		p++;
+		s++;
+		n--;
+	}
+	return *s != 0;
+}
+
+/* qsort comparison function: order MetaChunks by ascending offset */
+static int
+offsetCmp(void *s0, void *s1)
+{
+	MetaChunk *a, *b;
+
+	a = s0;
+	b = s1;
+	if(a->offset != b->offset)
+		return a->offset < b->offset ? -1 : 1;
+	return 0;
+}
+
+/*
+ * Build an array with one MetaChunk per index slot of mb,
+ * sorted by offset within the block, and check that the layout
+ * agrees with mb->size and mb->free.
+ * Returns the array (to be released with vtMemFree) or, if the
+ * block looks inconsistent, prints a diagnostic, sets EBadMeta
+ * and returns nil.
+ */
+static MetaChunk *
+metaChunks(MetaBlock *mb)
+{
+	MetaChunk *mc;
+	int oo, o, n, i;
+	uchar *p;
+
+	/* copy offset/size pairs out of the index table */
+	mc = vtMemAlloc(mb->nindex*sizeof(MetaChunk));
+	p = mb->buf + MetaHeaderSize;
+	for(i = 0; i<mb->nindex; i++){
+		mc[i].offset = U16GET(p);
+		mc[i].size = U16GET(p+2);
+		mc[i].index = i;
+		p += MetaIndexSize;
+	}
+
+	qsort(mc, mb->nindex, sizeof(MetaChunk), offsetCmp);
+
+	/* check block looks ok */
+	/* oo: start of the data area plus the sizes of the chunks seen so far */
+	oo = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+	o = oo;
+	n = 0;
+	for(i=0; i<mb->nindex; i++){
+		o = mc[i].offset;
+		n = mc[i].size;
+		/* a chunk may not start before the space already accounted for */
+		if(o < oo)
+			goto Err;
+		oo += n;
+	}
+	/* the last chunk must end within the used part of the block */
+	if(o+n > mb->size)
+		goto Err;
+	/* whatever is used but not covered by a chunk must equal the free count */
+	if(mb->size - oo != mb->free)
+		goto Err;
+
+	return mc;
+Err:
+fprint(2, "metaChunks failed!\n");
+oo = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+for(i=0; i<mb->nindex; i++){
+fprint(2, "\t%d: %d %d\n", i, mc[i].offset, mc[i].offset + mc[i].size);
+oo += mc[i].size;
+}
+fprint(2, "\tused=%d size=%d free=%d free2=%d\n", oo, mb->size, mb->free, mb->size - oo);
+	vtSetError(EBadMeta);
+	vtMemFree(mc);
+	return nil;
+}
+
+/*
+ * Slide every entry of mb down so the entries are contiguous,
+ * rewriting the offsets in the index table to match.
+ * mc lists the entries in offset order (as produced by metaChunks).
+ * Afterwards mb->size covers exactly the packed data and mb->free is 0.
+ */
+static void
+mbCompact(MetaBlock *mb, MetaChunk *mc)
+{
+	int dst, i;
+	MetaChunk *c;
+
+	dst = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+	for(i=0; i<mb->nindex; i++){
+		c = &mc[i];
+		if(c->offset != dst){
+			memmove(mb->buf + dst, mb->buf + c->offset, c->size);
+			U16PUT(mb->buf + MetaHeaderSize + c->index*MetaIndexSize, dst);
+		}
+		dst += c->size;
+	}
+
+	mb->size = dst;
+	mb->free = 0;
+}
+
+/*
+ * Find room for n bytes inside mb and return a pointer to it,
+ * or nil if there is not enough space (or the block is corrupt).
+ * Tries, in order: the unused tail of the block, a hole between
+ * existing entries, and finally compacting the block.
+ * Apart from compaction, mb->size and mb->free are not updated
+ * here; the space is only located, not recorded as used.
+ */
+uchar *
+mbAlloc(MetaBlock *mb, int n)
+{
+	int i, o;
+	MetaChunk *mc;
+
+	/* off the end */
+	if(mb->maxsize - mb->size >= n)
+		return mb->buf + mb->size;
+
+	/* check if possible */
+	if(mb->maxsize - mb->size + mb->free < n)
+		return nil;
+
+	mc = metaChunks(mb);
+	if(mc == nil){
+fprint(2, "mbAlloc: metaChunks failed: %r\n");
+		return nil;
+	}
+
+	/* look for hole */
+	/* o is the end of the previous chunk (initially the start of the data area) */
+	o = MetaHeaderSize + mb->maxindex*MetaIndexSize;
+	for(i=0; i<mb->nindex; i++){
+		if(mc[i].offset - o >= n){
+			vtMemFree(mc);
+			return mb->buf + o;
+		}
+		o = mc[i].offset + mc[i].size;
+	}
+
+	/* room after the last chunk? */
+	if(mb->maxsize - o >= n){
+		vtMemFree(mc);
+		return mb->buf + o;
+	}
+
+	/* compact and return off the end */
+	mbCompact(mb, mc);
+	vtMemFree(mc);
+
+	/* the earlier free-space check said this should fit; if not, the block is bad */
+	if(mb->maxsize - mb->size < n){
+		vtSetError(EBadMeta);
+		return nil;
+	}
+	return mb->buf + mb->size;
+}
+
+/*
+ * Return the number of bytes dePack will write for dir:
+ * the fixed-size fields, four length-prefixed strings and,
+ * if dir->qidSpace is set, the qid space option.
+ */
+int
+deSize(DirEntry *dir)
+{
+	int n;
+	
+	/* constant part */
+
+	n = 	4 +	/* magic */
+		2 + 	/* version */
+		4 +	/* entry */
+		4 + 	/* gen */
+		4 + 	/* mentry */
+		4 + 	/* mgen */
+		8 +	/* qid */
+		4 + 	/* mtime */
+		4 + 	/* mcount */
+		4 + 	/* ctime */
+		4 + 	/* atime */
+		4 +	/* mode */
+		0;
+
+	/* strings */
+	/* each string costs a 2-byte length plus its bytes */
+	n += 2 + strlen(dir->elem);
+	n += 2 + strlen(dir->uid);
+	n += 2 + strlen(dir->gid);
+	n += 2 + strlen(dir->mid);
+
+	/* optional sections */
+	if(dir->qidSpace){
+		n += 	3 + 	/* option header */
+			8 + 	/* qidOffset */
+			8;	/* qid Max */
+	}
+
+	return n;
+}
+
+/*
+ * Serialize dir into the space described by me, always in
+ * the version 9 layout.  me->size must equal deSize(dir);
+ * the final assert checks that exactly that many bytes were written.
+ */
+void
+dePack(DirEntry *dir, MetaEntry *me)
+{
+	uchar *p;
+	ulong t32;
+
+	p = me->p;
+	
+	U32PUT(p, DirMagic);
+	U16PUT(p+4, 9);		/* version */
+	p += 6;
+
+	p += stringPack(dir->elem, p);
+
+	/* entry, gen, mentry, mgen (4 bytes each) and the 8-byte qid */
+	U32PUT(p, dir->entry);
+	U32PUT(p+4, dir->gen);
+	U32PUT(p+8, dir->mentry);
+	U32PUT(p+12, dir->mgen);
+	U64PUT(p+16, dir->qid, t32);
+	p += 24;
+
+	p += stringPack(dir->uid, p);
+	p += stringPack(dir->gid, p);
+	p += stringPack(dir->mid, p);
+	
+	U32PUT(p, dir->mtime);
+	U32PUT(p+4, dir->mcount);
+	U32PUT(p+8, dir->ctime);
+	U32PUT(p+12, dir->atime);
+	U32PUT(p+16, dir->mode);
+	p += 5*4;
+
+	/* optional section: 1-byte type, 2-byte length, then the payload */
+	if(dir->qidSpace){
+		U8PUT(p, DeQidSpace);
+		U16PUT(p+1, 2*8);
+		p += 3;
+		U64PUT(p, dir->qidOffset, t32);
+		U64PUT(p+8, dir->qidMax, t32);
+		p += 16;
+	}
+
+	assert(p == me->p + me->size);
+}
+
+
+/*
+ * Parse the packed directory entry described by me into dir.
+ * Versions 7, 8 and 9 of the format are accepted.
+ * Returns 1 on success.  On any inconsistency sets EBadMeta,
+ * releases whatever strings were already unpacked and returns 0.
+ * dir->size is always left 0; it comes from the VtEntry instead.
+ */
+int
+deUnpack(DirEntry *dir, MetaEntry *me)
+{
+	int t, nn, n, version;
+	uchar *p;
+	
+	/* p walks the entry; n counts the bytes still unread */
+	p = me->p;
+	n = me->size;
+
+	memset(dir, 0, sizeof(DirEntry));
+
+if(0)print("vdUnpack\n");
+	/* magic */
+	if(n < 4 || U32GET(p) != DirMagic)
+		goto Err;
+	p += 4;
+	n -= 4;
+
+if(0)print("vdUnpack: got magic\n");
+	/* version */
+	if(n < 2)
+		goto Err;
+	version = U16GET(p);
+	if(version < 7 || version > 9)
+		goto Err;
+	p += 2;
+	n -= 2;	
+
+if(0)print("vdUnpack: got version\n");
+
+	/* elem */
+	if(!stringUnpack(&dir->elem, &p, &n))
+		goto Err;
+
+if(0)print("vdUnpack: got elem\n");
+
+	/* entry  */
+	if(n < 4)
+		goto Err;
+	dir->entry = U32GET(p);
+	p += 4;
+	n -= 4;
+
+if(0)print("vdUnpack: got entry\n");
+
+	/* before version 9 these fields are not stored; use fixed defaults */
+	if(version < 9){
+		dir->gen = 0;
+		dir->mentry = dir->entry+1;
+		dir->mgen = 0;
+	}else{
+		if(n < 3*4)
+			goto Err;
+		dir->gen = U32GET(p);
+		dir->mentry = U32GET(p+4);
+		dir->mgen = U32GET(p+8);
+		p += 3*4;
+		n -= 3*4;
+	}
+
+if(0)print("vdUnpack: got gen etc\n");
+
+	/* size is gotten from VtEntry */
+	dir->size = 0;
+
+	/* qid */
+	if(n < 8)
+		goto Err;
+	dir->qid = U64GET(p);
+	p += 8;
+	n -= 8;
+
+if(0)print("vdUnpack: got qid\n");
+	/* skip replacement */
+	/* version 7 only: a score-sized field is present here and ignored */
+	if(version == 7){
+		if(n < VtScoreSize)
+			goto Err;
+		p += VtScoreSize;
+		n -= VtScoreSize;
+	}
+	
+	/* uid */
+	if(!stringUnpack(&dir->uid, &p, &n))
+		goto Err;
+
+	/* gid */
+	if(!stringUnpack(&dir->gid, &p, &n))
+		goto Err;
+
+	/* mid */
+	if(!stringUnpack(&dir->mid, &p, &n))
+		goto Err;
+
+if(0)print("vdUnpack: got ids\n");
+	if(n < 5*4)
+		goto Err;
+	dir->mtime = U32GET(p);
+	dir->mcount = U32GET(p+4);
+	dir->ctime = U32GET(p+8);
+	dir->atime = U32GET(p+12);
+	dir->mode = U32GET(p+16);
+	p += 5*4;
+	n -= 5*4;
+
+if(0)print("vdUnpack: got times\n");
+	/* optional meta data */
+	/* each option: 1-byte type t, 2-byte length nn, then nn bytes */
+	while(n > 0){
+		if(n < 3)
+			goto Err;
+		t = p[0];
+		nn = U16GET(p+1);
+		p += 3;
+		n -= 3;
+		if(n < nn)
+			goto Err;
+		switch(t){
+		case DePlan9:
+			/* not valid in version >= 9 */
+			if(version >= 9)
+				break;
+			if(dir->plan9 || nn != 12)
+				goto Err;
+			dir->plan9 = 1;
+			dir->p9path = U64GET(p);
+			dir->p9version = U32GET(p+8);
+			if(dir->mcount == 0)
+				dir->mcount = dir->p9version;
+			break;
+		case DeGen:
+			/* not valid in version >= 9 */
+			if(version >= 9)
+				break;
+			break;
+		case DeQidSpace:
+			if(dir->qidSpace || nn != 16)
+				goto Err;
+			dir->qidSpace = 1;
+			dir->qidOffset = U64GET(p);
+			dir->qidMax = U64GET(p+8);
+			break;
+		}
+		/* unrecognized option types are skipped, not rejected */
+		p += nn;
+		n -= nn;
+	}
+if(0)print("vdUnpack: got options\n");
+
+	/* everything in the entry must have been consumed exactly */
+	if(p != me->p + me->size)
+		goto Err;
+
+if(0)print("vdUnpack: correct size\n");
+	return 1;
+Err:
+if(0)print("vdUnpack: XXXXXXXXXXXX EbadMeta\n");
+	vtSetError(EBadMeta);
+	deCleanup(dir);
+	return 0;
+}
+
+/*
+ * Release the four strings owned by dir and reset the
+ * pointers to nil, so that calling it again is harmless.
+ */
+void
+deCleanup(DirEntry *dir)
+{
+	char **field[4];
+	int i;
+
+	field[0] = &dir->elem;
+	field[1] = &dir->uid;
+	field[2] = &dir->gid;
+	field[3] = &dir->mid;
+	for(i=0; i<4; i++){
+		vtMemFree(*field[i]);
+		*field[i] = nil;
+	}
+}
+
+/*
+ * Copy src into dst, giving dst its own duplicates of the
+ * four strings so the two entries can be cleaned up independently.
+ */
+void
+deCopy(DirEntry *dst, DirEntry *src)
+{
+	char *elem, *uid, *gid, *mid;
+
+	elem = vtStrDup(src->elem);
+	uid = vtStrDup(src->uid);
+	gid = vtStrDup(src->gid);
+	mid = vtStrDup(src->mid);
+
+	*dst = *src;
+	dst->elem = elem;
+	dst->uid = uid;
+	dst->gid = gid;
+	dst->mid = mid;
+}

+ 107 - 0
sys/src/cmd/fossil/vac.h

@@ -0,0 +1,107 @@
+typedef struct DirEntry DirEntry;
+typedef struct MetaBlock MetaBlock;
+typedef struct MetaEntry MetaEntry;
+
+/* magic numbers and fixed sizes of the packed meta data format */
+enum {
+	MetaMagic = 0x5656fc7a,
+	MetaHeaderSize = 12,	/* bytes before the index table in a meta block */
+	MetaIndexSize = 4,	/* bytes per index slot: 2-byte offset, 2-byte size */
+	IndexEntrySize = 8,
+	DirMagic = 0x1c4d9072,	/* first four bytes of a packed DirEntry */
+};
+
+/*
+ * Mode bits, stored in DirEntry.mode.
+ * The low nine bits are exec/write/read permission
+ * for other, group and owner.
+ */
+enum {
+	ModeOtherExec = (1<<0),		
+	ModeOtherWrite = (1<<1),
+	ModeOtherRead = (1<<2),
+	ModeGroupExec = (1<<3),
+	ModeGroupWrite = (1<<4),
+	ModeGroupRead = (1<<5),
+	ModeOwnerExec = (1<<6),
+	ModeOwnerWrite = (1<<7),
+	ModeOwnerRead = (1<<8),
+	ModeSticky = (1<<9),
+	ModeSetUid = (1<<10),
+	ModeSetGid = (1<<11),
+	ModeAppend = (1<<12),		/* append only file */
+	ModeExclusive = (1<<13),	/* lock file - plan 9 */
+	ModeLink = (1<<14),		/* sym link */
+	ModeDir	= (1<<15),		/* duplicate of DirEntry */
+	ModeHidden = (1<<16),		/* MS-DOS */
+	ModeSystem = (1<<17),		/* MS-DOS */
+	ModeArchive = (1<<18),		/* MS-DOS */
+	ModeTemporary = (1<<19),	/* MS-DOS */
+	ModeSnapshot = (1<<20),		/* read only snapshot */
+};
+
+/*
+ * optional directory entry fields:
+ * the type byte that introduces each optional section
+ * at the end of a packed DirEntry (values 1 through 4)
+ */
+enum {
+	DePlan9 = 1,	/* not valid in version >= 9 */
+	DeNT,		/* not valid in version >= 9 */
+	DeQidSpace,
+	DeGen,		/* not valid in version >= 9 */
+};
+
+/*
+ * Unpacked form of a directory entry.
+ * The four strings are allocated by deUnpack/deCopy
+ * and released by deCleanup.
+ */
+struct DirEntry {
+	char *elem;		/* path element */
+	ulong entry;		/* entry in directory for data */
+	ulong gen;		/* generation of data entry */
+	ulong mentry;		/* entry in directory for meta */
+	ulong mgen;		/* generation of meta entry */
+	uvlong size;		/* size of file */
+	uvlong qid;		/* unique file id */
+	
+	char *uid;		/* owner id */
+	char *gid;		/* group id */
+	char *mid;		/* last modified by */
+	ulong mtime;		/* last modified time */
+	ulong mcount;		/* number of modifications: can wrap! */
+	ulong ctime;		/* directory entry last changed */
+	ulong atime;		/* last time accessed */
+	ulong mode;		/* various mode bits */
+
+	/* plan 9 */
+	int plan9;		/* set when a DePlan9 section was unpacked */
+	uvlong p9path;
+	ulong p9version;
+
+	/* sub space of qid */
+	int qidSpace;		/* set when a DeQidSpace section is present */
+	uvlong qidOffset;	/* qid offset */
+	uvlong qidMax;		/* qid maximum */
+};
+
+/* location of one packed directory entry inside a MetaBlock's buffer */
+struct MetaEntry {
+	uchar *p;		/* start of the packed entry */
+	ushort size;		/* its length in bytes */
+};
+
+/*
+ * In-memory description of a block of packed directory entries:
+ * a header, an index table of maxindex slots, then the entries.
+ */
+struct MetaBlock {
+	int maxsize;		/* size of block */
+	int size;		/* size used */
+	int free;		/* free space within used size */
+	int maxindex;		/* entries allocated for table */
+	int nindex;		/* amount of table used */
+	int botch;		/* compensate for my stupidity */
+	uchar *buf;		/* the block's data; not owned by this struct as far as this file shows */
+};
+
+/* DirEntry: free strings, deep copy, packed size, pack, unpack */
+void	deCleanup(DirEntry*);
+void	deCopy(DirEntry*, DirEntry*);
+int	deSize(DirEntry*);
+void	dePack(DirEntry*, MetaEntry*);
+int	deUnpack(DirEntry*, MetaEntry*);
+
+/* MetaBlock: construction, index maintenance and space management */
+void	mbInit(MetaBlock*, uchar*, int, int);
+int	mbUnpack(MetaBlock*, uchar*, int);
+void	mbInsert(MetaBlock*, int, MetaEntry*);
+void	mbDelete(MetaBlock*, int);
+void	mbPack(MetaBlock*);
+uchar	*mbAlloc(MetaBlock*, int);
+int	mbResize(MetaBlock*, MetaEntry*, int);
+int	mbSearch(MetaBlock*, char*, int*, MetaEntry*);
+
+/* MetaEntry: load the i'th index slot of a MetaBlock */
+void	meUnpack(MetaEntry*, MetaBlock*, int);

+ 1127 - 0
sys/src/cmd/fossil/view.c

@@ -0,0 +1,1127 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include <draw.h>
+#include <event.h>
+
+/* --- tree.h */
+typedef struct Tree Tree;
+typedef struct Tnode Tnode;
+
+/* the tree being displayed */
+struct Tree
+{
+	Tnode *root;		/* top node */
+	Point offset;		/* scroll offset added to the window origin when drawing */
+	Image *clipr;		/* image whose clip rectangle is set to the drawing area */
+};
+
+/*
+ * One node of the displayed tree.
+ * nkid is -1 until the children have been generated by expand.
+ */
+struct Tnode
+{
+	Point offset;		/* screen position recorded by drawnode */
+
+	char *str;		/* label; may contain newlines */
+//	char *(*strfn)(Tnode*);
+//	uint (*draw)(Tnode*, Image*, Image*, Point);
+	void (*expand)(Tnode*);	/* fills in kid/nkid on first expansion */
+	void (*collapse)(Tnode*);
+
+	uint expanded;		/* nonzero when children are shown */
+	Tnode **kid;
+	int nkid;
+	void *aux;
+};
+
+typedef struct Atree Atree;
+/* result of atreeinit: the root node plus an optional event file descriptor */
+struct Atree
+{
+	int resizefd;		/* atreeinit sets this to -1; main starts an event source on it if > 0 */
+	Tnode *root;
+};
+
+/* build the tree for a fossil file name or a "vac:score" argument */
+Atree *atreeinit(char*);
+
+/* --- visfossil.c */
+/* constructors for the different kinds of tree node */
+Tnode *initxheader(void);
+Tnode *initxcache(char *name);
+Tnode *initxsuper(void);
+Tnode *initxlocalroot(char *name, u32int addr);
+Tnode *initxentry(Entry);
+Tnode *initxsource(Entry, int);
+Tnode *initxentryblock(Block*, Entry*);
+Tnode *initxdatablock(Block*, uint);
+Tnode *initxroot(char *name, uchar[VtScoreSize]);
+
+int fd;			/* open fossil file, set by initxcache */
+Header h;		/* unpacked by initxheader; blockSize is overridden by initxvacroot */
+Super super;		/* unpacked by initxsuper */
+VtSession *z;		/* venti session; nil if the connection failed */
+VtRoot vac;		/* unpacked by initxvacroot */
+int showinactive;	/* -a flag: also list inactive entries */
+
+/*
+ * dumbed down versions of fossil routines
+ */
+/*
+ * Format a block state for display.  Free and Bad are reported by
+ * name; anything else is the hex value followed by the names of the
+ * flags that apply.  The result lives in a static buffer that is
+ * overwritten by the next call.
+ */
+char*
+bsStr(int state)
+{
+	static char s[100];
+	static struct {
+		int bit;
+		char *name;
+	} flag[] = {
+		{ BsCopied,	",Copied" },
+		{ BsVenti,	",Venti" },
+		{ BsClosed,	",Closed" },
+	};
+	int i;
+
+	if(state == BsFree)
+		return "Free";
+	if(state == BsBad)
+		return "Bad";
+
+	sprint(s, "%x", state);
+	if((state&BsAlloc) == 0)
+		strcat(s, ",Free");	/* should not happen */
+	for(i=0; i<3; i++)
+		if(state&flag[i].bit)
+			strcat(s, flag[i].name);
+	return s;
+}
+
+/* display names for block types, indexed by type: BtData+depth, then BtDir+depth */
+char *bttab[] = {
+	"BtData",
+	"BtData+1",
+	"BtData+2",
+	"BtData+3",
+	"BtData+4",
+	"BtData+5",
+	"BtData+6",
+	"BtData+7",
+	"BtDir",
+	"BtDir+1",
+	"BtDir+2",
+	"BtDir+3",
+	"BtDir+4",
+	"BtDir+5",
+	"BtDir+6",
+	"BtDir+7",
+};
+
+/* name of a block type, or "unknown" if it is outside bttab */
+char*
+btStr(int type)
+{
+	if(type >= nelem(bttab))
+		return "unknown";
+	return bttab[type];
+}
+#pragma varargck argpos stringnode 1
+
+/*
+ * Allocate a zeroed Block with h.blockSize bytes of data
+ * stored directly after the header.  Release with blockPut.
+ * Callers do not check the result, so running out of memory
+ * is fatal here rather than a nil dereference later.
+ */
+Block*
+allocBlock(void)
+{
+	Block *b;
+
+	b = mallocz(sizeof(Block)+h.blockSize, 1);
+	if(b == nil)
+		sysfatal("allocBlock: out of memory: %r");
+	b->data = (void*)&b[1];
+	return b;
+}
+
+/*
+ * Release a block obtained from allocBlock.
+ * The data area is part of the same allocation,
+ * so a single free suffices.
+ */
+void
+blockPut(Block *b)
+{
+	free(b);
+}
+
+/* first block number of a partition, taken from the file header */
+static u32int
+partStart(int part)
+{
+	if(part == PartLabel)
+		return h.label;
+	if(part == PartData)
+		return h.data;
+	if(part != PartSuper)
+		assert(0);
+	return h.super;
+}
+
+
+/* block number one past the end of a partition, taken from the file header */
+static u32int
+partEnd(int part)
+{
+	if(part == PartLabel)
+		return h.data;
+	if(part == PartData)
+		return h.end;
+	if(part != PartSuper)
+		assert(0);
+	return h.super+1;
+}
+
+/*
+ * Read block addr (relative to the start of partition part)
+ * from the fossil file.  Returns a freshly allocated Block,
+ * or nil with the error string set.
+ */
+Block*
+readBlock(int part, u32int addr)
+{
+	u32int lo, hi;
+	u64int off;
+	int left, got;
+	Block *b;
+	uchar *dst;
+
+	lo = partStart(part);
+	hi = partEnd(part);
+	if(addr >= hi-lo){
+		werrstr("bad addr 0x%.8ux; wanted 0x%.8ux - 0x%.8ux", addr, lo, hi);
+		return nil;
+	}
+
+	b = allocBlock();
+	b->addr = addr;
+	dst = b->data;
+	off = ((u64int)(addr+lo))*h.blockSize;
+
+	/* keep reading until the whole block has arrived */
+	for(left = h.blockSize; left > 0; left -= got){
+		got = pread(fd, dst, left, off);
+		if(got <= 0){
+			if(got == 0)
+				werrstr("short read");
+			blockPut(b);
+			return nil;
+		}
+		off += got;
+		dst += got;
+	}
+	return b;
+}
+
+/*
+ * venti block type to request for each fossil block type.
+ * Pointer blocks use the same venti types whether they
+ * belong to a data or a directory tree.
+ */
+int vtType[BtMax] = {
+	VtDataType,		/* BtData | 0  */
+	VtPointerType0,		/* BtData | 1  */
+	VtPointerType1,		/* BtData | 2  */
+	VtPointerType2,		/* BtData | 3  */
+	VtPointerType3,		/* BtData | 4  */
+	VtPointerType4,		/* BtData | 5  */
+	VtPointerType5,		/* BtData | 6  */
+	VtPointerType6,		/* BtData | 7  */
+	VtDirType,		/* BtDir | 0  */
+	VtPointerType0,		/* BtDir | 1  */
+	VtPointerType1,		/* BtDir | 2  */
+	VtPointerType2,		/* BtDir | 3  */
+	VtPointerType3,		/* BtDir | 4  */
+	VtPointerType4,		/* BtDir | 5  */
+	VtPointerType5,		/* BtDir | 6  */
+	VtPointerType6,		/* BtDir | 7  */
+};
+
+/*
+ * Fetch the block with the given score from venti and
+ * zero-extend it to h.blockSize.  type is a fossil block type.
+ * Returns nil on failure.
+ *
+ * atreeinit leaves z nil when venti cannot be reached, and type is
+ * derived from on-disk data, so both are checked before use.
+ */
+Block*
+ventiBlock(uchar score[VtScoreSize], uint type)
+{
+	int n;
+	Block *b;
+
+	if(z == nil){
+		werrstr("no venti connection");
+		return nil;
+	}
+	if(type >= nelem(vtType)){
+		werrstr("bad block type %ud", type);
+		return nil;
+	}
+
+	b = allocBlock();
+	memmove(b->score, score, VtScoreSize);
+	b->addr = NilBlock;
+
+	n = vtRead(z, b->score, vtType[type], b->data, h.blockSize);
+	if(n < 0){
+		fprint(2, "vtRead returns %d: %R\n", n);
+		blockPut(b);
+		return nil;
+	}
+	vtZeroExtend(vtType[type], b->data, n, h.blockSize);
+
+	/* venti blocks have no label on disk; fill in a blank one */
+	b->l.type = type;
+	b->l.state = 0;
+	b->l.tag = 0;
+	b->l.epoch = 0;
+	return b;
+}
+
+/*
+ * Fetch the block named by score.  Scores that do not map to a
+ * local address are read from venti.  Local blocks are read from
+ * the data partition after checking their label: the type must
+ * match, and so must the tag if tag is nonzero.
+ * Returns nil with the error string set on failure.
+ */
+Block*
+dataBlock(uchar score[VtScoreSize], uint type, uint tag)
+{
+	Block *b, *lb;
+	Label lab;
+	u32int a;
+	int perblock;
+
+	a = globalToLocal(score);
+	if(a == NilBlock)
+		return ventiBlock(score, type);
+
+	/* the label lives in the label partition, perblock labels to a block */
+	perblock = h.blockSize/LabelSize;
+	lb = readBlock(PartLabel, a/perblock);
+	if(lb == nil)
+		return nil;
+	if(!labelUnpack(&lab, lb->data, a%perblock)){
+		werrstr("%R");
+		blockPut(lb);
+		return nil;
+	}
+	blockPut(lb);
+
+	if(lab.type != type){
+		werrstr("type mismatch; got %d (%s) wanted %d (%s)",
+			lab.type, btStr(lab.type), type, btStr(type));
+		return nil;
+	}
+	if(tag && lab.tag != tag){
+		werrstr("tag mismatch; got 0x%.8ux wanted 0x%.8ux",
+			lab.tag, tag);
+		return nil;
+	}
+
+	b = readBlock(PartData, a);
+	if(b != nil)
+		b->l = lab;
+	return b;
+}
+
+/* return a heap-allocated copy of e */
+Entry*
+copyEntry(Entry e)
+{
+	Entry *dup;
+
+	dup = mallocz(sizeof(Entry), 1);
+	*dup = e;
+	return dup;
+}
+
+/* return a heap-allocated copy of mb (the buffer it points at is shared) */
+MetaBlock*
+copyMetaBlock(MetaBlock mb)
+{
+	MetaBlock *dup;
+
+	dup = mallocz(sizeof(MetaBlock), 1);
+	*dup = mb;
+	return dup;
+}
+
+/*
+ * visualizer 
+ */
+
+/*
+ * Make a plain node whose label is the formatted string.
+ * nkid starts at -1 and no expand function is set, so unless
+ * the caller changes that the node is drawn without a nub.
+ */
+Tnode*
+stringnode(char *fmt, ...)
+{
+	va_list arg;
+	Tnode *node;
+
+	node = mallocz(sizeof *node, 1);
+	va_start(arg, fmt);
+	node->str = vsmprint(fmt, arg);
+	va_end(arg);
+	node->nkid = -1;
+	return node;
+}
+
+/* expand the cache node: its only child is the file header */
+void
+xcacheexpand(Tnode *t)
+{
+	Tnode **kids;
+
+	if(t->nkid >= 0)
+		return;	/* already expanded */
+
+	kids = mallocz(sizeof(Tnode*), 1);
+	kids[0] = initxheader();
+	t->kid = kids;
+	t->nkid = 1;
+}
+
+/*
+ * Open the fossil file name for reading (into the global fd)
+ * and return the root node for it.  Exits if the open fails.
+ */
+Tnode*
+initxcache(char *name)
+{
+	Tnode *node;
+
+	fd = open(name, OREAD);
+	if(fd < 0)
+		sysfatal("cannot open %s: %r", name);
+
+	node = stringnode("%s", name);
+	node->expand = xcacheexpand;
+	return node;
+}
+
+/* expand the header node: its only child is the super block */
+void
+xheaderexpand(Tnode *t)
+{
+	Tnode **kids;
+
+	if(t->nkid >= 0)
+		return;	/* already expanded */
+
+	kids = mallocz(sizeof(Tnode*), 1);
+	kids[0] = initxsuper();
+	//kids[1] = initxlabel(h.label);
+	//kids[2] = initxdata(h.data);
+	t->kid = kids;
+	t->nkid = 1;
+}
+
+/*
+ * Read and unpack the fossil header into the global h and
+ * return a node describing it, or a node holding the error.
+ */
+Tnode*
+initxheader(void)
+{
+	u8int hdr[HeaderSize];
+	Tnode *node;
+
+	if(pread(fd, hdr, HeaderSize, HeaderOffset) < HeaderSize)
+		return stringnode("error reading header: %r");
+	if(!headerUnpack(&h, hdr))
+		return stringnode("error unpacking header: %R");
+
+	node = stringnode("header "
+		"version=%#ux (%d) "
+		"blockSize=%#ux (%d) "
+		"super=%#lux (%ld) "
+		"label=%#lux (%ld) "
+		"data=%#lux (%ld) "
+		"end=%#lux (%ld)",
+		h.version, h.version, h.blockSize, h.blockSize,
+		h.super, h.super,
+		h.label, h.label, h.data, h.data, h.end, h.end);
+	node->expand = xheaderexpand;
+	return node;
+}
+
+/* expand the super block node: its only child is the active root */
+void
+xsuperexpand(Tnode *t)
+{
+	Tnode **kids;
+
+	if(t->nkid >= 0)
+		return;	/* already expanded */
+
+	kids = mallocz(sizeof(Tnode*), 1);
+	kids[0] = initxlocalroot("active", super.active);
+//	kids[1] = initxlocalroot("next", super.next);
+//	kids[2] = initxlocalroot("current", super.current);
+	t->kid = kids;
+	t->nkid = 1;
+}
+
+/*
+ * Read and unpack the super block into the global super and
+ * return a node describing it, or a node holding the error.
+ */
+Tnode*
+initxsuper(void)
+{
+	Block *sb;
+	Tnode *node;
+	int ok;
+
+	sb = readBlock(PartSuper, 0);
+	if(sb == nil)
+		return stringnode("reading super: %r");
+	ok = superUnpack(&super, sb->data);
+	blockPut(sb);
+	if(!ok)
+		return stringnode("unpacking super: %R");
+
+	node = stringnode("super "
+		"version=%#ux "
+		"epoch=[%#ux,%#ux) "
+		"qid=%#llux "
+		"active=%#x "
+		"next=%#x "
+		"current=%#x "
+		"last=%V "
+		"name=%s",
+		super.version, super.epochLow, super.epochHigh,
+		super.qid, super.active, super.next, super.current,
+		super.last, super.name);
+	node->expand = xsuperexpand;
+	return node;
+}
+
+/* expand the vac root node: its only child is the root directory block */
+void
+xvacrootexpand(Tnode *t)
+{
+	Tnode **kids;
+
+	if(t->nkid >= 0)
+		return;	/* already expanded */
+
+	kids = mallocz(sizeof(Tnode*), 1);
+	kids[0] = initxroot("root", vac.score);
+	t->kid = kids;
+	t->nkid = 1;
+}
+
+/*
+ * Read the vac root block with the given score from venti,
+ * unpack it into the global vac and return a node describing it.
+ * h.blockSize is set from the root so later block reads use the
+ * archive's block size.  On failure a node holding the error
+ * is returned instead.
+ */
+Tnode*
+initxvacroot(uchar score[VtScoreSize])
+{
+	Tnode *t;
+	uchar buf[VtRootSize];
+	int n;
+
+	/* atreeinit leaves z nil when venti could not be reached */
+	if(z == nil)
+		return stringnode("reading root %V: no venti connection", score);
+
+	if((n = vtRead(z, score, VtRootType, buf, VtRootSize)) < 0)
+		return stringnode("reading root %V: %R", score);
+
+	if(!vtRootUnpack(&vac, buf))
+		return stringnode("unpack %d-byte root: %R", n);
+
+	h.blockSize = vac.blockSize;
+	t = stringnode("vac version=%#ux name=%s type=%s blockSize=%ud score=%V prev=%V",
+		vac.version, vac.name, vac.type, vac.blockSize, vac.score, vac.prev);
+	t->expand = xvacrootexpand;
+	return t;
+}
+
+/* make a leaf node describing a block label */
+Tnode*
+initxlabel(Label l)
+{
+	char *type, *state;
+
+	type = btStr(l.type);
+	state = bsStr(l.state);
+	return stringnode("label type=%s state=%s epoch=%#ux tag=%#ux",
+		type, state, l.epoch, l.tag);
+}
+
+typedef struct Xblock Xblock;
+/*
+ * Tree node for a block.  The Tnode is embedded first so an
+ * Xblock can be used wherever a Tnode is expected.
+ */
+struct Xblock
+{
+	Tnode;
+	Block *b;		/* the block being shown */
+	/*
+	 * child generator, called with (arg, b, i, &node) for i = 0, 1, ...:
+	 * returns 1 after storing a node, 0 to skip index i, -1 when done
+	 */
+	int (*gen)(void*, Block*, int, Tnode**);
+	void *arg;
+	int printlabel;		/* if set, the first child shows the block's label */
+};
+
+/*
+ * Expand a block node: optionally add a child for the label,
+ * then call the generator with increasing indices until it
+ * returns -1, collecting the nodes it produces.
+ */
+void
+xblockexpand(Tnode *tt)
+{
+	int i, j;
+	enum { Q = 32 };	/* the kid array grows Q slots at a time */
+	Xblock *t = (Xblock*)tt;
+	Tnode *nn;
+
+	if(t->nkid >= 0)
+		return;
+
+	/* j counts the children stored so far */
+	j = 0;
+	if(t->printlabel){
+		t->kid = mallocz(Q*sizeof(t->kid[0]), 1);
+		t->kid[0] = initxlabel(t->b->l);
+		j = 1;
+	}
+
+	for(i=0;; i++){
+		switch((*t->gen)(t->arg, t->b, i, &nn)){
+		case -1:
+			/* generator exhausted */
+			t->nkid = j;
+			return;
+		case 0:
+			/* nothing to show for this index */
+			break;
+		case 1:
+			/* array is full whenever j is a multiple of Q (also covers kid == nil at j == 0) */
+			if(j%Q == 0)
+				t->kid = realloc(t->kid, (j+Q)*sizeof(t->kid[0]));
+			t->kid[j++] = nn;
+			break;
+		}
+	}
+}
+
+/* generator for blocks without children: reports the end at once */
+int
+nilgen(void*, Block*, int, Tnode**)
+{
+	return -1;
+}
+
+/*
+ * Make a node for block b labelled with its address (or score,
+ * for venti blocks) and the description s.  gen and arg produce
+ * the children on expansion; a nil gen means no children.
+ * s is formatted into the label, not retained.
+ */
+Tnode*
+initxblock(Block *b, char *s, int (*gen)(void*, Block*, int, Tnode**), void *arg)
+{
+	Xblock *xb;
+
+	xb = mallocz(sizeof *xb, 1);
+	xb->b = b;
+	xb->gen = gen != nil ? gen : nilgen;
+	xb->arg = arg;
+	xb->printlabel = 1;
+	xb->nkid = -1;
+	xb->expand = xblockexpand;
+	if(b->addr != NilBlock)
+		xb->str = smprint("Block %#ux: %s", b->addr, s);
+	else
+		xb->str = smprint("Block %V: %s", b->score, s);
+	return xb;
+}
+
+/*
+ * Generator for directory blocks: v is the Entry describing the
+ * directory, o the index of the entry within block b.
+ * Inactive entries are skipped unless showinactive is set.
+ */
+int
+xentrygen(void *v, Block *b, int o, Tnode **tp)
+{
+	Entry *dir;
+	Entry ent;
+
+	dir = v;
+	if(o >= dir->dsize/VtEntrySize)
+		return -1;
+
+	entryUnpack(&ent, b->data, o);
+	if((ent.flags & VtEntryActive) || showinactive){
+		*tp = initxentry(ent);
+		return 1;
+	}
+	return 0;
+}
+
+/* make a node for a block of directory entries described by ed */
+Tnode*
+initxentryblock(Block *b, Entry *ed)
+{
+	Tnode *node;
+
+	node = initxblock(b, "entry", xentrygen, ed);
+	return node;
+}
+
+typedef struct Xentry Xentry;
+/* tree node for a directory entry; the Tnode is embedded first */
+struct Xentry 
+{
+	Tnode;
+	Entry e;		/* copy of the entry, used when the node is expanded */
+};
+
+/* expand an entry node: its only child is the Source it describes */
+void
+xentryexpand(Tnode *tt)
+{
+	Xentry *xe;
+	Tnode **kids;
+
+	xe = (Xentry*)tt;
+	if(xe->nkid >= 0)
+		return;	/* already expanded */
+
+	kids = mallocz(sizeof(Tnode*), 1);
+	kids[0] = initxsource(xe->e, 1);
+	xe->kid = kids;
+	xe->nkid = 1;
+}
+
+/*
+ * Make a node describing entry e.  Local entries get the
+ * archive, snap and tag fields appended to the label.
+ */
+Tnode*
+initxentry(Entry e)
+{
+	Xentry *t;
+	char *s;
+
+	t = mallocz(sizeof *t, 1);
+	t->nkid = -1;
+	t->str = smprint("Entry gen=%#ux psize=%d dsize=%d depth=%d flags=%#ux size=%lld score=%V",
+		e.gen, e.psize, e.dsize, e.depth, e.flags, e.size, e.score);
+	if(e.flags & VtEntryLocal){
+		/* build the longer label, then release the one it replaces */
+		s = smprint("%s archive=%d snap=%d tag=%#ux", t->str, e.archive, e.snap, e.tag);
+		free(t->str);
+		t->str = s;
+	}
+	t->expand = xentryexpand;
+	t->e = e;
+	return t;	
+}
+
+/*
+ * Generator for pointer blocks: v is the Entry the block belongs
+ * to, o the index of the score within block b.  Each non-zero
+ * score becomes a child one level further down the tree.
+ */
+int
+ptrgen(void *v, Block *b, int o, Tnode **tp)
+{
+	Entry *parent;
+	Entry child;
+
+	parent = v;
+	if(o >= parent->psize/VtScoreSize)
+		return -1;
+
+	child = *parent;
+	child.depth--;
+	memmove(child.score, b->data+o*VtScoreSize, VtScoreSize);
+	if(memcmp(child.score, vtZeroScore, VtScoreSize) != 0){
+		*tp = initxsource(child, 0);
+		return 1;
+	}
+	return 0;
+}
+
+/* block type for an entry with the given flags at the given tree depth */
+static int
+etype(int flags, int depth)
+{
+	uint base;
+
+	base = (flags&VtEntryDir) ? BtDir : BtData;
+	return base+depth;
+}
+
+/*
+ * Make a node for the block at the top of the tree described
+ * by e: an entry block or data block at depth 0, a pointer block
+ * otherwise.  If dowrap is set the result is wrapped in a
+ * "Source" node.  Errors are returned as string nodes.
+ */
+Tnode*
+initxsource(Entry e, int dowrap)
+{
+	Block *b;
+	Tnode *t, *tt;
+	char *s;
+
+	b = dataBlock(e.score, etype(e.flags, e.depth), e.tag);
+	if(b == nil)
+		return stringnode("dataBlock: %r");
+
+	if((e.flags & VtEntryActive) == 0){
+		/* no node will own the block, so release it here */
+		blockPut(b);
+		return stringnode("inactive Entry");
+	}
+
+	if(e.depth == 0){
+		if(e.flags & VtEntryDir)
+			tt = initxentryblock(b, copyEntry(e));
+		else
+			tt = initxdatablock(b, e.dsize);
+	}else{
+		/* initxblock formats s into its own label, so s can be freed */
+		s = smprint("%s+%d pointer", (e.flags & VtEntryDir) ? "BtDir" : "BtData", e.depth);
+		tt = initxblock(b, s, ptrgen, copyEntry(e));
+		free(s);
+	}
+
+	/*
+	 * wrap the contents of the Source in a Source node,
+	 * just so it's closer to what you see in the code.
+	 */
+	if(dowrap){
+		t = stringnode("Source");
+		t->nkid = 1;
+		t->kid = mallocz(sizeof(Tnode*)*1, 1);
+		t->kid[0] = tt;
+		tt = t;
+	}
+	return tt;
+}
+
+/* generator for a local root block: exactly one entry, at index 0 */
+int
+xlocalrootgen(void*, Block *b, int o, Tnode **tp)
+{
+	Entry root;
+
+	if(o > 0)
+		return -1;
+	entryUnpack(&root, b->data, o);
+	*tp = initxentry(root);
+	return 1;
+}
+
+/*
+ * Make a node for the root block of the file system stored at
+ * local address addr; name is used only in the label.
+ */
+Tnode*
+initxlocalroot(char *name, u32int addr)
+{
+	uchar score[VtScoreSize];
+	Block *b;
+	Tnode *t;
+	char *s;
+
+	localToGlobal(addr, score);
+	b = dataBlock(score, BtDir, RootTag);
+	if(b == nil){
+		/* for local addresses dataBlock reports failures through werrstr */
+		return stringnode("read data block %#ux: %r", addr);
+	}
+
+	/* initxblock formats s into its own label, so s can be freed */
+	s = smprint("'%s' fs root", name);
+	t = initxblock(b, s, xlocalrootgen, nil);
+	free(s);
+	return t;
+}
+
+/* generator for a vac root block: the entries at indices 0, 1 and 2 */
+int
+xvacrootgen(void*, Block *b, int o, Tnode **tp)
+{
+	Entry ent;
+
+	if(o > 2)
+		return -1;
+	entryUnpack(&ent, b->data, o);
+	*tp = initxentry(ent);
+	return 1;
+}
+
+/*
+ * Make a node for the root directory block with the given
+ * score; name is used only in the label.
+ */
+Tnode*
+initxroot(char *name, uchar score[VtScoreSize])
+{
+	Block *b;
+	Tnode *t;
+	char *s;
+
+	b = dataBlock(score, BtDir, RootTag);
+	if(b == nil)
+		return stringnode("read data block %V: %R", score);
+
+	/* initxblock formats s into its own label, so s can be freed */
+	s = smprint("'%s' fs root", name);
+	t = initxblock(b, s, xvacrootgen, nil);
+	free(s);
+	return t;
+}
+/*
+ * Unpack the directory entry at me and return a node for it,
+ * with a single child listing the remaining fields.
+ * Returns a string node holding the error if unpacking fails.
+ */
+Tnode*
+initxdirentry(MetaEntry *me)
+{
+	DirEntry dir;
+	Tnode *t;
+
+	if(!deUnpack(&dir, me))
+		return stringnode("deUnpack: %R");
+
+	t = stringnode("dirEntry elem=%s size=%llud data=%#lux/%#lux meta=%#lux/%#lux", dir.elem, dir.size, dir.entry, dir.gen, dir.mentry, dir.mgen);
+	t->nkid = 1;
+	t->kid = mallocz(sizeof(t->kid[0])*1, 1);
+	t->kid[0] = stringnode(
+		"qid=%#llux\n"
+		"uid=%s gid=%s mid=%s\n"
+		"mtime=%lud mcount=%lud ctime=%lud atime=%lud\n"
+		"mode=%luo\n"
+		"plan9 %d p9path %#llux p9version %lud\n"
+		"qidSpace %d offset %#llux max %#llux",
+		dir.qid,
+		dir.uid, dir.gid, dir.mid,
+		dir.mtime, dir.mcount, dir.ctime, dir.atime,
+		dir.mode,
+		dir.plan9, dir.p9path, dir.p9version,
+		dir.qidSpace, dir.qidOffset, dir.qidMax);
+
+	/* the labels hold formatted copies; release the unpacked strings */
+	deCleanup(&dir);
+	return t;
+}
+
+/*
+ * Generator for meta blocks: v is the MetaBlock, o the index of
+ * the entry.  Each entry becomes a node giving its size, with the
+ * unpacked directory entry as its child.
+ */
+int
+metaentrygen(void *v, Block*, int o, Tnode **tp)
+{
+	Tnode *t;
+	MetaBlock *mb;
+	MetaEntry me;
+
+	mb = v;
+	if(o >= mb->nindex)
+		return -1;
+	meUnpack(&me, mb, o);
+
+	/* report the size of this entry, not of the whole block */
+	t = stringnode("MetaEntry %d bytes", me.size);
+	t->kid = mallocz(sizeof(t->kid[0])*1, 1);
+	t->kid[0] = initxdirentry(&me);
+	t->nkid = 1;
+	*tp = t;
+	return 1;
+}
+
+/*
+ * Generator for data blocks that parsed as meta blocks: produces
+ * one child, a node summarizing the MetaBlock v, whose own
+ * children are the entries (via metaentrygen).
+ */
+int
+metablockgen(void *v, Block *b, int o, Tnode **tp)
+{
+	Xblock *t;
+	MetaBlock *mb;
+
+	if(o >= 1)
+		return -1;
+
+	/* hack: reuse initxblock as a generic iterator */
+	mb = v;
+	t = (Xblock*)initxblock(b, "", metaentrygen, mb);
+	/* discard the generic label initxblock made before replacing it */
+	free(t->str);
+	t->str = smprint("MetaBlock %d/%d space used, %d add'l free %d/%d table used%s",
+		mb->size, mb->maxsize, mb->free, mb->nindex, mb->maxindex,
+		mb->botch ? " [BOTCH]" : "");
+	t->printlabel = 0;
+	*tp = t;
+	return 1;
+}
+
+/*
+ * attempt to guess at the type of data in the block.
+ * it could just be data from a file, but we're hoping it's MetaBlocks.
+ */
+/*
+ * Make a node for a data block holding n bytes (clamped to the
+ * block size).  If the contents unpack as a MetaBlock they are
+ * shown as meta data, otherwise as opaque data.
+ */
+Tnode*
+initxdatablock(Block *b, uint n)
+{
+	MetaBlock mb;
+	uint len;
+
+	len = n;
+	if(len > h.blockSize)
+		len = h.blockSize;
+
+	if(!mbUnpack(&mb, b->data, len))
+		return initxblock(b, "data", nil, nil);
+	return initxblock(b, "metadata", metablockgen, copyMetaBlock(mb));
+}
+
+/*
+ * Convert the first 2*VtScoreSize hex digits of buf (n is the
+ * length of buf) into a binary score.  Returns 1 on success,
+ * 0 if buf is too short or contains a non-hex character; score
+ * is cleared first, so on failure it holds only the digits
+ * parsed so far.
+ */
+int
+parseScore(uchar *score, char *buf, int n)
+{
+	int i, ch, digit;
+
+	memset(score, 0, VtScoreSize);
+
+	if(n < VtScoreSize*2)
+		return 0;
+	for(i=0; i<VtScoreSize*2; i++){
+		ch = buf[i];
+		if('0' <= ch && ch <= '9')
+			digit = ch - '0';
+		else if('a' <= ch && ch <= 'f')
+			digit = ch - 'a' + 10;
+		else if('A' <= ch && ch <= 'F')
+			digit = ch - 'A' + 10;
+		else
+			return 0;
+
+		/* even positions are the high nibble of byte i/2 */
+		if(i%2 == 0)
+			digit <<= 4;
+		score[i/2] |= digit;
+	}
+	return 1;
+}
+
+/*
+ * Format routine installed for %V.  A nil score prints as "*",
+ * a score that maps to a local address as that address,
+ * anything else as the hex bytes of the score.
+ */
+int
+scoreFmt(Fmt *f)
+{
+	uchar *score;
+	u32int a;
+	int i;
+
+	score = va_arg(f->args, uchar*);
+	if(score == nil){
+		fmtprint(f, "*");
+		return 0;
+	}
+
+	a = globalToLocal(score);
+	if(a != NilBlock){
+		fmtprint(f, "0x%.8ux", a);
+		return 0;
+	}
+
+	for(i = 0; i < VtScoreSize; i++)
+		fmtprint(f, "%2.2ux", score[i]);
+	return 0;
+}
+
+/*
+ * Set up the formats and the venti session, then build the tree
+ * for arg: "vac:<score>" shows a vac archive read from venti,
+ * anything else is opened as a fossil file.
+ * Failure to reach venti is only a warning; z is left nil.
+ * Returns nil if a vac score cannot be parsed.
+ */
+Atree*
+atreeinit(char *arg)
+{
+	Atree *a;
+	uchar score[VtScoreSize];
+
+	vtAttach();
+
+	fmtinstall('V', scoreFmt);
+	fmtinstall('R', vtErrFmt);
+
+	z = vtDial(nil, 1);
+	if(z == nil)
+		fprint(2, "warning: cannot dial venti: %R\n");
+	else if(!vtConnect(z, 0)){
+		/* only try to connect when the dial produced a session */
+		fprint(2, "warning: cannot connect to venti: %R\n");
+		z = nil;
+	}
+	a = mallocz(sizeof(Atree), 1);
+	if(strncmp(arg, "vac:", 4) == 0){
+		if(!parseScore(score, arg+4, strlen(arg+4))){
+			fprint(2, "cannot parse score\n");
+			free(a);
+			return nil;
+		}
+		a->root = initxvacroot(score);
+	}else
+		a->root = initxcache(arg);
+	a->resizefd = -1;
+	return a;
+}
+
+/* --- tree.c */
+/* pixel dimensions used when drawing the tree */
+enum
+{
+	Nubwidth = 11,		/* width of the expand/collapse box */
+	Nubheight = 11,		/* height of the expand/collapse box */
+	Linewidth = Nubwidth*2+4,	/* used to compute how far children are indented */
+};
+
+/*
+ * Draw the (possibly multi-line) string s on m at o, one text line
+ * per font height, clipped to clipr's clip rectangle.
+ * A nil s is drawn as "???".  Returns the total height drawn.
+ */
+uint
+drawtext(char *s, Image *m, Image *clipr, Point o)
+{
+	char *line, *next, *end;
+	uint dy;
+
+	if(s == nil)
+		s = "???";
+
+	dy = 0;
+	line = s;
+	while(line != nil && *line != 0){
+		/* find where this line ends and where the next one starts */
+		next = strchr(line, '\n');
+		if(next != nil){
+			end = next;
+			next++;
+		}else
+			end = line+strlen(line);
+
+		_string(m, Pt(o.x, o.y+dy), display->black, ZP, display->defaultfont,
+			line, nil, end-line, clipr->clipr, nil, ZP, SoverD);
+		dy += display->defaultfont->height;
+		line = next;
+	}
+	return dy;
+}
+
+/*
+ * Draw the expand/collapse box for node t at o: a square outline
+ * with a horizontal bar, plus a vertical bar (making a plus sign)
+ * when the node is not expanded.  Nodes with no children, or with
+ * unknown children and no expand function, get no box.
+ */
+void
+drawnub(Image *m, Image *clipr, Point o, Tnode *t)
+{
+	/* NOTE(review): the clipr argument is discarded here, so the draws below are unmasked -- confirm this is intended */
+	clipr = nil;
+
+	if(t->nkid == 0)
+		return;
+	if(t->nkid == -1 && t->expand == nil)
+		return;
+
+	/* center the box vertically on the first line of text */
+	o.y += (display->defaultfont->height-Nubheight)/2;
+	/* outline: left, top, right, bottom edges */
+	draw(m, rectaddpt(Rect(0,0,1,Nubheight), o), display->black, clipr, ZP);
+	draw(m, rectaddpt(Rect(0,0,Nubwidth,1), o), display->black, clipr, o);
+	draw(m, rectaddpt(Rect(Nubwidth-1,0,Nubwidth,Nubheight), o), 
+		display->black, clipr, addpt(o, Pt(Nubwidth-1, 0)));
+	draw(m, rectaddpt(Rect(0, Nubheight-1, Nubwidth, Nubheight), o),
+		display->black, clipr, addpt(o, Pt(0, Nubheight-1)));
+
+	/* horizontal bar through the middle */
+	draw(m, rectaddpt(Rect(0, Nubheight/2, Nubwidth, Nubheight/2+1), o),
+		display->black, clipr, addpt(o, Pt(0, Nubheight/2)));
+	/* vertical bar only while collapsed */
+	if(!t->expanded)
+		draw(m, rectaddpt(Rect(Nubwidth/2, 0, Nubwidth/2+1, Nubheight), o),
+			display->black, clipr, addpt(o, Pt(Nubwidth/2, 0)));
+
+}
+
+/*
+ * Draw node t and, if it is expanded, its children below it.
+ * Records the node's position in t->offset for findnode and
+ * generates the children on first expansion.
+ * Returns the height used by the node and everything under it.
+ */
+uint
+drawnode(Tnode *t, Image *m, Image *clipr, Point o)
+{
+	int i;
+	uint dy;
+	Point at;
+
+	if(t == nil)
+		return 0;
+
+	t->offset = o;
+
+	/* the label goes to the right of the nub */
+	at = Pt(o.x+Nubwidth+2, o.y);
+	dy = drawtext(t->str ? t->str : "???", m, clipr, at);
+
+	if(t->expanded){
+		if(t->nkid == -1 && t->expand)
+			(*t->expand)(t);
+		/* children are indented and stacked under the label */
+		at = Pt(o.x+Nubwidth+(Linewidth-Nubwidth)/2, o.y+dy);
+		for(i=0; i<t->nkid; i++)
+			at.y += drawnode(t->kid[i], m, clipr, at);
+		dy = at.y - o.y;
+	}
+	drawnub(m, clipr, o, t);
+	return dy;
+}
+
+/* clear rectangle r of m to white and draw the whole tree into it */
+void
+drawtree(Tree *t, Image *m, Rectangle r)
+{
+	Point origin;
+
+	draw(m, r, display->white, nil, ZP);
+	replclipr(t->clipr, 1, r);
+
+	/* the tree is drawn at its scroll offset from the corner of r */
+	origin = addpt(t->offset, r.min);
+	drawnode(t->root, m, t->clipr, origin);
+}
+
+/*
+ * Return the node under t whose nub box contains p, or nil.
+ * Children are searched only if their parent is expanded.
+ */
+Tnode*
+findnode(Tnode *t, Point p)
+{
+	int i;
+	Tnode *hit;
+	Rectangle nub;
+
+	nub = rectaddpt(Rect(0,0,Nubwidth, Nubheight), t->offset);
+	if(ptinrect(p, nub))
+		return t;
+
+	if(t->expanded){
+		for(i=0; i<t->nkid; i++){
+			hit = findnode(t->kid[i], p);
+			if(hit != nil)
+				return hit;
+		}
+	}
+	return nil;
+}
+
+/*
+ * Print the usage message and exit.  The argument is either a
+ * fossil file or "vac:" followed by a score; -a also shows
+ * inactive entries.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: vtree [-a] /dev/sdC0/fossil | vac:score\n");
+	exits("usage");
+}
+
+/* the tree shown in the window */
+Tree t;
+
+/*
+ * Redraw the tree over the whole window.  When new is set
+ * the window has been resized and is reattached first.
+ */
+void
+eresized(int new)
+{
+	if(new && getwindow(display, Refnone) < 0)
+		fprint(2, "can't reattach to window: %r\n");
+	drawtree(&t, screen, screen->r);
+}
+
+/* mouse button bits as they appear in Mouse.buttons */
+enum
+{
+	Left = 1<<0,
+	Middle = 1<<1,
+	Right = 1<<2,
+
+	MMenu = 2,	/* button number passed to emenuhit */
+};
+
+/* the menu: a single "exit" item */
+char *items[] = { "exit", 0 };
+enum { IExit, };	/* indices into items */
+
+Menu menu;
+
+/*
+ * Build the tree for the argument, open the window and run the
+ * event loop: the left button drags the tree, the middle button
+ * brings up the menu, the right button toggles the node whose
+ * nub is under the mouse.
+ */
+void
+main(int argc, char **argv)
+{
+	int n;
+	char *dir;
+	Event e;
+	Point op, p;
+	Tnode *tn;
+	Mouse m;
+	int Eready;
+	Atree *fs;
+
+	ARGBEGIN{
+	case 'a':
+		showinactive = 1;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	switch(argc){
+	default:
+		usage();
+	case 1:
+		dir = argv[0];
+		break;
+	}
+
+	/* atreeinit returns nil (after printing why) on a bad vac score */
+	fs = atreeinit(dir);
+	if(fs == nil)
+		sysfatal("cannot initialize tree for %s", dir);
+	if(initdraw(0, "/lib/font/bit/lucidasans/unicode.8.font", "tree") < 0)
+		sysfatal("initdraw: %r");
+	t.root = fs->root;
+	t.offset = ZP;
+	t.clipr = allocimage(display, Rect(0,0,1,1), GREY1, 1, DOpaque);
+	if(t.clipr == nil)
+		sysfatal("allocimage: %r");
+
+	eresized(0);
+	flushimage(display, 1);
+
+	einit(Emouse);
+
+	menu.item = items;
+	menu.gen = 0;
+	menu.lasthit = 0;
+	/* optionally listen on an extra fd that triggers a redraw */
+	if(fs->resizefd > 0){
+		Eready = 1<<3;
+		estart(Eready, fs->resizefd, 1);
+	}else
+		Eready = 0;
+
+	for(;;){
+		switch(n=eread(Emouse|Eready, &e)){
+		default:
+			if(Eready && n==Eready)
+				eresized(0);
+			break;
+		case Emouse:
+			m = e.mouse;
+			switch(m.buttons){
+			case Left:
+				/* drag: follow the mouse while only the left button is down */
+				op = t.offset;
+				p = m.xy;
+				do {
+					t.offset = addpt(t.offset, subpt(m.xy, p));
+					p = m.xy;
+					eresized(0);
+					m = emouse();
+				}while(m.buttons == Left);
+				/* another button was pressed: cancel the drag */
+				if(m.buttons){
+					t.offset = op;
+					eresized(0);
+				}
+				break;
+			case Middle:
+				n = emenuhit(MMenu, &m, &menu);
+				if(n == -1)
+					break;
+				switch(n){
+				case IExit:
+					exits(nil);
+				}
+				break;
+			case Right:
+				/* act on release, unless another button was pressed meanwhile */
+				do
+					m = emouse();
+				while(m.buttons == Right);
+				if(m.buttons)
+					break;
+				tn = findnode(t.root, m.xy);
+				if(tn){
+					tn->expanded = !tn->expanded;
+					eresized(0);
+				}
+				break;
+			}
+		}
+	}
+}

+ 65 - 0
sys/src/cmd/fossil/walk.c

@@ -0,0 +1,65 @@
+/*
+ * Generic traversal routines.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* block type of the top block of the tree described by entry e */
+static uint
+etype(Entry *e)
+{
+	uint base;
+
+	base = (e->flags&VtEntryDir) ? BtDir : BtData;
+	return base+e->depth;
+}
+
+/*
+ * Prepare w for walking the pointers held in block b, of which
+ * size bytes are in use.  Data blocks hold nothing to walk,
+ * directory blocks hold entries, and every other type is
+ * treated as a pointer block holding scores.
+ */
+void
+initWalk(WalkPtr *w, Block *b, uint size)
+{
+	memset(w, 0, sizeof *w);
+
+	if(b->l.type == BtData)
+		return;
+
+	w->data = b->data;
+	if(b->l.type == BtDir){
+		w->m = size / VtEntrySize;
+		w->isEntry = 1;
+		return;
+	}
+
+	/* pointer block: remember type and tag for the children */
+	w->m = size / VtScoreSize;
+	w->type = b->l.type;
+	w->tag = b->l.tag;
+}
+
+/*
+ * Step the walk: store the score, type and tag of the next
+ * child, and its Entry (or nil when walking a pointer block).
+ * Returns 0 when there are no more children, 1 otherwise.
+ * The Entry returned in *e is stored inside w and is
+ * overwritten by the next call.
+ */
+int
+nextWalk(WalkPtr *w, uchar score[VtScoreSize], uchar *type, u32int *tag, Entry **e)
+{
+	if(w->n >= w->m)
+		return 0;
+
+	if(!w->isEntry){
+		/* pointer block: child is one level down, same tag */
+		*e = nil;
+		memmove(score, w->data+w->n*VtScoreSize, VtScoreSize);
+		*type = w->type-1;
+		*tag = w->tag;
+	}else{
+		*e = &w->e;
+		entryUnpack(&w->e, w->data, w->n);
+		memmove(score, w->e.score, VtScoreSize);
+		*type = etype(&w->e);
+		*tag = w->e.tag;
+	}
+	w->n++;
+	return 1;
+}
+