fs2json.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. #!/usr/bin/env python3
# Notes:
# - Hardlinks are copied: each link is stored as a separate regular file.
# - The size of symlinks and directories is meaningless; it depends on whatever
#   the filesystem/tar file reports.
  6. import argparse
  7. import json
  8. import os
  9. import stat
  10. import sys
  11. import itertools
  12. import logging
  13. import hashlib
  14. import tarfile
  15. VERSION = 3
  16. IDX_NAME = 0
  17. IDX_SIZE = 1
  18. IDX_MTIME = 2
  19. IDX_MODE = 3
  20. IDX_UID = 4
  21. IDX_GID = 5
  22. # target for symbolic links
  23. # child nodes for directories
  24. # sha256 for files
  25. IDX_TARGET = 6
  26. IDX_SHA256 = 6
  27. S_IFLNK = 0xA000
  28. S_IFREG = 0x8000
  29. S_IFDIR = 0x4000
  30. def hash_file(filename):
  31. with open(filename, "rb", buffering=0) as f:
  32. return hash_fileobj(f)
  33. def hash_fileobj(f):
  34. h = hashlib.sha256()
  35. for b in iter(lambda: f.read(128*1024), b""):
  36. h.update(b)
  37. return h.hexdigest()
  38. def main():
  39. logging.basicConfig(format="%(message)s")
  40. logger = logging.getLogger("fs2json")
  41. logger.setLevel(logging.DEBUG)
  42. args = argparse.ArgumentParser(description="Create filesystem JSON. Example:\n"
  43. " ./fs2xml.py --exclude /boot/ --out fs.json /mnt/",
  44. formatter_class=argparse.RawTextHelpFormatter
  45. )
  46. args.add_argument("--exclude",
  47. action="append",
  48. metavar="path",
  49. help="Path to exclude (relative to base path). Can be specified multiple times.")
  50. args.add_argument("--out",
  51. metavar="out",
  52. nargs="?",
  53. type=argparse.FileType("w"),
  54. help="File to write to (defaults to stdout)",
  55. default=sys.stdout)
  56. args.add_argument("path",
  57. metavar="path-or-tar",
  58. help="Base path or tar file to include in JSON")
  59. args = args.parse_args()
  60. path = os.path.normpath(args.path)
  61. try:
  62. tar = tarfile.open(path, "r")
  63. except IsADirectoryError:
  64. tar = None
  65. if tar:
  66. (root, total_size) = handle_tar(logger, tar)
  67. else:
  68. (root, total_size) = handle_dir(logger, path, args.exclude)
  69. if False:
  70. # normalize the order of children, useful to debug differences between
  71. # the tar and filesystem reader
  72. def sort_children(children):
  73. for c in children:
  74. if isinstance(c[IDX_TARGET], list):
  75. sort_children(c[IDX_TARGET])
  76. children.sort()
  77. sort_children(root)
  78. result = {
  79. "fsroot": root,
  80. "version": VERSION,
  81. "size": total_size,
  82. }
  83. logger.info("Creating json ...")
  84. json.dump(result, args.out, check_circular=False, separators=(',', ':'))
  85. def handle_dir(logger, path, exclude):
  86. path = path + "/"
  87. exclude = exclude or []
  88. exclude = [os.path.join("/", os.path.normpath(p)) for p in exclude]
  89. exclude = set(exclude)
  90. def onerror(oserror):
  91. logger.warning(oserror)
  92. rootdepth = path.count("/")
  93. files = os.walk(path, onerror=onerror)
  94. prevpath = []
  95. mainroot = []
  96. total_size = 0
  97. rootstack = [mainroot]
  98. def make_node(st, name):
  99. obj = [None] * 7
  100. obj[IDX_NAME] = name
  101. obj[IDX_SIZE] = st.st_size
  102. obj[IDX_MTIME] = int(st.st_mtime)
  103. obj[IDX_MODE] = int(st.st_mode)
  104. obj[IDX_UID] = st.st_uid
  105. obj[IDX_GID] = st.st_gid
  106. nonlocal total_size
  107. total_size += st.st_size
  108. # Missing:
  109. # int(st.st_atime),
  110. # int(st.st_ctime),
  111. return obj
  112. logger.info("Creating file tree ...")
  113. for f in files:
  114. dirpath, dirnames, filenames = f
  115. pathparts = dirpath.split("/")
  116. pathparts = pathparts[rootdepth:]
  117. fullpath = os.path.join("/", *pathparts)
  118. if fullpath in exclude:
  119. dirnames[:] = []
  120. continue
  121. depth = 0
  122. for this, prev in zip(pathparts, prevpath):
  123. if this != prev:
  124. break
  125. depth += 1
  126. for _name in prevpath[depth:]:
  127. rootstack.pop()
  128. oldroot = rootstack[-1]
  129. assert len(pathparts[depth:]) == 1
  130. openname = pathparts[-1]
  131. if openname == "":
  132. root = mainroot
  133. else:
  134. root = []
  135. st = os.stat(dirpath)
  136. rootobj = make_node(st, openname)
  137. rootobj[IDX_TARGET] = root
  138. oldroot.append(rootobj)
  139. rootstack.append(root)
  140. for filename in itertools.chain(filenames, dirnames):
  141. absname = os.path.join(dirpath, filename)
  142. st = os.lstat(absname)
  143. isdir = stat.S_ISDIR(st.st_mode)
  144. islink = stat.S_ISLNK(st.st_mode)
  145. isfile = stat.S_ISREG(st.st_mode)
  146. if isdir and not islink:
  147. continue
  148. obj = make_node(st, filename)
  149. if islink:
  150. target = os.readlink(absname)
  151. obj[IDX_TARGET] = target
  152. elif isfile:
  153. obj[IDX_SHA256] = hash_file(absname)
  154. while obj[-1] is None:
  155. obj.pop()
  156. root.append(obj)
  157. prevpath = pathparts
  158. return (mainroot, total_size)
  159. def handle_tar(logger, tar):
  160. mainroot = []
  161. total_size = 0
  162. for member in tar.getmembers():
  163. parts = member.name.split("/")
  164. name = parts.pop()
  165. dir = mainroot
  166. for p in parts:
  167. for c in dir:
  168. if c[IDX_NAME] == p:
  169. dir = c[IDX_TARGET]
  170. obj = [None] * 7
  171. obj[IDX_NAME] = name
  172. obj[IDX_SIZE] = member.size
  173. obj[IDX_MTIME] = member.mtime
  174. obj[IDX_MODE] = member.mode
  175. obj[IDX_UID] = member.uid
  176. obj[IDX_GID] = member.gid
  177. if member.isfile() or member.islnk():
  178. obj[IDX_MODE] |= S_IFREG
  179. f = tar.extractfile(member)
  180. obj[IDX_SHA256] = hash_fileobj(f)
  181. if member.islnk():
  182. # fix size for hard links
  183. f.seek(0, os.SEEK_END)
  184. obj[IDX_SIZE] = int(f.tell())
  185. elif member.isdir():
  186. obj[IDX_MODE] |= S_IFDIR
  187. obj[IDX_TARGET] = []
  188. elif member.issym():
  189. obj[IDX_MODE] |= S_IFLNK
  190. obj[IDX_TARGET] = member.linkname
  191. else:
  192. logger.error("Unsupported type: {} ({})".format(member.type, name))
  193. total_size += obj[IDX_SIZE]
  194. while obj[-1] is None:
  195. obj.pop()
  196. dir.append(obj)
  197. return mainroot, total_size
  198. if __name__ == "__main__":
  199. main()