#!/usr/bin/env python3
# Note:
# - Hardlinks are copied
# - The size of symlinks and directories is meaningless, it depends on whatever
#   the filesystem/tar file reports

import argparse
import hashlib
import itertools
import json
import logging
import os
import stat
import sys
import tarfile
  15. VERSION = 3
  16. IDX_NAME = 0
  17. IDX_SIZE = 1
  18. IDX_MTIME = 2
  19. IDX_MODE = 3
  20. IDX_UID = 4
  21. IDX_GID = 5
  22. # target for symbolic links
  23. # child nodes for directories
  24. # filename for files
  25. IDX_TARGET = 6
  26. IDX_FILENAME = 6
  27. HASH_LENGTH = 8
  28. S_IFLNK = 0xA000
  29. S_IFREG = 0x8000
  30. S_IFDIR = 0x4000
  31. def hash_file(filename) -> str:
  32. with open(filename, "rb", buffering=0) as f:
  33. return hash_fileobj(f)
  34. def hash_fileobj(f) -> str:
  35. h = hashlib.sha256()
  36. for b in iter(lambda: f.read(128*1024), b""):
  37. h.update(b)
  38. return h.hexdigest()
  39. def main():
  40. logging.basicConfig(format="%(message)s")
  41. logger = logging.getLogger("fs2json")
  42. logger.setLevel(logging.DEBUG)
  43. args = argparse.ArgumentParser(description="Create filesystem JSON. Example:\n"
  44. " ./fs2xml.py --exclude /boot/ --out fs.json /mnt/",
  45. formatter_class=argparse.RawTextHelpFormatter
  46. )
  47. args.add_argument("--exclude",
  48. action="append",
  49. metavar="path",
  50. help="Path to exclude (relative to base path). Can be specified multiple times.")
  51. args.add_argument("--out",
  52. metavar="out",
  53. nargs="?",
  54. type=argparse.FileType("w"),
  55. help="File to write to (defaults to stdout)",
  56. default=sys.stdout)
  57. args.add_argument("path",
  58. metavar="path-or-tar",
  59. help="Base path or tar file to include in JSON")
  60. args = args.parse_args()
  61. path = os.path.normpath(args.path)
  62. try:
  63. tar = tarfile.open(path, "r")
  64. except IsADirectoryError:
  65. tar = None
  66. if tar:
  67. (root, total_size) = handle_tar(logger, tar)
  68. else:
  69. (root, total_size) = handle_dir(logger, path, args.exclude)
  70. if False:
  71. # normalize the order of children, useful to debug differences between
  72. # the tar and filesystem reader
  73. def sort_children(children):
  74. for c in children:
  75. if isinstance(c[IDX_TARGET], list):
  76. sort_children(c[IDX_TARGET])
  77. children.sort()
  78. sort_children(root)
  79. result = {
  80. "fsroot": root,
  81. "version": VERSION,
  82. "size": total_size,
  83. }
  84. logger.info("Creating json ...")
  85. json.dump(result, args.out, check_circular=False, separators=(',', ':'))
  86. def handle_dir(logger, path, exclude):
  87. path = path + "/"
  88. exclude = exclude or []
  89. exclude = [os.path.join("/", os.path.normpath(p)) for p in exclude]
  90. exclude = set(exclude)
  91. def onerror(oserror):
  92. logger.warning(oserror)
  93. rootdepth = path.count("/")
  94. files = os.walk(path, onerror=onerror)
  95. prevpath = []
  96. mainroot = []
  97. filename_to_hash = {}
  98. total_size = 0
  99. rootstack = [mainroot]
  100. def make_node(st, name):
  101. obj = [None] * 7
  102. obj[IDX_NAME] = name
  103. obj[IDX_SIZE] = st.st_size
  104. obj[IDX_MTIME] = int(st.st_mtime)
  105. obj[IDX_MODE] = int(st.st_mode)
  106. obj[IDX_UID] = st.st_uid
  107. obj[IDX_GID] = st.st_gid
  108. nonlocal total_size
  109. total_size += st.st_size
  110. # Missing:
  111. # int(st.st_atime),
  112. # int(st.st_ctime),
  113. return obj
  114. logger.info("Creating file tree ...")
  115. for f in files:
  116. dirpath, dirnames, filenames = f
  117. pathparts = dirpath.split("/")
  118. pathparts = pathparts[rootdepth:]
  119. fullpath = os.path.join("/", *pathparts)
  120. if fullpath in exclude:
  121. dirnames[:] = []
  122. continue
  123. depth = 0
  124. for this, prev in zip(pathparts, prevpath):
  125. if this != prev:
  126. break
  127. depth += 1
  128. for _name in prevpath[depth:]:
  129. rootstack.pop()
  130. oldroot = rootstack[-1]
  131. assert len(pathparts[depth:]) == 1
  132. openname = pathparts[-1]
  133. if openname == "":
  134. root = mainroot
  135. else:
  136. root = []
  137. st = os.stat(dirpath)
  138. rootobj = make_node(st, openname)
  139. rootobj[IDX_TARGET] = root
  140. oldroot.append(rootobj)
  141. rootstack.append(root)
  142. for filename in itertools.chain(filenames, dirnames):
  143. absname = os.path.join(dirpath, filename)
  144. st = os.lstat(absname)
  145. isdir = stat.S_ISDIR(st.st_mode)
  146. islink = stat.S_ISLNK(st.st_mode)
  147. isfile = stat.S_ISREG(st.st_mode)
  148. if isdir and not islink:
  149. continue
  150. obj = make_node(st, filename)
  151. if islink:
  152. target = os.readlink(absname)
  153. obj[IDX_TARGET] = target
  154. elif isfile:
  155. file_hash = hash_file(absname)
  156. filename = file_hash[0:HASH_LENGTH] + ".bin"
  157. existing = filename_to_hash.get(filename)
  158. assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
  159. filename_to_hash[filename] = file_hash
  160. obj[IDX_FILENAME] = filename
  161. while obj[-1] is None:
  162. obj.pop()
  163. root.append(obj)
  164. prevpath = pathparts
  165. return (mainroot, total_size)
  166. def handle_tar(logger, tar):
  167. mainroot = []
  168. filename_to_hash = {}
  169. total_size = 0
  170. for member in tar.getmembers():
  171. parts = member.name.split("/")
  172. name = parts.pop()
  173. dir = mainroot
  174. for p in parts:
  175. for c in dir:
  176. if c[IDX_NAME] == p:
  177. dir = c[IDX_TARGET]
  178. obj = [None] * 7
  179. obj[IDX_NAME] = name
  180. obj[IDX_SIZE] = member.size
  181. obj[IDX_MTIME] = member.mtime
  182. obj[IDX_MODE] = member.mode
  183. obj[IDX_UID] = member.uid
  184. obj[IDX_GID] = member.gid
  185. if member.isfile() or member.islnk():
  186. obj[IDX_MODE] |= S_IFREG
  187. f = tar.extractfile(member)
  188. file_hash = hash_fileobj(f)
  189. filename = file_hash[0:HASH_LENGTH] + ".bin"
  190. existing = filename_to_hash.get(filename)
  191. assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
  192. filename_to_hash[filename] = file_hash
  193. obj[IDX_FILENAME] = filename
  194. if member.islnk():
  195. # fix size for hard links
  196. f.seek(0, os.SEEK_END)
  197. obj[IDX_SIZE] = int(f.tell())
  198. elif member.isdir():
  199. obj[IDX_MODE] |= S_IFDIR
  200. obj[IDX_TARGET] = []
  201. elif member.issym():
  202. obj[IDX_MODE] |= S_IFLNK
  203. obj[IDX_TARGET] = member.linkname
  204. else:
  205. logger.error("Unsupported type: {} ({})".format(member.type, name))
  206. total_size += obj[IDX_SIZE]
  207. while obj[-1] is None:
  208. obj.pop()
  209. dir.append(obj)
  210. return mainroot, total_size
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()