Browse Source

Fix fs2json.py and copy-to-sha256.py for the new shorter hash

Fabian 2 years ago
parent
commit
0247e75674
2 changed files with 32 additions and 16 deletions
  1. + 12 - 10
      tools/copy-to-sha256.py
  2. + 20 - 6
      tools/fs2json.py

+ 12 - 10
tools/copy-to-sha256.py

@@ -8,12 +8,13 @@ import hashlib
 import shutil
 import tarfile
 
+HASH_LENGTH = 8
 
-def hash_file(filename):
+def hash_file(filename) -> str:
     with open(filename, "rb", buffering=0) as f:
         return hash_fileobj(f)
 
-def hash_fileobj(f):
+def hash_fileobj(f) -> str:
     h = hashlib.sha256()
     for b in iter(lambda: f.read(128*1024), b""):
         h.update(b)
@@ -42,9 +43,9 @@ def main():
     if tar:
         handle_tar(logger, tar, to_path)
     else:
-        handle_dir(logger, path, to_path)
+        handle_dir(logger, from_path, to_path)
 
-def handle_dir(logger, from_path, to_path):
+def handle_dir(logger, from_path: str, to_path: str):
     def onerror(oserror):
         logger.warning(oserror)
 
@@ -62,8 +63,9 @@ def handle_dir(logger, from_path, to_path):
             if stat.S_ISLNK(mode) or stat.S_ISCHR(mode) or stat.S_ISBLK(mode) or stat.S_ISFIFO(mode) or stat.S_ISSOCK(mode):
                 continue
 
-            sha256 = hash_file(absname)
-            to_abs = os.path.join(to_path, sha256)
+            file_hash = hash_file(absname)
+            filename = file_hash[0:HASH_LENGTH] + ".bin"
+            to_abs = os.path.join(to_path, filename)
 
             if os.path.exists(to_abs):
                 logger.info("Exists, skipped {} ({})".format(to_abs, absname))
@@ -71,13 +73,13 @@ def handle_dir(logger, from_path, to_path):
                 logger.info("cp {} {}".format(absname, to_abs))
                 shutil.copyfile(absname, to_abs)
 
-def handle_tar(logger, tar, to_path):
+def handle_tar(logger, tar, to_path: str):
     for member in tar.getmembers():
         if member.isfile() or member.islnk():
             f = tar.extractfile(member)
-            sha256 = hash_fileobj(f)
-
-            to_abs = os.path.join(to_path, sha256)
+            file_hash = hash_fileobj(f)
+            filename = file_hash[0:HASH_LENGTH] + ".bin"
+            to_abs = os.path.join(to_path, filename)
 
             if os.path.exists(to_abs):
                 logger.info("Exists, skipped {} ({})".format(to_abs, member.name))

+ 20 - 6
tools/fs2json.py

@@ -26,19 +26,21 @@ IDX_GID = 5
 
 # target for symbolic links
 # child nodes for directories
-# sha256 for files
+# filename for files
 IDX_TARGET = 6
-IDX_SHA256 = 6
+IDX_FILENAME = 6
+
+HASH_LENGTH = 8
 
 S_IFLNK = 0xA000
 S_IFREG = 0x8000
 S_IFDIR = 0x4000
 
-def hash_file(filename):
+def hash_file(filename) -> str:
     with open(filename, "rb", buffering=0) as f:
         return hash_fileobj(f)
 
-def hash_fileobj(f):
+def hash_fileobj(f) -> str:
     h = hashlib.sha256()
     for b in iter(lambda: f.read(128*1024), b""):
         h.update(b)
@@ -115,6 +117,7 @@ def handle_dir(logger, path, exclude):
     prevpath = []
 
     mainroot = []
+    filename_to_hash = {}
     total_size = 0
     rootstack = [mainroot]
 
@@ -193,7 +196,12 @@ def handle_dir(logger, path, exclude):
                 target = os.readlink(absname)
                 obj[IDX_TARGET] = target
             elif isfile:
-                obj[IDX_SHA256] = hash_file(absname)
+                file_hash = hash_file(absname)
+                filename = file_hash[0:HASH_LENGTH] + ".bin"
+                existing = filename_to_hash.get(filename)
+                assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
+                filename_to_hash[filename] = file_hash
+                obj[IDX_FILENAME] = filename
 
             while obj[-1] is None:
                 obj.pop()
@@ -206,6 +214,7 @@ def handle_dir(logger, path, exclude):
 
 def handle_tar(logger, tar):
     mainroot = []
+    filename_to_hash = {}
     total_size = 0
 
     for member in tar.getmembers():
@@ -230,7 +239,12 @@ def handle_tar(logger, tar):
         if member.isfile() or member.islnk():
             obj[IDX_MODE] |= S_IFREG
             f = tar.extractfile(member)
-            obj[IDX_SHA256] = hash_fileobj(f)
+            file_hash = hash_fileobj(f)
+            filename = file_hash[0:HASH_LENGTH] + ".bin"
+            existing = filename_to_hash.get(filename)
+            assert existing is None or existing == file_hash, "Collision in short hash (%s and %s)" % (existing, file_hash)
+            filename_to_hash[filename] = file_hash
+            obj[IDX_FILENAME] = filename
             if member.islnk():
                 # fix size for hard links
                 f.seek(0, os.SEEK_END)