extract BlockHasher and TreeHasher classes
zfs_autobackup/BlockHasher.py (new file, 45 lines)
@ -0,0 +1,45 @@
import hashlib


class BlockHasher():
    """This class was created to checksum huge files and block devices (TBs).
    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.

    The chunk size is count*bs (bs is the block size used when reading from disk).

    It is also possible to read only a certain percentage of the blocks, to check just a sample.
    """
    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
        self.count=count
        self.bs=bs
        self.hash_class=hash_class


    def generate(self, fname):
        """Generates checksums

        yields (chunk_nr, hexdigest)

        yields nothing for empty files.
        """
        with open(fname, "rb") as f:
            hash = self.hash_class()
            block_nr = 0
            chunk_nr = 0
            for block in iter(lambda: f.read(self.bs), b""):
                hash.update(block)
                block_nr = block_nr + 1
                if block_nr % self.count == 0:
                    yield (chunk_nr, hash.hexdigest())
                    chunk_nr = chunk_nr + 1
                    hash = self.hash_class()

            # yield last (incomplete) chunk
            if block_nr % self.count != 0:
                yield (chunk_nr, hash.hexdigest())

        # def compare(fname, generator):
        #     """reads from generator and compares blocks"""
        #
        #     with open(fname, "rb") as f:
        #         for (count, bs, chunk_nr, hexdigest) in input_generator:
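A minimal usage sketch of the new class (the file path is a placeholder; note that the percentage-based sampling mentioned in the docstring is not yet implemented in the code shown here):

    from zfs_autobackup.BlockHasher import BlockHasher

    # With the defaults (count=10000, bs=4096) each chunk covers
    # 10000 * 4096 = 40,960,000 bytes, so roughly one hash per 40 MB of data.
    hasher = BlockHasher()
    for chunk_nr, hexdigest in hasher.generate("/tmp/example.img"):
        print("{}\t{}".format(chunk_nr, hexdigest))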
zfs_autobackup/TreeHasher.py (new file, 33 lines)

@ -0,0 +1,33 @@
import os


class TreeHasher():
    """Uses BlockHasher recursively on a directory tree"""

    def __init__(self, block_hasher):
        self.block_hasher=block_hasher

    def generate(self, start_path):
        """Use BlockHasher on every file in a tree, yielding the results

        Note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
        It also ignores empty directories, symlinks and special files.
        """

        cwd=os.getcwd()
        os.chdir(start_path)

        def walkerror(e):
            raise e

        try:
            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
                for f in filenames:
                    file_path=os.path.join(dirpath, f)[2:]

                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
                            yield ( file_path, chunk_nr, hash )
        finally:
            os.chdir(cwd)

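A matching sketch for TreeHasher (the directory path is a placeholder):

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))
    for file_path, chunk_nr, hexdigest in tree_hasher.generate("/some/directory"):
        # file_path is relative to the start_path passed to generate();
        # the [2:] slice in generate() strips the leading "./" from os.walk(".")
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))

One consequence of the os.chdir() approach: the yielded paths are relative, but the generator mutates the process-wide working directory while it runs, so it should not be interleaved with other code that depends on the current directory.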
zfs_autobackup/ZfsCheck.py

@ -1,8 +1,10 @@
from __future__ import print_function

import time
from signal import signal, SIGPIPE


from .TreeHasher import TreeHasher
from .BlockHasher import BlockHasher
from .ZfsNode import ZfsNode
from .util import *
from .CliBase import CliBase
@ -62,9 +64,11 @@ class ZfsCheck(CliBase):

            snapshot.mount(mnt)

            tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))

            self.debug("Hashing tree: {}".format(mnt))
            if not self.args.test:
                for (file, block, hash) in block_hash_tree(mnt, count, bs):
                for (file, block, hash) in tree_hasher.generate(mnt):
                    print("{}\t{}\t{}".format(file, block, hash))
                    sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect

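The flush after every line matters because a broken ssh pipe only surfaces on the next write; flushing per line makes the process notice a disconnected reader promptly. Given the `from signal import signal, SIGPIPE` import above, a common way to let SIGPIPE terminate the process quietly is the standard pattern below (a sketch of that general pattern, not necessarily this project's exact handler):

    from signal import signal, SIGPIPE, SIG_DFL

    # Python ignores SIGPIPE by default (writes raise BrokenPipeError instead);
    # restoring the default disposition makes the process exit silently when
    # its stdout pipe closes, e.g. on ssh disconnect.
    signal(SIGPIPE, SIG_DFL)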
@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
    def hash_volume(self, snapshot, count, bs):
        try:
            dev=self.activate_volume_snapshot(snapshot)
            block_hasher=BlockHasher(count=count, bs=bs)

            self.debug("Hashing dev: {}".format(dev))
            if not self.args.test:
                for (block, hash) in block_hash(dev, count, bs):
                for (block, hash) in block_hasher.generate(dev):
                    print("{}\t{}".format(block, hash))
                    sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect


        finally:
            self.deacitvate_volume_snapshot(snapshot)


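The commented-out compare() stub at the end of BlockHasher.py hints at the intended verification step. A hypothetical sketch of how it might be fleshed out against the generate() output (the function name comes from the stub; the parameters and logic are assumptions, not the author's final API):

    from zfs_autobackup.BlockHasher import BlockHasher

    def compare(fname, input_generator, count=10000, bs=4096):
        # Re-hash the local file and index its chunk hashes by chunk number.
        local = dict(BlockHasher(count=count, bs=bs).generate(fname))
        # Yield the chunk numbers whose hash differs from the expected stream.
        for chunk_nr, hexdigest in input_generator:
            if local.get(chunk_nr) != hexdigest:
                yield chunk_nr

Holding all local hashes in memory is cheap relative to the data being checked: with the defaults, that is one roughly 40-byte hex digest per ~40 MB chunk.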
zfs_autobackup/util.py

@ -1,5 +1,3 @@
import hashlib

# root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
# -rw-rw-r-- 1 psy psy 990M Nov 26  2020 /home/psy/Downloads/carimage.zip
# root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@ -18,60 +16,6 @@ import hashlib
import os
import platform
import sys
import time



def block_hash(fname, count=10000, bs=4096):
    """This function was created to checksum huge files and blockdevices (TB's)
    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.

    yields sha1 hash of fname,  per count blocks.
    yields(chunk_nr, hexdigest)

    yields nothing for empty files.

    """

    with open(fname, "rb") as f:
        hash = hashlib.sha1()
        block_nr = 0
        chunk_nr = 0
        for block in iter(lambda: f.read(bs), b""):
            hash.update(block)
            block_nr = block_nr + 1
            if block_nr % count == 0:
                yield (chunk_nr, hash.hexdigest())
                chunk_nr = chunk_nr + 1
                hash = hashlib.sha1()

        # yield last (incomplete) block
        if block_nr % count != 0:
            yield (chunk_nr, hash.hexdigest())

def block_hash_tree(start_path, count=10000, bs=4096):
    """block_hash every file in a tree, yielding the results

    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
    It also ignores empty directories, symlinks and special files.
    """

    cwd=os.getcwd()
    os.chdir(start_path)

    def walkerror(e):
        raise e

    try:
        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
            for f in filenames:
                file_path=os.path.join(dirpath, f)[2:]

                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
                        yield ( file_path, chunk_nr, hash )
    finally:
        os.chdir(cwd)


def tmp_name(suffix=""):
