extract BlockHasher and TreeHasher classes
zfs_autobackup/BlockHasher.py (new file, 45 lines)
@ -0,0 +1,45 @@
import hashlib


class BlockHasher():
    """This class was created to checksum huge files and block devices (TBs).
    Instead of one sha1sum of the whole file, it generates sha1sums of chunks of the file.

    The chunk size is count*bs (bs is the block size used when reading from disk).

    It is also possible to read only a certain percentage of the blocks, to check just a sample.
    """
    def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1):
        self.count=count
        self.bs=bs
        self.hash_class=hash_class


    def generate(self, fname):
        """Generates checksums

        yields (chunk_nr, hexdigest)

        yields nothing for empty files.
        """
        with open(fname, "rb") as f:
            hash = self.hash_class()
            block_nr = 0
            chunk_nr = 0
            for block in iter(lambda: f.read(self.bs), b""):
                hash.update(block)
                block_nr = block_nr + 1
                if block_nr % self.count == 0:
                    yield (chunk_nr, hash.hexdigest())
                    chunk_nr = chunk_nr + 1
                    hash = self.hash_class()

            # yield last (incomplete) chunk
            if block_nr % self.count != 0:
                yield (chunk_nr, hash.hexdigest())

        # def compare(fname, generator):
        #     """reads from generator and compares blocks"""
        #
        #     with open(fname, "rb") as f:
        #         for (count, bs, chunk_nr, hexdigest) in input_generator:
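A minimal usage sketch of the new class (the file path is a placeholder; note that the percentage-based sampling mentioned in the docstring is not yet implemented in the code shown here):

    from zfs_autobackup.BlockHasher import BlockHasher

    # With the defaults (count=10000, bs=4096) each chunk covers
    # 10000 * 4096 = 40,960,000 bytes, so roughly one hash per 40 MB of data.
    hasher = BlockHasher()
    for chunk_nr, hexdigest in hasher.generate("/tmp/example.img"):
        print("{}\t{}".format(chunk_nr, hexdigest))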
zfs_autobackup/TreeHasher.py (new file, 33 lines)

@ -0,0 +1,33 @@
import os


class TreeHasher():
    """Uses BlockHasher recursively on a directory tree"""

    def __init__(self, block_hasher):
        self.block_hasher=block_hasher

    def generate(self, start_path):
        """Use BlockHasher on every file in a tree, yielding the results

        Note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
        It also ignores empty directories, symlinks and special files.
        """

        cwd=os.getcwd()
        os.chdir(start_path)

        def walkerror(e):
            raise e

        try:
            for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
                for f in filenames:
                    file_path=os.path.join(dirpath, f)[2:]

                    if (not os.path.islink(file_path)) and os.path.isfile(file_path):
                        for (chunk_nr, hash) in self.block_hasher.generate(file_path):
                            yield ( file_path, chunk_nr, hash )
        finally:
            os.chdir(cwd)

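A matching sketch for TreeHasher (the directory path is a placeholder):

    from zfs_autobackup.BlockHasher import BlockHasher
    from zfs_autobackup.TreeHasher import TreeHasher

    tree_hasher = TreeHasher(BlockHasher(count=10000, bs=4096))
    for file_path, chunk_nr, hexdigest in tree_hasher.generate("/some/directory"):
        # file_path is relative to the start_path passed to generate();
        # the [2:] slice in generate() strips the leading "./" from os.walk(".")
        print("{}\t{}\t{}".format(file_path, chunk_nr, hexdigest))

One consequence of the os.chdir() approach: the yielded paths are relative, but the generator mutates the process-wide working directory while it runs, so it should not be interleaved with other code that depends on the current directory.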
zfs_autobackup/ZfsCheck.py

@ -1,8 +1,10 @@
from __future__ import print_function

import time
from signal import signal, SIGPIPE


from .TreeHasher import TreeHasher
from .BlockHasher import BlockHasher
from .ZfsNode import ZfsNode
from .util import *
from .CliBase import CliBase
@ -62,9 +64,11 @@ class ZfsCheck(CliBase):

            snapshot.mount(mnt)

            tree_hasher=TreeHasher(BlockHasher(count=count, bs=bs))

            self.debug("Hashing tree: {}".format(mnt))
            if not self.args.test:
                for (file, block, hash) in block_hash_tree(mnt, count, bs):
                for (file, block, hash) in tree_hasher.generate(mnt):
                    print("{}\t{}\t{}".format(file, block, hash))
                    sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect

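The flush after every line matters because a broken ssh pipe only surfaces on the next write; flushing per line makes the process notice a disconnected reader promptly. Given the `from signal import signal, SIGPIPE` import above, a common way to let SIGPIPE terminate the process quietly is the standard pattern below (a sketch of that general pattern, not necessarily this project's exact handler):

    from signal import signal, SIGPIPE, SIG_DFL

    # Python ignores SIGPIPE by default (writes raise BrokenPipeError instead);
    # restoring the default disposition makes the process exit silently when
    # its stdout pipe closes, e.g. on ssh disconnect.
    signal(SIGPIPE, SIG_DFL)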
@ -113,14 +117,14 @@ class ZfsCheck(CliBase):
    def hash_volume(self, snapshot, count, bs):
        try:
            dev=self.activate_volume_snapshot(snapshot)
            block_hasher=BlockHasher(count=count, bs=bs)

            self.debug("Hashing dev: {}".format(dev))
            if not self.args.test:
                for (block, hash) in block_hash(dev, count, bs):
                for (block, hash) in block_hasher.generate(dev):
                    print("{}\t{}".format(block, hash))
                    sys.stdout.flush()  # important, to generate SIGPIPEs on ssh disconnect


        finally:
            self.deacitvate_volume_snapshot(snapshot)


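The commented-out compare() stub at the end of BlockHasher.py hints at the intended verification step. A hypothetical sketch of how it might be fleshed out against the generate() output (the function name comes from the stub; the parameters and logic are assumptions, not the author's final API):

    from zfs_autobackup.BlockHasher import BlockHasher

    def compare(fname, input_generator, count=10000, bs=4096):
        # Re-hash the local file and index its chunk hashes by chunk number.
        local = dict(BlockHasher(count=count, bs=bs).generate(fname))
        # Yield the chunk numbers whose hash differs from the expected stream.
        for chunk_nr, hexdigest in input_generator:
            if local.get(chunk_nr) != hexdigest:
                yield chunk_nr

Holding all local hashes in memory is cheap relative to the data being checked: with the defaults, that is one roughly 40-byte hex digest per ~40 MB chunk.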
zfs_autobackup/util.py

@ -1,5 +1,3 @@
import hashlib

# root@psyt14s:/home/psy/zfs_autobackup# ls -lh /home/psy/Downloads/carimage.zip
# -rw-rw-r-- 1 psy psy 990M Nov 26  2020 /home/psy/Downloads/carimage.zip
# root@psyt14s:/home/psy/zfs_autobackup# time sha1sum /home/psy/Downloads/carimage.zip
@ -18,60 +16,6 @@ import hashlib
import os
import platform
import sys
import time



def block_hash(fname, count=10000, bs=4096):
    """This function was created to checksum huge files and blockdevices (TB's)
    Instead of one sha1sum of the whole file, it generates sha1susms of chunks of the file.

    yields sha1 hash of fname,  per count blocks.
    yields(chunk_nr, hexdigest)

    yields nothing for empty files.

    """

    with open(fname, "rb") as f:
        hash = hashlib.sha1()
        block_nr = 0
        chunk_nr = 0
        for block in iter(lambda: f.read(bs), b""):
            hash.update(block)
            block_nr = block_nr + 1
            if block_nr % count == 0:
                yield (chunk_nr, hash.hexdigest())
                chunk_nr = chunk_nr + 1
                hash = hashlib.sha1()

        # yield last (incomplete) block
        if block_nr % count != 0:
            yield (chunk_nr, hash.hexdigest())

def block_hash_tree(start_path, count=10000, bs=4096):
    """block_hash every file in a tree, yielding the results

    note that it only checks the contents of actual files. It ignores metadata like permissions and mtimes.
    It also ignores empty directories, symlinks and special files.
    """

    cwd=os.getcwd()
    os.chdir(start_path)

    def walkerror(e):
        raise e

    try:
        for (dirpath, dirnames, filenames) in os.walk(".", onerror=walkerror):
            for f in filenames:
                file_path=os.path.join(dirpath, f)[2:]

                if (not os.path.islink(file_path)) and os.path.isfile(file_path):
                    for (chunk_nr, hash) in block_hash(file_path, count, bs):
                        yield ( file_path, chunk_nr, hash )
    finally:
        os.chdir(cwd)


def tmp_name(suffix=""):
