Skip to content

Commit

Permalink
enh: compute the hash of the input file
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Dec 6, 2023
1 parent e459815 commit 316c606
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.6.1
- enh: compute the hash of the input file a second time while waiting
for the hash of the target file
0.6.0
- feat: generalize GUI to use all recipes
- enh: prevent GUI from locking when transferring large file
Expand Down
29 changes: 22 additions & 7 deletions mpl_data_cast/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import uuid
from typing import Type, Callable, List

from .util import hashfile, copyhashfile
from .util import HasherThread, hashfile, copyhashfile


#: Files that are not copied (unless specified explicitly by a recipe)
Expand Down Expand Up @@ -222,14 +222,29 @@ def transfer_to_target_path(temp_path: pathlib.Path,
success = True
else:
# transfer to target_path
hash_input_verify = copyhashfile(temp_path, target_path)

# Compute the hash of the target path *and* the hash of the
# input path (you never know) again. We save some time here
# by computing the hash in two parallel threads (assuming
# disk/network speed is the bottleneck, not the CPU).
thr_out = HasherThread(target_path)
thr_out.start()
if hash_input is None:
hash_input = copyhashfile(temp_path, target_path)
else:
shutil.copy2(temp_path, target_path)
# compute md5 hash of target path
hash_cp = hashfile(target_path)
thr_in = HasherThread(temp_path)
thr_in.start()
thr_in.join()
hash_input = thr_in.hash
thr_out.join()
hash_target = thr_out.hash

# sanity check
assert len(hash_target) == 32
assert len(hash_input) == 32
assert len(hash_input_verify) == 32

# compare md5 hashes (verification)
success = hash_input == hash_cp
success = hash_input == hash_target == hash_input_verify
if not success:
# Since we copied the wrong file, we are responsible for
# deleting it.
Expand Down
25 changes: 25 additions & 0 deletions mpl_data_cast/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,37 @@
import hashlib
import pathlib
import shutil
import threading
from typing import Callable


DEFAULT_BLOCK_SIZE = 4 * (1024 ** 2)


class HasherThread(threading.Thread):
def __init__(self, path, copy_to=None, *args, **kwargs):
"""Thread for hashing files
Parameters
----------
path: pathlib.Path
Path to hash
copy_to: pathlib.Path
Write data to this file while hashing
"""
super(HasherThread, self).__init__(*args, **kwargs)
self.path = path
self.copy_to = copy_to
self.hash = None

def run(self):
if self.copy_to:
self.hash = copyhashfile(path_in=self.path,
path_out=self.copy_to)
else:
self.hash = hashfile(self.path)


def copyhashfile(path_in: str | pathlib.Path,
path_out: str | pathlib.Path,
blocksize: int = DEFAULT_BLOCK_SIZE,
Expand Down

0 comments on commit 316c606

Please sign in to comment.