Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Centralize object hashing and provide a mechanism for types to register a hash #626

Open
effigies opened this issue Mar 7, 2023 · 2 comments
Labels
enhancement New feature or request

Comments

@effigies
Copy link
Contributor

effigies commented Mar 7, 2023

Right now we have hashing split up in a few places:

def hash_value(value, tp=None, metadata=None, precalculated=None):
    """Recursively convert ``value`` into a representation suitable for hashing.

    Parameters
    ----------
    value
        Object to convert. Tuples, lists, sets and dicts are traversed
        recursively; every other object is treated as a leaf.
    tp
        Declared type of ``value``. It is forwarded unchanged to nested
        elements and decides whether a leaf is hashed as a file or directory.
    metadata : dict, optional
        Field metadata. When it contains ``"container_path"`` the leaf is not
        hashed by content.
    precalculated : dict, optional
        Cache that is forwarded to ``hash_file`` / ``hash_dir``.

    Returns
    -------
    A list for containers and numpy objects, the result of ``hash_file`` or
    ``hash_dir`` for existing file/directory leaves, and ``value`` itself for
    every other leaf.
    """
    if metadata is None:
        metadata = {}
    if isinstance(value, (tuple, list, set)):
        # NOTE(review): a ``set`` is emitted in its iteration order, which is
        # not guaranteed to be reproducible between interpreter runs - confirm
        # whether callers rely on a stable result here.
        return [hash_value(el, tp, metadata, precalculated) for el in value]
    elif isinstance(value, dict):
        dict_hash = {
            k: hash_value(v, tp, metadata, precalculated) for (k, v) in value.items()
        }
        # Emit [key, value] pairs sorted by key so that the insertion order of
        # the dictionary does not influence the result.
        return [list(el) for el in sorted(dict_hash.items(), key=lambda x: x[0])]
    else:  # not a container
        if (
            (tp is File or "pydra.engine.specs.File" in str(tp))
            and is_existing_file(value)
            and "container_path" not in metadata
        ):
            return hash_file(value, precalculated=precalculated)
        elif (
            # NOTE(review): ``tp is File`` can never be the deciding test in
            # this branch, because the branch above already handles it under
            # the same remaining conditions; it was presumably meant to be
            # ``tp is Directory`` - confirm before changing.
            (tp is File or "pydra.engine.specs.Directory" in str(tp))
            and is_existing_file(value)
            and "container_path" not in metadata
        ):
            return hash_dir(value, precalculated=precalculated)
        elif type(value).__module__ == "numpy":  # numpy objects
            return [
                hash_value(el, tp, metadata, precalculated)
                for el in ensure_list(value.tolist())
            ]
        else:
            return value

def hash_function(obj):
    """Return the SHA-256 hex digest of ``str(obj)``.

    The digest is computed from the string form of ``obj``, so two objects
    with the same ``str()`` output produce the same hash.
    """
    return sha256(str(obj).encode()).hexdigest()

def hash_file(
    afile, chunk_len=8192, crypto=sha256, raise_notfound=True, precalculated=None
):
    """Compute the content hash of a file.

    Parameters
    ----------
    afile
        Path to the file. ``None``, a ``LazyField`` or a list yields ``None``.
    chunk_len : int
        Number of bytes read from the file per iteration.
    crypto
        Callable returning a hash object that provides ``update`` and
        ``hexdigest``.
    raise_notfound : bool
        If ``True`` and ``afile`` is not an existing file, raise
        ``RuntimeError``; if ``False``, return ``None`` instead.
    precalculated : dict, optional
        Cache mapping ``str(Path(afile))`` to ``(mtime, content_hash)``. A
        cached hash is reused when the stored modification time matches the
        current one, and the cache is updated in place after hashing.

    Returns
    -------
    str or None
        Hex digest of the file contents, or ``None`` as described above.
    """
    from .specs import LazyField

    if afile is None or isinstance(afile, (LazyField, list)):
        return None
    path = Path(afile)
    if not path.is_file():
        if raise_notfound:
            raise RuntimeError(f'File "{afile}" not found.')
        return None

    use_cache = precalculated is not None
    cache_key = str(path)
    # Sample the modification time *before* reading the contents: if the file
    # is rewritten while it is being hashed, the stored mtime is then older
    # than the file's and the next lookup recomputes the hash instead of
    # trusting a stale entry.
    mtime = path.stat().st_mtime if use_cache else None

    # If the path is already in the cache, reuse the stored hash as long as
    # the file has not changed since it was computed.
    if use_cache and cache_key in precalculated:
        pre_mtime, pre_cont_hash = precalculated[cache_key]
        if mtime == pre_mtime:
            return pre_cont_hash

    crypto_obj = crypto()
    with open(afile, "rb") as fp:
        # Read in fixed-size chunks so large files are never held in memory.
        for data in iter(lambda: fp.read(chunk_len), b""):
            crypto_obj.update(data)
    cont_hash = crypto_obj.hexdigest()
    if use_cache:
        precalculated[cache_key] = (mtime, cont_hash)
    return cont_hash
def hash_dir(
    dirpath,
    crypto=sha256,
    ignore_hidden_files=False,
    ignore_hidden_dirs=False,
    raise_notfound=True,
    precalculated=None,
):
    """Compute hash of directory contents.

    This function computes the hash of every file in directory `dirpath` and
    then computes the hash of that list of hashes to return a single hash
    value. The directory is traversed recursively.

    Parameters
    ----------
    dirpath : :obj:`str`
        Path to directory. ``None``, a ``LazyField`` or a list yields ``None``.
    crypto : :obj:`function`
        Callable returning a hash object that provides ``update`` and
        ``hexdigest``.
    ignore_hidden_files : :obj:`bool`
        If `True`, ignore filenames that begin with `.`.
    ignore_hidden_dirs : :obj:`bool`
        If `True`, ignore files located directly in directories whose name
        begins with `.`. The root `dirpath` itself is never ignored.
    raise_notfound : :obj:`bool`
        If `True` and `dirpath` does not exist, raise `FileNotFoundError`. If
        `False` and `dirpath` does not exist, return `None`.
    precalculated : :obj:`dict`, optional
        Cache forwarded to ``hash_file``.

    Returns
    -------
    hash : :obj:`str`
        Hash of the directory contents.
    """
    from .specs import LazyField

    if dirpath is None or isinstance(dirpath, (LazyField, list)):
        return None
    root = Path(dirpath)
    if not root.is_dir():
        if raise_notfound:
            raise FileNotFoundError(f"Directory {dirpath} not found.")
        return None

    file_hashes = []
    for dpath, dirnames, filenames in os.walk(dirpath):
        # Sort in-place to guarantee order.
        dirnames.sort()
        filenames.sort()
        dpath = Path(dpath)
        # Compare Path objects rather than ``str(dpath)`` with the raw
        # argument: the string comparison is always unequal when ``dirpath``
        # is a Path or carries a trailing separator, which made a hidden root
        # directory skip itself.
        if ignore_hidden_dirs and dpath.name.startswith(".") and dpath != root:
            continue
        for filename in filenames:
            if ignore_hidden_files and filename.startswith("."):
                continue
            if not is_existing_file(dpath / filename):
                # Entries that do not resolve to an existing file contribute
                # their path instead of a content hash.
                file_hashes.append(str(dpath / filename))
            else:
                this_hash = hash_file(dpath / filename, precalculated=precalculated)
                file_hashes.append(this_hash)

    crypto_obj = crypto()
    for h in file_hashes:
        crypto_obj.update(h.encode())
    return crypto_obj.hexdigest()

An alternative approach could be to use functools.singledispatch:

@functools.singledispatch
def hash_obj(obj: object) -> bytes:
    """Return a SHA-256 digest for a generic object that has a ``__dict__``.

    The digest covers the object's class and, recursively, every instance
    attribute. Attributes are processed in sorted name order so the result
    does not depend on the order in which they were assigned.

    Types without a ``__dict__`` must register their own implementation via
    ``hash_obj.register``. Cyclic object graphs are not detected.

    Raises
    ------
    TypeError
        If ``obj`` has no ``__dict__`` and no hasher is registered for its
        type.
    """
    try:
        attrs = vars(obj)
    except TypeError:
        raise TypeError(
            f"Cannot hash object of type {type(obj)!r}: it has no __dict__ "
            "and no hasher is registered for it"
        ) from None
    hasher = sha256(f"{obj.__class__}".encode())
    for key, val in sorted(attrs.items()):
        # Each attribute contributes ":<name>:<digest of value>".
        hasher.update(b":" + key.encode() + b":" + hash_obj(val))
    return hasher.digest()

This defines a cryptographic hash for a generic object that applies recursively. We would need some bottom types that don't have __dict__:

@hash_obj.register
def _(obj: int) -> bytes:
    """Hash an integer from its decimal representation, tagged with its type."""
    return sha256(f"int:{obj}".encode()).digest()


@hash_obj.register
def _(obj: str) -> bytes:
    """Hash a string from its UTF-8 encoding, tagged with its type."""
    return sha256(b"str:" + obj.encode()).digest()


@hash_obj.register
def _(obj: dict) -> bytes:
    """Hash a dictionary from the hashes of its keys and values.

    The per-item digests are sorted before being combined, so the result is
    independent of insertion order.
    """
    hasher = sha256(b"dict:")
    for item_digest in sorted(hash_obj(k) + hash_obj(v) for k, v in obj.items()):
        hasher.update(item_digest)
    return hasher.digest()

And each type would be able to declare how much is needed to uniquely identify it across instances. We could add set() and frozenset() to ensure that these known-problematic builtin types are consistent. And then provide a means for a downstream tool to register a type with our hasher, such as:

# Downstream registration through the dispatcher's ``register`` decorator.
# ``MyType`` stands in for the downstream type; the body must return the
# bytes that identify an instance.
@pydra.utils.hash_obj.register
def _(obj: MyType) -> bytes:
    ...

or

# Functional alternative to the decorator form: pass the type and its hash
# function explicitly (``register_hash`` is a proposed name, not an existing API).
pydra.utils.register_hash(MyType, myhashfun)
@effigies
Copy link
Contributor Author

effigies commented Mar 7, 2023

@tclose
Copy link
Contributor

tclose commented Mar 8, 2023

This looks good to me

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants