Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly handle capitalized hashes #214

Merged
merged 5 commits into from
Nov 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pooch/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ def load_registry(self, fname):
if len(elements) == 3:
file_url = elements[2]
self.urls[file_name] = file_url
self.registry[file_name] = file_checksum
self.registry[file_name] = file_checksum.lower()

def is_available(self, fname):
"""
Expand Down
2 changes: 1 addition & 1 deletion pooch/tests/data/registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb

store.zip 0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a
store.zip 0498D2A001E71051BBD2ACD2346F38DA7CBD345A633CB7BF0F8A20938714B51A
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b

store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
Expand Down
57 changes: 31 additions & 26 deletions pooch/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,56 +204,43 @@ def test_file_hash_invalid_algorithm():
assert "'blah'" in str(exc.value)


def test_hash_matches():
@pytest.mark.parametrize("alg", ["sha256", "sha512", "md5"])
def test_hash_matches(alg):
"Make sure the hash checking function works"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
with open(fname, "rb") as fin:
data = fin.read()
# Check if the check passes
hasher = hashlib.new("sha256")
hasher = hashlib.new(alg)
hasher.update(data)
known_hash = f"{hasher.hexdigest()}"
known_hash = f"{alg}:{hasher.hexdigest()}"
assert hash_matches(fname, known_hash)
for alg in ("sha512", "md5"):
hasher = hashlib.new(alg)
hasher.update(data)
known_hash = f"{alg}:{hasher.hexdigest()}"
assert hash_matches(fname, known_hash)
# And also if it fails
known_hash = "p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
known_hash = f"{alg}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
assert not hash_matches(fname, known_hash)
for alg in ("sha512", "md5"):
known_hash = f"{alg}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
assert not hash_matches(fname, known_hash)


def test_hash_matches_strict():
@pytest.mark.parametrize("alg", ["sha256", "sha512", "md5"])
def test_hash_matches_strict(alg):
"Make sure the hash checking function raises an exception if strict"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
with open(fname, "rb") as fin:
data = fin.read()
# Check if the check passes
hasher = hashlib.new("sha256")
hasher = hashlib.new(alg)
hasher.update(data)
known_hash = f"{hasher.hexdigest()}"
known_hash = f"{alg}:{hasher.hexdigest()}"
assert hash_matches(fname, known_hash, strict=True)
for alg in ("sha512", "md5"):
hasher = hashlib.new(alg)
hasher.update(data)
known_hash = f"{alg}:{hasher.hexdigest()}"
assert hash_matches(fname, known_hash, strict=True)
# And also if it fails
bad_hash = "p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
bad_hash = f"{alg}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
with pytest.raises(ValueError) as error:
hash_matches(fname, bad_hash, strict=True, source="Neverland")
assert "Neverland" in str(error.value)
for alg in ("sha512", "md5"):
bad_hash = f"{alg}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
with pytest.raises(ValueError) as error:
hash_matches(fname, bad_hash, strict=True)
assert fname in str(error.value)
with pytest.raises(ValueError) as error:
hash_matches(fname, bad_hash, strict=True, source=None)
assert fname in str(error.value)


def test_hash_matches_none():
Expand All @@ -266,6 +253,24 @@ def test_hash_matches_none():
assert hash_matches(fname, known_hash=None, strict=True)


@pytest.mark.parametrize("alg", ["sha256", "sha512", "md5"])
def test_hash_matches_uppercase(alg):
    """
    Hash comparison must not depend on the letter case of the known hash.

    The digest of the tiny data file is computed with *alg*, upper-cased and
    prefixed with the algorithm name. It must still be accepted in strict
    mode. A truncated version of the same string must be rejected with a
    ValueError whose message mentions the given source.
    """
    path = os.path.join(DATA_DIR, "tiny-data.txt")
    check_tiny_data(path)
    # Hash the raw bytes of the file in a single call.
    with open(path, "rb") as stream:
        digest = hashlib.new(alg, stream.read()).hexdigest()
    # Passing case: same digest, but spelled in upper case.
    upper_hash = f"{alg}:{digest.upper()}"
    assert hash_matches(path, upper_hash, strict=True)
    # Failing case: dropping the last characters makes the hash invalid.
    truncated = upper_hash[:-5]
    with pytest.raises(ValueError) as error:
        hash_matches(path, truncated, strict=True, source="Neverland")
    assert "Neverland" in str(error.value)


def test_temporary_file():
"Make sure the file is writable and cleaned up in the end"
with temporary_file() as tmp:
Expand Down
9 changes: 7 additions & 2 deletions pooch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,8 @@ def hash_algorithm(hash_string):
md5
>>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
sha256
>>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
sha256
>>> print(hash_algorithm(None))
sha256

Expand All @@ -337,7 +339,7 @@ def hash_algorithm(hash_string):
algorithm = default
else:
algorithm = hash_string.split(":")[0]
return algorithm
return algorithm.lower()


def hash_matches(fname, known_hash, strict=False, source=None):
Expand All @@ -346,6 +348,9 @@ def hash_matches(fname, known_hash, strict=False, source=None):

If the *known_hash* is None, will always return True.

Converts hashes to lowercase before comparison to avoid system-specific
mismatches between hashes in the registry and computed hashes.

Parameters
----------
fname : str or PathLike
Expand All @@ -372,7 +377,7 @@ def hash_matches(fname, known_hash, strict=False, source=None):
return True
algorithm = hash_algorithm(known_hash)
new_hash = file_hash(fname, alg=algorithm)
matches = new_hash == known_hash.split(":")[-1]
matches = new_hash.lower() == known_hash.split(":")[-1].lower()
if strict and not matches:
if source is None:
source = str(fname)
Expand Down