Skip to content

Commit

Permalink
Copy dandischema.digests.zarr.get_checksum() to dandi-cli
Browse files Browse the repository at this point in the history
  • Loading branch information
jwodder committed Dec 8, 2023
1 parent b9a1099 commit c22aecd
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 8 deletions.
13 changes: 6 additions & 7 deletions dandi/files/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from time import sleep
from typing import Any

from dandischema.digests.zarr import get_checksum
from dandischema.models import BareAsset, DigestType
import requests
from zarr_checksum.tree import ZarrChecksumTree
Expand Down Expand Up @@ -153,25 +152,25 @@ def stat(self) -> ZarrStat:

def dirstat(dirpath: LocalZarrEntry) -> ZarrStat:
# Recursively walk `dirpath` and return a ZarrStat carrying the total size of
# the files found, a Zarr digest for the directory, and the list of file
# entries.
# NOTE(review): this listing appears to interleave the pre-change and
# post-change lines of a diff (both `dir_md5s`/`file_md5s` and
# `dir_info`/`file_info` are populated, and two `digest=` arguments are
# present) -- confirm which set is current before editing.
# Avoid heavy import by importing within function:
from dandi.support.digests import md5file_nocache
from dandi.support.digests import checksum_zarr_dir, md5file_nocache

# Running total of file sizes seen at or below this directory
size = 0
dir_md5s = {}
file_md5s = {}
# name -> (digest, size) for the immediate subdirectories and files
dir_info = {}
file_info = {}
# Every file entry found at or below this directory
files = []
for p in dirpath.iterdir():
if p.is_dir():
# Recurse, then fold the subdirectory's totals into this directory's
st = dirstat(p)
size += st.size
dir_md5s[p.name] = (st.digest.value, st.size)
# A subdirectory is recorded by its computed digest value and its size
dir_info[p.name] = (st.digest.value, st.size)
files.extend(st.files)
else:
size += p.size
file_md5s[p.name] = (md5file_nocache(p.filepath), p.size)
# A file is recorded by the result of md5file_nocache() and its size
file_info[p.name] = (md5file_nocache(p.filepath), p.size)
files.append(p)
return ZarrStat(
size=size,
digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)),
# The digest is built only from the immediate children's (digest, size) pairs
digest=Digest.dandi_zarr(checksum_zarr_dir(file_info, dir_info)),
files=files,
)


Expand Down
29 changes: 29 additions & 0 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from dandischema.digests.dandietag import DandiETag
from fscacher import PersistentCache
from zarr_checksum.checksum import ZarrChecksum, ZarrChecksumManifest
from zarr_checksum.tree import ZarrChecksumTree

from .threaded_walk import threaded_walk
Expand Down Expand Up @@ -134,3 +135,31 @@ def md5file_nocache(filepath: str | Path) -> str:
present in Zarrs
"""
return Digester(["md5"])(filepath)["md5"]


def checksum_zarr_dir(
    files: dict[str, tuple[str, int]], directories: dict[str, tuple[str, int]]
) -> str:
    """
    Compute a directory's Zarr checksum using only details about its
    immediate children.

    :param files:
        Maps the name of each file directly inside the directory to a pair of
        its MD5 digest and its size
    :param directories:
        Maps the name of each subdirectory directly inside the directory to a
        pair of its Zarr checksum and the combined size of every file found
        recursively beneath it
    :returns: the digest string generated from the resulting manifest
    """

    def as_entries(info: dict[str, tuple[str, int]]) -> list[ZarrChecksum]:
        # Turn one name -> (digest, size) mapping into manifest entries,
        # keeping the mapping's iteration order.
        entries = []
        for entry_name, (entry_digest, entry_size) in info.items():
            entries.append(
                ZarrChecksum(digest=entry_digest, name=entry_name, size=entry_size)
            )
        return entries

    listing = ZarrChecksumManifest(
        files=as_entries(files),
        directories=as_entries(directories),
    )
    listing_digest = listing.generate_digest()
    return listing_digest.digest
56 changes: 55 additions & 1 deletion dandi/support/tests/test_digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@

from pathlib import Path

import pytest
from pytest_mock import MockerFixture

from .. import digests
from ..digests import Digester, get_zarr_checksum
from ..digests import Digester, checksum_zarr_dir, get_zarr_checksum


def test_digester(tmp_path):
Expand Down Expand Up @@ -101,3 +102,56 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
== "f77f4c5b277575f781c19ba91422f0c5-8--197"
)
spy.assert_called_once_with(sub2 / "file7.txt")


# Each case is (files, directories, expected checksum); both mappings go from
# an entry name to a (digest, size) pair.
@pytest.mark.parametrize(
"files,directories,checksum",
[
# No files and no subdirectories
({}, {}, "481a2f77ab786a0f45aafd5db0971caa-0--0"),
# A single file
(
{"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
{},
"f21b9b4bf53d7ce1167bcfae76371e59-1--1",
),
# A single subdirectory named like the file in the previous case
(
{},
{"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1)},
"ea8b8290b69b96422a3ed1cca0390f21-1--1",
),
# Two files
(
{
"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1),
"baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2),
},
{},
"4e67de4393d14c1e9c472438f0f1f8b1-2--3",
),
# Two subdirectories
(
{},
{
"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1),
"baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2),
},
"859ca1926affe9c7d0424030f26fbd89-2--3",
),
# Same two subdirectory names, listed in the opposite order and with the
# sizes swapped between them
(
{},
{
"baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", 1),
"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--2", 2),
},
"8f8361a286c9a7c3fbfd464e33989037-2--3",
),
# One file together with one subdirectory
(
{"baz": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
{"bar": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2)},
"3cb139f47d3a3580388f41956c15f55e-2--3",
),
],
)
def test_checksum_zarr_dir(
files: dict[str, tuple[str, int]],
directories: dict[str, tuple[str, int]],
checksum: str,
) -> None:
# checksum_zarr_dir() must return exactly the expected checksum string
assert checksum_zarr_dir(files=files, directories=directories) == checksum

0 comments on commit c22aecd

Please sign in to comment.