Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion dlt/_workspace/deployment/package_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ def write_package_to_stream(
}
)
# Create and add manifest with file metadata at the end
# NOTE: Sort files in manifest because os.scandir(), which the file selector's pathspec.util.iter_tree_files() relies on,
# yields files in a system-dependent order (https://peps.python.org/pep-0471/#os-scandir).
manifest: TDeploymentManifest = {
"engine_version": DEPLOYMENT_ENGINE_VERSION,
"files": manifest_files,
"files": sorted(manifest_files, key=lambda x: x["relative_path"]),
}
manifest_yaml = yaml.dump(
manifest, allow_unicode=True, default_flow_style=False, sort_keys=False
Expand Down
6 changes: 3 additions & 3 deletions dlt/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,11 @@ def digest256_file_stream(stream: BinaryIO, chunk_size: int = 4096) -> str:


def digest256_tar_stream(stream: BinaryIO, chunk_size: int = 8192) -> str:
"""Returns a base64 encoded sha3_256 hash of tar archive contents (ignoring metadata)
"""Returns a base64 encoded sha3_256 hash of tar archive contents.

Hashes only filenames and file contents, ignoring timestamps and other metadata.
This ensures identical file contents produce identical hashes regardless of when
the tar was created.
Members are sorted by name before hashing, so tar member order doesn't affect
the hash.

Note: This function operates entirely in-memory using tar.extractfile() which reads
from the archive stream. No files are written to disk, preventing leakage of sensitive
Expand Down
25 changes: 25 additions & 0 deletions tests/workspace/deployment/test_package_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,28 @@ def test_build_package() -> None:

assert package_path != package_path_2
assert content_hash == content_hash_2


def test_manifest_files_are_sorted() -> None:
    """Check that the package hash does not change with the selector's iteration order.

    The package is written twice: once using the regular workspace selector and
    once using a selector that yields the same files in reverse. The values
    returned by both writes must be equal.
    """
    with isolated_workspace("default") as ctx:
        builder = DeploymentPackageBuilder(ctx)
        selector = WorkspaceFileSelector(ctx)

        baseline_hash = builder.write_package_to_stream(selector, BytesIO())

        natural_order = list(selector)
        flipped_order = natural_order[::-1]
        # Reversing must actually produce a different sequence, otherwise
        # the comparison below would not exercise anything.
        assert natural_order != flipped_order

        # Stand-in selector that replays a fixed list of files as given
        class ReversedSelector(WorkspaceFileSelector):
            def __init__(self, files):
                # The parent initializer is not invoked; only the file list is stored.
                self.files = files

            def __iter__(self):
                return iter(self.files)

        flipped_selector = ReversedSelector(flipped_order)
        flipped_hash = builder.write_package_to_stream(flipped_selector, BytesIO())

        assert baseline_hash == flipped_hash
Loading