From 8a2699dfbbe19794c7a767217d930a6abe6d6150 Mon Sep 17 00:00:00 2001
From: Ruslan Kuprieiev
Date: Sat, 15 Feb 2020 01:13:45 +0200
Subject: [PATCH] schema: use a more flexible hash schema (#3333)

Fixes #3331
---
Notes (ignored by git-am):
  * CHECKSUM_SCHEMA now names the validator for a single checksum value:
    None passes through, "" is normalised to None, and a str (or an int
    coerced to str) of at least 3 characters is lower-cased.
  * The per-remote mapping previously called CHECKSUM_SCHEMA is renamed
    to CHECKSUMS_SCHEMA; Stage.SCHEMA reuses the single-value validator.

 dvc/output/__init__.py           | 20 +++++++++------
 dvc/stage.py                     |  2 +-
 tests/unit/output/test_output.py | 42 ++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 8 deletions(-)
 create mode 100644 tests/unit/output/test_output.py

diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py
index abcf681912..e86deeba56 100644
--- a/dvc/output/__init__.py
+++ b/dvc/output/__init__.py
@@ -1,5 +1,5 @@
 from urllib.parse import urlparse
-from voluptuous import Any, Required
+from voluptuous import Any, Required, Lower, Length, Coerce, And, SetTo
 
 from dvc.output.base import OutputBase
 from dvc.output.gs import OutputGS
@@ -29,6 +29,12 @@
     Schemes.LOCAL: OutputLOCAL,
 }
 
+CHECKSUM_SCHEMA = Any(
+    None,
+    And(str, Length(max=0), SetTo(None)),
+    And(Any(str, And(int, Coerce(str))), Length(min=3), Lower),
+)
+
 # NOTE: currently there are only 3 possible checksum names:
 #
 #    1) md5 (LOCAL, SSH, GS);
@@ -37,15 +43,15 @@
 #
 # so when a few types of outputs share the same name, we only need
 # specify it once.
-CHECKSUM_SCHEMA = {
-    RemoteLOCAL.PARAM_CHECKSUM: Any(str, None),
-    RemoteS3.PARAM_CHECKSUM: Any(str, None),
-    RemoteHDFS.PARAM_CHECKSUM: Any(str, None),
+CHECKSUMS_SCHEMA = {
+    RemoteLOCAL.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
+    RemoteS3.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
+    RemoteHDFS.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
 }
 
-TAGS_SCHEMA = {str: CHECKSUM_SCHEMA}
+TAGS_SCHEMA = {str: CHECKSUMS_SCHEMA}
 
-SCHEMA = CHECKSUM_SCHEMA.copy()
+SCHEMA = CHECKSUMS_SCHEMA.copy()
 SCHEMA[Required(OutputBase.PARAM_PATH)] = str
 SCHEMA[OutputBase.PARAM_CACHE] = bool
 SCHEMA[OutputBase.PARAM_METRIC] = OutputBase.METRIC_SCHEMA
diff --git a/dvc/stage.py b/dvc/stage.py
index 954be11ba1..8c9d79b8ad 100644
--- a/dvc/stage.py
+++ b/dvc/stage.py
@@ -191,7 +191,7 @@ class Stage(object):
     PARAM_ALWAYS_CHANGED = "always_changed"
 
     SCHEMA = {
-        PARAM_MD5: Any(str, None),
+        PARAM_MD5: output.CHECKSUM_SCHEMA,
         PARAM_CMD: Any(str, None),
         PARAM_WDIR: Any(str, None),
         PARAM_DEPS: Any([dependency.SCHEMA], None),
diff --git a/tests/unit/output/test_output.py b/tests/unit/output/test_output.py
new file mode 100644
index 0000000000..2cf30e9680
--- /dev/null
+++ b/tests/unit/output/test_output.py
@@ -0,0 +1,42 @@
+import pytest
+
+from voluptuous import Schema, MultipleInvalid
+
+from dvc.output import CHECKSUM_SCHEMA
+
+
+@pytest.mark.parametrize(
+    "value,expected",
+    [
+        ("", None),
+        (None, None),
+        (11111, "11111"),
+        ("11111", "11111"),
+        ("aAaBa", "aaaba"),
+        (
+            "3cc286c534a71504476da009ed174423",
+            "3cc286c534a71504476da009ed174423",
+        ),  # md5
+        (
+            "d41d8cd98f00b204e9800998ecf8427e-38",
+            "d41d8cd98f00b204e9800998ecf8427e-38",
+        ),  # etag
+        (
+            "000002000000000000000000c16859d1d071c6b1ffc9c8557d4909f1",
+            "000002000000000000000000c16859d1d071c6b1ffc9c8557d4909f1",
+        ),  # hdfs checksum
+        # Not much we can do about hex and oct values without writing our own
+        # parser. So listing these test cases just to acknowledge this.
+        # See https://github.com/iterative/dvc/issues/3331.
+        (0x3451, "13393"),
+        (0o1244, "676"),
+    ],
+)
+def test_checksum_schema(value, expected):
+    assert Schema(CHECKSUM_SCHEMA)(value) == expected
+
+
+@pytest.mark.parametrize("value", ["1", "11", {}, {"a": "b"}, [], [1, 2]])
+def test_checksum_schema_fail(value):
+    with pytest.raises(MultipleInvalid):
+        Schema(CHECKSUM_SCHEMA)(value)["md5"]