Skip to content

Commit

Permalink
schema: use a more flexible hash schema (iterative#3333)
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop authored Feb 14, 2020
1 parent 276a2b0 commit 8a2699d
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 8 deletions.
20 changes: 13 additions & 7 deletions dvc/output/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from urllib.parse import urlparse
from voluptuous import Any, Required
from voluptuous import Any, Required, Lower, Length, Coerce, And, SetTo

from dvc.output.base import OutputBase
from dvc.output.gs import OutputGS
Expand Down Expand Up @@ -29,6 +29,12 @@
Schemes.LOCAL: OutputLOCAL,
}

# Validator for a single checksum value. `Any` accepts the first alternative
# that validates:
#   1) None is accepted as-is;
#   2) an empty string is normalized to None;
#   3) a string, or an int coerced to a string, must be at least 3
#      characters long and is lowercased.
# NOTE(review): ints are presumably accepted because an all-digit checksum
# can be loaded from a stage file as a number -- confirm against the loader.
CHECKSUM_SCHEMA = Any(
None,
# Empty string means "no checksum": replace it with None.
And(str, Length(max=0), SetTo(None)),
# The length check runs after the int -> str coercion, then Lower
# normalizes the case.
And(Any(str, And(int, Coerce(str))), Length(min=3), Lower),
)

# NOTE: currently there are only 3 possible checksum names:
#
# 1) md5 (LOCAL, SSH, GS);
Expand All @@ -37,15 +43,15 @@
#
# so when a few types of outputs share the same name, we only need to
# specify it once.
CHECKSUM_SCHEMA = {
RemoteLOCAL.PARAM_CHECKSUM: Any(str, None),
RemoteS3.PARAM_CHECKSUM: Any(str, None),
RemoteHDFS.PARAM_CHECKSUM: Any(str, None),
CHECKSUMS_SCHEMA = {
RemoteLOCAL.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
RemoteS3.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
RemoteHDFS.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
}

TAGS_SCHEMA = {str: CHECKSUM_SCHEMA}
TAGS_SCHEMA = {str: CHECKSUMS_SCHEMA}

SCHEMA = CHECKSUM_SCHEMA.copy()
SCHEMA = CHECKSUMS_SCHEMA.copy()
SCHEMA[Required(OutputBase.PARAM_PATH)] = str
SCHEMA[OutputBase.PARAM_CACHE] = bool
SCHEMA[OutputBase.PARAM_METRIC] = OutputBase.METRIC_SCHEMA
Expand Down
2 changes: 1 addition & 1 deletion dvc/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ class Stage(object):
PARAM_ALWAYS_CHANGED = "always_changed"

SCHEMA = {
PARAM_MD5: Any(str, None),
PARAM_MD5: output.CHECKSUM_SCHEMA,
PARAM_CMD: Any(str, None),
PARAM_WDIR: Any(str, None),
PARAM_DEPS: Any([dependency.SCHEMA], None),
Expand Down
42 changes: 42 additions & 0 deletions tests/unit/output/test_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest

from voluptuous import Schema, MultipleInvalid

from dvc.output import CHECKSUM_SCHEMA


@pytest.mark.parametrize(
    ("value", "expected"),
    [
        # Missing checksums: empty string and None both come out as None.
        ("", None),
        (None, None),
        # Integers are coerced to their decimal string form.
        (11111, "11111"),
        ("11111", "11111"),
        # Mixed-case input is lowercased.
        ("aAaBa", "aaaba"),
        # md5
        (
            "3cc286c534a71504476da009ed174423",
            "3cc286c534a71504476da009ed174423",
        ),
        # etag
        (
            "d41d8cd98f00b204e9800998ecf8427e-38",
            "d41d8cd98f00b204e9800998ecf8427e-38",
        ),
        # hdfs checksum
        (
            "000002000000000000000000c16859d1d071c6b1ffc9c8557d4909f1",
            "000002000000000000000000c16859d1d071c6b1ffc9c8557d4909f1",
        ),
        # Hex and octal literals reach the schema as plain ints, so they are
        # rendered in decimal. Avoiding that would require a custom parser;
        # these cases only document the current behaviour.
        # See https://github.com/iterative/dvc/issues/3331.
        (0x3451, "13393"),
        (0o1244, "676"),
    ],
)
def test_checksum_schema(value, expected):
    """CHECKSUM_SCHEMA normalizes each accepted value to `expected`."""
    validate = Schema(CHECKSUM_SCHEMA)
    assert validate(value) == expected


@pytest.mark.parametrize("value", ["1", "11", {}, {"a": "b"}, [], [1, 2]])
def test_checksum_schema_fail(value):
    """CHECKSUM_SCHEMA rejects too-short strings, dicts and lists.

    Validation itself must raise MultipleInvalid. The result is deliberately
    not subscripted: if validation ever passed by mistake, pytest reports
    "DID NOT RAISE" instead of an unrelated TypeError from indexing.
    """
    with pytest.raises(MultipleInvalid):
        Schema(CHECKSUM_SCHEMA)(value)

0 comments on commit 8a2699d

Please sign in to comment.