diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 0000000000..4e110ab044 --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,8 @@ +{ + "title": "DVC: Data Version Control - Git for Data & Models", + "keywords": [ + "data-science", "data-version-control", "machine-learning", "git", + "developer-tools", "reproducibility", "collaboration", "ai", "python"], + "contributors": [ + {"name": "DVC team", "type": "Other", "affiliation": "Iterative"}] +} diff --git a/README.rst b/README.rst index 12f974cd99..9c24974bda 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,4 @@ -.. image:: https://dvc.org/static/img/logo-github-readme.png - :target: https://dvc.org - :alt: DVC logo +|Banner| `Website `_ • `Docs `_ @@ -10,33 +8,7 @@ • `Tutorial `_ • `Mailing List `_ -.. image:: https://img.shields.io/badge/release-ok-brightgreen - :target: https://travis-ci.com/iterative/dvc - :alt: Release - -.. image:: https://img.shields.io/travis/com/iterative/dvc/master?label=dev - :target: https://travis-ci.com/iterative/dvc - :alt: Travis dev branch - -.. image:: https://codeclimate.com/github/iterative/dvc/badges/gpa.svg - :target: https://codeclimate.com/github/iterative/dvc - :alt: Code Climate - -.. image:: https://codecov.io/gh/iterative/dvc/branch/master/graph/badge.svg - :target: https://codecov.io/gh/iterative/dvc - :alt: Codecov - -.. image:: https://img.shields.io/badge/patreon-donate-green.svg - :target: https://www.patreon.com/DVCorg/overview - :alt: Donate - -.. image:: https://anaconda.org/conda-forge/dvc/badges/version.svg - :target: https://anaconda.org/conda-forge/dvc - :alt: Conda-forge - -.. image:: https://img.shields.io/badge/snap-install-82BEA0.svg?logo=snapcraft - :target: https://snapcraft.io/dvc - :alt: Snapcraft +|Release| |CI| |Maintainability| |Coverage| |Donate| |Conda| |Snap| |DOI| | @@ -79,9 +51,7 @@ to store data and model files seamlessly out of Git, while preserving almost the were stored in Git itself. To store and share the data cache, DVC supports multiple remotes - any cloud (S3, Azure, Google Cloud, etc) or any on-premise network storage (via SSH, for example). -.. image:: https://dvc.org/static/img/flow.gif - :target: https://dvc.org/static/img/flow.gif - :alt: how_dvc_works +|Flowchart| The DVC pipelines (computational graph) feature connects code and data together. It is possible to explicitly specify all steps required to produce a model: input dependencies including data, commands to run, @@ -148,6 +118,8 @@ Homebrew Conda (Anaconda) ---------------- +|Conda| + .. code-block:: bash conda install -c conda-forge dvc @@ -157,6 +129,8 @@ Currently, this includes support for Python versions 2.7, 3.6 and 3.7. Snap (Snapcraft) ---------------- +|Snap| + .. code-block:: bash snap install dvc --classic @@ -206,40 +180,43 @@ Comparison to related technologies Contributing ============ + +|Maintainability| |Donate| + Contributions are welcome! Please see our `Contributing Guide `_ for more details. .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/0 - :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/0 - :alt: 0 + :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/0 + :alt: 0 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/1 - :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/1 - :alt: 1 + :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/1 + :alt: 1 .. 
image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/2
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/2
-    :alt: 2
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/2
+   :alt: 2
 
 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/3
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/3
-    :alt: 3
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/3
+   :alt: 3
 
 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/4
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/4
-    :alt: 4
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/4
+   :alt: 4
 
 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/5
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/5
-    :alt: 5
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/5
+   :alt: 5
 
 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/6
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/6
-    :alt: 6
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/6
+   :alt: 6
 
 .. image:: https://sourcerer.io/fame/efiop/iterative/dvc/images/7
-    :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/7
-    :alt: 7
+   :target: https://sourcerer.io/fame/efiop/iterative/dvc/links/7
+   :alt: 7
 
 Mailing List
 ============
@@ -253,3 +230,51 @@ This project is distributed under the Apache license version 2.0 (see the LICENS
 
 By submitting a pull request to this project, you agree to license your
 contribution under the Apache license version 2.0 to this project.
+
+Citation
+========
+
+|DOI|
+
+Iterative, *DVC: Data Version Control - Git for Data & Models* (2020)
+`DOI:10.5281/zenodo.3677553 <https://doi.org/10.5281/zenodo.3677553>`_.
+
+.. |Banner| image:: https://dvc.org/static/img/logo-github-readme.png
+   :target: https://dvc.org
+   :alt: DVC logo
+
+.. |Release| image:: https://img.shields.io/badge/release-ok-brightgreen
+   :target: https://travis-ci.com/iterative/dvc
+   :alt: Release
+
+.. |CI| image:: https://img.shields.io/travis/com/iterative/dvc/master?label=dev
+   :target: https://travis-ci.com/iterative/dvc
+   :alt: Travis dev branch
+
+.. |Maintainability| image:: https://codeclimate.com/github/iterative/dvc/badges/gpa.svg
+   :target: https://codeclimate.com/github/iterative/dvc
+   :alt: Code Climate
+
+.. |Coverage| image:: https://codecov.io/gh/iterative/dvc/branch/master/graph/badge.svg
+   :target: https://codecov.io/gh/iterative/dvc
+   :alt: Codecov
+
+.. |Donate| image:: https://img.shields.io/badge/patreon-donate-green.svg
+   :target: https://www.patreon.com/DVCorg/overview
+   :alt: Donate
+
+.. |Conda| image:: https://anaconda.org/conda-forge/dvc/badges/version.svg
+   :target: https://anaconda.org/conda-forge/dvc
+   :alt: Conda-forge
+
+.. |Snap| image:: https://img.shields.io/badge/snap-install-82BEA0.svg?logo=snapcraft
+   :target: https://snapcraft.io/dvc
+   :alt: Snapcraft
+
+.. |DOI| image:: https://img.shields.io/badge/DOI-10.5281/zenodo.3677553-blue.svg
+   :target: https://doi.org/10.5281/zenodo.3677553
+   :alt: DOI
+
+.. |Flowchart| image:: https://dvc.org/static/img/flow.gif
+   :target: https://dvc.org/static/img/flow.gif
+   :alt: how_dvc_works
diff --git a/dvc/api.py b/dvc/api.py
index def9e8cf5b..d1312baaf3 100644
--- a/dvc/api.py
+++ b/dvc/api.py
@@ -18,10 +18,10 @@ def __init__(self, url):
 
 
 def get_url(path, repo=None, rev=None, remote=None):
-    """
-    Returns the full URL to the data artifact specified by its `path` in a
-    `repo`.
-    NOTE: There is no guarantee that the file actually exists in that location.
+ """Returns URL to the storage location of a data artifact tracked + by DVC, specified by its path in a repo. + + NOTE: There's no guarantee that the file actually exists in that location. """ with _make_repo(repo, rev=rev) as _repo: _require_dvc(_repo) @@ -31,7 +31,7 @@ def get_url(path, repo=None, rev=None, remote=None): def open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): - """Context manager to open a file artifact as a file object.""" + """Context manager to open a tracked file as a file object.""" args = (path,) kwargs = { "repo": repo, @@ -63,7 +63,7 @@ def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): - """Returns the contents of a file artifact.""" + """Returns the contents of a tracked file.""" with open( path, repo=repo, rev=rev, remote=remote, mode=mode, encoding=encoding ) as fd: diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 5667adfe56..804e65fd80 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -28,7 +28,7 @@ def _format(diff): dir/ dir/1 - An example of a diff formatted when entries contain checksums: + An example of a diff formatted when entries contain hash: Added: d3b07384 foo @@ -66,7 +66,7 @@ def _digest(checksum): for entry in entries: path = entry["path"] - checksum = entry.get("checksum") + checksum = entry.get("hash") summary[state] += 1 if not path.endswith(os.sep) else 0 content.append( "{space}{checksum}{separator}{path}".format( @@ -100,10 +100,10 @@ def run(self): if not any(diff.values()): return 0 - if not self.args.checksums: + if not self.args.show_hash: for _, entries in diff.items(): for entry in entries: - del entry["checksum"] + del entry["hash"] if self.args.show_json: res = json.dumps(diff) @@ -149,7 +149,7 @@ def add_parser(subparsers, parent_parser): default=False, ) diff_parser.add_argument( - "--checksums", + "--show-hash", help="Display hash value for each entry", action="store_true", default=False, diff --git a/dvc/command/gc.py b/dvc/command/gc.py index 2ee4979d5e..67c0927d2e 100644 --- a/dvc/command/gc.py +++ b/dvc/command/gc.py @@ -12,6 +12,16 @@ class CmdGC(CmdBase): def run(self): + from dvc.repo.gc import _raise_error_if_all_disabled + + _raise_error_if_all_disabled( + all_branches=self.args.all_branches, + all_tags=self.args.all_tags, + all_commits=self.args.all_commits, + workspace=self.args.workspace, + cloud=self.args.cloud, + ) + msg = "This will remove all cache except items used in " msg += "the working tree" @@ -47,6 +57,7 @@ def run(self): force=self.args.force, jobs=self.args.jobs, repos=self.args.repos, + workspace=self.args.workspace, ) return 0 @@ -64,6 +75,13 @@ def add_parser(subparsers, parent_parser): help=GC_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) + gc_parser.add_argument( + "-w", + "--workspace", + action="store_true", + default=False, + help="Keep data files used in the current workspace.", + ) gc_parser.add_argument( "-a", "--all-branches", diff --git a/dvc/config.py b/dvc/config.py index d893855f71..6a59fff1c4 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -101,6 +101,13 @@ class RelPath(str): "shared": All(Lower, Choices("group")), Optional("slow_link_warning", default=True): Bool, } +HTTP_COMMON = { + "auth": All(Lower, Choices("basic", "digest", "custom")), + "custom_auth_header": str, + "user": str, + "password": str, + "ask_password": Bool, +} SCHEMA = { "core": { "remote": Lower, @@ -169,8 +176,8 @@ class RelPath(str): 
"gdrive_user_credentials_file": str, **REMOTE_COMMON, }, - "http": REMOTE_COMMON, - "https": REMOTE_COMMON, + "http": {**HTTP_COMMON, **REMOTE_COMMON}, + "https": {**HTTP_COMMON, **REMOTE_COMMON}, "remote": {str: object}, # Any of the above options are valid } ) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index b91fd1424a..2f6c25a47b 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -7,6 +7,10 @@ class DvcException(Exception): """Base class for all dvc exceptions.""" +class InvalidArgumentError(ValueError, DvcException): + """Thrown if arguments are invalid.""" + + class OutputDuplicationError(DvcException): """Thrown if a file/directory is specified as an output in more than one stage. diff --git a/dvc/output/base.py b/dvc/output/base.py index d59276da7a..d07010ad6a 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -409,12 +409,23 @@ def get_used_cache(self, **kwargs): cache.external[dep.repo_pair].add(dep.def_path) return cache - if not self.info: - logger.warning( - "Output '{}'({}) is missing version info. Cache for it will " - "not be collected. Use `dvc repro` to get your pipeline up to " - "date.".format(self, self.stage) + if not self.checksum: + msg = ( + "Output '{}'({}) is missing version info. " + "Cache for it will not be collected. " + "Use `dvc repro` to get your pipeline up to date.".format( + self, self.stage + ) ) + if self.exists: + msg += ( + "\n" + "You can also use `dvc commit {stage}` to associate " + "existing '{out}' with '{stage}'.".format( + out=self, stage=self.stage.relpath + ) + ) + logger.warning(msg) return NamedCache() ret = NamedCache.make(self.scheme, self.checksum, str(self)) diff --git a/dvc/remote/gdrive.py b/dvc/remote/gdrive.py index 0175cbd74a..fe115d2978 100644 --- a/dvc/remote/gdrive.py +++ b/dvc/remote/gdrive.py @@ -1,3 +1,4 @@ +from collections import defaultdict import os import posixpath import logging @@ -5,7 +6,7 @@ import threading from urllib.parse import urlparse -from funcy import retry, compose, decorator, wrap_with +from funcy import retry, wrap_with, wrap_prop, cached_property from funcy.py3 import cat from dvc.progress import Tqdm @@ -19,10 +20,6 @@ FOLDER_MIME_TYPE = "application/vnd.google-apps.folder" -class GDriveRetriableError(DvcException): - pass - - class GDrivePathNotFound(DvcException): def __init__(self, path_info): super().__init__("Google Drive path '{}' not found.".format(path_info)) @@ -41,30 +38,18 @@ def __init__(self, path): ) -@decorator -def _wrap_pydrive_retriable(call): +def gdrive_retry(func): from pydrive2.files import ApiRequestError - try: - result = call() - except ApiRequestError as exception: - retry_codes = ["403", "500", "502", "503", "504"] - if any( - "HttpError {}".format(code) in str(exception) - for code in retry_codes - ): - raise GDriveRetriableError("Google API request failed") - raise - return result + retry_re = re.compile(r"HttpError (403|500|502|503|504)") - -gdrive_retry = compose( # 15 tries, start at 0.5s, multiply by golden ratio, cap at 20s - retry( - 15, GDriveRetriableError, timeout=lambda a: min(0.5 * 1.618 ** a, 20) - ), - _wrap_pydrive_retriable, -) + return retry( + 15, + timeout=lambda a: min(0.5 * 1.618 ** a, 20), + errors=ApiRequestError, + filter_errors=lambda exc: retry_re.search(str(exc)), + )(func) class GDriveURLInfo(CloudURLInfo): @@ -126,127 +111,98 @@ def __init__(self, repo, config): ) self._list_params = None - self._gdrive = None - - self._cache_initialized = False - self._remote_root_id = None - self._cached_dirs = None - self._cached_ids = 
None - @property - @wrap_with(threading.RLock()) + @wrap_prop(threading.RLock()) + @cached_property def drive(self): from pydrive2.auth import RefreshError + from pydrive2.auth import GoogleAuth + from pydrive2.drive import GoogleDrive + + if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): + with open( + self._gdrive_user_credentials_path, "w" + ) as credentials_file: + credentials_file.write( + os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) + ) - if not self._gdrive: - from pydrive2.auth import GoogleAuth - from pydrive2.drive import GoogleDrive + GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + GoogleAuth.DEFAULT_SETTINGS["client_config"] = { + "client_id": self._client_id, + "client_secret": self._client_secret, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "revoke_uri": "https://oauth2.googleapis.com/revoke", + "redirect_uri": "", + } + GoogleAuth.DEFAULT_SETTINGS["save_credentials"] = True + GoogleAuth.DEFAULT_SETTINGS["save_credentials_backend"] = "file" + GoogleAuth.DEFAULT_SETTINGS[ + "save_credentials_file" + ] = self._gdrive_user_credentials_path + GoogleAuth.DEFAULT_SETTINGS["get_refresh_token"] = True + GoogleAuth.DEFAULT_SETTINGS["oauth_scope"] = [ + "https://www.googleapis.com/auth/drive", + "https://www.googleapis.com/auth/drive.appdata", + ] + + # Pass non existent settings path to force DEFAULT_SETTINGS loading + gauth = GoogleAuth(settings_file="") + try: + gauth.CommandLineAuth() + except RefreshError as exc: + raise GDriveAccessTokenRefreshError from exc + except KeyError as exc: + raise GDriveMissedCredentialKeyError( + self._gdrive_user_credentials_path + ) from exc + # Handle pydrive2.auth.AuthenticationError and other auth failures + except Exception as exc: + raise DvcException("Google Drive authentication failed") from exc + finally: if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): - with open( - self._gdrive_user_credentials_path, "w" - ) as credentials_file: - credentials_file.write( - os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) - ) - - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - GoogleAuth.DEFAULT_SETTINGS["client_config"] = { - "client_id": self._client_id, - "client_secret": self._client_secret, - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "revoke_uri": "https://oauth2.googleapis.com/revoke", - "redirect_uri": "", - } - GoogleAuth.DEFAULT_SETTINGS["save_credentials"] = True - GoogleAuth.DEFAULT_SETTINGS["save_credentials_backend"] = "file" - GoogleAuth.DEFAULT_SETTINGS[ - "save_credentials_file" - ] = self._gdrive_user_credentials_path - GoogleAuth.DEFAULT_SETTINGS["get_refresh_token"] = True - GoogleAuth.DEFAULT_SETTINGS["oauth_scope"] = [ - "https://www.googleapis.com/auth/drive", - "https://www.googleapis.com/auth/drive.appdata", - ] - - # Pass non existent settings path to force DEFAULT_SETTINGS loading - gauth = GoogleAuth(settings_file="") - - try: - gauth.CommandLineAuth() - except RefreshError as exc: - raise GDriveAccessTokenRefreshError from exc - except KeyError as exc: - raise GDriveMissedCredentialKeyError( - self._gdrive_user_credentials_path - ) from exc - # Handle pydrive2.auth.AuthenticationError and other auth failures - except Exception as exc: - raise DvcException( - "Google Drive authentication failed" - ) from exc - finally: - if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): - os.remove(self._gdrive_user_credentials_path) 
- - self._gdrive = GoogleDrive(gauth) - - return self._gdrive + os.remove(self._gdrive_user_credentials_path) - @wrap_with(threading.RLock()) - def _initialize_cache(self): - if self._cache_initialized: - return + return GoogleDrive(gauth) + + @wrap_prop(threading.RLock()) + @cached_property + def cache(self): + cache = {"dirs": defaultdict(list), "ids": {}} + + cache["root_id"] = self._get_remote_id(self.path_info) + cache["dirs"][self.path_info.path] = [cache["root_id"]] + self._cache_path(self.path_info.path, cache["root_id"], cache) - cached_dirs = {} - cached_ids = {} - self._remote_root_id = self._get_remote_id(self.path_info) - for dir1 in self.gdrive_list_item( - "'{}' in parents and trashed=false".format(self._remote_root_id) + for item in self.gdrive_list_item( + "'{}' in parents and trashed=false".format(cache["root_id"]) ): - remote_path = posixpath.join(self.path_info.path, dir1["title"]) - cached_dirs.setdefault(remote_path, []).append(dir1["id"]) - cached_ids[dir1["id"]] = dir1["title"] - - self._cached_dirs = cached_dirs - self._cached_ids = cached_ids - self._cache_initialized = True - - @property - def cached_dirs(self): - if not self._cache_initialized: - self._initialize_cache() - return self._cached_dirs - - @property - def cached_ids(self): - if not self._cache_initialized: - self._initialize_cache() - return self._cached_ids - - @property - def remote_root_id(self): - if not self._cache_initialized: - self._initialize_cache() - return self._remote_root_id - - @property + remote_path = (self.path_info / item["title"]).path + self._cache_path(remote_path, item["id"], cache) + + return cache + + def _cache_path(self, remote_path, remote_id, cache=None): + cache = cache or self.cache + cache["dirs"][remote_path].append(remote_id) + cache["ids"][remote_id] = remote_path + + @cached_property def list_params(self): - if not self._list_params: - params = {"corpora": "default"} - if self._bucket != "root" and self._bucket != "appDataFolder": - params["driveId"] = self._get_remote_drive_id(self._bucket) - params["corpora"] = "drive" - self._list_params = params - return self._list_params + params = {"corpora": "default"} + if self._bucket != "root" and self._bucket != "appDataFolder": + params["driveId"] = self._get_remote_drive_id(self._bucket) + params["corpora"] = "drive" + return params @gdrive_retry def gdrive_upload_file( self, parent_id, title, - no_progress_bar=True, + no_progress_bar=False, from_file="", progress_name="", ): @@ -300,16 +256,14 @@ def gdrive_list_item(self, query): @wrap_with(threading.RLock()) def gdrive_create_dir(self, parent_id, title, remote_path): - if parent_id == self.remote_root_id: - cached = self.cached_dirs.get(remote_path, []) - if cached: - return cached[0] + cached = self.cache["dirs"].get(remote_path) + if cached: + return cached[0] item = self._create_remote_dir(parent_id, title) - if parent_id == self.remote_root_id: - self.cached_dirs.setdefault(remote_path, []).append(item["id"]) - self.cached_ids[item["id"]] = item["title"] + if parent_id == self.cache["root_id"]: + self._cache_path(remote_path, item["id"]) return item["id"] @@ -362,10 +316,8 @@ def _get_remote_drive_id(self, remote_id): def _get_cached_remote_ids(self, path): if not path: return [self._bucket] - if self._cache_initialized: - if path == self.path_info.path: - return [self.remote_root_id] - return self.cached_dirs.get(path, []) + if "cache" in self.__dict__: + return self.cache["dirs"].get(path, []) return [] def _path_to_remote_ids(self, path, create): @@ -416,25 
+368,18 @@ def _download(self, from_info, to_file, name, no_progress_bar): file_id = self._get_remote_id(from_info) self.gdrive_download_file(file_id, to_file, name, no_progress_bar) - def all(self): - if not self.cached_ids: + def list_cache_paths(self): + if not self.cache["ids"]: return - query = "({})".format( - " or ".join( - "'{}' in parents".format(dir_id) for dir_id in self.cached_ids - ) + parents_query = " or ".join( + "'{}' in parents".format(dir_id) for dir_id in self.cache["ids"] ) + query = "({}) and trashed=false".format(parents_query) - query += " and trashed=false" - for file1 in self.gdrive_list_item(query): - parent_id = file1["parents"][0]["id"] - path = posixpath.join(self.cached_ids[parent_id], file1["title"]) - try: - yield self.path_to_checksum(path) - except ValueError: - # We ignore all the non-cache looking files - logger.debug('Ignoring path as "non-cache looking"') + for item in self.gdrive_list_item(query): + parent_id = item["parents"][0]["id"] + yield posixpath.join(self.cache["ids"][parent_id], item["title"]) def remove(self, path_info): remote_id = self._get_remote_id(path_info) diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index d9ee6d0e1f..9230699535 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -51,7 +51,7 @@ def _upload_to_bucket( to_info, chunk_size=None, name=None, - no_progress_bar=True, + no_progress_bar=False, ): blob = bucket.blob(to_info.path, chunk_size=chunk_size) with io.open(from_file, mode="rb") as fobj: @@ -166,7 +166,7 @@ def exists(self, path_info): """ return self.isfile(path_info) or self.isdir(path_info) - def _upload(self, from_file, to_info, name=None, no_progress_bar=True): + def _upload(self, from_file, to_info, name=None, no_progress_bar=False): bucket = self.gs.bucket(to_info.bucket) _upload_to_bucket( bucket, @@ -176,7 +176,7 @@ def _upload(self, from_file, to_info, name=None, no_progress_bar=True): no_progress_bar=no_progress_bar, ) - def _download(self, from_info, to_file, name=None, no_progress_bar=True): + def _download(self, from_info, to_file, name=None, no_progress_bar=False): bucket = self.gs.bucket(from_info.bucket) blob = bucket.get_blob(from_info.path) with io.open(to_file, mode="wb") as fobj: diff --git a/dvc/remote/http.py b/dvc/remote/http.py index f3b3fb5a55..d0f35029bb 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -1,8 +1,10 @@ import logging +import os.path import threading -from funcy import cached_property, wrap_prop +from funcy import cached_property, memoize, wrap_prop, wrap_with +import dvc.prompt as prompt from dvc.config import ConfigError from dvc.exceptions import DvcException, HTTPError from dvc.progress import Tqdm @@ -12,6 +14,15 @@ logger = logging.getLogger(__name__) +@wrap_with(threading.Lock()) +@memoize +def ask_password(host, user): + return prompt.password( + "Enter a password for " + "host '{host}' user '{user}'".format(host=host, user=user) + ) + + class RemoteHTTP(RemoteBASE): scheme = Schemes.HTTP SESSION_RETRIES = 5 @@ -24,7 +35,13 @@ def __init__(self, repo, config): super().__init__(repo, config) url = config.get("url") - self.path_info = self.path_cls(url) if url else None + if url: + self.path_info = self.path_cls(url) + user = config.get("user", None) + if user: + self.path_info.user = user + else: + self.path_info = None if not self.no_traverse: raise ConfigError( @@ -32,6 +49,12 @@ def __init__(self, repo, config): "files. 
Use: `dvc remote modify no_traverse true`" ) + self.auth = config.get("auth", None) + self.custom_auth_header = config.get("custom_auth_header", None) + self.password = config.get("password", None) + self.ask_password = config.get("ask_password", False) + self.headers = {} + def _download(self, from_info, to_file, name=None, no_progress_bar=False): response = self._request("GET", from_info.url, stream=True) if response.status_code != 200: @@ -48,6 +71,28 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): fd.write(chunk) pbar.update(len(chunk)) + def _upload(self, from_file, to_info, name=None, no_progress_bar=False): + with Tqdm( + total=None if no_progress_bar else os.path.getsize(from_file), + leave=False, + bytes=True, + desc=to_info.url if name is None else name, + disable=no_progress_bar, + ) as pbar: + + def chunks(): + with open(from_file, "rb") as fd: + while True: + chunk = fd.read(self.CHUNK_SIZE) + if not chunk: + break + pbar.update(len(chunk)) + yield chunk + + response = self._request("POST", to_info.url, data=chunks()) + if response.status_code not in (200, 201): + raise HTTPError(response.status_code, response.reason) + def exists(self, path_info): return bool(self._request("HEAD", path_info.url)) @@ -74,6 +119,24 @@ def get_file_checksum(self, path_info): return etag + def auth_method(self, path_info=None): + from requests.auth import HTTPBasicAuth, HTTPDigestAuth + + if path_info is None: + path_info = self.path_info + + if self.auth: + if self.ask_password and self.password is None: + host, user = path_info.host, path_info.user + self.password = ask_password(host, user) + if self.auth == "basic": + return HTTPBasicAuth(path_info.user, self.password) + if self.auth == "digest": + return HTTPDigestAuth(path_info.user, self.password) + if self.auth == "custom" and self.custom_auth_header: + self.headers.update({self.custom_auth_header: self.password}) + return None + @wrap_prop(threading.Lock()) @cached_property def _session(self): @@ -100,7 +163,13 @@ def _request(self, method, url, **kwargs): kwargs.setdefault("timeout", self.REQUEST_TIMEOUT) try: - res = self._session.request(method, url, **kwargs) + res = self._session.request( + method, + url, + auth=self.auth_method(), + headers=self.headers, + **kwargs, + ) redirect_no_location = ( kwargs["allow_redirects"] diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 36dc6b073c..6205e24e51 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -24,6 +24,7 @@ def locked(f): @wraps(f) def wrapper(repo, *args, **kwargs): with repo.lock, repo.state: + repo._reset() ret = f(repo, *args, **kwargs) # Our graph cache is no longer valid after we release the repo.lock repo._reset() diff --git a/dvc/repo/add.py b/dvc/repo/add.py index f216d6ddec..b0a0fa586c 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -126,6 +126,7 @@ def _create_stages(repo, targets, fname, pbar=None): stage = Stage.create( repo, outs=[out], accompany_outs=True, fname=fname ) + repo._reset() if not stage: if pbar is not None: diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 1f6dbeee6a..8b474ef9cc 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -66,10 +66,10 @@ def _paths_checksums(): modified = sorted(set(old) & set(new)) return { - "added": [{"path": path, "checksum": new[path]} for path in added], - "deleted": [{"path": path, "checksum": old[path]} for path in deleted], + "added": [{"path": path, "hash": new[path]} for path in added], + "deleted": [{"path": path, "hash": old[path]} for path in 
deleted], "modified": [ - {"path": path, "checksum": {"old": old[path], "new": new[path]}} + {"path": path, "hash": {"old": old[path], "new": new[path]}} for path in modified if old[path] != new[path] ], diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index e55a34dc3f..a19c3932ba 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -2,6 +2,7 @@ from . import locked from dvc.cache import NamedCache +from dvc.exceptions import InvalidArgumentError logger = logging.getLogger(__name__) @@ -13,6 +14,15 @@ def _do_gc(typ, func, clist): logger.info("No unused '{}' cache to remove.".format(typ)) +def _raise_error_if_all_disabled(**kwargs): + if not any(kwargs.values()): + raise InvalidArgumentError( + "Invalid Arguments. Either of ({}) needs to be enabled.".format( + ", ".join(kwargs.keys()) + ) + ) + + @locked def gc( self, @@ -25,7 +35,20 @@ def gc( force=False, jobs=None, repos=None, + workspace=False, ): + + # require `workspace` to be true to come into effect. + # assume `workspace` to be enabled if any of `all_tags`, `all_commits`, + # or `all_branches` are enabled. + _raise_error_if_all_disabled( + workspace=workspace, + all_tags=all_tags, + all_commits=all_commits, + all_branches=all_branches, + cloud=cloud, + ) + from contextlib import ExitStack from dvc.repo import Repo diff --git a/dvc/version.py b/dvc/version.py index 479a8001f7..9cd7ed8c16 100644 --- a/dvc/version.py +++ b/dvc/version.py @@ -7,7 +7,7 @@ import subprocess -_BASE_VERSION = "0.86.2" +_BASE_VERSION = "0.86.5" def _generate_version(base_version): diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 0727aa8ced..004685410c 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -1,12 +1,8 @@ #!/usr/bin/env bash - -#---------------------------------------------------------- -# Repository: https://github.com/iterative/dvc -# # References: # - https://www.gnu.org/software/bash/manual/html_node/Programmable-Completion.html # - https://opensource.com/article/18/3/creating-bash-completion-script -#---------------------------------------------------------- +# - https://stackoverflow.com/questions/12933362 _dvc_commands='add cache checkout commit config destroy diff fetch get-url get gc \ import-url import init install lock list metrics move pipeline pull push \ @@ -15,15 +11,19 @@ _dvc_commands='add cache checkout commit config destroy diff fetch get-url get g _dvc_options='-h --help -V --version' _dvc_global_options='-h --help -q --quiet -v --verbose' -_dvc_add='-R --recursive -f --file --no-commit $(compgen -G *)' +_dvc_add='-R --recursive -f --file --no-commit' +_dvc_add_COMPGEN=_dvc_compgen_files _dvc_cache='dir' _dvc_cache_dir=' --global --system --local -u --unset' -_dvc_checkout='-d --with-deps -R --recursive -f --force --relink $(compgen -G *.dvc)' -_dvc_commit='-f --force -d --with-deps -R --recursive $(compgen -G *.dvc)' +_dvc_checkout='-d --with-deps -R --recursive -f --force --relink' +_dvc_checkout_COMPGEN=_dvc_compgen_DVCFiles +_dvc_commit='-f --force -d --with-deps -R --recursive' +_dvc_commit_COMPGEN=_dvc_compgen_DVCFiles _dvc_config='-u --unset --local --system --global' _dvc_destroy='-f --force' -_dvc_diff='-t --show-json --checksums' -_dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' +_dvc_diff='-t --show-json --show-hash' +_dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive' +_dvc_fetch_COMPGEN=_dvc_compgen_DVCFiles _dvc_get_url='' _dvc_get='-o --out --rev 
--show-url'
 _dvc_gc='-a --all-branches -T --all-tags -c --cloud -r --remote -f --force -p --projects -j --jobs'
@@ -31,72 +31,96 @@ _dvc_import_url='-f --file'
 _dvc_import='-o --out --rev'
 _dvc_init='--no-scm -f --force'
 _dvc_install=''
-_dvc_list='-R --recursive --outs-only --rev $(compgen -G *)'
-_dvc_lock='$(compgen -G *.dvc)'
+_dvc_list='-R --recursive --outs-only --rev'
+_dvc_list_COMPGEN=_dvc_compgen_files
+_dvc_lock='$(compgen -f -X \!*?.dvc)'
 _dvc_metrics='add modify rmeove show'
-_dvc_metrics_add='-t --type -x --xpath $(compgen -G *)'
-_dvc_metrics_show='$(-t --type -x --xpath -a --all-branches -T --all-tags -R --recursive $(compgen -G *)'
+_dvc_metrics_add='-t --type -x --xpath'
+_dvc_metrics_add_COMPGEN=_dvc_compgen_files
+_dvc_metrics_show='-t --type -x --xpath -a --all-branches -T --all-tags -R --recursive'
+_dvc_metrics_show_COMPGEN=_dvc_compgen_files
 _dvc_metrics_diff='--targets -t --type -x --xpath -R --show-json'
-_dvc_metrics_modify='-t --type -x --xpath $(compgen -G *)'
-_dvc_metrics_remove='$(compgen -G *)'
-_dvc_move='$(compgen -G *)'
+_dvc_metrics_modify='-t --type -x --xpath'
+_dvc_metrics_modify_COMPGEN=_dvc_compgen_files
+_dvc_metrics_remove='$(compgen -f)'
+_dvc_move='$(compgen -f)'
 _dvc_pipeline='list show'
 _dvc_pipeline_list=''
-_dvc_pipeline_show='-c --commands -o --outs --ascii --dot --tree -l --locked $(compgen -G *.dvc)'
-_dvc_pull='-j --jobs -r --remote -a --all-branches -T --all-tags -f --force -d --with-deps -R --recursive $(compgen -G *.dvc)'
-_dvc_push='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)'
+_dvc_pipeline_show='-c --commands -o --outs --ascii --dot --tree -l --locked'
+_dvc_pipeline_show_COMPGEN=_dvc_compgen_DVCFiles
+_dvc_pull='-j --jobs -r --remote -a --all-branches -T --all-tags -f --force -d --with-deps -R --recursive'
+_dvc_pull_COMPGEN=_dvc_compgen_DVCFiles
+_dvc_push='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive'
+_dvc_push_COMPGEN=_dvc_compgen_DVCFiles
 _dvc_remote='add default list modify remove'
 _dvc_remote_add='--global --system --local -d --default -f --force'
 _dvc_remote_default='--global --system --local -u --unset'
 _dvc_remote_list='--global --system --local'
 _dvc_remote_modify='--global --system --local -u --unset'
 _dvc_remote_remove='--global --system --local'
-_dvc_remove='-o --outs -p --purge -f --force $(compgen -G *.dvc)'
-_dvc_repro='-f --force -s --single-item -c --cwd -m --metrics --dry -i --interactive -p --pipeline -P --all-pipelines --ignore-build-cache --no-commit -R --recursive --downstream $(compgen -G *.dvc)'
+_dvc_remove='-o --outs -p --purge -f --force'
+_dvc_remove_COMPGEN=_dvc_compgen_DVCFiles
+_dvc_repro='-f --force -s --single-item -c --cwd -m --metrics --dry -i --interactive -p --pipeline -P --all-pipelines --ignore-build-cache --no-commit -R --recursive --downstream'
+_dvc_repro_COMPGEN=_dvc_compgen_DVCFiles
 _dvc_root=''
 _dvc_run='--no-exec -f --file -c --cwd -d --deps -o --outs -O --outs-no-cache --outs-persist --outs-persist-no-cache -m --metrics -M --metrics-no-cache -y --yes --overwrite-dvcfile --ignore-build-cache --remove-outs --no-commit -w --wdir'
-_dvc_status='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -c --cloud $(compgen -G *.dvc)'
-_dvc_unlock='$(compgen -G *.dvc)'
-_dvc_unprotect='$(compgen -G *)'
-_dvc_update='--rev $(compgen -G *.dvc)'
+_dvc_status='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -c --cloud'
+_dvc_status_COMPGEN=_dvc_compgen_DVCFiles
+_dvc_unlock_COMPGEN=_dvc_compgen_DVCFiles +_dvc_unprotect_COMPGEN=_dvc_compgen_files +_dvc_update='--rev' +_dvc_update_COMPGEN=_dvc_compgen_DVCFiles _dvc_version='' -# Params -# $1 - COMP_WORDS[1] -comp_command() { - local options_list="_dvc_$(replace_hyphen $1)" +# $1=COMP_WORDS[1] +_dvc_compgen_DVCFiles() { + compgen -f -X '!*?.dvc' -- $1 + compgen -d -S '/' -- $1 # recurse into subdirs + # Note that the recurse into dirs is only for looking for DVC-files. + # Since dirs themselves are not required, we need `-o nospace` at the bottom + # unfortunately :( +} + +# $1=COMP_WORDS[1] +_dvc_compgen_files() { + compgen -f -- $1 + compgen -d -S '/' -- $1 # recurse into subdirs +} + +# $1=COMP_WORDS[1] +_dvc_replace_hyphen() { + echo $1 | sed 's/-/_/g' +} - COMPREPLY=( $(compgen -W "$_dvc_global_options ${!options_list}" -- "$word") ) +# $1=COMP_WORDS[1] +_dvc_compgen_command() { + local flags_list="_dvc_$(_dvc_replace_hyphen $1)" + local args_gen="${flags_list}_COMPGEN" + COMPREPLY=( $(compgen -W "$_dvc_global_options ${!flags_list}" -- "$word"; [ -n "${!args_gen}" ] && ${!args_gen} "$word") ) } -# Params -# $1 - COMP_WORDS[1] -# $1 - COMP_WORDS[2] -comp_subcommand() { - local options_list="_dvc_$(replace_hyphen $1)_$(replace_hyphen $2)" - if [ -z "${!options_list}" ]; then - comp_command $1 +# $1=COMP_WORDS[1] +# $2=COMP_WORDS[2] +_dvc_compgen_subcommand() { + local flags_list="_dvc_$(_dvc_replace_hyphen $1)_$(_dvc_replace_hyphen $2)" + local args_gen="${flags_list}_COMPGEN" + [ -n "${!args_gen}" ] && local opts_more="$(${!args_gen} "$word")" + local opts="${!flags_list}" + if [ -z "$opts$opts_more" ]; then + _dvc_compgen_command $1 else - COMPREPLY=( $(compgen -W "$_dvc_global_options ${!options_list}" -- "$word") ) + COMPREPLY=( $(compgen -W "$_dvc_global_options $opts" -- "$word"; [ -n "$opts_more" ] && echo "$opts_more") ) fi } # Notes: -# # `COMPREPLY` contains what will be rendered after completion is triggered -# # `word` refers to the current typed word -# # `${!var}` is to evaluate the content of `var` and expand its content as a variable -# # hello="world" # x="hello" # ${!x} -> ${hello} -> "world" -# _dvc() { - replace_hyphen() { - echo $(echo $1 | sed 's/-/_/g') - } local word="${COMP_WORDS[COMP_CWORD]}" COMPREPLY=() @@ -107,12 +131,12 @@ _dvc() { *) COMPREPLY=($(compgen -W "$_dvc_commands" -- "$word")) ;; esac elif [ "${COMP_CWORD}" -eq 2 ]; then - comp_command ${COMP_WORDS[1]} - elif [ "${COMP_CWORD}" -eq 3 ]; then - comp_subcommand ${COMP_WORDS[1]} ${COMP_WORDS[2]} + _dvc_compgen_command ${COMP_WORDS[1]} + elif [ "${COMP_CWORD}" -ge 3 ]; then + _dvc_compgen_subcommand ${COMP_WORDS[1]} ${COMP_WORDS[2]} fi return 0 } -complete -F _dvc dvc +complete -o nospace -F _dvc dvc diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 22084cb765..af97520e00 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -99,7 +99,7 @@ _dvc_destroy=( _dvc_diff=( "--show-json[Format the output into a JSON]" - "--checksums[Display checksums for each entry]" + "--show-hash[Display hash value for each entry]" "1:Old Git commit to compare (defaults to HEAD):" "2:New Git commit to compare (defaults to the current workspace):" ) @@ -254,7 +254,6 @@ _dvc_run=( _dvc_status=( {-j,--jobs}"[Number of jobs to run simultaneously.]:Number of jobs:" - "--show-checksums[Show checksums instead of file names.]" {-q,--quiet}"[Suppresses all output. 
Exit with 0 if pipelines are up to date, otherwise 1.]" {-c,--cloud}"[Show status of a local cache compared to a remote repository.]" {-r,--remote}"[Remote repository to compare local cache to.]:Remote repository:" diff --git a/setup.py b/setup.py index b39a1f6e91..88b111960e 100644 --- a/setup.py +++ b/setup.py @@ -148,7 +148,8 @@ def run(self): "hdfs": hdfs, "tests": tests_requirements, }, - keywords="data science, data version control, machine learning", + keywords="data-science data-version-control machine-learning git" + " developer-tools reproducibility collaboration ai", python_requires=">=3.5", classifiers=[ "Development Status :: 4 - Beta", diff --git a/tests/conftest.py b/tests/conftest.py index 6e01739aac..a6a4c9ba8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest from dvc.remote.ssh.connection import SSHConnection +from tests.utils.httpd import PushRequestHandler, StaticFileServer from .dir_helpers import * # noqa @@ -57,3 +58,9 @@ def _close_pools(): yield close_pools() + + +@pytest.fixture +def http_server(tmp_dir): + with StaticFileServer(handler_class=PushRequestHandler) as httpd: + yield httpd diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 13b83c6315..95407ee7d0 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -655,3 +655,10 @@ def test_add_from_data_dir(tmp_dir, scm, dvc): "tracked output: 'dir'.\n" "To include '{out}' in 'dir', run 'dvc commit dir.dvc'" ).format(out=os.path.join("dir", "file2")) + + +def test_not_raises_on_re_add(tmp_dir, dvc): + tmp_dir.dvc_gen("file", "file content") + + tmp_dir.gen({"file2": "file2 content", "file": "modified file"}) + dvc.add(["file2", "file"]) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 39df54bbbf..b28c2e438b 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -30,6 +30,7 @@ GCP, GDrive, HDFS, + HTTP, Local, S3, SSHMocked, @@ -290,6 +291,20 @@ def _get_cloud_class(self): return RemoteHDFS +@pytest.mark.usefixtures("http_server") +class TestRemoteHTTP(HTTP, TestDataCloudBase): + @pytest.fixture(autouse=True) + def setup_method_fixture(self, request, http_server): + self.http_server = http_server + self.method_name = request.function.__name__ + + def get_url(self): + return super().get_url(self.http_server.server_port) + + def _get_cloud_class(self): + return RemoteHTTP + + class TestDataCloudCLIBase(TestDvc): def main(self, args): ret = main(args) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 7af8f3afca..827039fb13 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -22,7 +22,7 @@ def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") assert dvc.diff() == { - "added": [{"path": "file", "checksum": digest("text")}], + "added": [{"path": "file", "hash": digest("text")}], "deleted": [], "modified": [], } @@ -41,15 +41,15 @@ def test_no_cache_entry(tmp_dir, scm, dvc): assert dvc.diff() == { "added": [ - {"path": os.path.join("dir", ""), "checksum": dir_checksum}, - {"path": os.path.join("dir", "1"), "checksum": digest("1")}, - {"path": os.path.join("dir", "2"), "checksum": digest("2")}, + {"path": os.path.join("dir", ""), "hash": dir_checksum}, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, ], "deleted": [], "modified": [ { "path": "file", - "checksum": {"old": digest("first"), "new": digest("second")}, + "hash": {"old": digest("first"), "new": digest("second")}, } ], } @@ 
-61,7 +61,7 @@ def test_deleted(tmp_dir, scm, dvc): assert dvc.diff() == { "added": [], - "deleted": [{"path": "file", "checksum": digest("text")}], + "deleted": [{"path": "file", "hash": digest("text")}], "modified": [], } @@ -76,7 +76,7 @@ def test_modified(tmp_dir, scm, dvc): "modified": [ { "path": "file", - "checksum": {"old": digest("first"), "new": digest("second")}, + "hash": {"old": digest("first"), "new": digest("second")}, } ], } @@ -94,17 +94,13 @@ def test_refs(tmp_dir, scm, dvc): assert dvc.diff("HEAD~1") == { "added": [], "deleted": [], - "modified": [ - {"path": "file", "checksum": {"old": HEAD_1, "new": HEAD}} - ], + "modified": [{"path": "file", "hash": {"old": HEAD_1, "new": HEAD}}], } assert dvc.diff("HEAD~2", "HEAD~1") == { "added": [], "deleted": [], - "modified": [ - {"path": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} - ], + "modified": [{"path": "file", "hash": {"old": HEAD_2, "new": HEAD_1}}], } with pytest.raises(DvcException, match=r"unknown Git revision 'missing'"): @@ -128,42 +124,40 @@ def test_directories(tmp_dir, scm, dvc): "added": [ { "path": os.path.join("dir", ""), - "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", + "hash": "5fb6b29836c388e093ca0715c872fe2a.dir", }, - {"path": os.path.join("dir", "1"), "checksum": digest("1")}, - {"path": os.path.join("dir", "2"), "checksum": digest("2")}, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, ], "deleted": [], "modified": [], } assert dvc.diff(":/directory", ":/modify") == { - "added": [{"path": os.path.join("dir", "3"), "checksum": digest("3")}], + "added": [{"path": os.path.join("dir", "3"), "hash": digest("3")}], "deleted": [], "modified": [ { "path": os.path.join("dir", ""), - "checksum": { + "hash": { "old": "5fb6b29836c388e093ca0715c872fe2a.dir", "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", }, }, { "path": os.path.join("dir", "2"), - "checksum": {"old": digest("2"), "new": digest("two")}, + "hash": {"old": digest("2"), "new": digest("two")}, }, ], } assert dvc.diff(":/modify", ":/delete") == { "added": [], - "deleted": [ - {"path": os.path.join("dir", "2"), "checksum": digest("two")} - ], + "deleted": [{"path": os.path.join("dir", "2"), "hash": digest("two")}], "modified": [ { "path": os.path.join("dir", ""), - "checksum": { + "hash": { "old": "9b5faf37366b3370fd98e3e60ca439c1.dir", "new": "83ae82fb367ac9926455870773ff09e6.dir", }, diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index f1aa4012ac..bb395669ab 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -1,9 +1,11 @@ +import logging import os import configobj import pytest from git import Repo +from dvc.compat import fspath from dvc.exceptions import CollectCacheError from dvc.main import main from dvc.repo import Repo as DvcRepo @@ -28,11 +30,11 @@ def setUp(self): self.bad_cache.append(path) def test_api(self): - self.dvc.gc() + self.dvc.gc(workspace=True) self._test_gc() def test_cli(self): - ret = main(["gc", "-f"]) + ret = main(["gc", "-wf"]) self.assertEqual(ret, 0) self._test_gc() @@ -169,10 +171,10 @@ def test(self): self._check_cache(3) - self.dvc.gc(repos=[self.additional_path]) + self.dvc.gc(repos=[self.additional_path], workspace=True) self._check_cache(3) - self.dvc.gc() + self.dvc.gc(workspace=True) self._check_cache(2) @@ -196,10 +198,10 @@ def test_gc_no_dir_cache(tmp_dir, dvc): os.unlink(dir_stage.outs[0].cache_path) with pytest.raises(CollectCacheError): - dvc.gc() + dvc.gc(workspace=True) assert 
_count_files(dvc.cache.local.cache_dir) == 4 - dvc.gc(force=True) + dvc.gc(force=True, workspace=True) assert _count_files(dvc.cache.local.cache_dir) == 2 @@ -218,5 +220,59 @@ def test_gc_no_unpacked_dir(tmp_dir, dvc): assert os.path.exists(unpackeddir) - dvc.gc(force=True) + dvc.gc(force=True, workspace=True) assert not os.path.exists(unpackeddir) + + +def test_gc_without_workspace_raises_error(tmp_dir, dvc): + dvc.gc(force=True, workspace=True) # works without error + + from dvc.exceptions import InvalidArgumentError + + with pytest.raises(InvalidArgumentError): + dvc.gc(force=True) + + with pytest.raises(InvalidArgumentError): + dvc.gc(force=True, workspace=False) + + +def test_gc_without_workspace_on_tags_branches_commits(tmp_dir, dvc): + dvc.gc(force=True, all_tags=True) + dvc.gc(force=True, all_commits=True) + dvc.gc(force=False, all_branches=True) + + # even if workspace is disabled, and others are enabled, assume as if + # workspace is enabled. + dvc.gc(force=False, all_branches=True, all_commits=False, workspace=False) + + +def test_gc_without_workspace(tmp_dir, dvc, caplog): + with caplog.at_level(logging.WARNING, logger="dvc"): + assert main(["gc", "-vf"]) == 255 + + assert "Invalid Arguments" in caplog.text + + +def test_gc_with_possible_args_positive(tmp_dir, dvc): + for flag in [ + "-w", + "-a", + "-T", + "--all-commits", + "-aT", + "-wa", + "-waT", + ]: + assert main(["gc", "-vf", flag]) == 0 + + +def test_gc_cloud_positive(tmp_dir, dvc, tmp_path_factory): + with dvc.config.edit() as conf: + storage = fspath(tmp_path_factory.mktemp("test_remote_base")) + conf["remote"]["local_remote"] = {"url": storage} + conf["core"]["remote"] = "local_remote" + + dvc.push() + + for flag in ["-c", "-ca", "-cT", "-caT", "-cwT"]: + assert main(["gc", "-vf", flag]) == 0 diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index c9beb777bb..d084de26b7 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -175,8 +175,9 @@ def test_nested(self): Stage.load(self.dvc, error_stage_path), ] - with self.assertRaises(StagePathAsOutputError): - self.dvc.reproduce(error_stage_path) + with patch.object(self.dvc, "_reset"): # to prevent `stages` resetting + with self.assertRaises(StagePathAsOutputError): + self.dvc.reproduce(error_stage_path) def test_similar_paths(self): # File structure: @@ -949,7 +950,7 @@ def test(self, mock_prompt): self.assertEqual(self.dvc.status([cmd_stage.path]), {}) self.assertEqual(self.dvc.status(), {}) - self.dvc.gc() + self.dvc.gc(workspace=True) self.assertEqual(self.dvc.status(), {}) self.dvc.remove(cmd_stage.path, outs_only=True) diff --git a/tests/func/test_stage.py b/tests/func/test_stage.py index ce4991fd80..949e8b5671 100644 --- a/tests/func/test_stage.py +++ b/tests/func/test_stage.py @@ -1,6 +1,7 @@ import os import tempfile + from dvc.main import main from dvc.output.local import OutputLOCAL from dvc.remote.local import RemoteLOCAL diff --git a/tests/remotes.py b/tests/remotes.py index 2102ad9632..6964fc8a77 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -275,3 +275,11 @@ def get_url(): return "hdfs://{}@127.0.0.1{}".format( getpass.getuser(), Local.get_storagepath() ) + + +class HTTP: + should_test = always_test + + @staticmethod + def get_url(port): + return "http://127.0.0.1:{}".format(port) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 6ce2be58da..c70feac7e9 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -8,7 +8,7 @@ def test_default(mocker, 
caplog): args = parse_args(["diff"]) cmd = args.func(args) diff = { - "added": [{"path": "file", "checksum": "00000000"}], + "added": [{"path": "file", "hash": "00000000"}], "deleted": [], "modified": [], } @@ -23,21 +23,18 @@ def test_default(mocker, caplog): ) in caplog.text -def test_checksums(mocker, caplog): - args = parse_args(["diff", "--checksums"]) +def test_show_hash(mocker, caplog): + args = parse_args(["diff", "--show-hash"]) cmd = args.func(args) diff = { "added": [], "deleted": [ - {"path": os.path.join("data", ""), "checksum": "XXXXXXXX.dir"}, - {"path": os.path.join("data", "bar"), "checksum": "00000000"}, - {"path": os.path.join("data", "foo"), "checksum": "11111111"}, + {"path": os.path.join("data", ""), "hash": "XXXXXXXX.dir"}, + {"path": os.path.join("data", "bar"), "hash": "00000000"}, + {"path": os.path.join("data", "foo"), "hash": "11111111"}, ], "modified": [ - { - "path": "file", - "checksum": {"old": "AAAAAAAA", "new": "BBBBBBBB"}, - } + {"path": "file", "hash": {"old": "AAAAAAAA", "new": "BBBBBBBB"}} ], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) @@ -55,11 +52,11 @@ def test_checksums(mocker, caplog): ) in caplog.text -def test_json(mocker, caplog): +def test_show_json(mocker, caplog): args = parse_args(["diff", "--show-json"]) cmd = args.func(args) diff = { - "added": [{"path": "file", "checksum": "00000000"}], + "added": [{"path": "file", "hash": "00000000"}], "deleted": [], "modified": [], } @@ -71,16 +68,14 @@ def test_json(mocker, caplog): assert '"modified": []' in caplog.text -def test_json_checksums(mocker, caplog): - args = parse_args(["diff", "--show-json", "--checksums"]) +def test_show_json_and_hash(mocker, caplog): + args = parse_args(["diff", "--show-json", "--show-hash"]) cmd = args.func(args) diff = { "added": [ # py35: maintain a consistent key order for tests purposes - collections.OrderedDict( - [("path", "file"), ("checksum", "00000000")] - ) + collections.OrderedDict([("path", "file"), ("hash", "00000000")]) ], "deleted": [], "modified": [], @@ -88,6 +83,6 @@ def test_json_checksums(mocker, caplog): mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() - assert '"added": [{"path": "file", "checksum": "00000000"}]' in caplog.text + assert '"added": [{"path": "file", "hash": "00000000"}]' in caplog.text assert '"deleted": []' in caplog.text assert '"modified": []' in caplog.text diff --git a/tests/unit/output/test_output.py b/tests/unit/output/test_output.py index 2cf30e9680..fb9325471b 100644 --- a/tests/unit/output/test_output.py +++ b/tests/unit/output/test_output.py @@ -1,8 +1,12 @@ +import logging + import pytest +from funcy import first from voluptuous import Schema, MultipleInvalid -from dvc.output import CHECKSUM_SCHEMA +from dvc.cache import NamedCache +from dvc.output import CHECKSUM_SCHEMA, OutputBase @pytest.mark.parametrize( @@ -40,3 +44,47 @@ def test_checksum_schema(value, expected): def test_checksum_schema_fail(value): with pytest.raises(MultipleInvalid): Schema(CHECKSUM_SCHEMA)(value)["md5"] + + +@pytest.mark.parametrize( + "exists, expected_message", + [ + ( + False, + ( + "Output 'path'(Stage stage.dvc) is missing version info. " + "Cache for it will not be collected. " + "Use `dvc repro` to get your pipeline up to date." + ), + ), + ( + True, + ( + "Output 'path'(Stage stage.dvc) is missing version info. " + "Cache for it will not be collected. 
" + "Use `dvc repro` to get your pipeline up to date.\n" + "You can also use `dvc commit stage.dvc` to associate " + "existing 'path' with 'stage.dvc'." + ), + ), + ], +) +def test_get_used_cache(exists, expected_message, mocker, caplog): + stage = mocker.MagicMock() + mocker.patch.object(stage, "__str__", return_value="Stage stage.dvc") + mocker.patch.object(stage, "relpath", "stage.dvc") + + output = OutputBase(stage, "path") + + mocker.patch.object(output, "use_cache", True) + mocker.patch.object(stage, "is_repo_import", False) + mocker.patch.object( + OutputBase, "checksum", new_callable=mocker.PropertyMock + ).return_value = None + mocker.patch.object( + OutputBase, "exists", new_callable=mocker.PropertyMock + ).return_value = exists + + with caplog.at_level(logging.WARNING, logger="dvc"): + assert isinstance(output.get_used_cache(), NamedCache) + assert first(caplog.messages) == expected_message diff --git a/tests/unit/remote/test_http.py b/tests/unit/remote/test_http.py index 20c8854fce..65ad2d2e85 100644 --- a/tests/unit/remote/test_http.py +++ b/tests/unit/remote/test_http.py @@ -27,3 +27,74 @@ def test_download_fails_on_error_code(dvc): with pytest.raises(HTTPError): remote._download(URLInfo(url) / "missing.txt", "missing.txt") + + +def test_public_auth_method(dvc): + config = { + "url": "http://example.com/", + "path_info": "file.html", + "user": "", + "password": "", + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() is None + + +def test_basic_auth_method(dvc): + from requests.auth import HTTPBasicAuth + + user = "username" + password = "password" + auth = HTTPBasicAuth(user, password) + config = { + "url": "http://example.com/", + "path_info": "file.html", + "auth": "basic", + "user": user, + "password": password, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() == auth + assert isinstance(remote.auth_method(), HTTPBasicAuth) + + +def test_digest_auth_method(dvc): + from requests.auth import HTTPDigestAuth + + user = "username" + password = "password" + auth = HTTPDigestAuth(user, password) + config = { + "url": "http://example.com/", + "path_info": "file.html", + "auth": "digest", + "user": user, + "password": password, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() == auth + assert isinstance(remote.auth_method(), HTTPDigestAuth) + + +def test_custom_auth_method(dvc): + header = "Custom-Header" + password = "password" + config = { + "url": "http://example.com/", + "path_info": "file.html", + "auth": "custom", + "custom_auth_header": header, + "password": password, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() is None + assert header in remote.headers + assert remote.headers[header] == password diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index d544da7930..b6ee260de9 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -2,6 +2,8 @@ import pytest +from dvc.repo import locked + def test_is_dvc_internal(dvc): assert dvc.is_dvc_internal(os.path.join("path", "to", ".dvc", "file")) @@ -49,3 +51,18 @@ def test_used_cache(tmp_dir, dvc, path): used_cache._items == expected._items and used_cache.external == expected.external ) + + +def test_locked(mocker): + repo = mocker.MagicMock() + repo.method = locked(repo.method) + + args = {} + kwargs = {} + repo.method(repo, args, kwargs) + + assert repo.method_calls == [ + mocker.call._reset(), + mocker.call.method(repo, args, kwargs), + mocker.call._reset(), + ] diff --git 
a/tests/utils/httpd.py b/tests/utils/httpd.py index 2a3091eb37..378bb75b3f 100644 --- a/tests/utils/httpd.py +++ b/tests/utils/httpd.py @@ -1,7 +1,8 @@ import hashlib import os import threading -from http.server import HTTPServer +from http import HTTPStatus +from http.server import HTTPServer, SimpleHTTPRequestHandler from RangeHTTPServer import RangeRequestHandler @@ -35,6 +36,35 @@ class ContentMD5Handler(TestRequestHandler): checksum_header = "Content-MD5" +class PushRequestHandler(SimpleHTTPRequestHandler): + def _chunks(self): + while True: + data = self.rfile.readline(65537) + chunk_size = int(data[:-2], 16) + if chunk_size == 0: + return + data = self.rfile.read(chunk_size) + yield data + self.rfile.read(2) + + def do_POST(self): + chunked = self.headers.get("Transfer-Encoding", "") == "chunked" + path = self.translate_path(self.path) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as fd: + if chunked: + for chunk in self._chunks(): + fd.write(chunk) + else: + size = int(self.headers.get("Content-Length", 0)) + fd.write(self.rfile.read(size)) + except OSError as e: + self.send_error(HTTPStatus.INTERNAL_SERVER_ERROR, str(e)) + self.send_response(HTTPStatus.OK) + self.end_headers() + + class StaticFileServer: _lock = threading.Lock()
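
For reference, the golden-ratio backoff wired into `gdrive_retry` above can be previewed in isolation. A minimal standalone sketch (plain Python, no DVC imports; the constants come straight from the `retry(...)` call in dvc/remote/gdrive.py):

    # Timeout schedule used by gdrive_retry: 15 tries, first wait 0.5s,
    # each subsequent wait multiplied by the golden ratio, capped at 20s.
    waits = [min(0.5 * 1.618 ** attempt, 20) for attempt in range(15)]

    print(" ".join("{:.2f}".format(w) for w in waits))
    # 0.50 0.81 1.31 2.12 3.43 5.54 8.97 14.52 20.00 ... 20.00
    print("worst-case total wait: {:.0f}s".format(sum(waits)))  # ~177s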
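
Similarly, the new HTTP auth options can be exercised the same way the unit tests above do. A minimal sketch, assuming `dvc` is a `Repo` instance (as provided by the `dvc` test fixture) and with example.com plus the credentials standing in as placeholders:

    from requests.auth import HTTPBasicAuth

    from dvc.remote.http import RemoteHTTP

    config = {
        "url": "http://example.com/",
        "auth": "basic",
        "user": "username",
        "password": "password",
    }
    remote = RemoteHTTP(dvc, config)

    # auth_method() resolves the configured mode to a requests auth object,
    # which _request() then passes along on every call through the session.
    assert remote.auth_method() == HTTPBasicAuth("username", "password")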