diff --git a/package.json b/package.json index e4781f8558..4d8fe4ebd8 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "debug": "node --inspect-brk server.js", "build": "next build", "test": "jest", - "start": "NODE_ENV=production node server.js", + "start": "./scripts/clear-cloudflare-cache.js; NODE_ENV=production node server.js", "format-staged": "pretty-quick --staged --no-restage --bail", "format-check": "prettier --check '{.,pages/**,public/static/docs/**,src/**}/*.{js,md,json}'", "lint-check": "eslint --ext .json,.js src pages", diff --git a/public/static/docs/api-reference/get_url.md b/public/static/docs/api-reference/get_url.md new file mode 100644 index 0000000000..9a83e33a09 --- /dev/null +++ b/public/static/docs/api-reference/get_url.md @@ -0,0 +1,110 @@ +# dvc.api.get_url() + +Returns the URL to the storage location of a data file or directory tracked in a +DVC project. + +```py +def get_url(path: str, + repo: str = None, + rev: str = None, + remote: str = None) -> str +``` + +#### Usage: + +```py +import dvc.api + +resource_url = dvc.api.get_url( + 'get-started/data.xml', + repo='https://github.com/iterative/dataset-registry') + +# resource_url is now "https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355" +``` + +## Description + +Returns the URL string of the storage location (in a +[DVC remote](/doc/command-reference/remote)) where a target file or directory, +specified by its `path` in a `repo` (DVC project), is stored. + +The URL is formed by reading the project's +[remote configuration](/doc/command-reference/config#remote) and the +[DVC-file](/doc/user-guide/dvc-file-format) where the given `path` is an +output. The URL schema returned depends on the +[type](/doc/command-reference/remote/add#supported-storage-types) of the +`remote` used (see the [Parameters](#parameters) section). + +If the target is a directory, the returned URL will end in `.dir`. 
Refer to +[Structure of cache directory](/doc/user-guide/dvc-files-and-directories#structure-of-cache-directory) +and `dvc add` to learn more about how DVC handles data directories. + +⚠️ This function does not check for the actual existence of the file or +directory in the remote storage. + +💡 Having the resource's URL, it should be possible to download it directly with +an appropriate library, such as +[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.download_fileobj) +or +[`paramiko`](https://docs.paramiko.org/en/stable/api/sftp.html#paramiko.sftp_client.SFTPClient.get). + +## Parameters + +- **`path`** - location and file name of the file or directory in `repo`, + relative to the project's root. + +- `repo` - specifies the location of the DVC project. It can be a URL or a file + system path. Both HTTP and SSH protocols are supported for online Git repos + (e.g. `[user@]server:project.git`). _Default_: The current project is used + (the current working directory tree is walked up to find it). + +- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as + a branch or tag name, or a commit hash). If `repo` is not a Git repo, this + option is ignored. _Default_: `HEAD`. + +- `remote` - name of the [DVC remote](/doc/command-reference/remote) to use to + form the returned URL string. _Default_: The + [default remote](/doc/command-reference/remote/default) of `repo` is used. + +## Exceptions + +- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project. + +- `dvc.exceptions.NoRemoteError` - no `remote` is found. 
+ +## Example: Getting the URL to a DVC-tracked file + +```py +import dvc.api + +resource_url = dvc.api.get_url( + 'get-started/data.xml', + repo='https://github.com/iterative/dataset-registry' + ) + +print(resource_url) +``` + +The script above prints + +`https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355` + +This URL represents the location where the data is stored, and is built by +reading the corresponding DVC-file +([`get-started/data.xml.dvc`](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc)) +where the `md5` file hash is stored, + +```yaml +outs: + - md5: a304afb96060aad90176268345e10355 + path: get-started/data.xml +``` + +and the project configuration +([`.dvc/config`](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)) +where the remote URL is saved: + +```ini +['remote "storage"'] +url = https://remote.dvc.org/dataset-registry +``` diff --git a/public/static/docs/api-reference/index.md b/public/static/docs/api-reference/index.md new file mode 100644 index 0000000000..0298b185ab --- /dev/null +++ b/public/static/docs/api-reference/index.md @@ -0,0 +1,16 @@ +# Python API + +DVC can be used as a Python library, simply [install](/doc/install) with `pip` +or `conda`. This reference provides the details about the functions in the API +module `dvc.api`, which can be imported any regular way, for example: + +```py +import dvc.api +``` + +The purpose of this API is to provide programmatic access to the data or models +[stored and versioned](/doc/use-cases/versioning-data-and-model-files) in +DVC repositories from Python code. 
+ +Please choose a function from the navigation sidebar to the left, or click the +`Next` button below to jump into the first one ↘ diff --git a/public/static/docs/api-reference/open.md b/public/static/docs/api-reference/open.md new file mode 100644 index 0000000000..858090e0fb --- /dev/null +++ b/public/static/docs/api-reference/open.md @@ -0,0 +1,200 @@ +# dvc.api.open() + +Opens a tracked file. + +```py +def open(path: str, + repo: str = None, + rev: str = None, + remote: str = None, + mode: str = "r", + encoding: str = None) +``` + +#### Usage: + +```py +import dvc.api + +with dvc.api.open( + 'get-started/data.xml', + repo='https://github.com/iterative/dataset-registry' + ) as fd: + # ... fd is a file descriptor that can be processed normally. +``` + +## Description + +Open a data or model file tracked in a DVC project and generate a +corresponding +[file object](https://docs.python.org/3/glossary.html#term-file-object). The +file can be tracked by DVC or by Git. + +> The exact type of file object depends on the `mode` used. For more details, +> please refer to Python's +> [`open()`](https://docs.python.org/3/library/functions.html#open) built-in, +> which is used under the hood. + +`dvc.api.open()` may only be used as a +[context manager](https://www.python.org/dev/peps/pep-0343/#context-managers-in-the-standard-library) +(using the `with` keyword, as shown in the examples). + +This function makes a direct connection to the +[remote storage](/doc/command-reference/remote/add#supported-storage-types) +(except for Google Drive), so the file contents can be streamed. Your code can +process the data [buffer](https://docs.python.org/3/c-api/buffer.html) as it's +streamed, which optimizes memory usage. + +> Use `dvc.api.read()` to load the complete file contents in a single function +> call – no _context manager_ involved. Neither function utilizes disk space. 
+ +## Parameters + +- **`path`** - location and file name of the file in `repo`, relative to the + project's root. + +- `repo` - specifies the location of the DVC project. It can be a URL or a file + system path. Both HTTP and SSH protocols are supported for online Git repos + (e.g. `[user@]server:project.git`). _Default_: The current project is used + (the current working directory tree is walked up to find it). + +- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as + a branch or tag name, or a commit hash). If `repo` is not a Git repo, this + option is ignored. _Default_: `HEAD`. + +- `remote` - name of the [DVC remote](/doc/command-reference/remote) to look for + the target data. _Default_: The + [default remote](/doc/command-reference/remote/default) of `repo` is used if a + `remote` argument is not given. For local projects, the cache is + tried before the default remote. + +- `mode` - specifies the mode in which the file is opened. Defaults to `"r"` + (read). Mirrors the namesake parameter in builtin + [`open()`](https://docs.python.org/3/library/functions.html#open). + +- `encoding` - + [codec](https://docs.python.org/3/library/codecs.html#standard-encodings) used + to decode the file contents to a string. This should only be used in text + mode. Defaults to `"utf-8"`. Mirrors the namesake parameter in builtin + `open()`. + +## Exceptions + +- `dvc.exceptions.FileMissingError` - file in `path` is missing from `repo`. + +- `dvc.exceptions.PathMissingError` - `path` cannot be found in `repo`. + +- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project. + +- `dvc.exceptions.NoRemoteError` - no `remote` is found. + +## Example: Use data or models from DVC repositories + +Any data artifact hosted online can be processed directly in your +Python code with this API. 
For example, an XML file tracked in a public DVC repo +on GitHub can be processed like this: + +```py +from xml.sax import parse +import dvc.api +from mymodule import mySAXHandler + +with dvc.api.open( + 'get-started/data.xml', + repo='https://github.com/iterative/dataset-registry' + ) as fd: + parse(fd, mySAXHandler) +``` + +Notice that we use a [SAX](http://www.saxproject.org/) XML parser here because +`dvc.api.open()` is able to stream the data from +[remote storage](/doc/command-reference/remote/add#supported-storage-types). +(The `mySAXHandler` object should handle the event-driven parsing of the +document in this case.) This increases the performance of the code (minimizing +memory usage), and is typically faster than loading the whole data into memory. + +> If you just need to load the complete file contents into memory, you can use +> `dvc.api.read()` instead: +> +> ```py +> from xml.dom.minidom import parseString +> import dvc.api +> +> xmldata = dvc.api.read('get-started/data.xml', +> repo='https://github.com/iterative/dataset-registry') +> xmldom = parseString(xmldata) +> ``` + +## Example: Accessing private repos + +This is just a matter of using the right `repo` argument, for example an SSH URL +(requires that the +[credentials are configured](https://help.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh) +locally): + +```py +import dvc.api + +with dvc.api.open( + 'features.dat', + repo='git@server.com:path/to/repo.git' + ) as fd: + # ... Process 'features' +``` + +## Example: Use different versions of data + +The `rev` argument lets you specify any Git commit to look for an artifact. This +way any previous version, or alternative experiment can be accessed +programmatically. For example, let's say your DVC repo has tagged releases of a +CSV dataset: + +```py +import csv +import dvc.api + +with dvc.api.open( + 'clean.csv', + rev='v1.1.0' + ) as fd: + reader = csv.reader(fd) + # ... 
Process 'clean' data from version 1.1.0 +``` + +Also, notice that we didn't supply a `repo` argument in this example. DVC will +attempt to find a DVC project to use in the current working +directory tree, and look for the file contents of `clean.csv` in its local +cache; no download will happen if found. See the +[Parameters](#parameters) section for more info. + +## Example: Choose a specific remote as the data source + +Sometimes we may want to choose the [remote](/doc/command-reference/remote) data +source, for example if the `repo` has no default remote set. This can be done by +providing a `remote` argument: + +```py +import dvc.api + +with dvc.api.open( + 'activity.log', + repo='location/of/dvc/project', + remote='my-s3-bucket' + ) as fd: + for line in fd: + match = re.search(r'user=(\w+)', line) + # ... Process users activity log +``` + +## Example: Specify the text encoding + +To choose which codec to open a text file with, send an `encoding` argument: + +```py +import dvc.api + +with dvc.api.open( + 'data/nlp/words_ru.txt', + encoding='koi8_r') as fd: + # ... Process Russian words +``` diff --git a/public/static/docs/api-reference/read.md b/public/static/docs/api-reference/read.md new file mode 100644 index 0000000000..e83ae063b0 --- /dev/null +++ b/public/static/docs/api-reference/read.md @@ -0,0 +1,101 @@ +# dvc.api.read() + +Returns the contents of a tracked file. + +```py +def read(path: str, + repo: str = None, + rev: str = None, + remote: str = None, + mode: str = "r", + encoding: str = None) +``` + +#### Usage: + +```py +import dvc.api + +modelpkl = dvc.api.read( + 'model.pkl', + repo='https://github.com/example/project.git', + mode='rb') +``` + +## Description + +This function wraps [`dvc.api.open()`](/doc/api-reference/open), for a simple +way to return the complete contents of a file tracked in a DVC +project. The file can be tracked by DVC or by Git. + +> This is similar to the `dvc get` command in our CLI. 
+ +The returned contents can be a +[string](https://docs.python.org/3/library/stdtypes.html#text-sequence-type-str) +or a [bytearray](https://docs.python.org/3/library/stdtypes.html#bytearray). +These are loaded to memory directly (without using any disk space). + +> The type returned depends on the `mode` used. For more details, please refer +> to Python's [`open()`](https://docs.python.org/3/library/functions.html#open) +> built-in, which is used under the hood. + +## Parameters + +- **`path`** - location and file name of the file in `repo`, relative to the + project's root. + +- `repo` - specifies the location of the DVC project. It can be a URL or a file + system path. Both HTTP and SSH protocols are supported for online Git repos + (e.g. `[user@]server:project.git`). _Default_: The current project is used + (the current working directory tree is walked up to find it). + +- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as + a branch or tag name, or a commit hash). If `repo` is not a Git repo, this + option is ignored. _Default_: `HEAD`. + +- `remote` - name of the [DVC remote](/doc/command-reference/remote) to look for + the target data. _Default_: The + [default remote](/doc/command-reference/remote/default) of `repo` is used if a + `remote` argument is not given. For local projects, the cache is + tried before the default remote. + +- `mode` - specifies the mode in which the file is opened. Defaults to `"r"` + (read). Mirrors the namesake parameter in builtin + [`open()`](https://docs.python.org/3/library/functions.html#open). + +- `encoding` - + [codec](https://docs.python.org/3/library/codecs.html#standard-encodings) used + to decode the file contents to a string. This should only be used in text + mode. Defaults to `"utf-8"`. Mirrors the namesake parameter in builtin + `open()`. + +## Exceptions + +- `dvc.exceptions.FileMissingError` - file in `path` is missing from `repo`. 
+ +- `dvc.exceptions.PathMissingError` - `path` cannot be found in `repo`. + +- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project. + +- `dvc.exceptions.NoRemoteError` - no `remote` is found. + +## Example: Load data from a DVC repository + +Any data artifact hosted online can be loaded directly in your +Python code with this API. For example, let's say that you want to load and +unserialize a binary model from a repo on GitHub: + +```py +import pickle +import dvc.api + +model = pickle.loads( + dvc.api.read( + 'model.pkl', + repo='https://github.com/example/project.git', + mode='rb' + ) + ) +``` + +> We're using `'rb'` mode here for compatibility with `pickle.loads()`. diff --git a/public/static/docs/command-reference/commit.md b/public/static/docs/command-reference/commit.md index bb5399cdef..4ab9eeb2d4 100644 --- a/public/static/docs/command-reference/commit.md +++ b/public/static/docs/command-reference/commit.md @@ -45,7 +45,7 @@ needed after a `git commit`. See `dvc install` for more details. stages. `dvc commit` can help avoid having to reproduce a pipeline in these cases by forcing the update of the DVC-files. -Let's take a look at what is happening in the fist scenario closely. Normally +Let's take a look at what is happening in the first scenario closely. Normally DVC commands like `dvc add`, `dvc repro` or `dvc run` commit the data to the cache after creating a DVC-file. What _commit_ means is that DVC: @@ -54,7 +54,7 @@ DVC commands like `dvc add`, `dvc repro` or `dvc run` commit the data to the - Tells Git to ignore the file/directory (adding an entry to `.gitignore`). (Note that if the project was initialized with no SCM support (`dvc init --no-scm`), this does not happen.) -- Adds the file/directory or to the cache. +- Adds the file/directory to the cache. There are many cases where the last step is not desirable (for example rapid iterations on an experiment). 
The `--no-commit` option prevents the last step @@ -258,7 +258,7 @@ that both Git and DVC recognize a change was made. If we ran `dvc repro` at this point, this pipeline would be reproduced. But since the change was inconsequential, that would be a waste of time and CPU. -That's especially critical if the corresponding stages lots of resources to +That's especially critical if the corresponding stages take lots of resources to execute. ```dvc diff --git a/public/static/docs/command-reference/config.md b/public/static/docs/command-reference/config.md index 97b5f7f46f..27b3284a61 100644 --- a/public/static/docs/command-reference/config.md +++ b/public/static/docs/command-reference/config.md @@ -62,14 +62,14 @@ file (in `.dvc/config` by default), and they support the options below: This is the main section with the general config options: -- `core.loglevel` - log level that the `dvc` command should use. Possible values - are: `info`, `debug`, `warning`, `error`. +- `core.loglevel` - log level that the `dvc` command should use. Accepts values + `info`, `debug`, `warning`, or `error`. - `core.remote` - name of the remote storage that should be used by default. - `core.interactive` - whether to always ask for confirmation before reproducing each [stage](/doc/command-reference/run) in `dvc repro`. (Normally, this - behavior requires the use of option `-i` in that command.) Accepts values + behavior requires the use of option `-i` in that command.) Accepts values: `true` and `false`. - `core.analytics` - used to turn off @@ -85,6 +85,11 @@ This is the main section with the general config options: project is on a file system that doesn't properly support file locking (e.g. [NFS v3 and older](http://nfs.sourceforge.net/)). +- `core.no_scm` - tells DVC to not expect or integrate with Git (even if the + project is initialized inside a Git repo). Accepts values `true` + and `false` (default). 
Set with the `--no-scm` option of `dvc init` + ([more details](/doc/command-reference/init#initializing-dvc-without-git)). + ### remote These are sections in the config file that describe particular remotes. These diff --git a/public/static/docs/command-reference/get-url.md b/public/static/docs/command-reference/get-url.md index 1a45e8c992..4d7fccf6a6 100644 --- a/public/static/docs/command-reference/get-url.md +++ b/public/static/docs/command-reference/get-url.md @@ -3,8 +3,8 @@ Download a file or directory from a supported URL (for example `s3://`, `ssh://`, and other protocols) into the local file system. -> Unlike `dvc import-url`, this command does not track the downloaded data files -> (does not create a DVC-file). +> See `dvc get` to download data/model files or directories from other DVC +> repositories (e.g. hosted on GitHub). ## Synopsis @@ -22,15 +22,15 @@ In some cases it's convenient to get a data artifact from a remote location into the local file system. The `dvc get-url` command helps the user do just that. +> Note that unlike `dvc import-url`, this command does not track the downloaded +> data files (does not create a DVC-file). For that reason, this command doesn't +> require an existing DVC project to run in. + The `url` argument should provide the location of the data to be downloaded, while `out` can be used to specify the directory and/or file name desired for the downloaded data. If an existing directory is specified, then the output will be placed inside of it. -Note that this command doesn't require an existing DVC project to -run in. It's a single-purpose command that can be used out of the box after -installing DVC. - DVC supports several types of (local or) remote locations (protocols): | Type | Description | `url` format | @@ -61,9 +61,6 @@ HTTP(S) it's possible to instead use: $ wget https://example.com/path/to/data.csv ``` -> See `dvc get` to download data/model files or directories from other DVC -> repositories (e.g. GitHub URLs). 
- ## Options - `-h`, `--help` - prints the usage/help message, and exit. diff --git a/public/static/docs/command-reference/get.md b/public/static/docs/command-reference/get.md index 84e26ae834..af0476dfdd 100644 --- a/public/static/docs/command-reference/get.md +++ b/public/static/docs/command-reference/get.md @@ -3,8 +3,7 @@ Download a file or directory tracked by DVC or by Git into the current working directory. -> Unlike `dvc import`, this command does not track the downloaded files (does -> not create a DVC-file). +> See also our `dvc.api.open()` Python API function. ## Synopsis @@ -21,11 +20,12 @@ positional arguments: Provides an easy way to download files or directories tracked in any DVC repository (e.g. datasets, intermediate results, ML models), or Git repository (e.g. source code, small image/other files). `dvc get` copies the -target file or directory (`url`/`path`) to the current working directory. -(Analogous to `wget`, but for repos.) +target file or directory (found at `path` in `url`) to the current working +directory. (Analogous to `wget`, but for repos.) -Note that this command doesn't require an existing DVC project to run in. It's a -single-purpose command that can be used out of the box after installing DVC. +> Note that unlike `dvc import`, this command does not track the downloaded +> files (does not create a DVC-file). For that reason, this command doesn't +> require an existing DVC project to run in. The `url` argument specifies the address of the DVC or Git repository containing the data source. 
Both HTTP and SSH protocols are supported for online repos diff --git a/public/static/docs/command-reference/import-url.md b/public/static/docs/command-reference/import-url.md index 797971aecd..aa625396d7 100644 --- a/public/static/docs/command-reference/import-url.md +++ b/public/static/docs/command-reference/import-url.md @@ -4,8 +4,8 @@ Download a file or directory from a supported URL (for example `s3://`, `ssh://`, and other protocols) into the workspace, and track changes in the remote data source. Creates a DVC-file. -> See also `dvc get-url`, that corresponds to the first half of what this -> command does (downloading the data artifact). +> See `dvc import` to download and track data/model files or directories from +> other DVC repositories (e.g. hosted on GitHub). ## Synopsis @@ -28,6 +28,9 @@ external data source changes. Example scenarios: - A batch process running regularly updates a data file to import. - A shared dataset on a remote storage that is managed and updated outside DVC. +> Note that `dvc get-url` corresponds to the first step this command performs +> (just download the file or directory). + The `dvc import-url` command helps the user create such an external data dependency. The `url` argument specifies the external location of the data to be imported, while `out` can be used to specify the directory and/or file name @@ -103,9 +106,6 @@ Note that import stages are considered always locked, meaning that if you run `dvc repro`, they won't be updated. Use `dvc update` on them to bring the import up to date from the external data source. -> See `dvc import` to download and tack data/model files or directories from -> other DVC repositories (e.g. GitHub URLs). 
- ## Options - `-f FILE`, `--file FILE` - specify a path and/or file name for the DVC-file diff --git a/public/static/docs/command-reference/import.md b/public/static/docs/command-reference/import.md index 5940b6a0fd..b8e910be70 100644 --- a/public/static/docs/command-reference/import.md +++ b/public/static/docs/command-reference/import.md @@ -6,8 +6,7 @@ Download a file or directory tracked by DVC or by Git into the source, which can later be used to [update](/doc/command-reference/update) the import. -> See also `dvc get`, that corresponds to the first step this command performs -> (just download the data). +> See also our `dvc.api.open()` Python API function. ## Synopsis @@ -24,9 +23,13 @@ positional arguments: Provides an easy way to reuse files or directories tracked in any DVC repository (e.g. datasets, intermediate results, ML models) or Git repository (e.g. source code, small image/other files). `dvc import` downloads -the target file or directory (`url`/`path`) in a way so that it's tracked with -DVC, becoming a local data artifact. This also permits updating the -import later, if it has changed in its data source. (See `dvc update`.) +the target file or directory (found at `path` in `url`) in a way so that it's +tracked with DVC, becoming a local data artifact. This also permits +updating the import later, if it has changed in its data source. (See +`dvc update`.) + +> Note that `dvc get` corresponds to the first step this command performs (just +> download the data). The `url` argument specifies the address of the DVC or Git repository containing the data source. Both HTTP and SSH protocols are supported for online repos @@ -62,8 +65,7 @@ To actually [track the data](https://dvc.org/doc/get-started/add-files), `git add` (and `git commit`) the import stage. Note that import stages are considered always locked, meaning that if you run -`dvc repro`, they won't be updated. 
Use `dvc update` or -[re-import](#example-fixed-revisions-re-importing) them to update the downloaded +`dvc repro`, they won't be updated. Use `dvc update` to update the downloaded data artifact from the source repo. ## Options @@ -129,7 +131,7 @@ Several of the values above are pulled from the original stage file subfields under `repo` are used to save the origin and version of the dependency, respectively. -## Example: Fixed revisions & re-importing +## Example: Fixed revisions and updating to different revision To import a specific version of a data artifact, we may use the `--rev` option: @@ -159,23 +161,14 @@ deps: If `rev` is a Git branch or tag (where the underlying commit changes), the data source may have updates at a later time. To bring it up to date if so (and update `rev_lock` in the DVC-file), simply use `dvc update .dvc`. If -`rev` is a specific commit hash (does not change), `dvc update` will never have -an effect on the import stage. You may **re-import** a different commit instead, -by using `dvc import` again with a different (or without) `--rev`. For example: +`rev` is a specific commit (does not change), `dvc update` will never have an +effect on the import stage. You may `dvc update` to a different commit, using +`--rev`: ```dvc -$ dvc import --rev master \ - git@github.com:iterative/dataset-registry.git \ - use-cases/cats-dogs +$ dvc update --rev cats-dogs-v2 ``` -The import stage is overwritten, and will now be able update normally with -`dvc update`. - -> In the above example, the value for `rev` in the new import stage will be -> `master` (default branch), so the command is equivalent to not using `--rev` -> at all. 
- ## Example: Data registry If you take a look at our diff --git a/public/static/docs/command-reference/init.md b/public/static/docs/command-reference/init.md index a00f415284..13dd336cf4 100644 --- a/public/static/docs/command-reference/init.md +++ b/public/static/docs/command-reference/init.md @@ -1,39 +1,167 @@ # init -This command initializes a DVC project on a directory. - -Note that by default the current working directory is expected to contain a Git -repository, unless the `--no-scm` option is used. +Initialize a DVC project in the current working directory. ## Synopsis ```usage -usage: dvc init [-h] [-q | -v] [--no-scm] [-f] +usage: dvc init [-h] [-q | -v] [--no-scm] [-f] [--subdir] ``` ## Description +DVC works on top of a Git repository by default. This enables all features, +providing the most value. It means that `dvc init` (without flags) expects to +run in a Git repository root (a `.git/` directory should be present). + +The command options can be used to start an alternative workflow for advanced +scenarios like monorepos, automation, etc: + +- [Initializing DVC in subdirectories](#initializing-dvc-in-subdirectories) - + support for monorepos, nested DVC projects, etc. +- [Initializing DVC without Git](#initializing-dvc-without-git) - support for + SCM other than Git, deployment automation cases, etc. + After DVC initialization, a new directory `.dvc/` will be created with the `config` and `.gitignore` files. These and other files and directories are hidden from user, as typically there's no need to interact with them directly. See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to learn more. -`.dvc/cache` is one of the most important -[DVC directories](/doc/user-guide/dvc-files-and-directories). It will hold all -the contents of tracked data files. Note that `.dvc/.gitignore` lists this -directory, which means that the cache directory is not tracked by Git. This is a -local cache and you cannot `git push` it. 
+### Initializing DVC in subdirectories + +`--subdir` must be provided to initialize DVC in a subdirectory of a Git +repository. DVC still expects to find the Git repository (will check all +directories up to the root to find `.git`). This option does not affect any +config files, `.dvc` directory is created the same way as in the default mode. +This way multiple DVC projects (including nested ones) could be initialized in a +single Git repository providing isolation and granular project management. + +#### When is this useful? + +This option is mostly used in the scenario of a +[monorepo](https://en.wikipedia.org/wiki/Monorepo), but also can be used in +other workflows when such isolation and/or advanced granularity is needed. + +Let's imagine we have an existing Git repository that is split into sub-projects +(monorepo). In this case `dvc init --subdir` can be run in one or many +sub-projects to mitigate the issues of initializing in the Git repository root: + +- Repository maintainers might not allow extra `.dvc` top level directory, + especially if DVC is being used by a small number of sub-projects. + +- Not enough isolation/granularity - DVC config, cache, and other files are + shared across different sub-projects. This means that it's not easy to use + different remote storages, for example, for different sub-projects, etc. + +- Not enough isolation/granularity - commands like `dvc pull`, `dvc checkout`, + and others analyze the whole repository to look for + [DVC-files](/doc/user-guide/dvc-file-format) to download files and + directories, to reproduce pipelines, etc. It can be expensive in + large repositories with a lot of projects. + +- Not enough isolation/granularity - commands like `dvc metrics diff`, + `dvc pipeline show` and others by default dump all the metrics, all the + pipelines, etc. + +#### How does it affect DVC commands? 
+ +No matter what mode is used, DVC looks for the `.dvc` directory when it starts +(from the current working directory and up). Location of the found `.dvc` +directory determines the root of the DVC project. (In case of `--subdir` it +might happen that Git repository root is located at different path than the DVC +project root.) + +DVC project root defines the scope for most DVC commands. Mostly meaning +that all DVC-files under the root path are being analyzed. + +If there are multiple DVC sub-projects but they _are not_ nested, e.g.: + +```sh +. +├── .git +| +├── project-A +│   └── .dvc +│ ... +├── project-B +│ └── .dvc +│ ... +``` -## Options +DVC considers them two separate DVC projects. Any DVC command that is being +run in the `project-A` is not aware of DVC `project-B`. DVC does not consider +Git repository root an initialized DVC project in this case and commands that +require a DVC project will raise an error. + +On the other hand, if there _are_ nested DVC projects, e.g.: + +```sh +project-A +├── .dvc +├── data-A.dvc +│ ... +└── project-B + ├── .dvc + ├── data-B.dvc + │ ... +``` + +Nothing changes for the `project-B`. But any DVC command being run in +`project-A` ignores the whole directory `project-B/`, meaning for example: + +```dvc +$ cd project-A +$ dvc pull +``` + +won't download or checkout data for the `data-B.dvc` file. + +### Initializing DVC without Git + +In rare cases, `--no-scm` option might be used to initialize DVC in a directory +that is not part of a Git repository, or to make DVC ignore Git. Examples +include: + +- SCM other than Git is being used. Even though there are DVC features that + require DVC to be run in the Git repo, DVC can work well with other version + control systems. Since DVC relies on simple text + [DVC-files](/doc/user-guide/dvc-file-format) to manage pipelines, + data, etc, they can be added into any SCM thus providing large data files and + directories versioning. 
+ +- There is no need to keep the history at all, e.g. having a deployment + automation like running a data pipeline using `cron`. + +In this mode DVC features that depend on Git being present are not available - +e.g. managing `.gitignore` files on `dvc add` or `dvc run` to avoid committing +DVC-tracked files into Git, or `dvc diff` and `dvc metrics diff` that accept +Git-revisions to compare, etc. -- `--no-scm` - skip Git specific initialization, `.dvc/.gitignore` will not be - written. +DVC sets the `core.no_scm` option value to `true` in the DVC +[config](/doc/command-reference/config) when it is initialized this way. It +means that even if the project was Git-tracked already or Git is initialized in +it later, DVC keeps operating in the detached from Git mode. + +## Options - `-f`, `--force` - remove `.dvc/` if it exists before initialization. Will remove any existing local cache. Useful when a previous `dvc init` has been corrupted. +- `--subdir` - initialize the DVC project in the current working directory, + _even if it's not the Git repository root_. (If run in a project root, this + option is ignored.) It affects how other DVC commands behave afterwards, + please see + [Initializing DVC in subdirectories](#initializing-dvc-in-subdirectories) for + more details. + +- `--no-scm` - initialize the DVC project detached from Git. It means that DVC + doesn't try to find or use Git in the directory it's initialized in. Certain + DVC features are not available in this mode, please see + [Initializing DVC without Git](#initializing-dvc-without-git) for more + details. + - `-h`, `--help` - prints the usage/help message, and exit. - `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no @@ -41,9 +169,10 @@ local cache and you cannot `git push` it. - `-v`, `--verbose` - displays detailed tracing information. 
-## Examples +## Examples: Most common initialization workflow -Create a new DVC repository (requires Git): +Create a new DVC repository (must be run in the Git +repository root): ```dvc $ mkdir example && cd example @@ -67,3 +196,30 @@ $ cat .dvc/.gitignore ... /cache ``` + +## Examples: Initializing DVC in a subdirectory + +Create a new DVC repository in a subdirectory of a Git repository: + +```dvc +$ mkdir repo && cd repo + +$ git init +$ mkdir project-a && cd project-a + +$ dvc init --subdir +``` + +In this case, the Git repository is inside the `repo` directory, while the DVC +repository is inside `repo/project-a`. + +```dvc +$ tree repo -a +repo +├── .git +. +. +. +└── project-a + └── .dvc +``` diff --git a/public/static/docs/command-reference/update.md b/public/static/docs/command-reference/update.md index 630c342867..9cba5d609a 100644 --- a/public/static/docs/command-reference/update.md +++ b/public/static/docs/command-reference/update.md @@ -6,7 +6,7 @@ projects, and corresponding [DVC-files](/doc/user-guide/dvc-file-format). ## Synopsis ```usage -usage: dvc update [-h] [-q | -v] targets [targets ...] +usage: dvc update [-h] [-q | -v] [--rev [REV]] targets [targets ...] positional arguments: targets DVC-files to update. @@ -27,12 +27,24 @@ Note that import stages are considered always locked, meaning that if you run update them. `dvc update` will not have an effect on import stages that are fixed to a commit -hash (`rev` field in the DVC-file). Please refer to -[Fixed revisions & re-importing](/doc/command-reference/import#example-fixed-revisions-re-importing) -for more details. +hash (`rev` field in the DVC-file). To update the imported artifacts to a +certain revision, `--rev` with the desired revision can be used.
+ +```dvc +dvc update --rev master +``` ## Options +- `--rev` - specific + [Git revision](https://git-scm.com/book/en/v2/Git-Internals-Git-References) + (such as a branch name, a tag, or a commit hash) of the repository to update + the file or directory from (also starts tracking the given revision). + + > Note that this adds or updates a `rev` field in the DVC-file that fixes it + > to this revision (and updates `rev_lock` in the DVC-file). This can have an + > impact on the behavior of `dvc update` later. + - `-h`, `--help` - prints the usage/help message, and exit. - `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no @@ -40,7 +52,7 @@ for more details. - `-v`, `--verbose` - displays detailed tracing information. -## Examples +## Example: Updating imported artifacts Let's first import a data artifact from our [get started example repo](https://github.com/iterative/example-get-started): @@ -69,3 +81,41 @@ stable. > Note that `dvc update` updates the `rev_lock` field of the corresponding > [DVC-file](/doc/user-guide/dvc-file-format) (when there are changes to bring > in). + +## Example: Updating imported artifacts to a specified revision + +Let's import a data artifact from an older commit of our +[get started example repo](https://github.com/iterative/example-get-started) +first: + +```dvc +$ dvc import --rev baseline-experiment git@github.com:iterative/example-get-started model.pkl +Importing 'model.pkl (git@github.com:iterative/example-get-started)' +-> 'model.pkl' +``` + +After this, the import stage (DVC-file) `model.pkl.dvc` is created. + +Let's try to run `dvc update` on the given stage file, and see what happens. + +```dvc +$ dvc update model.pkl.dvc +``` + +There was no output at all, meaning the `model.pkl` file was not updated. This +is because we tied the import stage to a `rev` that never changes (i.e. a tag +is tied to a specific commit). Therefore, it was not updated.
+ +Let's try to update the model to a different experiment `bigrams-experiment`: + +```dvc +$ dvc update --rev bigrams-experiment model.pkl.dvc +Importing 'model.pkl (git@github.com:iterative/example-get-started)' +-> 'model.pkl' +``` + +The import stage is overwritten, and will get updated from the latest changes in +the given revision (i.e. `bigrams-experiment` tag). + +> In the above example, the value for `rev` in the new import stage will be +> `bigrams-experiment`. diff --git a/public/static/docs/glossary.js b/public/static/docs/glossary.js index bb6d10bc59..fe982958ac 100644 --- a/public/static/docs/glossary.js +++ b/public/static/docs/glossary.js @@ -16,6 +16,7 @@ code, ML models, etc. It will conatain your DVC project. name: 'DVC Project', match: [ 'DVC project', + 'DVC projects', 'project', 'projects', 'DVC repository', diff --git a/public/static/docs/install/index.md b/public/static/docs/install/index.md index c3ed40abb6..c375c0ca61 100644 --- a/public/static/docs/install/index.md +++ b/public/static/docs/install/index.md @@ -7,6 +7,13 @@ Please double check that you don't already have DVC (for example running - [Install on Windows](/doc/install/windows) - [Install on Linux](/doc/install/linux) +## Install as a Python library + +DVC can be used as a Python library, simply install it with a package manager +like `pip` or `conda`, and as a Python +[project requirement](https://pip.pypa.io/en/latest/user_guide/#requirements-files) +if needed. The [Python API](/doc/api-reference) module is `dvc.api`. + ## Advanced options - Shell completion is automatically enabled by certain installation methods. 
If diff --git a/public/static/docs/install/linux.md b/public/static/docs/install/linux.md index 598f73b231..5b5ab6c22d 100644 --- a/public/static/docs/install/linux.md +++ b/public/static/docs/install/linux.md @@ -1,5 +1,8 @@ # Installation on Linux +> To use DVC [as a Python library](/doc/api-reference), please +> [install with pip](#install-with-pip) or [with conda](#install-with-conda). + ## Install with pip > We **strongly** recommend creating a diff --git a/public/static/docs/install/macos.md b/public/static/docs/install/macos.md index d7d9550c9e..3a231e4647 100644 --- a/public/static/docs/install/macos.md +++ b/public/static/docs/install/macos.md @@ -1,5 +1,8 @@ # Installation on MacOS +> To use DVC [as a Python library](/doc/api-reference), please +> [install with pip](#install-with-pip) or [with conda](#install-with-conda). + ## Install with brew Recommended. Requires [Homebrew](https://brew.sh/). diff --git a/public/static/docs/install/windows.md b/public/static/docs/install/windows.md index b1b1a1b762..cc2e78e4fe 100644 --- a/public/static/docs/install/windows.md +++ b/public/static/docs/install/windows.md @@ -4,6 +4,11 @@ > [Running DVC on Windows](/doc/user-guide/running-dvc-on-windows) for important > tips to improve your experience using DVC on Windows. + + +> To use DVC [as a Python library](/doc/api-reference), please +> [install with pip](#install-with-pip) or [with conda](#install-with-conda). 
+ ## Windows installer The easiest way is to use the self-contained, executable installer (binary), diff --git a/public/static/docs/sidebar.json b/public/static/docs/sidebar.json index a13ff3591b..6cd58aeba4 100644 --- a/public/static/docs/sidebar.json +++ b/public/static/docs/sidebar.json @@ -364,6 +364,25 @@ } ] }, + { + "slug": "api-reference", + "label": "Python API Reference", + "source": "api-reference/index.md", + "children": [ + { + "slug": "get_url", + "label": "get_url()" + }, + { + "slug": "open", + "label": "open()" + }, + { + "slug": "read", + "label": "read()" + } + ] + }, { "slug": "understanding-dvc", "label": "Understanding DVC", diff --git a/public/static/docs/use-cases/data-registries.md b/public/static/docs/use-cases/data-registries.md index 68d3a4ddaf..4cb28cde84 100644 --- a/public/static/docs/use-cases/data-registries.md +++ b/public/static/docs/use-cases/data-registries.md @@ -89,8 +89,8 @@ $ dvc push ## Using registries The main methods to consume data artifacts from a **data registry** -are the `dvc import` and `dvc get` commands, as well as the `dvc.api` Python -API. +are the `dvc import` and `dvc get` commands, as well as the +[`dvc.api`](/doc/api-reference) Python API. ### Simple download (get) @@ -141,7 +141,7 @@ $ dvc update dataset.dvc `images/faces/`, based on the latest commit in the source repo. It also updates the project dependency metadata in the import stage (DVC-file). 
-### Programatic reusability of DVC data +### Programmatic reusability of DVC data Our Python API, included with the `dvc` package installed with DVC, includes the `open` function to load/stream data directly from external DVC diff --git a/public/static/docs/user-guide/dvcignore.md b/public/static/docs/user-guide/dvcignore.md index 5afc16576a..d997ae9130 100644 --- a/public/static/docs/user-guide/dvcignore.md +++ b/public/static/docs/user-guide/dvcignore.md @@ -150,7 +150,7 @@ data.dvc: modified: data ``` -## Example: Ignore dvc controlled file +## Example: Ignore DVC tracked file Let's analyze an example workspace: diff --git a/scripts/clear-cloudflare-cache.js b/scripts/clear-cloudflare-cache.js new file mode 100755 index 0000000000..a9a2e9c68d --- /dev/null +++ b/scripts/clear-cloudflare-cache.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node +/* global process */ + +// This script runs just before the app starts. If we are running the +// production heroku app (the only one with the below env variables) +// the cache gets cleared. +// +// To clear the cache yourself, you can use the button in the +// cloudflare dashboard ("Caching tab > Purge everything"), or run +// this script with the required environment variables: +// +// - CLOUDFLARE_TOKEN: a token with the "Zone.Cache Purge" permission. +// You can generate this token in "My Profile > API Tokens" +// +// - CLOUDFLARE_ZONE_ID: The zone ID to purge. 
You can find it in the +// sidebar of the "overview" tab for dvc.org + +const fetch = require('isomorphic-fetch'); + +const { CLOUDFLARE_TOKEN, CLOUDFLARE_ZONE_ID } = process.env; + +async function main() { + const res = await fetch( + `https://api.cloudflare.com/client/v4/zones/${CLOUDFLARE_ZONE_ID}/purge_cache`, + { + method: 'POST', + headers: { + authorization: `Bearer ${CLOUDFLARE_TOKEN}`, + 'content-type': 'application/json' + }, + body: JSON.stringify({ purge_everything: true }) + } + ); + + const body = await res.text(); + + if (!res.ok) { + throw new Error('Error response received from CloudFlare: ' + body); + } + + console.log('Cleared cache successfully'); +} + +if (CLOUDFLARE_TOKEN) { + main().catch(e => { + console.error(e); + process.exit(1); + }); +} diff --git a/scripts/exclude-links.txt b/scripts/exclude-links.txt index 74e4a9dada..d9f10dcbea 100644 --- a/scripts/exclude-links.txt +++ b/scripts/exclude-links.txt @@ -8,6 +8,7 @@ https://accounts.google.com/o/oauth2/auth https://api.github.com/repos/$ https://blog.$ https://circleci.com/gh/iterative/dvc.org +https://api.cloudflare.com/client/v4/zones/$ https://code.dvc.org/foo/bar https://data.dvc.org/foo/bar https://discuss.$ @@ -32,6 +33,9 @@ https://man.dvc.org/foo https://marketplace.visualstudio.com/items?itemName=stkb.rewrap https://myendpoint.com https://object-storage.example.com +https://remote.dvc.org/dataset-registry +https://remote.dvc.org/dataset-registry/a3/04af... 
+https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355 https://remote.dvc.org/foo/bar https://remote.dvc.org/get-started https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar diff --git a/server.js b/server.js index 70b057758c..673efcab86 100644 --- a/server.js +++ b/server.js @@ -28,10 +28,13 @@ app.prepare().then(() => { const { pathname, query } = parsedUrl const host = req.headers.host + res.setHeader('Cache-Control', 'public, max-age=0, s-maxage=99999') + let [redirectCode, redirectLocation] = getRedirect(host, pathname, { req, dev }) + if (redirectLocation) { // HTTP redirects @@ -40,7 +43,6 @@ app.prepare().then(() => { redirectLocation += '?' + queryStr } res.writeHead(redirectCode, { - 'Cache-control': 'no-cache', Location: redirectLocation }) res.end() diff --git a/src/components/Documentation/SidebarMenu/index.js b/src/components/Documentation/SidebarMenu/index.js index cb1ead6601..7253437034 100644 --- a/src/components/Documentation/SidebarMenu/index.js +++ b/src/components/Documentation/SidebarMenu/index.js @@ -75,15 +75,14 @@ export default function SidebarMenu({ id, sidebar, currentPath, onClick }) { setIsScrollHidden(true) - setTimeout(() => { - if (psRef.current) { - psRef.current.update() - scrollIntoView(node, parent, { onlyScrollIfNeeded: true }) - setIsScrollHidden(false) - } + const timeout = setTimeout(() => { + psRef.current.update() + scrollIntoView(node, parent, { onlyScrollIfNeeded: true }) + setIsScrollHidden(false) }, 400) return () => { + clearTimeout(timeout) psRef.current.destroy() psRef.current = null } diff --git a/src/components/DownloadButton/index.js b/src/components/DownloadButton/index.js index 268f9c8499..75138f79a3 100644 --- a/src/components/DownloadButton/index.js +++ b/src/components/DownloadButton/index.js @@ -19,7 +19,7 @@ import { Triangle } from './styles' -const VERSION = `0.83.0` +const VERSION = `0.87.0` const OSX = `osx` const WINDOWS = `win` const LINUX = `linux`