diff --git a/package.json b/package.json
index e4781f8558..4d8fe4ebd8 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,7 @@
"debug": "node --inspect-brk server.js",
"build": "next build",
"test": "jest",
- "start": "NODE_ENV=production node server.js",
+ "start": "./scripts/clear-cloudflare-cache.js; NODE_ENV=production node server.js",
"format-staged": "pretty-quick --staged --no-restage --bail",
"format-check": "prettier --check '{.,pages/**,public/static/docs/**,src/**}/*.{js,md,json}'",
"lint-check": "eslint --ext .json,.js src pages",
diff --git a/public/static/docs/api-reference/get_url.md b/public/static/docs/api-reference/get_url.md
new file mode 100644
index 0000000000..9a83e33a09
--- /dev/null
+++ b/public/static/docs/api-reference/get_url.md
@@ -0,0 +1,110 @@
+# dvc.api.get_url()
+
+Returns the URL to the storage location of a data file or directory tracked in a
+DVC project.
+
+```py
+def get_url(path: str,
+ repo: str = None,
+ rev: str = None,
+ remote: str = None) -> str
+```
+
+#### Usage:
+
+```py
+import dvc.api
+
+resource_url = dvc.api.get_url(
+ 'get-started/data.xml',
+ repo='https://github.com/iterative/dataset-registry')
+
+# resource_url is now "https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355"
+```
+
+## Description
+
+Returns the URL string of the storage location (in a
+[DVC remote](/doc/command-reference/remote)) where a target file or directory,
+specified by its `path` in a `repo` (DVC project), is stored.
+
+The URL is formed by reading the project's
+[remote configuration](/doc/command-reference/config#remote) and the
+[DVC-file](/doc/user-guide/dvc-file-format) where the given `path` is an
+output. The URL schema returned depends on the
+[type](/doc/command-reference/remote/add#supported-storage-types) of the
+`remote` used (see the [Parameters](#parameters) section).
+
+If the target is a directory, the returned URL will end in `.dir`. Refer to
+[Structure of cache directory](/doc/user-guide/dvc-files-and-directories#structure-of-cache-directory)
+and `dvc add` to learn more about how DVC handles data directories.
+
+⚠️ This function does not check for the actual existence of the file or
+directory in the remote storage.
+
+💡 Having the resource's URL, it should be possible to download it directly with
+an appropriate library, such as
+[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.download_fileobj)
+or
+[`paramiko`](https://docs.paramiko.org/en/stable/api/sftp.html#paramiko.sftp_client.SFTPClient.get).
+
+## Parameters
+
+- **`path`** - location and file name of the file or directory in `repo`,
+ relative to the project's root.
+
+- `repo` - specifies the location of the DVC project. It can be a URL or a file
+ system path. Both HTTP and SSH protocols are supported for online Git repos
+ (e.g. `[user@]server:project.git`). _Default_: The current project is used
+ (the current working directory tree is walked up to find it).
+
+- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as
+ a branch or tag name, or a commit hash). If `repo` is not a Git repo, this
+ option is ignored. _Default_: `HEAD`.
+
+- `remote` - name of the [DVC remote](/doc/command-reference/remote) to use to
+ form the returned URL string. _Default_: The
+ [default remote](/doc/command-reference/remote/default) of `repo` is used.
+
+## Exceptions
+
+- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project.
+
+- `dvc.exceptions.NoRemoteError` - no `remote` is found.
+
+## Example: Getting the URL to a DVC-tracked file
+
+```py
+import dvc.api
+
+resource_url = dvc.api.get_url(
+ 'get-started/data.xml',
+ repo='https://github.com/iterative/dataset-registry'
+ )
+
+print(resource_url)
+```
+
+The script above prints
+
+`https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355`
+
+This URL represents the location where the data is stored, and is built by
+reading the corresponding DVC-file
+([`get-started/data.xml.dvc`](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc))
+where the `md5` file hash is stored,
+
+```yaml
+outs:
+ - md5: a304afb96060aad90176268345e10355
+ path: get-started/data.xml
+```
+
+and the project configuration
+([`.dvc/config`](https://github.com/iterative/dataset-registry/blob/master/.dvc/config))
+where the remote URL is saved:
+
+```ini
+['remote "storage"']
+url = https://remote.dvc.org/dataset-registry
+```
diff --git a/public/static/docs/api-reference/index.md b/public/static/docs/api-reference/index.md
new file mode 100644
index 0000000000..0298b185ab
--- /dev/null
+++ b/public/static/docs/api-reference/index.md
@@ -0,0 +1,16 @@
+# Python API
+
+DVC can be used as a Python library: simply [install](/doc/install) it with
+`pip` or `conda`. This reference provides the details about the functions in
+the API module `dvc.api`, which can be imported in any regular way, for example:
+
+```py
+import dvc.api
+```
+
+The purpose of this API is to provide programmatic access to the data or models
+[stored and versioned](/doc/use-cases/versioning-data-and-model-files) in
+DVC repositories from Python code.
+
+Please choose a function from the navigation sidebar to the left, or click the
+`Next` button below to jump into the first one ↘
diff --git a/public/static/docs/api-reference/open.md b/public/static/docs/api-reference/open.md
new file mode 100644
index 0000000000..858090e0fb
--- /dev/null
+++ b/public/static/docs/api-reference/open.md
@@ -0,0 +1,200 @@
+# dvc.api.open()
+
+Opens a tracked file.
+
+```py
+def open(path: str,
+ repo: str = None,
+ rev: str = None,
+ remote: str = None,
+ mode: str = "r",
+ encoding: str = None)
+```
+
+#### Usage:
+
+```py
+import dvc.api
+
+with dvc.api.open(
+ 'get-started/data.xml',
+ repo='https://github.com/iterative/dataset-registry'
+ ) as fd:
+ # ... fd is a file object that can be processed normally.
+```
+
+## Description
+
+Open a data or model file tracked in a DVC project and generate a
+corresponding
+[file object](https://docs.python.org/3/glossary.html#term-file-object). The
+file can be tracked by DVC or by Git.
+
+> The exact type of file object depends on the `mode` used. For more details,
+> please refer to Python's
+> [`open()`](https://docs.python.org/3/library/functions.html#open) built-in,
+> which is used under the hood.
+
+`dvc.api.open()` may only be used as a
+[context manager](https://www.python.org/dev/peps/pep-0343/#context-managers-in-the-standard-library)
+(using the `with` keyword, as shown in the examples).
+
+This function makes a direct connection to the
+[remote storage](/doc/command-reference/remote/add#supported-storage-types)
+(except for Google Drive), so the file contents can be streamed. Your code can
+process the data [buffer](https://docs.python.org/3/c-api/buffer.html) as it's
+streamed, which optimizes memory usage.
+
+> Use `dvc.api.read()` to load the complete file contents in a single function
+> call – no _context manager_ involved. Neither function utilizes disc space.
+
+## Parameters
+
+- **`path`** - location and file name of the file in `repo`, relative to the
+ project's root.
+
+- `repo` - specifies the location of the DVC project. It can be a URL or a file
+ system path. Both HTTP and SSH protocols are supported for online Git repos
+ (e.g. `[user@]server:project.git`). _Default_: The current project is used
+ (the current working directory tree is walked up to find it).
+
+- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as
+ a branch or tag name, or a commit hash). If `repo` is not a Git repo, this
+ option is ignored. _Default_: `HEAD`.
+
+- `remote` - name of the [DVC remote](/doc/command-reference/remote) to look for
+ the target data. _Default_: The
+ [default remote](/doc/command-reference/remote/default) of `repo` is used if a
+ `remote` argument is not given. For local projects, the cache is
+ tried before the default remote.
+
+- `mode` - specifies the mode in which the file is opened. Defaults to `"r"`
+ (read). Mirrors the namesake parameter in builtin
+ [`open()`](https://docs.python.org/3/library/functions.html#open).
+
+- `encoding` -
+ [codec](https://docs.python.org/3/library/codecs.html#standard-encodings) used
+ to decode the file contents to a string. This should only be used in text
+ mode. Defaults to `"utf-8"`. Mirrors the namesake parameter in builtin
+ `open()`.
+
+## Exceptions
+
+- `dvc.exceptions.FileMissingError` - file in `path` is missing from `repo`.
+
+- `dvc.exceptions.PathMissingError` - `path` cannot be found in `repo`.
+
+- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project.
+
+- `dvc.exceptions.NoRemoteError` - no `remote` is found.
+
+## Example: Use data or models from DVC repositories
+
+Any data artifact hosted online can be processed directly in your
+Python code with this API. For example, an XML file tracked in a public DVC repo
+on GitHub can be processed like this:
+
+```py
+from xml.sax import parse
+import dvc.api
+from mymodule import mySAXHandler
+
+with dvc.api.open(
+ 'get-started/data.xml',
+ repo='https://github.com/iterative/dataset-registry'
+ ) as fd:
+ parse(fd, mySAXHandler)
+```
+
+Notice that we use a [SAX](http://www.saxproject.org/) XML parser here because
+`dvc.api.open()` is able to stream the data from
+[remote storage](/doc/command-reference/remote/add#supported-storage-types).
+(The `mySAXHandler` object should handle the event-driven parsing of the
+document in this case.) This increases the performance of the code (minimizing
+memory usage), and is typically faster than loading the whole data into memory.
+
+> If you just needed to load the complete file contents into memory, you can use
+> `dvc.api.read()` instead:
+>
+> ```py
+> from xml.dom.minidom import parseString
+> import dvc.api
+>
+> xmldata = dvc.api.read('get-started/data.xml',
+> repo='https://github.com/iterative/dataset-registry')
+> xmldom = parseString(xmldata)
+> ```
+
+## Example: Accessing private repos
+
+This is just a matter of using the right `repo` argument, for example an SSH URL
+(requires that the
+[credentials are configured](https://help.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh)
+locally):
+
+```py
+import dvc.api
+
+with dvc.api.open(
+ 'features.dat',
+ repo='git@server.com:path/to/repo.git'
+ ) as fd:
+ # ... Process 'features'
+```
+
+## Example: Use different versions of data
+
+The `rev` argument lets you specify any Git commit to look for an artifact. This
+way any previous version, or alternative experiment can be accessed
+programmatically. For example, let's say your DVC repo has tagged releases of a
+CSV dataset:
+
+```py
+import csv
+import dvc.api
+
+with dvc.api.open(
+ 'clean.csv',
+ rev='v1.1.0'
+ ) as fd:
+ reader = csv.reader(fd)
+ # ... Process 'clean' data from version 1.1.0
+```
+
+Also, notice that we didn't supply a `repo` argument in this example. DVC will
+attempt to find a DVC project to use in the current working
+directory tree, and look for the file contents of `clean.csv` in its local
+cache; no download will happen if found. See the
+[Parameters](#parameters) section for more info.
+
+## Example: Choose a specific remote as the data source
+
+Sometimes we may want to choose the [remote](/doc/command-reference/remote) data
+source, for example if the `repo` has no default remote set. This can be done by
+providing a `remote` argument:
+
+```py
+import re
+import dvc.api
+
+with dvc.api.open(
+ 'activity.log',
+ repo='location/of/dvc/project',
+ remote='my-s3-bucket'
+ ) as fd:
+ for line in fd:
+ match = re.search(r'user=(\w+)', line)
+ # ... Process users activity log
+```
+
+## Example: Specify the text encoding
+
+To choose which codec to open a text file with, send an `encoding` argument:
+
+```py
+import dvc.api
+
+with dvc.api.open(
+ 'data/nlp/words_ru.txt',
+ encoding='koi8_r') as fd:
+ # ... Process Russian words
+```
diff --git a/public/static/docs/api-reference/read.md b/public/static/docs/api-reference/read.md
new file mode 100644
index 0000000000..e83ae063b0
--- /dev/null
+++ b/public/static/docs/api-reference/read.md
@@ -0,0 +1,101 @@
+# dvc.api.read()
+
+Returns the contents of a tracked file.
+
+```py
+def read(path: str,
+ repo: str = None,
+ rev: str = None,
+ remote: str = None,
+ mode: str = "r",
+ encoding: str = None)
+```
+
+#### Usage:
+
+```py
+import dvc.api
+
+modelpkl = dvc.api.read(
+ 'model.pkl',
+ repo='https://github.com/example/project.git',
+ mode='rb')
+```
+
+## Description
+
+This function wraps [`dvc.api.open()`](/doc/api-reference/open), for a simple
+way to return the complete contents of a file tracked in a DVC
+project. The file can be tracked by DVC or by Git.
+
+> This is similar to the `dvc get` command in our CLI.
+
+The returned contents can be a
+[string](https://docs.python.org/3/library/stdtypes.html#text-sequence-type-str)
+or a [bytearray](https://docs.python.org/3/library/stdtypes.html#bytearray).
+These are loaded to memory directly (without using any disc space).
+
+> The type returned depends on the `mode` used. For more details, please refer
+> to Python's [`open()`](https://docs.python.org/3/library/functions.html#open)
+> built-in, which is used under the hood.
+
+## Parameters
+
+- **`path`** - location and file name of the file in `repo`, relative to the
+ project's root.
+
+- `repo` - specifies the location of the DVC project. It can be a URL or a file
+ system path. Both HTTP and SSH protocols are supported for online Git repos
+ (e.g. `[user@]server:project.git`). _Default_: The current project is used
+ (the current working directory tree is walked up to find it).
+
+- `rev` - Git commit (any [revision](https://git-scm.com/docs/revisions) such as
+ a branch or tag name, or a commit hash). If `repo` is not a Git repo, this
+ option is ignored. _Default_: `HEAD`.
+
+- `remote` - name of the [DVC remote](/doc/command-reference/remote) to look for
+ the target data. _Default_: The
+ [default remote](/doc/command-reference/remote/default) of `repo` is used if a
+ `remote` argument is not given. For local projects, the cache is
+ tried before the default remote.
+
+- `mode` - specifies the mode in which the file is opened. Defaults to `"r"`
+ (read). Mirrors the namesake parameter in builtin
+ [`open()`](https://docs.python.org/3/library/functions.html#open).
+
+- `encoding` -
+ [codec](https://docs.python.org/3/library/codecs.html#standard-encodings) used
+ to decode the file contents to a string. This should only be used in text
+ mode. Defaults to `"utf-8"`. Mirrors the namesake parameter in builtin
+ `open()`.
+
+## Exceptions
+
+- `dvc.exceptions.FileMissingError` - file in `path` is missing from `repo`.
+
+- `dvc.exceptions.PathMissingError` - `path` cannot be found in `repo`.
+
+- `dvc.api.UrlNotDvcRepoError` - `repo` is not a DVC project.
+
+- `dvc.exceptions.NoRemoteError` - no `remote` is found.
+
+## Example: Load data from a DVC repository
+
+Any data artifact hosted online can be loaded directly in your
+Python code with this API. For example, let's say that you want to load and
+deserialize a binary model from a repo on GitHub:
+
+```py
+import pickle
+import dvc.api
+
+model = pickle.loads(
+ dvc.api.read(
+ 'model.pkl',
+ repo='https://github.com/example/project.git',
+ mode='rb'
+ )
+ )
+```
+
+> We're using `'rb'` mode here for compatibility with `pickle.loads()`.
diff --git a/public/static/docs/command-reference/commit.md b/public/static/docs/command-reference/commit.md
index bb5399cdef..4ab9eeb2d4 100644
--- a/public/static/docs/command-reference/commit.md
+++ b/public/static/docs/command-reference/commit.md
@@ -45,7 +45,7 @@ needed after a `git commit`. See `dvc install` for more details.
stages. `dvc commit` can help avoid having to reproduce a pipeline in these
cases by forcing the update of the DVC-files.
-Let's take a look at what is happening in the fist scenario closely. Normally
+Let's take a look at what is happening in the first scenario closely. Normally
DVC commands like `dvc add`, `dvc repro` or `dvc run` commit the data to the
cache after creating a DVC-file. What _commit_ means is that DVC:
@@ -54,7 +54,7 @@ DVC commands like `dvc add`, `dvc repro` or `dvc run` commit the data to the
- Tells Git to ignore the file/directory (adding an entry to `.gitignore`).
(Note that if the project was initialized with no SCM support
(`dvc init --no-scm`), this does not happen.)
-- Adds the file/directory or to the cache.
+- Adds the file/directory to the cache.
There are many cases where the last step is not desirable (for example rapid
iterations on an experiment). The `--no-commit` option prevents the last step
@@ -258,7 +258,7 @@ that both Git and DVC recognize a change was made.
If we ran `dvc repro` at this point, this pipeline would be reproduced. But
since the change was inconsequential, that would be a waste of time and CPU.
-That's especially critical if the corresponding stages lots of resources to
+That's especially critical if the corresponding stages take lots of resources to
execute.
```dvc
diff --git a/public/static/docs/command-reference/config.md b/public/static/docs/command-reference/config.md
index 97b5f7f46f..27b3284a61 100644
--- a/public/static/docs/command-reference/config.md
+++ b/public/static/docs/command-reference/config.md
@@ -62,14 +62,14 @@ file (in `.dvc/config` by default), and they support the options below:
This is the main section with the general config options:
-- `core.loglevel` - log level that the `dvc` command should use. Possible values
- are: `info`, `debug`, `warning`, `error`.
+- `core.loglevel` - log level that the `dvc` command should use. Accepts values
+ `info`, `debug`, `warning`, or `error`.
- `core.remote` - name of the remote storage that should be used by default.
- `core.interactive` - whether to always ask for confirmation before reproducing
each [stage](/doc/command-reference/run) in `dvc repro`. (Normally, this
- behavior requires the use of option `-i` in that command.) Accepts values
+ behavior requires the use of option `-i` in that command.) Accepts values:
`true` and `false`.
- `core.analytics` - used to turn off
@@ -85,6 +85,11 @@ This is the main section with the general config options:
project is on a file system that doesn't properly support file locking (e.g.
[NFS v3 and older](http://nfs.sourceforge.net/)).
+- `core.no_scm` - tells DVC to not expect or integrate with Git (even if the
+ project is initialized inside a Git repo). Accepts values `true`
+ and `false` (default). Set with the `--no-scm` option of `dvc init`
+ ([more details](/doc/command-reference/init#initializing-dvc-without-git)).
+
### remote
These are sections in the config file that describe particular remotes. These
diff --git a/public/static/docs/command-reference/get-url.md b/public/static/docs/command-reference/get-url.md
index 1a45e8c992..4d7fccf6a6 100644
--- a/public/static/docs/command-reference/get-url.md
+++ b/public/static/docs/command-reference/get-url.md
@@ -3,8 +3,8 @@
Download a file or directory from a supported URL (for example `s3://`,
`ssh://`, and other protocols) into the local file system.
-> Unlike `dvc import-url`, this command does not track the downloaded data files
-> (does not create a DVC-file).
+> See `dvc get` to download data/model files or directories from other DVC
+> repositories (e.g. hosted on GitHub).
## Synopsis
@@ -22,15 +22,15 @@ In some cases it's convenient to get a data artifact from a remote
location into the local file system. The `dvc get-url` command helps the user do
just that.
+> Note that unlike `dvc import-url`, this command does not track the downloaded
+> data files (does not create a DVC-file). For that reason, this command doesn't
+> require an existing DVC project to run in.
+
The `url` argument should provide the location of the data to be downloaded,
while `out` can be used to specify the directory and/or file name desired for
the downloaded data. If an existing directory is specified, then the output will
be placed inside of it.
-Note that this command doesn't require an existing DVC project to
-run in. It's a single-purpose command that can be used out of the box after
-installing DVC.
-
DVC supports several types of (local or) remote locations (protocols):
| Type | Description | `url` format |
@@ -61,9 +61,6 @@ HTTP(S) it's possible to instead use:
$ wget https://example.com/path/to/data.csv
```
-> See `dvc get` to download data/model files or directories from other DVC
-> repositories (e.g. GitHub URLs).
-
## Options
- `-h`, `--help` - prints the usage/help message, and exit.
diff --git a/public/static/docs/command-reference/get.md b/public/static/docs/command-reference/get.md
index 84e26ae834..af0476dfdd 100644
--- a/public/static/docs/command-reference/get.md
+++ b/public/static/docs/command-reference/get.md
@@ -3,8 +3,7 @@
Download a file or directory tracked by DVC or by Git into the current working
directory.
-> Unlike `dvc import`, this command does not track the downloaded files (does
-> not create a DVC-file).
+> See also our `dvc.api.open()` Python API function.
## Synopsis
@@ -21,11 +20,12 @@ positional arguments:
Provides an easy way to download files or directories tracked in any DVC
repository (e.g. datasets, intermediate results, ML models), or Git
repository (e.g. source code, small image/other files). `dvc get` copies the
-target file or directory (`url`/`path`) to the current working directory.
-(Analogous to `wget`, but for repos.)
+target file or directory (found at `path` in `url`) to the current working
+directory. (Analogous to `wget`, but for repos.)
-Note that this command doesn't require an existing DVC project to run in. It's a
-single-purpose command that can be used out of the box after installing DVC.
+> Note that unlike `dvc import`, this command does not track the downloaded
+> files (does not create a DVC-file). For that reason, this command doesn't
+> require an existing DVC project to run in.
The `url` argument specifies the address of the DVC or Git repository containing
the data source. Both HTTP and SSH protocols are supported for online repos
diff --git a/public/static/docs/command-reference/import-url.md b/public/static/docs/command-reference/import-url.md
index 797971aecd..aa625396d7 100644
--- a/public/static/docs/command-reference/import-url.md
+++ b/public/static/docs/command-reference/import-url.md
@@ -4,8 +4,8 @@ Download a file or directory from a supported URL (for example `s3://`,
`ssh://`, and other protocols) into the workspace, and track
changes in the remote data source. Creates a DVC-file.
-> See also `dvc get-url`, that corresponds to the first half of what this
-> command does (downloading the data artifact).
+> See `dvc import` to download and track data/model files or directories from
+> other DVC repositories (e.g. hosted on GitHub).
## Synopsis
@@ -28,6 +28,9 @@ external data source changes. Example scenarios:
- A batch process running regularly updates a data file to import.
- A shared dataset on a remote storage that is managed and updated outside DVC.
+> Note that `dvc get-url` corresponds to the first step this command performs
+> (just download the file or directory).
+
The `dvc import-url` command helps the user create such an external data
dependency. The `url` argument specifies the external location of the data to be
imported, while `out` can be used to specify the directory and/or file name
@@ -103,9 +106,6 @@ Note that import stages are considered always locked, meaning that if you run
`dvc repro`, they won't be updated. Use `dvc update` on them to bring the import
up to date from the external data source.
-> See `dvc import` to download and tack data/model files or directories from
-> other DVC repositories (e.g. GitHub URLs).
-
## Options
- `-f FILE`, `--file FILE` - specify a path and/or file name for the DVC-file
diff --git a/public/static/docs/command-reference/import.md b/public/static/docs/command-reference/import.md
index 5940b6a0fd..b8e910be70 100644
--- a/public/static/docs/command-reference/import.md
+++ b/public/static/docs/command-reference/import.md
@@ -6,8 +6,7 @@ Download a file or directory tracked by DVC or by Git into the
source, which can later be used to [update](/doc/command-reference/update) the
import.
-> See also `dvc get`, that corresponds to the first step this command performs
-> (just download the data).
+> See also our `dvc.api.open()` Python API function.
## Synopsis
@@ -24,9 +23,13 @@ positional arguments:
Provides an easy way to reuse files or directories tracked in any DVC
repository (e.g. datasets, intermediate results, ML models) or Git
repository (e.g. source code, small image/other files). `dvc import` downloads
-the target file or directory (`url`/`path`) in a way so that it's tracked with
-DVC, becoming a local data artifact. This also permits updating the
-import later, if it has changed in its data source. (See `dvc update`.)
+the target file or directory (found at `path` in `url`) in a way so that it's
+tracked with DVC, becoming a local data artifact. This also permits
+updating the import later, if it has changed in its data source. (See
+`dvc update`.)
+
+> Note that `dvc get` corresponds to the first step this command performs (just
+> download the data).
The `url` argument specifies the address of the DVC or Git repository containing
the data source. Both HTTP and SSH protocols are supported for online repos
@@ -62,8 +65,7 @@ To actually [track the data](https://dvc.org/doc/get-started/add-files),
`git add` (and `git commit`) the import stage.
Note that import stages are considered always locked, meaning that if you run
-`dvc repro`, they won't be updated. Use `dvc update` or
-[re-import](#example-fixed-revisions-re-importing) them to update the downloaded
+`dvc repro`, they won't be updated. Use `dvc update` to update the downloaded
data artifact from the source repo.
## Options
@@ -129,7 +131,7 @@ Several of the values above are pulled from the original stage file
subfields under `repo` are used to save the origin and version of the
dependency, respectively.
-## Example: Fixed revisions & re-importing
+## Example: Fixed revisions and updating to different revision
To import a specific version of a data artifact, we may use the
`--rev` option:
@@ -159,23 +161,14 @@ deps:
If `rev` is a Git branch or tag (where the underlying commit changes), the data
source may have updates at a later time. To bring it up to date if so (and
update `rev_lock` in the DVC-file), simply use `dvc update .dvc`. If
-`rev` is a specific commit hash (does not change), `dvc update` will never have
-an effect on the import stage. You may **re-import** a different commit instead,
-by using `dvc import` again with a different (or without) `--rev`. For example:
+`rev` is a specific commit (does not change), `dvc update` will never have an
+effect on the import stage. You may `dvc update` to a different commit, using
+`--rev`:
```dvc
-$ dvc import --rev master \
- git@github.com:iterative/dataset-registry.git \
- use-cases/cats-dogs
+$ dvc update --rev cats-dogs-v2
```
-The import stage is overwritten, and will now be able update normally with
-`dvc update`.
-
-> In the above example, the value for `rev` in the new import stage will be
-> `master` (default branch), so the command is equivalent to not using `--rev`
-> at all.
-
## Example: Data registry
If you take a look at our
diff --git a/public/static/docs/command-reference/init.md b/public/static/docs/command-reference/init.md
index a00f415284..13dd336cf4 100644
--- a/public/static/docs/command-reference/init.md
+++ b/public/static/docs/command-reference/init.md
@@ -1,39 +1,167 @@
# init
-This command initializes a DVC project on a directory.
-
-Note that by default the current working directory is expected to contain a Git
-repository, unless the `--no-scm` option is used.
+Initialize a DVC project in the current working directory.
## Synopsis
```usage
-usage: dvc init [-h] [-q | -v] [--no-scm] [-f]
+usage: dvc init [-h] [-q | -v] [--no-scm] [-f] [--subdir]
```
## Description
+DVC works on top of a Git repository by default. This enables all features,
+providing the most value. It means that `dvc init` (without flags) expects to
+run in a Git repository root (a `.git/` directory should be present).
+
+The command options can be used to start an alternative workflow for advanced
+scenarios like monorepos, automation, etc:
+
+- [Initializing DVC in subdirectories](#initializing-dvc-in-subdirectories) -
+ support for monorepos, nested DVC projects, etc.
+- [Initializing DVC without Git](#initializing-dvc-without-git) - support for
+ SCM other than Git, deployment automation cases, etc.
+
After DVC initialization, a new directory `.dvc/` will be created with the
`config` and `.gitignore` files. These and other files and directories are
hidden from user, as typically there's no need to interact with them directly.
See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to
learn more.
-`.dvc/cache` is one of the most important
-[DVC directories](/doc/user-guide/dvc-files-and-directories). It will hold all
-the contents of tracked data files. Note that `.dvc/.gitignore` lists this
-directory, which means that the cache directory is not tracked by Git. This is a
-local cache and you cannot `git push` it.
+### Initializing DVC in subdirectories
+
+`--subdir` must be provided to initialize DVC in a subdirectory of a Git
+repository. DVC still expects to find the Git repository (will check all
+directories up to the root to find `.git`). This option does not affect any
+config files, `.dvc` directory is created the same way as in the default mode.
+This way multiple DVC projects (including nested ones) could be initialized in a
+single Git repository providing isolation and granular project management.
+
+#### When is this useful?
+
+This option is mostly used in the scenario of a
+[monorepo](https://en.wikipedia.org/wiki/Monorepo), but also can be used in
+other workflows when such isolation and/or advanced granularity is needed.
+
+Let's imagine we have an existing Git repository that is split into sub-projects
+(monorepo). In this case `dvc init --subdir` can be run in one or many
+sub-projects to mitigate the issues of initializing in the Git repository root:
+
+- Repository maintainers might not allow an extra top-level `.dvc` directory,
+ especially if DVC is being used by a small number of sub-projects.
+
+- Not enough isolation/granularity - DVC config, cache, and other files are
+ shared across different sub-projects. This means that it's not easy to use
+ different remote storages, for example, for different sub-projects, etc.
+
+- Not enough isolation/granularity - commands like `dvc pull`, `dvc checkout`,
+ and others analyze the whole repository to look for
+ [DVC-files](/doc/user-guide/dvc-file-format) to download files and
+ directories, to reproduce pipelines, etc. It can be expensive in
+ large repositories with a lot of projects.
+
+- Not enough isolation/granularity - commands like `dvc metrics diff`,
+ `dvc pipeline show` and others by default dump all the metrics, all the
+ pipelines, etc.
+
+#### How does it affect DVC commands?
+
+No matter what mode is used, DVC looks for the `.dvc` directory when it starts
+(from the current working directory and up). Location of the found `.dvc`
+directory determines the root of the DVC project. (With `--subdir`, the Git
+repository root may be located at a different path than the DVC project root.)
+
+The DVC project root defines the scope for most DVC commands, mostly meaning
+that all DVC-files under the root path are analyzed.
+
+If there are multiple DVC sub-projects but they _are not_ nested, e.g.:
+
+```sh
+.
+├── .git
+|
+├── project-A
+│ └── .dvc
+│ ...
+├── project-B
+│ └── .dvc
+│ ...
+```
-## Options
+DVC considers them two separate DVC projects. Any DVC command that is run in
+`project-A` is not aware of `project-B`. DVC does not consider the Git
+repository root an initialized DVC project in this case, and commands that
+require a DVC project will raise an error.
+
+On the other hand, if there _are_ nested DVC projects, e.g.:
+
+```sh
+project-A
+├── .dvc
+├── data-A.dvc
+│ ...
+└── project-B
+ ├── .dvc
+ ├── data-B.dvc
+ │ ...
+```
+
+Nothing changes for `project-B`. But any DVC command run in `project-A`
+ignores the whole `project-B/` directory, meaning for example:
+
+```dvc
+$ cd project-A
+$ dvc pull
+```
+
+won't download or checkout data for the `data-B.dvc` file.
+
+### Initializing DVC without Git
+
+In rare cases, the `--no-scm` option can be used to initialize DVC in a
+directory that is not part of a Git repository, or to make DVC ignore Git.
+Examples include:
+
+- SCM other than Git is being used. Even though there are DVC features that
+ require DVC to be run in the Git repo, DVC can work well with other version
+ control systems. Since DVC relies on simple text
+ [DVC-files](/doc/user-guide/dvc-file-format) to manage pipelines,
+ data, etc, they can be added into any SCM thus providing large data files and
+ directories versioning.
+
+- There is no need to keep the history at all, e.g. having a deployment
+ automation like running a data pipeline using `cron`.
+
+In this mode DVC features that depend on Git being present are not available -
+e.g. managing `.gitignore` files on `dvc add` or `dvc run` to avoid committing
+DVC-tracked files into Git, or `dvc diff` and `dvc metrics diff` that accept
+Git-revisions to compare, etc.
-- `--no-scm` - skip Git specific initialization, `.dvc/.gitignore` will not be
- written.
+DVC sets the `core.no_scm` option value to `true` in the DVC
+[config](/doc/command-reference/config) when it is initialized this way. It
+means that even if the project was Git-tracked already or Git is initialized in
+it later, DVC keeps operating detached from Git.
+
+## Options
- `-f`, `--force` - remove `.dvc/` if it exists before initialization. Will
remove any existing local cache. Useful when a previous `dvc init` has been
corrupted.
+- `--subdir` - initialize the DVC project in the current working directory,
+ _even if it's not the Git repository root_. (If run in a project root, this
+ option is ignored.) It affects how other DVC commands behave afterwards,
+ please see
+ [Initializing DVC in subdirectories](#initializing-dvc-in-subdirectories) for
+ more details.
+
+- `--no-scm` - initialize the DVC project detached from Git. It means that DVC
+ doesn't try to find or use Git in the directory it's initialized in. Certain
+ DVC features are not available in this mode, please see
+ [Initializing DVC without Git](#initializing-dvc-without-git) for more
+ details.
+
- `-h`, `--help` - prints the usage/help message, and exit.
- `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no
@@ -41,9 +169,10 @@ local cache and you cannot `git push` it.
- `-v`, `--verbose` - displays detailed tracing information.
-## Examples
+## Examples: Most common initialization workflow
-Create a new DVC repository (requires Git):
+Create a new DVC repository (must be run in the Git
+repository root):
```dvc
$ mkdir example && cd example
@@ -67,3 +196,30 @@ $ cat .dvc/.gitignore
...
/cache
```
+
+## Examples: Initializing DVC in a subdirectory
+
+Create a new DVC repository in a subdirectory of a Git repository:
+
+```dvc
+$ mkdir repo && cd repo
+
+$ git init
+$ mkdir project-a && cd project-a
+
+$ dvc init --subdir
+```
+
+In this case, the Git repository is in the `repo` directory, while the DVC
+repository is in `repo/project-a`.
+
+```dvc
+$ tree repo -a
+repo
+├── .git
+.
+.
+.
+└── project-a
+ └── .dvc
+```
diff --git a/public/static/docs/command-reference/update.md b/public/static/docs/command-reference/update.md
index 630c342867..9cba5d609a 100644
--- a/public/static/docs/command-reference/update.md
+++ b/public/static/docs/command-reference/update.md
@@ -6,7 +6,7 @@ projects, and corresponding [DVC-files](/doc/user-guide/dvc-file-format).
## Synopsis
```usage
-usage: dvc update [-h] [-q | -v] targets [targets ...]
+usage: dvc update [-h] [-q | -v] [--rev [REV]] targets [targets ...]
positional arguments:
targets DVC-files to update.
@@ -27,12 +27,24 @@ Note that import stages are considered always locked, meaning that if you run
update them.
`dvc update` will not have an effect on import stages that are fixed to a commit
-hash (`rev` field in the DVC-file). Please refer to
-[Fixed revisions & re-importing](/doc/command-reference/import#example-fixed-revisions-re-importing)
-for more details.
+hash (`rev` field in the DVC-file). To update the imported artifacts to a
+certain revision, use `--rev` with the desired revision:
+
+```dvc
+$ dvc update --rev master model.pkl.dvc
+```
## Options
+- `--rev` - specific
+ [Git revision](https://git-scm.com/book/en/v2/Git-Internals-Git-References)
+ (such as a branch name, a tag, or a commit hash) of the repository to update
+ the file or directory from (also starts tracking the given revision).
+
+ > Note that this adds or updates a `rev` field in the DVC-file that fixes it
+ > to this revision (and updates `rev_lock` in the DVC-file). This can have an
+ > impact on the behavior of `dvc update` later.
+
- `-h`, `--help` - prints the usage/help message, and exit.
- `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no
@@ -40,7 +52,7 @@ for more details.
- `-v`, `--verbose` - displays detailed tracing information.
-## Examples
+## Example: Updating imported artifacts
Let's first import a data artifact from our
[get started example repo](https://github.com/iterative/example-get-started):
@@ -69,3 +81,41 @@ stable.
> Note that `dvc update` updates the `rev_lock` field of the corresponding
> [DVC-file](/doc/user-guide/dvc-file-format) (when there are changes to bring
> in).
+
+## Example: Updating imported artifacts to a specified revision
+
+Let's first import a data artifact from an older commit of our
+[get started example repo](https://github.com/iterative/example-get-started):
+
+```dvc
+$ dvc import --rev baseline-experiment git@github.com:iterative/example-get-started model.pkl
+Importing 'model.pkl (git@github.com:iterative/example-get-started)'
+-> 'model.pkl'
+```
+
+After this, the import stage (DVC-file) `model.pkl.dvc` is created.
+
+Let's try to run `dvc update` on the given stage file, and see what happens.
+
+```dvc
+$ dvc update model.pkl.dvc
+```
+
+There was no output at all, meaning the `model.pkl` file was not updated. This
+is because we tied the import stage to a `rev` that never changes (i.e. a tag
+is tied to a specific commit). Therefore, it was not updated.
+
+Let's try to update the model to a different experiment, `bigrams-experiment`:
+
+```dvc
+$ dvc update --rev bigrams-experiment model.pkl.dvc
+Importing 'model.pkl (git@github.com:iterative/example-get-started)'
+-> 'model.pkl'
+```
+
+The import stage is overwritten, and will get updated from the latest changes in
+the given revision (i.e. `bigrams-experiment` tag).
+
+> In the above example, the value for `rev` in the new import stage will be
+> `bigrams-experiment`.
diff --git a/public/static/docs/glossary.js b/public/static/docs/glossary.js
index bb6d10bc59..fe982958ac 100644
--- a/public/static/docs/glossary.js
+++ b/public/static/docs/glossary.js
@@ -16,6 +16,7 @@ code, ML models, etc. It will conatain your DVC project.
name: 'DVC Project',
match: [
'DVC project',
+ 'DVC projects',
'project',
'projects',
'DVC repository',
diff --git a/public/static/docs/install/index.md b/public/static/docs/install/index.md
index c3ed40abb6..c375c0ca61 100644
--- a/public/static/docs/install/index.md
+++ b/public/static/docs/install/index.md
@@ -7,6 +7,13 @@ Please double check that you don't already have DVC (for example running
- [Install on Windows](/doc/install/windows)
- [Install on Linux](/doc/install/linux)
+## Install as a Python library
+
+DVC can be used as a Python library. Simply install it with a package manager
+like `pip` or `conda`, or add it as a Python
+[project requirement](https://pip.pypa.io/en/latest/user_guide/#requirements-files)
+if needed. The [Python API](/doc/api-reference) module is `dvc.api`.
+
## Advanced options
- Shell completion is automatically enabled by certain installation methods. If
diff --git a/public/static/docs/install/linux.md b/public/static/docs/install/linux.md
index 598f73b231..5b5ab6c22d 100644
--- a/public/static/docs/install/linux.md
+++ b/public/static/docs/install/linux.md
@@ -1,5 +1,8 @@
# Installation on Linux
+> To use DVC [as a Python library](/doc/api-reference), please
+> [install with pip](#install-with-pip) or [with conda](#install-with-conda).
+
## Install with pip
> We **strongly** recommend creating a
diff --git a/public/static/docs/install/macos.md b/public/static/docs/install/macos.md
index d7d9550c9e..3a231e4647 100644
--- a/public/static/docs/install/macos.md
+++ b/public/static/docs/install/macos.md
@@ -1,5 +1,8 @@
# Installation on MacOS
+> To use DVC [as a Python library](/doc/api-reference), please
+> [install with pip](#install-with-pip) or [with conda](#install-with-conda).
+
## Install with brew
Recommended. Requires [Homebrew](https://brew.sh/).
diff --git a/public/static/docs/install/windows.md b/public/static/docs/install/windows.md
index b1b1a1b762..cc2e78e4fe 100644
--- a/public/static/docs/install/windows.md
+++ b/public/static/docs/install/windows.md
@@ -4,6 +4,11 @@
> [Running DVC on Windows](/doc/user-guide/running-dvc-on-windows) for important
> tips to improve your experience using DVC on Windows.
+
+
+> To use DVC [as a Python library](/doc/api-reference), please
+> [install with pip](#install-with-pip) or [with conda](#install-with-conda).
+
## Windows installer
The easiest way is to use the self-contained, executable installer (binary),
diff --git a/public/static/docs/sidebar.json b/public/static/docs/sidebar.json
index a13ff3591b..6cd58aeba4 100644
--- a/public/static/docs/sidebar.json
+++ b/public/static/docs/sidebar.json
@@ -364,6 +364,25 @@
}
]
},
+ {
+ "slug": "api-reference",
+ "label": "Python API Reference",
+ "source": "api-reference/index.md",
+ "children": [
+ {
+ "slug": "get_url",
+ "label": "get_url()"
+ },
+ {
+ "slug": "open",
+ "label": "open()"
+ },
+ {
+ "slug": "read",
+ "label": "read()"
+ }
+ ]
+ },
{
"slug": "understanding-dvc",
"label": "Understanding DVC",
diff --git a/public/static/docs/use-cases/data-registries.md b/public/static/docs/use-cases/data-registries.md
index 68d3a4ddaf..4cb28cde84 100644
--- a/public/static/docs/use-cases/data-registries.md
+++ b/public/static/docs/use-cases/data-registries.md
@@ -89,8 +89,8 @@ $ dvc push
## Using registries
The main methods to consume data artifacts from a **data registry**
-are the `dvc import` and `dvc get` commands, as well as the `dvc.api` Python
-API.
+are the `dvc import` and `dvc get` commands, as well as the
+[`dvc.api`](/doc/api-reference) Python API.
### Simple download (get)
@@ -141,7 +141,7 @@ $ dvc update dataset.dvc
`images/faces/`, based on the latest commit in the source repo. It also updates
the project dependency metadata in the import stage (DVC-file).
-### Programatic reusability of DVC data
+### Programmatic reusability of DVC data
Our Python API, included with the `dvc` package installed with DVC, includes the
`open` function to load/stream data directly from external DVC
diff --git a/public/static/docs/user-guide/dvcignore.md b/public/static/docs/user-guide/dvcignore.md
index 5afc16576a..d997ae9130 100644
--- a/public/static/docs/user-guide/dvcignore.md
+++ b/public/static/docs/user-guide/dvcignore.md
@@ -150,7 +150,7 @@ data.dvc:
modified: data
```
-## Example: Ignore dvc controlled file
+## Example: Ignore DVC tracked file
Let's analyze an example workspace:
diff --git a/scripts/clear-cloudflare-cache.js b/scripts/clear-cloudflare-cache.js
new file mode 100755
index 0000000000..a9a2e9c68d
--- /dev/null
+++ b/scripts/clear-cloudflare-cache.js
@@ -0,0 +1,49 @@
+#!/usr/bin/env node
+/* global process */
+
+// This script runs just before the app starts. If we are running the
+// production Heroku app (the only one with the below env variables)
+// the cache gets cleared.
+//
+// To clear the cache yourself, you can use the button in the
+// Cloudflare dashboard ("Caching" tab > "Purge everything"), or run
+// this script with the required environment variables:
+//
+// - CLOUDFLARE_TOKEN: a token with the "Zone.Cache Purge" permission.
+// You can generate this token in "My Profile > API Tokens"
+//
+// - CLOUDFLARE_ZONE_ID: The zone ID to purge. You can find it in the
+// sidebar of the "overview" tab for dvc.org
+
+const fetch = require('isomorphic-fetch');
+
+const { CLOUDFLARE_TOKEN, CLOUDFLARE_ZONE_ID } = process.env;
+
+async function main() { // Ask Cloudflare to purge the entire cache of the configured zone; rejects if the response is not ok
+ const res = await fetch(
+ `https://api.cloudflare.com/client/v4/zones/${CLOUDFLARE_ZONE_ID}/purge_cache`,
+ {
+ method: 'POST',
+ headers: {
+ authorization: `Bearer ${CLOUDFLARE_TOKEN}`,
+ 'content-type': 'application/json'
+ },
+ body: JSON.stringify({ purge_everything: true }) // purge the whole zone rather than listing individual URLs
+ }
+ );
+
+ const body = await res.text(); // read the body before the status check so a failure can quote it
+
+ if (!res.ok) { // non-ok status: fail with the raw response body in the message
+ throw new Error('Error response received from CloudFlare: ' + body);
+ }
+
+ console.log('Cleared cache successfully');
+}
+
+if (CLOUDFLARE_TOKEN) { // only attempt the purge when a token is present in the environment; otherwise do nothing
+ main().catch(e => {
+ console.error(e);
+ process.exit(1); // exit non-zero when the purge fails
+ });
+}
diff --git a/scripts/exclude-links.txt b/scripts/exclude-links.txt
index 74e4a9dada..d9f10dcbea 100644
--- a/scripts/exclude-links.txt
+++ b/scripts/exclude-links.txt
@@ -8,6 +8,7 @@ https://accounts.google.com/o/oauth2/auth
https://api.github.com/repos/$
https://blog.$
https://circleci.com/gh/iterative/dvc.org
+https://api.cloudflare.com/client/v4/zones/$
https://code.dvc.org/foo/bar
https://data.dvc.org/foo/bar
https://discuss.$
@@ -32,6 +33,9 @@ https://man.dvc.org/foo
https://marketplace.visualstudio.com/items?itemName=stkb.rewrap
https://myendpoint.com
https://object-storage.example.com
+https://remote.dvc.org/dataset-registry
+https://remote.dvc.org/dataset-registry/a3/04af...
+https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355
https://remote.dvc.org/foo/bar
https://remote.dvc.org/get-started
https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar
diff --git a/server.js b/server.js
index 70b057758c..673efcab86 100644
--- a/server.js
+++ b/server.js
@@ -28,10 +28,13 @@ app.prepare().then(() => {
const { pathname, query } = parsedUrl
const host = req.headers.host
+ res.setHeader('Cache-Control', 'public, max-age=0, s-maxage=99999')
+
let [redirectCode, redirectLocation] = getRedirect(host, pathname, {
req,
dev
})
+
if (redirectLocation) {
// HTTP redirects
@@ -40,7 +43,6 @@ app.prepare().then(() => {
redirectLocation += '?' + queryStr
}
res.writeHead(redirectCode, {
- 'Cache-control': 'no-cache',
Location: redirectLocation
})
res.end()
diff --git a/src/components/Documentation/SidebarMenu/index.js b/src/components/Documentation/SidebarMenu/index.js
index cb1ead6601..7253437034 100644
--- a/src/components/Documentation/SidebarMenu/index.js
+++ b/src/components/Documentation/SidebarMenu/index.js
@@ -75,15 +75,14 @@ export default function SidebarMenu({ id, sidebar, currentPath, onClick }) {
setIsScrollHidden(true)
- setTimeout(() => {
- if (psRef.current) {
- psRef.current.update()
- scrollIntoView(node, parent, { onlyScrollIfNeeded: true })
- setIsScrollHidden(false)
- }
+ const timeout = setTimeout(() => {
+ psRef.current.update()
+ scrollIntoView(node, parent, { onlyScrollIfNeeded: true })
+ setIsScrollHidden(false)
}, 400)
return () => {
+ clearTimeout(timeout)
psRef.current.destroy()
psRef.current = null
}
diff --git a/src/components/DownloadButton/index.js b/src/components/DownloadButton/index.js
index 268f9c8499..75138f79a3 100644
--- a/src/components/DownloadButton/index.js
+++ b/src/components/DownloadButton/index.js
@@ -19,7 +19,7 @@ import {
Triangle
} from './styles'
-const VERSION = `0.83.0`
+const VERSION = `0.87.0`
const OSX = `osx`
const WINDOWS = `win`
const LINUX = `linux`