Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
)
from dvc.ignore import DvcIgnoreFilter
from dvc.log import logger
from dvc.utils import as_posix
from dvc.utils.objects import cached_property

if TYPE_CHECKING:
Expand Down Expand Up @@ -351,16 +350,6 @@ def fs(self, fs: "FileSystem"):
# fs.
self._reset()

@property
def subrepo_relpath(self) -> str:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[F] Moved into dvc.utils.studio. Tests moved too.

from dvc.fs import GitFileSystem

scm_root_dir = "/" if isinstance(self.fs, GitFileSystem) else self.scm.root_dir

relpath = as_posix(self.fs.relpath(self.root_dir, scm_root_dir))

return "" if relpath == "." else relpath

@property
def data_index(self) -> "DataIndex":
from dvc_data.index import DataIndex
Expand Down
10 changes: 9 additions & 1 deletion dvc/repo/experiments/executor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
from dvc.stage.serialize import to_lockfile
from dvc.utils import dict_sha256, env2bool, relpath
from dvc.utils.fs import remove
from dvc.utils.studio import env_to_config
from dvc.utils.studio import (
env_to_config,
get_dvc_experiment_parent_data,
get_subrepo_relpath,
)

if TYPE_CHECKING:
from queue import Queue
Expand Down Expand Up @@ -624,6 +628,10 @@ def _repro_dvc(
params=to_studio_params(dvc.params.show()),
dvc_studio_config=dvc_studio_config,
message=message,
subdir=get_subrepo_relpath(dvc),
dvc_experiment_parent_data=get_dvc_experiment_parent_data(
dvc, info.baseline_rev
),
)
logger.debug("Running repro in '%s'", os.getcwd())
yield dvc
Expand Down
5 changes: 3 additions & 2 deletions dvc/scm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Manages source control systems (e.g. Git)."""

import os
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
Expand Down Expand Up @@ -181,7 +182,7 @@ def resolve_rev(scm: Union["Git", "NoSCM"], rev: str) -> str:
raise RevError(str(exc)) # noqa: B904


def _get_n_commits(scm: "Git", revs: list[str], num: int) -> list[str]:
def get_n_commits(scm: "Git", revs: list[str], num: int) -> list[str]:
results = []
for rev in revs:
if num == 0:
Expand Down Expand Up @@ -227,7 +228,7 @@ def iter_revs(
return {}

revs = revs or []
results: list[str] = _get_n_commits(scm, revs, num)
results: list[str] = get_n_commits(scm, revs, num)

if all_commits:
results.extend(scm.list_all_commits())
Expand Down
53 changes: 52 additions & 1 deletion dvc/utils/studio.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union
from urllib.parse import urljoin

import requests
Expand All @@ -12,10 +12,14 @@
DVC_STUDIO_URL,
)
from dvc.log import logger
from dvc.utils import as_posix

if TYPE_CHECKING:
from requests import Response

from dvc.repo import Repo


logger = logger.getChild(__name__)

STUDIO_URL = "https://studio.iterative.ai"
Expand Down Expand Up @@ -111,3 +115,50 @@ def env_to_config(env: dict[str, Any]) -> dict[str, Any]:
if DVC_STUDIO_URL in env:
config["url"] = env[DVC_STUDIO_URL]
return config


def get_subrepo_relpath(repo: "Repo") -> str:
from dvc.fs import GitFileSystem

scm_root_dir = "/" if isinstance(repo.fs, GitFileSystem) else repo.scm.root_dir

relpath = as_posix(repo.fs.relpath(repo.root_dir, scm_root_dir))

return "" if relpath == "." else relpath


def get_dvc_experiment_parent_data(
repo: "Repo", baseline_rev: Union[str, None]
) -> Union[dict[str, Any], None]:
from scmrepo.exceptions import SCMError

from dvc.scm import NoSCM, get_n_commits

scm = repo.scm

try:
if (
not baseline_rev
or not scm
or isinstance(scm, NoSCM)
or not (commit := scm.resolve_commit(baseline_rev))
or not (
first_100_parent_shas := get_n_commits(scm, [baseline_rev], 101)[1:]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: shy do we need this? (looks complicated, and not very obvious)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See https://github.com/iterative/studio/issues/8848#issuecomment-1935151710

It is complicated and it is needed because of commits missing from Studio.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. A few thoughts / questions:

  • should it be fine for us at least initially disregard it (and improve later?). Rely on the existence of the baseline commit in Studio? (if I understood the issue correctly, apologies if not)
  • should it be optional (remove it from if .. or not) - so that's it's not a blocker for sending at least basic data?
  • probably a place where a comment would not hurt :) (absolutely your call on this)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • should it be fine for us at least initially disregard it (and improve later?). Rely on the existence of the baseline commit in Studio? (if I understood the issue correctly, apologies if not)
  • should it be optional (remove it from if .. or not) - so that's it's not a blocker for sending at least basic data?

This data is additional to the baseline_rev not instead of. The extra data will only be used if the baseline cannot be found in Studio. Under those circumstances we will try to create a "fake" parent. Without the parent commits we will not know where to insert the "fake" parent and so will still end up with a detached commit. IMO there is therefore no point in sending the extra data without the parent commits.

This solutions is the best (and maybe only) available given the current constraints.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify and double check.get_dvc_experiment_parent_data would return None (no even baseline sent) if we can't get first_100_parent_shas for some reason ... is it correct?

My question was - can there a be situation when getting just a single baseline works, but 100 fails.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The baseline is duplicated in the payload sent for the start event. It is sent both inside and outside of the dvc_experiment_parent_data (smaller change). An example payload is shown in this test. I.e.

        assert start_call.kwargs["json"] == {
            "type": "start",
            "repo_url": "STUDIO_REPO_URL",
            "baseline_sha": baseline_sha,
            "name": name,
            "params": {"params.yaml": {"foo": 1}},
            "client": "dvc",
            "dvc_experiment_parent_data": {
                "author": {
                    "email": "dvctester@example.com",
                    "name": "DVC Tester",
                },
                "date": M.any,
                "message": "init",
                "parent_shas": M.any,
                "title": "init",
                "sha": baseline_sha,
            },
        }

Copy link
Contributor Author

@mattseddon mattseddon Feb 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My question was - can there a be situation when getting just a single baseline works, but 100 fails.

Shouldn't happen but get_dvc_experiment_parent_data is meant to be designed to not stop the start event being sent OR brick the experiment if the required data is not found.

)
):
return None

return {
"author": {
"name": commit.author_name,
"email": commit.author_email,
},
"date": commit.author_datetime.isoformat(),
"message": commit.message,
"parent_shas": first_100_parent_shas,
"sha": commit.hexsha,
"title": commit.message.partition("\n")[0].strip(),
}

except SCMError:
return None
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies = [
"dvc-data>=3.10,<3.12",
"dvc-http>=2.29.0",
"dvc-render>=1.0.1,<2",
"dvc-studio-client>=0.19,<1",
"dvc-studio-client@git+https://github.com/iterative/dvc-studio-client.git@refs/pull/144/head",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[I] Do not merge before releasing dvc-studio-client & studio and updating this.

"dvc-task>=0.3.0,<1",
"flatten_dict<1,>=0.4.1",
# https://github.com/iterative/dvc/issues/9654
Expand Down
122 changes: 120 additions & 2 deletions tests/integration/test_studio_live_experiments.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import pytest
from dvc_studio_client import env, post_live_metrics
from funcy import first
Expand All @@ -8,12 +10,15 @@
DVC_STUDIO_TOKEN,
DVC_STUDIO_URL,
)
from dvc.repo import Repo
from dvc.utils.studio import get_dvc_experiment_parent_data, get_subrepo_relpath


@pytest.mark.studio
@pytest.mark.parametrize("tmp", [True, False])
@pytest.mark.parametrize("offline", [True, False])
def test_post_to_studio(
tmp_dir, dvc, scm, exp_stage, mocker, monkeypatch, tmp, offline
M, tmp_dir, dvc, scm, exp_stage, mocker, monkeypatch, tmp, offline
):
valid_response = mocker.MagicMock()
valid_response.status_code = 200
Expand Down Expand Up @@ -53,6 +58,17 @@ def test_post_to_studio(
"name": name,
"params": {"params.yaml": {"foo": 1}},
"client": "dvc",
"dvc_experiment_parent_data": {
"author": {
"email": "dvctester@example.com",
"name": "DVC Tester",
},
"date": M.any,
"message": "init",
"parent_shas": M.any,
"title": "init",
"sha": baseline_sha,
},
}

assert done_call.kwargs["json"] == {
Expand All @@ -66,9 +82,10 @@ def test_post_to_studio(
}


@pytest.mark.studio
@pytest.mark.parametrize("tmp", [True, False])
def test_post_to_studio_custom_message(
tmp_dir, dvc, scm, exp_stage, mocker, monkeypatch, tmp
M, tmp_dir, dvc, scm, exp_stage, mocker, monkeypatch, tmp
):
valid_response = mocker.MagicMock()
valid_response.status_code = 200
Expand Down Expand Up @@ -97,4 +114,105 @@ def test_post_to_studio_custom_message(
"params": {"params.yaml": {"foo": 1}},
"client": "dvc",
"message": "foo",
"dvc_experiment_parent_data": {
"author": {
"email": "dvctester@example.com",
"name": "DVC Tester",
},
"date": M.any,
"message": "init",
"parent_shas": M.any,
"title": "init",
"sha": baseline_sha,
},
}


@pytest.mark.studio
def test_monorepo_relpath(tmp_dir, scm):
from dvc.repo.destroy import destroy

tmp_dir.gen({"project_a": {}, "subdir/project_b": {}})

non_monorepo = Repo.init(tmp_dir)
assert get_subrepo_relpath(non_monorepo) == ""

destroy(non_monorepo)

monorepo_project_a = Repo.init(tmp_dir / "project_a", subdir=True)

assert get_subrepo_relpath(monorepo_project_a) == "project_a"

monorepo_project_b = Repo.init(tmp_dir / "subdir" / "project_b", subdir=True)

assert get_subrepo_relpath(monorepo_project_b) == "subdir/project_b"


@pytest.mark.studio
def test_virtual_monorepo_relpath(tmp_dir, scm):
from dvc.fs.git import GitFileSystem
from dvc.repo.destroy import destroy

tmp_dir.gen({"project_a": {}, "subdir/project_b": {}})
scm.commit("initial commit")
gfs = GitFileSystem(scm=scm, rev="master")

non_monorepo = Repo.init(tmp_dir)
non_monorepo.fs = gfs
non_monorepo.root_dir = "/"

assert get_subrepo_relpath(non_monorepo) == ""

destroy(non_monorepo)

monorepo_project_a = Repo.init(tmp_dir / "project_a", subdir=True)
monorepo_project_a.fs = gfs
monorepo_project_a.root_dir = "/project_a"

assert get_subrepo_relpath(monorepo_project_a) == "project_a"

monorepo_project_b = Repo.init(tmp_dir / "subdir" / "project_b", subdir=True)
monorepo_project_b.fs = gfs
monorepo_project_b.root_dir = "/subdir/project_b"

assert get_subrepo_relpath(monorepo_project_b) == "subdir/project_b"


@pytest.mark.studio
def test_get_dvc_experiment_parent_data(M, tmp_dir, scm, dvc):
parent_shas = [scm.get_rev()]

for i in range(5):
tmp_dir.scm_gen({"metrics.json": json.dumps({"metric": i})}, commit=f"step {i}")
parent_shas.insert(0, scm.get_rev())

title = "a final commit with a fairly long message"
message = f"{title}\nthat is split over two lines"

tmp_dir.scm_gen({"metrics.json": json.dumps({"metric": 100})}, commit=message)

head_sha = scm.get_rev()

assert isinstance(head_sha, str)
assert head_sha not in parent_shas

dvc_experiment_parent_data = get_dvc_experiment_parent_data(dvc, head_sha)

assert dvc_experiment_parent_data is not None
assert isinstance(dvc_experiment_parent_data["date"], str)

assert dvc_experiment_parent_data == {
"author": {
"email": "dvctester@example.com",
"name": "DVC Tester",
},
"date": M.any,
"message": message,
"parent_shas": parent_shas,
"title": title,
"sha": head_sha,
}, (
"up to 100 parent_shas are sent "
"to Studio these are used to identify where to insert the parent "
"information if the baseline_rev does not exist in the Studio DB"
)
48 changes: 0 additions & 48 deletions tests/unit/repo/test_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,51 +135,3 @@ def test_dynamic_cache_initialization(tmp_dir, scm):
dvc.close()

Repo(str(tmp_dir)).close()


def test_monorepo_relpath(tmp_dir, scm):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see https://github.com/iterative/dvc/pull/10291/files#r1482381232 (moved these along with the method)

from dvc.repo.destroy import destroy

tmp_dir.gen({"project_a": {}, "subdir/project_b": {}})

non_monorepo = Repo.init(tmp_dir)
assert non_monorepo.subrepo_relpath == ""

destroy(non_monorepo)

monorepo_project_a = Repo.init(tmp_dir / "project_a", subdir=True)

assert monorepo_project_a.subrepo_relpath == "project_a"

monorepo_project_b = Repo.init(tmp_dir / "subdir" / "project_b", subdir=True)

assert monorepo_project_b.subrepo_relpath == "subdir/project_b"


def test_virtual_monorepo_relpath(tmp_dir, scm):
from dvc.fs.git import GitFileSystem
from dvc.repo.destroy import destroy

tmp_dir.gen({"project_a": {}, "subdir/project_b": {}})
scm.commit("initial commit")
gfs = GitFileSystem(scm=scm, rev="master")

non_monorepo = Repo.init(tmp_dir)
non_monorepo.fs = gfs
non_monorepo.root_dir = "/"

assert non_monorepo.subrepo_relpath == ""

destroy(non_monorepo)

monorepo_project_a = Repo.init(tmp_dir / "project_a", subdir=True)
monorepo_project_a.fs = gfs
monorepo_project_a.root_dir = "/project_a"

assert monorepo_project_a.subrepo_relpath == "project_a"

monorepo_project_b = Repo.init(tmp_dir / "subdir" / "project_b", subdir=True)
monorepo_project_b.fs = gfs
monorepo_project_b.root_dir = "/subdir/project_b"

assert monorepo_project_b.subrepo_relpath == "subdir/project_b"
Loading