add more info into dataset summary
- replace dataset_byte_size with blobs_byte_size
- add increased_blobs_byte_size into DatasetSummary
tianweidut committed Mar 30, 2023
1 parent aef24af commit ddb4539
Showing 10 changed files with 59 additions and 55 deletions.
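Before the file-by-file diff, a rough sketch of the manifest schema change this commit makes (hypothetical values; the key layout follows the diffs and tests below):

    # Old manifests recorded a single top-level byte size:
    old_manifest = {
        "dataset_byte_size": 1024,  # removed by this commit
    }

    # New manifests nest size bookkeeping inside dataset_summary:
    new_manifest = {
        "dataset_summary": {
            "rows": 10,
            "updated_rows": 10,
            "deleted_rows": 0,
            "blobs_byte_size": 1024,            # total size of all data blobs
            "increased_blobs_byte_size": 1024,  # blobs added by this version
        },
    }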
25 changes: 17 additions & 8 deletions client/starwhale/api/_impl/dataset/model.py
@@ -159,9 +159,15 @@ def __init__(
if origin_uri_exists:
_summary = self.__loading_core_dataset.summary()
# TODO: raise none summary exception for existed dataset
self._total_rows = 0 if _summary is None else _summary.rows
if _summary is None:
self._total_rows = 0
self._total_blobs_size = 0
else:
self._total_rows = _summary.rows
self._total_blobs_size = _summary.blobs_byte_size
else:
self._total_rows = 0
self._total_blobs_size = 0

def _auto_complete_version(self, version: str) -> str:
version = version.strip()
@@ -660,7 +666,9 @@ def _dump_manifest() -> Path:
if self._dataset_builder is None:
raise RuntimeError("failed to commit, because dataset builder is None")

_signs = [str(m) for m in self._dataset_builder.signature_bins_meta]
increased_blobs_size = sum(
[m.size for m in self._dataset_builder.signature_bins_meta]
)

_manifest = {
"build": {
@@ -670,12 +678,13 @@
"version": self._pending_commit_version,
"related_datastore_timestamp": "", # TODO: get timestamp from datastore
CREATED_AT_KEY: now_str(),
"append_signs": _signs,
"dataset_summary": {
"rows": self._dataset_builder.calculate_rows_cnt(), # maybe slow
"updated_rows": self._updated_rows_by_commit,
"deleted_rows": self._deleted_rows_by_commit,
},
"dataset_summary": DatasetSummary(
rows=self._dataset_builder.calculate_rows_cnt(), # maybe slow
updated_rows=self._updated_rows_by_commit,
deleted_rows=self._deleted_rows_by_commit,
blobs_byte_size=self._total_blobs_size + increased_blobs_size,
increased_blobs_byte_size=increased_blobs_size,
).asdict(),
"message": message,
}

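Read in isolation, the size bookkeeping above works roughly like this (a minimal sketch; _BinMeta is a hypothetical stand-in for the builder's signature_bins_meta entries, which the diff shows expose a size attribute):

    from dataclasses import dataclass

    @dataclass
    class _BinMeta:  # hypothetical stand-in for one signature bin's metadata
        size: int

    signature_bins_meta = [_BinMeta(4096), _BinMeta(4096)]
    total_blobs_size = 8192  # blobs_byte_size loaded from the previous version's summary

    # Size of the blobs written by this commit alone:
    increased_blobs_size = sum(m.size for m in signature_bins_meta)

    # The manifest stores both the new running total and the delta:
    blobs_byte_size = total_blobs_size + increased_blobs_size
    assert (increased_blobs_size, blobs_byte_size) == (8192, 16384)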
4 changes: 3 additions & 1 deletion client/starwhale/base/cloud.py
@@ -283,7 +283,9 @@ def get_bundle_size_from_resp(self, typ: str, item: t.Dict) -> int:
return default_size

if typ == "dataset":
return int(meta.get("dataset_byte_size", default_size))
return int(
meta.get("dataset_summary", {}).get("blobs_byte_size", default_size)
)
if typ == "runtime":
# no size info in meta for now
return default_size
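A minimal sketch of the lookup this gives get_bundle_size_from_resp, assuming meta is the dict parsed from the item's YAML (values mirror the updated test below):

    default_size = 7

    # New-style payload: the size lives under dataset_summary.
    meta = {"dataset_summary": {"blobs_byte_size": 8}}
    assert int(meta.get("dataset_summary", {}).get("blobs_byte_size", default_size)) == 8

    # Old-style payload: no nested summary, so the default wins.
    legacy_meta = {"dataset_byte_size": 8}
    assert (
        int(legacy_meta.get("dataset_summary", {}).get("blobs_byte_size", default_size))
        == default_size
    )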
4 changes: 2 additions & 2 deletions client/starwhale/core/dataset/model.py
@@ -209,7 +209,7 @@ def history(
dict(
name=self.name,
version=_bf.version,
size=_manifest.get("dataset_byte_size", 0),
size=_manifest.get("blobs_byte_size", 0),
created_at=_manifest[CREATED_AT_KEY],
tags=_bf.tags,
path=_bf.path,
@@ -269,7 +269,7 @@ def list(
dict(
name=_bf.name,
version=_bf.version,
size=_manifest.get("dataset_byte_size", 0),
size=_manifest.get("dataset_summary", {}).get("blobs_byte_size", 0),
created_at=_manifest[CREATED_AT_KEY],
is_removed=_bf.is_removed,
path=_bf.path,
20 changes: 10 additions & 10 deletions client/starwhale/core/dataset/type.py
@@ -944,25 +944,25 @@ class DatasetSummary(ASDictMixin):
def __init__(
self,
rows: int = 0,
increased_rows: int = 0,
data_byte_size: int = 0,
updated_rows: int = 0,
deleted_rows: int = 0,
blobs_byte_size: int = 0,
increased_blobs_byte_size: int = 0,
**kw: t.Any,
) -> None:
self.rows = rows
self.increased_rows = increased_rows
self.unchanged_rows = rows - increased_rows
self.data_byte_size = data_byte_size
# TODO: cleanup expired increased_rows, unchanged_rows, data_byte_size fields
self.updated_rows = kw.get("updated_rows", 0)
self.deleted_rows = kw.get("deleted_rows", 0)
self.updated_rows = updated_rows
self.deleted_rows = deleted_rows
self.blobs_byte_size = blobs_byte_size
self.increased_blobs_byte_size = increased_blobs_byte_size

def __str__(self) -> str:
return f"Dataset Summary: rows({self.rows})"

def __repr__(self) -> str:
return (
f"Dataset Summary: rows({self.rows}, increased: {self.increased_rows}), "
f"size(data:{self.data_byte_size})"
f"Dataset Summary: rows(total: {self.rows}, updated: {self.updated_rows}, deleted: {self.deleted_rows}), "
f"size(blobs:{self.blobs_byte_size})"
)


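A short usage sketch of the reworked class (assuming ASDictMixin.asdict serializes the instance attributes, which matches the manifest assertions in the tests below):

    summary = DatasetSummary(
        rows=10,
        updated_rows=10,
        deleted_rows=0,
        blobs_byte_size=4096,
        increased_blobs_byte_size=4096,
    )
    print(repr(summary))
    # -> Dataset Summary: rows(total: 10, updated: 10, deleted: 0), size(blobs:4096)

    # Manifests written before this commit may still carry the expired keys
    # (data_byte_size, increased_rows); they are absorbed by **kw and ignored.
    legacy = DatasetSummary(rows=5, data_byte_size=123, increased_rows=5)
    assert legacy.blobs_byte_size == 0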
4 changes: 2 additions & 2 deletions client/starwhale/core/project/view.py
@@ -115,7 +115,7 @@ def _show_objects(objects: t.List[t.Dict[str, t.Any]], typ: str) -> Tree:
# TODO: add model version for every version
_size = _o["files"][0]["size"]
else:
_size = pretty_bytes(_v["meta"]["dataset_byte_size"])
_size = pretty_bytes(_v["meta"]["blobs_byte_size"])

otree.add(
f"[{_v['id']}][green]{_v[_k]}[/] :timer_clock: {_v['created_at']} :dizzy:{_size}"
@@ -176,7 +176,7 @@ def _show_objects(objects: t.List[t.Dict[str, t.Any]], typ: str) -> Tree:
# TODO: add model version for every version
_size = _o["files"][0]["size"]
else:
_size = pretty_bytes(_v["meta"]["dataset_byte_size"])
_size = pretty_bytes(_v["meta"]["blobs_byte_size"])

otree.add(
f"[{_v['id']}][green]{_v[_k]}[/] :timer_clock: {_v['created_at']} :dizzy:{_size}"
2 changes: 1 addition & 1 deletion client/tests/base/test_cloud.py
@@ -19,7 +19,7 @@ def test_get_bundle_size_from_resp(self):
size = ins.get_bundle_size_from_resp("whatever", item)
assert size == 7

meta = {"dataset_byte_size": 8}
meta = {"dataset_summary": {"blobs_byte_size": 8}}
item = {"meta": yaml.safe_dump(meta)}
size = ins.get_bundle_size_from_resp("dataset", item)
assert size == 8
1 change: 0 additions & 1 deletion client/tests/core/test_dataset.py
@@ -284,7 +284,6 @@ def test_head(
) -> None:
m_summary.return_value = DatasetSummary(
rows=2,
increased_rows=2,
)
m_scan.return_value = [
TabularDatasetRow(
20 changes: 18 additions & 2 deletions client/tests/sdk/test_dataset_sdk.py
@@ -33,6 +33,7 @@
BoundingBox,
DatasetSummary,
GrayscaleImage,
D_ALIGNMENT_SIZE,
COCOObjectAnnotation,
)
from starwhale.core.dataset.tabular import TabularDatasetInfo
@@ -806,6 +807,20 @@ def test_manifest(self) -> None:
m = empty_ds.manifest()
assert m == {}

def test_summary(self) -> None:
existed_ds_uri = self._init_simple_dataset_with_str_id()
ds = dataset(existed_ds_uri)
summary = ds.summary()
assert summary is not None
assert summary.rows == len(ds)
assert summary.updated_rows == 10
assert summary.deleted_rows == 0
assert (
summary.blobs_byte_size
== summary.increased_blobs_byte_size
== 10 * D_ALIGNMENT_SIZE
)

def test_create_dataset(self) -> None:
existed_ds_uri = self._init_simple_dataset_with_str_id()

@@ -924,7 +939,7 @@ def test_copy(self, rm: Mocker) -> None:
rm.request(
HTTPMethod.POST,
"http://1.1.1.1/api/v1/datastore/updateTable",
json={"data": "datastore-revision"},
json={"data": "datastore_revision"},
)

rm.request(
@@ -984,13 +999,14 @@ def test_commit_from_empty(self) -> None:
/ "_manifest.yaml"
)
manifest = load_yaml(manifest_path)
assert manifest["append_signs"] == []
assert "created_at" in manifest
assert "related_datastore_timestamp" in manifest
assert manifest["dataset_summary"] == {
"deleted_rows": 0,
"rows": 1,
"updated_rows": 1,
"blobs_byte_size": 0,
"increased_blobs_byte_size": 0,
}
assert manifest["message"] == commit_msg
assert manifest["version"] == ds.loading_version
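The 10 * D_ALIGNMENT_SIZE expectation in test_summary above presumably reflects each of the dataset's ten rows contributing one blob padded to the alignment boundary; this is an assumption about the builder's packing, using a hypothetical alignment value:

    D_ALIGNMENT_SIZE = 4096  # hypothetical; the real constant is imported from starwhale

    def aligned(size: int, alignment: int = D_ALIGNMENT_SIZE) -> int:
        # Round a blob's size up to the next alignment boundary.
        return ((size + alignment - 1) // alignment) * alignment

    # Ten small rows, each padded to one alignment unit:
    assert sum(aligned(100) for _ in range(10)) == 10 * D_ALIGNMENT_SIZE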
4 changes: 0 additions & 4 deletions client/tests/sdk/test_evaluation.py
@@ -163,10 +163,8 @@ def test_ppl(
_run_dir = _logdir / RunSubDirType.RUNLOG / "ppl" / "0"
_status_dir = _run_dir / RunSubDirType.STATUS

# mock dataset
m_summary.return_value = DatasetSummary(
rows=1,
increased_rows=1,
)

fname = "data_ubyte_0.swds_bin"
@@ -239,10 +237,8 @@ def cmp(self, _data_loader: t.Any) -> t.Any:

assert label_data == data[0]["ds_data"]["label"]

# mock dataset
m_summary.return_value = DatasetSummary(
rows=1,
increased_rows=1,
)

fname = "data_ubyte_0.swds_bin"
30 changes: 6 additions & 24 deletions client/tests/sdk/test_loader.py
@@ -43,10 +43,7 @@ def setUp(self) -> None:
@patch("starwhale.core.dataset.model.StandaloneDataset.summary")
@patch("starwhale.api._impl.wrapper.Dataset.scan_id")
def test_range_match(self, m_scan_id: MagicMock, m_summary: MagicMock) -> None:
m_summary.return_value = DatasetSummary(
include_user_raw=True,
include_link=False,
)
m_summary.return_value = DatasetSummary(rows=1)
m_scan_id.return_value = [{"id": "path/0"}]
consumption = get_dataset_consumption(
self.dataset_uri,
@@ -73,10 +70,7 @@
def test_user_raw_local_store(
self, m_scan: MagicMock, m_scan_id: MagicMock, m_summary: MagicMock
) -> None:
m_summary.return_value = DatasetSummary(
include_user_raw=True,
include_link=False,
)
m_summary.return_value = DatasetSummary(rows=1)
m_scan_id.return_value = [{"id": "path/0"}]

consumption = get_dataset_consumption(self.dataset_uri, session_id="1")
@@ -163,10 +157,7 @@ def test_user_raw_remote_store(
with tempfile.TemporaryDirectory() as tmpdirname:
config._config = {}
os.environ["SW_CLI_CONFIG"] = tmpdirname + "/config.yaml"
m_summary.return_value = DatasetSummary(
include_user_raw=True,
include_link=True,
)
m_summary.return_value = DatasetSummary(rows=4)
m_scan_id.return_value = [{"id": i} for i in range(0, 4)]

snapshot_workdir = DatasetStorage(self.dataset_uri).snapshot_workdir
@@ -331,10 +322,7 @@ def test_swds_bin_s3(
"http://127.0.0.1:1234/api/v1/project/self",
json={"data": {"id": 1, "name": "project"}},
)
m_summary.return_value = DatasetSummary(
include_user_raw=False,
include_link=False,
)
m_summary.return_value = DatasetSummary(rows=1)
m_scan_id.return_value = [{"id": 0}]
version = "1122334455667788"
dataset_uri = URI(
@@ -507,10 +495,7 @@ def test_remote_batch_sign(
m_scan_batch: MagicMock,
m_summary: MagicMock,
) -> None:
m_summary.return_value = DatasetSummary(
include_user_raw=True,
include_link=False,
)
m_summary.return_value = DatasetSummary(rows=4)
tdsc = m_sc()
tdsc.get_scan_range.side_effect = [["a", "d"], None]
tdsc.batch_size = 20
@@ -692,10 +677,7 @@ def test_data_row_exceptions(self) -> None:
@patch("starwhale.api._impl.dataset.loader.TabularDataset.scan")
def test_loader_with_cache(self, m_scan: MagicMock, m_summary: MagicMock) -> None:
rows_cnt = 100
m_summary.return_value = DatasetSummary(
rows=rows_cnt,
increased_rows=rows_cnt,
)
m_summary.return_value = DatasetSummary(rows=1)
fname = "data_ubyte_0.swds_bin"
m_scan.return_value = [
TabularDatasetRow(
