diff --git a/client/starwhale/api/_impl/dataset/model.py b/client/starwhale/api/_impl/dataset/model.py
index c060598be7..5a451fd546 100644
--- a/client/starwhale/api/_impl/dataset/model.py
+++ b/client/starwhale/api/_impl/dataset/model.py
@@ -159,9 +159,15 @@ def __init__(
         if origin_uri_exists:
             _summary = self.__loading_core_dataset.summary()
             # TODO: raise none summary exception for existed dataset
-            self._total_rows = 0 if _summary is None else _summary.rows
+            if _summary is None:
+                self._total_rows = 0
+                self._total_blobs_size = 0
+            else:
+                self._total_rows = _summary.rows
+                self._total_blobs_size = _summary.blobs_byte_size
         else:
             self._total_rows = 0
+            self._total_blobs_size = 0

     def _auto_complete_version(self, version: str) -> str:
         version = version.strip()
@@ -660,7 +666,9 @@ def _dump_manifest() -> Path:
            if self._dataset_builder is None:
                raise RuntimeError("failed to commit, because dataset builder is None")

-            _signs = [str(m) for m in self._dataset_builder.signature_bins_meta]
+            increased_blobs_size = sum(
+                [m.size for m in self._dataset_builder.signature_bins_meta]
+            )

            _manifest = {
                "build": {
@@ -670,12 +678,13 @@
                "version": self._pending_commit_version,
                "related_datastore_timestamp": "",  # TODO: get timestamp from datastore
                CREATED_AT_KEY: now_str(),
-                "append_signs": _signs,
-                "dataset_summary": {
-                    "rows": self._dataset_builder.calculate_rows_cnt(),  # maybe slow
-                    "updated_rows": self._updated_rows_by_commit,
-                    "deleted_rows": self._deleted_rows_by_commit,
-                },
+                "dataset_summary": DatasetSummary(
+                    rows=self._dataset_builder.calculate_rows_cnt(),  # maybe slow
+                    updated_rows=self._updated_rows_by_commit,
+                    deleted_rows=self._deleted_rows_by_commit,
+                    blobs_byte_size=self._total_blobs_size + increased_blobs_size,
+                    increased_blobs_byte_size=increased_blobs_size,
+                ).asdict(),
                "message": message,
            }
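Reviewer note: `commit()` now derives blob accounting from `signature_bins_meta` instead of dumping raw sign strings. The sum of the per-blob sizes becomes `increased_blobs_byte_size`, and the running total carried in `self._total_blobs_size` (seeded from the loaded dataset's summary) becomes `blobs_byte_size`. A minimal, self-contained sketch of that bookkeeping, with a hypothetical `BinMeta` standing in for the real `signature_bins_meta` entries:

```python
from dataclasses import dataclass
import typing as t


@dataclass
class BinMeta:
    """Hypothetical stand-in for one entry of `signature_bins_meta`."""

    name: str
    size: int  # byte size of one signed blob file


def blobs_size_after_commit(
    existing_blobs_size: int, bins_meta: t.List[BinMeta]
) -> t.Tuple[int, int]:
    """Return (blobs_byte_size, increased_blobs_byte_size) for the manifest."""
    increased = sum(m.size for m in bins_meta)
    return existing_blobs_size + increased, increased


total, increased = blobs_size_after_commit(
    existing_blobs_size=1024,
    bins_meta=[BinMeta("b1", 4096), BinMeta("b2", 4096)],
)
assert (total, increased) == (9216, 8192)
```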
diff --git a/client/starwhale/base/cloud.py b/client/starwhale/base/cloud.py
index feef6b98df..d8fc5c4fed 100644
--- a/client/starwhale/base/cloud.py
+++ b/client/starwhale/base/cloud.py
@@ -283,7 +283,9 @@ def get_bundle_size_from_resp(self, typ: str, item: t.Dict) -> int:
            return default_size

        if typ == "dataset":
-            return int(meta.get("dataset_byte_size", default_size))
+            return int(
+                meta.get("dataset_summary", {}).get("blobs_byte_size", default_size)
+            )
        if typ == "runtime":
            # no size info in meta for now
            return default_size
diff --git a/client/starwhale/core/dataset/model.py b/client/starwhale/core/dataset/model.py
index 4ee7e720cd..8ba75ad1c1 100644
--- a/client/starwhale/core/dataset/model.py
+++ b/client/starwhale/core/dataset/model.py
@@ -209,7 +209,7 @@ def history(
                dict(
                    name=self.name,
                    version=_bf.version,
-                    size=_manifest.get("dataset_byte_size", 0),
+                    size=_manifest.get("blobs_byte_size", 0),
                    created_at=_manifest[CREATED_AT_KEY],
                    tags=_bf.tags,
                    path=_bf.path,
@@ -269,7 +269,7 @@ def list(
                dict(
                    name=_bf.name,
                    version=_bf.version,
-                    size=_manifest.get("dataset_byte_size", 0),
+                    size=_manifest.get("dataset_summary", {}).get("blobs_byte_size", 0),
                    created_at=_manifest[CREATED_AT_KEY],
                    is_removed=_bf.is_removed,
                    path=_bf.path,
diff --git a/client/starwhale/core/dataset/type.py b/client/starwhale/core/dataset/type.py
index 5fda4e6371..9e2a4c841b 100644
--- a/client/starwhale/core/dataset/type.py
+++ b/client/starwhale/core/dataset/type.py
@@ -944,25 +944,25 @@ class DatasetSummary(ASDictMixin):
    def __init__(
        self,
        rows: int = 0,
-        increased_rows: int = 0,
-        data_byte_size: int = 0,
+        updated_rows: int = 0,
+        deleted_rows: int = 0,
+        blobs_byte_size: int = 0,
+        increased_blobs_byte_size: int = 0,
        **kw: t.Any,
    ) -> None:
        self.rows = rows
-        self.increased_rows = increased_rows
-        self.unchanged_rows = rows - increased_rows
-        self.data_byte_size = data_byte_size
-        # TODO: cleanup expired increased_rows, unchanged_rows, data_byte_size fields
-        self.updated_rows = kw.get("updated_rows", 0)
-        self.deleted_rows = kw.get("deleted_rows", 0)
+        self.updated_rows = updated_rows
+        self.deleted_rows = deleted_rows
+        self.blobs_byte_size = blobs_byte_size
+        self.increased_blobs_byte_size = increased_blobs_byte_size

    def __str__(self) -> str:
        return f"Dataset Summary: rows({self.rows})"

    def __repr__(self) -> str:
        return (
-            f"Dataset Summary: rows({self.rows}, increased: {self.increased_rows}), "
-            f"size(data:{self.data_byte_size})"
+            f"Dataset Summary: rows(total: {self.rows}, updated: {self.updated_rows}, deleted: {self.deleted_rows}), "
+            f"size(blobs:{self.blobs_byte_size})"
        )
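Reviewer note: `DatasetSummary` drops the expired `increased_rows`/`unchanged_rows`/`data_byte_size` fields in favor of `updated_rows`, `deleted_rows`, and the two blob-size fields. A minimal round-trip sketch; the dict shape here matches what the new commit path writes into the manifest, and it assumes `ASDictMixin.asdict()` simply mirrors the constructor fields:

```python
from starwhale.core.dataset.type import DatasetSummary

# Shape written under "dataset_summary" in _manifest.yaml (see model.py above).
summary_dict = {
    "rows": 10,
    "updated_rows": 10,
    "deleted_rows": 0,
    "blobs_byte_size": 40960,
    "increased_blobs_byte_size": 40960,
}

# The retained **kw keeps construction tolerant of extra keys, e.g. when
# loading a manifest written by an older client.
summary = DatasetSummary(**summary_dict)
assert summary.blobs_byte_size == summary.increased_blobs_byte_size == 40960
```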
diff --git a/client/starwhale/core/project/view.py b/client/starwhale/core/project/view.py
index 5d6898ceff..36df7fc5d4 100644
--- a/client/starwhale/core/project/view.py
+++ b/client/starwhale/core/project/view.py
@@ -115,7 +115,7 @@ def _show_objects(objects: t.List[t.Dict[str, t.Any]], typ: str) -> Tree:
                    # TODO: add model version for every version
                    _size = _o["files"][0]["size"]
                else:
-                    _size = pretty_bytes(_v["meta"]["dataset_byte_size"])
+                    _size = pretty_bytes(_v["meta"]["blobs_byte_size"])

                otree.add(
                    f"[{_v['id']}][green]{_v[_k]}[/] :timer_clock: {_v['created_at']} :dizzy:{_size}"
@@ -176,7 +176,7 @@ def _show_objects(objects: t.List[t.Dict[str, t.Any]], typ: str) -> Tree:
                    # TODO: add model version for every version
                    _size = _o["files"][0]["size"]
                else:
-                    _size = pretty_bytes(_v["meta"]["dataset_byte_size"])
+                    _size = pretty_bytes(_v["meta"]["blobs_byte_size"])

                otree.add(
                    f"[{_v['id']}][green]{_v[_k]}[/] :timer_clock: {_v['created_at']} :dizzy:{_size}"
diff --git a/client/tests/base/test_cloud.py b/client/tests/base/test_cloud.py
index b983ed346f..c0bc4cfdd0 100644
--- a/client/tests/base/test_cloud.py
+++ b/client/tests/base/test_cloud.py
@@ -19,7 +19,7 @@ def test_get_bundle_size_from_resp(self):
        size = ins.get_bundle_size_from_resp("whatever", item)
        assert size == 7

-        meta = {"dataset_byte_size": 8}
+        meta = {"dataset_summary": {"blobs_byte_size": 8}}
        item = {"meta": yaml.safe_dump(meta)}
        size = ins.get_bundle_size_from_resp("dataset", item)
        assert size == 8
diff --git a/client/tests/core/test_dataset.py b/client/tests/core/test_dataset.py
index 52addd17bb..055dc37e0c 100644
--- a/client/tests/core/test_dataset.py
+++ b/client/tests/core/test_dataset.py
@@ -284,7 +284,6 @@ def test_head(
    ) -> None:
        m_summary.return_value = DatasetSummary(
            rows=2,
-            increased_rows=2,
        )
        m_scan.return_value = [
            TabularDatasetRow(
diff --git a/client/tests/sdk/test_dataset_sdk.py b/client/tests/sdk/test_dataset_sdk.py
index 9ba43856d3..50d55ea952 100644
--- a/client/tests/sdk/test_dataset_sdk.py
+++ b/client/tests/sdk/test_dataset_sdk.py
@@ -801,6 +801,16 @@ def test_manifest(self) -> None:
        m = empty_ds.manifest()
        assert m == {}

+    def test_summary(self) -> None:
+        existed_ds_uri = self._init_simple_dataset_with_str_id()
+        ds = dataset(existed_ds_uri)
+        summary = ds.summary()
+        assert summary is not None
+        assert summary.rows == len(ds)
+        assert summary.updated_rows == 10
+        assert summary.deleted_rows == 0
+        assert summary.blobs_byte_size == summary.increased_blobs_byte_size == 40960
+
    def test_create_dataset(self) -> None:
        existed_ds_uri = self._init_simple_dataset_with_str_id()
@@ -978,13 +988,14 @@ def test_commit_from_empty(self) -> None:
            / "_manifest.yaml"
        )
        manifest = load_yaml(manifest_path)
-        assert manifest["append_signs"] == []
        assert "created_at" in manifest
        assert "related_datastore_timestamp" in manifest
        assert manifest["dataset_summary"] == {
            "deleted_rows": 0,
            "rows": 1,
            "updated_rows": 1,
+            "blobs_byte_size": 0,
+            "increased_blobs_byte_size": 0,
        }
        assert manifest["message"] == commit_msg
        assert manifest["version"] == ds.loading_version
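Reviewer note: several call sites now read the size through the nested `dataset_summary` mapping (`cloud.py`, `list()` above, and the cloud test here), each with its own inline `.get(...).get(...)` chain. A hypothetical helper — not part of this PR — showing the lookup pattern and its default handling:

```python
import typing as t


def blobs_size_from_manifest(manifest: t.Dict[str, t.Any], default: int = 0) -> int:
    # New manifests nest the size under "dataset_summary" -> "blobs_byte_size";
    # missing keys fall back to the caller-supplied default.
    summary = manifest.get("dataset_summary", {})
    return int(summary.get("blobs_byte_size", default))


manifest = {
    "dataset_summary": {
        "rows": 1,
        "updated_rows": 1,
        "deleted_rows": 0,
        "blobs_byte_size": 0,
        "increased_blobs_byte_size": 0,
    }
}
assert blobs_size_from_manifest(manifest, default=-1) == 0
assert blobs_size_from_manifest({}, default=-1) == -1
```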
diff --git a/client/tests/sdk/test_evaluation.py b/client/tests/sdk/test_evaluation.py
index e5c0a5878a..79be829739 100644
--- a/client/tests/sdk/test_evaluation.py
+++ b/client/tests/sdk/test_evaluation.py
@@ -163,10 +163,8 @@ def test_ppl(
        _run_dir = _logdir / RunSubDirType.RUNLOG / "ppl" / "0"
        _status_dir = _run_dir / RunSubDirType.STATUS

-        # mock dataset
        m_summary.return_value = DatasetSummary(
            rows=1,
-            increased_rows=1,
        )

        fname = "data_ubyte_0.swds_bin"
@@ -239,10 +237,8 @@ def cmp(self, _data_loader: t.Any) -> t.Any:
                assert label_data == data[0]["ds_data"]["label"]

-        # mock dataset
        m_summary.return_value = DatasetSummary(
            rows=1,
-            increased_rows=1,
        )

        fname = "data_ubyte_0.swds_bin"
diff --git a/client/tests/sdk/test_loader.py b/client/tests/sdk/test_loader.py
index 159ec95d99..c92ab79550 100644
--- a/client/tests/sdk/test_loader.py
+++ b/client/tests/sdk/test_loader.py
@@ -43,10 +43,7 @@ def setUp(self) -> None:
    @patch("starwhale.core.dataset.model.StandaloneDataset.summary")
    @patch("starwhale.api._impl.wrapper.Dataset.scan_id")
    def test_range_match(self, m_scan_id: MagicMock, m_summary: MagicMock) -> None:
-        m_summary.return_value = DatasetSummary(
-            include_user_raw=True,
-            include_link=False,
-        )
+        m_summary.return_value = DatasetSummary(rows=1)
        m_scan_id.return_value = [{"id": "path/0"}]
        consumption = get_dataset_consumption(
            self.dataset_uri,
@@ -73,10 +70,7 @@ def test_range_match(self, m_scan_id: MagicMock, m_summary: MagicMock) -> None:
    def test_user_raw_local_store(
        self, m_scan: MagicMock, m_scan_id: MagicMock, m_summary: MagicMock
    ) -> None:
-        m_summary.return_value = DatasetSummary(
-            include_user_raw=True,
-            include_link=False,
-        )
+        m_summary.return_value = DatasetSummary(rows=1)
        m_scan_id.return_value = [{"id": "path/0"}]

        consumption = get_dataset_consumption(self.dataset_uri, session_id="1")
@@ -163,10 +157,7 @@ def test_user_raw_remote_store(
        with tempfile.TemporaryDirectory() as tmpdirname:
            config._config = {}
            os.environ["SW_CLI_CONFIG"] = tmpdirname + "/config.yaml"
-            m_summary.return_value = DatasetSummary(
-                include_user_raw=True,
-                include_link=True,
-            )
+            m_summary.return_value = DatasetSummary(rows=4)
            m_scan_id.return_value = [{"id": i} for i in range(0, 4)]

            snapshot_workdir = DatasetStorage(self.dataset_uri).snapshot_workdir
@@ -331,10 +322,7 @@ def test_swds_bin_s3(
            "http://127.0.0.1:1234/api/v1/project/self",
            json={"data": {"id": 1, "name": "project"}},
        )
-        m_summary.return_value = DatasetSummary(
-            include_user_raw=False,
-            include_link=False,
-        )
+        m_summary.return_value = DatasetSummary(rows=1)
        m_scan_id.return_value = [{"id": 0}]
        version = "1122334455667788"
        dataset_uri = URI(
@@ -507,10 +495,7 @@ def test_remote_batch_sign(
        m_scan_batch: MagicMock,
        m_summary: MagicMock,
    ) -> None:
-        m_summary.return_value = DatasetSummary(
-            include_user_raw=True,
-            include_link=False,
-        )
+        m_summary.return_value = DatasetSummary(rows=4)
        tdsc = m_sc()
        tdsc.get_scan_range.side_effect = [["a", "d"], None]
        tdsc.batch_size = 20
@@ -692,10 +677,7 @@ def test_data_row_exceptions(self) -> None:
    @patch("starwhale.api._impl.dataset.loader.TabularDataset.scan")
    def test_loader_with_cache(self, m_scan: MagicMock, m_summary: MagicMock) -> None:
        rows_cnt = 100
-        m_summary.return_value = DatasetSummary(
-            rows=rows_cnt,
-            increased_rows=rows_cnt,
-        )
+        m_summary.return_value = DatasetSummary(rows=1)
        fname = "data_ubyte_0.swds_bin"
        m_scan.return_value = [
            TabularDatasetRow(
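Reviewer note: the loader and evaluation tests previously built `DatasetSummary` with the removed `include_user_raw`/`include_link`/`increased_rows` kwargs. With the new `__init__`, such names fall into `**kw` and are dropped silently rather than raising `TypeError`, so rewriting the mocks against the real fields keeps them honest. A small sketch of the difference, following directly from the new constructor and `__repr__` above:

```python
from starwhale.core.dataset.type import DatasetSummary

# Old-style kwargs no longer map to attributes; they are swallowed by **kw.
stale = DatasetSummary(include_user_raw=True, include_link=False)
assert not hasattr(stale, "include_user_raw") and stale.rows == 0

# New-style construction, as the updated tests use:
fresh = DatasetSummary(rows=4, updated_rows=4, blobs_byte_size=2048)
assert repr(fresh) == (
    "Dataset Summary: rows(total: 4, updated: 4, deleted: 0), size(blobs:2048)"
)
```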