From 6010384dc237c77f317fda96bb9e7e5bed9f4418 Mon Sep 17 00:00:00 2001 From: tianweidut Date: Sat, 8 Oct 2022 10:47:23 +0800 Subject: [PATCH] add python sdk doc --- client/starwhale/__init__.py | 7 +- client/tests/sdk/test_loader.py | 9 +- client/tests/sdk/test_model.py | 3 +- docs/docs/reference/sdk/data_type.md | 3 + docs/docs/reference/sdk/dataset.md | 3 + docs/docs/reference/sdk/evaluation.md | 3 + docs/docs/reference/sdk/other.md | 3 + docs/docs/reference/sdk/overview.md | 3 + .../current/reference/cli/model.md | 2 +- .../current/reference/sdk/data_type.md | 588 ++++++++++++++++++ .../current/reference/sdk/dataset.md | 165 +++++ .../current/reference/sdk/evaluation.md | 195 ++++++ .../current/reference/sdk/other.md | 53 ++ .../current/reference/sdk/overview.md | 43 ++ docs/sidebars.js | 3 + 15 files changed, 1070 insertions(+), 13 deletions(-) create mode 100644 docs/docs/reference/sdk/data_type.md create mode 100644 docs/docs/reference/sdk/other.md create mode 100644 docs/docs/reference/sdk/overview.md create mode 100644 docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/data_type.md create mode 100644 docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/other.md create mode 100644 docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/overview.md diff --git a/client/starwhale/__init__.py b/client/starwhale/__init__.py index 5fd75ebbac..bc9e8def83 100644 --- a/client/starwhale/__init__.py +++ b/client/starwhale/__init__.py @@ -1,5 +1,6 @@ from starwhale.api.job import step, Context from starwhale.version import STARWHALE_VERSION as __version__ +from starwhale.base.uri import URI, URIType from starwhale.api.model import PipelineHandler, PPLResultIterator from starwhale.api.metric import multi_classification from starwhale.api.dataset import ( @@ -18,8 +19,6 @@ get_data_loader, LocalFSLinkAuth, DefaultS3LinkAuth, - SWDSBinDataLoader, - UserRawDataLoader, COCOObjectAnnotation, SWDSBinBuildExecutor, UserRawBuildExecutor, @@ 
-29,6 +28,8 @@ "__version__", "PipelineHandler", "multi_classification", + "URI", + "URIType", "step", "Context", "get_data_loader", @@ -41,8 +42,6 @@ "BuildExecutor", # SWDSBinBuildExecutor alias "UserRawBuildExecutor", "SWDSBinBuildExecutor", - "SWDSBinDataLoader", - "UserRawDataLoader", "Binary", "Text", "Audio", diff --git a/client/tests/sdk/test_loader.py b/client/tests/sdk/test_loader.py index 331d2160eb..f155a8bba5 100644 --- a/client/tests/sdk/test_loader.py +++ b/client/tests/sdk/test_loader.py @@ -4,13 +4,7 @@ from pyfakefs.fake_filesystem_unittest import TestCase -from starwhale import ( - MIMEType, - S3LinkAuth, - get_data_loader, - SWDSBinDataLoader, - UserRawDataLoader, -) +from starwhale import MIMEType, S3LinkAuth, get_data_loader from starwhale.consts import AUTH_ENV_FNAME, SWDSBackendType from starwhale.base.uri import URI from starwhale.utils.fs import ensure_dir, ensure_file @@ -22,6 +16,7 @@ LocalFSStorageBackend, ) from starwhale.core.dataset.tabular import TabularDatasetRow +from starwhale.api._impl.dataset.loader import SWDSBinDataLoader, UserRawDataLoader from .. import ROOT_DIR diff --git a/client/tests/sdk/test_model.py b/client/tests/sdk/test_model.py index f2058bc88c..26db9f30ab 100644 --- a/client/tests/sdk/test_model.py +++ b/client/tests/sdk/test_model.py @@ -7,7 +7,7 @@ from pyfakefs.fake_filesystem_unittest import TestCase -from starwhale import Context, get_data_loader, PipelineHandler, UserRawDataLoader +from starwhale import Context, get_data_loader, PipelineHandler from starwhale.consts import DEFAULT_PROJECT from starwhale.base.uri import URI from starwhale.utils.fs import ensure_dir @@ -22,6 +22,7 @@ from starwhale.core.dataset.type import MIMEType, ArtifactType, DatasetSummary from starwhale.core.dataset.store import DatasetStorage from starwhale.core.dataset.tabular import TabularDatasetRow +from starwhale.api._impl.dataset.loader import UserRawDataLoader from .. 
import ROOT_DIR diff --git a/docs/docs/reference/sdk/data_type.md b/docs/docs/reference/sdk/data_type.md new file mode 100644 index 0000000000..20214061bf --- /dev/null +++ b/docs/docs/reference/sdk/data_type.md @@ -0,0 +1,3 @@ +--- +title: Data Type +--- diff --git a/docs/docs/reference/sdk/dataset.md b/docs/docs/reference/sdk/dataset.md index e69de29bb2..8217dd99f2 100644 --- a/docs/docs/reference/sdk/dataset.md +++ b/docs/docs/reference/sdk/dataset.md @@ -0,0 +1,3 @@ +--- +title: Dataset Builder and Loader +--- diff --git a/docs/docs/reference/sdk/evaluation.md b/docs/docs/reference/sdk/evaluation.md index e69de29bb2..aaaf66fc2f 100644 --- a/docs/docs/reference/sdk/evaluation.md +++ b/docs/docs/reference/sdk/evaluation.md @@ -0,0 +1,3 @@ +--- +title: Model Evaluation +--- diff --git a/docs/docs/reference/sdk/other.md b/docs/docs/reference/sdk/other.md new file mode 100644 index 0000000000..a11bb625dc --- /dev/null +++ b/docs/docs/reference/sdk/other.md @@ -0,0 +1,3 @@ +--- +title: Other +--- diff --git a/docs/docs/reference/sdk/overview.md b/docs/docs/reference/sdk/overview.md new file mode 100644 index 0000000000..43b4d1a971 --- /dev/null +++ b/docs/docs/reference/sdk/overview.md @@ -0,0 +1,3 @@ +--- +title: Python SDK +--- diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/cli/model.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/cli/model.md index 5b4f4d8f78..2a1b7ddc00 100644 --- a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/cli/model.md +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/cli/model.md @@ -14,7 +14,7 @@ model命令提供适用于Standalone Instance和Cloud Instance的Starwhale Model model包含如下子命令: -|Command|Standalone|Cloud| +|命令|Standalone|Cloud| |-------|----------|-----| |`build`|✅|❌| |`copy`|✅|✅| diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/data_type.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/data_type.md new file mode 
100644 index 0000000000..319465d156 --- /dev/null +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/data_type.md @@ -0,0 +1,588 @@ +--- +title: 数据类型 +--- + +## starwhale.COCOObjectAnnotation + +提供COCO类型的定义,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L403)。 + +```python +COCOObjectAnnotation( + id: int, + image_id: int, + category_id: int, + segmentation: Union[t.List, t.Dict], + area: Union[float, int], + bbox: Union[BoundingBox, t.List[float]], + iscrowd: int, +) +``` + +|参数|说明| +|---|---| +|`id`|object id,一般为全局object的递增id| +|`image_id`|image id,一般为图片id| +|`category_id`|category id,一般为目标检测中类别的id| +|`segmentation`|物体轮廓表示,Polygon(多边形的点)或RLE格式| +|`area`|object面积| +|`bbox`|表示bounding box,可以为BoundingBox类型或float的列表| +|`iscrowd`|0表示是一个单独的object,1表示两个没有分开的object| + +### 使用示例 + +```python +def _make_coco_annotations( + self, mask_fpath: Path, image_id: int +) -> t.List[COCOObjectAnnotation]: + mask_img = PILImage.open(str(mask_fpath)) + + mask = np.array(mask_img) + object_ids = np.unique(mask)[1:] + binary_mask = mask == object_ids[:, None, None] + # TODO: tune permute without pytorch + binary_mask_tensor = torch.as_tensor(binary_mask, dtype=torch.uint8) + binary_mask_tensor = ( + binary_mask_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1) + ) + + coco_annotations = [] + for i in range(0, len(object_ids)): + _pos = np.where(binary_mask[i]) + _xmin, _ymin = float(np.min(_pos[1])), float(np.min(_pos[0])) + _xmax, _ymax = float(np.max(_pos[1])), float(np.max(_pos[0])) + _bbox = BoundingBox( + x=_xmin, y=_ymin, width=_xmax - _xmin, height=_ymax - _ymin + ) + + rle: t.Dict = coco_mask.encode(binary_mask_tensor[i].numpy()) # type: ignore + rle["counts"] = rle["counts"].decode("utf-8") + + coco_annotations.append( + COCOObjectAnnotation( + id=self.object_id, + image_id=image_id, + category_id=1, # PennFudan Dataset only has one class-PASPersonStanding + 
segmentation=rle, + area=_bbox.width * _bbox.height, + bbox=_bbox, + iscrowd=0, # suppose all instances are not crowd + ) + ) + self.object_id += 1 + + return coco_annotations +``` + +## starwhale.GrayscaleImage + +提供灰度图类型,比如MNIST中数字手写体图片,是 `Image` 类型的一个特例。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L301)。 + +```python +GrayscaleImage( + fp: _TArtifactFP = "", + display_name: str = "", + shape: Optional[_TShape] = None, + as_mask: bool = False, + mask_uri: str = "", +) +``` + +|参数|说明| +|---|---| +|`fp`|图片的路径、IO对象或文件内容的bytes| +|`display_name`|Dataset Viewer上展示的名字| +|`shape`|图片的Width和Height,channel默认为1| +|`as_mask`|是否作为Mask图片| +|`mask_uri`|Mask原图的URI| + +### 使用示例 + +```python +for i in range(0, min(data_number, label_number)): + _data = data_file.read(image_size) + _label = struct.unpack(">B", label_file.read(1))[0] + yield GrayscaleImage( + _data, + display_name=f"{i}", + shape=(height, width, 1), + ), {"label": _label} +``` + +### 函数 + +#### to_bytes + +```python +to_bytes(encoding: str = "utf-8") -> bytes +``` + +#### carry_raw_data + +```python +carry_raw_data() -> GrayscaleImage +``` + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +## starwhale.BoundingBox + +提供边界框类型,目前为 `LTWH` 格式,即 `left_x`, `top_y`, `width` 和 `height`。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L363)。 + +```python +BoundingBox( + x: float, + y: float, + width: float, + height: float +) +``` + +|参数|说明| +|---|---| +|`x`|left_x的坐标| +|`y`|top_y的坐标| +|`width`|边界框的宽度| +|`height`|边界框的高度| + +## starwhale.ClassLabel + +描述label的数量和类型,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L344)。 + +```python +ClassLabel( + names: List[Union[int, float, str]] +) +``` + +## starwhale.Image + 
+图片类型,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L267)。 + +```python +Image( + fp: _TArtifactFP = "", + display_name: str = "", + shape: Optional[_TShape] = None, + mime_type: Optional[MIMEType] = None, + as_mask: bool = False, + mask_uri: str = "", +) +``` + +|参数|说明| +|---|---| +|`fp`|图片的路径、IO对象或文件内容的bytes| +|`display_name`|Dataset Viewer上展示的名字| +|`shape`|图片的Width、Height和channel| +|`mime_type`|MIMEType支持的类型| +|`as_mask`|是否作为Mask图片| +|`mask_uri`|Mask原图的URI| + +### 使用示例 + +```python +import io +import typing as t +import pickle +from PIL import Image as PILImage +from starwhale import Image, MIMEType + +def _iter_item(paths: t.List[Path]) -> t.Generator[t.Tuple[t.Any, t.Dict], None, None]: + for path in paths: + with path.open("rb") as f: + content = pickle.load(f, encoding="bytes") + for data, label, filename in zip( + content[b"data"], content[b"labels"], content[b"filenames"] + ): + annotations = { + "label": label, + "label_display_name": dataset_meta["label_names"][label], + } + + image_array = data.reshape(3, 32, 32).transpose(1, 2, 0) + image_bytes = io.BytesIO() + PILImage.fromarray(image_array).save(image_bytes, format="PNG") + + yield Image( + fp=image_bytes.getvalue(), + display_name=filename.decode(), + shape=image_array.shape, + mime_type=MIMEType.PNG, + ), annotations + + +``` + +### 函数 + +#### to_bytes + +```python +to_bytes(encoding: str = "utf-8") -> bytes +``` + +#### carry_raw_data + +```python +carry_raw_data() -> Image +``` + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +## starwhale.Audio + +音频类型,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L324)。 + +```python +Audio( + fp: _TArtifactFP = "", + display_name: str = "", + shape: Optional[_TShape] = None, + mime_type: Optional[MIMEType] = None, +) +``` + +|参数|说明| +|---|---| 
+|`fp`|音频的路径、IO对象或文件内容的bytes| +|`display_name`|Dataset Viewer上展示的名字| +|`shape`|音频数据的shape| +|`mime_type`|MIMEType支持的类型| + +### 使用示例 + +```python +import typing as t +from starwhale import Audio, MIMEType + +def iter_item() -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + for path in validation_ds_paths: + with path.open() as f: + for item in f.readlines(): + item = item.strip() + if not item: + continue + + data_path = dataset_dir / item + data = Audio( + data_path, display_name=item, shape=(1,), mime_type=MIMEType.WAV + ) + + speaker_id, utterance_num = data_path.stem.split("_nohash_") + annotations = { + "label": data_path.parent.name, + "speaker_id": speaker_id, + "utterance_num": int(utterance_num), + } + yield data, annotations +``` + +### 函数 + +#### to_bytes + +```python +to_bytes(encoding: str = "utf-8") -> bytes +``` + +#### carry_raw_data + +```python +carry_raw_data() -> Audio +``` + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +## starwhale.Text + +文本类型,默认为 `utf-8` 格式。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L380)。 + +```python +Text( + content: str, + encoding: str = "utf-8", +) +``` + +|参数|说明| +|---|---| +|`content`|text内容| +|`encoding`|text的编码格式| + +### 使用示例 + +```python +import typing as t +from pathlib import Path +from starwhale import Text + +def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + root_dir = Path(__file__).parent.parent / "data" + + with (root_dir / "fra-test.txt").open("r") as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith("CC-BY"): + continue + + _data, _label, *_ = line.split("\t") + data = Text(_data, encoding="utf-8") + annotations = {"label": _label} + yield data, annotations +``` + +### 函数 + +#### to_bytes + +```python +to_bytes(encoding: str = "utf-8") -> bytes +``` + +#### carry_raw_data + +```python +carry_raw_data() -> Text 
+``` + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +#### to_str + +```python +to_str() -> str +``` + +## starwhale.Binary + +二进制类型,用bytes存储,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L258)。 + +```python +Binary( + fp: _TArtifactFP = "", + mime_type: MIMEType = MIMEType.UNDEFINED, +) +``` + +|参数|说明| +|---|---| +|`fp`|路径、IO对象或文件内容的bytes| +|`mime_type`|MIMEType支持的类型| + +### 函数 + +#### to_bytes + +```python +to_bytes(encoding: str = "utf-8") -> bytes +``` + +#### carry_raw_data + +```python +carry_raw_data() -> Binary +``` + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +## starwhale.Link +Link类型,用来制作 `remote-link` 和 `user-raw` 类型的数据集。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L432)。 + +```python +Link( + uri: str, + auth: Optional[LinkAuth] = DefaultS3LinkAuth, + offset: int = 0, + size: int = -1, + data_type: Optional[BaseArtifact] = None, + with_local_fs_data: bool = False, +) +``` + +|参数|说明| +|---|---| +|`uri`|原始数据的uri地址,目前支持localFS和S3两种协议| +|`auth`|Link Auth信息| +|`offset`|数据相对uri指向的文件偏移量| +|`size`|数据大小| +|`data_type`|Link指向的实际数据类型,目前支持 `Binary`, `Image`, `Text`, `Audio` 四种类型| +|`with_local_fs_data`|是否包含本地文件系统中的数据,用于表示user-raw格式的数据| + +### 使用示例 + +```python +import typing as t +import struct +from pathlib import Path + +from starwhale import Link, GrayscaleImage + +def iter_item() -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + root_dir = Path(__file__).parent.parent / "data" + data_fpath = root_dir / "t10k-images-idx3-ubyte" + label_fpath = root_dir / "t10k-labels-idx1-ubyte" + + with data_fpath.open("rb") as data_file, label_fpath.open("rb") as label_file: + _, data_number, height, width = struct.unpack(">IIII", data_file.read(16)) + _, label_number = struct.unpack(">II", label_file.read(8)) + + image_size = height * width + offset = 16 + + 
for i in range(0, min(data_number, label_number)): + _data = Link( + uri=str(data_fpath.absolute()), + offset=offset, + size=image_size, + data_type=GrayscaleImage( + display_name=f"{i}", shape=(height, width, 1) + ), + with_local_fs_data=True, + ) + _label = struct.unpack(">B", label_file.read(1))[0] + yield _data, {"label": _label} + offset += image_size + +``` + +### 函数 + +#### astype + +```python +astype() -> Dict[str, t.Any] +``` + +## starwhale.S3LinkAuth + +当数据存储在基于S3协议的对象存储上时,该类型负责描述授权、密钥信息。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L52)。 + +```python +S3LinkAuth( + name: str = "", + access_key: str = "", + secret: str = "", + endpoint: str = "", + region: str = "local", +) +``` + +|参数|说明| +|---|---| +|`name`|Auth的名称| +|`access_key`|S3连接中的access_key| +|`secret`|S3连接中的secret| +|`endpoint`|S3连接中的endpoint地址| +|`region`|bucket所在的S3 region,默认为local| + +### 使用示例 + +```python +import struct +import typing as t +from pathlib import Path + +from starwhale import ( + Link, + S3LinkAuth, + GrayscaleImage, + UserRawBuildExecutor, +) +class LinkRawDatasetProcessExecutor(UserRawBuildExecutor): + _auth = S3LinkAuth(name="mnist", access_key="minioadmin", secret="minioadmin") + _endpoint = "10.131.0.1:9000" + _bucket = "users" + + def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + root_dir = Path(__file__).parent.parent / "data" + + with (root_dir / "t10k-labels-idx1-ubyte").open("rb") as label_file: + _, label_number = struct.unpack(">II", label_file.read(8)) + + offset = 16 + image_size = 28 * 28 + + uri = f"s3://{self._endpoint}@{self._bucket}/dataset/mnist/t10k-images-idx3-ubyte" + for i in range(label_number): + _data = Link( + f"{uri}", + self._auth, + offset=offset, + size=image_size, + data_type=GrayscaleImage(display_name=f"{i}", shape=(28, 28, 1)), + ) + _label = struct.unpack(">B", label_file.read(1))[0] + yield _data, {"label": _label} + offset 
+= image_size +``` + +## starwhale.LocalFSLinkAuth + +描述数据存储在本地文件系统上,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L151)。 + +```python +LocalFSLinkAuth = partial(LinkAuth, ltype=LinkType.LocalFS) +``` + +## starwhale.DefaultS3LinkAuth + +使用默认值初始化 `S3LinkAuth` 类型后得到的变量, Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L152)。 + +```python +DefaultS3LinkAuth = S3LinkAuth() +``` + +## starwhale.MIMEType + +描述Starwhale支持的多媒体类型,用Python Enum类型实现,用在 `Image`、`Video` 等类型的mime_type 属性上,能更好的进行Dataset Viewer。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L106)。 + +```python +class MIMEType(Enum): + PNG = "image/png" + JPEG = "image/jpeg" + WEBP = "image/webp" + SVG = "image/svg+xml" + GIF = "image/gif" + APNG = "image/apng" + AVIF = "image/avif" + MP4 = "video/mp4" + AVI = "video/avi" + WAV = "audio/wav" + MP3 = "audio/mp3" + PLAIN = "text/plain" + CSV = "text/csv" + HTML = "text/html" + GRAYSCALE = "x/grayscale" + UNDEFINED = "x/undefined" +``` + +## starwhale.LinkType + +描述Starwhale支持的remote-link类型,用Python Enum类型实现,目前支持 `LocalFS` 和 `S3` 两种类型。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/core/dataset/type.py#L23)。 + +```python +class LinkType(Enum): + LocalFS = "local_fs" + S3 = "s3" + UNDEFINED = "undefined" +``` diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/dataset.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/dataset.md index e69de29bb2..ad733ad626 100644 --- a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/dataset.md +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/dataset.md @@ -0,0 +1,165 @@ +--- +title: 数据集构建和加载 +--- + +## 
starwhale.SWDSBinBuildExecutor + +提供swds格式的数据集构建类,需要用户实现 `iter_item` 函数,返回一个可迭代的对象,包含data和annotations。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/dataset/builder.py#L138)。 + +```python +class DatasetProcessExecutor(SWDSBinBuildExecutor): + def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + ... +``` + +`iter_item` 返回一个可迭代的对象,通常写法是for循环中,yield data和annotations。对于swds格式的数据集,data一般为 `Audio`,`Image`,`Text`、`GrayscaleImage`和`Binary`。也接受用户yield bytes类型的data,会自动转化成 `Binary` 类型。以[MNIST](https://github.com/star-whale/starwhale/tree/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/example/mnist)为例,构建swds的数据集基本代码如下: + +```python +import struct +import typing as t +from pathlib import Path + +from starwhale import ( + Link, + GrayscaleImage, + SWDSBinBuildExecutor, +) + +class DatasetProcessExecutor(SWDSBinBuildExecutor): + def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + root_dir = Path(__file__).parent.parent / "data" + + with (root_dir / "t10k-images-idx3-ubyte").open("rb") as data_file, ( + root_dir / "t10k-labels-idx1-ubyte" + ).open("rb") as label_file: + _, data_number, height, width = struct.unpack(">IIII", data_file.read(16)) + _, label_number = struct.unpack(">II", label_file.read(8)) + print( + f">data({data_file.name}) split data:{data_number}, label:{label_number} group" + ) + image_size = height * width + + for i in range(0, min(data_number, label_number)): + _data = data_file.read(image_size) + _label = struct.unpack(">B", label_file.read(1))[0] + yield GrayscaleImage( + _data, + display_name=f"{i}", + shape=(height, width, 1), + ), {"label": _label} +``` + +## starwhale.UserRawBuildExecutor + +提供remote-link和user-raw格式的数据集构建类,需要用户实现 `iter_item` 函数,返回一个可迭代的对象,包含data和annotations,其中data需要是一个 `starwhale.Link` 
类型。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/dataset/builder.py#L307)。 + +```python +class RawDatasetProcessExecutor(UserRawBuildExecutor): + def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + ... +``` + +以[Speech Commands](https://github.com/star-whale/starwhale/tree/main/example/speech_command)为例,构建remote-link的数据集基本代码如下: + +```python +import typing as t +from pathlib import Path + +from starwhale import ( + Link, + Audio, + MIMEType, + S3LinkAuth, + UserRawBuildExecutor, +) +class LinkRawDatasetBuildExecutor(UserRawBuildExecutor): + + _auth = S3LinkAuth( + name="speech", access_key="minioadmin", secret="minioadmin", region="local" + ) + _addr = "10.131.0.1:9000" + _bucket = "users" + + def iter_item(self) -> t.Generator[t.Tuple[t.Any, t.Any], None, None]: + import boto3 + from botocore.client import Config + + s3 = boto3.resource( + "s3", + endpoint_url=f"http://{self._addr}", + aws_access_key_id=self._auth.access_key, + aws_secret_access_key=self._auth.secret, + config=Config(signature_version="s3v4"), + region_name=self._auth.region, + ) + + objects = s3.Bucket(self._bucket).objects.filter( + Prefix="dataset/SpeechCommands/speech_commands_v0.02" + ) + + for obj in objects: + path = Path(obj.key) # type: ignore + command = path.parent.name + if ( + command == "_background_noise_" + or "_nohash_" not in path.name + or obj.size < 10240 + or not path.name.endswith(".wav") + ): + continue + + speaker_id, utterance_num = path.stem.split("_nohash_") + uri = f"s3://{self._addr}@{self._bucket}/{obj.key.lstrip('/')}" + data = Link( + uri, + self._auth, + size=obj.size, + data_type=Audio( + display_name=f"{command}/{path.name}", + mime_type=MIMEType.WAV, + shape=(1,), + ), + ) + annotations = { + "label": command, + "speaker_id": speaker_id, + "utterance_num": int(utterance_num), + } + yield data, annotations +``` + +## starwhale.BuildExecutor + 
+`SWDSBinBuildExecutor` 类的别称,同为swds格式的数据集构建类。 + +## starwhale.get_data_loader + +获取Starwhale Dataset的Data Loader,是一个可迭代的对象,能够获取数据集中具体样本的索引、data和annotations。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/dataset/loader.py)。该函数返回的loader有两种:一种是表示swds格式的 `SWDSBinDataLoader`, 另一种是表示remote-link或user-raw格式的 `UserRawDataLoader`。两种loader类型目前都能处理在LocalFS和S3协议的对象存储上数据。 + +```python +def get_data_loader( + dataset_uri: URI, + start: int = 0, + end: int = sys.maxsize, + logger: t.Union[loguru.Logger, None] = None, +) -> DataLoader: +``` + +|参数|说明| +|---|---| +|`dataset_uri`| `starwhale.URI` 对象 | +|`start`| 数据集index的起始位,默认从0开始 | +|`end`| 数据集index的结束位。start和end表示是左闭右开的区间,即 `start <= i < end` | +|`logger`|可传入自定义的logger对象| + +使用示例如下: + +```python +from starwhale import get_data_loader, URI + +uri = URI("mnist/version/latest", expected_type="dataset") +data_loader = get_data_loader(dataset_uri=uri) + +for idx, data, annotations in data_loader: + ... +``` diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/evaluation.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/evaluation.md index e69de29bb2..e2a87baf2f 100644 --- a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/evaluation.md +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/evaluation.md @@ -0,0 +1,195 @@ +--- +title: 模型评测 +--- + +## starwhale.PipelineHandler + +提供默认的模型评测过程定义,需要用户实现 `ppl` 和 `cmp` 函数。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/model.py)。 + +```python +from abc import ABCMeta, abstractmethod + +class PipelineHandler(metaclass=ABCMeta): + def __init__(self, + ignore_annotations: bool = False, + ignore_error: bool = False, + ) -> None: + ... 
+ + @abstractmethod + def ppl(self, data: Any, **kw: Any) -> Any: + raise NotImplementedError + + @abstractmethod + def cmp(self, ppl_result: PPLResultIterator) -> Any: + raise NotImplementedError +``` + +`PipelineHandler` 类实例化时可以定义两个参数:当`ignore_annotations`为False时,PPLResultIterator中会携带数据集所对应的 annotations信息,保证index上与推理结果是一一对应的;当 `ignore_error`为True时,会忽略ppl过程中的错误,可以解决比较大的数据集样本中,有个别数据错误导致ppl失败,进而导致无法完成评测的问题。 + +`ppl` 函数用来进行推理,输入参数为 data和kw。data表示数据集中某个样本,kw为一个字典,目前包含 `annotations` 和 `index`。每条数据集样本都会调用`ppl`函数,输出为模型推理值,会自动被记录和存储,可以在cmp函数中通过 `ppl_result` 参数获取。 + +`cmp` 函数一般用来进行推理结果的汇总,并产生最终的评测报告数据,只会调用一次。`cmp` 函数的参数为 `ppl_result` ,该值是 `PPLResultIterator` 类型,可以被迭代。迭代出来的对象为一个字典,包含 `result`, `annotations` 和 `data_id` 三个元素。`result` 为 `ppl` 返回的元素,由于使用了 pickle做序列化-反序列化,data["result"] 变量直接能获取ppl函数return的值;`annotations` 为构建数据集时写入的,此阶段的data["annotations"]为一个dict类型。`data_id` 表示数据集对应的index。 + +另外,在PipelineHandler及其子类中可以访问 `self.context` 获取 `starwhale.Context` 类型的上下文信息。 + +常见的使用方法示例如下: + +```python + +class Example(PipelineHandler): + def __init__(self) -> None: + super().__init__() + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = self._load_model(self.device) + + def ppl(self, img: Image, **kw): + data_tensor = self._pre(img) + output = self.model(data_tensor) + return self._post(output) + + def cmp(self, ppl_result): + result, label, pr = [], [], [] + for _data in ppl_result: + label.append(_data["annotations"]["label"]) + result.extend(_data["result"][0]) + pr.extend(_data["result"][1]) + return label, result, pr + + def _pre(self, input: Image) -> torch.Tensor: + ... + + def _post(self, input): + ... + + def _load_model(self, device): + ... 
+``` + +## starwhale.Context + +执行模型评测过程中传入的上下文信息,包括Project、Task ID等。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/job.py)。Context的内容是自动注入的,用户通过 `@pass_context` 使用context,或在 继承 `PipelineHandler` 类内使用,目前Context可以获得如下值: + +```python + +@pass_context +def func(ctx: Context): + ... + print(ctx.project) + print(ctx.version) + print(ctx.step) + ... + +Context( + workdir: Path, + step: str = "", + total: int = 1, + index: int = 0, + dataset_uris: t.List[str] = [], + version: str = "", + project: str = "", +) +``` + +|参数|说明| +|---|----| +|project|project名字| +|version|Evaluation 版本号| +|step|step名字| +|total|step下所有的task数量| +|index|当前task的索引编号,从零开始| +|dataset_uris|dataset uri字符串的列表| +|workdir|model.yaml所在目录| + +## starwhale.PPLResultIterator + +`cmp`函数中使用,是一个可迭代的对象,能够输出 `ppl` 结果,数据集index和对应的数据集annotations。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/model.py)。 + +```python +from starwhale import PipelineHandler, PPLResultIterator + +class Example(PipelineHandler): + def cmp( + self, ppl_result: PPLResultIterator + ) -> t.Tuple[t.List[int], t.List[int], t.List[t.List[float]]]: + result, label, pr = [], [], [] + for _data in ppl_result: + label.append(_data["annotations"]["label"]) + result.extend(_data["result"][0]) + pr.extend(_data["result"][1]) + print(_data["data_id"]) + return label, result, pr + +``` + +## starwhale.multi_classification + +修饰器,适用于多分类问题,用来简化cmp结果的进一步计算和结果存储,能更好的呈现评测结果。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/metric.py)。 + +```python + +@multi_classification( + confusion_matrix_normalize="all", + show_hamming_loss=True, + show_cohen_kappa_score=True, + show_roc_auc=True, + all_labels=[i for i in range(0, 10)], +) +def cmp(ppl_result: PPLResultIterator) -> t.Tuple[t.List[int], t.List[int], 
t.List[t.List[float]]]: + label, result, probability_matrix = [], [], [] + return label, result, probability_matrix + +@multi_classification( + confusion_matrix_normalize="all", + show_hamming_loss=True, + show_cohen_kappa_score=True, + show_roc_auc=False, + all_labels=[i for i in range(0, 10)], +) +def cmp(ppl_result: PPLResultIterator) -> t.Tuple[t.List[int], t.List[int]]: + label, result = [], [] + return label, result +``` + +|参数|说明| +|---|----| +|`confusion_matrix_normalize`| `true`(rows), `pred`(columns) 或 `all`(rows+columns) | +|`show_hamming_loss`|是否计算hamming loss| +|`show_cohen_kappa_score`|是否计算 cohen kappa score| +|`show_roc_auc`|是否计算roc/auc, 计算的时候,需要函数返回(label,result, probability_matrix) 三元组,否则只需返回(label, result) 两元组即可| +|`all_labels`|所有的labels| + +`multi_classification` 修饰器使用sklearn lib对多分类问题进行结果分析,输出confusion matrix, roc, auc等值,并且会写入到 starwhale的 DataStore 中。使用的时候需要对所修饰的函数返回值有一定要求,返回(label, result, probability_matrix) 或 (label, result)。 + +## starwhale.step + +修饰器,可以指定DAG的依赖关系和Task数量、资源等配置,实现用户自定义评测过程。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/api/_impl/job.py)。使用 `step` 可以完全不依赖于 `PipelineHandler` 预定义的基本模型评测过程,可以自行定义多阶段和每个阶段的依赖、资源和任务并发数等。 + +```python +@step( + resources: Optional[List[str]] = None, + concurrency: int = 1, + task_num: int = 1, + needs: Optional[List[str]] = None, +) +def func(): + ... 
+ +``` + +|参数|说明| +|---|----| +|`resources`|该step中每个task所依赖的资源情况| +|`concurrency`|task执行的并发度| +|`task_num`|step会被分成task的数量| +|`needs`|依赖的step列表| + +`resources` 格式为 {名称}={数量}。名称为资源的种类,目前支持 `cpu`、`gpu` 和 `memory`。当种类为 `cpu` 时,数量的类型为float, 没有单位,1表示1个cpu core,对应Kubernetes resource的request;当种类为 `gpu` 时,数量的类型为int,没有单位,1表示1个gpu,对应Kubernetes resource的request和limit;当种类为 `memory`时,数量的类型为float,没有单位,1表示1MB内存,对应Kubernetes resource的request。`resources` 使用列表的方式支持指定多个资源,且这些资源都满足时才会进行调度。当不写 `resources` 时,会使用所在Kubernetes的cpu、memory默认值。 `resources` 表示的是一个task执行时所需要的资源情况,并不是step所有task的资源总和限制。**目前 `resources` 只在Cloud Instance中生效**。 `resources` 使用例子如下: + +```python +@step() +@step(resources=["cpu=1"]) +@step(resources=["gpu=1"]) +@step(resources=["memory=100"]) +@step(resources=["cpu=0.1", "gpu=1", "memory=100"]) +``` diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/other.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/other.md new file mode 100644 index 0000000000..b1ff31f7ee --- /dev/null +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/other.md @@ -0,0 +1,53 @@ +--- +title: 其他SDK +--- + +## starwhale.\__version__ + +Starwhale SDK和Cli版本,是字符串常量。 + +```python +>>> from starwhale import __version__ +>>> print(__version__) +0.3.0rc10 +``` + +## starwhale.URI + +starwhale uri的类定义,可以将字符串转化成URI对象。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/base/uri.py)。 + +```python +URI( + raw: str, + expected_type: str = URIType.UNKNOWN +) +``` + +|参数|说明| +|---|---| +|`raw`| starwhale uri的字符串 | +|`expected_type`| 可以对有歧义的uri字符串强制指定为某种类型 | + +```python +>>> dataset_uri = URI("mnist/version/latest", expected_type=URIType.DATASET) +>>> model_uri = URI("mnist/version/latest", expected_type=URIType.MODEL) +>>> runtime_uri = URI("mnist/version/latest", expected_type=URIType.RUNTIME) +>>> dataset_uri = URI("dataset/mnist/version/latest") +``` + +上面例子中,uri的原始字符串都是 
`mnist/version/latest`,这是一个有歧义的URI,但当指定了 `expected_type` 参数后,可以明确指定为预期的URI。 + +## starwhale.URIType + +描述 `starwhale.URI` 类型,Github上的[代码链接](https://github.com/star-whale/starwhale/blob/dc6e6fdeae2f7c5bd0e72ccd8fb50768b1ce0826/client/starwhale/base/type.py)。 + +```python +class URIType: + INSTANCE = "instance" + PROJECT = "project" + MODEL = "model" + DATASET = "dataset" + RUNTIME = "runtime" + EVALUATION = "evaluation" + UNKNOWN = "unknown" +``` diff --git a/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/overview.md b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/overview.md new file mode 100644 index 0000000000..c8154b72f3 --- /dev/null +++ b/docs/i18n/zh/docusaurus-plugin-content-docs/current/reference/sdk/overview.md @@ -0,0 +1,43 @@ +--- +title: 基本信息 +--- + +Starwhale 提供一系列的Python SDK,帮助用户更容易的制作数据集、调用模型评测、追踪和展示评测结果等。Python SDK多数场景下与YAML和CLI配合使用,完成模型评测等核心任务。 + +## 类 + +- `class PipelineHandler`: 提供默认的模型评测过程定义,需要用户实现 `ppl` 和 `cmp` 函数。 +- `class Context`: 执行模型评测过程中传入的上下文信息,包括Project、Task ID等。 +- `class SWDSBinBuildExecutor`: 提供swds格式的数据集构建类,需要用户实现 `iter_item` 函数。 +- `class UserRawBuildExecutor`: 提供remote-link和user-raw格式的数据集构建类,需要用户实现 `iter_item` 函数。 +- `class BuildExecutor`: `SWDSBinBuildExecutor` 类的别称,同为swds格式的数据集构建类。 +- `class PPLResultIterator`: `cmp`函数中使用,是一个可迭代的对象,能够输出 `ppl` 结果,数据集index和对应的数据集annotations。 +- `class URI`: starwhale uri的类定义,可以将字符串转化成URI对象。 + +## 函数 + +- `multi_classification`: 修饰器,适用于多分类问题,用来简化cmp结果的进一步计算和结果存储,能更好的呈现评测结果。 +- `step`: 修饰器,可以指定DAG的依赖关系和Task数量、资源等配置,实现用户自定义评测过程。 +- `get_data_loader`: 获取Starwhale Dataset的Data Loader,是一个可迭代的对象,能够获取数据集中具体样本的索引、data和annotations。 + +## 数据类型 + +- `COCOObjectAnnotation`: 提供COCO类型的定义。 +- `GrayscaleImage`: 灰度图类型,比如MNIST中数字手写体图片,是 `Image` 类型的一个特例。 +- `BoundingBox`: 边界框类型,目前为 `LTWH` 格式,即 `left_x`, `top_y`, `width` 和 `height`。 +- `ClassLabel`: 描述label的数量和类型。 +- `Image`: 图片类型。 +- `Audio`: 音频类型。 +- `Text`: 文本类型,默认为 `utf-8` 格式。 +- `Binary`: 二进制类型,用bytes存储。 +- `Link`: 
Link类型,用来制作 `remote-link` 和 `user-raw` 类型的数据集。 +- `S3LinkAuth`: 当数据存储在基于S3协议的对象存储上时,该类型负责描述授权、密钥信息。 +- `LocalFSLinkAuth`: 描述数据存储在本地文件系统上。 +- `DefaultS3LinkAuth`: 使用默认值初始化 `S3LinkAuth` 类型后得到的变量。 +- `MIMEType`: 描述Starwhale支持的多媒体类型,用在 `Image`、`Video` 等类型的mime_type 属性上,能更好的进行Dataset Viewer。 +- `LinkType`: 描述Starwhale支持的remote-link类型,目前支持 `LocalFS` 和 `S3` 两种类型。 + +## 其他 + +- `__version__`: Starwhale SDK和Cli版本,是字符串常量。 +- `URIType`: 描述 `starwhale.URI` 类型。 diff --git a/docs/sidebars.js b/docs/sidebars.js index 24c061efde..20e9ae0ee8 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -56,8 +56,11 @@ module.exports = { "reference/cli/eval", "reference/cli/utilities"], "Python SDK": [ + "reference/sdk/overview", + "reference/sdk/data_type", "reference/sdk/dataset", "reference/sdk/evaluation", + "reference/sdk/other", ], }, "Community": [