Skip to content

Commit

Permalink
example(dataset): add sintel dataset; add NumpyBinary in sw (#1782)
Browse files Browse the repository at this point in the history
  • Loading branch information
anda-ren authored Feb 15, 2023
1 parent 6a82567 commit 84f2f55
Show file tree
Hide file tree
Showing 10 changed files with 254 additions and 0 deletions.
2 changes: 2 additions & 0 deletions client/starwhale/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
ClassLabel,
S3LinkAuth,
BoundingBox,
NumpyBinary,
BoundingBox3D,
BuildExecutor,
GrayscaleImage,
Expand Down Expand Up @@ -55,6 +56,7 @@
"LinkType",
"BuildExecutor",
"Binary",
"NumpyBinary",
"Text",
"Line",
"Point",
Expand Down
2 changes: 2 additions & 0 deletions client/starwhale/api/_impl/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ClassLabel,
S3LinkAuth,
BoundingBox,
NumpyBinary,
BoundingBox3D,
GrayscaleImage,
LocalFSLinkAuth,
Expand All @@ -36,6 +37,7 @@
"LinkType",
"BuildExecutor",
"Binary",
"NumpyBinary",
"Text",
"Line",
"Point",
Expand Down
2 changes: 2 additions & 0 deletions client/starwhale/api/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ClassLabel,
S3LinkAuth,
BoundingBox,
NumpyBinary,
BoundingBox3D,
BuildExecutor,
GrayscaleImage,
Expand All @@ -38,6 +39,7 @@
"LinkType",
"BuildExecutor",
"Binary",
"NumpyBinary",
"Text",
"Line",
"Point",
Expand Down
25 changes: 25 additions & 0 deletions client/starwhale/core/dataset/type.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,31 @@ def to_numpy(self) -> numpy.ndarray:
return numpy.array(self.to_bytes(), dtype=self.dtype)


class NumpyBinary(BaseArtifact, SwObject):
    """Binary artifact holding the raw buffer of a numpy ``ndarray``.

    The ``dtype`` and ``shape`` given at construction are forwarded to
    ``BaseArtifact`` and are what ``to_numpy`` uses to rebuild the array
    from the stored bytes.
    """

    def __init__(
        self,
        fp: _TArtifactFP,
        dtype: t.Type,
        shape: _TShape,
        link: t.Optional[Link] = None,
    ) -> None:
        """Register the buffer as a ``ArtifactType.Binary`` artifact.

        Args:
            fp: Source of the raw array bytes, passed through to ``BaseArtifact``.
            dtype: Element type used when decoding the bytes.
            shape: Shape the decoded flat buffer is reshaped to.
            link: Optional ``Link`` passed through to ``BaseArtifact``.
        """
        super().__init__(
            fp=fp, link=link, dtype=dtype, shape=shape, type=ArtifactType.Binary
        )

    def to_numpy(self) -> numpy.ndarray:
        """Decode the stored bytes as ``self.dtype`` and reshape to ``self.shape``."""
        flat = numpy.frombuffer(self.to_bytes(), dtype=self.dtype)
        return flat.reshape(self.shape)  # type: ignore

    def to_tensor(self) -> t.Any:
        """Return the array from ``to_numpy`` converted by the pytorch integration."""
        # Imported lazily, as in the original, so the pytorch integration is
        # only loaded when a tensor is actually requested.
        from starwhale.integrations.pytorch import convert_numpy_to_tensor

        array = self.to_numpy()
        return convert_numpy_to_tensor(array)


class Image(BaseArtifact, SwObject):
def __init__(
self,
Expand Down
9 changes: 9 additions & 0 deletions client/tests/sdk/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from concurrent.futures import as_completed, ThreadPoolExecutor

import numpy
import numpy as np
import torch
import pytest
from requests_mock import Mocker
Expand Down Expand Up @@ -53,6 +54,7 @@
MIMEType,
ClassLabel,
BoundingBox,
NumpyBinary,
ArtifactType,
BoundingBox3D,
GrayscaleImage,
Expand Down Expand Up @@ -859,6 +861,13 @@ def test_binary(self) -> None:
"display_name": "",
}

def test_numpy_binary(self) -> None:
    """Round-trip an ndarray through NumpyBinary as bytes, numpy and torch."""
    expected = np.array([[1.008, 6.94, 22.990], [39.098, 85.468, 132.91]])
    raw = expected.tobytes()
    artifact = NumpyBinary(raw, expected.dtype, expected.shape)

    # The artifact must hand back exactly the bytes it was built from ...
    assert artifact.to_bytes() == raw
    # ... and rebuild the same array, both as numpy and as a torch tensor.
    np.testing.assert_array_equal(artifact.to_numpy(), expected)
    assert torch.equal(torch.from_numpy(expected), artifact.to_tensor())

def test_image(self) -> None:
fp = io.StringIO("test")
img = Image(fp, display_name="t", shape=[28, 28, 3], mime_type=MIMEType.PNG)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -604,3 +604,17 @@ BoundingBox3D(
|---|-------------------|
|`bbox_a`| 在二维UI上,3D框靠近用户的一面 |
|`bbox_b`| 在二维UI上,3D框远离用户的一面 |

## 15. starwhale.NumpyBinary

在构建`Dataset`的时候,用户可以使用`NumpyBinary`来存储`ndarray`,以提高starwhale的存储效率。`fp`表示`ndarray`在本地的存储路径,或者其`bytes`内容;`dtype`表示`numpy`的`dtype`;`shape`表示`ndarray`的形状;`link`可以作为`fp`的备选输入。Github上的[代码链接](https://github.com/star-whale/starwhale/blob/02ed82a406ef403416a6faf67f41341e68c38acd/client/starwhale/core/dataset/type.py#L326)

```python
NumpyBinary(
self,
fp: _TArtifactFP,
dtype: t.Type,
shape: _TShape,
link: t.Optional[Link] = None,
)
```
37 changes: 37 additions & 0 deletions example/datasets/sintel/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
---
title: The `sintel` Dataset
---

## The MPI Sintel Flow Dataset Description

- [Homepage](http://sintel.is.tue.mpg.de/)

## The `sintel` dataset Structure

### Data Fields

- `data` of type dict:
- `frame0/albedo`: `starwhale.Image`
- `frame0/clean`: `starwhale.Image`
- `frame0/final`: `starwhale.Image`
- `frame1/albedo`: `starwhale.Image`
- `frame1/clean`: `starwhale.Image`
- `frame1/final`: `starwhale.Image`
- `flow_viz`: `starwhale.Image`
- `flow_bin`: `starwhale.NumpyBinary`
- `pix_invalid`: `starwhale.Image`
- `pix_occlusions`: `starwhale.Image`

## Build `sintel` Dataset locally

```shell
python3 dataset.py
```

## Example

Output the `1024`th record of the `sintel` dataset.

```shell
python3 example.py
```
114 changes: 114 additions & 0 deletions example/datasets/sintel/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import numpy as np
import requests

from starwhale import Link, Image, dataset, MIMEType, NumpyBinary # noqa: F401
from starwhale.utils.retry import http_retry

# Base URL of the Sintel training data; file_path() builds every remote
# file URL underneath it.
PATH_ROOT = (
"https://starwhale-examples.oss-cn-beijing.aliyuncs.com/dataset/sintel/training"
)
# Names of the sub-directories under PATH_ROOT, one per kind of data.
PATH_ALBEDO = "albedo"
PATH_CLEAN = "clean"
PATH_FINAL = "final"
# Also the folder name build_ds() looks for in tree.json to enumerate records.
PATH_FLOW = "flow"
PATH_FLOW_VIZ = "flow_viz"
PATH_INVALID = "invalid"
PATH_OCCLUSIONS = "occlusions"
# File-name pieces: flow files end in SUFFIX_FLO, images in SUFFIX_PNG, and
# next_image() expects image names shaped like PREFIX_FILE + number + SUFFIX_PNG.
SUFFIX_FLO = ".flo"
SUFFIX_PNG = ".png"
PREFIX_FILE = "frame_"


@http_retry
def request_link_json(index_link):
    """Fetch ``index_link`` over HTTP and return its body decoded as JSON.

    Args:
        index_link: URL of the JSON document to download.

    Returns:
        The decoded JSON value.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    NOTE(review): the call is wrapped by ``http_retry``; presumably it retries
    on HTTP failures -- confirm which exceptions/status codes it reacts to.
    """
    response = requests.get(index_link, timeout=10)
    # Turn error statuses into exceptions instead of trying to decode an
    # error page as if it were the requested index.
    response.raise_for_status()
    return response.json()


@http_retry
def request_link_content(link):
    """Fetch ``link`` over HTTP and return the raw response body as bytes.

    Args:
        link: URL of the file to download.

    Returns:
        The response body (``bytes``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    NOTE(review): the call is wrapped by ``http_retry``; presumably it retries
    on HTTP failures -- confirm which exceptions/status codes it reacts to.
    """
    response = requests.get(link, timeout=10)
    # Without this check the body of an error page would be returned and
    # later parsed as if it were the requested file.
    response.raise_for_status()
    return response.content


def file_path(dir, dir2, file_name):
    """Return the URL ``PATH_ROOT/dir/dir2/file_name``.

    Args:
        dir: First path segment below ``PATH_ROOT``.
        dir2: Second path segment.
        file_name: Name of the file inside ``dir2``.
    """
    segments = (PATH_ROOT, dir, dir2, file_name)
    return "{}/{}/{}/{}".format(*segments)


def next_image(image_name: str):
    """Return the file name of the frame that follows ``image_name``.

    The prefix ``PREFIX_FILE`` and suffix ``SUFFIX_PNG`` are stripped, the
    remaining number is incremented by one and re-padded to four digits.
    """
    digits = image_name.replace(PREFIX_FILE, "").replace(SUFFIX_PNG, "")
    successor = int(digits) + 1
    return f"{PREFIX_FILE}{str(successor).zfill(4)}{SUFFIX_PNG}"


def path_to_image(image_path: str):
    """Build an ``Image`` with PNG mime type that links to ``image_path``."""
    remote = Link(image_path)
    return Image(link=remote, mime_type=MIMEType.PNG)


def flo_to_binary(flow_bytes: bytes):
    """Wrap the sample data of a ``.flo`` optical-flow file in a ``NumpyBinary``.

    Layout read by this function:
        bytes 0-3   float32 magic number, must equal 202021.25
        bytes 4-7   int32 width  ``w``
        bytes 8-11  int32 height ``h``
        bytes 12-   float32 samples, exposed with shape ``(h, w, 2)``

    Args:
        flow_bytes: Complete content of the ``.flo`` file.

    Returns:
        A ``NumpyBinary`` with dtype ``np.float32`` and shape ``(h, w, 2)``.

    Raises:
        ValueError: If the header is truncated, the magic number is wrong,
            the dimensions are not positive, or the payload size does not
            match ``h * w * 2`` float32 values. Failing here keeps a corrupt
            download from being stored in the dataset as an unusable field.
    """
    header_size = 12
    if len(flow_bytes) < header_size:
        raise ValueError("Invalid .flo file: header is truncated")

    # Explicit little-endian dtypes make header parsing independent of the
    # host byte order.
    # NOTE(review): .flo is documented as little-endian -- confirm against
    # the format specification.
    (magic,) = np.frombuffer(flow_bytes, dtype="<f4", count=1)
    if magic != 202021.25:
        raise ValueError("Magic number incorrect. Invalid .flo file")

    w, h = (
        int(v) for v in np.frombuffer(flow_bytes, dtype="<i4", count=2, offset=4)
    )
    if w <= 0 or h <= 0:
        raise ValueError(f"Invalid .flo file: bad dimensions {w} x {h}")

    payload = bytes(flow_bytes[header_size:])
    expected_size = h * w * 2 * np.dtype(np.float32).itemsize
    if len(payload) != expected_size:
        raise ValueError(
            f"Invalid .flo file: expected {expected_size} payload bytes "
            f"for {w} x {h}, got {len(payload)}"
        )

    print(f"Reading {w} x {h} flo file")
    # The payload already is the raw float32 buffer, so it is stored as-is.
    return NumpyBinary(fp=payload, shape=(h, w, 2), dtype=np.float32)


def _sintel_record(scene_dir, flo_fn):
    """Assemble one dataset row for flow file ``flo_fn`` inside ``scene_dir``."""
    frame0_name = flo_fn.replace(SUFFIX_FLO, SUFFIX_PNG)
    # The flow file describes motion from this frame to the next one.
    frame1_name = next_image(frame0_name)
    return {
        "frame0/albedo": path_to_image(file_path(PATH_ALBEDO, scene_dir, frame0_name)),
        "frame0/clean": path_to_image(file_path(PATH_CLEAN, scene_dir, frame0_name)),
        "frame0/final": path_to_image(file_path(PATH_FINAL, scene_dir, frame0_name)),
        "frame1/albedo": path_to_image(file_path(PATH_ALBEDO, scene_dir, frame1_name)),
        "frame1/clean": path_to_image(file_path(PATH_CLEAN, scene_dir, frame1_name)),
        "frame1/final": path_to_image(file_path(PATH_FINAL, scene_dir, frame1_name)),
        "flow_viz": path_to_image(file_path(PATH_FLOW_VIZ, scene_dir, frame0_name)),
        # The only field that is downloaded at build time; images stay links.
        "flow_bin": flo_to_binary(
            request_link_content(file_path(PATH_FLOW, scene_dir, flo_fn))
        ),
        "pix_occlusions": path_to_image(
            file_path(PATH_OCCLUSIONS, scene_dir, frame0_name)
        ),
        "pix_invalid": path_to_image(file_path(PATH_INVALID, scene_dir, frame0_name)),
    }


def build_ds():
    """Build the ``sintel`` dataset from the remote ``tree.json`` index.

    Every file listed under the ``flow`` folder of the index yields one row.
    The dataset is committed only when all rows were appended, and the handle
    is closed even when a download or append fails.
    """
    ds = dataset("sintel", create=True)
    try:
        tree = request_link_json(f"{PATH_ROOT}/tree.json")
        for folder in tree:
            # Flow files drive the enumeration; other folders are reached
            # through the paths derived from each flow file name.
            if folder.get("name") != PATH_FLOW:
                continue
            for scene in folder["contents"]:
                scene_dir = scene["name"]
                for entry in scene["contents"]:
                    ds.append(_sintel_record(scene_dir, str(entry["name"])))
        ds.commit()
    finally:
        ds.close()


if __name__ == "__main__":
    build_ds()
48 changes: 48 additions & 0 deletions example/datasets/sintel/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import io

import flow_vis
import matplotlib.pyplot as plt
from PIL import Image as PILImage

from starwhale import dataset

# Load one record of the locally built dataset and show all of its fields
# in a 3 x 3 grid.
ds_name = "sintel/version/latest"
ds = dataset(ds_name)
row = ds[1024]
data = row.data
fig, ax = plt.subplots(
    3, 3, figsize=(25, 16), gridspec_kw={"wspace": 0.1, "hspace": 0.1}
)

# Grid cell -> dataset field holding PNG bytes. Both frame rows use the same
# column order (albedo, clean, final) so the frames can be compared
# column by column.
png_panels = {
    (0, 0): "frame0/albedo",
    (0, 1): "frame0/clean",
    (0, 2): "frame0/final",
    (1, 0): "frame1/albedo",
    (1, 1): "frame1/clean",
    (1, 2): "frame1/final",
    (2, 0): "flow_viz",
    (2, 2): "pix_occlusions",
}
for (r, c), key in png_panels.items():
    with PILImage.open(io.BytesIO(data[key].to_bytes())) as img:
        ax[r][c].imshow(img)

# The raw flow field is colorized locally and shown next to the
# pre-rendered flow_viz image.
flow_bin = data["flow_bin"].to_numpy()
ax[2][1].imshow(flow_vis.flow_to_color(flow_bin, convert_to_bgr=False))

# pyplot.show() blocks until the window is closed; Figure.show() returns
# immediately, so a plain script would exit before the figure is visible.
plt.show()
1 change: 1 addition & 0 deletions example/datasets/sintel/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
flow_vis

0 comments on commit 84f2f55

Please sign in to comment.