Skip to content

Commit

Permalink
example: add coco dataset (#1449)
Browse files Browse the repository at this point in the history
  • Loading branch information
anda-ren authored Nov 11, 2022
1 parent c172676 commit 5e4e011
Show file tree
Hide file tree
Showing 14 changed files with 252 additions and 23 deletions.
2 changes: 2 additions & 0 deletions client/starwhale/core/dataset/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,8 @@ def read(self, size: int) -> bytes:
raise FormatError(f"{_r}:{type(_r)} is not bytes or memoryview type")

def _read(self, size: int) -> memoryview:
if size <= 0:
return self.obj.get()["Body"].read() # type:ignore
# TODO: use smart_open 3rd lib?
if (self._current + size) <= len(self._buffer):
end = self._current + size
Expand Down
8 changes: 8 additions & 0 deletions example/datasets/coco/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Download and extract the COCO panoptic val2017 data into ./data.
# Fixes: ".POHNY" typo (target was never actually marked phony) and a
# duplicated "wget wget" token in the annotations download line.
.PHONY: raw
raw:
	mkdir -p data
	[ -f data/panoptic_annotations_trainval2017.zip ] || wget http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip -O data/panoptic_annotations_trainval2017.zip
	[ -f data/val2017.zip ] || wget http://images.cocodataset.org/zips/val2017.zip -O data/val2017.zip
	[ -d data/val2017 ] || unzip data/val2017.zip -d data
	[ -d data/annotations ] || unzip data/panoptic_annotations_trainval2017.zip -d data
	[ -d data/annotations/panoptic_val2017 ] || unzip data/annotations/panoptic_val2017.zip -d data/annotations
42 changes: 42 additions & 0 deletions example/datasets/coco/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
title: The `coco-raw` Dataset
---

## The COCO Dataset Description

- [Homepage](https://cocodataset.org/#home)

## The `coco-raw` dataset Structure

### Data Fields

- `data`: `starwhale.Image` loaded as bytes array
- `annotations` of type dict:
- `mask`: `starwhale.Link` loaded as dict
- `uri`: the path where the `mask` file sits
- `segments_info`: array of `segment_info`
- `bbox_view`: `starwhale.BoundingBox` used by viewer
- other original fields


## Build `coco-raw` Dataset locally

- download raw data

```shell
make raw
```

- build `coco-raw` dataset

```shell
swcli dataset build . --name coco-raw --handler dataset:do_iter_item
```

## Example

Output the first record of the `coco-raw` dataset.

```shell
python3 example.py
```
107 changes: 107 additions & 0 deletions example/datasets/coco/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import os
import json
from pathlib import Path

import boto3
from botocore.client import Config as S3Config

from starwhale import Link, Image, MIMEType, S3LinkAuth, BoundingBox # noqa: F401
from starwhale.core.dataset.store import S3Connection, S3StorageBackend # noqa: F401

ROOT_DIR = Path(__file__).parent
DATA_DIR = ROOT_DIR / "data"


def do_iter_item():
    """Yield ``(image Link, annotation dict)`` pairs for the local COCO
    panoptic val2017 split.

    Reads the panoptic annotation index from ``DATA_DIR``, attaches a
    viewer-friendly ``BoundingBox`` to every segment and a ``Link`` to the
    panoptic mask PNG, then yields a ``Link`` to the raw image file
    together with the enriched annotation dict.
    """
    index_path = DATA_DIR / "annotations" / "panoptic_val2017.json"
    with index_path.open("r") as index_file:
        index = json.load(index_file)
        # Map image id -> image metadata for O(1) lookup per annotation.
        images_by_id = {meta["id"]: meta for meta in index["images"]}
        for annotation in index["annotations"]:
            meta = images_by_id[annotation["image_id"]]
            image_name = meta["file_name"]
            image_path = DATA_DIR / "val2017" / image_name
            shape = (meta["height"], meta["width"])
            mask_name = annotation["file_name"]
            mask_path = DATA_DIR / "annotations" / "panoptic_val2017" / mask_name

            for segment in annotation["segments_info"]:
                x, y, w, h = segment["bbox"]
                # BoundingBox consumed by the dataset viewer.
                segment["bbox_view"] = BoundingBox(x=x, y=y, width=w, height=h)

            annotation["mask"] = Link(
                auth=None,
                with_local_fs_data=True,
                data_type=Image(
                    display_name=mask_name, shape=shape, mime_type=MIMEType.PNG
                ),
                uri=str(mask_path.absolute()),
            )
            yield Link(
                uri=str(image_path.absolute()),
                data_type=Image(display_name=image_name, shape=shape),
                with_local_fs_data=True,
            ), annotation


# Remote (S3) layout configuration used by do_iter_item_from_remote.
PATH_ROOT = "dataset/coco/extracted"  # key prefix under the bucket where data lives
_ak = os.environ.get("SW_S3_AK", "starwhale")  # S3 access key
_sk = os.environ.get("SW_S3_SK", "starwhale")  # S3 secret key
_endpoint = os.environ.get("SW_S3_EDP", "http://10.131.0.1:9000")  # S3 endpoint URL
_region = os.environ.get("SW_S3_REGION", "local")  # S3 region name
_auth = S3LinkAuth(
    name="SW_S3", access_key=_ak, secret=_sk, endpoint=_endpoint, region=_region
)
_bucket = "users"
# NOTE(review): "RUI" looks like a typo for "URI"; kept as-is since it is
# referenced below.  "<bucket>/<prefix>" used to build s3:// uris.
RUI_ROOT = f"{_bucket}/{PATH_ROOT}"


def do_iter_item_from_remote():
    """Yield ``(image Link, annotation dict)`` pairs backed by objects in S3.

    Produces the same record structure as ``do_iter_item`` but every Link
    points at the remote copy of the data (``s3://`` uris) instead of the
    local filesystem.  The annotation index json is fetched from the
    bucket before iterating.
    """
    s3 = boto3.resource(
        "s3",
        endpoint_url=_endpoint,
        aws_access_key_id=_ak,
        aws_secret_access_key=_sk,
        config=S3Config(
            s3={},
            connect_timeout=6000,
            read_timeout=6000,
            signature_version="s3v4",
            retries={
                "total_max_attempts": 1,
                "mode": "standard",
            },
        ),
        region_name=_region,
    )

    index = json.loads(
        s3.Object(_bucket, f"{PATH_ROOT}/annotations/panoptic_val2017.json")
        .get()["Body"]
        .read()
        .decode("utf8")
    )
    # Map image id -> image metadata, same as do_iter_item.  The previous
    # code called images2dict(), which is not defined anywhere in this
    # module and would have raised NameError at runtime.
    img_dict = {img["id"]: img for img in index["images"]}
    for anno in index["annotations"]:
        img_meta = img_dict[anno["image_id"]]
        img_name = img_meta["file_name"]
        img_shape = (img_meta["height"], img_meta["width"])
        msk_f_name = anno["file_name"]
        segs_info = anno["segments_info"]
        for sg in segs_info:
            x, y, w, h = sg["bbox"]
            # BoundingBox consumed by the dataset viewer.
            sg["bbox_view"] = BoundingBox(x=x, y=y, width=w, height=h)

        anno["mask"] = Link(
            auth=None,
            with_local_fs_data=False,
            data_type=Image(
                display_name=msk_f_name, shape=img_shape, mime_type=MIMEType.PNG
            ),
            uri=f"s3://{RUI_ROOT}/annotations/panoptic_val2017/{msk_f_name}",
        )
        yield Link(
            auth=_auth,
            uri=f"s3://{RUI_ROOT}/val2017/{img_name}",
            data_type=Image(display_name=img_name, shape=img_shape),
            with_local_fs_data=False,
        ), anno
68 changes: 68 additions & 0 deletions example/datasets/coco/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import io
import os
from urllib.parse import urlparse

from PIL import Image as PILImage
from PIL import ImageDraw

from starwhale import URI, URIType, get_data_loader
from starwhale.core.dataset.store import S3Connection, S3StorageBackend


def draw_bbox(img, bbox_view_):
    """Draw ``bbox_view_`` (a dict with ``x``/``y``/``width``/``height``)
    on ``img`` as an unfilled red rectangle, in place."""
    left = bbox_view_["x"]
    top = bbox_view_["y"]
    right = left + bbox_view_["width"]
    bottom = top + bbox_view_["height"]
    ImageDraw.Draw(img).rectangle(
        [(left, top), (right, bottom)],
        fill=None,
        outline="red",
    )


def raw():
    """Show the first record of the `coco-raw` dataset: draw every segment
    bounding box on the image and overlay the semi-transparent panoptic mask."""
    dataset_uri = URI("coco-raw/version/latest", expected_type=URIType.DATASET)
    for idx, data, annotations in get_data_loader(dataset_uri, 0, 1):
        # NOTE(review): data.fp is assumed to hold the raw image bytes —
        # confirm against the starwhale loader API.
        with PILImage.open(io.BytesIO(data.fp)) as img:
            with PILImage.open(annotations["mask"]["uri"]).convert("RGBA") as msk:
                for seg in annotations["segments_info"]:
                    draw_bbox(img, seg["bbox_view"])

                msk.putalpha(127)
                img.paste(msk, (0, 0), mask=msk)
                img.show()


# S3 connection settings for the `link` example, overridable via env vars.
_ak = os.environ.get("SW_S3_AK", "starwhale")  # S3 access key
_sk = os.environ.get("SW_S3_SK", "starwhale")  # S3 secret key
_endpoint = os.environ.get("SW_S3_EDP", "http://10.131.0.1:9000")  # S3 endpoint URL
_region = os.environ.get("SW_S3_REGION", "local")  # S3 region name
_bucket = "users"


def link():
    """Show the first record of the `coco-link` dataset (remote links):
    fetch the mask object from S3, draw the segment bounding boxes on the
    image and overlay the semi-transparent mask.

    NOTE(review): uses the private S3StorageBackend._make_file API;
    read(-1) reads the whole object.
    """
    s3 = S3StorageBackend(S3Connection(_endpoint, _ak, _sk, _region, _bucket))
    uri = URI("coco-link/version/latest", expected_type=URIType.DATASET)
    for idx, data, annotations in get_data_loader(uri, 0, 1):
        with PILImage.open(io.BytesIO(data.fp)) as img, PILImage.open(
            io.BytesIO(
                # urlparse(...).path keeps the leading "/<bucket>/..." part
                # of the s3:// uri — presumably _make_file expects the
                # bucket-qualified path; verify against the backend impl.
                s3._make_file(_bucket, urlparse(annotations["mask"]["uri"]).path).read(
                    -1
                )
            )
        ).convert("RGBA") as msk:
            for seg in annotations["segments_info"]:
                draw_bbox(img, seg["bbox_view"])

            msk.putalpha(127)
            img.paste(msk, (0, 0), mask=msk)
            img.show()


if __name__ == "__main__":
    # Only the local-filesystem example runs by default; call link() for
    # the remote (S3-backed) variant.
    raw()
2 changes: 2 additions & 0 deletions example/datasets/coco/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
starwhale==0.3.1
Pillow==9.2.0
4 changes: 2 additions & 2 deletions scripts/e2e_test/docker/Dockerfile.e2e
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
FROM homepage-ca.intra.starwhale.ai:5000/docker-e2e:0.6
FROM homepage-ca.intra.starwhale.ai:5000/docker-e2e:0.7
COPY entrypoint.sh /
ENTRYPOINT ["/entrypoint.sh"]
ENTRYPOINT ["/entrypoint.sh"]
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ void pullLinkContent(
@Parameter(name = "authName", description = "auth name the link used")
@RequestParam(name = "authName", required = false) String authName,
@Parameter(name = "offset", description = "offset in the content")
@RequestParam(name = "offset", required = false) String offset,
@RequestParam(name = "offset", required = false) Long offset,
@Parameter(name = "size", description = "data size")
@RequestParam(name = "size", required = false) String size,
@RequestParam(name = "size", required = false) Long size,
HttpServletResponse httpResponse);

@Operation(summary = "Sign SWDS uri to get a temporarily accessible link",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ public void pullDs(String projectUrl, String datasetUrl, String versionUrl,

@Override
public void pullLinkContent(String projectUrl, String datasetUrl, String versionUrl,
String uri, String authName, String offset, String size, HttpServletResponse httpResponse) {
String uri, String authName, Long offset, Long size, HttpServletResponse httpResponse) {
if (!StringUtils.hasText(datasetUrl) || !StringUtils.hasText(versionUrl)) {
throw new StarwhaleApiException(
new SwValidationException(ValidSubject.DATASET, "please provide name and version for the DS "),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ public DatasetVersionEntity query(String projectUrl, String datasetUrl, String v
return versionEntity;
}


public DataIndexDesc nextData(DataReadRequest request) {
var dataRange = dataLoader.next(request);
return Objects.isNull(dataRange) ? null : DataIndexDesc.builder()
Expand All @@ -309,8 +310,8 @@ public DataIndexDesc nextData(DataReadRequest request) {
.build();
}

public byte[] dataOf(Long datasetId, String uri, String authName, String offset,
String size) {
public byte[] dataOf(Long datasetId, String uri, String authName, Long offset,
Long size) {
return dsFileGetter.dataOf(datasetId, uri, authName, offset, size);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,18 @@ public DsFileGetter(StorageAccessParser storageAccessParser,
this.datasetVersionMapper = datasetVersionMapper;
}

public byte[] dataOf(Long datasetId, String uri, String authName, String offset,
String size) {
public byte[] dataOf(Long datasetId, String uri, String authName, Long offset,
Long size) {
StorageAccessService storageAccessService = storageAccessParser.getStorageAccessServiceFromAuth(
datasetId, uri, authName);
String path = checkPath(datasetId, uri, storageAccessService);
long sizeLong = (long) ColumnTypeScalar.INT64.decode(size);
long offsetLong = (long) ColumnTypeScalar.INT64.decode(offset);
try (InputStream inputStream = validParam(sizeLong, offsetLong) ? storageAccessService.get(path,
offsetLong, sizeLong) : storageAccessService.get(path)) {
try (InputStream inputStream = validParam(size, offset) ? storageAccessService.get(path,
offset, size) : storageAccessService.get(path)) {
return inputStream.readAllBytes();
} catch (IOException e) {
throw new SwProcessException(ErrorType.STORAGE, "error while accessing storage", e);
} catch (IOException ioException) {
log.error("error while accessing storage ", ioException);
throw new SwProcessException(ErrorType.STORAGE,
String.format("error while accessing storage : %s", ioException.getMessage()));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,17 +259,17 @@ public void write(int b) {
});
given(datasetService.query(anyString(), anyString(), anyString()))
.willReturn(DatasetVersionEntity.builder().id(1L).build());
given(datasetService.dataOf(same(1L), anyString(), anyString(), anyString(), anyString()))
given(datasetService.dataOf(same(1L), anyString(), anyString(), any(), any()))
.willReturn(new byte[]{100});

controller.pullLinkContent("p1", "d1", "v1", "", "", "", "", response);
controller.pullLinkContent("p1", "d1", "v1", "", "", 1L, 1L, response);
assertThat(str.toString(), is("100"));

assertThrows(StarwhaleApiException.class,
() -> controller.pullLinkContent("p1", "d1", "", "", "", "", "", response));
() -> controller.pullLinkContent("p1", "d1", "", "", "", 1L, 1L, response));

assertThrows(StarwhaleApiException.class,
() -> controller.pullLinkContent("p1", "", "v1", "", "", "", "", response));
() -> controller.pullLinkContent("p1", "", "v1", "", "", 1L, 1L, response));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,10 +372,10 @@ public void testQuery() {

@Test
public void testDataOf() {
given(dsFileGetter.dataOf(same(1L), anyString(), anyString(), anyString(), anyString()))
given(dsFileGetter.dataOf(same(1L), anyString(), anyString(), any(), any()))
.willReturn(new byte[1]);

var res = dsFileGetter.dataOf(1L, "", "", "", "");
var res = dsFileGetter.dataOf(1L, "", "", 1L, 1L);
assertThat(res, notNullValue());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ public void testDataOf() throws IOException {
when(versionMapper.getVersionById(anyLong())).thenReturn(
DatasetVersionEntity.builder().storagePath("bdc").build());
DsFileGetter fileGetter = new DsFileGetter(storageAccessParser, versionMapper);
byte[] bytes = fileGetter.dataOf(1L, "bdcsd", "", (String) ColumnTypeScalar.INT64.encode(1, false),
(String) ColumnTypeScalar.INT64.encode(1, false));
byte[] bytes = fileGetter.dataOf(1L, "bdcsd", "", 1L, 1L);
Assertions.assertEquals("abc", new String(bytes));

}
Expand Down

0 comments on commit 5e4e011

Please sign in to comment.