Skip to content

Commit

Permalink
feat: Support v2 VLenUTF8 codec (#96)
Browse files Browse the repository at this point in the history
  • Loading branch information
manzt authored Aug 22, 2023
1 parent f233388 commit 97e7df1
Show file tree
Hide file tree
Showing 22 changed files with 428 additions and 108 deletions.
38 changes: 38 additions & 0 deletions .changeset/nice-jars-explode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
"@zarrita/indexing": patch
"@zarrita/ndarray": patch
"zarrita": patch
"@zarrita/core": patch
---

feat: Support `VLenUTF8` codec in v2 and introduce a strided JS "object" Array.

```python
import zarr
import numcodecs
import numpy as np

zarr.create_dataset(
"data.zarr",
data=np.array(
[[["a", "aa"], ["aaa", "aaaa"]],
[["b", "bb"], ["bbb", "bbbb"]]],
dtype=object
),
dtype="|O",
object_codec=numcodecs.VLenUTF8(),
chunks=(1, 1, 2),
)
```

```typescript
import * as zarr from "zarrita";

let store = new zarr.FetchStore("http://localhost:8080/data.zarr");
let arr = await zarr.open.v2(store, { kind: "array" });
let result = await zarr.get(arr);
// {
// data: ["a", "aa", "aaa", "aaaa", "b", "bb", "bbb", "bbbb"],
// shape: [2, 2, 2],
// stride: [4, 2, 1],
// }
```
28 changes: 28 additions & 0 deletions fixtures/v2/data.zarr/.zmetadata
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,34 @@
],
"zarr_format": 2
},
"3d.chunked.O/.zarray": {
"chunks": [
1,
1,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "|O",
"fill_value": 0,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
2,
2,
2
],
"zarr_format": 2
},
"3d.chunked.i2/.zarray": {
"chunks": [
1,
Expand Down
28 changes: 28 additions & 0 deletions fixtures/v2/data.zarr/3d.chunked.O/.zarray
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"chunks": [
1,
1,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "|O",
"fill_value": 0,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
2,
2,
2
],
"zarr_format": 2
}
Binary file added fixtures/v2/data.zarr/3d.chunked.O/0.0.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/0.1.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/1.0.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/1.1.0
Binary file not shown.
146 changes: 116 additions & 30 deletions fixtures/v2/generate-v2.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,174 @@
import zarr
import zarr
import numpy as np
from numcodecs import Zlib, Blosc, LZ4, Zstd
import shutil
from numcodecs import Zlib, Blosc, LZ4, Zstd, VLenUTF8

store = zarr.DirectoryStore('data.zarr')
shutil.rmtree("data.zarr", ignore_errors=True)

store = zarr.DirectoryStore("data.zarr")
root = zarr.open_group(store)
root.attrs['answer'] = 42
root.attrs["answer"] = 42

# 1d.contiguous.zlib.i2
root.create_dataset("1d.contiguous.zlib.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Zlib())
root.create_dataset(
"1d.contiguous.zlib.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Zlib(),
)

# 1d.contiguous.blosc.i2
root.create_dataset("1d.contiguous.blosc.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Blosc())
root.create_dataset(
"1d.contiguous.blosc.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Blosc(),
)

# 1d.contiguous.lz4.i2
root.create_dataset("1d.contiguous.lz4.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=LZ4())
root.create_dataset(
"1d.contiguous.lz4.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=LZ4()
)

# 1d.contiguous.zstd.i2
root.create_dataset("1d.contiguous.zstd.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Zstd())
root.create_dataset(
"1d.contiguous.zstd.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Zstd(),
)

# 1d.contiguous.raw.i2
root.create_dataset("1d.contiguous.raw.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=None)
root.create_dataset(
"1d.contiguous.raw.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=None
)


# 1d.contiguous.i4
root.create_dataset('1d.contiguous.i4', data=[1, 2, 3, 4], dtype='i4', chunks=(4,))
root.create_dataset("1d.contiguous.i4", data=[1, 2, 3, 4], dtype="i4", chunks=(4,))

# 1d.contiguous.u1
root.create_dataset('1d.contiguous.u1', data=[255, 0, 255, 0], dtype='u1', chunks=(4,))
root.create_dataset("1d.contiguous.u1", data=[255, 0, 255, 0], dtype="u1", chunks=(4,))

# 1d.contiguous.<f4
root.create_dataset('1d.contiguous.f4.le', data=[-1000.5, 0, 1000.5, 0], dtype='<f4', chunks=(4,))
root.create_dataset(
"1d.contiguous.f4.le", data=[-1000.5, 0, 1000.5, 0], dtype="<f4", chunks=(4,)
)

# 1d.contiguous.>f4
root.create_dataset('1d.contiguous.f4.be', data=[-1000.5, 0, 1000.5, 0], dtype='>f4', chunks=(4,))
root.create_dataset(
"1d.contiguous.f4.be", data=[-1000.5, 0, 1000.5, 0], dtype=">f4", chunks=(4,)
)

# 1d.contiguous.f8
root.create_dataset('1d.contiguous.f8', data=[1.5, 2.5, 3.5, 4.5], dtype='f8', chunks=(4,))
root.create_dataset(
"1d.contiguous.f8", data=[1.5, 2.5, 3.5, 4.5], dtype="f8", chunks=(4,)
)

# 1d.contiguous.<U13
root.create_dataset('1d.contiguous.U13.le', data=['a', 'b', 'cc', 'd'], dtype='<U13', chunks=(4,))
root.create_dataset(
"1d.contiguous.U13.le", data=["a", "b", "cc", "d"], dtype="<U13", chunks=(4,)
)

# 1d.contiguous.>U13
root.create_dataset('1d.contiguous.U13.be', data=['a', 'b', 'cc', 'd'], dtype='>U13', chunks=(4,))
root.create_dataset(
"1d.contiguous.U13.be", data=["a", "b", "cc", "d"], dtype=">U13", chunks=(4,)
)

# 1d.contiguous.U7
root.create_dataset('1d.contiguous.U7', data=['a', 'b', 'cc', 'd'], dtype='U7', chunks=(4,))
root.create_dataset(
"1d.contiguous.U7", data=["a", "b", "cc", "d"], dtype="U7", chunks=(4,)
)

# 1d.contiguous.S7
root.create_dataset('1d.contiguous.S7', data=['a', 'b', 'cc', 'd'], dtype='S7', chunks=(4,))
root.create_dataset(
"1d.contiguous.S7", data=["a", "b", "cc", "d"], dtype="S7", chunks=(4,)
)

# 1d.contiguous.b1
root.create_dataset('1d.contiguous.b1', data=[True, False, True, False], dtype='b1', chunks=(4,))

root.create_dataset(
"1d.contiguous.b1", data=[True, False, True, False], dtype="b1", chunks=(4,)
)


# 1d.chunked.i2
root.create_dataset('1d.chunked.i2', data=[1, 2, 3, 4], dtype='i2', chunks=(2,))
root.create_dataset("1d.chunked.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(2,))

# 1d.chunked.ragged.i2
root.create_dataset('1d.chunked.ragged.i2', data=[1, 2, 3, 4, 5], dtype='i2', chunks=(2,))
root.create_dataset(
"1d.chunked.ragged.i2", data=[1, 2, 3, 4, 5], dtype="i2", chunks=(2,)
)


# 2d.contiguous.i2
root.create_dataset('2d.contiguous.i2', data=[[1, 2],[3, 4]], dtype='i2', chunks=(2,2))
root.create_dataset(
"2d.contiguous.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(2, 2)
)

# 2d.chunked.i2
root.create_dataset('2d.chunked.i2', data=[[1, 2],[3, 4]], dtype='i2', chunks=(1,1))
root.create_dataset("2d.chunked.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(1, 1))

# 2d.chunked.U7
root.create_dataset('2d.chunked.U7', data=[['a', 'b'],['cc', 'd']], dtype='U7', chunks=(1,1))
root.create_dataset(
"2d.chunked.U7", data=[["a", "b"], ["cc", "d"]], dtype="U7", chunks=(1, 1)
)

# 2d.chunked.ragged.i2
root.create_dataset('2d.chunked.ragged.i2', data=[[1, 2, 3],[4, 5, 6], [7, 8, 9]], dtype='i2', chunks=(2,2))
root.create_dataset(
"2d.chunked.ragged.i2",
data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype="i2",
chunks=(2, 2),
)

# 3d.contiguous.i2
root.create_dataset('3d.contiguous.i2', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(3,3,3))
root.create_dataset(
"3d.contiguous.i2",
data=np.arange(27).reshape(3, 3, 3),
dtype="i2",
chunks=(3, 3, 3),
)

# 3d.chunked.i2
root.create_dataset('3d.chunked.i2', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(1,1,1))
root.create_dataset(
"3d.chunked.i2", data=np.arange(27).reshape(3, 3, 3), dtype="i2", chunks=(1, 1, 1)
)

# 3d.chunked.mixed.i2.C
root.create_dataset('3d.chunked.mixed.i2.C', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(3,3,1))
root.create_dataset(
"3d.chunked.mixed.i2.C",
data=np.arange(27).reshape(3, 3, 3),
dtype="i2",
chunks=(3, 3, 1),
)

# 3d.chunked.mixed.i2.F
arr = root.create_dataset('3d.chunked.mixed.i2.F', data=np.arange(27).reshape(3,3,3), order="F", dtype='i2', chunks=(3,3,1))
arr = root.create_dataset(
"3d.chunked.mixed.i2.F",
data=np.arange(27).reshape(3, 3, 3),
order="F",
dtype="i2",
chunks=(3, 3, 1),
)


# 3d.chunked.O
data = np.array(
[
[["a", "aa"], ["aaa", "aaaa"]],
[["b", "bb"], ["bbb", "bbbb"]],
],
dtype=object,
)
root.create_dataset(
"3d.chunked.O",
data=data,
object_codec=VLenUTF8(),
dtype="O",
chunks=(1, 1, 2),
)

zarr.consolidate_metadata(store)
1 change: 1 addition & 0 deletions packages/core/__tests__/consolidated.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ describe("openConsolidated", () => {
"/2d.chunked.i2" => "array",
"/2d.chunked.ragged.i2" => "array",
"/2d.contiguous.i2" => "array",
"/3d.chunked.O" => "array",
"/3d.chunked.i2" => "array",
"/3d.chunked.mixed.i2.C" => "array",
"/3d.chunked.mixed.i2.F" => "array",
Expand Down
18 changes: 18 additions & 0 deletions packages/core/__tests__/open.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,24 @@ describe("v2", () => {
});
});

describe("3d.chunked.O", async () => {
let arr = await open.v2(store.resolve("/3d.chunked.O"), {
kind: "array",
});
it.each([
[[0, 0, 0], ["a", "aa"]],
[[1, 0, 0], ["b", "bb"]],
[[0, 1, 0], ["aaa", "aaaa"]],
[[1, 1, 0], ["bbb", "bbbb"]],
])(`getChunk(%j) -> %j`, async (index, expected) => {
expect(await arr.getChunk(index)).toStrictEqual({
data: expected,
shape: [1, 1, 2],
stride: [2, 2, 1],
});
});
});

describe("3d.chunked.mixed.i2.C", async () => {
let arr = await open.v2(store.resolve("/3d.chunked.mixed.i2.C"), {
kind: "array",
Expand Down
5 changes: 5 additions & 0 deletions packages/core/__tests__/util.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'number') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "number")).toBe(expected);
});
Expand All @@ -104,6 +105,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'boolean') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "boolean")).toBe(expected);
});
Expand All @@ -122,6 +124,7 @@ describe("is_dtype", () => {
["uint64", true],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'bigint') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "bigint")).toBe(expected);
});
Expand All @@ -140,6 +143,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", true],
["v2:S6", true],
["v2:object", false],
])("is_dtype(%s, 'string') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "string")).toBe(expected);
});
Expand All @@ -158,6 +162,7 @@ describe("is_dtype", () => {
"uint64",
"v2:U6",
"v2:S6",
"v2:object",
])("is_dtype(%s, %s) -> true", (dtype) => {
expect(is_dtype(dtype, dtype)).toBe(true);
});
Expand Down
Loading

0 comments on commit 97e7df1

Please sign in to comment.