Skip to content

Commit

Permalink
feat: Support v2 VLenUTF8 codec (#96)
Browse files Browse the repository at this point in the history
  • Loading branch information
manzt authored Aug 22, 2023
1 parent f233388 commit 97e7df1
Show file tree
Hide file tree
Showing 22 changed files with 428 additions and 108 deletions.
38 changes: 38 additions & 0 deletions .changeset/nice-jars-explode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
"@zarrita/indexing": patch
"@zarrita/ndarray": patch
"zarrita": patch
"@zarrita/core": patch
---

feat: Support `VLenUTF8` codec in v2 and introduce a strided JS "object" Array.

```python
import zarr
import numcodecs
import numpy as np

zarr.create_dataset(
"data.zarr",
data=np.array(
[[["a", "aa"], ["aaa", "aaaa"]],
[["b", "bb"], ["bbb", "bbbb"]]],
dtype=object
),
dtype="|O",
object_codec=numcodecs.VLenUTF8(),
chunks=(1, 1, 2),
)
```

```typescript
import * as zarr from "zarrita";

let store = new zarr.FetchStore("http://localhost:8080/data.zarr");
let arr = await zarr.open.v2(store, { kind: "array" });
let result = await zarr.get(arr);
// {
// data: ["a", "aa", "aaa", "aaaa", "b", "bb", "bbb", "bbbb"],
// shape: [2, 2, 2],
// stride: [4, 2, 1],
// }
```
28 changes: 28 additions & 0 deletions fixtures/v2/data.zarr/.zmetadata
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,34 @@
],
"zarr_format": 2
},
"3d.chunked.O/.zarray": {
"chunks": [
1,
1,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "|O",
"fill_value": 0,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
2,
2,
2
],
"zarr_format": 2
},
"3d.chunked.i2/.zarray": {
"chunks": [
1,
Expand Down
28 changes: 28 additions & 0 deletions fixtures/v2/data.zarr/3d.chunked.O/.zarray
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"chunks": [
1,
1,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "|O",
"fill_value": 0,
"filters": [
{
"id": "vlen-utf8"
}
],
"order": "C",
"shape": [
2,
2,
2
],
"zarr_format": 2
}
Binary file added fixtures/v2/data.zarr/3d.chunked.O/0.0.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/0.1.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/1.0.0
Binary file not shown.
Binary file added fixtures/v2/data.zarr/3d.chunked.O/1.1.0
Binary file not shown.
146 changes: 116 additions & 30 deletions fixtures/v2/generate-v2.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,174 @@
import zarr
import zarr
import numpy as np
from numcodecs import Zlib, Blosc, LZ4, Zstd
import shutil
from numcodecs import Zlib, Blosc, LZ4, Zstd, VLenUTF8

store = zarr.DirectoryStore('data.zarr')
shutil.rmtree("data.zarr", ignore_errors=True)

store = zarr.DirectoryStore("data.zarr")
root = zarr.open_group(store)
root.attrs['answer'] = 42
root.attrs["answer"] = 42

# 1d.contiguous.zlib.i2
root.create_dataset("1d.contiguous.zlib.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Zlib())
root.create_dataset(
"1d.contiguous.zlib.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Zlib(),
)

# 1d.contiguous.blosc.i2
root.create_dataset("1d.contiguous.blosc.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Blosc())
root.create_dataset(
"1d.contiguous.blosc.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Blosc(),
)

# 1d.contiguous.lz4.i2
root.create_dataset("1d.contiguous.lz4.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=LZ4())
root.create_dataset(
"1d.contiguous.lz4.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=LZ4()
)

# 1d.contiguous.zstd.i2
root.create_dataset("1d.contiguous.zstd.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=Zstd())
root.create_dataset(
"1d.contiguous.zstd.i2",
data=[1, 2, 3, 4],
dtype="i2",
chunks=(4,),
compressor=Zstd(),
)

# 1d.contiguous.raw.i2
root.create_dataset("1d.contiguous.raw.i2", data=[1, 2, 3, 4], dtype='i2', chunks=(4,), compressor=None)
root.create_dataset(
"1d.contiguous.raw.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=None
)


# 1d.contiguous.i4
root.create_dataset('1d.contiguous.i4', data=[1, 2, 3, 4], dtype='i4', chunks=(4,))
root.create_dataset("1d.contiguous.i4", data=[1, 2, 3, 4], dtype="i4", chunks=(4,))

# 1d.contiguous.u1
root.create_dataset('1d.contiguous.u1', data=[255, 0, 255, 0], dtype='u1', chunks=(4,))
root.create_dataset("1d.contiguous.u1", data=[255, 0, 255, 0], dtype="u1", chunks=(4,))

# 1d.contiguous.<f4
root.create_dataset('1d.contiguous.f4.le', data=[-1000.5, 0, 1000.5, 0], dtype='<f4', chunks=(4,))
root.create_dataset(
"1d.contiguous.f4.le", data=[-1000.5, 0, 1000.5, 0], dtype="<f4", chunks=(4,)
)

# 1d.contiguous.>f4
root.create_dataset('1d.contiguous.f4.be', data=[-1000.5, 0, 1000.5, 0], dtype='>f4', chunks=(4,))
root.create_dataset(
"1d.contiguous.f4.be", data=[-1000.5, 0, 1000.5, 0], dtype=">f4", chunks=(4,)
)

# 1d.contiguous.f8
root.create_dataset('1d.contiguous.f8', data=[1.5, 2.5, 3.5, 4.5], dtype='f8', chunks=(4,))
root.create_dataset(
"1d.contiguous.f8", data=[1.5, 2.5, 3.5, 4.5], dtype="f8", chunks=(4,)
)

# 1d.contiguous.<U13
root.create_dataset('1d.contiguous.U13.le', data=['a', 'b', 'cc', 'd'], dtype='<U13', chunks=(4,))
root.create_dataset(
"1d.contiguous.U13.le", data=["a", "b", "cc", "d"], dtype="<U13", chunks=(4,)
)

# 1d.contiguous.>U13
root.create_dataset('1d.contiguous.U13.be', data=['a', 'b', 'cc', 'd'], dtype='>U13', chunks=(4,))
root.create_dataset(
"1d.contiguous.U13.be", data=["a", "b", "cc", "d"], dtype=">U13", chunks=(4,)
)

# 1d.contiguous.U7
root.create_dataset('1d.contiguous.U7', data=['a', 'b', 'cc', 'd'], dtype='U7', chunks=(4,))
root.create_dataset(
"1d.contiguous.U7", data=["a", "b", "cc", "d"], dtype="U7", chunks=(4,)
)

# 1d.contiguous.S7
root.create_dataset('1d.contiguous.S7', data=['a', 'b', 'cc', 'd'], dtype='S7', chunks=(4,))
root.create_dataset(
"1d.contiguous.S7", data=["a", "b", "cc", "d"], dtype="S7", chunks=(4,)
)

# 1d.contiguous.b1
root.create_dataset('1d.contiguous.b1', data=[True, False, True, False], dtype='b1', chunks=(4,))

root.create_dataset(
"1d.contiguous.b1", data=[True, False, True, False], dtype="b1", chunks=(4,)
)


# 1d.chunked.i2
root.create_dataset('1d.chunked.i2', data=[1, 2, 3, 4], dtype='i2', chunks=(2,))
root.create_dataset("1d.chunked.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(2,))

# 1d.chunked.ragged.i2
root.create_dataset('1d.chunked.ragged.i2', data=[1, 2, 3, 4, 5], dtype='i2', chunks=(2,))
root.create_dataset(
"1d.chunked.ragged.i2", data=[1, 2, 3, 4, 5], dtype="i2", chunks=(2,)
)


# 2d.contiguous.i2
root.create_dataset('2d.contiguous.i2', data=[[1, 2],[3, 4]], dtype='i2', chunks=(2,2))
root.create_dataset(
"2d.contiguous.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(2, 2)
)

# 2d.chunked.i2
root.create_dataset('2d.chunked.i2', data=[[1, 2],[3, 4]], dtype='i2', chunks=(1,1))
root.create_dataset("2d.chunked.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(1, 1))

# 2d.chunked.U7
root.create_dataset('2d.chunked.U7', data=[['a', 'b'],['cc', 'd']], dtype='U7', chunks=(1,1))
root.create_dataset(
"2d.chunked.U7", data=[["a", "b"], ["cc", "d"]], dtype="U7", chunks=(1, 1)
)

# 2d.chunked.ragged.i2
root.create_dataset('2d.chunked.ragged.i2', data=[[1, 2, 3],[4, 5, 6], [7, 8, 9]], dtype='i2', chunks=(2,2))
root.create_dataset(
"2d.chunked.ragged.i2",
data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype="i2",
chunks=(2, 2),
)

# 3d.contiguous.i2
root.create_dataset('3d.contiguous.i2', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(3,3,3))
root.create_dataset(
"3d.contiguous.i2",
data=np.arange(27).reshape(3, 3, 3),
dtype="i2",
chunks=(3, 3, 3),
)

# 3d.chunked.i2
root.create_dataset('3d.chunked.i2', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(1,1,1))
root.create_dataset(
"3d.chunked.i2", data=np.arange(27).reshape(3, 3, 3), dtype="i2", chunks=(1, 1, 1)
)

# 3d.chunked.mixed.i2.C
root.create_dataset('3d.chunked.mixed.i2.C', data=np.arange(27).reshape(3,3,3), dtype='i2', chunks=(3,3,1))
root.create_dataset(
"3d.chunked.mixed.i2.C",
data=np.arange(27).reshape(3, 3, 3),
dtype="i2",
chunks=(3, 3, 1),
)

# 3d.chunked.mixed.i2.F
arr = root.create_dataset('3d.chunked.mixed.i2.F', data=np.arange(27).reshape(3,3,3), order="F", dtype='i2', chunks=(3,3,1))
arr = root.create_dataset(
"3d.chunked.mixed.i2.F",
data=np.arange(27).reshape(3, 3, 3),
order="F",
dtype="i2",
chunks=(3, 3, 1),
)


# 3d.chunked.O
data = np.array(
[
[["a", "aa"], ["aaa", "aaaa"]],
[["b", "bb"], ["bbb", "bbbb"]],
],
dtype=object,
)
root.create_dataset(
"3d.chunked.O",
data=data,
object_codec=VLenUTF8(),
dtype="O",
chunks=(1, 1, 2),
)

zarr.consolidate_metadata(store)
1 change: 1 addition & 0 deletions packages/core/__tests__/consolidated.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ describe("openConsolidated", () => {
"/2d.chunked.i2" => "array",
"/2d.chunked.ragged.i2" => "array",
"/2d.contiguous.i2" => "array",
"/3d.chunked.O" => "array",
"/3d.chunked.i2" => "array",
"/3d.chunked.mixed.i2.C" => "array",
"/3d.chunked.mixed.i2.F" => "array",
Expand Down
18 changes: 18 additions & 0 deletions packages/core/__tests__/open.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,24 @@ describe("v2", () => {
});
});

describe("3d.chunked.O", async () => {
let arr = await open.v2(store.resolve("/3d.chunked.O"), {
kind: "array",
});
it.each([
[[0, 0, 0], ["a", "aa"]],
[[1, 0, 0], ["b", "bb"]],
[[0, 1, 0], ["aaa", "aaaa"]],
[[1, 1, 0], ["bbb", "bbbb"]],
])(`getChunk(%j) -> %j`, async (index, expected) => {
expect(await arr.getChunk(index)).toStrictEqual({
data: expected,
shape: [1, 1, 2],
stride: [2, 2, 1],
});
});
});

describe("3d.chunked.mixed.i2.C", async () => {
let arr = await open.v2(store.resolve("/3d.chunked.mixed.i2.C"), {
kind: "array",
Expand Down
5 changes: 5 additions & 0 deletions packages/core/__tests__/util.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'number') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "number")).toBe(expected);
});
Expand All @@ -104,6 +105,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'boolean') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "boolean")).toBe(expected);
});
Expand All @@ -122,6 +124,7 @@ describe("is_dtype", () => {
["uint64", true],
["v2:U6", false],
["v2:S6", false],
["v2:object", false],
])("is_dtype(%s, 'bigint') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "bigint")).toBe(expected);
});
Expand All @@ -140,6 +143,7 @@ describe("is_dtype", () => {
["uint64", false],
["v2:U6", true],
["v2:S6", true],
["v2:object", false],
])("is_dtype(%s, 'string') -> %s", (dtype, expected) => {
expect(is_dtype(dtype, "string")).toBe(expected);
});
Expand All @@ -158,6 +162,7 @@ describe("is_dtype", () => {
"uint64",
"v2:U6",
"v2:S6",
"v2:object",
])("is_dtype(%s, %s) -> true", (dtype) => {
expect(is_dtype(dtype, dtype)).toBe(true);
});
Expand Down
Loading

0 comments on commit 97e7df1

Please sign in to comment.