Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOC: Add memory-mapping example to storage guide #2737

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions docs/user-guide/storage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,34 @@ Zarr data (metadata and chunks) to a dictionary.:
>>> zarr.create_array(store=store, shape=(2,), dtype='float64')
<Array memory://... shape=(2,) dtype=float64>

Memory-Mapped Store
~~~~~~~~~~~~~~~~~~~~

For performance optimization when working with uncompressed data, you can create a memory-mapped store by subclassing :class:`zarr.storage.LocalStore`.
Memory mapping allows direct access to portions of chunk data without loading entire chunks into memory, which can be beneficial when you need to
read small slices from large chunks.:

>>> import mmap
>>> from zarr.storage import LocalStore
>>>
>>> class MemoryMappedDirectoryStore(LocalStore):
... def _fromfile(self, fn):
... with open(fn, "rb") as fh:
... return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ))
>>>
>>> # Create an array with large chunks
>>> z = zarr.create_array('data/example.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='float64')
>>> z[:] = 42 # Fill with test data
>>>
>>> # Open with memory mapping for efficient access
>>> mmap_store = MemoryMappedDirectoryStore('data/example.zarr')
>>> z = zarr.open_array(store=mmap_store)
>>>
>>> # Access small slices efficiently
>>> chunk_data = z[500:600, 500:600] # Only maps the needed portion into memory
>>> chunk_data[0, 0] # Verify data
42.0

.. _user-guide-custom-stores:

Developing custom stores
Expand Down
83 changes: 83 additions & 0 deletions tests/test_store/test_mmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from __future__ import annotations

import mmap
from typing import TYPE_CHECKING

import pytest

import zarr
from zarr.core.buffer import Buffer, cpu
from zarr.storage import LocalStore
from zarr.testing.store import StoreTests

if TYPE_CHECKING:
import pathlib


class MemoryMappedDirectoryStore(LocalStore):
def _fromfile(self, fn: str) -> memoryview:
with open(fn, "rb") as fh:
return memoryview(mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ))
Comment on lines +18 to +20
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think _fromfile will ever be invoked, since it's not part of the LocalStore API



class TestMemoryMappedDirectoryStore(StoreTests[MemoryMappedDirectoryStore, cpu.Buffer]):
store_cls = MemoryMappedDirectoryStore
buffer_cls = cpu.Buffer

async def get(self, store: MemoryMappedDirectoryStore, key: str) -> Buffer:
return self.buffer_cls.from_bytes((store.root / key).read_bytes())

async def set(self, store: MemoryMappedDirectoryStore, key: str, value: Buffer) -> None:
parent = (store.root / key).parent
if not parent.exists():
parent.mkdir(parents=True)
(store.root / key).write_bytes(value.to_bytes())

@pytest.fixture
def store_kwargs(self, tmpdir) -> dict[str, str]:
return {"root": str(tmpdir)}

def test_store_repr(self, store: MemoryMappedDirectoryStore) -> None:
assert str(store) == f"file://{store.root.as_posix()}"

def test_store_supports_writes(self, store: MemoryMappedDirectoryStore) -> None:
assert store.supports_writes

def test_store_supports_partial_writes(self, store: MemoryMappedDirectoryStore) -> None:
assert store.supports_partial_writes

def test_store_supports_listing(self, store: MemoryMappedDirectoryStore) -> None:
assert store.supports_listing

async def test_empty_with_empty_subdir(self, store: MemoryMappedDirectoryStore) -> None:
assert await store.is_empty("")
(store.root / "foo/bar").mkdir(parents=True)
assert await store.is_empty("")

def test_creates_new_directory(self, tmp_path: pathlib.Path):
target = tmp_path.joinpath("a", "b", "c")
assert not target.exists()

store = self.store_cls(root=target)
zarr.group(store=store)

async def test_mmap_slice_reads(self, store: MemoryMappedDirectoryStore) -> None:
"""Test reading slices with memory mapping"""
# Create array with large chunks
z = zarr.create_array(store=store, shape=(2000, 2000), chunks=(1000, 1000), dtype="float64")
# Write test data
data = zarr.full(shape=(2000, 2000), chunks=(1000, 1000), fill_value=42.0, dtype="float64")
z[:] = data[:]

# Test reading various slices
slices = [
# Within single chunk
(slice(100, 200), slice(100, 200)),
# Across chunk boundaries
(slice(900, 1100), slice(900, 1100)),
# Full chunk
(slice(0, 1000), slice(0, 1000)),
]

for test_slice in slices:
assert (z[test_slice] == data[test_slice]).all()
Loading