Labels: Area - Out of core 💾 (Working with on disk data)
Description
Please make sure these conditions are met
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of scanpy.
- (optional) I have confirmed this bug exists on the main branch of scanpy.
What happened?
The Dask-based QC either fails for smaller chunk sizes or runs out of memory and crashes for larger ones. This happens with zarr2, zarr3, and zarrs. The dataset has 1.3 million cells.
When setting up a minimal client:

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=1, threads_per_worker=1, processes=True)
client = Client(cluster)

I get a different error. To me it almost looks like Dask wants to do the whole reduction in memory.
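One way to probe that hypothesis, assuming the adata object built in the minimal code sample below, would be to construct only the internal reduction scanpy runs first and inspect it lazily. This is just a sketch using scanpy's internal axis_nnz helper, which shows up in both tracebacks:

from scanpy._utils import axis_nnz  # internal helper visible in the tracebacks below

# With small chunk sizes this call already raises the ValueError shown below;
# with larger chunks it constructs, and its layout can be inspected without
# computing anything.
lazy_nnz = axis_nnz(adata.X, axis=1)
print(lazy_nnz.ndim)        # I would expect 1 (one count per cell)
print(lazy_nnz.numblocks)   # and one block per row chunk of X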
Minimal code sample
import anndata as ad
import scanpy as sc
import zarr
import h5py
from packaging.version import parse as parse_version

#import zarrs
#zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})

# anndata renamed read_elem_as_dask to read_elem_lazy in 0.12.0rc1
if parse_version(ad.__version__) < parse_version("0.12.0rc1"):
    from anndata.experimental import read_elem_as_dask as read_dask
else:
    from anndata.experimental import read_elem_lazy as read_dask

SPARSE_CHUNK_SIZE = 20_000

data_pth = "zarr/nvidia_1.3M.zarr/"  # 1.3 million cells
#data_pth = "zarr/nvidia_1.3M_v3.zarr/"  # 1.3 million cells

f = zarr.open(data_pth)
X = f["X"]
shape = X.attrs["shape"]
adata = ad.AnnData(
    X=read_dask(X, (SPARSE_CHUNK_SIZE, shape[1])),
    obs=ad.io.read_elem(f["obs"]),
    var=ad.io.read_elem(f["var"]),
)
adata.var_names_make_unique()
sc.pp.calculate_qc_metrics(adata)
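For context, at SPARSE_CHUNK_SIZE = 20_000 the roughly 1.3 million cells come out to about 65 row blocks, which is in the same ballpark as the 66 dimensions reported in the error below. A quick check on the lazily loaded matrix (assuming the adata built above):

# Quick sanity check on the lazily loaded matrix built above.
print(adata.X.shape)              # (~1.3 million, n_vars)
print(adata.X.numblocks)          # roughly (65, 1): row blocks x column blocks
print(len(adata.X.to_delayed()))  # ~65 -- the factor multiplied into `chunks` in the axis_nnz frame below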
Error output
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/utils.py:88, in meta_from_array(x, ndim, dtype)
87 if ndim > x.ndim:
---> 88 meta = meta[(Ellipsis,) + tuple(None for _ in range(ndim - meta.ndim))]
89 meta = meta[tuple(slice(0, 0, None) for _ in range(meta.ndim))]
IndexError: number of dimensions must be within [0, 64], indexing result would have 66
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
File <timed eval>:1
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/preprocessing/_qc.py:311, in calculate_qc_metrics(adata, expr_type, var_type, qc_vars, percent_top, layer, use_raw, inplace, log1p, parallel)
308 if isinstance(qc_vars, str):
309 qc_vars = [qc_vars]
--> 311 obs_metrics = describe_obs(
312 adata,
313 expr_type=expr_type,
314 var_type=var_type,
315 qc_vars=qc_vars,
316 percent_top=percent_top,
317 inplace=inplace,
318 X=X,
319 log1p=log1p,
320 )
321 var_metrics = describe_var(
322 adata,
323 expr_type=expr_type,
(...) 327 log1p=log1p,
328 )
330 if not inplace:
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/preprocessing/_qc.py:112, in describe_obs(adata, expr_type, var_type, qc_vars, percent_top, layer, use_raw, log1p, inplace, X, parallel)
109 X.eliminate_zeros()
110 obs_metrics = pd.DataFrame(index=adata.obs_names)
111 obs_metrics[f"n_{var_type}_by_{expr_type}"] = materialize_as_ndarray(
--> 112 axis_nnz(X, axis=1)
113 )
114 if log1p:
115 obs_metrics[f"log1p_n_{var_type}_by_{expr_type}"] = np.log1p(
116 obs_metrics[f"n_{var_type}_by_{expr_type}"]
117 )
File ~/micromamba/envs/rapids-25.04/lib/python3.12/functools.py:912, in singledispatch.<locals>.wrapper(*args, **kw)
908 if not args:
909 raise TypeError(f'{funcname} requires at least '
910 '1 positional argument')
--> 912 return dispatch(args[0].__class__)(*args, **kw)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/_utils/__init__.py:773, in _(X, axis)
771 @axis_nnz.register(DaskArray)
772 def _(X: DaskArray, axis: Literal[0, 1]) -> DaskArray:
--> 773 return X.map_blocks(
774 partial(axis_nnz, axis=axis),
775 dtype=np.int64,
776 meta=np.array([], dtype=np.int64),
777 drop_axis=0,
778 chunks=len(X.to_delayed()) * (X.chunksize[int(not axis)],),
779 )
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/core.py:2763, in Array.map_blocks(self, func, *args, **kwargs)
2761 @wraps(map_blocks)
2762 def map_blocks(self, func, *args, **kwargs):
-> 2763 return map_blocks(func, self, *args, **kwargs)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/core.py:893, in map_blocks(func, name, token, dtype, chunks, drop_axis, new_axis, enforce_ndim, meta, *args, **kwargs)
877 out = blockwise(
878 apply_and_enforce,
879 out_ind,
(...) 890 **kwargs,
891 )
892 else:
--> 893 out = blockwise(
894 func,
895 out_ind,
896 *concat(argpairs),
897 name=name,
898 new_axes=new_axes,
899 dtype=dtype,
900 concatenate=True,
901 align_arrays=False,
902 adjust_chunks=adjust_chunks,
903 meta=meta,
904 **kwargs,
905 )
907 extra_argpairs = []
908 extra_names = []
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/blockwise.py:290, in blockwise(func, out_ind, name, token, dtype, adjust_chunks, new_axes, align_arrays, concatenate, meta, *args, **kwargs)
287 from dask.array.utils import compute_meta
289 meta = compute_meta(func, dtype, *args[::2], **kwargs)
--> 290 return new_da_object(graph, out, chunks, meta=meta, dtype=dtype)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/core.py:6061, in new_da_object(dsk, name, chunks, meta, dtype)
6059 return from_graph(dict(dsk), meta, divisions, dsk.layers[name].keys(), name)
6060 else:
-> 6061 return Array(dsk, name=name, chunks=chunks, meta=meta, dtype=dtype)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/core.py:1379, in Array.__new__(cls, dask, name, chunks, dtype, meta, shape)
1377 if self.chunks is None:
1378 raise ValueError(CHUNKS_NONE_ERROR_MESSAGE)
-> 1379 self._meta = meta_from_array(meta, ndim=self.ndim, dtype=dtype)
1381 for plugin in config.get("array_plugins", ()):
1382 result = plugin(self)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/array/utils.py:97, in meta_from_array(x, ndim, dtype)
95 meta = np.ma.array(np.empty((0,) * ndim, dtype=dtype or x.dtype), mask=True)
96 except Exception:
---> 97 meta = np.empty((0,) * ndim, dtype=dtype or x.dtype)
99 if np.isscalar(meta):
100 meta = np.array(meta)
ValueError: maximum supported dimension for an ndarray is currently 64, found 66
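If I read the axis_nnz frame above correctly, the chunks argument is built as len(X.to_delayed()) * (X.chunksize[...],), i.e. a flat tuple with one entry per row block. A tiny self-contained sketch (my assumption about the mechanism, using a small dense array rather than the real data) of how dask's map_blocks reads such a tuple as one output dimension per entry:

import numpy as np
import dask.array as da

x = da.ones((100, 10), chunks=(20, 10))        # 5 row blocks, 1 column block
n_row_blocks = len(x.to_delayed())             # 5
bad_chunks = n_row_blocks * (x.chunksize[0],)  # (20, 20, 20, 20, 20)

out = x.map_blocks(
    lambda b: (b != 0).sum(axis=1),
    dtype=np.int64,
    meta=np.array([], dtype=np.int64),
    drop_axis=0,
    chunks=bad_chunks,   # mirrors the call in the axis_nnz frame above
)
print(out.ndim)          # 5 instead of the intended 1

With the 65-odd row blocks of the 1.3M-cell dataset, that interpretation would push the output past numpy's 64-dimension limit, which would match the ValueError above.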
Error output (minimal client)
2025-05-16 12:36:36,178 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 50.23 GiB -- Worker memory limit: 62.70 GiB
2025-05-16 12:37:10,396 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 50.33 GiB -- Worker memory limit: 62.70 GiB
2025-05-16 12:37:45,139 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 50.23 GiB -- Worker memory limit: 62.70 GiB
2025-05-16 12:38:19,520 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 50.18 GiB -- Worker memory limit: 62.70 GiB
---------------------------------------------------------------------------
KilledWorker Traceback (most recent call last)
File <timed eval>:1
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/preprocessing/_qc.py:311, in calculate_qc_metrics(adata, expr_type, var_type, qc_vars, percent_top, layer, use_raw, inplace, log1p, parallel)
308 if isinstance(qc_vars, str):
309 qc_vars = [qc_vars]
--> 311 obs_metrics = describe_obs(
312 adata,
313 expr_type=expr_type,
314 var_type=var_type,
315 qc_vars=qc_vars,
316 percent_top=percent_top,
317 inplace=inplace,
318 X=X,
319 log1p=log1p,
320 )
321 var_metrics = describe_var(
322 adata,
323 expr_type=expr_type,
(...) 327 log1p=log1p,
328 )
330 if not inplace:
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/preprocessing/_qc.py:111, in describe_obs(adata, expr_type, var_type, qc_vars, percent_top, layer, use_raw, log1p, inplace, X, parallel)
109 X.eliminate_zeros()
110 obs_metrics = pd.DataFrame(index=adata.obs_names)
--> 111 obs_metrics[f"n_{var_type}_by_{expr_type}"] = materialize_as_ndarray(
112 axis_nnz(X, axis=1)
113 )
114 if log1p:
115 obs_metrics[f"log1p_n_{var_type}_by_{expr_type}"] = np.log1p(
116 obs_metrics[f"n_{var_type}_by_{expr_type}"]
117 )
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/scanpy/preprocessing/_distributed.py:40, in materialize_as_ndarray(a)
38 """Compute distributed arrays and convert them to numpy ndarrays."""
39 if isinstance(a, DaskArray):
---> 40 return a.compute()
41 if not isinstance(a, tuple):
42 return np.asarray(a)
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/base.py:374, in DaskMethodsMixin.compute(self, **kwargs)
350 def compute(self, **kwargs):
351 """Compute this dask collection
352
353 This turns a lazy Dask collection into its in-memory equivalent.
(...) 372 dask.compute
373 """
--> 374 (result,) = compute(self, traverse=False, **kwargs)
375 return result
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/dask/base.py:662, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
659 postcomputes.append(x.__dask_postcompute__())
661 with shorten_traceback():
--> 662 results = schedule(dsk, keys, **kwargs)
664 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~/micromamba/envs/rapids-25.04/lib/python3.12/site-packages/distributed/client.py:2426, in Client._gather(self, futures, errors, direct, local_worker)
2424 exception = st.exception
2425 traceback = st.traceback
-> 2426 raise exception.with_traceback(traceback)
2427 if errors == "skip":
2428 bad_keys.add(key)
KilledWorker: Attempted to run task ('axis_nnz-89d0f4bf8205275d0894347154890e5e', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) on 4 different workers, but all those workers died while running it. The last worker that attempt to run the task was tcp://127.0.0.1:45199. Inspecting worker logs is often a good next step to diagnose what went wrong. For more information see https://distributed.dask.org/en/stable/killed.html.
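To separate the chunk specification from the reduction itself, here is a hedged isolation test I would try (my own sketch, assuming the adata from the minimal code sample above; not a fix): compute only the per-cell nonzero counts with an explicitly one-dimensional chunks spec and watch whether worker memory stays bounded.

# Hypothetical isolation test, assuming `adata` from the minimal code sample above.
import numpy as np

def row_nnz(block):
    # block is a scipy.sparse chunk loaded lazily by read_elem_lazy / read_elem_as_dask
    return np.asarray(block.getnnz(axis=1), dtype=np.int64)

nnz = adata.X.map_blocks(
    row_nnz,
    dtype=np.int64,
    drop_axis=1,                  # the gene axis is reduced away
    chunks=(adata.X.chunks[0],),  # 1-D spec: one output chunk per row block
    meta=np.array([], dtype=np.int64),
)
print(nnz.compute()[:5])

If this stays within the worker limit, the OOM would point at how the reduction is expressed rather than at the data volume itself.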
Versions
| Package | Version |
| ----------- | ----------------- |
| dask | 2025.2.0 |
| dask-cuda | 25.4.0 (25.04.00) |
| distributed | 2025.2.0 |
| rmm-cu12 | 25.4.0 (25.04.00) |
| cupy | 14.0.0a1 |
| anndata | 0.12.0rc1 |
| scanpy | 1.11.1 |
| packaging | 25.0 |
| zarr | 2.18.7 |
| h5py | 3.13.0 |
| Dependency | Version |
| ---------------------- | ----------------- |
| typing_extensions | 4.13.2 |
| parso | 0.8.4 |
| debugpy | 1.8.14 |
| wrapt | 1.17.2 |
| zict | 3.0.0 |
| pandas | 2.2.3 |
| Jinja2 | 3.1.6 |
| libcuvs-cu12 | 25.4.0 (25.04.00) |
| click | 8.1.8 |
| python-dateutil | 2.9.0.post0 |
| ipython | 9.1.0 |
| pillow | 11.2.1 |
| sortedcontainers | 2.4.0 |
| matplotlib | 3.10.1 |
| pluggy | 1.5.0 |
| cuda-bindings | 12.8.0 |
| cloudpickle | 3.1.1 |
| psutil | 7.0.0 |
| libcudf-cu12 | 25.4.0 (25.04.00) |
| prompt_toolkit | 3.0.51 |
| stack-data | 0.6.3 |
| bokeh | 3.7.2 |
| setuptools | 79.0.1 |
| tblib | 3.1.0 |
| rapids-dask-dependency | 25.4.0 |
| scipy | 1.15.2 |
| cachetools | 5.5.2 |
| threadpoolctl | 3.6.0 |
| numcodecs | 0.15.1 |
| tqdm | 4.67.1 |
| tornado | 6.4.2 |
| statsmodels | 0.14.4 |
| libkvikio-cu12 | 25.4.0 (25.04.00) |
| nvidia-ml-py | 12.570.86 |
| librmm-cu12 | 25.4.0 (25.04.00) |
| libraft-cu12 | 25.4.0 (25.04.00) |
| platformdirs | 4.3.7 |
| rapids-logger | 0.1.1 |
| pytz | 2025.2 |
| Cython | 3.0.12 |
| natsort | 8.4.0 |
| nvtx | 0.2.11 |
| jupyter_client | 8.6.3 |
| session-info2 | 0.1.2 |
| crc32c | 2.7.1 |
| iniconfig | 2.1.0 |
| pylibcudf-cu12 | 25.4.0 |
| traitlets | 5.14.3 |
| pyparsing | 3.2.3 |
| xyzservices | 2025.4.0 |
| dask-cudf-cu12 | 25.4.0 (25.04.00) |
| llvmlite | 0.43.0 |
| msgpack | 1.1.0 |
| pytest | 8.3.5 |
| wcwidth | 0.2.13 |
| cycler | 0.12.1 |
| fsspec | 2025.3.2 |
| cudf-cu12 | 25.4.0 (25.04.00) |
| joblib | 1.4.2 |
| scikit-learn | 1.5.2 |
| libcuml-cu12 | 25.4.0 (25.04.00) |
| decorator | 5.2.1 |
| numpy | 2.0.2 |
| Pygments | 2.19.1 |
| defusedxml | 0.7.1 |
| patsy | 1.0.1 |
| kiwisolver | 1.4.8 |
| treelite | 4.4.1 |
| pyarrow | 19.0.1 |
| pure_eval | 0.2.3 |
| cuml-cu12 | 25.4.0 (25.04.00) |
| jupyter_core | 5.7.2 |
| Deprecated | 1.2.18 |
| toolz | 1.0.0 |
| ipykernel | 6.29.5 |
| numba | 0.60.0 |
| locket | 1.0.0 |
| MarkupSafe | 3.0.2 |
| prometheus_client | 0.21.1 |
| charset-normalizer | 3.4.1 |
| pylibraft-cu12 | 25.4.0 (25.04.00) |
| PyYAML | 6.0.2 |
| legacy-api-wrap | 1.4.1 |
| pyzmq | 26.4.0 |
| comm | 0.2.2 |
| fastrlock | 0.8.3 |
| pynvjitlink-cu12 | 0.5.2 |
| jedi | 0.19.2 |
| executing | 2.2.0 |
| six | 1.17.0 |
| asciitree | 0.3.3 |
| ipywidgets | 8.1.6 |
| asttokens | 3.0.0 |
| zstandard | 0.23.0 |
| Component | Info |
| --------- | ------------------------------------------------------------------------------ |
| Python | 3.12.10 | packaged by conda-forge | (main, Apr 10 2025, 22:21:13) [GCC 13.3.0] |
| OS | Linux-6.8.0-59-generic-x86_64-with-glibc2.35 |
| CPU | 32 logical CPU cores, x86_64 |
| GPU | ID: 0, NVIDIA GeForce RTX 3090, Driver: 575.51.03, Memory: 24576 MiB |
| Updated | 2025-05-16 10:31 |