Commit
Merge branch 'main' into fix-clip-dask-sparse
flying-sheep authored Nov 11, 2024
2 parents ebada2f + a227c12 commit ae8bd60
Showing 6 changed files with 137 additions and 26 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/3284.performance.md
@@ -0,0 +1 @@
* Speed up {func}`~scanpy.pp.regress_out` {smaller}`P Ashish, P Angerer & S Dicks`
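For context, a minimal usage sketch of the function being sped up, on the bundled demo dataset (the covariate names match those used in the tests below):

```python
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# Regress out technical covariates stored in .obs
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"])
```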
2 changes: 1 addition & 1 deletion notebooks
90 changes: 66 additions & 24 deletions src/scanpy/preprocessing/_simple.py
@@ -8,7 +8,7 @@
import warnings
from functools import singledispatch
from itertools import repeat
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, TypeVar

import numba
import numpy as np
@@ -18,7 +18,7 @@
from sklearn.utils import check_array, sparsefuncs

from .. import logging as logg
from .._compat import old_positionals
from .._compat import njit, old_positionals
from .._settings import settings as sett
from .._utils import (
_check_array_function_arguments,
@@ -31,6 +31,7 @@
)
from ..get import _get_obs_rep, _set_obs_rep
from ._distributed import materialize_as_ndarray
from ._utils import _to_dense

# install dask if available
try:
@@ -624,6 +625,34 @@ def normalize_per_cell(
return X if copy else None


DT = TypeVar("DT")


@njit
def get_resid(
data: np.ndarray,
regressor: np.ndarray,
coeff: np.ndarray,
) -> np.ndarray:
for i in numba.prange(data.shape[0]):
data[i] -= regressor[i] @ coeff
return data


def numpy_regress_out(
data: np.ndarray,
regressor: np.ndarray,
) -> np.ndarray:
"""\
Numba kernel for regress out unwanted sorces of variantion.
Finding coefficient using Linear regression (Linear Least Squares).
"""
inv_gram_matrix = np.linalg.inv(regressor.T @ regressor)
coeff = inv_gram_matrix @ (regressor.T @ data)
data = get_resid(data, regressor, coeff)
return data
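As a sanity check, the normal-equations solve above agrees with NumPy's standard least-squares solver; a minimal sketch with made-up data:

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 5)).astype(np.float32)         # cells x genes
regressor = np.c_[np.ones(100), rng.normal(size=(100, 2))]  # intercept + 2 covariates

# Normal equations, as in numpy_regress_out above
coeff = np.linalg.inv(regressor.T @ regressor) @ (regressor.T @ data)
resid = data - regressor @ coeff

# Reference: the standard least-squares solver
coeff_ref, *_ = np.linalg.lstsq(regressor, data, rcond=None)
np.testing.assert_allclose(resid, data - regressor @ coeff_ref, atol=1e-5)
```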


@old_positionals("layer", "n_jobs", "copy")
def regress_out(
adata: AnnData,
@@ -678,7 +707,6 @@ def regress_out(

if issparse(X):
logg.info(" sparse input is densified and may lead to high memory use")
X = X.toarray()

n_jobs = sett.n_jobs if n_jobs is None else n_jobs

@@ -695,6 +723,8 @@
)
logg.debug("... regressing on per-gene means within categories")
regressors = np.zeros(X.shape, dtype="float32")
X = _to_dense(X, order="F") if issparse(X) else X
# TODO figure out if we should use a numba kernel for this
for category in adata.obs[keys[0]].cat.categories:
mask = (category == adata.obs[keys[0]]).values
for ix, x in enumerate(X.T):
@@ -707,32 +737,44 @@

# add column of ones at index 0 (first column)
regressors.insert(0, "ones", 1.0)
regressors = regressors.to_numpy()

len_chunk = int(np.ceil(min(1000, X.shape[1]) / n_jobs))
n_chunks = int(np.ceil(X.shape[1] / len_chunk))
    # If the regressors are not categorical and the Gram matrix is not
    # singular, use the numpy_regress_out shortcut.
if not variable_is_categorical and np.linalg.det(regressors.T @ regressors) != 0:
X = _to_dense(X, order="C") if issparse(X) else X
res = numpy_regress_out(X, regressors)

# split the adata.X matrix by columns in chunks of size n_chunk
# (the last chunk could be of smaller size than the others)
chunk_list = np.array_split(X, n_chunks, axis=1)
regressors_chunk = (
np.array_split(regressors, n_chunks, axis=1)
if variable_is_categorical
else repeat(regressors)
)
    # For a categorical variable, or if the above checks failed,
    # we fall back to the GLM implementation of regression.
else:
# split the adata.X matrix by columns in chunks of size n_chunk
# (the last chunk could be of smaller size than the others)
len_chunk = int(np.ceil(min(1000, X.shape[1]) / n_jobs))
n_chunks = int(np.ceil(X.shape[1] / len_chunk))
X = _to_dense(X, order="F") if issparse(X) else X
chunk_list = np.array_split(X, n_chunks, axis=1)
regressors_chunk = (
np.array_split(regressors, n_chunks, axis=1)
if variable_is_categorical
else repeat(regressors)
)

    # each task is passed a data chunk (e.g. `adata.X[:, 0:100]`) and the regressors.
# This data will be passed to each of the jobs.
# TODO: figure out how to test that this doesn't oversubscribe resources
res = Parallel(n_jobs=n_jobs)(
delayed(_regress_out_chunk)(
data_chunk, regres, variable_is_categorical=variable_is_categorical
        # each task is passed a data chunk (e.g. `adata.X[:, 0:100]`) and the regressors.
# This data will be passed to each of the jobs.
# TODO: figure out how to test that this doesn't oversubscribe resources
res = Parallel(n_jobs=n_jobs)(
delayed(_regress_out_chunk)(
data_chunk, regres, variable_is_categorical=variable_is_categorical
)
for data_chunk, regres in zip(chunk_list, regressors_chunk, strict=False)
)
for data_chunk, regres in zip(chunk_list, regressors_chunk, strict=False)
)

# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
_set_obs_rep(adata, np.vstack(res).T, layer=layer)
# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
res = np.vstack(res).T

_set_obs_rep(adata, res, layer=layer)
logg.info(" finished", time=start)
return adata if copy else None
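The column-chunking and reassembly in the fallback branch can be checked in isolation; a minimal sketch where a plain transpose stands in for `_regress_out_chunk`'s gene-major output:

```python
import numpy as np

X = np.arange(20, dtype=np.float32).reshape(4, 5)  # cells x genes
chunk_list = np.array_split(X, 2, axis=1)          # split by columns, as above

# _regress_out_chunk returns one result per gene column, so the results
# come back gene-major; vstack + transpose restores cells x genes.
res = [chunk.T for chunk in chunk_list]            # identity stand-in
out = np.vstack(res).T
np.testing.assert_array_equal(out, X)
```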

43 changes: 43 additions & 0 deletions src/scanpy/preprocessing/_utils.py
@@ -160,3 +160,46 @@ def sample_comb(
np.prod(dims), nsamp, random_state=random_state, method=method
)
return np.vstack(np.unravel_index(idx, dims)).T


def _to_dense(
X: sparse.spmatrix,
order: Literal["C", "F"] = "C",
) -> NDArray:
"""\
Numba kernel for np.toarray() function
"""
out = np.zeros(X.shape, dtype=X.dtype, order=order)
if X.format == "csr":
_to_dense_csr_numba(X.indptr, X.indices, X.data, out, X.shape)
elif X.format == "csc":
_to_dense_csc_numba(X.indptr, X.indices, X.data, out, X.shape)
else:
out = X.toarray(order=order)
return out
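A quick equivalence check of the wrapper against SciPy's own densification (a sketch; assumes a scanpy checkout containing this commit, so the private helper is importable):

```python
import numpy as np
from scipy import sparse

from scanpy.preprocessing._utils import _to_dense

x = sparse.random(50, 20, density=0.1, format="csr", dtype=np.float32)
dense = _to_dense(x, order="F")
np.testing.assert_array_equal(dense, x.toarray(order="F"))
assert dense.flags["F_CONTIGUOUS"]
```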


@njit
def _to_dense_csc_numba(
indptr: NDArray,
indices: NDArray,
data: NDArray,
X: NDArray,
shape: tuple[int, int],
) -> None:
for c in numba.prange(X.shape[1]):
for i in range(indptr[c], indptr[c + 1]):
X[indices[i], c] = data[i]


@njit
def _to_dense_csr_numba(
indptr: NDArray,
indices: NDArray,
data: NDArray,
X: NDArray,
shape: tuple[int, int],
) -> None:
for r in numba.prange(shape[0]):
for i in range(indptr[r], indptr[r + 1]):
X[r, indices[i]] = data[i]
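For reference, the same CSR traversal in plain Python (the Numba version above additionally parallelizes over rows with `prange`):

```python
import numpy as np
from scipy.sparse import csr_matrix

x = csr_matrix(np.array([[0, 1, 2], [3, 0, 4]]))
out = np.zeros(x.shape, dtype=x.dtype)
for r in range(x.shape[0]):                        # each row r
    for i in range(x.indptr[r], x.indptr[r + 1]):  # nonzeros stored for row r
        out[r, x.indices[i]] = x.data[i]
np.testing.assert_array_equal(out, x.toarray())
```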
Binary file added tests/_data/regress_test_small.npy
27 changes: 26 additions & 1 deletion tests/test_preprocessing.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
@@ -9,7 +10,7 @@
from anndata.tests.helpers import asarray, assert_equal
from numpy.testing import assert_allclose
from scipy import sparse as sp
from scipy.sparse import issparse
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, issparse

import scanpy as sc
from testing.scanpy._helpers import (
@@ -21,6 +22,9 @@
from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced
from testing.scanpy._pytest.params import ARRAY_TYPES

HERE = Path(__file__).parent
DATA_PATH = HERE / "_data"


def test_log1p(tmp_path):
A = np.random.rand(200, 10).astype(np.float32)
Expand Down Expand Up @@ -323,6 +327,16 @@ def test_regress_out_constants():
assert_equal(adata, adata_copy)


def test_regress_out_reproducible():
adata = pbmc68k_reduced()
adata = adata.raw.to_adata()[:200, :200].copy()
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"])
    # This file was generated with the original implementation in version 1.10.3;
    # here we compare the new implementation against that reference output.
tester = np.load(DATA_PATH / "regress_test_small.npy")
np.testing.assert_allclose(adata.X, tester)


def test_regress_out_constants_equivalent():
# Tests that constant values don't change results
# (since support for constant values is implemented by us)
@@ -524,3 +538,14 @@ def test_filter_cells(array_type, max_genes, max_counts, min_genes, min_counts):
if issparse(adata.X):
adata.X = adata.X.todense()
assert_allclose(X, adata.X, rtol=1e-5, atol=1e-5)


@pytest.mark.parametrize("array_type", [csr_matrix, csc_matrix, coo_matrix])
@pytest.mark.parametrize("order", ["C", "F"])
def test_todense(array_type, order):
x_org = np.array([[0, 1, 2], [3, 0, 4]])
x_sparse = array_type(x_org)
x_dense = sc.pp._utils._to_dense(x_sparse, order=order)
np.testing.assert_array_equal(x_dense, x_org)
assert x_dense.flags["C_CONTIGUOUS"] == (order == "C")
assert x_dense.flags["F_CONTIGUOUS"] == (order == "F")
