Commit
Merge branch 'main' into fix-clip-dask-sparse
flying-sheep authored Nov 11, 2024
2 parents ebada2f + a227c12 commit ae8bd60
Showing 6 changed files with 137 additions and 26 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/3284.performance.md
@@ -0,0 +1 @@
* Speed up {func}`~scanpy.pp.regress_out` {smaller}`P Ashish, P Angerer & S Dicks`
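For context, a minimal usage sketch of the function being sped up, on the bundled demo dataset (the covariate names match those used in the tests below):

```python
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# Regress out technical covariates stored in .obs
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"])
```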
2 changes: 1 addition & 1 deletion notebooks
90 changes: 66 additions & 24 deletions src/scanpy/preprocessing/_simple.py
@@ -8,7 +8,7 @@
import warnings
from functools import singledispatch
from itertools import repeat
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, TypeVar

import numba
import numpy as np
@@ -18,7 +18,7 @@
from sklearn.utils import check_array, sparsefuncs

from .. import logging as logg
from .._compat import old_positionals
from .._compat import njit, old_positionals
from .._settings import settings as sett
from .._utils import (
_check_array_function_arguments,
@@ -31,6 +31,7 @@
)
from ..get import _get_obs_rep, _set_obs_rep
from ._distributed import materialize_as_ndarray
from ._utils import _to_dense

# install dask if available
try:
@@ -624,6 +625,34 @@ def normalize_per_cell(
return X if copy else None


DT = TypeVar("DT")


@njit
def get_resid(
data: np.ndarray,
regressor: np.ndarray,
coeff: np.ndarray,
) -> np.ndarray:
for i in numba.prange(data.shape[0]):
data[i] -= regressor[i] @ coeff
return data


def numpy_regress_out(
data: np.ndarray,
regressor: np.ndarray,
) -> np.ndarray:
"""\
Numba kernel for regress out unwanted sorces of variantion.
Finding coefficient using Linear regression (Linear Least Squares).
"""
inv_gram_matrix = np.linalg.inv(regressor.T @ regressor)
coeff = inv_gram_matrix @ (regressor.T @ data)
data = get_resid(data, regressor, coeff)
return data
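As a sanity check, the normal-equations solve above agrees with NumPy's standard least-squares solver; a minimal sketch with made-up data:

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 5)).astype(np.float32)         # cells x genes
regressor = np.c_[np.ones(100), rng.normal(size=(100, 2))]  # intercept + 2 covariates

# Normal equations, as in numpy_regress_out above
coeff = np.linalg.inv(regressor.T @ regressor) @ (regressor.T @ data)
resid = data - regressor @ coeff

# Reference: the standard least-squares solver
coeff_ref, *_ = np.linalg.lstsq(regressor, data, rcond=None)
np.testing.assert_allclose(resid, data - regressor @ coeff_ref, atol=1e-5)
```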


@old_positionals("layer", "n_jobs", "copy")
def regress_out(
adata: AnnData,
@@ -678,7 +707,6 @@ def regress_out(

if issparse(X):
logg.info(" sparse input is densified and may lead to high memory use")
X = X.toarray()

n_jobs = sett.n_jobs if n_jobs is None else n_jobs

@@ -695,6 +723,8 @@
)
logg.debug("... regressing on per-gene means within categories")
regressors = np.zeros(X.shape, dtype="float32")
X = _to_dense(X, order="F") if issparse(X) else X
# TODO figure out if we should use a numba kernel for this
for category in adata.obs[keys[0]].cat.categories:
mask = (category == adata.obs[keys[0]]).values
for ix, x in enumerate(X.T):
@@ -707,32 +737,44 @@

# add column of ones at index 0 (first column)
regressors.insert(0, "ones", 1.0)
regressors = regressors.to_numpy()

len_chunk = int(np.ceil(min(1000, X.shape[1]) / n_jobs))
n_chunks = int(np.ceil(X.shape[1] / len_chunk))
    # If the regressors are not categorical and the Gram matrix is not
    # singular, use the numpy_regress_out shortcut.
if not variable_is_categorical and np.linalg.det(regressors.T @ regressors) != 0:
X = _to_dense(X, order="C") if issparse(X) else X
res = numpy_regress_out(X, regressors)

# split the adata.X matrix by columns in chunks of size n_chunk
# (the last chunk could be of smaller size than the others)
chunk_list = np.array_split(X, n_chunks, axis=1)
regressors_chunk = (
np.array_split(regressors, n_chunks, axis=1)
if variable_is_categorical
else repeat(regressors)
)
    # For a categorical variable, or if the above checks failed,
    # we fall back to the GLM implementation of regression.
else:
# split the adata.X matrix by columns in chunks of size n_chunk
# (the last chunk could be of smaller size than the others)
len_chunk = int(np.ceil(min(1000, X.shape[1]) / n_jobs))
n_chunks = int(np.ceil(X.shape[1] / len_chunk))
X = _to_dense(X, order="F") if issparse(X) else X
chunk_list = np.array_split(X, n_chunks, axis=1)
regressors_chunk = (
np.array_split(regressors, n_chunks, axis=1)
if variable_is_categorical
else repeat(regressors)
)

    # each task is passed a data chunk (e.g. `adata.X[:, 0:100]`) and the regressors.
# This data will be passed to each of the jobs.
# TODO: figure out how to test that this doesn't oversubscribe resources
res = Parallel(n_jobs=n_jobs)(
delayed(_regress_out_chunk)(
data_chunk, regres, variable_is_categorical=variable_is_categorical
        # each task is passed a data chunk (e.g. `adata.X[:, 0:100]`) and the regressors.
# This data will be passed to each of the jobs.
# TODO: figure out how to test that this doesn't oversubscribe resources
res = Parallel(n_jobs=n_jobs)(
delayed(_regress_out_chunk)(
data_chunk, regres, variable_is_categorical=variable_is_categorical
)
for data_chunk, regres in zip(chunk_list, regressors_chunk, strict=False)
)
for data_chunk, regres in zip(chunk_list, regressors_chunk, strict=False)
)

# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
_set_obs_rep(adata, np.vstack(res).T, layer=layer)
# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
res = np.vstack(res).T

_set_obs_rep(adata, res, layer=layer)
logg.info(" finished", time=start)
return adata if copy else None
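The column-chunking and reassembly in the fallback branch can be checked in isolation; a minimal sketch where a plain transpose stands in for `_regress_out_chunk`'s gene-major output:

```python
import numpy as np

X = np.arange(20, dtype=np.float32).reshape(4, 5)  # cells x genes
chunk_list = np.array_split(X, 2, axis=1)          # split by columns, as above

# _regress_out_chunk returns one result per gene column, so the results
# come back gene-major; vstack + transpose restores cells x genes.
res = [chunk.T for chunk in chunk_list]            # identity stand-in
out = np.vstack(res).T
np.testing.assert_array_equal(out, X)
```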

43 changes: 43 additions & 0 deletions src/scanpy/preprocessing/_utils.py
@@ -160,3 +160,46 @@ def sample_comb(
np.prod(dims), nsamp, random_state=random_state, method=method
)
return np.vstack(np.unravel_index(idx, dims)).T


def _to_dense(
X: sparse.spmatrix,
order: Literal["C", "F"] = "C",
) -> NDArray:
"""\
Numba kernel for np.toarray() function
"""
out = np.zeros(X.shape, dtype=X.dtype, order=order)
if X.format == "csr":
_to_dense_csr_numba(X.indptr, X.indices, X.data, out, X.shape)
elif X.format == "csc":
_to_dense_csc_numba(X.indptr, X.indices, X.data, out, X.shape)
else:
out = X.toarray(order=order)
return out
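A quick equivalence check of the wrapper against SciPy's own densification (a sketch; assumes a scanpy checkout containing this commit, so the private helper is importable):

```python
import numpy as np
from scipy import sparse

from scanpy.preprocessing._utils import _to_dense

x = sparse.random(50, 20, density=0.1, format="csr", dtype=np.float32)
dense = _to_dense(x, order="F")
np.testing.assert_array_equal(dense, x.toarray(order="F"))
assert dense.flags["F_CONTIGUOUS"]
```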


@njit
def _to_dense_csc_numba(
indptr: NDArray,
indices: NDArray,
data: NDArray,
X: NDArray,
shape: tuple[int, int],
) -> None:
for c in numba.prange(X.shape[1]):
for i in range(indptr[c], indptr[c + 1]):
X[indices[i], c] = data[i]


@njit
def _to_dense_csr_numba(
indptr: NDArray,
indices: NDArray,
data: NDArray,
X: NDArray,
shape: tuple[int, int],
) -> None:
for r in numba.prange(shape[0]):
for i in range(indptr[r], indptr[r + 1]):
X[r, indices[i]] = data[i]
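For reference, the same CSR traversal in plain Python (the Numba version above additionally parallelizes over rows with `prange`):

```python
import numpy as np
from scipy.sparse import csr_matrix

x = csr_matrix(np.array([[0, 1, 2], [3, 0, 4]]))
out = np.zeros(x.shape, dtype=x.dtype)
for r in range(x.shape[0]):                        # each row r
    for i in range(x.indptr[r], x.indptr[r + 1]):  # nonzeros stored for row r
        out[r, x.indices[i]] = x.data[i]
np.testing.assert_array_equal(out, x.toarray())
```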
Binary file added tests/_data/regress_test_small.npy
27 changes: 26 additions & 1 deletion tests/test_preprocessing.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
@@ -9,7 +10,7 @@
from anndata.tests.helpers import asarray, assert_equal
from numpy.testing import assert_allclose
from scipy import sparse as sp
from scipy.sparse import issparse
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, issparse

import scanpy as sc
from testing.scanpy._helpers import (
@@ -21,6 +22,9 @@
from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced
from testing.scanpy._pytest.params import ARRAY_TYPES

HERE = Path(__file__).parent
DATA_PATH = HERE / "_data"


def test_log1p(tmp_path):
A = np.random.rand(200, 10).astype(np.float32)
Expand Down Expand Up @@ -323,6 +327,16 @@ def test_regress_out_constants():
assert_equal(adata, adata_copy)


def test_regress_out_reproducible():
adata = pbmc68k_reduced()
adata = adata.raw.to_adata()[:200, :200].copy()
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"])
    # This file was generated with the original implementation in version 1.10.3;
    # here we compare the new implementation against that reference output.
tester = np.load(DATA_PATH / "regress_test_small.npy")
np.testing.assert_allclose(adata.X, tester)


def test_regress_out_constants_equivalent():
# Tests that constant values don't change results
# (since support for constant values is implemented by us)
@@ -524,3 +538,14 @@ def test_filter_cells(array_type, max_genes, max_counts, min_genes, min_counts):
if issparse(adata.X):
adata.X = adata.X.todense()
assert_allclose(X, adata.X, rtol=1e-5, atol=1e-5)


@pytest.mark.parametrize("array_type", [csr_matrix, csc_matrix, coo_matrix])
@pytest.mark.parametrize("order", ["C", "F"])
def test_todense(array_type, order):
x_org = np.array([[0, 1, 2], [3, 0, 4]])
x_sparse = array_type(x_org)
x_dense = sc.pp._utils._to_dense(x_sparse, order=order)
np.testing.assert_array_equal(x_dense, x_org)
assert x_dense.flags["C_CONTIGUOUS"] == (order == "C")
assert x_dense.flags["F_CONTIGUOUS"] == (order == "F")
