Skip to content

Backport PR #3353 on branch 1.11.x (Speed up categorical regressor with numba) #3654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/3353.performance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Speed up for a categorical regressor in {func}`~scanpy.pp.regress_out` {smaller}`S Dicks` {smaller}`I Gold`
31 changes: 25 additions & 6 deletions src/scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,23 @@ def normalize_per_cell(
DT = TypeVar("DT")


@njit
def _create_regressor_categorical(
X: np.ndarray, number_categories: int, cat_array: np.ndarray
) -> np.ndarray:
# create regressor matrix for categorical variables
# would be best to use X dtype but this matches old behavior
regressors = np.zeros(X.shape, dtype=np.float32)
# iterate over categories
for category in range(number_categories):
# iterate over genes and calculate mean expression
# for each gene per category
mask = category == cat_array
for ix in numba.prange(X.T.shape[0]):
regressors[mask, ix] = X.T[ix, mask].mean()
return regressors


@njit
def get_resid(
data: np.ndarray,
Expand Down Expand Up @@ -735,13 +752,15 @@ def regress_out(
)
raise ValueError(msg)
logg.debug("... regressing on per-gene means within categories")
regressors = np.zeros(X.shape, dtype="float32")
# set number of categories to the same dtype as the categories
cat_array = adata.obs[keys[0]].cat.codes.to_numpy()
number_categories = cat_array.dtype.type(len(adata.obs[keys[0]].cat.categories))

X = _to_dense(X, order="F") if isinstance(X, CSBase) else X
# TODO figure out if we should use a numba kernel for this
for category in adata.obs[keys[0]].cat.categories:
mask = (category == adata.obs[keys[0]]).values
for ix, x in enumerate(X.T):
regressors[mask, ix] = x[mask].mean()
if np.issubdtype(X.dtype, np.integer):
target_dtype = np.float32 if X.dtype.itemsize <= 4 else np.float64
X = X.astype(target_dtype)
regressors = _create_regressor_categorical(X, number_categories, cat_array)
variable_is_categorical = True
# regress on one or several ordinal variables
else:
Expand Down
Binary file added tests/_data/cat_regressor_for_int_input.npy
Binary file not shown.
Binary file added tests/_data/regress_test_small_cat.npy
Binary file not shown.
36 changes: 31 additions & 5 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,25 @@ def test_regress_out_ordinal():
np.testing.assert_array_equal(single.X, multi.X)


@pytest.mark.parametrize("dtype", [np.uint32, np.float64, np.uint64])
def test_regress_out_int(dtype):
adata = pbmc3k()[:200, :200].copy()
adata.X = adata.X.astype(np.float64 if dtype != np.uint32 else np.float32)
dtype = adata.X.dtype
adata.obs["labels"] = pd.Categorical(
(["A"] * (adata.X.shape[0] - 100)) + (["B"] * 100)
)
adata_other = adata.copy()
adata_other.X = adata_other.X.astype(dtype)
# results using only one processor
sc.pp.regress_out(adata, keys=["labels"])
sc.pp.regress_out(adata_other, keys=["labels"])
assert_equal(adata_other, adata)
# This file was generated under scanpy 1.10.3
ground_truth = np.load(DATA_PATH / "cat_regressor_for_int_input.npy")
np.testing.assert_allclose(ground_truth, adata_other.X, atol=1e-5, rtol=1e-5)


@pytest.mark.parametrize("dtype", [np.int64, np.float64, np.int32])
def test_regress_out_layer(dtype):
from scipy.sparse import random
Expand Down Expand Up @@ -468,14 +487,21 @@ def test_regress_out_constants():
assert_equal(adata, adata_copy)


def test_regress_out_reproducible():
adata = pbmc68k_reduced()
@pytest.mark.parametrize(
("keys", "test_file", "atol"),
[
(["n_counts", "percent_mito"], "regress_test_small.npy", 0.0),
(["bulk_labels"], "regress_test_small_cat.npy", 1e-6),
],
)
def test_regress_out_reproducible(keys, test_file, atol):
adata = sc.datasets.pbmc68k_reduced()
adata = adata.raw.to_adata()[:200, :200].copy()
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"])
sc.pp.regress_out(adata, keys=keys)
# This file was generated from the original implementation in version 1.10.3
# Now we compare new implementation with the old one
tester = np.load(DATA_PATH / "regress_test_small.npy")
np.testing.assert_allclose(adata.X, tester)
tester = np.load(DATA_PATH / test_file)
np.testing.assert_allclose(adata.X, tester, atol=atol)


def test_regress_out_constants_equivalent():
Expand Down
Loading