(doc): centralize PCA docs #3655

Merged: 4 commits, merged on May 28, 2025

7 changes: 2 additions & 5 deletions docs/conf.py
@@ -73,14 +73,14 @@
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.autosummary",
"sphinx.ext.extlinks",
"sphinxcontrib.bibtex",
"matplotlib.sphinxext.plot_directive",
"sphinx_autodoc_typehints", # needs to be after napoleon
"git_ref", # needs to be before scanpydoc.rtd_github_links
"scanpydoc", # needs to be before sphinx.ext.linkcode
"sphinx.ext.linkcode",
"sphinx_design",
"sphinx_issues",
"sphinx_tabs.tabs",
"sphinxext.opengraph",
*[p.stem for p in (HERE / "extensions").glob("*.py") if p.stem not in {"git_ref"}],
@@ -245,8 +245,5 @@ def setup(app: Sphinx):
plot_working_directory = HERE.parent # Project root

# link config
extlinks = {
"issue": ("https://github.com/scverse/scanpy/issues/%s", "issue%s"),
"pr": ("https://github.com/scverse/scanpy/pull/%s", "pr%s"),
}
issues_github_path = "scverse/scanpy"
rtd_links_prefix = PurePosixPath("src")
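The conf.py change above swaps the hand-maintained extlinks roles for the sphinx-issues extension. A minimal sketch of the resulting configuration and role usage, assuming the standard sphinx-issues roles (the excerpt below is illustrative and not part of this diff):

# docs/conf.py (sketch): sphinx-issues replaces the removed extlinks config
extensions = [
    # ... other extensions ...
    "sphinx_issues",
]
issues_github_path = "scverse/scanpy"

# In the docs, :issue:`3655` and :pr:`3655` then link into this repository,
# and cross-repository references such as :issue:`dask/dask-ml#985` (used in
# the new PCA docstring below) also resolve, which the removed extlinks
# URL templates could not express.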
23 changes: 23 additions & 0 deletions docs/references.bib
@@ -69,6 +69,18 @@ @article{Becht2018
pages = {38--44},
}

@inproceedings{Benson2013,
author = {Benson, Austin R. and Gleich, David F. and Demmel, James},
booktitle = {2013 IEEE International Conference on Big Data},
title = {Direct QR factorizations for tall-and-skinny matrices in MapReduce architectures},
year = {2013},
url = {https://doi.org/10.1109/BigData.2013.6691583},
doi = {10.1109/bigdata.2013.6691583},
publisher = {IEEE},
month = {oct},
pages = {264--272},
}

@article{Bernstein2020,
author = {Bernstein, Nicholas J. and Fong, Nicole L. and Lam, Irene and Roy, Margaret A. and Hendrickson, David G. and Kelley, David R.},
title = {Solo: Doublet Identification in Single-Cell RNA-Seq via Semi-Supervised Deep Learning},
@@ -293,6 +305,17 @@ @article{Haghverdi2018
pages = {421--427},
}

@misc{Halko2009,
author = {Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A.},
doi = {10.48550/ARXIV.0909.4061},
url = {https://arxiv.org/abs/0909.4061},
keywords = {Numerical Analysis (math.NA), Probability (math.PR), FOS: Mathematics, FOS: Mathematics},
title = {Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions},
publisher = {arXiv},
year = {2009},
copyright = {arXiv.org perpetual, non-exclusive license},
}

@article{Hie2019,
author = {Hie, Brian and Bryson, Bryan and Berger, Bonnie},
title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama},
1 change: 1 addition & 0 deletions docs/release-notes/3655.doc.md
@@ -0,0 +1 @@
Clarify use of implementations in {func}`scanpy.pp.pca` docs. {smaller}`P Angerer`
1 change: 1 addition & 0 deletions pyproject.toml
@@ -111,6 +111,7 @@ doc = [
"sphinx-book-theme>=1.1.0",
"scanpydoc>=0.15.3",
"sphinx-autodoc-typehints>=1.25.2",
"sphinx-issues>=5.0.1",
"myst-parser>=2",
"myst-nb>=1",
"sphinx-design",
114 changes: 57 additions & 57 deletions src/scanpy/preprocessing/_pca/__init__.py
@@ -64,89 +64,99 @@ def pca( # noqa: PLR0912, PLR0913, PLR0915
n_comps: int | None = None,
*,
layer: str | None = None,
zero_center: bool | None = True,
zero_center: bool = True,
svd_solver: SvdSolver | None = None,
chunked: bool = False,
chunk_size: int | None = None,
random_state: _LegacyRandom = 0,
return_info: bool = False,
mask_var: NDArray[np.bool_] | str | None | Empty = _empty,
use_highly_variable: bool | None = None,
dtype: DTypeLike = "float32",
chunked: bool = False,
chunk_size: int | None = None,
key_added: str | None = None,
copy: bool = False,
) -> AnnData | np.ndarray | CSBase | None:
r"""Principal component analysis :cite:p:`Pedregosa2011`.

Computes PCA coordinates, loadings and variance decomposition.
Uses the implementation of *scikit-learn* :cite:p:`Pedregosa2011`.

.. versionchanged:: 1.5.0

In previous versions, computing a PCA on a sparse matrix would make
a dense copy of the array for mean centering.
As of scanpy 1.5.0, mean centering is implicit.
While results are extremely similar, they are not exactly the same.
If you would like to reproduce the old results, pass a dense array.
Uses the following implementations (and defaults for `svd_solver`):

.. list-table::
:header-rows: 1
:stub-columns: 1

- -
- :class:`~numpy.ndarray`, :class:`~scipy.sparse.spmatrix`, or :class:`~scipy.sparse.sparray`
- :class:`dask.array.Array`
- - `chunked=False`, `zero_center=True`
- sklearn :class:`~sklearn.decomposition.PCA` (`'arpack'`)
- - *dense*: dask-ml :class:`~dask_ml.decomposition.PCA`\ [#high-mem]_ (`'auto'`)
- *sparse* or `svd_solver='covariance_eigh'`: custom implementation (`'covariance_eigh'`)
- - `chunked=False`, `zero_center=False`
- sklearn :class:`~sklearn.decomposition.TruncatedSVD` (`'randomized'`)
- dask-ml :class:`~dask_ml.decomposition.TruncatedSVD`\ [#dense-only]_ (`'tsqr'`)
- - `chunked=True` (`zero_center` ignored)
- sklearn :class:`~sklearn.decomposition.IncrementalPCA` (`'auto'`)
- dask-ml :class:`~dask_ml.decomposition.IncrementalPCA`\ [#densifies]_ (`'auto'`)

.. [#high-mem] Consider `svd_solver='covariance_eigh'` to reduce memory usage (see :issue:`dask/dask-ml#985`).
.. [#dense-only] This implementation cannot handle sparse chunks; try densifying them manually.
.. [#densifies] This implementation densifies sparse chunks and therefore has increased memory usage.

Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
n_comps
Number of principal components to compute. Defaults to 50, or 1 - minimum
dimension size of selected representation.
Number of principal components to compute. Defaults to 50,
or 1 - minimum dimension size of selected representation.
layer
If provided, which element of layers to use for PCA.
zero_center
If `True`, compute standard PCA from covariance matrix.
If `False`, omit zero-centering variables
(uses *scikit-learn* :class:`~sklearn.decomposition.TruncatedSVD` or
*dask-ml* :class:`~dask_ml.decomposition.TruncatedSVD`),
which allows handling sparse input efficiently.
Passing `None` decides automatically based on sparseness of the data.
If `True`, compute (or approximate) PCA from covariance matrix.
If `False`, perform a truncated SVD instead of PCA.

Our default PCA algorithms (see `svd_solver`) support implicit zero-centering
and can therefore operate efficiently on sparse data.
svd_solver
SVD solver to use:
SVD solver to use.
See table above to see which solver class is used based on `chunked` and `zero_center`,
as well as the default solver for each class when `svd_solver=None`.

Efficient computation of the principal components of a sparse matrix
currently only works with the `'arpack'` or `'covariance_eigh'` solver.

`None`
See `chunked` and `zero_center` descriptions to determine which class will be used.
Depending on the class and the type of X, different defaults will be set.
For sparse *dask* arrays, will use `'covariance_eigh'`.
If *scikit-learn* :class:`~sklearn.decomposition.PCA` is used, will give `'arpack'`,
if *scikit-learn* :class:`~sklearn.decomposition.TruncatedSVD` is used, will give `'randomized'`,
if *dask-ml* :class:`~dask_ml.decomposition.PCA` or :class:`~dask_ml.decomposition.IncrementalPCA` is used, will give `'auto'`,
if *dask-ml* :class:`~dask_ml.decomposition.TruncatedSVD` is used, will give `'tsqr'`
Choose automatically based on solver class (see table above).
`'arpack'`
for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)
Not available with *dask* arrays.
ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`).
Not available for *dask* arrays.
`'covariance_eigh'`
Classic eigendecomposition of the covariance matrix, suited for tall-and-skinny matrices.
With dask, array must be CSR or dense and chunked as (N, adata.shape[1]).
With dask, array must be CSR or dense and chunked as `(N, adata.shape[1])`.
`'randomized'`
for the randomized algorithm due to Halko (2009). For *dask* arrays,
this will use :func:`~dask.array.linalg.svd_compressed`.
Randomized algorithm from :cite:t:`Halko2009`.
For *dask* arrays, this will use :func:`~dask.array.linalg.svd_compressed`.
`'auto'`
chooses automatically depending on the size of the problem.
Choose automatically depending on the size of the problem:
Will use `'full'` for small shapes and `'randomized'` for large shapes.
`'tsqr'`
Only available with dense *dask* arrays. "tsqr"
algorithm from Benson et al. (2013).
“tall-and-skinny QR” algorithm from :cite:t:`Benson2013`.
Only available for dense *dask* arrays.

.. versionchanged:: 1.9.3
Default value changed from `'arpack'` to `None`.
.. versionchanged:: 1.4.5
Default value changed from `'auto'` to `'arpack'`.

Efficient computation of the principal components of a sparse matrix
currently only works with the `'arpack'` or `'covariance_eigh'` solver.

If X is a sparse *dask* array, a custom `'covariance_eigh'` solver will be used.
If X is a dense *dask* array, *dask-ml* classes :class:`~dask_ml.decomposition.PCA`,
:class:`~dask_ml.decomposition.IncrementalPCA`, or
:class:`~dask_ml.decomposition.TruncatedSVD` will be used.
Otherwise their *scikit-learn* counterparts :class:`~sklearn.decomposition.PCA`,
:class:`~sklearn.decomposition.IncrementalPCA`, or
:class:`~sklearn.decomposition.TruncatedSVD` will be used.
chunked
If `True`, perform an incremental PCA on segments of `chunk_size`.
Automatically zero centers and ignores the settings of `zero_center`, `random_state`, and `svd_solver`.
If `False`, perform a full PCA/truncated SVD (see `svd_solver` and `zero_center`).
See table above for which solver class is used.
chunk_size
Number of observations to include in each chunk.
Required if `chunked=True` was passed.
random_state
Change to use different initial states for the optimization.
return_info
@@ -157,16 +167,6 @@ def pca( # noqa: PLR0912, PLR0913, PLR0915
Layer of `adata` to use as expression values.
dtype
Numpy data type string to which to convert the result.
chunked
If `True`, perform an incremental PCA on segments of `chunk_size`.
The incremental PCA automatically zero centers and ignores settings of
`random_seed` and `svd_solver`. Uses sklearn :class:`~sklearn.decomposition.IncrementalPCA` or
*dask-ml* :class:`~dask_ml.decomposition.IncrementalPCA`. If `False`, perform a full PCA and
use sklearn :class:`~sklearn.decomposition.PCA` or
*dask-ml* :class:`~dask_ml.decomposition.PCA`
chunk_size
Number of observations to include in each chunk.
Required if `chunked=True` was passed.
key_added
If not specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\ `['X_pca']`, the loadings as
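As a usage sketch of the solver dispatch summarized in the new docstring table (the dataset and parameter values below are illustrative, not part of this diff):

import scanpy as sc

adata = sc.datasets.pbmc3k()  # any AnnData with a (sparse) .X works here
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Default: zero-centered PCA. For an in-memory sparse matrix this goes through
# sklearn PCA with implicit mean centering ('arpack' unless svd_solver is set).
sc.pp.pca(adata, n_comps=50)

# Without zero-centering: a truncated SVD via sklearn TruncatedSVD ('randomized').
sc.pp.pca(adata, zero_center=False)

# Incremental PCA on blocks of 1000 observations via sklearn IncrementalPCA;
# zero_center, random_state, and svd_solver are ignored in this mode.
sc.pp.pca(adata, chunked=True, chunk_size=1000)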