From 12b033e86b9760070f1f14fc95917b3a401d6ea9 Mon Sep 17 00:00:00 2001 From: Paul Fisher Date: Wed, 11 Jan 2023 15:44:45 -0500 Subject: [PATCH] Add `ExperimentAxisQuery`. This extracts the public interface of `tiledbsoma.query.ExperimentAxisQuery` and puts it here. Because there is no TileDB-specific behavior in `ExperimentAxisQuery`, I believe we can actually pull the entire implementation here (it only uses the properties of Experiment itself) but that is for a later date. --- python-spec/pyproject.toml | 13 ++- python-spec/src/somacore/__init__.py | 5 + python-spec/src/somacore/composed.py | 17 ++++ python-spec/src/somacore/query/__init__.py | 0 python-spec/src/somacore/query/axis.py | 81 +++++++++++++++ python-spec/src/somacore/query/query.py | 112 +++++++++++++++++++++ python-spec/testing/test_query_axis.py | 54 ++++++++++ 7 files changed, 278 insertions(+), 4 deletions(-) create mode 100644 python-spec/src/somacore/query/__init__.py create mode 100644 python-spec/src/somacore/query/axis.py create mode 100644 python-spec/src/somacore/query/query.py create mode 100644 python-spec/testing/test_query_axis.py diff --git a/python-spec/pyproject.toml b/python-spec/pyproject.toml index 8bf4adf5..0bd54a92 100644 --- a/python-spec/pyproject.toml +++ b/python-spec/pyproject.toml @@ -7,7 +7,13 @@ name = "somacore" description = "Python-language API specification and base utilities for implementation of the SOMA system." 
dynamic = ["version"] readme = "./README.md" -dependencies = ["attrs>=22.1", "numpy>=1.21", "pyarrow", "typing-extensions"] +dependencies = [ + "anndata", + "attrs>=22.1", + "numpy>=1.21", + "pyarrow", + "typing-extensions", +] requires-python = "~=3.7" [project.optional-dependencies] @@ -18,7 +24,6 @@ packages.find.where = ["src"] package-data.somacore = ["py.typed"] dynamic.version.attr = "somacore._version.version" - [tool.isort] profile = "black" line_length = 88 @@ -26,6 +31,6 @@ force_single_line = true single_line_exclusions = ["typing", "typing_extensions"] [[tool.mypy.overrides]] -# pyarrow currently does not have canonical type stubs. -module = ["pyarrow"] +# These dependencies do not currently have canonical type stubs. +module = ["anndata", "pyarrow"] ignore_missing_imports = true diff --git a/python-spec/src/somacore/__init__.py b/python-spec/src/somacore/__init__.py index 4aef2226..a8980d08 100644 --- a/python-spec/src/somacore/__init__.py +++ b/python-spec/src/somacore/__init__.py @@ -9,6 +9,7 @@ from somacore import data from somacore import ephemeral from somacore import options +from somacore.query import axis __version__ = _version.version __version_tuple__ = _version.version_tuple @@ -29,6 +30,8 @@ BatchSize = options.BatchSize ResultOrder = options.ResultOrder +AxisQuery = axis.AxisQuery + __all__ = ( "SOMAObject", "Collection", @@ -40,4 +43,6 @@ "SparseRead", "IOfN", "BatchSize", + "ResultOrder", + "AxisQuery", ) diff --git a/python-spec/src/somacore/composed.py b/python-spec/src/somacore/composed.py index 47c67f85..fae9528d 100644 --- a/python-spec/src/somacore/composed.py +++ b/python-spec/src/somacore/composed.py @@ -1,10 +1,14 @@ """Implementations of the composed SOMA data types.""" +from typing import Optional + from typing_extensions import Final from somacore import _wrap from somacore import base from somacore import data +from somacore.query import axis +from somacore.query import query class Measurement(_wrap.CollectionProxy): @@ 
-68,4 +72,17 @@ class Experiment(_wrap.CollectionProxy): ms = _wrap.item(base.Collection[Measurement]) """A collection of named measurements.""" + def axis_query( + self, + measurement_name: str, + *, + obs_query: Optional[axis.AxisQuery] = None, + var_query: Optional[axis.AxisQuery] = None, + ) -> query.ExperimentAxisQuery: + """Creates an axis query over this experiment. + + See :class:`query.ExperimentAxisQuery` for details on usage. + """ + raise NotImplementedError() + soma_type: Final = "SOMAExperiment" diff --git a/python-spec/src/somacore/query/__init__.py b/python-spec/src/somacore/query/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python-spec/src/somacore/query/axis.py b/python-spec/src/somacore/query/axis.py new file mode 100644 index 00000000..e96a9c63 --- /dev/null +++ b/python-spec/src/somacore/query/axis.py @@ -0,0 +1,81 @@ +from typing import Any, Optional, Sequence, Tuple + +import attrs +import numpy as np +import pyarrow as pa +from typing_extensions import TypeGuard + +from somacore import options + + +def _canonicalize_coords( + in_coords: options.SparseDFCoords, +) -> Tuple[options.SparseDFCoord, ...]: + """Validates coordinates and freezes sequences as tuples. + + This is not strictly necessary; DataFrame will report these errors + eventually but doing it now makes for better UX. + """ + if not _is_normal_sequence(in_coords): + raise TypeError( + "query coordinates must be a normal sequence, not `str` or `bytes`." + ) + return tuple(_canonicalize_coord(c) for c in in_coords) + + +def _canonicalize_coord(coord: options.SparseDFCoord) -> options.SparseDFCoord: + """Validates a single coordinate, freezing mutable sequences.""" + if coord is None or isinstance( + coord, (int, slice, pa.Array, pa.ChunkedArray, np.ndarray) + ): + return coord + if _is_normal_sequence(coord): + # We're trusting here that the elements of the user's sequence are + # appropriate. If this is not the case, it will raise down the line. 
+ return tuple(coord) + raise TypeError(f"{type(coord)} object cannot be used as a coordinate.") + + +def _is_normal_sequence(it: Any) -> TypeGuard[Sequence]: + return not isinstance(it, (str, bytes)) and isinstance(it, Sequence) + + +@attrs.define(frozen=True, kw_only=True) +class AxisQuery: + """Single-axis dataframe query with coordinates and a value filter. + + [lifecycle: experimental] + Per dimension, the AxisQuery can have value of: + + * None - all data + * Coordinates - a set of coordinates on the axis dataframe index, + expressed in any type or format supported by ``DataFrame.read()``. + * A SOMA ``value_filter`` across columns in the axis dataframe, + expressed as a string + * Or, a combination of coordinates and value filter. + + Examples:: + + AxisQuery() # all data + AxisQuery(coords=(slice(1,10),)) # 1D, slice + AxisQuery(coords=([0,1,2],)) # 1D, point indexing using array-like + AxisQuery(coords=(slice(None), numpy.array([0,88,1001]))) # 2D + AxisQuery(value_filter="tissue == 'lung'") + AxisQuery(coords=(slice(1,None),), value_filter="tissue == 'lung'") + ``` + """ + + value_filter: Optional[str] = attrs.field( + default=None, + validator=attrs.validators.optional(attrs.validators.instance_of(str)), + ) + """A string specifying a SOMA ``value_filter``.""" + coords: Tuple[options.SparseDFCoord, ...] = attrs.field( + default=(slice(None),), + converter=_canonicalize_coords, + ) + """Query (slice) by dimension. + + The tuple must have a length less than or equal to the number of dimensions, + and be of a type supported by ``DataFrame``. 
+ """ diff --git a/python-spec/src/somacore/query/query.py b/python-spec/src/somacore/query/query.py new file mode 100644 index 00000000..f1faf577 --- /dev/null +++ b/python-spec/src/somacore/query/query.py @@ -0,0 +1,112 @@ +import abc +import contextlib +from typing import Any, Optional, Sequence + +import anndata +import pyarrow as pa +from typing_extensions import TypedDict + +from somacore import data + + +class AxisColumnNames(TypedDict, total=False): + """Specifies column names for experiment axis query read operations.""" + + obs: Optional[Sequence[str]] + """obs columns to use. All columns if ``None`` or not present.""" + var: Optional[Sequence[str]] + """var columns to use. All columns if ``None`` or not present.""" + + +class ExperimentAxisQuery(contextlib.AbstractContextManager, metaclass=abc.ABCMeta): + @abc.abstractmethod + def obs( + self, *, column_names: Optional[Sequence[str]] = None + ) -> data.ReadIter[pa.Table]: + """Returns ``obs`` as an Arrow table iterator.""" + raise NotImplementedError() + + @abc.abstractmethod + def var( + self, *, column_names: Optional[Sequence[str]] = None + ) -> data.ReadIter[pa.Table]: + """Returns ``var`` as an Arrow table iterator.""" + raise NotImplementedError() + + @abc.abstractmethod + def obs_joinids(self) -> pa.Array: + """Returns ``obs`` ``soma_joinids`` as an Arrow array.""" + raise NotImplementedError() + + @abc.abstractmethod + def var_joinids(self) -> pa.Array: + """Returns ``var`` ``soma_joinids`` as an Arrow array.""" + raise NotImplementedError() + + @property + def n_obs(self) -> int: + """The number of ``obs`` axis query results.""" + return len(self.obs_joinids()) + + @property + def n_vars(self) -> int: + """The number of ``var`` axis query results.""" + return len(self.var_joinids()) + + @abc.abstractmethod + def X(self, layer_name: str) -> data.SparseRead: + """Returns an ``X`` layer as ``SparseRead`` data. + + :param layer_name: The X layer name to return. 
+ """ + raise NotImplementedError() + + @abc.abstractmethod + def obsp(self, layer: str) -> data.SparseRead: + """Returns an ``obsp`` layer as ``SparseRead`` data.""" + raise NotImplementedError() + + @abc.abstractmethod + def varp(self, layer: str) -> data.SparseRead: + """Returns a ``varp`` layer as ``SparseRead`` data.""" + raise NotImplementedError() + + @abc.abstractmethod + def to_anndata( + self, + X_name: str, + *, + column_names: Optional[AxisColumnNames] = None, + X_layers: Sequence[str] = (), + ) -> anndata.AnnData: + """ + Execute the query and return the result as an ``AnnData`` in-memory object. + + :param X_name: The name of the X layer to read and return + in the ``X`` slot. + :param column_names: The columns in the ``var`` and ``obs`` dataframes + to read. + :param X_layers: Additional X layers to read and return + in the ``layers`` slot. + """ + raise NotImplementedError() + + # Context management + + @abc.abstractmethod + def close(self) -> None: + """Releases resources associated with this query. + + This method must be idempotent. + """ + raise NotImplementedError() + + def __exit__(self, *_: Any) -> None: + self.close() + + def __del__(self) -> None: + """Ensure that we're closed when our last ref disappears.""" + # If any superclass in our MRO has a __del__, call it. 
+ sdel = getattr(super(), "__del__", lambda: None) + sdel() + self.close() diff --git a/python-spec/testing/test_query_axis.py b/python-spec/testing/test_query_axis.py new file mode 100644 index 00000000..28506f37 --- /dev/null +++ b/python-spec/testing/test_query_axis.py @@ -0,0 +1,54 @@ +from typing import Tuple + +import numpy as np +import pytest +from pytest import mark + +from somacore import options +from somacore.query import axis + + +@mark.parametrize( + ["coords", "want"], + [ + ((slice(1, 10),), (slice(1, 10),)), + ([0, 1, 2], (0, 1, 2)), + ((slice(None), [0, 88, 1001]), (slice(None), (0, 88, 1001))), + pytest.param( + ("string-coord",), + ("string-coord",), + marks=mark.xfail(reason="strings not supported yet"), + ), + pytest.param( + (b"bytes-coord",), + (b"bytes-coord",), + marks=mark.xfail(reason="bytes not supported yet"), + ), + ], +) +def test_canonicalization( + coords: options.SparseDFCoords, want: Tuple[options.SparseDFCoord, ...] +) -> None: + axq = axis.AxisQuery(coords=coords) + assert want == axq.coords + + +def test_canonicalization_nparray() -> None: + axq = axis.AxisQuery(coords=(1, np.array([1, 2, 3]))) + + one, arr = axq.coords + assert one == 1 + assert (np.array([1, 2, 3]) == arr).all() + + +@mark.parametrize( + ["coords"], + [ + ("forbid bare strings",), + (b"forbid bare byteses",), + ([1, 1.5, 2],), + ], +) +def test_canonicalization_bad(coords) -> None: + with pytest.raises(TypeError): + axis.AxisQuery(coords=coords)