Skip to content

Commit

Permalink
Add ExperimentAxisQuery.
Browse files Browse the repository at this point in the history
This extracts the public interface of
`tiledbsoma.query.ExperimentAxisQuery` and puts it here.

Because there is no TileDB-specific behavior in `ExperimentAxisQuery`,
I believe we can actually pull the entire implementation here (it only
uses the properties of Experiment itself) but that is for a later date.
  • Loading branch information
thetorpedodog committed Jan 12, 2023
1 parent fe5abfb commit 12b033e
Show file tree
Hide file tree
Showing 7 changed files with 278 additions and 4 deletions.
13 changes: 9 additions & 4 deletions python-spec/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@ name = "somacore"
description = "Python-language API specification and base utilities for implementation of the SOMA system."
dynamic = ["version"]
readme = "./README.md"
dependencies = ["attrs>=22.1", "numpy>=1.21", "pyarrow", "typing-extensions"]
dependencies = [
"anndata",
"attrs>=22.1",
"numpy>=1.21",
"pyarrow",
"typing-extensions",
]
requires-python = "~=3.7"

[project.optional-dependencies]
Expand All @@ -18,14 +24,13 @@ packages.find.where = ["src"]
package-data.somacore = ["py.typed"]
dynamic.version.attr = "somacore._version.version"


[tool.isort]
profile = "black"
line_length = 88
force_single_line = true
single_line_exclusions = ["typing", "typing_extensions"]

[[tool.mypy.overrides]]
# pyarrow currently does not have canonical type stubs.
module = ["pyarrow"]
# These dependencies do not currently have canonical type stubs.
module = ["anndata", "pyarrow"]
ignore_missing_imports = true
5 changes: 5 additions & 0 deletions python-spec/src/somacore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from somacore import data
from somacore import ephemeral
from somacore import options
from somacore.query import axis

__version__ = _version.version
__version_tuple__ = _version.version_tuple
Expand All @@ -29,6 +30,8 @@
BatchSize = options.BatchSize
ResultOrder = options.ResultOrder

AxisQuery = axis.AxisQuery

__all__ = (
"SOMAObject",
"Collection",
Expand All @@ -40,4 +43,6 @@
"SparseRead",
"IOfN",
"BatchSize",
"ResultOrder",
"AxisQuery",
)
17 changes: 17 additions & 0 deletions python-spec/src/somacore/composed.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""Implementations of the composed SOMA data types."""

from typing import Optional

from typing_extensions import Final

from somacore import _wrap
from somacore import base
from somacore import data
from somacore.query import axis
from somacore.query import query


class Measurement(_wrap.CollectionProxy):
Expand Down Expand Up @@ -68,4 +72,17 @@ class Experiment(_wrap.CollectionProxy):
ms = _wrap.item(base.Collection[Measurement])
"""A collection of named measurements."""

def axis_query(
    self,
    measurement_name: str,
    *,
    obs_query: Optional[axis.AxisQuery] = None,
    var_query: Optional[axis.AxisQuery] = None,
) -> query.ExperimentAxisQuery:
    """Builds an axis query against this experiment.

    How to use the returned object is described on
    :class:`query.ExperimentAxisQuery`.

    This definition only declares the interface: it performs no work and
    unconditionally raises.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()

soma_type: Final = "SOMAExperiment"
Empty file.
81 changes: 81 additions & 0 deletions python-spec/src/somacore/query/axis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Any, Optional, Sequence, Tuple

import attrs
import numpy as np
import pyarrow as pa
from typing_extensions import TypeGuard

from somacore import options


def _canonicalize_coords(
    in_coords: options.SparseDFCoords,
) -> Tuple[options.SparseDFCoord, ...]:
    """Checks a full set of query coordinates and returns it as a tuple.

    ``DataFrame`` would report malformed coordinates on its own eventually;
    checking here just surfaces the problem earlier, for better UX.

    :param in_coords: The per-dimension coordinates, as a sequence other
        than ``str`` or ``bytes``.
    :return: A tuple holding the canonicalized form of every coordinate.
    :raises TypeError: If ``in_coords`` is not a normal sequence, or if any
        individual coordinate is rejected.
    """
    if _is_normal_sequence(in_coords):
        return tuple(map(_canonicalize_coord, in_coords))
    raise TypeError(
        "query coordinates must be a normal sequence, not `str` or `bytes`."
    )


def _canonicalize_coord(coord: options.SparseDFCoord) -> options.SparseDFCoord:
    """Checks one coordinate, turning a plain sequence into a tuple.

    :param coord: The coordinate for a single dimension.
    :return: ``coord`` itself when it is ``None``, an ``int``, a ``slice``,
        an Arrow array / chunked array or a NumPy array; otherwise a tuple
        copy of the sequence.
    :raises TypeError: If ``coord`` is none of the above (this includes
        ``str`` and ``bytes``).
    """
    if coord is None:
        return coord
    passthrough_types = (int, slice, pa.Array, pa.ChunkedArray, np.ndarray)
    if isinstance(coord, passthrough_types):
        return coord
    if not _is_normal_sequence(coord):
        raise TypeError(f"{type(coord)} object cannot be used as a coordinate.")
    # The elements themselves are not inspected here; an unsuitable element
    # is left to raise further down the line.
    return tuple(coord)


def _is_normal_sequence(it: Any) -> TypeGuard[Sequence]:
return not isinstance(it, (str, bytes)) and isinstance(it, Sequence)


@attrs.define(frozen=True, kw_only=True)
class AxisQuery:
    """Single-axis dataframe query with coordinates and a value filter.

    [lifecycle: experimental]

    Per dimension, the AxisQuery can have value of:

    * None - all data
    * Coordinates - a set of coordinates on the axis dataframe index,
      expressed in any type or format supported by ``DataFrame.read()``.
    * A SOMA ``value_filter`` across columns in the axis dataframe,
      expressed as string
    * Or, a combination of coordinates and value filter.

    ``coords`` is a sequence with one entry per queried dimension, so a
    query over a single dimension still needs a one-element sequence.
    Note that ``([0,1,2])`` is just the list ``[0,1,2]`` (three scalar
    coordinates); write ``([0,1,2],)`` to select three points on one
    dimension.

    Examples::

        AxisQuery()  # all data
        AxisQuery(coords=(slice(1,10),))  # 1D, slice
        AxisQuery(coords=([0,1,2],))  # 1D, point indexing using array-like
        AxisQuery(coords=(slice(None), numpy.array([0,88,1001])))  # 2D
        AxisQuery(value_filter="tissue == 'lung'")
        AxisQuery(coords=(slice(1,None),), value_filter="tissue == 'lung'")
    """

    value_filter: Optional[str] = attrs.field(
        default=None,
        validator=attrs.validators.optional(attrs.validators.instance_of(str)),
    )
    """A string specifying a SOMA ``value_filter``."""
    coords: Tuple[options.SparseDFCoord, ...] = attrs.field(
        default=(slice(None),),
        converter=_canonicalize_coords,
    )
    """Query (slice) by dimension.

    The tuple must have a length less than or equal to the number of dimensions,
    and be of a type supported by ``DataFrame``. Whatever sequence is passed
    in is validated and stored as a tuple.
    """
112 changes: 112 additions & 0 deletions python-spec/src/somacore/query/query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import abc
import contextlib
from typing import Any, Optional, Sequence

import anndata
import pyarrow as pa
from typing_extensions import TypedDict

from somacore import data


class AxisColumnNames(TypedDict, total=False):
    """Column selections for experiment axis query read operations.

    Both keys are optional; leaving a key out is treated the same as
    setting it to ``None``.
    """

    obs: Optional[Sequence[str]]
    """Names of the ``obs`` columns to use; every column if ``None`` or absent."""
    var: Optional[Sequence[str]]
    """Names of the ``var`` columns to use; every column if ``None`` or absent."""


class ExperimentAxisQuery(contextlib.AbstractContextManager, metaclass=abc.ABCMeta):
    """Abstract interface of a query over the ``obs`` and ``var`` axes.

    Concrete subclasses supply the read methods and :meth:`close`. A query
    is a context manager: leaving the ``with`` block calls :meth:`close`,
    as does finalization of the object.
    """

    @abc.abstractmethod
    def obs(
        self, *, column_names: Optional[Sequence[str]] = None
    ) -> data.ReadIter[pa.Table]:
        """Returns ``obs`` as an Arrow table iterator.

        :param column_names: The ``obs`` columns to read.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def var(
        self, *, column_names: Optional[Sequence[str]] = None
    ) -> data.ReadIter[pa.Table]:
        """Returns ``var`` as an Arrow table iterator.

        :param column_names: The ``var`` columns to read.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def obs_joinids(self) -> pa.Array:
        """Returns ``obs`` ``soma_joinids`` as an Arrow array."""
        raise NotImplementedError()

    @abc.abstractmethod
    def var_joinids(self) -> pa.Array:
        """Returns ``var`` ``soma_joinids`` as an Arrow array."""
        raise NotImplementedError()

    @property
    def n_obs(self) -> int:
        """The number of ``obs`` axis query results."""
        return len(self.obs_joinids())

    @property
    def n_vars(self) -> int:
        """The number of ``var`` axis query results."""
        return len(self.var_joinids())

    @abc.abstractmethod
    def X(self, layer_name: str) -> data.SparseRead:
        """Returns an ``X`` layer as ``SparseRead`` data.

        :param layer_name: The X layer name to return.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def obsp(self, layer: str) -> data.SparseRead:
        """Returns an ``obsp`` layer as ``SparseRead`` data.

        :param layer: The ``obsp`` layer name to return.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def varp(self, layer: str) -> data.SparseRead:
        """Returns a ``varp`` layer as ``SparseRead`` data.

        :param layer: The ``varp`` layer name to return.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def to_anndata(
        self,
        X_name: str,
        *,
        column_names: Optional[AxisColumnNames] = None,
        X_layers: Sequence[str] = (),
    ) -> anndata.AnnData:
        """
        Execute the query and return result as an ``AnnData`` in-memory object.

        :param X_name: The name of the X layer to read and return
            in the ``X`` slot.
        :param column_names: The columns in the ``var`` and ``obs`` dataframes
            to read.
        :param X_layers: Additional X layers to read and return
            in the ``layers`` slot.
        """
        raise NotImplementedError()

    # Context management

    @abc.abstractmethod
    def close(self) -> None:
        """Releases resources associated with this query.

        This method must be idempotent.
        """
        raise NotImplementedError()

    def __exit__(self, *_: Any) -> None:
        """Closes the query when the ``with`` block is left."""
        self.close()

    def __del__(self) -> None:
        """Ensure that we're closed when our last ref disappears."""
        # If any superclass in our MRO has a __del__, call it.
        sdel = getattr(super(), "__del__", lambda: None)
        try:
            sdel()
        finally:
            # Release our own resources even when the superclass finalizer
            # raised; otherwise they would never be closed.
            self.close()
54 changes: 54 additions & 0 deletions python-spec/testing/test_query_axis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import Tuple

import numpy as np
import pytest
from pytest import mark

from somacore import options
from somacore.query import axis


@mark.parametrize(
    ("coords", "want"),
    [
        ((slice(1, 10),), (slice(1, 10),)),
        ([0, 1, 2], (0, 1, 2)),
        ((slice(None), [0, 88, 1001]), (slice(None), (0, 88, 1001))),
        pytest.param(
            ("string-coord",),
            ("string-coord",),
            marks=mark.xfail(reason="strings not supported yet"),
        ),
        pytest.param(
            (b"bytes-coord",),
            (b"bytes-coord",),
            marks=mark.xfail(reason="bytes not supported yet"),
        ),
    ],
)
def test_canonicalization(
    coords: options.SparseDFCoords, want: Tuple[options.SparseDFCoord, ...]
) -> None:
    """Accepted coordinates are stored on the query in tuple form."""
    built = axis.AxisQuery(coords=coords)
    assert want == built.coords


def test_canonicalization_nparray() -> None:
    """A NumPy array coordinate keeps its values next to a scalar one."""
    expected = np.array([1, 2, 3])
    built = axis.AxisQuery(coords=(1, np.array([1, 2, 3])))

    # Arrays are compared element-wise, so unpack rather than compare tuples.
    scalar, array = built.coords
    assert scalar == 1
    assert (expected == array).all()


@mark.parametrize(
    "coords",
    [
        "forbid bare strings",
        b"forbid bare byteses",
        [1, 1.5, 2],
    ],
)
def test_canonicalization_bad(coords) -> None:
    """Bare strings, bare bytes and float coordinates are all rejected."""
    with pytest.raises(TypeError):
        axis.AxisQuery(coords=coords)

0 comments on commit 12b033e

Please sign in to comment.