[python] Update TileDB-SOMA to use the somacore release with spatial datatypes (#3078)

This updates the somacore version and adds the new spatial types to TileDB-SOMA as empty classes that raise `NotImplementedError`. It updates `Experiment` and `Measurement` to match the updated somacore API. Co-authored-by: Aaron Wolen <aaron@wolen.com> Co-authored-by: nguyenv <vivian@tiledb.com> Co-authored-by: John Kerl <kerl.john.r@gmail.com>
single-cell-data · Sep 27, 2024 · 95c2ba7 · 95c2ba7
1 parent c58eef2
commit 95c2ba7
Show file tree

Hide file tree

Showing 9 changed files with 1,168 additions and 2 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
         # Pandas 2.x types (e.g. `pd.Series[Any]`). See `_types.py` or https://github.com/single-cell-data/TileDB-SOMA/issues/2839
         # for more info.
         - "pandas-stubs>=2"
-        - "somacore==1.0.17"
+        - "somacore==1.0.18"
         - types-setuptools
       args: ["--config-file=apis/python/pyproject.toml", "apis/python/src", "apis/python/devtools"]
       pass_filenames: false
diff --git a/apis/python/setup.py b/apis/python/setup.py
@@ -335,7 +335,7 @@ def run(self):
         "scanpy>=1.9.2",
         "scipy",
         # Note: the somacore version is in .pre-commit-config.yaml too
-        "somacore==1.0.17",
+        "somacore==1.0.18",
         "tiledb~=0.32.0",
         "typing-extensions",  # Note "-" even though `import typing_extensions`
     ],

diff --git a/apis/python/src/tiledbsoma/_constants.py b/apis/python/src/tiledbsoma/_constants.py
@@ -7,6 +7,12 @@
 """
 
 SOMA_JOINID = "soma_joinid"
+SOMA_GEOMETRY = "soma_geometry"
 SOMA_OBJECT_TYPE_METADATA_KEY = "soma_object_type"
 SOMA_ENCODING_VERSION_METADATA_KEY = "soma_encoding_version"
 SOMA_ENCODING_VERSION = "1"
+
+SPATIAL_DISCLAIMER = (
+    "Support for spatial types is experimental. Changes to both the API and data "
+    "storage may not be backwards compatible."
+)
diff --git a/apis/python/src/tiledbsoma/_experiment.py b/apis/python/src/tiledbsoma/_experiment.py
@@ -16,6 +16,7 @@
 from ._dataframe import DataFrame
 from ._indexer import IntIndexer
 from ._measurement import Measurement
+from ._scene import Scene
 from ._soma_object import AnySOMAObject
 
 
@@ -24,6 +25,7 @@ class Experiment(  # type: ignore[misc]  # __eq__ false positive
     experiment.Experiment[  # type: ignore[type-var]
         DataFrame,
         Collection[Measurement],
+        Collection[Scene],
         AnySOMAObject,
     ],
 ):
@@ -43,6 +45,8 @@ class Experiment(  # type: ignore[misc]  # __eq__ false positive
             defined in this dataframe.
         ms (Collection):
             A collection of named measurements.
+        spatial (Collection):
+            A collection of spatial scenes.
 
     Example:
         >>> import tiledbsoma
@@ -69,6 +73,8 @@ class Experiment(  # type: ignore[misc]  # __eq__ false positive
     _subclass_constrained_soma_types = {
         "obs": ("SOMADataFrame",),
         "ms": ("SOMACollection",),
+        "spatial": ("SOMACollection",),
+        "obs_spatial_presence": ("SOMADataFrame",),
     }
 
     def axis_query(  # type: ignore

diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2024 The Chan Zuckerberg Initiative Foundation
+# Copyright (c) 2024 TileDB, Inc.
+#
+# Licensed under the MIT License.
+"""
+Implementation of a SOMA Geometry DataFrame
+"""
+
+import warnings
+from typing import Any, Optional, Sequence, Tuple, Union
+
+import pyarrow as pa
+import somacore
+from somacore import CoordinateSpace, CoordinateTransform, options
+from typing_extensions import Self
+
+from ._constants import SOMA_GEOMETRY, SOMA_JOINID, SPATIAL_DISCLAIMER
+from ._dataframe import Domain
+from ._read_iters import TableReadIter
+from ._types import OpenTimestamp
+from .options import SOMATileDBContext
+
+_UNBATCHED = options.BatchSize()
+
+
+class GeometryDataFrame(somacore.GeometryDataFrame):
+    """A specialized SOMA object for storing complex geometries with spatial indexing.
+
+    The ``GeometryDataFrame`` class is designed to store and manage geometric shapes
+    such as polygons, lines, and multipoints, along with additional columns for
+    associated attributes.
+
+    Lifecycle:
+        Experimental.
+    """
+
+    __slots__ = ()
+
+    # Lifecycle
+
+    @classmethod
+    def create(
+        cls,
+        uri: str,
+        *,
+        schema: pa.Schema,
+        index_column_names: Sequence[str] = (SOMA_JOINID, SOMA_GEOMETRY),
+        axis_names: Sequence[str] = ("x", "y"),
+        domain: Optional[Domain] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+        context: Optional[SOMATileDBContext] = None,
+        tiledb_timestamp: Optional[OpenTimestamp] = None,
+    ) -> Self:
+        """Creates a new ``GeometryDataFrame`` at the given URI.
+
+        The schema of the created geometry dataframe will include a column named
+        ``soma_joinid`` of type ``pyarrow.int64``, with negative values
+        disallowed, and a column named ``soma_geometry`` of type ``pyarrow.binary`` or
+        ``pyarrow.large_binary``.  If a ``soma_joinid`` or ``soma_geometry`` column
+        is present in the provided schema, it must be of the correct type.  If
+        either the ``soma_joinid`` column or the ``soma_geometry`` column is not
+        provided, it will be added. The ``soma_joinid`` column may be an index column.
+        The ``soma_geometry`` column must be an index column.
+
+        Args:
+            uri: The URI where the dataframe will be created.
+            schema: Arrow schema defining the per-column schema. This schema
+                must define all columns, including columns to be named as index
+                columns.  If the schema includes types unsupported by the SOMA
+                implementation, a ValueError will be raised.
+            index_column_names: A list of column names to use as user-defined
+                index columns (e.g., ``['cell_type', 'tissue_type']``).
+                All named columns must exist in the schema, and at least one
+                index column name is required.
+            axis_names: An ordered list of axis column names that correspond to the
+                names of the axes of the coordinate space the geometries are defined
+                on.
+            domain: An optional sequence of tuples specifying the domain of each
+                index column. Two tuples must be provided for the ``soma_geometry``
+                column which store the width followed by the height. Each tuple should
+                be a pair consisting of the minimum and maximum values storable in the
+                index column. If omitted entirely, or if ``None`` in a given dimension,
+                the corresponding index-column domain will use the minimum and maximum
+                possible values for the column's datatype.  This makes a dataframe
+                growable.
+
+        Returns:
+            The newly created geometry dataframe, opened for writing.
+
+        Lifecycle:
+            Experimental.
+        """
+        warnings.warn(SPATIAL_DISCLAIMER)
+        raise NotImplementedError()
+
+    # Data operations
+
+    def read(
+        self,
+        coords: options.SparseDFCoords = (),
+        column_names: Optional[Sequence[str]] = None,
+        *,
+        batch_size: options.BatchSize = _UNBATCHED,
+        partitions: Optional[options.ReadPartitions] = None,
+        result_order: options.ResultOrderStr = options.ResultOrder.AUTO,
+        value_filter: Optional[str] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> TableReadIter:
+        """Reads a user-defined slice of data into Arrow tables.
+
+        Args:
+            coords: for each index dimension, which rows to read.
+                Defaults to ``()``, meaning no constraint -- all IDs.
+            column_names: the named columns to read and return.
+                Defaults to ``None``, meaning no constraint -- all column names.
+            partitions: If present, specifies that this is part of
+                a partitioned read, and which part of the data to include.
+            result_order: the order to return results, specified as a
+                :class:`~options.ResultOrder` or its string value.
+            value_filter: an optional value filter to apply to the results.
+                The default of ``None`` represents no filter. Value filter
+                syntax is implementation-defined; see the documentation
+                for the particular SOMA implementation for details.
+
+        Returns:
+            A :class:`ReadIter` of :class:`pa.Table`s.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    def read_spatial_region(
+        self,
+        region: Optional[options.SpatialRegion] = None,
+        column_names: Optional[Sequence[str]] = None,
+        *,
+        region_transform: Optional[CoordinateTransform] = None,
+        region_coord_space: Optional[CoordinateSpace] = None,
+        batch_size: options.BatchSize = _UNBATCHED,
+        partitions: Optional[options.ReadPartitions] = None,
+        result_order: options.ResultOrderStr = options.ResultOrder.AUTO,
+        value_filter: Optional[str] = None,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> somacore.SpatialRead[somacore.ReadIter[pa.Table]]:
+        """Reads data intersecting a user-defined region of space into a
+        :class:`SpatialRead` with data in Arrow tables.
+
+
+        Args:
+            region: The region to query. May be a box in the form
+                [x_min, y_min, x_max, y_max] (for 2D images), a box in the form
+                [x_min, y_min, z_min, x_max, y_max, z_max] (for 3D images), or
+                a shapely Geometry.
+            column_names: The named columns to read and return.
+                Defaults to ``None``, meaning no constraint -- all column names.
+            region_transform: An optional coordinate transform from the read region to the
+                coordinate system of the spatial dataframe.
+                Defaults to ``None``, meaning an identity transform.
+            region_coord_space: An optional coordinate space for the region being read.
+                Defaults to ``None``, meaning the coordinate space will be inferred
+                from the transform.
+            batch_size: The size of batched reads.
+                Defaults to `unbatched`.
+            partitions: If present, specifies that this is part of a partitioned read,
+                and which part of the data to include.
+            result_order: the order to return results, specified as a
+                :class:`~options.ResultOrder` or its string value.
+            value_filter: an optional value filter to apply to the results.
+                The default of ``None`` represents no filter. Value filter
+                syntax is implementation-defined; see the documentation
+                for the particular SOMA implementation for details.
+
+        Returns:
+            A :class:`SpatialRead` whose data is a :class:`ReadIter` of :class:`pa.Table`s.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    def write(
+        self,
+        values: Union[pa.RecordBatch, pa.Table],
+        *,
+        platform_config: Optional[options.PlatformConfig] = None,
+    ) -> Self:
+        """Writes the data from an Arrow table to the persistent object.
+
+        As duplicate index values are not allowed, index values already present
+        in the object are overwritten and new index values are added.
+
+        Args:
+            values: An Arrow table containing all columns, including
+                the index columns. The schema for the values must match
+                the schema for the ``GeometryDataFrame``.
+
+        Returns: ``self``, to enable method chaining.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    # Metadata operations
+
+    @property
+    def schema(self) -> pa.Schema:
+        """The schema of the data in this dataframe.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    @property
+    def index_column_names(self) -> Tuple[str, ...]:
+        """The names of the index (dimension) columns.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    @property
+    def axis_names(self) -> Tuple[str, ...]:
+        """The names of the axes of the coordinate space the data is defined on.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    @property
+    def coordinate_space(self) -> Optional[CoordinateSpace]:
+        """Coordinate space for this geometry dataframe.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    @coordinate_space.setter
+    def coordinate_space(self, value: CoordinateSpace) -> None:
+        """Coordinate space for this geometry dataframe.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
+
+    @property
+    def domain(self) -> Tuple[Tuple[Any, Any], ...]:
+        """The allowable range of values in each index column.
+
+        Returns: a tuple of minimum and maximum values, inclusive,
+            storable on each index column of the dataframe.
+
+        Lifecycle:
+            Experimental.
+        """
+        raise NotImplementedError()
diff --git a/apis/python/src/tiledbsoma/_measurement.py b/apis/python/src/tiledbsoma/_measurement.py
@@ -80,4 +80,5 @@ class Measurement(  # type: ignore[misc]  # __eq__ false positive
         "obsp": ("SOMACollection",),
         "varm": ("SOMACollection",),
         "varp": ("SOMACollection",),
+        "var_spatial_presence": ("SOMADataFrame",),
     }