Skip to content

Commit

Permalink
[python] Implement IntIndexer as class that wraps around `clib.IntI…
Browse files Browse the repository at this point in the history
…ndexer` (#2310)

Replace the `tiledbsoma_build_index` function with direct use of an `IntIndexer` class that wraps `clib.IntIndexer`.
  • Loading branch information
jp-dark authored Mar 26, 2024
1 parent b613f3b commit bc1cd79
Show file tree
Hide file tree
Showing 11 changed files with 93 additions and 94 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ repos:
- id: mypy
additional_dependencies:
- "pandas-stubs==1.5.3.230214"
- "somacore==1.0.9"
- "somacore==1.0.10"
- "types-setuptools==67.4.0.3"
args: ["--config-file=apis/python/pyproject.toml", "apis/python/src", "apis/python/devtools"]
pass_filenames: false
2 changes: 1 addition & 1 deletion apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def run(self):
"pyarrow>=9.0.0; platform_system!='Darwin'",
"scanpy>=1.9.2",
"scipy",
"somacore==1.0.9",
"somacore==1.0.10",
"tiledb~=0.27.0",
"typing-extensions", # Note "-" even though `import typing_extensions`
],
Expand Down
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
get_storage_engine,
show_package_versions,
)
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._measurement import Measurement
from ._sparse_nd_array import SparseNDArray
from .options import SOMATileDBContext, TileDBCreateOptions
Expand All @@ -173,7 +173,6 @@
__all__ = [
"AxisColumnNames",
"AxisQuery",
"tiledbsoma_build_index",
"Collection",
"DataFrame",
"DenseNDArray",
Expand All @@ -184,6 +183,7 @@
"get_implementation",
"get_SOMA_version",
"get_storage_engine",
"IntIndexer",
"Measurement",
"open",
"ResultOrder",
Expand Down
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ._collection import Collection, CollectionBase
from ._dataframe import DataFrame
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._measurement import Measurement
from ._tdb_handles import Wrapper
from ._tiledb_object import AnyTileDBObject
Expand Down Expand Up @@ -95,7 +95,7 @@ def axis_query( # type: ignore
obs_query=obs_query or query.AxisQuery(),
var_query=var_query or query.AxisQuery(),
index_factory=functools.partial(
tiledbsoma_build_index,
IntIndexer,
context=self.context,
),
)
52 changes: 0 additions & 52 deletions apis/python/src/tiledbsoma/_index_util.py

This file was deleted.

65 changes: 65 additions & 0 deletions apis/python/src/tiledbsoma/_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Optional, Union

import numpy as np
import numpy.typing as npt
import pandas as pd
import pyarrow as pa

from tiledbsoma import pytiledbsoma as clib

if TYPE_CHECKING:
from .options import SOMATileDBContext

# Container types accepted by ``IntIndexer`` both for the keys used to build
# the index and for the lookup targets passed to ``get_indexer``.
IndexerDataType = Union[
npt.NDArray[np.int64],
pa.Array,
pa.IntegerArray,
pd.Series,
pd.arrays.IntegerArray,
pa.ChunkedArray,
List[int],
]


class IntIndexer:
    """A re-indexer for unique integer indices.

    Wraps ``clib.IntIndexer``: the keys are loaded once at construction time
    and can then be looked up repeatedly with :meth:`get_indexer`.

    Lifecycle:
        Experimental.
    """

    def __init__(
        self, data: IndexerDataType, *, context: Optional["SOMATileDBContext"] = None
    ):
        """Initialize the re-indexer for the provided indices.

        Args:
            data:
                Integer keys used to build the index (hash) table.
            context:
                ``SOMATileDBContext`` object containing concurrency level.
                When ``None``, the native indexer is created without a
                context.

        Lifecycle:
            Experimental.
        """
        self._context = context
        # The native indexer takes the underlying native context (or None),
        # not the Python-level SOMATileDBContext wrapper.
        if context is None:
            native_context = None
        else:
            native_context = context.native_context
        self._reindexer = clib.IntIndexer(native_context)
        self._reindexer.map_locations(data)

    def get_indexer(self, target: IndexerDataType) -> Any:
        """Compute underlying indices of index for target data.

        Compatible with Pandas' Index.get_indexer method.

        Args:
            target: Data to return re-index data for.

        Returns:
            The result of the native lookup for ``target``.
        """
        if isinstance(target, (pa.Array, pa.ChunkedArray)):
            # Arrow arrays are dispatched to the dedicated pyarrow binding.
            return self._reindexer.get_indexer_pyarrow(target)
        # Every other supported input goes through the general binding.
        return self._reindexer.get_indexer_general(target)
20 changes: 9 additions & 11 deletions apis/python/src/tiledbsoma/_read_iters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

from . import _util
from ._exception import SOMAError
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._types import NTuple
from .options import SOMATileDBContext

Expand Down Expand Up @@ -138,10 +138,7 @@ def __init__(
self.axes_to_reindex = set(range(self.ndim)) - set(self.reindex_disable_on_axis)
assert context is not None
self.minor_axes_indexer = {
d: tiledbsoma_build_index(
self.joinids[d].to_numpy(),
context=context,
)
d: IntIndexer(self.joinids[d].to_numpy(), context=context)
for d in (self.axes_to_reindex - set((self.major_axis,)))
}

Expand Down Expand Up @@ -257,9 +254,8 @@ def _reindexed_table_reader(
if d in self.axes_to_reindex:
if d == self.major_axis:
assert self.context is not None
col = tiledbsoma_build_index(
coords[self.major_axis],
context=self.context,
col = IntIndexer(
coords[self.major_axis], context=self.context
).get_indexer(
col.to_numpy(),
)
Expand Down Expand Up @@ -337,9 +333,11 @@ def _create_reader(self) -> Iterator[BlockwiseScipyReadIterResult]:
"""
Private. Iterator over SparseNDArray producing sequence of scipy sparse matrix.
"""
yield from self._cs_reader(
_pool=self._threadpool
) if self.compress else self._coo_reader(_pool=self._threadpool)
yield from (
self._cs_reader(_pool=self._threadpool)
if self.compress
else self._coo_reader(_pool=self._threadpool)
)

def _sorted_tbl_reader(
self, _pool: Optional[ThreadPoolExecutor] = None
Expand Down
10 changes: 2 additions & 8 deletions apis/python/src/tiledbsoma/reindexer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,10 @@ void load_reindexer(py::module& m) {
// Perform lookup for a large input array of keys and writes the
// looked up values into previously allocated array (works for the
// cases in which python and R pre-allocate the array)
.def(
"get_indexer",
[](IntIndexer& indexer, py::array_t<int64_t> lookups) {
return get_indexer_general(indexer, lookups);
})
.def("get_indexer_general", get_indexer_general)
// If the input is not arrow (does not have _export_to_c attribute),
// it will be handled using a general input method.
.def("get_indexer", [](IntIndexer& indexer, py::object py_arrow_array) {
return get_indexer_py_arrow(indexer, py_arrow_array);
});
.def("get_indexer_pyarrow", get_indexer_py_arrow);
}

} // namespace libtiledbsomacpp
6 changes: 3 additions & 3 deletions apis/python/tests/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pyarrow as pa
import pytest

from tiledbsoma._index_util import tiledbsoma_build_index
from tiledbsoma._indexer import IntIndexer
from tiledbsoma.options import SOMATileDBContext
from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context

Expand All @@ -23,7 +23,7 @@ def test_duplicate_key_indexer_error(
):
context = _validate_soma_tiledb_context(SOMATileDBContext())
with pytest.raises(RuntimeError, match="There are duplicate keys."):
tiledbsoma_build_index(keys, context=context)
IntIndexer(keys, context=context)

pd_index = pd.Index(keys)
with pytest.raises(pd.errors.InvalidIndexError):
Expand Down Expand Up @@ -101,7 +101,7 @@ def test_indexer(contextual: bool, keys: np.array, lookups: np.array):
num_threads = 10

def target():
indexer = tiledbsoma_build_index(keys, context=context)
indexer = IntIndexer(keys, context=context)
results = indexer.get_indexer(lookups)
all_results.append(results)

Expand Down
20 changes: 7 additions & 13 deletions apis/python/tests/test_reindexer_api.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
import numpy as np

from tiledbsoma import SOMATileDBContext, tiledbsoma_build_index
from typing import Optional

import numpy as np
import pytest

def test_reindexer_api_thread_count():
keys = np.arange(3, 10, 2)
ids = np.arange(3, 10, 2)
expected = np.array([0, 1, 2, 3])
indexer = tiledbsoma_build_index(keys)
result = indexer.get_indexer(ids)
assert np.equal(result.all(), expected.all())
from tiledbsoma import IntIndexer, SOMATileDBContext


def test_reindexer_api_context():
context = SOMATileDBContext()
@pytest.mark.parametrize("context", [None, SOMATileDBContext()])
def test_reindexer_api(context: Optional[SOMATileDBContext]):
    """Check the public ``IntIndexer`` API with and without a context.

    Builds an indexer over the keys ``[3, 5, 7, 9]`` and looks up the same
    values; each key must map to its position in the key array.
    """
    keys = np.arange(3, 10, 2)
    ids = np.arange(3, 10, 2)
    expected = np.array([0, 1, 2, 3])
    indexer = IntIndexer(keys, context=context)
    result = indexer.get_indexer(ids)
    # Compare element-wise. The previous `np.equal(result.all(), expected.all())`
    # reduced both arrays to a single boolean first, so it could not detect
    # wrong lookup results.
    assert np.array_equal(result, expected)
2 changes: 1 addition & 1 deletion libtiledbsoma/src/reindexer/reindexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,4 @@ class IntIndexer {

} // namespace tiledbsoma

#endif // TILEDBSOMA_REINDEXER_H
#endif // TILEDBSOMA_REINDEXER_H

0 comments on commit bc1cd79

Please sign in to comment.