Skip to content

Commit

Permalink
[python] Let registrar provide new shapes for resize (#3152)
Browse files Browse the repository at this point in the history
* [python] Complete 3140

* add unit-test case

* [python] Let registration provide new shapes for resize [skip ci]

* unit-test cases
  • Loading branch information
johnkerl authored Oct 9, 2024
1 parent 46bf2ec commit e80f509
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def _acquire_experiment_mappings(

if experiment_uri is not None:
if not tiledbsoma.Experiment.exists(experiment_uri, context=context):
raise ValueError("cannot find experiment at URI {experiment_uri}")
raise ValueError(f"cannot find experiment at URI {experiment_uri}")

# Pre-check
with tiledbsoma.Experiment.open(experiment_uri, context=context) as exp:
Expand Down Expand Up @@ -488,6 +488,26 @@ def __str__(self) -> str:
lines.append(f"{k}/var:{len(v.data)}")
return "\n".join(lines)

def get_obs_shape(self) -> int:
    """Report the obs shape the experiment must be resized to.

    The result is one more than the largest value stored in
    ``self.obs_axis.data``, so that every value held by the registration
    fits. When the obs axis holds no entries at all, the result is 0.
    """
    # With no entries the default of -1 makes the sum come out to 0.
    return 1 + max(self.obs_axis.data.values(), default=-1)

def get_var_shapes(self) -> Dict[str, int]:
    """Report the var shapes the experiment must be resized to.

    Returns a dict with one entry per key of ``self.var_axes``. Each value
    is one more than the largest value stored in that axis's ``data``, or 0
    when that axis holds no entries.
    """
    # With no entries the default of -1 makes the sum come out to 0.
    return {
        ms_name: 1 + max(var_axis.data.values(), default=-1)
        for ms_name, var_axis in self.var_axes.items()
    }

def to_json(self) -> str:
    """Serialize this object to a JSON string.

    Output uses sorted keys and a four-space indent. ``attrs.asdict`` is
    passed as the ``default`` hook, so objects the ``json`` module cannot
    encode natively are converted through it.
    """
    encoded: str = json.dumps(
        self,
        indent=4,
        sort_keys=True,
        default=attrs.asdict,
    )
    return encoded

Expand Down
34 changes: 32 additions & 2 deletions apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,8 @@ def test_pandas_indexing(
signature_col_names: List[Union[str, Tuple[str, str]]],
):
"""
The `default_index_name` for registration can interact with column- and index-names in a variety of ways; this test
exercises several of them.
The `default_index_name` for registration can interact with column- and
index-names in a variety of ways; this test exercises several of them.
"""
df = PANDAS_INDEXING_TEST_DF.copy()
index_col = index_col_and_name[0]
Expand Down Expand Up @@ -300,6 +300,9 @@ def test_isolated_anndata_mappings(obs_field_name, var_field_name):
["RAW2", "TP53", "VEGFA"]
).data == (6, 3, 4)

assert rd.get_obs_shape() == 3
assert rd.get_var_shapes() == {"measname": 5, "raw": 7}


@pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"])
@pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"])
Expand All @@ -319,6 +322,9 @@ def test_isolated_h5ad_mappings(obs_field_name, var_field_name):
["RAW2", "TP53", "VEGFA"]
).data == (6, 3, 4)

assert rd.get_obs_shape() == 3
assert rd.get_var_shapes() == {"measname": 5, "raw": 7}


@pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"])
@pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"])
Expand All @@ -337,6 +343,9 @@ def test_isolated_soma_experiment_mappings(obs_field_name, var_field_name):
["RAW2", "TP53", "VEGFA"]
).data == (6, 3, 4)

assert rd.get_obs_shape() == 3
assert rd.get_var_shapes() == {"measname": 5, "raw": 7}


@pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"])
@pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"])
Expand Down Expand Up @@ -430,6 +439,9 @@ def test_multiples_without_experiment(
"ZZZ3": 9,
}

assert rd.get_obs_shape() == 12
assert rd.get_var_shapes() == {"measname": 7, "raw": 10}

# Now do the ingestion per se. Note that once registration is done sequentially, ingest order
# mustn't matter, and in fact, can be done in parallel. This is why we test various permutations
# of the ordering of the h5ad file names.
Expand Down Expand Up @@ -677,6 +689,9 @@ def test_multiples_with_experiment(obs_field_name, var_field_name):
"ZZZ3": 9,
}

assert rd.get_obs_shape() == 12
assert rd.get_var_shapes() == {"measname": 7, "raw": 10}


@pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"])
@pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"])
Expand All @@ -691,6 +706,9 @@ def test_append_items_with_experiment(obs_field_name, var_field_name):
var_field_name=var_field_name,
)

assert rd.get_obs_shape() == 6
assert rd.get_var_shapes() == {"measname": 5, "raw": 7}

adata2 = ad.read_h5ad(h5ad2)

original = adata2.copy()
Expand Down Expand Up @@ -1054,6 +1072,9 @@ def test_registration_with_batched_reads(tmp_path, soma_larger, use_small_buffer

assert len(rd.obs_axis.data) == 1000

assert rd.get_obs_shape() == 1000
assert rd.get_var_shapes() == {"measname": 6}


def test_ealm_expose():
"""Checks that this is exported from tiledbsoma.io._registration"""
Expand Down Expand Up @@ -1163,6 +1184,9 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b):
var_field_name=var_field_name,
)

assert rd.get_obs_shape() == nobs_a + nobs_b
assert rd.get_var_shapes() == {"meas": 4, "raw": 0}

tiledbsoma.io.from_anndata(
soma_uri, adata, measurement_name=measurement_name, registration_mapping=rd
)
Expand All @@ -1181,6 +1205,9 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b):
var_field_name=var_field_name,
)

assert rd.get_obs_shape() == nobs_a + nobs_b
assert rd.get_var_shapes() == {"meas": 4}

tiledbsoma.io.from_anndata(
soma_uri, bdata, measurement_name=measurement_name, registration_mapping=rd
)
Expand Down Expand Up @@ -1256,6 +1283,9 @@ def test_multimodal_names(tmp_path, conftest_pbmc3k_adata):
var_field_name=adata_protein.var.index.name,
)

assert rd.get_obs_shape() == 2638
assert rd.get_var_shapes() == {"protein": 500, "raw": 13714}

# Ingest the second anndata object into the protein measurement
tiledbsoma.io.from_anndata(
experiment_uri=uri,
Expand Down

0 comments on commit e80f509

Please sign in to comment.