Skip to content

Commit

Permalink
[python] improve uns DataFrame I/O (#2874)
Browse files Browse the repository at this point in the history
* don't mutate users' `uns` DataFrames in `from_anndata`

* `_extract_uns`: use `_read_dataframe`
  • Loading branch information
ryan-williams authored Aug 14, 2024
1 parent 9f82dad commit 1e64fdd
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 45 deletions.
3 changes: 2 additions & 1 deletion apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2567,7 +2567,8 @@ def _ingest_uns_node(
num_rows = value.shape[0]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
# _write_dataframe modifies passed DataFrame in-place (adding a "soma_joinid" index)
value.copy(),
None,
axis_mapping=AxisIDMapping.identity(num_rows),
**ingest_platform_ctx,
Expand Down
24 changes: 12 additions & 12 deletions apis/python/src/tiledbsoma/io/outgest.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ def _extract_X_key(


def _read_dataframe(
df: DataFrame, default_index_name: Optional[str], fallback_index_name: str
df: DataFrame,
default_index_name: Optional[str] = None,
fallback_index_name: Optional[str] = None,
) -> pd.DataFrame:
"""Outgest a SOMA DataFrame to Pandas, including restoring the original index{,.name}.
Expand Down Expand Up @@ -175,7 +177,7 @@ def _read_dataframe(
#
# NOTE: several edge cases result in the outgested DF not matching the original DF; see
# https://github.com/single-cell-data/TileDB-SOMA/issues/2829.
if fallback_index_name in pdf:
if fallback_index_name is not None and fallback_index_name in pdf:
pdf.set_index(fallback_index_name, inplace=True)
pdf.index.name = None

Expand Down Expand Up @@ -444,20 +446,18 @@ def _extract_uns(
extracted[key] = _extract_uns(element, level=level + 1)
elif isinstance(element, DataFrame):
hint = element.metadata.get(_UNS_OUTGEST_HINT_KEY)
pdf = element.read().concat().to_pandas()
if hint is None:
extracted[key] = pdf
elif hint == _UNS_OUTGEST_HINT_1D:
if hint == _UNS_OUTGEST_HINT_1D:
pdf = element.read().concat().to_pandas()
extracted[key] = _outgest_uns_1d_string_array(pdf, element.uri)
elif hint == _UNS_OUTGEST_HINT_2D:
pdf = element.read().concat().to_pandas()
extracted[key] = _outgest_uns_2d_string_array(pdf, element.uri)
else:
msg = (
f"Warning: uns {collection.uri}[{key!r}] has "
+ "{_UNS_OUTGEST_HINT_KEY} as unrecognized {hint}: leaving this as Pandas DataFrame"
)
logging.log_io_same(msg)
extracted[key] = pdf
if hint is not None:
logging.log_io_same(
f"Warning: uns {collection.uri}[{key!r}] has {_UNS_OUTGEST_HINT_KEY} as unrecognized {hint}: leaving this as Pandas DataFrame"
)
extracted[key] = _read_dataframe(element, fallback_index_name="index")
elif isinstance(element, SparseNDArray):
extracted[key] = element.read().tables().concat().to_pandas()
elif isinstance(element, DenseNDArray):
Expand Down
34 changes: 2 additions & 32 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import tiledbsoma
import tiledbsoma.io
from tiledbsoma import SOMA_JOINID, Experiment, _constants, _factory
from tiledbsoma import Experiment, _constants, _factory
from tiledbsoma._soma_object import SOMAObject
from tiledbsoma.io._common import _TILEDBSOMA_TYPE
import tiledb
Expand Down Expand Up @@ -1024,19 +1024,7 @@ def test_uns_io(tmp_path, outgest_uns_keys):
soma_uri = tmp_path.as_posix()

tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name="RNA")

# NOTE: `from_anndata` mutates user-provided DataFrames in `uns`, demoting their index to a column named "index",
# and installing a `soma_joinid` index. Here we patch the "expected" adata to reflect this, before comparing to
# the post-`froM_anndata` `adata`.
# TODO: fix `from_anndata` to not modify DataFrames in user-provided `uns`.
expected_adata = deepcopy(adata0)
for k in ["pd_df_indexed", "pd_df_nonindexed"]:
df = expected_adata.uns[k]
expected_adata.uns[k] = df.reset_index().set_index(
pd.Index(list(range(len(df))), name=SOMA_JOINID)
)

assert_adata_equal(expected_adata, adata)
assert_adata_equal(adata0, adata)

with tiledbsoma.Experiment.open(soma_uri) as exp:
adata2 = tiledbsoma.io.to_anndata(
Expand All @@ -1047,24 +1035,6 @@ def test_uns_io(tmp_path, outgest_uns_keys):

expected_adata = deepcopy(adata0)

# When outgesting `uns` DataFrames, `to_anndata` fails to remove the `soma_joinid` column added
# during ingest. It also demotes the original `df.index` to a column named "index". Here we
# patch the "expected" adata to reflect this, before comparing to the post-`to_anndata`
# `adata`.
# TODO: use `_read_dataframe` during `uns` DataFrame outgest (which does a better job restoring
# the original pd.DataFrame, dropping `soma_joinid` and restoring the original `df.index`).
for k in ["pd_df_indexed", "pd_df_nonindexed"]:
df = expected_adata.uns[k]
expected_adata.uns[k] = (
df
# original index becomes column (with default name "index")
.reset_index()
# soma_joinid index added during ingest
.set_index(pd.Index(list(range(len(df))), name=SOMA_JOINID))
# soma_joinid outgested as first column
.reset_index()
)

# Outgest also fails to restore `obs` and `var` correctly, in this case because the ingested
# `obs`/`var` had columns named "obs_id"/"var_id", which get mistaken for "default index"
# columns and set as `df.index` on outgest; their names are also removed. This corresponds to
Expand Down

0 comments on commit 1e64fdd

Please sign in to comment.