Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Complete basic anndata I/O tests #667

Merged
merged 2 commits into from
Jan 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions apis/python/src/tiledbsoma/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,16 +522,18 @@ def to_anndata(
measurement = experiment.ms[measurement_name]

obs_df = experiment.obs.read_as_pandas_all()
obs_df.reset_index(inplace=True)
obs_df.drop([SOMA_JOINID], axis=1, inplace=True)
obs_df.set_index("obs_id", inplace=True)

var_df = measurement.var.read_as_pandas_all()
var_df.reset_index(inplace=True)
var_df.drop([SOMA_JOINID], axis=1, inplace=True)
var_df.set_index("var_id", inplace=True)

nobs = len(obs_df.index)
nvar = len(var_df.index)

X_data = measurement.X["data"]
X_csr = None
assert X_data is not None
X_dtype = None # some datasets have no X
if isinstance(X_data, DenseNDArray):
Expand All @@ -544,32 +546,32 @@ def to_anndata(
else:
raise TypeError(f"Unexpected NDArray type {type(X_data)}")

# XXX FIX OBSM/VARM SHAPES

obsm = {}
if measurement.obsm.exists():
if "obsm" in measurement and measurement.obsm.exists():
for key in measurement.obsm.keys():
shape = measurement.obsm[key].shape
assert len(shape) == 2
mat = measurement.obsm[key].read_numpy((slice(None),) * len(shape))
obsm[key] = sp.csr_array(mat)
# `sp.csr_array` is the more idiomatic spelling, but it was only added in SciPy 1.8 (which requires Python 3.8+)
obsm[key] = sp.csr_matrix(mat)

varm = {}
if measurement.varm.exists():
if "varm" in measurement and measurement.varm.exists():
for key in measurement.varm.keys():
shape = measurement.varm[key].shape
assert len(shape) == 2
mat = measurement.varm[key].read_numpy((slice(None),) * len(shape))
varm[key] = sp.csr_array(mat)
# `sp.csr_array` is the more idiomatic spelling, but it was only added in SciPy 1.8 (which requires Python 3.8+)
varm[key] = sp.csr_matrix(mat)

obsp = {}
if measurement.obsp.exists():
if "obsp" in measurement and measurement.obsp.exists():
for key in measurement.obsp.keys():
mat = measurement.obsp[key].read_as_pandas_all()
obsp[key] = util_scipy.csr_from_tiledb_df(mat, nobs, nobs)

varp = {}
if measurement.varp.exists():
if "varp" in measurement and measurement.varp.exists():
for key in measurement.varp.keys():
mat = measurement.varp[key].read_as_pandas_all()
varp[key] = util_scipy.csr_from_tiledb_df(mat, nvar, nvar)
Expand Down
212 changes: 102 additions & 110 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,13 @@ def test_import_anndata(adata):

# Ingest
exp = tiledbsoma.Experiment(output_path)
tiledbsoma.io.from_anndata(exp, orig, "mRNA")
tiledbsoma.io.from_anndata(exp, orig, "RNA")

# Structure:
# pbmc-small Experiment:
# obs DataFrame (80,)
# ms Collection:
# mRNA Measurement:
# RNA Measurement:
# X Collection:
# data SparseNDArray (80, 20)
# obsp Collection:
Expand All @@ -60,127 +60,119 @@ def test_import_anndata(adata):
assert G.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMAExperiment"

# Check obs
df = exp.obs.read_as_pandas_all()
assert sorted(df.columns.to_list()) == sorted(
obs = exp.obs.read_as_pandas_all()
assert sorted(obs.columns.to_list()) == sorted(
orig.obs_keys() + ["soma_joinid", "obs_id"]
)
assert (
exp.obs.metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMADataFrame"
)
assert sorted(df["obs_id"]) == sorted(list(orig.obs_names))
assert sorted(obs["obs_id"]) == sorted(list(orig.obs_names))
# Convenience accessor
assert sorted(exp.obs.keys()) == sorted(
list(orig.obs.keys()) + ["soma_joinid", "obs_id"]
)

# Check var
var = exp.ms["RNA"].var.read_as_pandas_all()
assert sorted(var.columns.to_list()) == sorted(
orig.var_keys() + ["soma_joinid", "var_id"]
)
assert (
exp.ms["RNA"].var.metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMADataFrame"
)
assert sorted(var["var_id"]) == sorted(list(orig.var_names))
# Convenience accessor
assert sorted(exp.ms["RNA"].var.keys()) == sorted(
list(orig.var.keys()) + ["soma_joinid", "var_id"]
)

# Check X/data (dense)
# with tiledb.open(os.path.join(output_path, "X", "data")) as A:
# df = A[:]
# keys = list(df.keys())
# assert keys == ["value", "obs_id", "var_id"]
# assert A.ndim == 2
# assert A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMAAssayMatrix"
# Convenience accessors
# assert exp.X["data"].shape() == exp.X.data.shape()

# # Check X/raw (sparse)
# with tiledb.open(os.path.join(output_path, "raw", "X", "data")) as A:
# df = A.df[:]
# assert df.columns.to_list() == ["obs_id", "var_id", "value"]
# # verify sparsity of raw data
# assert df.shape[0] == orig.raw.X.nnz
# assert A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMAAssayMatrix"

# TODO: PORT FROM V0 TO V1

# # Check var
# with tiledb.open(os.path.join(output_path, "var")) as A:
# df = A.df[:]
# assert df.columns.to_list() == orig.var_keys()
# assert (
# A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMAAnnotationDataFrame"
# )
# assert sorted(exp.var.ids()) == sorted(list(orig.var_names))
# # Convenience accessors
# assert exp.var_keys() == exp.var_names
# assert exp.var_names == exp.var.ids()
# assert exp.n_var == len(exp.var.ids())
#
# # Check some annotation matrices
# # Note: pbmc-small doesn't have varp.
# assert sorted(exp.obsm.keys()) == sorted(orig.obsm.keys())
# for key in orig.obsm_keys():
# with tiledb.open(os.path.join(output_path, "obsm", key)) as A:
# df = A.df[:]
# assert df.shape[0] == orig.obsm[key].shape[0]
# assert exp.obsm[key].shape() == orig.obsm[key].shape
# assert (
# A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY]
# == "SOMAAnnotationMatrix"
# )
# # Convenience accessors: exp.obsm.X_pca <-> exp.obsm['X_pca']
# for key in exp.obsm.keys():
# assert getattr(exp.obsm, key).shape() == exp.obsm[key].shape()
#
# assert sorted(exp.varm.keys()) == sorted(orig.varm.keys())
# for key in orig.varm_keys():
# with tiledb.open(os.path.join(output_path, "varm", key)) as A:
# df = A.df[:]
# assert df.shape[0] == orig.varm[key].shape[0]
# assert exp.varm[key].shape() == orig.varm[key].shape
# assert (
# A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY]
# == "SOMAAnnotationMatrix"
# )
# # Convenience accessors:
# for key in exp.varm.keys():
# assert getattr(exp.varm, key).shape() == exp.varm[key].shape()
#
# assert sorted(exp.obsp.keys()) == sorted(orig.obsp.keys())
# for key in list(orig.obsp.keys()):
# with tiledb.open(os.path.join(output_path, "obsp", key)) as A:
# df = A.df[:]
# assert df.columns.to_list() == ["obs_id_i", "obs_id_j", "value"]
# assert df.shape[0] == orig.obsp[key].nnz
# # https://github.com/single-cell-data/TileDB-SOMA/issues/125
# # At present (without that PR's suggested enhancement) the best we
# # can get is the NNZ x attrs shape -- note that there are two
# # dims and one attr so the shape is nnz x 1.
# shape = exp.obsp[key].df().shape
# assert shape[0] == orig.obsp[key].nnz
# assert shape[1] == 1
# assert A.meta[tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY] == "SOMAAssayMatrix"
# # Convenience accessors:
# for key in exp.obsp.keys():
# assert getattr(exp.obsp, key).shape() == exp.obsp[key].shape()
X = exp.ms["RNA"].X["data"].read_tensor(coords=(slice(None), slice(None)))
assert X.shape == orig.X.shape
assert (
exp.ms["RNA"]
.X["data"]
.metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMADenseNDArray"
)

# Check raw/X/data (sparse)
X = next(
exp.ms["raw"]
.X["data"]
.read_sparse_tensor(coords=(slice(None), slice(None)), format="coo")
)
assert X.shape == orig.raw.X.shape
assert (
exp.ms["raw"]
.X["data"]
.metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMASparseNDArray"
)

# Check some annotation matrices
# Note: pbmc-small doesn't have varp.

obsm = exp.ms["RNA"].obsm
assert sorted(obsm.keys()) == sorted(orig.obsm.keys())
for key in list(orig.obsm.keys()):
matrix = obsm[key].read_tensor(coords=(slice(None), slice(None)))
assert matrix.shape == orig.obsm[key].shape
assert (
obsm[key].metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMADenseNDArray"
)

varm = exp.ms["RNA"].varm
assert sorted(varm.keys()) == sorted(orig.varm.keys())
for key in list(orig.varm.keys()):
matrix = varm[key].read_tensor(coords=(slice(None), slice(None)))
assert matrix.shape == orig.varm[key].shape
assert (
varm[key].metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMADenseNDArray"
)

obsp = exp.ms["RNA"].obsp
assert sorted(obsp.keys()) == sorted(orig.obsp.keys())
for key in list(orig.obsp.keys()):
matrix = next(
obsp[key].read_sparse_tensor(
coords=(slice(None), slice(None)), format="coo"
)
)
assert matrix.shape == orig.obsp[key].shape
assert (
obsp[key].metadata.get(tiledbsoma.util.SOMA_OBJECT_TYPE_METADATA_KEY)
== "SOMASparseNDArray"
)

tempdir.cleanup()


# def test_export_anndata(adata):
#
# # Set up anndata input path and tiledb-group output path
# tempdir = tempfile.TemporaryDirectory()
# output_path = tempdir.name
#
# orig = adata
#
# # Ingest
# exp = tiledbsoma.Experiment(output_path)
# tiledbsoma.io.from_anndata(exp, orig)
#
# readback = tiledbsoma.io.to_anndata(exp)
#
# assert readback.obs.shape == orig.obs.shape
# assert readback.var.shape == orig.var.shape
# assert readback.X.shape == orig.X.shape
#
# for key in orig.obsm.keys():
# assert readback.obsm[key].shape == orig.obsm[key].shape
# for key in orig.varm.keys():
# assert readback.varm[key].shape == orig.varm[key].shape
# for key in orig.obsp.keys():
# assert readback.obsp[key].shape == orig.obsp[key].shape
# for key in orig.varp.keys():
# assert readback.varp[key].shape == orig.varp[key].shape
def test_export_anndata(adata):
    """Round-trip an AnnData object through a SOMA experiment and compare shapes.

    The input is ingested into a fresh experiment under the "RNA" measurement
    with ``tiledbsoma.io.from_anndata`` and read back with
    ``tiledbsoma.io.to_anndata``.  Only shapes are compared (obs, var, X and
    every obsm/varm/obsp/varp entry present in the original); values are not.

    Args:
        adata: the AnnData object to round-trip (presumably supplied by a
            pytest fixture defined elsewhere -- confirm against conftest).
    """
    orig = adata

    # The context manager removes the temporary directory even when an
    # assertion below fails; previously the directory was never cleaned up.
    with tempfile.TemporaryDirectory() as output_path:
        exp = tiledbsoma.Experiment(output_path)
        tiledbsoma.io.from_anndata(exp, orig, measurement_name="RNA")

        readback = tiledbsoma.io.to_anndata(exp, measurement_name="RNA")

        assert readback.obs.shape == orig.obs.shape
        assert readback.var.shape == orig.var.shape
        assert readback.X.shape == orig.X.shape

        # Every annotation matrix in the original must come back under the
        # same key with the same shape.
        for slot in ("obsm", "varm", "obsp", "varp"):
            orig_matrices = getattr(orig, slot)
            readback_matrices = getattr(readback, slot)
            for key in orig_matrices.keys():
                assert (
                    readback_matrices[key].shape == orig_matrices[key].shape
                ), f"{slot}[{key!r}] shape mismatch after round trip"