Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.7] [python] Do not mutate adata.obs/adata.var on ingest #2105

Merged
merged 1 commit into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions apis/python/src/tiledbsoma/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from itertools import zip_longest
from typing import Any, Optional, Tuple, Type, TypeVar

import pandas as pd
import pyarrow as pa
import somacore
from somacore import options
Expand Down Expand Up @@ -284,3 +285,29 @@ def pa_types_is_string_or_bytes(dtype: pa.DataType) -> bool:
or pa.types.is_string(dtype)
or pa.types.is_binary(dtype)
)


def anndata_dataframe_unmodified(old: pd.DataFrame, new: pd.DataFrame) -> bool:
"""
Checks that we didn't mutate the object while ingesting. Intended for unit tests.
"""
try:
return (old == new).all().all()
except ValueError:
# Can be thrown when columns don't match -- which is what we check for
return False


def anndata_dataframe_unmodified_nan_safe(old: pd.DataFrame, new: pd.DataFrame) -> bool:
"""
Same as anndata_dataframe_unmodified, except it works with NaN data.
A key property of NaN is it's not equal to itself: x != x.
"""

if old.index.name != new.index.name:
return False
if len(old) != len(new):
return False
if any(old.keys() != new.keys()):
return False
return True
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/io/_registration/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def _string_dict_from_pandas_dataframe(
allow the same.
"""

df = df.head(1) # since reset_index can be expensive on full data
df = df.head(1).copy() # since reset_index can be expensive on full data
if df.index.name is None or df.index.name == "index":
df.reset_index(inplace=True)
if default_index_name in df:
Expand Down
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/io/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def decategoricalize_obs_or_var(obs_or_var: pd.DataFrame) -> pd.DataFrame:
},
)
else:
return obs_or_var
return obs_or_var.copy()


@typeguard_ignore
Expand Down
17 changes: 13 additions & 4 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,10 +1204,16 @@ def _write_dataframe(
context: Optional[SOMATileDBContext] = None,
axis_mapping: AxisIDMapping,
) -> DataFrame:
# The id_column_name is for disambiguating rows in append mode;
# it may or may not be an index name in the input AnnData obs/var.
#
# The original_index_name is the index name in the AnnData obs/var.
"""
The id_column_name is for disambiguating rows in append mode;
it may or may not be an index name in the input AnnData obs/var.

The original_index_name is the index name in the AnnData obs/var.

This helper mutates the input dataframe, for parsimony of memory usage.
The caller should have copied anything pointing to a user-provided
adata.obs, adata.var, etc.
"""
original_index_name = None
if df.index is not None and df.index.name is not None and df.index.name != "index":
original_index_name = df.index.name
Expand Down Expand Up @@ -1540,6 +1546,9 @@ def _update_dataframe(
"""
See ``update_obs`` and ``update_var``. This is common helper code shared by both.
"""
new_data = (
new_data.copy()
) # Further operations are in-place for parsimony of memory usage
if sdf.closed or sdf.mode != "w":
raise SOMAError(f"DataFrame must be open for write: {sdf.uri}")
old_sig = signatures._string_dict_from_arrow_schema(sdf.schema)
Expand Down
53 changes: 51 additions & 2 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import tiledbsoma
import tiledbsoma.io
from tiledbsoma import _constants, _factory
from tiledbsoma._util import (
anndata_dataframe_unmodified,
anndata_dataframe_unmodified_nan_safe,
)

HERE = Path(__file__).parent

Expand Down Expand Up @@ -109,6 +113,7 @@ def adata(h5ad_file):
[tiledbsoma.SparseNDArray, tiledbsoma.DenseNDArray],
)
def test_import_anndata(adata, ingest_modes, X_kind):
original = adata.copy()
adata = adata.copy()

have_ingested = False
Expand All @@ -133,6 +138,9 @@ def test_import_anndata(adata, ingest_modes, X_kind):
if ingest_mode != "schema_only":
have_ingested = True

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

exp = tiledbsoma.Experiment.open(uri)

assert exp.metadata[metakey] == "SOMAExperiment"
Expand Down Expand Up @@ -411,13 +419,17 @@ def test_ingest_relative(h5ad_file_extended, use_relative_uri):
def test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended, ingest_uns_keys):
tmp_uri = tmp_path.as_uri()
original = anndata.read(h5ad_file_extended)
adata = anndata.read(h5ad_file_extended)
uri = tiledbsoma.io.from_anndata(
tmp_uri,
original,
adata,
measurement_name="hello",
uns_keys=ingest_uns_keys,
)

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with tiledbsoma.Experiment.open(uri) as exp:
uns = exp.ms["hello"]["uns"]
assert isinstance(uns, tiledbsoma.Collection)
Expand Down Expand Up @@ -446,7 +458,7 @@ def test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended, ingest_uns_keys)
assert isinstance(random_state, tiledbsoma.DenseNDArray)
assert np.array_equal(random_state.read().to_numpy(), np.array([0]))
got_pca_variance = uns["pca"]["variance"].read().to_numpy()
assert np.array_equal(got_pca_variance, original.uns["pca"]["variance"])
assert np.array_equal(got_pca_variance, adata.uns["pca"]["variance"])
else:
assert set(uns) == set(ingest_uns_keys)

Expand Down Expand Up @@ -481,7 +493,13 @@ def test_add_matrix_to_collection(adata):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name

original = adata.copy()

uri = tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

exp = tiledbsoma.Experiment.open(uri)
with _factory.open(output_path) as exp_r:
assert list(exp_r.ms["RNA"].X.keys()) == ["data"]
Expand Down Expand Up @@ -602,8 +620,13 @@ def add_matrix_to_collection(

tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name
original = adata.copy()

uri = tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

exp = tiledbsoma.Experiment.open(uri)
with _factory.open(output_path) as exp_r:
assert list(exp_r.ms["RNA"].X.keys()) == ["data"]
Expand Down Expand Up @@ -656,8 +679,13 @@ def test_export_anndata(adata):
tempdir = tempfile.TemporaryDirectory()
output_path = tempdir.name

original = adata.copy()

tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA")

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with _factory.open(output_path) as exp:
with pytest.raises(ValueError):
tiledbsoma.io.to_anndata(
Expand Down Expand Up @@ -700,9 +728,14 @@ def test_null_obs(adata, tmp_path: Path):
# Create column of partially-null values
rng = np.random.RandomState(seed)
adata.obs["empty_partial"] = rng.choice((np.NaN, 1.0), adata.n_obs, True)

original = adata.copy()
uri = tiledbsoma.io.from_anndata(
output_path, adata, "RNA", ingest_mode="write", X_kind=tiledbsoma.SparseNDArray
)
assert anndata_dataframe_unmodified_nan_safe(original.obs, adata.obs)
assert anndata_dataframe_unmodified_nan_safe(original.var, adata.var)

exp = tiledbsoma.Experiment.open(uri)
with tiledb.open(exp.obs.uri, "r") as obs:
# Explicitly check columns created above
Expand All @@ -717,6 +750,7 @@ def test_null_obs(adata, tmp_path: Path):

def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path):
adata = anndata.read_h5ad(h5ad_file_with_obsm_holes.as_posix())
original = adata.copy()
assert 1 == 1

# This data file is prepared such that obsm["X_pca"] has shape (2638, 50)
Expand All @@ -728,6 +762,9 @@ def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path):
output_path = tmp_path.as_posix()
tiledbsoma.io.from_anndata(output_path, adata, "RNA")

assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

exp = tiledbsoma.Experiment.open(output_path)

# Verify the bounding box on the SOMA SparseNDArray
Expand Down Expand Up @@ -860,6 +897,7 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var
X[i, j] = 100 + 10 * i + j

adata = anndata.AnnData(X=X, obs=obs, var=var, dtype=X.dtype)
original = adata.copy()

uri = tmp_path.as_posix()

Expand All @@ -871,6 +909,8 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var
obs_id_name=obs_id_name,
var_id_name=var_id_name,
)
assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with tiledbsoma.Experiment.open(uri) as exp:
assert obs_id_name in exp.obs.keys()
Expand Down Expand Up @@ -950,10 +990,13 @@ def test_uns_io(tmp_path, outgest_uns_keys):
uns=uns,
dtype=X.dtype,
)
original = adata.copy()

soma_uri = tmp_path.as_posix()

tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name="RNA")
assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with tiledbsoma.Experiment.open(soma_uri) as exp:
bdata = tiledbsoma.io.to_anndata(
Expand Down Expand Up @@ -1002,7 +1045,10 @@ def test_string_nan_columns(tmp_path, adata, write_index):

# Step 2
uri = tmp_path.as_posix()
original = adata.copy()
tiledbsoma.io.from_anndata(uri, adata, measurement_name="RNA")
assert anndata_dataframe_unmodified_nan_safe(original.obs, adata.obs)
assert anndata_dataframe_unmodified_nan_safe(original.var, adata.var)

# Step 3
with tiledbsoma.open(uri, "r") as exp:
Expand Down Expand Up @@ -1058,7 +1104,10 @@ def test_index_names_io(tmp_path, obs_index_name, var_index_name):

soma_uri = tmp_path.as_posix()

original = adata.copy()
tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name)
assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with tiledbsoma.Experiment.open(soma_uri) as exp:
bdata = tiledbsoma.io.to_anndata(exp, measurement_name)
Expand Down
4 changes: 4 additions & 0 deletions apis/python/tests/test_platform_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tiledbsoma
import tiledbsoma.io
import tiledbsoma.options._tiledb_create_options as tco
from tiledbsoma._util import anndata_dataframe_unmodified

HERE = Path(__file__).parent

Expand All @@ -27,6 +28,7 @@ def adata(h5ad_file):

def test_platform_config(adata):
# Set up anndata input path and tiledb-group output path
original = adata.copy()
with tempfile.TemporaryDirectory() as output_path:
# Ingest
tiledbsoma.io.from_anndata(
Expand All @@ -53,6 +55,8 @@ def test_platform_config(adata):
}
},
)
assert anndata_dataframe_unmodified(original.obs, adata.obs)
assert anndata_dataframe_unmodified(original.var, adata.var)

with tiledbsoma.Experiment.open(output_path) as exp:
x_data = exp.ms["RNA"].X["data"]
Expand Down
11 changes: 11 additions & 0 deletions apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import tiledbsoma.io
import tiledbsoma.io._registration as registration
from tiledbsoma._util import anndata_dataframe_unmodified


def _create_anndata(
Expand Down Expand Up @@ -720,6 +721,8 @@ def test_append_items_with_experiment(soma1, h5ad2):

adata2 = ad.read_h5ad(h5ad2)

original = adata2.copy()

with tiledbsoma.Experiment.open(soma1, "w") as exp1:
tiledbsoma.io.append_obs(
exp1,
Expand All @@ -744,6 +747,9 @@ def test_append_items_with_experiment(soma1, h5ad2):
registration_mapping=rd,
)

assert anndata_dataframe_unmodified(original.obs, adata2.obs)
assert anndata_dataframe_unmodified(original.var, adata2.var)

expect_obs_soma_joinids = list(range(6))
expect_var_soma_joinids = list(range(5))

Expand Down Expand Up @@ -827,6 +833,8 @@ def test_append_with_disjoint_measurements(

anndata2 = anndata1 if use_same_cells else anndata4

original = anndata2.copy()

rd = tiledbsoma.io.register_anndatas(
soma_uri,
[anndata2],
Expand All @@ -842,6 +850,9 @@ def test_append_with_disjoint_measurements(
registration_mapping=rd,
)

assert anndata_dataframe_unmodified(original.obs, anndata2.obs)
assert anndata_dataframe_unmodified(original.var, anndata2.var)

# exp/obs, use_same_cells=True: exp/obs, use_same_cells=False:
# soma_joinid obs_id cell_type is_primary_data soma_joinid obs_id cell_type is_primary_data
# 0 0 AAAT B cell 1 0 0 AAAT B cell 1
Expand Down
Loading
Loading