diff --git a/apis/python/src/tiledbsoma/_util.py b/apis/python/src/tiledbsoma/_util.py index d2ea2db55e..3c2498c8a6 100644 --- a/apis/python/src/tiledbsoma/_util.py +++ b/apis/python/src/tiledbsoma/_util.py @@ -10,6 +10,7 @@ from itertools import zip_longest from typing import Any, Optional, Tuple, Type, TypeVar +import pandas as pd import pyarrow as pa import somacore from somacore import options @@ -284,3 +285,34 @@ def pa_types_is_string_or_bytes(dtype: pa.DataType) -> bool: or pa.types.is_string(dtype) or pa.types.is_binary(dtype) ) + + +def anndata_dataframe_unmodified(old: pd.DataFrame, new: pd.DataFrame) -> bool: + """ + Checks that we didn't mutate the object while ingesting. Intended for unit tests. + NaN cells never compare equal here; see anndata_dataframe_unmodified_nan_safe. + """ + try: + # bool() so the declared return type holds (pandas yields numpy.bool_) + return bool((old == new).all().all()) + except ValueError: + # Can be thrown when columns don't match -- which is what we check for + return False + + +def anndata_dataframe_unmodified_nan_safe(old: pd.DataFrame, new: pd.DataFrame) -> bool: + """ + Same as anndata_dataframe_unmodified, except it works with NaN data. + A key property of NaN is it's not equal to itself: x != x. + """ + + if old.index.name != new.index.name: + return False + if len(old) != len(new): + return False + # Index.equals is length-safe; elementwise != raises ValueError when the + # column counts differ. + if not old.columns.equals(new.columns): + return False + # DataFrame.equals treats NaNs in the same location as equal. + return old.equals(new) diff --git a/apis/python/src/tiledbsoma/io/_registration/signatures.py b/apis/python/src/tiledbsoma/io/_registration/signatures.py index d8ea978690..3e88c230c2 100644 --- a/apis/python/src/tiledbsoma/io/_registration/signatures.py +++ b/apis/python/src/tiledbsoma/io/_registration/signatures.py @@ -67,7 +67,7 @@ def _string_dict_from_pandas_dataframe( allow the same. 
""" - df = df.head(1) # since reset_index can be expensive on full data + df = df.head(1).copy() # since reset_index can be expensive on full data if df.index.name is None or df.index.name == "index": df.reset_index(inplace=True) if default_index_name in df: diff --git a/apis/python/src/tiledbsoma/io/conversions.py b/apis/python/src/tiledbsoma/io/conversions.py index f4f972fc1f..6de5007557 100644 --- a/apis/python/src/tiledbsoma/io/conversions.py +++ b/apis/python/src/tiledbsoma/io/conversions.py @@ -31,7 +31,7 @@ def decategoricalize_obs_or_var(obs_or_var: pd.DataFrame) -> pd.DataFrame: }, ) else: - return obs_or_var + return obs_or_var.copy() @typeguard_ignore diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 428f6e4ef0..e5f4ed6fa9 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1204,10 +1204,16 @@ def _write_dataframe( context: Optional[SOMATileDBContext] = None, axis_mapping: AxisIDMapping, ) -> DataFrame: - # The id_column_name is for disambiguating rows in append mode; - # it may or may not be an index name in the input AnnData obs/var. - # - # The original_index_name is the index name in the AnnData obs/var. + """ + The id_column_name is for disambiguating rows in append mode; + it may or may not be an index name in the input AnnData obs/var. + + The original_index_name is the index name in the AnnData obs/var. + + This helper mutates the input dataframe, for parsimony of memory usage. + The caller should have copied anything pointing to a user-provided + adata.obs, adata.var, etc. + """ original_index_name = None if df.index is not None and df.index.name is not None and df.index.name != "index": original_index_name = df.index.name @@ -1540,6 +1546,9 @@ def _update_dataframe( """ See ``update_obs`` and ``update_var``. This is common helper code shared by both. 
""" + new_data = ( + new_data.copy() + ) # Further operations are in-place for parsimony of memory usage if sdf.closed or sdf.mode != "w": raise SOMAError(f"DataFrame must be open for write: {sdf.uri}") old_sig = signatures._string_dict_from_arrow_schema(sdf.schema) diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 2a8db5ad44..3df0b08e7a 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -15,6 +15,10 @@ import tiledbsoma import tiledbsoma.io from tiledbsoma import _constants, _factory +from tiledbsoma._util import ( + anndata_dataframe_unmodified, + anndata_dataframe_unmodified_nan_safe, +) HERE = Path(__file__).parent @@ -109,6 +113,7 @@ def adata(h5ad_file): [tiledbsoma.SparseNDArray, tiledbsoma.DenseNDArray], ) def test_import_anndata(adata, ingest_modes, X_kind): + original = adata.copy() adata = adata.copy() have_ingested = False @@ -133,6 +138,9 @@ def test_import_anndata(adata, ingest_modes, X_kind): if ingest_mode != "schema_only": have_ingested = True + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + exp = tiledbsoma.Experiment.open(uri) assert exp.metadata[metakey] == "SOMAExperiment" @@ -411,13 +419,17 @@ def test_ingest_relative(h5ad_file_extended, use_relative_uri): def test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended, ingest_uns_keys): tmp_uri = tmp_path.as_uri() original = anndata.read(h5ad_file_extended) + adata = anndata.read(h5ad_file_extended) uri = tiledbsoma.io.from_anndata( tmp_uri, - original, + adata, measurement_name="hello", uns_keys=ingest_uns_keys, ) + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + with tiledbsoma.Experiment.open(uri) as exp: uns = exp.ms["hello"]["uns"] assert isinstance(uns, tiledbsoma.Collection) @@ -446,7 +458,7 @@ def 
test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended, ingest_uns_keys) assert isinstance(random_state, tiledbsoma.DenseNDArray) assert np.array_equal(random_state.read().to_numpy(), np.array([0])) got_pca_variance = uns["pca"]["variance"].read().to_numpy() - assert np.array_equal(got_pca_variance, original.uns["pca"]["variance"]) + assert np.array_equal(got_pca_variance, adata.uns["pca"]["variance"]) else: assert set(uns) == set(ingest_uns_keys) @@ -481,7 +493,13 @@ def test_add_matrix_to_collection(adata): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() + uri = tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + exp = tiledbsoma.Experiment.open(uri) with _factory.open(output_path) as exp_r: assert list(exp_r.ms["RNA"].X.keys()) == ["data"] @@ -602,8 +620,13 @@ def add_matrix_to_collection( tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() uri = tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + exp = tiledbsoma.Experiment.open(uri) with _factory.open(output_path) as exp_r: assert list(exp_r.ms["RNA"].X.keys()) == ["data"] @@ -656,8 +679,13 @@ def test_export_anndata(adata): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() + tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + with _factory.open(output_path) as exp: with pytest.raises(ValueError): tiledbsoma.io.to_anndata( @@ -700,9 +728,14 @@ def test_null_obs(adata, tmp_path: Path): # Create column of partially-null values rng = 
np.random.RandomState(seed) adata.obs["empty_partial"] = rng.choice((np.NaN, 1.0), adata.n_obs, True) + + original = adata.copy() uri = tiledbsoma.io.from_anndata( output_path, adata, "RNA", ingest_mode="write", X_kind=tiledbsoma.SparseNDArray ) + assert anndata_dataframe_unmodified_nan_safe(original.obs, adata.obs) + assert anndata_dataframe_unmodified_nan_safe(original.var, adata.var) + exp = tiledbsoma.Experiment.open(uri) with tiledb.open(exp.obs.uri, "r") as obs: # Explicitly check columns created above @@ -717,6 +750,7 @@ def test_null_obs(adata, tmp_path: Path): def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path): adata = anndata.read_h5ad(h5ad_file_with_obsm_holes.as_posix()) + original = adata.copy() assert 1 == 1 # This data file is prepared such that obsm["X_pca"] has shape (2638, 50) @@ -728,6 +762,9 @@ def test_export_obsm_with_holes(h5ad_file_with_obsm_holes, tmp_path): output_path = tmp_path.as_posix() tiledbsoma.io.from_anndata(output_path, adata, "RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + exp = tiledbsoma.Experiment.open(output_path) # Verify the bounding box on the SOMA SparseNDArray @@ -860,6 +897,7 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var X[i, j] = 100 + 10 * i + j adata = anndata.AnnData(X=X, obs=obs, var=var, dtype=X.dtype) + original = adata.copy() uri = tmp_path.as_posix() @@ -871,6 +909,8 @@ def test_id_names(tmp_path, obs_id_name, var_id_name, indexify_obs, indexify_var obs_id_name=obs_id_name, var_id_name=var_id_name, ) + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(uri) as exp: assert obs_id_name in exp.obs.keys() @@ -950,10 +990,13 @@ def test_uns_io(tmp_path, outgest_uns_keys): uns=uns, dtype=X.dtype, ) + original = adata.copy() soma_uri = tmp_path.as_posix() 
tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(soma_uri) as exp: bdata = tiledbsoma.io.to_anndata( @@ -1002,7 +1045,10 @@ def test_string_nan_columns(tmp_path, adata, write_index): # Step 2 uri = tmp_path.as_posix() + original = adata.copy() tiledbsoma.io.from_anndata(uri, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified_nan_safe(original.obs, adata.obs) + assert anndata_dataframe_unmodified_nan_safe(original.var, adata.var) # Step 3 with tiledbsoma.open(uri, "r") as exp: @@ -1058,7 +1104,10 @@ def test_index_names_io(tmp_path, obs_index_name, var_index_name): soma_uri = tmp_path.as_posix() + original = adata.copy() tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name) + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(soma_uri) as exp: bdata = tiledbsoma.io.to_anndata(exp, measurement_name) diff --git a/apis/python/tests/test_platform_config.py b/apis/python/tests/test_platform_config.py index 3849e33b53..680936c1e0 100644 --- a/apis/python/tests/test_platform_config.py +++ b/apis/python/tests/test_platform_config.py @@ -8,6 +8,7 @@ import tiledbsoma import tiledbsoma.io import tiledbsoma.options._tiledb_create_options as tco +from tiledbsoma._util import anndata_dataframe_unmodified HERE = Path(__file__).parent @@ -27,6 +28,7 @@ def adata(h5ad_file): def test_platform_config(adata): # Set up anndata input path and tiledb-group output path + original = adata.copy() with tempfile.TemporaryDirectory() as output_path: # Ingest tiledbsoma.io.from_anndata( @@ -53,6 +55,8 @@ def test_platform_config(adata): } }, ) + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with 
tiledbsoma.Experiment.open(output_path) as exp: x_data = exp.ms["RNA"].X["data"] diff --git a/apis/python/tests/test_registration_mappings.py b/apis/python/tests/test_registration_mappings.py index bcfa2e838d..15e3d15d06 100644 --- a/apis/python/tests/test_registration_mappings.py +++ b/apis/python/tests/test_registration_mappings.py @@ -12,6 +12,7 @@ import tiledbsoma.io import tiledbsoma.io._registration as registration +from tiledbsoma._util import anndata_dataframe_unmodified def _create_anndata( @@ -720,6 +721,8 @@ def test_append_items_with_experiment(soma1, h5ad2): adata2 = ad.read_h5ad(h5ad2) + original = adata2.copy() + with tiledbsoma.Experiment.open(soma1, "w") as exp1: tiledbsoma.io.append_obs( exp1, @@ -744,6 +747,9 @@ def test_append_items_with_experiment(soma1, h5ad2): registration_mapping=rd, ) + assert anndata_dataframe_unmodified(original.obs, adata2.obs) + assert anndata_dataframe_unmodified(original.var, adata2.var) + expect_obs_soma_joinids = list(range(6)) expect_var_soma_joinids = list(range(5)) @@ -827,6 +833,8 @@ def test_append_with_disjoint_measurements( anndata2 = anndata1 if use_same_cells else anndata4 + original = anndata2.copy() + rd = tiledbsoma.io.register_anndatas( soma_uri, [anndata2], @@ -842,6 +850,9 @@ def test_append_with_disjoint_measurements( registration_mapping=rd, ) + assert anndata_dataframe_unmodified(original.obs, anndata2.obs) + assert anndata_dataframe_unmodified(original.var, anndata2.var) + # exp/obs, use_same_cells=True: exp/obs, use_same_cells=False: # soma_joinid obs_id cell_type is_primary_data soma_joinid obs_id cell_type is_primary_data # 0 0 AAAT B cell 1 0 0 AAAT B cell 1 diff --git a/apis/python/tests/test_registration_signatures.py b/apis/python/tests/test_registration_signatures.py index fd51a3944d..777d91e26e 100644 --- a/apis/python/tests/test_registration_signatures.py +++ b/apis/python/tests/test_registration_signatures.py @@ -6,6 +6,7 @@ import tiledbsoma.io import 
tiledbsoma.io._registration.signatures as signatures +from tiledbsoma._util import anndata_dataframe_unmodified HERE = Path(__file__).parent @@ -28,7 +29,11 @@ def test_signature_serdes(canned_h5ad_file, canned_anndata): assert "var_schema" in text1 assert sig == signatures.Signature.from_json(text1) + original = canned_anndata.copy() sig = signatures.Signature.from_anndata(canned_anndata) + assert anndata_dataframe_unmodified(original.obs, canned_anndata.obs) + assert anndata_dataframe_unmodified(original.var, canned_anndata.var) + text2 = sig.to_json() assert sig == signatures.Signature.from_json(text2) @@ -36,7 +41,11 @@ def test_signature_serdes(canned_h5ad_file, canned_anndata): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + uri = tiledbsoma.io.from_anndata(output_path, canned_anndata, "RNA") + assert anndata_dataframe_unmodified(original.obs, canned_anndata.obs) + assert anndata_dataframe_unmodified(original.var, canned_anndata.var) + sig = signatures.Signature.from_soma_experiment(uri) text3 = sig.to_json() assert sig == signatures.Signature.from_json(text3) @@ -48,11 +57,16 @@ def test_compatible(canned_anndata): # Check that zero inputs result in zero incompatibility signatures.Signature.check_compatible({}) + original = canned_anndata.copy() sig1 = signatures.Signature.from_anndata(canned_anndata) + assert anndata_dataframe_unmodified(original.obs, canned_anndata.obs) + assert anndata_dataframe_unmodified(original.var, canned_anndata.var) tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name uri = tiledbsoma.io.from_anndata(output_path, canned_anndata, "RNA") + assert anndata_dataframe_unmodified(original.obs, canned_anndata.obs) + assert anndata_dataframe_unmodified(original.var, canned_anndata.var) sig2 = signatures.Signature.from_soma_experiment(uri) # Check that single inputs result in zero incompatibility @@ -73,6 +87,11 @@ def test_compatible(canned_anndata): # Check incompatibility of modified AnnData adata3 = 
canned_anndata del adata3.obs["groups"] + + original = adata3.copy() sig3 = signatures.Signature.from_anndata(adata3) + assert anndata_dataframe_unmodified(original.obs, adata3.obs) + assert anndata_dataframe_unmodified(original.var, adata3.var) + with pytest.raises(ValueError): signatures.Signature.check_compatible({"orig": sig1, "anndata3": sig3}) diff --git a/apis/python/tests/test_update_dataframes.py b/apis/python/tests/test_update_dataframes.py index 484f19ab31..0325a15167 100644 --- a/apis/python/tests/test_update_dataframes.py +++ b/apis/python/tests/test_update_dataframes.py @@ -9,6 +9,7 @@ import tiledbsoma import tiledbsoma.io +from tiledbsoma._util import anndata_dataframe_unmodified HERE = Path(__file__).parent @@ -30,7 +31,10 @@ def adata(h5ad_file): def test_no_change(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as exp: o1 = exp.obs.schema @@ -46,6 +50,8 @@ def test_no_change(adata, readback): with tiledbsoma.Experiment.open(output_path, "w") as exp: tiledbsoma.io.update_obs(exp, new_obs) tiledbsoma.io.update_var(exp, new_var, "RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as exp: o2 = exp.obs.schema @@ -59,7 +65,10 @@ def test_no_change(adata, readback): def test_add(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as 
exp: exp.ms["RNA"].var.schema @@ -82,9 +91,13 @@ def test_add(adata, readback): new_var["vst.mean.sq"] = new_var["vst.mean"] ** 2 + new_obs_save = new_obs.copy() + new_var_save = new_var.copy() with tiledbsoma.Experiment.open(output_path, "w") as exp: tiledbsoma.io.update_obs(exp, new_obs) tiledbsoma.io.update_var(exp, new_var, "RNA") + assert anndata_dataframe_unmodified(new_obs, new_obs_save) + assert anndata_dataframe_unmodified(new_var, new_var_save) with tiledbsoma.Experiment.open(output_path) as exp: o2 = exp.obs.schema @@ -105,7 +118,10 @@ def test_add(adata, readback): def test_drop(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as exp: exp.ms["RNA"].var.schema @@ -120,9 +136,13 @@ def test_drop(adata, readback): del new_obs["groups"] del new_var["vst.mean"] + new_obs_save = new_obs.copy() + new_var_save = new_var.copy() with tiledbsoma.Experiment.open(output_path, "w") as exp: tiledbsoma.io.update_obs(exp, new_obs) tiledbsoma.io.update_var(exp, new_var, "RNA") + assert anndata_dataframe_unmodified(new_obs, new_obs_save) + assert anndata_dataframe_unmodified(new_var, new_var_save) with tiledbsoma.Experiment.open(output_path) as exp: o2 = exp.obs.schema @@ -138,7 +158,10 @@ def test_drop(adata, readback): def test_change(adata, readback): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as exp: o1 = exp.obs.schema @@ -154,11 +177,15 @@ def test_change(adata, readback): 
new_obs["groups"] = np.arange(new_obs.shape[0], dtype=np.int16) new_var["vst.mean"] = np.arange(new_var.shape[0], dtype=np.int32) + new_obs_save = new_obs.copy() + new_var_save = new_var.copy() with tiledbsoma.Experiment.open(output_path, "w") as exp: with pytest.raises(ValueError): tiledbsoma.io.update_obs(exp, new_obs) with pytest.raises(ValueError): tiledbsoma.io.update_var(exp, new_var, "RNA") + assert anndata_dataframe_unmodified(new_obs, new_obs_save) + assert anndata_dataframe_unmodified(new_var, new_var_save) with tiledbsoma.Experiment.open(output_path) as exp: o2 = exp.obs.schema @@ -174,7 +201,10 @@ def test_change_counts(adata, readback, shift_and_exc): shift, exc = shift_and_exc tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name + original = adata.copy() tiledbsoma.io.from_anndata(output_path, adata, measurement_name="RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) with tiledbsoma.Experiment.open(output_path) as exp: o1 = exp.obs.schema @@ -207,16 +237,25 @@ def test_change_counts(adata, readback, shift_and_exc): ) if exc is None: + new_obs_save = new_obs.copy() + new_var_save = new_var.copy() with tiledbsoma.Experiment.open(output_path, "w") as exp: tiledbsoma.io.update_obs(exp, new_obs) tiledbsoma.io.update_var(exp, new_var, measurement_name="RNA") + assert anndata_dataframe_unmodified(new_obs, new_obs_save) + assert anndata_dataframe_unmodified(new_var, new_var_save) + else: with tiledbsoma.Experiment.open(output_path, "w") as exp: with pytest.raises(exc): tiledbsoma.io.update_obs(exp, new_obs) with pytest.raises(exc): tiledbsoma.io.update_var(exp, new_var, measurement_name="RNA") + + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) + with tiledbsoma.Experiment.open(output_path) as exp: o2 = exp.obs.schema v2 = exp.ms["RNA"].var.schema