Chunked writing of h5py.Dataset and zarr.Array #1624

Open · wants to merge 15 commits into base: main · Changes from 10 commits
60 changes: 53 additions & 7 deletions src/anndata/_io/specs/methods.py
@@ -375,13 +375,12 @@ def write_list(
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
def write_basic(
    f: GroupStorageType,
    k: str,
@@ -391,7 +390,50 @@ def write_basic(
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write methods which underlying library handles natively."""
    f.create_dataset(k, data=elem, **dataset_kwargs)
    dtype = dataset_kwargs.get("dtype", elem.dtype)
    f.create_dataset(k, data=elem, **dataset_kwargs, dtype=dtype)


def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
    """
    Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.

    * If `dest` has chunks, it will return the chunks of `dest`.
    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
    """
    if dest.chunks and hasattr(dest, "iter_chunks"):
        return dest.iter_chunks()
    else:
        itemsize = elem.dtype.itemsize
        shape = elem.shape
        # Number of elements to write
        entry_chunk_size = 100 * 1024 * 1024 // itemsize
        # Number of rows that works out to
        n_rows = max(entry_chunk_size // shape[0], 1000)
Comment on lines +412 to +414

flying-sheep (Member): should any of this be configurable?

ilan-gold (Contributor), Dec 10, 2024: As things stand, the value depends on both the shape and an arbitrary cutoff inside `max`. Given the current implementation, we could make two or three things configurable, which seems like overkill. Perhaps just `n_rows` should be a setting, with 1000 as the default?

flying-sheep (Member), Dec 10, 2024: We already have `chunk_size: int = 6000,  # TODO, probably make this 2d chunks`, documented as "Used only when loading sparse dataset that is stored as dense." Also `chunks: tuple[int, ...] | None = None,`. Let's not create multiple different/incompatible conventions / features under the same name.

ilan-gold (Contributor): So this is an argument for not calling it `chunk_size`? I wasn't proposing literally calling it `n_rows`, just that that variable be the setting, as opposed to `entry_chunk_size` or the max value.

flying-sheep (Member), Dec 12, 2024: It's an argument for keeping our terminology consistent when we get around to making this configurable. But we can also not do that for now.
        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))
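# A worked example of the fallback branch above (an illustrative annotation, not part of the diff):
# for a float64 source of shape (50_000, 50) copied to a destination without usable chunk info,
# itemsize == 8, so
#     entry_chunk_size = 100 * 1024 * 1024 // 8             # 13_107_200 elements
#     n_rows           = max(13_107_200 // 50_000, 1000)    # -> 1000 (the 1000-row floor wins)
# and the generator yields slice(0, 1000), slice(1000, 2000), ..., slice(49_000, 50_000),
# i.e. the copy proceeds in 50 row-wise writes.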


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_group(
    f: GroupStorageType,
    k: str,
    elem: ArrayStorageType,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write to a h5py.Dataset in chunks.

    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
    before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
    zarr since zarr handles this automatically.
    """
    dtype = dataset_kwargs.get("dtype", elem.dtype)
    dest = f.create_dataset(k, shape=elem.shape, **dataset_kwargs, dtype=dtype)

    for chunk in _iter_chunks_for_copy(elem, dest):
        dest[chunk] = elem[chunk]


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
@@ -602,10 +644,14 @@ def write_sparse_compressed(
    # Allow resizing for hdf5
    if isinstance(f, H5Group) and "maxshape" not in dataset_kwargs:
        dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)

    g.create_dataset("data", data=value.data, **dataset_kwargs)
    g.create_dataset("indices", data=value.indices, **dataset_kwargs)
    g.create_dataset("indptr", data=value.indptr, dtype=indptr_dtype, **dataset_kwargs)
    _writer.write_elem(g, "data", value.data, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(g, "indices", value.indices, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(
        g,
        "indptr",
        value.indptr,
        dataset_kwargs={"dtype": indptr_dtype, **dataset_kwargs},
    )


write_csr = partial(write_sparse_compressed, fmt="csr")
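For context on the `write_chunked_dense_array_to_group` handler added above, here is a minimal usage sketch (not part of the PR). It assumes `write_elem` is importable from `anndata.experimental` (its exact location varies between anndata versions) and uses hypothetical file names and keys.

import h5py
from anndata.experimental import write_elem

# Copy a dense dataset between HDF5 files without materializing it in memory;
# with this PR, the h5py.Dataset -> H5Group path is written chunk by chunk.
with h5py.File("source.h5", "r") as src, h5py.File("dest.h5", "w") as dst:
    write_elem(dst, "X", src["X"], dataset_kwargs={"chunks": (1000, 1000)})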
6 changes: 2 additions & 4 deletions tests/test_io_dispatched.py
@@ -175,7 +175,5 @@ def zarr_reader(func, elem_name: str, elem, iospec):
    write_dispatched(f, "/", adata, callback=zarr_writer)
    _ = read_dispatched(f, zarr_reader)

    assert h5ad_write_keys == zarr_write_keys
    assert h5ad_read_keys == zarr_read_keys

    assert sorted(h5ad_write_keys) == sorted(h5ad_read_keys)
    assert sorted(h5ad_write_keys) == sorted(zarr_write_keys)
    assert sorted(h5ad_read_keys) == sorted(zarr_read_keys)
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
@@ -185,6 +185,18 @@ def create_sparse_store(
        pytest.param(
            pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
        ),
        pytest.param(
            zarr.ones((100, 100), chunks=(10, 10)),
            "array",
            id="zarr_dense_array",
        ),
        pytest.param(
            create_dense_store(
                h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
            )["X"],
            "array",
            id="h5_dense_array",
        ),
        # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
        # TODO consider how specific encodings should be. Should we be fully describing the written type?
        # Currently the info we add is: "what you wouldn't be able to figure out yourself"
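For context on the two new test parameters above, a short sketch (mirroring the test setup, not an excerpt from it) of how backed dense arrays can be created entirely in memory; `create_dense_store` is a helper defined in the test module, so a plain `create_dataset` call stands in for it here.

import h5py
import numpy as np
import zarr

# In-memory zarr array with explicit chunking (the zarr_dense_array parameter).
z = zarr.ones((100, 100), chunks=(10, 10))

# In-memory HDF5 file: the "core" driver with backing_store=False never writes to disk,
# so "test1.h5" is only a name, not a file that gets created.
f = h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
f.create_dataset("X", data=np.ones((100, 100)))
h5_dense = f["X"]  # an h5py.Dataset backed by memory (the h5_dense_array parameter)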