Use automatic chunking in from_zarr #6419

Open · wants to merge 2 commits into main
16 changes: 9 additions & 7 deletions dask/array/core.py
@@ -2841,7 +2841,7 @@ def from_array(


def from_zarr(
-    url, component=None, storage_options=None, chunks=None, name=None, **kwargs
+    url, component=None, storage_options=None, chunks="auto", name=None, **kwargs
Contributor: The docstring entry for chunks should probably be updated to reflect this default.

Member: What would you pass to retain the previous behaviour?

Member Author: You would get the chunking from the Zarr array and pass it in explicitly:

`chunks=my_zarr_array.chunks`

Or, if you wanted smaller chunks, you would specify a maximum chunk size, perhaps in bytes:

`chunks="1 MiB"`
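A minimal sketch of both options, assuming a local store at `data.zarr` (the path and array are illustrative, not part of this PR):

```python
import dask.array as da
import zarr

# Open the underlying zarr array to inspect its on-disk chunking.
z = zarr.open_array("data.zarr", mode="r")

# Retain the previous behaviour: use the on-disk chunks exactly.
x = da.from_zarr("data.zarr", chunks=z.chunks)

# Or request chunks of roughly one mebibyte each.
y = da.from_zarr("data.zarr", chunks="1 MiB")
```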

Member Author: This is also the same logic that we currently have for any other dataset that defines a chunks= attribute. I think that the default behavior is usually optimal.

Member: Sure, but it may be worthwhile indicating in the docstring how to get the inherent chunking.

Member: Can we keep chunks=None or something similar as an easy way to get the chunks on disk? It may not be easy to construct `my_zarr_array` if one only has a URL, say.

Member Author: We could. Can I ask for additional motivation, though? We don't currently do this for HDF5 or NetCDF or any other format. Why would we do this for Zarr? Why do we care about the old behavior? I expect that adding docs on this is just as likely to lead people astray as it is to help them.

As a reminder, the automatic chunking is decently smart, and we haven't ever gotten complaints about the choices that it makes, despite pretty heavy usage. It will find a chunking that aligns with the existing chunking in storage, but is mildly larger in other dimensions if necessary.
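A small sketch of that alignment behaviour, mirroring the `test_from_zarr_align_chunks` test added further down in this PR:

```python
import dask.array as da
import zarr

# Storage chunks are tall and very narrow: (1000, 4).
z = zarr.ones(shape=(40000, 40000), chunks=(1000, 4))

# With chunks="auto" (the new default), dask picks chunks that are
# exact multiples of the storage chunks, grown along the narrow axis.
x = da.from_zarr(z)
assert x.chunksize[0] % 1000 == 0
assert x.chunksize[1] % 4 == 0
```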

):
    """Load array from the zarr storage format

@@ -2859,13 +2859,15 @@ def from_zarr(
    storage_options: dict
        Any additional parameters for the storage backend (ignored for local
        paths)
-    chunks: tuple of ints or tuples of ints
-        Passed to ``da.from_array``, allows setting the chunks on
-        initialisation, if the chunking scheme in the on-disc dataset is not
-        optimal for the calculations to follow.
+    chunks: str, int, tuple
+        Passed to ``da.from_array``. See docstring there.
    name : str, optional
        An optional keyname for the array. Defaults to hashing the input
    kwargs: passed to ``zarr.Array``.
+
+    See Also
+    --------
+    from_array
    """
    import zarr

@@ -2880,10 +2882,10 @@ def from_zarr(
    else:
        mapper = url
    z = zarr.Array(mapper, read_only=True, path=component, **kwargs)
-    chunks = chunks if chunks is not None else z.chunks

    if name is None:
        name = "from-zarr-" + tokenize(z, component, storage_options, chunks, **kwargs)
-    return from_array(z, chunks, name=name)
+    return from_array(z, chunks=chunks, name=name)


def to_zarr(
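The user-visible effect is that a round trip through zarr may now come back with different, automatically chosen chunks, which is why the `a2.chunks == a.chunks` assertions are removed from the tests below. A sketch of the new default behaviour (the `example.zarr` path is illustrative):

```python
import dask.array as da

# Write an array with deliberately small chunks.
a = da.ones((4000, 4000), chunks=(125, 125))
a.to_zarr("example.zarr")

# Read it back: chunks="auto" is now the default.
a2 = da.from_zarr("example.zarr")

# a2.chunks need not equal a.chunks: small storage chunks may be merged
# into larger dask chunks that still align with what is on disk.
```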
22 changes: 16 additions & 6 deletions dask/array/tests/test_array_core.py
@@ -3857,7 +3857,22 @@ def test_zarr_roundtrip():
    a.to_zarr(d)
    a2 = da.from_zarr(d)
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks
+
+
+def test_from_zarr_align_chunks():
+    zarr = pytest.importorskip("zarr")
+    a = zarr.ones(shape=(40000, 40000), chunks=(1000, 4))
+    x = da.from_zarr(a)
+    assert x.chunksize[0] % 1000 == 0
+    assert x.chunksize[1] % 4 == 0
+    assert x.chunksize[0] > x.chunksize[1]
+
+
+def test_from_zarr_auto_chunks():
+    zarr = pytest.importorskip("zarr")
+    a = zarr.array(range(100), chunks=(1,))
+    x = da.from_zarr(a)
+    assert x.npartitions < 100


@pytest.mark.parametrize("compute", [False, True])
@@ -3868,7 +3883,6 @@ def test_zarr_return_stored(compute):
    a2 = a.to_zarr(d, compute=compute, return_stored=True)
    assert isinstance(a2, Array)
    assert_eq(a, a2, check_graph=False)
-    assert a2.chunks == a.chunks


def test_to_zarr_delayed_creates_no_metadata():
@@ -3891,7 +3905,6 @@ def test_zarr_existing_array():
    a.to_zarr(z)
    a2 = da.from_zarr(z)
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks


def test_to_zarr_unknown_chunks_raises():
@@ -3921,7 +3934,6 @@ def test_zarr_pass_mapper():
    a.to_zarr(mapper)
    a2 = da.from_zarr(mapper)
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks


def test_zarr_group():
@@ -3942,7 +3954,6 @@ def test_zarr_group():

    a2 = da.from_zarr(d, component="test")
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks


@pytest.mark.parametrize(
@@ -3972,7 +3983,6 @@ def test_zarr_nocompute():
    dask.compute(out)
    a2 = da.from_zarr(d)
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks


def test_tiledb_roundtrip():
1 change: 0 additions & 1 deletion dask/tests/test_distributed.py
@@ -175,7 +175,6 @@ def test_zarr_distributed_roundtrip():
    a.to_zarr(d)
    a2 = da.from_zarr(d)
    assert_eq(a, a2)
-    assert a2.chunks == a.chunks


def test_zarr_in_memory_distributed_err(c):