From 15fc2b87358183bd25896b46b387e616d9e967ea Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 10 Sep 2022 17:44:28 -0400 Subject: [PATCH 001/158] generalise chunk methods to allow cubed --- xarray/core/dataarray.py | 3 + xarray/core/dataset.py | 18 +++++- xarray/core/pycompat.py | 13 +++- xarray/core/types.py | 7 +++ xarray/core/variable.py | 129 +++++++++++++++++++++++++++------------ 5 files changed, 126 insertions(+), 44 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d5bc22f9f88..38cf5e6a0f9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -77,6 +77,7 @@ from .rolling import DataArrayCoarsen, DataArrayRolling from .types import ( CoarsenBoundaryOptions, + CubedSpec, DatetimeUnitOptions, ErrorOptions, ErrorOptionsWithWarn, @@ -1155,6 +1156,8 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + manager: Literal["dask", "cubed"] = "dask", + spec: CubedSpec = None, **chunks_kwargs: Any, ) -> T_DataArray: """Coerce this array's data into a dask arrays with the given chunks. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cf1f68d9343..d7e8dc8d85c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -117,6 +117,7 @@ QueryParserOptions, ReindexMethodOptions, SideOptions, + T_CubedSpec, T_Xarray, ) from .weighted import DatasetWeighted @@ -260,6 +261,8 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, + manager: Literal["dask", "cubed"] = "dask", + spec: T_CubedSpec = None, ): from dask.base import tokenize @@ -267,11 +270,18 @@ def _maybe_chunk( chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: # when rechunking by different amounts, make sure dask names change - # by provinding chunks as an input to tokenize. + # by providing chunks as an input to tokenize. # subtle bugs result otherwise. 
see GH3350 token2 = tokenize(name, token if token else var._data, chunks) name2 = f"{name_prefix}{name}-{token2}" - var = var.chunk(chunks, name=name2, lock=lock, inline_array=inline_array) + var = var.chunk( + chunks, + name=name2, + lock=lock, + inline_array=inline_array, + manager=manager, + spec=spec, + ) if overwrite_encoded_chunks and var.chunks is not None: var.encoding["chunks"] = tuple(x[0] for x in var.chunks) @@ -2165,6 +2175,8 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + manager: Literal["dask", "cubed"] = "dask", + spec: T_CubedSpec = None, **chunks_kwargs: Any, ) -> T_Dataset: """Coerce all arrays in this dataset into dask arrays with the given @@ -2227,7 +2239,7 @@ def chunk( ) variables = { - k: _maybe_chunk(k, v, chunks, token, lock, name_prefix) + k: _maybe_chunk(k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec) for k, v in self.variables.items() } return self._replace(variables) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 09ee13e4941..6531b4f5289 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -31,10 +31,13 @@ def __init__(self, mod): duck_array_type = (duck_array_module.ndarray,) elif mod == "sparse": duck_array_type = (duck_array_module.SparseArray,) + elif mod == "cubed": + duck_array_type = (duck_array_module.CoreArray,) else: raise NotImplementedError - except ImportError: # pragma: no cover + except ImportError as err: # pragma: no cover + print(err) duck_array_module = None duck_array_version = Version("0.0.0") duck_array_type = () @@ -53,6 +56,10 @@ def __init__(self, mod): sparse_array_type = sp.type sparse_version = sp.version +cub = DuckArrayModule("cubed") +cubed_version = cub.version +cubed_array_type = cub.type + cupy_array_type = DuckArrayModule("cupy").type @@ -67,3 +74,7 @@ def is_dask_collection(x): def is_duck_dask_array(x): return is_duck_array(x) and is_dask_collection(x) + + +def is_chunked_array(x): + return 
is_duck_dask_array(x) or isinstance(x, cubed_array_type) diff --git a/xarray/core/types.py b/xarray/core/types.py index 03323f6c598..a291ea9d8cb 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -18,6 +18,11 @@ except ImportError: DaskArray = np.ndarray # type: ignore + try: + from cubed import Spec + except ImportError: + Spec = None # type: ignore + # TODO: Turn on when https://github.com/python/mypy/issues/11871 is fixed. # Can be uncommented if using pyright though. # import sys @@ -50,6 +55,8 @@ VarCompatible = Union["Variable", "ScalarOrArray"] GroupByIncompatible = Union["Variable", "GroupBy"] +T_CubedSpec = TypeVar("T_CubedSpec", bound="Spec") + ErrorOptions = Literal["raise", "ignore"] ErrorOptionsWithWarn = Literal["raise", "warn", "ignore"] diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6e172c06730..2678989d0f4 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -38,9 +38,11 @@ from .options import OPTIONS, _get_keep_attrs from .pycompat import ( DuckArrayModule, + cubed_array_type, cupy_array_type, dask_array_type, integer_types, + is_chunked_array, is_duck_dask_array, sparse_array_type, ) @@ -64,6 +66,7 @@ pd.Index, ) + dask_array_type + + cubed_array_type + cupy_array_type ) # https://github.com/python/mypy/issues/224 @@ -74,6 +77,7 @@ ErrorOptionsWithWarn, PadModeOptions, PadReflectOptions, + T_CubedSpec, T_Variable, ) @@ -462,7 +466,7 @@ def load(self, **kwargs): -------- dask.array.compute """ - if is_duck_dask_array(self._data): + if is_chunked_array(self._data): self._data = as_compatible_data(self._data.compute(**kwargs)) elif not is_duck_array(self._data): self._data = np.asarray(self._data) @@ -1052,6 +1056,8 @@ def chunk( name: str = None, lock: bool = False, inline_array: bool = False, + manager: Literal["dask", "cubed"] = "dask", + spec: T_CubedSpec = None, **chunks_kwargs: Any, ) -> Variable: """Coerce this array's data into a dask array with the given chunks. 
@@ -1093,54 +1099,89 @@ def chunk( xarray.unify_chunks dask.array.from_array """ - import dask.array as da + if manager == "dask": + import dask.array as da + + if chunks is None: + warnings.warn( + "None value for 'chunks' is deprecated. " + "It will raise an error in the future. Use instead '{}'", + category=FutureWarning, + ) + chunks = {} - if chunks is None: - warnings.warn( - "None value for 'chunks' is deprecated. " - "It will raise an error in the future. Use instead '{}'", - category=FutureWarning, - ) - chunks = {} + if isinstance(chunks, (float, str, int, tuple, list)): + pass # dask.array.from_array can handle these directly + else: + chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") - if isinstance(chunks, (float, str, int, tuple, list)): - pass # dask.array.from_array can handle these directly - else: - chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + if utils.is_dict_like(chunks): + chunks = { + self.get_axis_num(dim): chunk for dim, chunk in chunks.items() + } - if utils.is_dict_like(chunks): - chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + data = self._data + if is_duck_dask_array(data): + data = data.rechunk(chunks) + elif isinstance(data, cubed_array_type): + raise TypeError("Trying to rechunk a cubed array using dask") + else: + if isinstance(data, indexing.ExplicitlyIndexed): + # Unambiguously handle array storage backends (like NetCDF4 and h5py) + # that can't handle general array indexing. For example, in netCDF4 you + # can do "outer" indexing along two dimensions independent, which works + # differently from how NumPy handles it. + # da.from_array works by using lazy indexing with a tuple of slices. 
+ # Using OuterIndexer is a pragmatic choice: dask does not yet handle + # different indexing types in an explicit way: + # https://github.com/dask/dask/issues/2883 + data = indexing.ImplicitToExplicitIndexingAdapter( + data, indexing.OuterIndexer + ) - data = self._data - if is_duck_dask_array(data): - data = data.rechunk(chunks) - else: - if isinstance(data, indexing.ExplicitlyIndexed): - # Unambiguously handle array storage backends (like NetCDF4 and h5py) - # that can't handle general array indexing. For example, in netCDF4 you - # can do "outer" indexing along two dimensions independent, which works - # differently from how NumPy handles it. - # da.from_array works by using lazy indexing with a tuple of slices. - # Using OuterIndexer is a pragmatic choice: dask does not yet handle - # different indexing types in an explicit way: - # https://github.com/dask/dask/issues/2883 - data = indexing.ImplicitToExplicitIndexingAdapter( - data, indexing.OuterIndexer + # All of our lazily loaded backend array classes should use NumPy + # array operations. + kwargs = {"meta": np.ndarray} + else: + kwargs = {} + + if utils.is_dict_like(chunks): + chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape)) + + data = da.from_array( + data, + chunks, + name=name, + lock=lock, + inline_array=inline_array, + **kwargs, ) - # All of our lazily loaded backend array classes should use NumPy - # array operations. 
- kwargs = {"meta": np.ndarray} - else: - kwargs = {} + elif manager == "cubed": + import cubed + + chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape)) + chunks = { + self.get_axis_num(dim): chunk for dim, chunk in chunks.items() + } - data = da.from_array( - data, chunks, name=name, lock=lock, inline_array=inline_array, **kwargs + data = self._data + if isinstance(data, cubed_array_type): + data = data.rechunk(chunks) + elif is_duck_dask_array(data): + raise TypeError("Trying to rechunk a dask array using cubed") + + data = cubed.from_array( + data, + chunks, + spec=spec, ) + else: + raise ValueError + return self._replace(data=data) def to_numpy(self) -> np.ndarray: @@ -1149,7 +1190,7 @@ def to_numpy(self) -> np.ndarray: data = self.data # TODO first attempt to call .to_numpy() once some libraries implement it - if isinstance(data, dask_array_type): + if isinstance(data, (dask_array_type, cubed_array_type)): data = data.compute() if isinstance(data, cupy_array_type): data = data.get() @@ -2760,7 +2801,15 @@ def values(self, values): f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." ) - def chunk(self, chunks={}, name=None, lock=False, inline_array=False): + def chunk( + self, + chunks={}, + name=None, + lock=False, + inline_array=False, + manager="dask", + spec=None, + ): # Dummy - do not chunk. This method is invoked e.g. 
by Dataset.chunk() return self.copy(deep=False) From 5e05b712e9db4dcc6a8b8e085a4f627349b12322 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 10 Sep 2022 22:03:58 +0000 Subject: [PATCH 002/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d7e8dc8d85c..2844905dabc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2239,7 +2239,9 @@ def chunk( ) variables = { - k: _maybe_chunk(k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec) + k: _maybe_chunk( + k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec + ) for k, v in self.variables.items() } return self._replace(variables) From cff89eeffd2687e944c5792d1476fa8cd813712c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 10 Sep 2022 18:57:44 -0400 Subject: [PATCH 003/158] fic typing typo --- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 38cf5e6a0f9..80e72082c0b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -77,7 +77,6 @@ from .rolling import DataArrayCoarsen, DataArrayRolling from .types import ( CoarsenBoundaryOptions, - CubedSpec, DatetimeUnitOptions, ErrorOptions, ErrorOptionsWithWarn, @@ -88,6 +87,7 @@ QueryParserOptions, ReindexMethodOptions, SideOptions, + T_CubedSpec, T_DataArray, T_Xarray, ) @@ -1157,7 +1157,7 @@ def chunk( lock: bool = False, inline_array: bool = False, manager: Literal["dask", "cubed"] = "dask", - spec: CubedSpec = None, + spec: T_CubedSpec = None, **chunks_kwargs: Any, ) -> T_DataArray: """Coerce this array's data into a dask arrays with the given chunks. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d7e8dc8d85c..2844905dabc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2239,7 +2239,9 @@ def chunk( ) variables = { - k: _maybe_chunk(k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec) + k: _maybe_chunk( + k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec + ) for k, v in self.variables.items() } return self._replace(variables) From 60d44bc39d097f5be11f6ea663a46a085c6e632a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 11 Sep 2022 16:37:08 -0400 Subject: [PATCH 004/158] fixed circular import --- xarray/core/pycompat.py | 8 ++------ xarray/core/utils.py | 4 ++-- xarray/core/variable.py | 31 +++++++++++++++++++------------ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 6531b4f5289..ec1c7de7da1 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -36,8 +36,7 @@ def __init__(self, mod): else: raise NotImplementedError - except ImportError as err: # pragma: no cover - print(err) + except ImportError: # pragma: no cover duck_array_module = None duck_array_version = Version("0.0.0") duck_array_type = () @@ -56,10 +55,6 @@ def __init__(self, mod): sparse_array_type = sp.type sparse_version = sp.version -cub = DuckArrayModule("cubed") -cubed_version = cub.version -cubed_array_type = cub.type - cupy_array_type = DuckArrayModule("cupy").type @@ -77,4 +72,5 @@ def is_duck_dask_array(x): def is_chunked_array(x): + cubed_array_type = DuckArrayModule("cubed").type return is_duck_dask_array(x) or isinstance(x, cubed_array_type) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 51bf1346506..8eb61ce69cc 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -290,7 +290,7 @@ def either_dict_or_kwargs( def _is_scalar(value, include_0d): - from .variable import NON_NUMPY_SUPPORTED_ARRAY_TYPES + from .variable import 
_get_NON_NUMPY_SUPPORTED_ARRAY_TYPES if include_0d: include_0d = getattr(value, "ndim", None) == 0 @@ -298,7 +298,7 @@ def _is_scalar(value, include_0d): include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES) + isinstance(value, (Iterable,) + _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES()) or hasattr(value, "__array_function__") or hasattr(value, "__array_namespace__") ) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 2678989d0f4..ebd6aa4006d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -38,7 +38,6 @@ from .options import OPTIONS, _get_keep_attrs from .pycompat import ( DuckArrayModule, - cubed_array_type, cupy_array_type, dask_array_type, integer_types, @@ -60,15 +59,21 @@ maybe_coerce_to_str, ) -NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( - ( - indexing.ExplicitlyIndexed, - pd.Index, + +def _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES(): + """Required instead of a global to avoid circular import errors with cubed""" + + return ( + ( + indexing.ExplicitlyIndexed, + pd.Index, + ) + + dask_array_type + + cupy_array_type + + DuckArrayModule("cubed").type ) - + dask_array_type - + cubed_array_type - + cupy_array_type -) + + # https://github.com/python/mypy/issues/224 BASIC_INDEXING_TYPES = integer_types + (slice,) @@ -216,7 +221,7 @@ def as_compatible_data(data, fastpath=False): if isinstance(data, (Variable, DataArray)): return data.data - if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES): + if isinstance(data, _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES()): return _maybe_wrap_data(data) if isinstance(data, tuple): @@ -1123,7 +1128,7 @@ def chunk( data = self._data if is_duck_dask_array(data): data = data.rechunk(chunks) - elif isinstance(data, cubed_array_type): + elif isinstance(data, DuckArrayModule("cubed").type): raise TypeError("Trying to rechunk a cubed array using dask") else: if isinstance(data, indexing.ExplicitlyIndexed): @@ -1168,7 +1173,7 @@ def chunk( } data = 
self._data - if isinstance(data, cubed_array_type): + if isinstance(data, cubed.Array): data = data.rechunk(chunks) elif is_duck_dask_array(data): raise TypeError("Trying to rechunk a dask array using cubed") @@ -1190,6 +1195,8 @@ def to_numpy(self) -> np.ndarray: data = self.data # TODO first attempt to call .to_numpy() once some libraries implement it + # cubed has to be imported dynamically as cubed imports rechunker which imports xarray + cubed_array_type = DuckArrayModule("cubed").type if isinstance(data, (dask_array_type, cubed_array_type)): data = data.compute() if isinstance(data, cupy_array_type): From 5ddba7e9286a498c50deea21bf1c3da9c4b392a3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 11 Sep 2022 16:48:02 -0400 Subject: [PATCH 005/158] fix some mypy errors --- xarray/core/types.py | 2 +- xarray/core/variable.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/core/types.py b/xarray/core/types.py index a291ea9d8cb..31832444cb8 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -19,7 +19,7 @@ DaskArray = np.ndarray # type: ignore try: - from cubed import Spec + from cubed import Spec # type: ignore except ImportError: Spec = None # type: ignore diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ebd6aa4006d..e61db0517ea 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1163,9 +1163,12 @@ def chunk( ) elif manager == "cubed": - import cubed + import cubed # type: ignore - chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + if isinstance(chunks, (float, str, int, tuple, list)): + raise TypeError # unsure if cubed.from_array can handle this directly + else: + chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") if utils.is_dict_like(chunks): chunks = { From 37d0d6665d76333ef8b7e3831f1fcf20123eba46 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 11 Sep 2022 17:38:05 -0400 Subject: [PATCH 006/158] added cubed to mypy ignore list --- 
setup.cfg | 2 ++ xarray/core/types.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 29b4a3b0b8b..b806c57cb1a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -183,6 +183,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-cftime.*] ignore_missing_imports = True +[mypy-cubed.*] +ignore_missing_imports = True [mypy-cupy.*] ignore_missing_imports = True [mypy-dask.*] diff --git a/xarray/core/types.py b/xarray/core/types.py index 31832444cb8..a291ea9d8cb 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -19,7 +19,7 @@ DaskArray = np.ndarray # type: ignore try: - from cubed import Spec # type: ignore + from cubed import Spec except ImportError: Spec = None # type: ignore From cdcb3fb7d87e7420210ed4a3219f4ee90f56fe21 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 12 Sep 2022 17:56:23 -0400 Subject: [PATCH 007/158] simplify __array_ufunc__ check --- xarray/core/arithmetic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xarray/core/arithmetic.py b/xarray/core/arithmetic.py index ff7af02abfc..8d6a1d3ed8c 100644 --- a/xarray/core/arithmetic.py +++ b/xarray/core/arithmetic.py @@ -16,7 +16,7 @@ from .common import ImplementsArrayReduce, ImplementsDatasetReduce from .ops import IncludeCumMethods, IncludeNumpySameMethods, IncludeReduceMethods from .options import OPTIONS, _get_keep_attrs -from .pycompat import dask_array_type +from .pycompat import is_duck_array class SupportsArithmetic: @@ -33,12 +33,11 @@ class SupportsArithmetic: # TODO: allow extending this with some sort of registration system _HANDLED_TYPES = ( - np.ndarray, np.generic, numbers.Number, bytes, str, - ) + dask_array_type + ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .computation import apply_ufunc @@ -46,7 +45,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # See the docstring example for numpy.lib.mixins.NDArrayOperatorsMixin. 
out = kwargs.get("out", ()) for x in inputs + out: - if not isinstance(x, self._HANDLED_TYPES + (SupportsArithmetic,)): + if not is_duck_array(x) and not isinstance( + x, self._HANDLED_TYPES + (SupportsArithmetic,) + ): return NotImplemented if ufunc.signature is not None: From 73e45635f29aa6e4ee4794ddc5ebf95a0620ab0d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 12 Sep 2022 17:59:28 -0400 Subject: [PATCH 008/158] Revert "simplify __array_ufunc__ check" as I pushed to wrong branch This reverts commit cdcb3fb7d87e7420210ed4a3219f4ee90f56fe21. --- xarray/core/arithmetic.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/xarray/core/arithmetic.py b/xarray/core/arithmetic.py index 8d6a1d3ed8c..ff7af02abfc 100644 --- a/xarray/core/arithmetic.py +++ b/xarray/core/arithmetic.py @@ -16,7 +16,7 @@ from .common import ImplementsArrayReduce, ImplementsDatasetReduce from .ops import IncludeCumMethods, IncludeNumpySameMethods, IncludeReduceMethods from .options import OPTIONS, _get_keep_attrs -from .pycompat import is_duck_array +from .pycompat import dask_array_type class SupportsArithmetic: @@ -33,11 +33,12 @@ class SupportsArithmetic: # TODO: allow extending this with some sort of registration system _HANDLED_TYPES = ( + np.ndarray, np.generic, numbers.Number, bytes, str, - ) + ) + dask_array_type def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .computation import apply_ufunc @@ -45,9 +46,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # See the docstring example for numpy.lib.mixins.NDArrayOperatorsMixin. 
out = kwargs.get("out", ()) for x in inputs + out: - if not is_duck_array(x) and not isinstance( - x, self._HANDLED_TYPES + (SupportsArithmetic,) - ): + if not isinstance(x, self._HANDLED_TYPES + (SupportsArithmetic,)): return NotImplemented if ufunc.signature is not None: From 5995685b2a10bba2e422e94102a25b65d6fa6ed5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 20 Sep 2022 13:26:09 -0400 Subject: [PATCH 009/158] update cubed array type --- xarray/core/pycompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index ec1c7de7da1..680deeba809 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -32,7 +32,7 @@ def __init__(self, mod): elif mod == "sparse": duck_array_type = (duck_array_module.SparseArray,) elif mod == "cubed": - duck_array_type = (duck_array_module.CoreArray,) + duck_array_type = (duck_array_module.Array,) else: raise NotImplementedError From 320b09f7eef5508f6f1709775a5f75ec32899573 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 20 Sep 2022 13:56:18 -0400 Subject: [PATCH 010/158] fix missed conflict --- xarray/core/variable.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c92fc4dc805..fd27d08c465 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1198,13 +1198,9 @@ def to_numpy(self) -> np.ndarray: data = self.data # TODO first attempt to call .to_numpy() once some libraries implement it -<<<<<<< HEAD # cubed has to be imported dynamically as cubed imports rechunker which imports xarray cubed_array_type = DuckArrayModule("cubed").type - if isinstance(data, (dask_array_type, cubed_array_type)): -======= if hasattr(data, "chunks"): ->>>>>>> main data = data.compute() if isinstance(data, cupy_array_type): data = data.get() From 3facfd64ac5da902f0cb75ce61ed4016f8df1653 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 20 Sep 2022 16:23:42 -0400 Subject: 
[PATCH 011/158] sketch for ChunkManager adapter class --- xarray/core/parallelcompat.py | 196 ++++++++++++++++++++++++++++++++++ xarray/core/variable.py | 100 ++++------------- 2 files changed, 214 insertions(+), 82 deletions(-) create mode 100644 xarray/core/parallelcompat.py diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py new file mode 100644 index 00000000000..083f6e27354 --- /dev/null +++ b/xarray/core/parallelcompat.py @@ -0,0 +1,196 @@ +""" +The code in this module is an experiment in going from N=1 to N=2 parallel computing frameworks in xarray. +It could later be used as the basis for a public interface allowing any N frameworks to interoperate with xarray, +but for now it is just a private experiment. +""" + +from abc import ABC, abstractmethod +from typing import Any + +import numpy as np + +from .pycompat import DuckArrayModule, is_duck_dask_array +from . import indexing, utils + +CHUNK_MANAGERS = {} + + +def _get_chunk_manager(name: str) -> "ChunkManager": + if name in CHUNK_MANAGERS: + chunkmanager = CHUNK_MANAGERS[name] + return chunkmanager() + else: + raise ImportError(f"ChunkManager {name} has not been defined") + + +class ChunkManager(ABC): + """ + Adapter between a particular parallel computing framework and xarray. + + Attributes + ---------- + array_type + Type of the array class this parallel computing framework provides. + + Parallel frameworks need to provide an array class that supports the array API standard. + Used for type checking. + """ + + @staticmethod + @abstractmethod + def chunks(arr): + ... + + @staticmethod + @abstractmethod + def chunk(data: np.ndarray, chunks, **kwargs): + ... + + @staticmethod + @abstractmethod + def rechunk(data: Any, chunks, **kwargs): + ... + + @staticmethod + @abstractmethod + def compute(arr, **kwargs) -> np.ndarray: + ... 
+ + @staticmethod + @abstractmethod + def apply_ufunc(): + """ + Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. + """ + ... + + @staticmethod + def map_blocks(): + """Called by xarray.map_blocks.""" + raise NotImplementedError() + + @staticmethod + def blockwise(): + """Called by some niche functions in xarray.""" + raise NotImplementedError() + + +class DaskManager(ChunkManager): + def __init__(self): + from dask.array import Array + + self.array_type = Array + + @staticmethod + def chunks(arr: "dask.array.Array"): + return arr.chunks + + @staticmethod + def chunk(data: Any, chunks, **kwargs): + import dask.array as da + + # dask-specific kwargs + name = kwargs.pop("name", None) + lock = kwargs.pop("lock", False) + inline_array = kwargs.pop("inline_array", False) + + if is_duck_dask_array(data): + data = data.rechunk(chunks) + elif isinstance(data, DuckArrayModule("cubed").type): + raise TypeError("Trying to rechunk a cubed array using dask") + else: + if isinstance(data, indexing.ExplicitlyIndexed): + # Unambiguously handle array storage backends (like NetCDF4 and h5py) + # that can't handle general array indexing. For example, in netCDF4 you + # can do "outer" indexing along two dimensions independent, which works + # differently from how NumPy handles it. + # da.from_array works by using lazy indexing with a tuple of slices. + # Using OuterIndexer is a pragmatic choice: dask does not yet handle + # different indexing types in an explicit way: + # https://github.com/dask/dask/issues/2883 + data = indexing.ImplicitToExplicitIndexingAdapter( + data, indexing.OuterIndexer + ) + + # All of our lazily loaded backend array classes should use NumPy + # array operations. 
+ dask_kwargs = {"meta": np.ndarray} + else: + dask_kwargs = {} + + if utils.is_dict_like(chunks): + chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) + + data = da.from_array( + data, + chunks, + name=name, + lock=lock, + inline_array=inline_array, + **dask_kwargs, + ) + return data + + @staticmethod + def rechunk(chunks, **kwargs): + ... + + @staticmethod + def compute(arr, **kwargs): + return arr.compute(**kwargs) + + @staticmethod + def apply_ufunc(): + from dask.array.gufunc import apply_gufunc + ... + + @staticmethod + def map_blocks(): + from dask.array import map_blocks + ... + + @staticmethod + def blockwise(): + from dask.array import blockwise + ... + + +try: + import dask + + CHUNK_MANAGERS["dask"] = DaskManager +except ImportError: + pass + + +class CubedManager(ChunkManager): + def __init__(self): + from cubed import Array + + self.array_type = Array + + def chunk(self, data: np.ndarray, chunks, **kwargs): + import cubed # type: ignore + + spec = kwargs.pop("spec", None) + + if isinstance(data, cubed.Array): + data = data.rechunk(chunks) + elif is_duck_dask_array(data): + raise TypeError("Trying to rechunk a dask array using cubed") + else: + data = cubed.from_array( + data, + chunks, + spec=spec, + ) + + return data + + +try: + import cubed + + CHUNK_MANAGERS["cubed"] = CubedManager +except ImportError: + pass diff --git a/xarray/core/variable.py b/xarray/core/variable.py index fd27d08c465..78b73e90607 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -36,6 +36,7 @@ ) from .npcompat import QUANTILE_METHODS, ArrayLike from .options import OPTIONS, _get_keep_attrs +from .parallelcompat import _get_chunk_manager from .pycompat import ( DuckArrayModule, cupy_array_type, @@ -67,7 +68,6 @@ def _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES(): indexing.ExplicitlyIndexed, pd.Index, ) - + dask_array_type + cupy_array_type + DuckArrayModule("cubed").type ) @@ -1104,91 +1104,27 @@ def chunk( xarray.unify_chunks dask.array.from_array 
""" - if manager == "dask": - import dask.array as da - - if chunks is None: - warnings.warn( - "None value for 'chunks' is deprecated. " - "It will raise an error in the future. Use instead '{}'", - category=FutureWarning, - ) - chunks = {} - - if isinstance(chunks, (float, str, int, tuple, list)): - pass # dask.array.from_array can handle these directly - else: - chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") - - if utils.is_dict_like(chunks): - chunks = { - self.get_axis_num(dim): chunk for dim, chunk in chunks.items() - } - - data = self._data - if is_duck_dask_array(data): - data = data.rechunk(chunks) - elif isinstance(data, DuckArrayModule("cubed").type): - raise TypeError("Trying to rechunk a cubed array using dask") - else: - if isinstance(data, indexing.ExplicitlyIndexed): - # Unambiguously handle array storage backends (like NetCDF4 and h5py) - # that can't handle general array indexing. For example, in netCDF4 you - # can do "outer" indexing along two dimensions independent, which works - # differently from how NumPy handles it. - # da.from_array works by using lazy indexing with a tuple of slices. - # Using OuterIndexer is a pragmatic choice: dask does not yet handle - # different indexing types in an explicit way: - # https://github.com/dask/dask/issues/2883 - data = indexing.ImplicitToExplicitIndexingAdapter( - data, indexing.OuterIndexer - ) - - # All of our lazily loaded backend array classes should use NumPy - # array operations. 
- kwargs = {"meta": np.ndarray} - else: - kwargs = {} - - if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape)) - - data = da.from_array( - data, - chunks, - name=name, - lock=lock, - inline_array=inline_array, - **kwargs, - ) - - elif manager == "cubed": - import cubed # type: ignore - - if isinstance(chunks, (float, str, int, tuple, list)): - raise TypeError # unsure if cubed.from_array can handle this directly - else: - chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + chunk_manager = _get_chunk_manager(manager) - if utils.is_dict_like(chunks): - chunks = { - self.get_axis_num(dim): chunk for dim, chunk in chunks.items() - } + kwargs = dict(name=name, lock=lock, inline_array=inline_array, spec=spec) - data = self._data - if isinstance(data, cubed.Array): - data = data.rechunk(chunks) - elif is_duck_dask_array(data): - raise TypeError("Trying to rechunk a dask array using cubed") - - data = cubed.from_array( - data, - chunks, - spec=spec, + if chunks is None: + warnings.warn( + "None value for 'chunks' is deprecated. " + "It will raise an error in the future. 
Use instead '{}'", + category=FutureWarning, ) + chunks = {} + if isinstance(chunks, (float, str, int, tuple, list)): + pass # dask.array.from_array can handle these directly else: - raise ValueError + chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + + if utils.is_dict_like(chunks): + chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + + data = chunk_manager.chunk(self._data, chunks, **kwargs) return self._replace(data=data) @@ -1199,7 +1135,7 @@ def to_numpy(self) -> np.ndarray: # TODO first attempt to call .to_numpy() once some libraries implement it # cubed has to be imported dynamically as cubed imports rechunker which imports xarray - cubed_array_type = DuckArrayModule("cubed").type + # cubed_array_type = DuckArrayModule("cubed").type if hasattr(data, "chunks"): data = data.compute() if isinstance(data, cupy_array_type): From c616a85dacd374fe6b9063041dcec1fd8e18a67d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Sep 2022 20:25:15 +0000 Subject: [PATCH 012/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/parallelcompat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 083f6e27354..0975e3cd6c1 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -9,8 +9,8 @@ import numpy as np -from .pycompat import DuckArrayModule, is_duck_dask_array from . import indexing, utils +from .pycompat import DuckArrayModule, is_duck_dask_array CHUNK_MANAGERS = {} @@ -142,16 +142,19 @@ def compute(arr, **kwargs): @staticmethod def apply_ufunc(): from dask.array.gufunc import apply_gufunc + ... @staticmethod def map_blocks(): from dask.array import map_blocks + ... @staticmethod def blockwise(): from dask.array import blockwise + ... 
From ecabaa46fea9c3aec9bd0f3774c49e2642cfb100 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 Sep 2022 11:23:38 -0400 Subject: [PATCH 013/158] Remove erroneous docstring about usage of map_blocks Co-authored-by: Deepak Cherian --- xarray/core/parallelcompat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 0975e3cd6c1..aed39375ea4 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -66,7 +66,6 @@ def apply_ufunc(): @staticmethod def map_blocks(): - """Called by xarray.map_blocks.""" raise NotImplementedError() @staticmethod From e53a58894a875c192146838ceab2551ba6ef865f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 Sep 2022 11:24:14 -0400 Subject: [PATCH 014/158] apply_ufunc -> apply_gufunc Co-authored-by: Deepak Cherian --- xarray/core/parallelcompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index aed39375ea4..e537a08c482 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -58,7 +58,7 @@ def compute(arr, **kwargs) -> np.ndarray: @staticmethod @abstractmethod - def apply_ufunc(): + def apply_gufunc(): """ Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. 
""" From fe21edd324acbe87917afe4bab10e43882dd0ed6 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 Sep 2022 11:24:42 -0400 Subject: [PATCH 015/158] chunk -> from_array Co-authored-by: Deepak Cherian --- xarray/core/parallelcompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index e537a08c482..8cad033925f 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -43,7 +43,7 @@ def chunks(arr): @staticmethod @abstractmethod - def chunk(data: np.ndarray, chunks, **kwargs): + def from_array(data: np.ndarray, chunks, **kwargs): ... @staticmethod From 3f6aedc368b4df2d8829ef34a8e77574fc317152 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 23 Sep 2022 11:36:27 -0400 Subject: [PATCH 016/158] remove staticmethods --- xarray/core/parallelcompat.py | 42 ++++++++++++----------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 8cad033925f..7380668ee98 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -36,40 +36,33 @@ class ChunkManager(ABC): Used for type checking. """ - @staticmethod @abstractmethod - def chunks(arr): + def chunks(self, arr): ... - @staticmethod @abstractmethod - def from_array(data: np.ndarray, chunks, **kwargs): + def from_array(self, data: np.ndarray, chunks, **kwargs): ... - @staticmethod @abstractmethod - def rechunk(data: Any, chunks, **kwargs): + def rechunk(self, data: Any, chunks, **kwargs): ... - @staticmethod @abstractmethod - def compute(arr, **kwargs) -> np.ndarray: + def compute(self, arr, **kwargs) -> np.ndarray: ... - @staticmethod @abstractmethod - def apply_gufunc(): + def apply_gufunc(self): """ Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. """ ... 
- @staticmethod - def map_blocks(): + def map_blocks(self): raise NotImplementedError() - @staticmethod - def blockwise(): + def blockwise(self): """Called by some niche functions in xarray.""" raise NotImplementedError() @@ -80,12 +73,10 @@ def __init__(self): self.array_type = Array - @staticmethod - def chunks(arr: "dask.array.Array"): + def chunks(self, arr: "dask.array.Array"): return arr.chunks - @staticmethod - def chunk(data: Any, chunks, **kwargs): + def chunk(self, data: Any, chunks, **kwargs): import dask.array as da # dask-specific kwargs @@ -130,28 +121,23 @@ def chunk(data: Any, chunks, **kwargs): ) return data - @staticmethod - def rechunk(chunks, **kwargs): + def rechunk(self, chunks, **kwargs): ... - @staticmethod - def compute(arr, **kwargs): + def compute(self, arr, **kwargs): return arr.compute(**kwargs) - @staticmethod - def apply_ufunc(): + def apply_ufunc(self): from dask.array.gufunc import apply_gufunc ... - @staticmethod - def map_blocks(): + def map_blocks(self): from dask.array import map_blocks ... - @staticmethod - def blockwise(): + def blockwise(self): from dask.array import blockwise ... From ea8f482e2e8734536caeca48f4481b1bab717400 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 23 Sep 2022 14:02:26 -0400 Subject: [PATCH 017/158] attempt to type methods of ABC --- xarray/core/parallelcompat.py | 51 ++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7380668ee98..abfeff09318 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -3,16 +3,20 @@ It could later be used as the basis for a public interface allowing any N frameworks to interoperate with xarray, but for now it is just a private experiment. """ - from abc import ABC, abstractmethod -from typing import Any +from typing import Dict, Generic, Tuple, TypeVar import numpy as np +from typing_extensions import TypeAlias from . 
import indexing, utils from .pycompat import DuckArrayModule, is_duck_dask_array -CHUNK_MANAGERS = {} +T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") +T_ChunkedArray = TypeVar("T_ChunkedArray") +T_Chunks = Tuple[Tuple[int, ...], ...] + +CHUNK_MANAGERS: Dict[str, T_ChunkManager] = {} def _get_chunk_manager(name: str) -> "ChunkManager": @@ -23,7 +27,7 @@ def _get_chunk_manager(name: str) -> "ChunkManager": raise ImportError(f"ChunkManager {name} has not been defined") -class ChunkManager(ABC): +class ChunkManager(ABC, Generic[T_ChunkedArray]): """ Adapter between a particular parallel computing framework and xarray. @@ -36,20 +40,26 @@ class ChunkManager(ABC): Used for type checking. """ + array_type: T_ChunkedArray + @abstractmethod - def chunks(self, arr): + def chunks(self, data: T_ChunkedArray) -> T_Chunks: ... @abstractmethod - def from_array(self, data: np.ndarray, chunks, **kwargs): + def from_array( + self, data: np.ndarray, chunks: T_Chunks, **kwargs + ) -> T_ChunkedArray: ... @abstractmethod - def rechunk(self, data: Any, chunks, **kwargs): + def rechunk( + self, data: T_ChunkedArray, chunks: T_Chunks, **kwargs + ) -> T_ChunkedArray: ... @abstractmethod - def compute(self, arr, **kwargs) -> np.ndarray: + def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: ... 
@abstractmethod @@ -68,15 +78,18 @@ def blockwise(self): class DaskManager(ChunkManager): + + array_type: "dask.array.Array" + def __init__(self): from dask.array import Array self.array_type = Array - def chunks(self, arr: "dask.array.Array"): - return arr.chunks + def chunks(self, data): + return data.chunks - def chunk(self, data: Any, chunks, **kwargs): + def from_array(self, data: np.ndarray, chunks, **kwargs): import dask.array as da # dask-specific kwargs @@ -85,7 +98,7 @@ def chunk(self, data: Any, chunks, **kwargs): inline_array = kwargs.pop("inline_array", False) if is_duck_dask_array(data): - data = data.rechunk(chunks) + data = self.rechunk(data, chunks) elif isinstance(data, DuckArrayModule("cubed").type): raise TypeError("Trying to rechunk a cubed array using dask") else: @@ -121,13 +134,13 @@ def chunk(self, data: Any, chunks, **kwargs): ) return data - def rechunk(self, chunks, **kwargs): - ... + def rechunk(self, data, chunks, **kwargs): + return data.rechunk(chunks, **kwargs) - def compute(self, arr, **kwargs): - return arr.compute(**kwargs) + def compute(self, data, **kwargs): + return data.compute(**kwargs) - def apply_ufunc(self): + def apply_gufunc(self): from dask.array.gufunc import apply_gufunc ... 
@@ -157,7 +170,7 @@ def __init__(self): self.array_type = Array - def chunk(self, data: np.ndarray, chunks, **kwargs): + def from_array(self, data: np.ndarray, chunks, **kwargs): import cubed # type: ignore spec = kwargs.pop("spec", None) @@ -177,7 +190,7 @@ def chunk(self, data: np.ndarray, chunks, **kwargs): try: - import cubed + import cubed # type: ignore CHUNK_MANAGERS["cubed"] = CubedManager except ImportError: From c49ab8e17c7e2cdc8afb978ad73138357c86e819 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 23 Sep 2022 14:26:04 -0400 Subject: [PATCH 018/158] from_array --- xarray/core/variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 78b73e90607..572e2b5d8d9 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1124,7 +1124,7 @@ def chunk( if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} - data = chunk_manager.chunk(self._data, chunks, **kwargs) + data = chunk_manager.from_array(self._data, chunks, **kwargs) return self._replace(data=data) From 26d18685fc2dbdd3b5ea236899504447c4319758 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 23 Sep 2022 15:02:28 -0400 Subject: [PATCH 019/158] attempt to specify types --- xarray/core/parallelcompat.py | 39 ++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index abfeff09318..1f933ba4039 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -4,7 +4,7 @@ but for now it is just a private experiment. 
""" from abc import ABC, abstractmethod -from typing import Dict, Generic, Tuple, TypeVar +from typing import Dict, Generic, Tuple, Type, TypeVar import numpy as np from typing_extensions import TypeAlias @@ -14,7 +14,7 @@ T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") -T_Chunks = Tuple[Tuple[int, ...], ...] +T_Chunks: TypeAlias = Tuple[Tuple[int, ...], ...] CHUNK_MANAGERS: Dict[str, T_ChunkManager] = {} @@ -33,17 +33,22 @@ class ChunkManager(ABC, Generic[T_ChunkedArray]): Attributes ---------- - array_type + array_cls Type of the array class this parallel computing framework provides. Parallel frameworks need to provide an array class that supports the array API standard. Used for type checking. """ - array_type: T_ChunkedArray + array_cls: Type[T_ChunkedArray] + + @abstractmethod + def __init__(self): + ... @abstractmethod def chunks(self, data: T_ChunkedArray) -> T_Chunks: + ... @abstractmethod @@ -77,19 +82,22 @@ def blockwise(self): raise NotImplementedError() -class DaskManager(ChunkManager): +T_DaskArray = TypeVar("T_DaskArray", bound="dask.array.Array") - array_type: "dask.array.Array" + +class DaskManager(ChunkManager[T_DaskArray]): + + array_cls: T_DaskArray def __init__(self): from dask.array import Array - self.array_type = Array + self.array_cls = Array - def chunks(self, data): + def chunks(self, data: T_DaskArray) -> T_Chunks: return data.chunks - def from_array(self, data: np.ndarray, chunks, **kwargs): + def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_DaskArray: import dask.array as da # dask-specific kwargs @@ -134,10 +142,10 @@ def from_array(self, data: np.ndarray, chunks, **kwargs): ) return data - def rechunk(self, data, chunks, **kwargs): + def rechunk(self, data: T_DaskArray, chunks, **kwargs) -> T_DaskArray: return data.rechunk(chunks, **kwargs) - def compute(self, data, **kwargs): + def compute(self, data: T_DaskArray, **kwargs) -> np.ndarray: return 
data.compute(**kwargs) def apply_gufunc(self): @@ -164,13 +172,16 @@ def blockwise(self): pass -class CubedManager(ChunkManager): +T_CubedArray = TypeVar("T_CubedArray", bound="cubed.Array") + + +class CubedManager(ChunkManager[T_CubedArray]): def __init__(self): from cubed import Array - self.array_type = Array + self.array_cls = Array - def from_array(self, data: np.ndarray, chunks, **kwargs): + def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: import cubed # type: ignore spec = kwargs.pop("spec", None) From e9b4a336bd6d528e1a843fbc8c2cf8ae6b4e2c2a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 26 Sep 2022 10:32:56 -0400 Subject: [PATCH 020/158] method for checking array type --- xarray/core/parallelcompat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 1f933ba4039..f6c8750edbf 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -4,7 +4,7 @@ but for now it is just a private experiment. """ from abc import ABC, abstractmethod -from typing import Dict, Generic, Tuple, Type, TypeVar +from typing import Any, Dict, Generic, Tuple, Type, TypeVar import numpy as np from typing_extensions import TypeAlias @@ -46,6 +46,9 @@ class ChunkManager(ABC, Generic[T_ChunkedArray]): def __init__(self): ... 
+ def is_array_type(self, data: Any) -> bool: + return isinstance(data, self.array_cls) + @abstractmethod def chunks(self, data: T_ChunkedArray) -> T_Chunks: From c7c95899104a9888171110e4f7c9af9d71fa559a Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Thu, 27 Oct 2022 20:07:45 +0200 Subject: [PATCH 021/158] Update pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 271abc0aab1..2f4e55bdc5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ module = [ "cf_units.*", "cfgrib.*", "cftime.*", + "cubed.*", "cupy.*", "dask.*", "distributed.*", From 56e9d0fcf9d0a88697d4482609d0f3efa4bf535f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Jan 2023 23:12:08 +0000 Subject: [PATCH 022/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/parallelcompat.py | 4 ++-- xarray/core/variable.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index f6c8750edbf..f1fefb37fa6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -9,8 +9,8 @@ import numpy as np from typing_extensions import TypeAlias -from . 
import indexing, utils -from .pycompat import DuckArrayModule, is_duck_dask_array +from xarray.core import indexing, utils +from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f09d8e67e33..5e63c1f1b08 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -35,7 +35,7 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs -from .parallelcompat import _get_chunk_manager +from xarray.core.parallelcompat import _get_chunk_manager from xarray.core.pycompat import ( DuckArrayModule, array_type, From 3b16ccae3cdfb41306cff1c88236d7ecace2b3f0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 21 Jan 2023 23:00:14 -0500 Subject: [PATCH 023/158] fixed import errors --- xarray/core/parallelcompat.py | 4 ++-- xarray/core/utils.py | 4 ++-- xarray/core/variable.py | 11 +++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index f6c8750edbf..f1fefb37fa6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -9,8 +9,8 @@ import numpy as np from typing_extensions import TypeAlias -from . 
import indexing, utils -from .pycompat import DuckArrayModule, is_duck_dask_array +from xarray.core import indexing, utils +from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 3dbaee7fb4c..a4a61d407f9 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -285,7 +285,7 @@ def either_dict_or_kwargs( def _is_scalar(value, include_0d): - from xarray.core.variable import _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES + from xarray.core.variable import _get_non_numpy_supported_array_types if include_0d: include_0d = getattr(value, "ndim", None) == 0 @@ -293,7 +293,7 @@ def _is_scalar(value, include_0d): include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES()) + isinstance(value, (Iterable,) + _get_non_numpy_supported_array_types()) or hasattr(value, "__array_function__") or hasattr(value, "__array_namespace__") ) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f09d8e67e33..81b24dddd19 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import functools import itertools import math import numbers @@ -35,15 +36,13 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs -from .parallelcompat import _get_chunk_manager +from xarray.core.parallelcompat import _get_chunk_manager from xarray.core.pycompat import ( DuckArrayModule, array_type, - cupy_array_type, integer_types, is_chunked_array, is_duck_dask_array, - sparse_array_type, ) from xarray.core.utils import ( Frozen, @@ -60,7 +59,8 @@ ) -def _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES(): +@functools.cache +def _get_non_numpy_supported_array_types(): """Required instead of a global to avoid circular import errors with cubed""" return ( @@ -68,7 +68,6 @@ 
def _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES(): indexing.ExplicitlyIndexed, pd.Index, ) - + cupy_array_type + DuckArrayModule("cubed").type ) @@ -276,7 +275,7 @@ def as_compatible_data(data, fastpath=False): if isinstance(data, (Variable, DataArray)): return data.data - if isinstance(data, _get_NON_NUMPY_SUPPORTED_ARRAY_TYPES()): + if isinstance(data, _get_non_numpy_supported_array_types()): data = _possibly_convert_datetime_or_timedelta_index(data) return _maybe_wrap_data(data) From 7ac3323724bddf556855d6cad003426198e1d0de Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 21 Jan 2023 23:56:22 -0500 Subject: [PATCH 024/158] generalize .chunk method kwargs --- xarray/core/dataarray.py | 5 ++--- xarray/core/dataset.py | 19 +++++++++++-------- xarray/core/variable.py | 27 +++++++++++++-------------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4b1425703e2..993628981ce 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -100,7 +100,6 @@ QueryParserOptions, ReindexMethodOptions, SideOptions, - T_CubedSpec, T_DataArray, T_Xarray, ) @@ -1266,8 +1265,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - manager: Literal["dask", "cubed"] = "dask", - spec: T_CubedSpec = None, + from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: """Coerce this array's data into a dask arrays with the given chunks. 
@@ -1332,6 +1330,7 @@ def chunk( token=token, lock=lock, inline_array=inline_array, + from_array_kwargs=from_array_kwargs, ) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1cb26187171..a3e78a60d63 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -128,7 +128,6 @@ QueryParserOptions, ReindexMethodOptions, SideOptions, - T_CubedSpec, T_Xarray, ) from xarray.core.weighted import DatasetWeighted @@ -272,8 +271,7 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, - manager: Literal["dask", "cubed"] = "dask", - spec: T_CubedSpec = None, + from_array_kwargs=None, ): from dask.base import tokenize @@ -290,8 +288,7 @@ def _maybe_chunk( name=name2, lock=lock, inline_array=inline_array, - manager=manager, - spec=spec, + from_array_kwargs=from_array_kwargs, ) if overwrite_encoded_chunks and var.chunks is not None: @@ -2209,8 +2206,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - manager: Literal["dask", "cubed"] = "dask", - spec: T_CubedSpec = None, + from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: """Coerce all arrays in this dataset into dask arrays with the given @@ -2274,7 +2270,14 @@ def chunk( variables = { k: _maybe_chunk( - k, v, chunks, token, lock, name_prefix, manager=manager, spec=spec + k, + v, + chunks, + token, + lock, + name_prefix, + inline_array=inline_array, + from_array_kwargs=from_array_kwargs, ) for k, v in self.variables.items() } diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 81b24dddd19..66089319861 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -64,12 +64,9 @@ def _get_non_numpy_supported_array_types(): """Required instead of a global to avoid circular import errors with cubed""" return ( - ( - indexing.ExplicitlyIndexed, - pd.Index, - ) - + DuckArrayModule("cubed").type - ) + indexing.ExplicitlyIndexed, 
+ pd.Index, + ) + DuckArrayModule("cubed").type # https://github.com/python/mypy/issues/224 @@ -82,7 +79,6 @@ def _get_non_numpy_supported_array_types(): PadModeOptions, PadReflectOptions, QuantileMethods, - T_CubedSpec, T_Variable, ) @@ -1160,8 +1156,7 @@ def chunk( name: str | None = None, lock: bool = False, inline_array: bool = False, - manager: Literal["dask", "cubed"] = "dask", - spec: T_CubedSpec = None, + from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: """Coerce this array's data into a dask array with the given chunks. @@ -1203,9 +1198,14 @@ def chunk( xarray.unify_chunks dask.array.from_array """ - chunk_manager = _get_chunk_manager(manager) - kwargs = dict(name=name, lock=lock, inline_array=inline_array, spec=spec) + if from_array_kwargs is None: + from_array_kwargs = {} + chunk_manager = _get_chunk_manager(from_array_kwargs.pop("manager", "dask")) + + _from_array_kwargs = dict( + name=name, lock=lock, inline_array=inline_array, **from_array_kwargs + ) if chunks is None: warnings.warn( @@ -1223,7 +1223,7 @@ def chunk( if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} - data = chunk_manager.from_array(self._data, chunks, **kwargs) + data = chunk_manager.from_array(self._data, chunks, **_from_array_kwargs) return self._replace(data=data) @@ -2879,8 +2879,7 @@ def chunk( name=None, lock=False, inline_array=False, - manager="dask", - spec=None, + from_array_kwargs=None, ): # Dummy - do not chunk. This method is invoked e.g. 
by Dataset.chunk() return self.copy(deep=False) From e732b87cfa94ddbf7c46e6f2fbd58fbd96af55bd Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 22 Jan 2023 00:05:35 -0500 Subject: [PATCH 025/158] used dask functions in dask chunkmanager --- xarray/core/parallelcompat.py | 95 ++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 8 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index f1fefb37fa6..feca586dc25 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -151,20 +151,99 @@ def rechunk(self, data: T_DaskArray, chunks, **kwargs) -> T_DaskArray: def compute(self, data: T_DaskArray, **kwargs) -> np.ndarray: return data.compute(**kwargs) - def apply_gufunc(self): + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + axis=None, + keepdims=False, + output_dtypes=None, + output_sizes=None, + vectorize=None, + allow_rechunk=False, + meta=None, + **kwargs, + ): from dask.array.gufunc import apply_gufunc - ... - - def map_blocks(self): + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + keepdims=keepdims, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + allow_rechunk=allow_rechunk, + meta=meta, + **kwargs, + ) + + def map_blocks( + self, + func, + *args, + name=None, + token=None, + dtype=None, + chunks=None, + drop_axis=None, + new_axis=None, + enforce_ndim=False, + meta=None, + **kwargs, + ): from dask.array import map_blocks - ... - - def blockwise(self): + return map_blocks( + func, + *args, + name=name, + token=token, + dtype=dtype, + chunks=chunks, + drop_axis=drop_axis, + new_axis=new_axis, + enforce_ndim=enforce_ndim, + meta=meta, + **kwargs, + ) + + def blockwise( + self, + func, + out_ind, + *args, + name=None, + token=None, + dtype=None, + adjust_chunks=None, + new_axes=None, + align_arrays=True, + concatenate=None, + meta=None, + **kwargs, + ): from dask.array import blockwise - ... 
+ return blockwise( + func, + out_ind, + *args, + name=name, + token=token, + dtype=dtype, + adjust_chunks=adjust_chunks, + new_axes=new_axes, + align_arrays=align_arrays, + concatenate=concatenate, + meta=meta, + **kwargs, + ) try: From 8442e1fd7cbff2917a1313bc74020672d97107d3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 22 Jan 2023 00:43:10 -0500 Subject: [PATCH 026/158] define signatures for apply_gufunc, blockwise, map_blocks --- xarray/core/parallelcompat.py | 83 +++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index feca586dc25..b3af0fe055b 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -71,16 +71,42 @@ def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: ... @abstractmethod - def apply_gufunc(self): + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + keepdims=False, + output_dtypes=None, + vectorize=None, + **kwargs, + ): """ Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. """ ... - def map_blocks(self): + def map_blocks( + self, + func, + *args, + dtype=None, + **kwargs, + ): + """Currently only called in a couple of really niche places in xarray. 
Not even called in xarray.map_blocks.""" raise NotImplementedError() - def blockwise(self): + def blockwise( + self, + func, + out_ind, + *args, + adjust_chunks=None, + new_axes=None, + align_arrays=True, + **kwargs, + ): """Called by some niche functions in xarray.""" raise NotImplementedError() @@ -281,6 +307,57 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: return data + def map_blocks( + self, + func, + *args, + dtype=None, + chunks=None, + drop_axis=[], + new_axis=None, + **kwargs, + ): + from cubed.core.ops import map_blocks + + return map_blocks( + func, + *args, + dtype=dtype, + chunks=chunks, + drop_axis=drop_axis, + new_axis=new_axis, + **kwargs, + ) + + def blockwise( + self, + func, + out_ind, + *args: Any, + # can't type this as mypy assumes args are all same type, but blockwise args alternate types + dtype=None, + adjust_chunks=None, + new_axes=None, + align_arrays=True, + target_store=None, + **kwargs, + ): + from cubed.core.ops import blockwise + + # TODO where to get the target_store kwarg from? Filter down from a blockwise call? Set as attribute on CubedManager? 
+ + return blockwise( + func, + out_ind, + *args, + dtype=dtype, + adjust_chunks=adjust_chunks, + new_axes=new_axes, + align_arrays=align_arrays, + target_store=target_store, + **kwargs, + ) + try: import cubed # type: ignore From 37174314fc4bb1885bfafac3e09444eb8ea8d367 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 22 Jan 2023 01:03:13 -0500 Subject: [PATCH 027/158] prototype function to detect which parallel backend to use --- xarray/core/parallelcompat.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index b3af0fe055b..4fbc927b6f6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -22,11 +22,30 @@ def _get_chunk_manager(name: str) -> "ChunkManager": if name in CHUNK_MANAGERS: chunkmanager = CHUNK_MANAGERS[name] - return chunkmanager() + return chunkmanager else: raise ImportError(f"ChunkManager {name} has not been defined") +def _detect_parallel_array_type(*args) -> "ChunkManager": + """Detects which parallel backend should be used for given arrays (e.g. a list of dask arrays)""" + # TODO assert all arrays are the same type (or numpy) + arr = args[0] + + # iterate over defined chunk managers, seeing if each recognises this array type + for chunkmanager in CHUNK_MANAGERS.values(): + if chunkmanager.is_array_type(arr): + return chunkmanager + + raise ChunkManagerNotFoundError( + f"Could not find a Chunk Manager which recognises type {type(arr)}" + ) + + +class ChunkManagerNotFoundError(Exception): + ... + + class ChunkManager(ABC, Generic[T_ChunkedArray]): """ Adapter between a particular parallel computing framework and xarray. 
@@ -275,7 +294,7 @@ def blockwise( try: import dask - CHUNK_MANAGERS["dask"] = DaskManager + CHUNK_MANAGERS["dask"] = DaskManager() except ImportError: pass @@ -362,6 +381,6 @@ def blockwise( try: import cubed # type: ignore - CHUNK_MANAGERS["cubed"] = CubedManager + CHUNK_MANAGERS["cubed"] = CubedManager() except ImportError: pass From 7ac6531915556bdae424bcfe2767ce88b2a0bbd7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 10:15:31 -0500 Subject: [PATCH 028/158] add cubed.apply_gufunc --- xarray/core/parallelcompat.py | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 4fbc927b6f6..7f80c4451a6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -377,6 +377,42 @@ def blockwise( **kwargs, ) + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + axis=None, + keepdims=False, + output_dtypes=None, + output_sizes=None, + vectorize=None, + allow_rechunk=False, + meta=None, + **kwargs, + ): + if allow_rechunk: + raise NotImplementedError( + "cubed.apply_gufunc doesn't support allow_rechunk" + ) + if keepdims: + raise NotImplementedError("cubed.apply_gufunc doesn't support keepdims") + + from cubed import apply_gufunc + + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + **kwargs, + ) + try: import cubed # type: ignore From e423bfba132c61a99d2510dae2817e0b3b5c3eeb Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 10:19:14 -0500 Subject: [PATCH 029/158] ruffify --- xarray/core/parallelcompat.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7f80c4451a6..e271b62ab59 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -4,7 +4,7 @@ but for now it is just 
a private experiment. """ from abc import ABC, abstractmethod -from typing import Any, Dict, Generic, Tuple, Type, TypeVar +from typing import Any, Generic, TypeVar import numpy as np from typing_extensions import TypeAlias @@ -14,9 +14,9 @@ T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") -T_Chunks: TypeAlias = Tuple[Tuple[int, ...], ...] +T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] -CHUNK_MANAGERS: Dict[str, T_ChunkManager] = {} +CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} def _get_chunk_manager(name: str) -> "ChunkManager": @@ -59,7 +59,7 @@ class ChunkManager(ABC, Generic[T_ChunkedArray]): Used for type checking. """ - array_cls: Type[T_ChunkedArray] + array_cls: type[T_ChunkedArray] @abstractmethod def __init__(self): @@ -70,7 +70,6 @@ def is_array_type(self, data: Any) -> bool: @abstractmethod def chunks(self, data: T_ChunkedArray) -> T_Chunks: - ... @abstractmethod @@ -134,7 +133,6 @@ def blockwise( class DaskManager(ChunkManager[T_DaskArray]): - array_cls: T_DaskArray def __init__(self): From 149db9d733519418f89a6abe2bb46825f97b9c13 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 10:29:47 -0500 Subject: [PATCH 030/158] add rechunk and compute methods for cubed --- xarray/core/parallelcompat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index e271b62ab59..45e9c4c863f 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -324,6 +324,12 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: return data + def rechunk(self, data: T_CubedArray, chunks, **kwargs) -> T_CubedArray: + return data.rechunk(chunks, **kwargs) + + def compute(self, data: T_CubedArray, **kwargs) -> np.ndarray: + return data.compute(**kwargs) + def map_blocks( self, func, From 280c563f04f7c8aecc485ae4c36db93f7f330082 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 
2023 13:51:02 -0500 Subject: [PATCH 031/158] xr.apply_ufunc now dispatches to chunkmanager.apply_gufunc --- xarray/core/computation.py | 27 +++++++++++++++------------ xarray/core/parallelcompat.py | 31 +++++++++++++++++++++++++------ xarray/core/pycompat.py | 1 + 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 2305e753cee..b42b66a8617 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -29,7 +29,8 @@ from xarray.core.indexes import Index, filter_indexes_from_coords from xarray.core.merge import merge_attrs, merge_coordinates_without_align from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import _detect_parallel_array_type +from xarray.core.pycompat import is_chunked_array, is_duck_dask_array from xarray.core.types import Dims, T_DataArray from xarray.core.utils import is_dict_like, is_scalar from xarray.core.variable import Variable @@ -683,16 +684,18 @@ def apply_variable_ufunc( for arg, core_dims in zip(args, signature.input_core_dims) ] - if any(is_duck_dask_array(array) for array in input_data): + if any(is_chunked_array(array) for array in input_data): if dask == "forbidden": raise ValueError( - "apply_ufunc encountered a dask array on an " - "argument, but handling for dask arrays has not " + "apply_ufunc encountered a chunked array on an " + "argument, but handling for chunked arrays has not " "been enabled. 
Either set the ``dask`` argument " "or load your data into memory first with " "``.load()`` or ``.compute()``" ) elif dask == "parallelized": + chunk_manager = _detect_parallel_array_type(*input_data) + numpy_func = func if dask_gufunc_kwargs is None: @@ -705,7 +708,7 @@ def apply_variable_ufunc( for n, (data, core_dims) in enumerate( zip(input_data, signature.input_core_dims) ): - if is_duck_dask_array(data): + if is_chunked_array(data): # core dimensions cannot span multiple chunks for axis, dim in enumerate(core_dims, start=-len(core_dims)): if len(data.chunks[axis]) != 1: @@ -713,7 +716,7 @@ def apply_variable_ufunc( f"dimension {dim} on {n}th function argument to " "apply_ufunc with dask='parallelized' consists of " "multiple chunks, but is also a core dimension. To " - "fix, either rechunk into a single dask array chunk along " + "fix, either rechunk into a single array chunk along " f"this dimension, i.e., ``.chunk(dict({dim}=-1))``, or " "pass ``allow_rechunk=True`` in ``dask_gufunc_kwargs`` " "but beware that this may significantly increase memory usage." @@ -740,9 +743,7 @@ def apply_variable_ufunc( ) def func(*arrays): - import dask.array as da - - res = da.apply_gufunc( + res = chunk_manager.apply_gufunc( numpy_func, signature.to_gufunc_string(exclude_dims), *arrays, @@ -754,11 +755,11 @@ def func(*arrays): return res elif dask == "allowed": + # TODO Check chunked array types here too? 
pass else: raise ValueError( - "unknown setting for dask array handling in " - "apply_ufunc: {}".format(dask) + "unknown setting for chunked array handling in " f"apply_ufunc: {dask}" ) else: if vectorize: @@ -820,7 +821,7 @@ def func(*arrays): def apply_array_ufunc(func, *args, dask="forbidden"): """Apply a ndarray level function over ndarray objects.""" - if any(is_duck_dask_array(arg) for arg in args): + if any(is_chunked_array(arg) for arg in args): if dask == "forbidden": raise ValueError( "apply_ufunc encountered a dask array on an " @@ -2069,6 +2070,7 @@ def _calc_idxminmax( # This will run argmin or argmax. indx = func(array, dim=dim, axis=None, keep_attrs=keep_attrs, skipna=skipna) + # TODO generalize this to other types of chunked array # Handle dask arrays. if is_duck_dask_array(array.data): import dask.array @@ -2161,6 +2163,7 @@ def unify_chunks(*objects: Dataset | DataArray) -> tuple[Dataset | DataArray, .. if not unify_chunks_args: return objects + # TODO generalize this to go through ChunkManager # Run dask.array.core.unify_chunks from dask.array.core import unify_chunks diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 45e9c4c863f..53f8f60c642 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -10,7 +10,7 @@ from typing_extensions import TypeAlias from xarray.core import indexing, utils -from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array +from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") @@ -28,17 +28,36 @@ def _get_chunk_manager(name: str) -> "ChunkManager": def _detect_parallel_array_type(*args) -> "ChunkManager": - """Detects which parallel backend should be used for given arrays (e.g. 
a list of dask arrays)""" - # TODO assert all arrays are the same type (or numpy) - arr = args[0] + """ + Detects which parallel backend should be used for given set of arrays. + + Also checks that all arrays are of same chunking type (i.e. not a mix of cubed and dask). + """ + + # TODO this list is probably redundant with something inside xarray.apply_ufunc + ALLOWED_NON_CHUNKED_TYPES = {int, float, np.ndarray} + + chunked_array_types_found = { + type(a) + for a in args + if is_chunked_array(a) and type(a) not in ALLOWED_NON_CHUNKED_TYPES + } + + # Asserts all arrays are the same type (or numpy etc.) + if len(chunked_array_types_found) > 1: + raise TypeError( + f"Mixing chunked array types is not supported, but received types {chunked_array_types_found}" + ) + + (chunked_arr_type,) = chunked_array_types_found # iterate over defined chunk managers, seeing if each recognises this array type for chunkmanager in CHUNK_MANAGERS.values(): - if chunkmanager.is_array_type(arr): + if chunked_arr_type == chunkmanager.array_cls: return chunkmanager raise ChunkManagerNotFoundError( - f"Could not find a Chunk Manager which recognises type {type(arr)}" + f"Could not find a Chunk Manager which recognises type {chunked_arr_type}" ) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 8d83fa9c644..62de269846d 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -84,5 +84,6 @@ def is_duck_dask_array(x): def is_chunked_array(x): + # TODO this should not special-case the two libraries like this cubed_array_type = DuckArrayModule("cubed").type return is_duck_dask_array(x) or isinstance(x, cubed_array_type) From 42186e73318b2498c84b8e788df18d6a13aafa1c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 16:04:32 -0500 Subject: [PATCH 032/158] CubedManager.chunks --- xarray/core/parallelcompat.py | 3 +++ xarray/core/pycompat.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py 
b/xarray/core/parallelcompat.py index 53f8f60c642..43be9ed18f8 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -325,6 +325,9 @@ def __init__(self): self.array_cls = Array + def chunks(self, data: T_CubedArray) -> T_Chunks: + return data.chunks + def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: import cubed # type: ignore diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 62de269846d..77bc59ea560 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -30,7 +30,7 @@ class DuckArrayModule: available: bool def __init__(self, mod: ModType) -> None: - duck_array_module: ModuleType | None = None + duck_array_module: ModuleType | None duck_array_version: Version duck_array_type: DuckArrayTypes try: From 103a75512fd1b1af929c5c2480840dec907d423c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 16:12:04 -0500 Subject: [PATCH 033/158] attempt to keep dask and cubed imports lazy --- xarray/core/parallelcompat.py | 11 ++++++----- xarray/tests/test_plugins.py | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 43be9ed18f8..82c65200be8 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -21,8 +21,8 @@ def _get_chunk_manager(name: str) -> "ChunkManager": if name in CHUNK_MANAGERS: - chunkmanager = CHUNK_MANAGERS[name] - return chunkmanager + chunkmanager_cls = CHUNK_MANAGERS[name] + return chunkmanager_cls() else: raise ImportError(f"ChunkManager {name} has not been defined") @@ -52,7 +52,8 @@ def _detect_parallel_array_type(*args) -> "ChunkManager": (chunked_arr_type,) = chunked_array_types_found # iterate over defined chunk managers, seeing if each recognises this array type - for chunkmanager in CHUNK_MANAGERS.values(): + for chunkmanager_cls in CHUNK_MANAGERS.values(): + chunkmanager = chunkmanager_cls() if chunked_arr_type == chunkmanager.array_cls: 
return chunkmanager @@ -311,7 +312,7 @@ def blockwise( try: import dask - CHUNK_MANAGERS["dask"] = DaskManager() + CHUNK_MANAGERS["dask"] = DaskManager except ImportError: pass @@ -443,6 +444,6 @@ def apply_gufunc( try: import cubed # type: ignore - CHUNK_MANAGERS["cubed"] = CubedManager() + CHUNK_MANAGERS["cubed"] = CubedManager except ImportError: pass diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py index 2160b8d16ed..d521623531e 100644 --- a/xarray/tests/test_plugins.py +++ b/xarray/tests/test_plugins.py @@ -209,6 +209,7 @@ def test_lazy_import() -> None: "sparse", "cupy", "pint", + "cubed", ] # ensure that none of the above modules has been imported before modules_backup = {} From f2bce3da2c34ddf3b05833ced3eaf1765e3fb5d5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 16:53:16 -0500 Subject: [PATCH 034/158] generalize idxmax --- xarray/core/computation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index b42b66a8617..b2df8c9518f 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -2070,13 +2070,11 @@ def _calc_idxminmax( # This will run argmin or argmax. indx = func(array, dim=dim, axis=None, keep_attrs=keep_attrs, skipna=skipna) - # TODO generalize this to other types of chunked array - # Handle dask arrays. - if is_duck_dask_array(array.data): - import dask.array - + # Handle chunked arrays (e.g. dask). 
+ if is_chunked_array(array.data): + chunkmanager = _detect_parallel_array_type(array.data) chunks = dict(zip(array.dims, array.chunks)) - dask_coord = dask.array.from_array(array[dim].data, chunks=chunks[dim]) + dask_coord = chunkmanager.from_array(array[dim].data, chunks=chunks[dim]) res = indx.copy(data=dask_coord[indx.data.ravel()].reshape(indx.shape)) # we need to attach back the dim name res.name = dim From f09947db739a03ae643c29d4d5e75b4ddf68ac0d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 17:14:20 -0500 Subject: [PATCH 035/158] move unify_chunks import to ChunkManager --- xarray/core/computation.py | 11 ++++------- xarray/core/parallelcompat.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index b2df8c9518f..fe4f4e4a6b7 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -2161,17 +2161,14 @@ def unify_chunks(*objects: Dataset | DataArray) -> tuple[Dataset | DataArray, .. 
if not unify_chunks_args: return objects - # TODO generalize this to go through ChunkManager - # Run dask.array.core.unify_chunks - from dask.array.core import unify_chunks - - _, dask_data = unify_chunks(*unify_chunks_args) - dask_data_iter = iter(dask_data) + chunkmanager = _detect_parallel_array_type(*[arg for arg in unify_chunks_args]) + _, chunked_data = chunkmanager.unify_chunks(*unify_chunks_args) + chunked_data_iter = iter(chunked_data) out: list[Dataset | DataArray] = [] for obj, ds in zip(objects, datasets): for k, v in ds._variables.items(): if v.chunks is not None: - ds._variables[k] = v.copy(data=next(dask_data_iter)) + ds._variables[k] = v.copy(data=next(chunked_data_iter)) out.append(obj._from_temp_dataset(ds) if isinstance(obj, DataArray) else ds) return tuple(out) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 82c65200be8..fadbaf05b45 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -48,6 +48,8 @@ def _detect_parallel_array_type(*args) -> "ChunkManager": raise TypeError( f"Mixing chunked array types is not supported, but received types {chunked_array_types_found}" ) + elif len(chunked_array_types_found) == 0: + raise TypeError("Expected a chunked array type but none were found") (chunked_arr_type,) = chunked_array_types_found @@ -148,6 +150,12 @@ def blockwise( """Called by some niche functions in xarray.""" raise NotImplementedError() + def unify_chunks( + self, *args, **kwargs + ) -> tuple[dict[str, T_Chunks], list[T_ChunkedArray]]: + """Called by xr.unify_chunks.""" + raise NotImplementedError() + T_DaskArray = TypeVar("T_DaskArray", bound="dask.array.Array") @@ -308,6 +316,13 @@ def blockwise( **kwargs, ) + def unify_chunks( + self, *args, **kwargs + ) -> tuple[dict[str, T_Chunks], list[T_DaskArray]]: + from dask.array.core import unify_chunks + + return unify_chunks(*args, **kwargs) + try: import dask From e760f105d6f62ed75ed42fb4d3a8394e47945f94 Mon Sep 17 00:00:00 2001 From: 
Thomas Nicholas Date: Mon, 6 Mar 2023 17:42:40 -0500 Subject: [PATCH 036/158] generalize Dataset.load() --- xarray/core/dataset.py | 11 ++++++----- xarray/core/parallelcompat.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4b1cab12d27..d1f40c20014 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -73,7 +73,8 @@ ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import array_type, is_duck_dask_array +from xarray.core.parallelcompat import _detect_parallel_array_type +from xarray.core.pycompat import array_type, is_chunked_array, is_duck_dask_array from xarray.core.types import QuantileMethods, T_Dataset from xarray.core.utils import ( Default, @@ -744,13 +745,13 @@ def load(self: T_Dataset, **kwargs) -> T_Dataset: """ # access .data to coerce everything to numpy or dask arrays lazy_data = { - k: v._data for k, v in self.variables.items() if is_duck_dask_array(v._data) + k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) } if lazy_data: - import dask.array as da + chunkmanager = _detect_parallel_array_type(*lazy_data.values()) - # evaluate all the dask arrays simultaneously - evaluated_data = da.compute(*lazy_data.values(), **kwargs) + # evaluate all the chunked arrays simultaneously + evaluated_data = chunkmanager.compute(*lazy_data.values(), **kwargs) for k, data in zip(lazy_data, evaluated_data): self.variables[k].data = data diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index fadbaf05b45..0925db5932c 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -216,11 +216,14 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_DaskArray: ) return data + # TODO is simple method propagation like this necessary? 
def rechunk(self, data: T_DaskArray, chunks, **kwargs) -> T_DaskArray: return data.rechunk(chunks, **kwargs) - def compute(self, data: T_DaskArray, **kwargs) -> np.ndarray: - return data.compute(**kwargs) + def compute(self, *data: T_DaskArray, **kwargs) -> np.ndarray: + from dask.array import compute + + return compute(*data, **kwargs) def apply_gufunc( self, @@ -365,8 +368,10 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: def rechunk(self, data: T_CubedArray, chunks, **kwargs) -> T_CubedArray: return data.rechunk(chunks, **kwargs) - def compute(self, data: T_CubedArray, **kwargs) -> np.ndarray: - return data.compute(**kwargs) + def compute(self, *data: T_CubedArray, **kwargs) -> np.ndarray: + from cubed import compute + + return compute(*data, **kwargs) def map_blocks( self, From b1a4e35c4153334032b0ec3337f870c4a5992caf Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 18:06:22 -0500 Subject: [PATCH 037/158] check explicitly for chunks attribute instead of hard-coding cubed --- xarray/core/pycompat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 77bc59ea560..f4ee1422486 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -84,6 +84,4 @@ def is_duck_dask_array(x): def is_chunked_array(x): - # TODO this should not special-case the two libraries like this - cubed_array_type = DuckArrayModule("cubed").type - return is_duck_dask_array(x) or isinstance(x, cubed_array_type) + return is_duck_dask_array(x) or hasattr(x, "chunks") From 5320f4d73aec2e4dec131ec3e24a813f2045ffc1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 18:16:59 -0500 Subject: [PATCH 038/158] better function names --- xarray/core/computation.py | 8 ++++---- xarray/core/dataset.py | 4 ++-- xarray/core/parallelcompat.py | 4 ++-- xarray/core/variable.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/xarray/core/computation.py b/xarray/core/computation.py index fe4f4e4a6b7..7b775d69e80 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -29,7 +29,7 @@ from xarray.core.indexes import Index, filter_indexes_from_coords from xarray.core.merge import merge_attrs, merge_coordinates_without_align from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import _detect_parallel_array_type +from xarray.core.parallelcompat import get_chunked_array_type from xarray.core.pycompat import is_chunked_array, is_duck_dask_array from xarray.core.types import Dims, T_DataArray from xarray.core.utils import is_dict_like, is_scalar @@ -694,7 +694,7 @@ def apply_variable_ufunc( "``.load()`` or ``.compute()``" ) elif dask == "parallelized": - chunk_manager = _detect_parallel_array_type(*input_data) + chunk_manager = get_chunked_array_type(*input_data) numpy_func = func @@ -2072,7 +2072,7 @@ def _calc_idxminmax( # Handle chunked arrays (e.g. dask). if is_chunked_array(array.data): - chunkmanager = _detect_parallel_array_type(array.data) + chunkmanager = get_chunked_array_type(array.data) chunks = dict(zip(array.dims, array.chunks)) dask_coord = chunkmanager.from_array(array[dim].data, chunks=chunks[dim]) res = indx.copy(data=dask_coord[indx.data.ravel()].reshape(indx.shape)) @@ -2161,7 +2161,7 @@ def unify_chunks(*objects: Dataset | DataArray) -> tuple[Dataset | DataArray, .. 
if not unify_chunks_args: return objects - chunkmanager = _detect_parallel_array_type(*[arg for arg in unify_chunks_args]) + chunkmanager = get_chunked_array_type(*[arg for arg in unify_chunks_args]) _, chunked_data = chunkmanager.unify_chunks(*unify_chunks_args) chunked_data_iter = iter(chunked_data) out: list[Dataset | DataArray] = [] diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d1f40c20014..c8a87a9f013 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -73,7 +73,7 @@ ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import _detect_parallel_array_type +from xarray.core.parallelcompat import get_chunked_array_type from xarray.core.pycompat import array_type, is_chunked_array, is_duck_dask_array from xarray.core.types import QuantileMethods, T_Dataset from xarray.core.utils import ( @@ -748,7 +748,7 @@ def load(self: T_Dataset, **kwargs) -> T_Dataset: k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) } if lazy_data: - chunkmanager = _detect_parallel_array_type(*lazy_data.values()) + chunkmanager = get_chunked_array_type(*lazy_data.values()) # evaluate all the chunked arrays simultaneously evaluated_data = chunkmanager.compute(*lazy_data.values(), **kwargs) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 0925db5932c..d5132f11cfe 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -19,7 +19,7 @@ CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} -def _get_chunk_manager(name: str) -> "ChunkManager": +def get_chunkmanager(name: str) -> "ChunkManager": if name in CHUNK_MANAGERS: chunkmanager_cls = CHUNK_MANAGERS[name] return chunkmanager_cls() @@ -27,7 +27,7 @@ def _get_chunk_manager(name: str) -> "ChunkManager": raise ImportError(f"ChunkManager {name} has not been defined") -def _detect_parallel_array_type(*args) -> "ChunkManager": +def 
get_chunked_array_type(*args) -> "ChunkManager": """ Detects which parallel backend should be used for given set of arrays. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6e3b30b7c4a..6c0130a59c6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -27,7 +27,7 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import _get_chunk_manager +from xarray.core.parallelcompat import get_chunkmanager from xarray.core.pycompat import ( DuckArrayModule, array_type, @@ -1192,7 +1192,7 @@ def chunk( if from_array_kwargs is None: from_array_kwargs = {} - chunk_manager = _get_chunk_manager(from_array_kwargs.pop("manager", "dask")) + chunk_manager = get_chunkmanager(from_array_kwargs.pop("manager", "dask")) _from_array_kwargs = dict( name=name, lock=lock, inline_array=inline_array, **from_array_kwargs From 45ed5d21f4d888daba6ccd64d2e0b353ddd71baf Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 18:18:01 -0500 Subject: [PATCH 039/158] add cubed version of unify_chunks --- xarray/core/parallelcompat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index d5132f11cfe..53505b3660d 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -460,6 +460,13 @@ def apply_gufunc( **kwargs, ) + def unify_chunks( + self, *args, **kwargs + ) -> tuple[dict[str, T_Chunks], list[T_CubedArray]]: + from cubed.core import unify_chunks + + return unify_chunks(*args, **kwargs) + try: import cubed # type: ignore From eec096b5f6f9ba687167adeb1bdf288a8cee6c3d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 18:59:33 -0500 Subject: [PATCH 040/158] recognize wrapped duck dask arrays (e.g. 
pint wrapping dask) --- xarray/core/parallelcompat.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 53505b3660d..69d5b31c0ad 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -37,30 +37,30 @@ def get_chunked_array_type(*args) -> "ChunkManager": # TODO this list is probably redundant with something inside xarray.apply_ufunc ALLOWED_NON_CHUNKED_TYPES = {int, float, np.ndarray} - chunked_array_types_found = { - type(a) + chunked_arrays = [ + a for a in args if is_chunked_array(a) and type(a) not in ALLOWED_NON_CHUNKED_TYPES - } + ] # Asserts all arrays are the same type (or numpy etc.) - if len(chunked_array_types_found) > 1: + chunked_array_types = {type(a) for a in chunked_arrays} + if len(chunked_array_types) > 1: raise TypeError( - f"Mixing chunked array types is not supported, but received types {chunked_array_types_found}" + f"Mixing chunked array types is not supported, but received multiple types: {chunked_array_types}" ) - elif len(chunked_array_types_found) == 0: - raise TypeError("Expected a chunked array type but none were found") - - (chunked_arr_type,) = chunked_array_types_found + elif len(chunked_array_types) == 0: + raise TypeError("Expected a chunked array but none were found") # iterate over defined chunk managers, seeing if each recognises this array type + chunked_arr = chunked_arrays[0] for chunkmanager_cls in CHUNK_MANAGERS.values(): chunkmanager = chunkmanager_cls() - if chunked_arr_type == chunkmanager.array_cls: + if chunkmanager.is_chunked_array(chunked_arr): return chunkmanager raise ChunkManagerNotFoundError( - f"Could not find a Chunk Manager which recognises type {chunked_arr_type}" + f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}" ) @@ -87,7 +87,7 @@ class ChunkManager(ABC, Generic[T_ChunkedArray]): def __init__(self): ... 
- def is_array_type(self, data: Any) -> bool: + def is_chunked_array(self, data: Any) -> bool: return isinstance(data, self.array_cls) @abstractmethod @@ -168,6 +168,9 @@ def __init__(self): self.array_cls = Array + def is_chunked_array(self, data: Any) -> bool: + return is_duck_dask_array(data) + def chunks(self, data: T_DaskArray) -> T_Chunks: return data.chunks From c64ff5f360c6d3a72c8d03cd3f4aa9b8b21d05c9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 6 Mar 2023 21:06:43 -0500 Subject: [PATCH 041/158] add some tests for fetching ChunkManagers --- xarray/core/parallelcompat.py | 7 +- xarray/core/types.py | 3 + xarray/tests/test_parallelcompat.py | 152 ++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 xarray/tests/test_parallelcompat.py diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 69d5b31c0ad..724406f2930 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -7,14 +7,13 @@ from typing import Any, Generic, TypeVar import numpy as np -from typing_extensions import TypeAlias from xarray.core import indexing, utils from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array +from xarray.core.types import T_Chunks T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") -T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} @@ -43,6 +42,8 @@ def get_chunked_array_type(*args) -> "ChunkManager": if is_chunked_array(a) and type(a) not in ALLOWED_NON_CHUNKED_TYPES ] + print(chunked_arrays) + # Asserts all arrays are the same type (or numpy etc.) chunked_array_types = {type(a) for a in chunked_arrays} if len(chunked_array_types) > 1: @@ -164,6 +165,8 @@ class DaskManager(ChunkManager[T_DaskArray]): array_cls: T_DaskArray def __init__(self): + # TODO can we replace this with a class attribute instead? 
+ from dask.array import Array self.array_cls = Array diff --git a/xarray/core/types.py b/xarray/core/types.py index 56f7ca74a71..642b4631f5c 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd from packaging.version import Version +from typing_extensions import TypeAlias if TYPE_CHECKING: from numpy._typing import _SupportsDType @@ -107,7 +108,9 @@ VarCompatible = Union["Variable", "ScalarOrArray"] GroupByIncompatible = Union["Variable", "GroupBy"] +T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] T_CubedSpec = TypeVar("T_CubedSpec", bound="Spec") + Dims = Union[str, Iterable[Hashable], "ellipsis", None] OrderedDims = Union[str, Sequence[Union[Hashable, "ellipsis"]], "ellipsis", None] diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py new file mode 100644 index 00000000000..faccabe2515 --- /dev/null +++ b/xarray/tests/test_parallelcompat.py @@ -0,0 +1,152 @@ +from typing import Any, Optional + +import numpy as np +import pytest + +from xarray.core.parallelcompat import ( + CHUNK_MANAGERS, + ChunkManager, + DaskManager, + get_chunked_array_type, + get_chunkmanager, +) +from xarray.core.types import T_Chunks + +dask = pytest.importorskip("dask") + + +class DummyChunkedArray(np.ndarray): + """ + Mock-up of a chunked array class. 
+ + Adds a (non-functional) .chunks attribute by following this example in the numpy docs + https://numpy.org/doc/stable/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray + """ + + chunks: Optional[T_Chunks] + + def __new__( + cls, + shape, + dtype=float, + buffer=None, + offset=0, + strides=None, + order=None, + chunks=None, + ): + obj = super().__new__(cls, shape, dtype, buffer, offset, strides, order) + obj.chunks = chunks + return obj + + def __array_finalize__(self, obj): + if obj is None: + return + self.chunks = getattr(obj, "chunks", None) + + +class DummyChunkManager(ChunkManager): + """Mock-up of ChunkManager class for DummyChunkedArray""" + + def __init__(self): + self.array_cls = DummyChunkedArray + + def is_chunked_array(self, data: Any) -> bool: + return isinstance(data, DummyChunkedArray) + + def chunks(self, data: DummyChunkedArray) -> T_Chunks: + return data.chunks + + def from_array( + self, data: np.ndarray, chunks: T_Chunks, **kwargs + ) -> DummyChunkedArray: + from dask import array as da + + return da.from_array(data, chunks, **kwargs) + + def rechunk(self, data: DummyChunkedArray, chunks, **kwargs) -> DummyChunkedArray: + return data.rechunk(chunks, **kwargs) + + def compute(self, *data: DummyChunkedArray, **kwargs) -> np.ndarray: + from dask.array import compute + + return compute(*data, **kwargs) + + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + axis=None, + keepdims=False, + output_dtypes=None, + output_sizes=None, + vectorize=None, + allow_rechunk=False, + meta=None, + **kwargs, + ): + from dask.array.gufunc import apply_gufunc + + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + keepdims=keepdims, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + allow_rechunk=allow_rechunk, + meta=meta, + **kwargs, + ) + + +class TestGetChunkManager: + # TODO do these need setups and teardowns? 
+ + def test_get_chunkmanger(self): + CHUNK_MANAGERS["dummy"] = DummyChunkManager + + chunkmanager = get_chunkmanager("dummy") + assert isinstance(chunkmanager, DummyChunkManager) + + def test_fail_on_nonexistent_chunkmanager(self): + with pytest.raises(ImportError, match="nonsense has not been defined"): + get_chunkmanager("nonsense") + + +class TestGetChunkedArrayType: + def test_detect_chunked_arrays(self): + CHUNK_MANAGERS["dummy"] = DummyChunkManager + dummy_arr = DummyChunkedArray([1, 2, 3]) + + chunk_manager = get_chunked_array_type(dummy_arr) + assert isinstance(chunk_manager, DummyChunkManager) + + def test_ignore_inmemory_arrays(self): + CHUNK_MANAGERS["dummy"] = DummyChunkManager + dummy_arr = DummyChunkedArray([1, 2, 3]) + + chunk_manager = get_chunked_array_type(*[dummy_arr, 1.0, np.array([5, 6])]) + assert isinstance(chunk_manager, DummyChunkManager) + + with pytest.raises(TypeError, match="Expected a chunked array"): + get_chunked_array_type(5.0) + + def test_detect_dask_by_default(self): + dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) + + chunk_manager = get_chunked_array_type(dask_arr) + assert isinstance(chunk_manager, DaskManager) + + def test_raise_on_mixed_types(self): + CHUNK_MANAGERS["dummy"] = DummyChunkManager + dummy_arr = DummyChunkedArray([1, 2, 3]) + dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) + + with pytest.raises(TypeError, match="received multiple types"): + get_chunked_array_type(*[dask_arr, dummy_arr]) From 8a3790512e545b72e1f33b8ea96a454d4fd3a561 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 8 Mar 2023 19:54:58 -0500 Subject: [PATCH 042/158] add from_array_kwargs to open_dataset --- xarray/backends/api.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8891ac2986b..dd8bcb5f155 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -298,6 +298,7 @@ def _chunk_ds( chunks, overwrite_encoded_chunks, inline_array, + 
from_array_kwargs, **extra_tokens, ): from dask.base import tokenize @@ -317,6 +318,7 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, + from_array_kwargs=from_array_kwargs, ) return backend_ds._replace(variables) @@ -329,6 +331,7 @@ def _dataset_from_backend_dataset( cache, overwrite_encoded_chunks, inline_array, + from_array_kwargs, **extra_tokens, ): if not isinstance(chunks, (int, dict)) and chunks not in {None, "auto"}: @@ -347,6 +350,7 @@ def _dataset_from_backend_dataset( chunks, overwrite_encoded_chunks, inline_array, + from_array_kwargs, **extra_tokens, ) @@ -374,6 +378,7 @@ def open_dataset( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + from_array_kwargs=None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Dataset: @@ -537,6 +542,7 @@ def open_dataset( cache, overwrite_encoded_chunks, inline_array, + from_array_kwargs, drop_variables=drop_variables, **decoders, **kwargs, @@ -559,6 +565,7 @@ def open_dataarray( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + from_array_kwargs: dict[str, Any] = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataArray: @@ -696,6 +703,7 @@ def open_dataarray( cache=cache, drop_variables=drop_variables, inline_array=inline_array, + from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, From 989d6bbeed59c81f2925570de387a43c17874d1e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 8 Mar 2023 19:59:23 -0500 Subject: [PATCH 043/158] add from_array_kwargs to open_zarr --- xarray/backends/zarr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3b0335aa5a6..36b217d5314 100644 --- a/xarray/backends/zarr.py +++ 
b/xarray/backends/zarr.py @@ -702,6 +702,7 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, + from_array_kwargs=None, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -837,6 +838,7 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, + from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, decode_timedelta=decode_timedelta, use_cftime=use_cftime, From 8c7fe7915c1fa646dce17acfe87b749a162bfbc1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 9 Mar 2023 13:43:15 -0500 Subject: [PATCH 044/158] pipe constructors through chunkmanager --- xarray/core/common.py | 115 ++++++++++++++++++++++++++-------- xarray/core/parallelcompat.py | 16 ++++- 2 files changed, 102 insertions(+), 29 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 3a73f463ea9..8043c4e6639 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -12,7 +12,8 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array, is_duck_dask_array from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar try: @@ -1366,28 +1367,40 @@ def __getitem__(self, value): @overload def full_like( - other: DataArray, fill_value: Any, dtype: DTypeLikeSave = None + other: DataArray, + fill_value: Any, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, ) -> DataArray: ... @overload def full_like( - other: Dataset, fill_value: Any, dtype: DTypeMaybeMapping = None + other: Dataset, + fill_value: Any, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset: ... 
@overload def full_like( - other: Variable, fill_value: Any, dtype: DTypeLikeSave = None + other: Variable, + fill_value: Any, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, ) -> Variable: ... @overload def full_like( - other: Dataset | DataArray, fill_value: Any, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + fill_value: Any, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray: ... @@ -1397,6 +1410,7 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: ... @@ -1405,6 +1419,7 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: """Return a new object with the same shape and type as a given object. @@ -1532,7 +1547,10 @@ def full_like( data_vars = { k: _full_like_variable( - v.variable, fill_value.get(k, dtypes.NA), dtype_.get(k, None) + v.variable, + fill_value.get(k, dtypes.NA), + dtype_.get(k, None), + from_array_kwargs, ) for k, v in other.data_vars.items() } @@ -1541,7 +1559,7 @@ def full_like( if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a DataArray") return DataArray( - _full_like_variable(other.variable, fill_value, dtype), + _full_like_variable(other.variable, fill_value, dtype, from_array_kwargs), dims=other.dims, coords=other.coords, attrs=other.attrs, @@ -1550,13 +1568,16 @@ def full_like( elif isinstance(other, Variable): if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a Variable") - return _full_like_variable(other, fill_value, dtype) + return _full_like_variable(other, fill_value, dtype, from_array_kwargs) else: raise TypeError("Expected DataArray, Dataset, or Variable") def _full_like_variable( - other: 
Variable, fill_value: Any, dtype: DTypeLike = None + other: Variable, + fill_value: Any, + dtype: DTypeLike = None, + from_array_kwargs: dict[str, Any] = None, ) -> Variable: """Inner function of full_like, where other must be a variable""" from xarray.core.variable import Variable @@ -1564,13 +1585,17 @@ def _full_like_variable( if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype) - if is_duck_dask_array(other.data): - import dask.array + if is_chunked_array(other.data): + chunkmanager = get_chunked_array_type(other.data) if dtype is None: dtype = other.dtype - data = dask.array.full( - other.shape, fill_value, dtype=dtype, chunks=other.data.chunks + data = chunkmanager.array_api.full( + other.shape, + fill_value, + dtype=dtype, + chunks=other.data.chunks, + from_array_kwargs=from_array_kwargs, ) else: data = np.full_like(other.data, fill_value, dtype=dtype) @@ -1579,36 +1604,54 @@ def _full_like_variable( @overload -def zeros_like(other: DataArray, dtype: DTypeLikeSave = None) -> DataArray: +def zeros_like( + other: DataArray, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, +) -> DataArray: ... @overload -def zeros_like(other: Dataset, dtype: DTypeMaybeMapping = None) -> Dataset: +def zeros_like( + other: Dataset, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, +) -> Dataset: ... @overload -def zeros_like(other: Variable, dtype: DTypeLikeSave = None) -> Variable: +def zeros_like( + other: Variable, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, +) -> Variable: ... @overload def zeros_like( - other: Dataset | DataArray, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray: ... 
@overload def zeros_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: ... def zeros_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: """Return a new object of zeros with the same shape and type as a given dataarray or dataset. @@ -1662,40 +1705,58 @@ def zeros_like( full_like """ - return full_like(other, 0, dtype) + return full_like(other, 0, dtype, from_array_kwargs) @overload -def ones_like(other: DataArray, dtype: DTypeLikeSave = None) -> DataArray: +def ones_like( + other: DataArray, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, +) -> DataArray: ... @overload -def ones_like(other: Dataset, dtype: DTypeMaybeMapping = None) -> Dataset: +def ones_like( + other: Dataset, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, +) -> Dataset: ... @overload -def ones_like(other: Variable, dtype: DTypeLikeSave = None) -> Variable: +def ones_like( + other: Variable, + dtype: DTypeLikeSave = None, + from_array_kwargs: dict[str, Any] = None, +) -> Variable: ... @overload def ones_like( - other: Dataset | DataArray, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray: ... @overload def ones_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: ... 
def ones_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping = None, + from_array_kwargs: dict[str, Any] = None, ) -> Dataset | DataArray | Variable: """Return a new object of ones with the same shape and type as a given dataarray or dataset. @@ -1741,7 +1802,7 @@ def ones_like( full_like """ - return full_like(other, 1, dtype) + return full_like(other, 1, dtype, from_array_kwargs) def get_chunksizes( diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 724406f2930..67b168ff1f2 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -42,8 +42,6 @@ def get_chunked_array_type(*args) -> "ChunkManager": if is_chunked_array(a) and type(a) not in ALLOWED_NON_CHUNKED_TYPES ] - print(chunked_arrays) - # Asserts all arrays are the same type (or numpy etc.) chunked_array_types = {type(a) for a in chunked_arrays} if len(chunked_array_types) > 1: @@ -111,6 +109,10 @@ def rechunk( def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: ... 
+ def array_api(self) -> Any: + """Return the array_api namespace following the python array API standard.""" + raise NotImplementedError() + @abstractmethod def apply_gufunc( self, @@ -231,6 +233,11 @@ def compute(self, *data: T_DaskArray, **kwargs) -> np.ndarray: return compute(*data, **kwargs) + def array_api(self) -> Any: + from dask import array as da + + return da + def apply_gufunc( self, func, @@ -379,6 +386,11 @@ def compute(self, *data: T_CubedArray, **kwargs) -> np.ndarray: return compute(*data, **kwargs) + def array_api(self) -> Any: + from cubed import array_api + + return array_api + def map_blocks( self, func, From 0222b55c012d982cf313af15163d6113a3d10a77 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 9 Mar 2023 13:53:43 -0500 Subject: [PATCH 045/158] generalize map_blocks inside coding --- xarray/coding/strings.py | 14 +++++++------- xarray/coding/variables.py | 11 ++++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index 61b3ab7c46c..ffe1b1a8d50 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -14,7 +14,7 @@ unpack_for_encoding, ) from xarray.core import indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.variable import Variable @@ -134,10 +134,10 @@ def bytes_to_char(arr): if arr.dtype.kind != "S": raise ValueError("argument must have a fixed-width bytes dtype") - if is_duck_dask_array(arr): - import dask.array as da + if is_chunked_array(arr): + chunkmanager = get_chunked_array_type(arr) - return da.map_blocks( + return chunkmanager.map_blocks( _numpy_bytes_to_char, arr, dtype="S1", @@ -169,8 +169,8 @@ def char_to_bytes(arr): # can't make an S0 dtype return np.zeros(arr.shape[:-1], dtype=np.string_) - if is_duck_dask_array(arr): - import dask.array as da + if is_chunked_array(arr): + chunkmanager = get_chunked_array_type(arr) if 
len(arr.chunks[-1]) > 1: raise ValueError( @@ -179,7 +179,7 @@ def char_to_bytes(arr): ) dtype = np.dtype("S" + str(arr.shape[-1])) - return da.map_blocks( + return chunkmanager.map_blocks( _numpy_char_to_bytes, arr, dtype=dtype, diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c290307b4b6..93cdb4e0ae3 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -10,7 +10,8 @@ import pandas as pd from xarray.core import dtypes, duck_array_ops, indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array from xarray.core.variable import Variable if TYPE_CHECKING: @@ -57,7 +58,7 @@ class _ElementwiseFunctionArray(indexing.ExplicitlyIndexedNDArrayMixin): """ def __init__(self, array, func: Callable, dtype: np.typing.DTypeLike): - assert not is_duck_dask_array(array) + assert not is_chunked_array(array) self.array = indexing.as_indexable(array) self.func = func self._dtype = dtype @@ -93,10 +94,10 @@ def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): ------- Either a dask.array.Array or _ElementwiseFunctionArray. 
""" - if is_duck_dask_array(array): - import dask.array as da + if is_chunked_array(array): + chunkmanager = get_chunked_array_type(array) - return da.map_blocks(func, array, dtype=dtype) + return chunkmanager.map_blocks(func, array, dtype=dtype) else: return _ElementwiseFunctionArray(array, func, dtype) From afc6abcfd58e8451333e6a6333b52162893f9cc2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Mar 2023 19:25:44 +0000 Subject: [PATCH 046/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 6498dd8aebd..cf6ff65f2c3 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -15,7 +15,7 @@ from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.parallelcompat import get_chunked_array_type from xarray.core.pdcompat import _convert_base_to_offset -from xarray.core.pycompat import is_duck_dask_array, is_chunked_array +from xarray.core.pycompat import is_chunked_array from xarray.core.utils import ( Frozen, either_dict_or_kwargs, From 2c0cc268145b66f7d3f201f6f885c0f580bc995e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 13 Mar 2023 16:42:54 -0400 Subject: [PATCH 047/158] fixed full_like --- xarray/core/common.py | 6 +++++- xarray/core/parallelcompat.py | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 8043c4e6639..7ce69bc8e9a 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1590,12 +1590,16 @@ def _full_like_variable( if dtype is None: dtype = other.dtype + + if from_array_kwargs is None: + from_array_kwargs = {} + data = chunkmanager.array_api.full( other.shape, fill_value, dtype=dtype, chunks=other.data.chunks, - from_array_kwargs=from_array_kwargs, + **from_array_kwargs, ) else: 
data = np.full_like(other.data, fill_value, dtype=dtype) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 67b168ff1f2..d178aeb5dfe 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -109,6 +109,7 @@ def rechunk( def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: ... + @property def array_api(self) -> Any: """Return the array_api namespace following the python array API standard.""" raise NotImplementedError() @@ -233,6 +234,7 @@ def compute(self, *data: T_DaskArray, **kwargs) -> np.ndarray: return compute(*data, **kwargs) + @property def array_api(self) -> Any: from dask import array as da @@ -386,6 +388,7 @@ def compute(self, *data: T_CubedArray, **kwargs) -> np.ndarray: return compute(*data, **kwargs) + @property def array_api(self) -> Any: from cubed import array_api From c398d987a6d5959a0e8b7999f88ba1b183b25374 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 13 Mar 2023 17:54:37 -0400 Subject: [PATCH 048/158] add from_array_kwargs to open_zarr --- xarray/backends/zarr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 36b217d5314..1c2f420311f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -17,6 +17,7 @@ ) from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing +from xarray.core.parallelcompat import get_chunkmanager from xarray.core.pycompat import integer_types from xarray.core.utils import ( FrozenDict, @@ -804,9 +805,13 @@ def open_zarr( """ from xarray.backends.api import open_dataset + if from_array_kwargs is None: + from_array_kwargs = {} + if chunks == "auto": + manager = from_array_kwargs.get("manager", "dask") try: - import dask.array # noqa + get_chunkmanager(manager) # attempt to import that parallel backend chunks = {} except ImportError: From 598bf12f53f3a5763ba44d141712c6d0da7a72ba Mon Sep 17 00:00:00 2001 From: Thomas Nicholas 
Date: Tue, 14 Mar 2023 00:23:15 -0400 Subject: [PATCH 049/158] don't import dask.tokenize --- xarray/backends/api.py | 16 ++++++++++++---- xarray/backends/zarr.py | 2 +- xarray/core/common.py | 2 ++ xarray/core/dataset.py | 28 +++++++++++++++++++--------- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dd8bcb5f155..8fc81a611e2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -301,11 +301,16 @@ def _chunk_ds( from_array_kwargs, **extra_tokens, ): - from dask.base import tokenize + if from_array_kwargs["manager"] == "dask": + from dask.base import tokenize - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = f"open_dataset-{token}" + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = f"open_dataset-{token}" + else: + # not used + token = (None,) + name_prefix = None variables = {} for name, var in backend_ds.variables.items(): @@ -514,6 +519,9 @@ def open_dataset( if engine is None: engine = plugins.guess_engine(filename_or_obj) + if from_array_kwargs is None: + from_array_kwargs = {"manager": "dask"} + backend = plugins.get_backend(engine) decoders = _resolve_decoders_kwargs( diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1c2f420311f..e877306842c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -806,7 +806,7 @@ def open_zarr( from xarray.backends.api import open_dataset if from_array_kwargs is None: - from_array_kwargs = {} + from_array_kwargs = {"manager": "dask"} if chunks == "auto": manager = from_array_kwargs.get("manager", "dask") diff --git a/xarray/core/common.py b/xarray/core/common.py index 24a2308aea1..532ab136db1 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1619,6 +1619,8 @@ def _full_like_variable( if dtype is None: dtype = other.dtype + raise 
NotImplementedError() + if from_array_kwargs is None: from_array_kwargs = {} diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d8e1f831747..d7b9b53e677 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -208,7 +208,7 @@ def _get_chunk(var, chunks): Return map from each dim to chunk sizes, accounting for backend's preferred chunks. """ - import dask.array as da + from dask.array.core import normalize_chunks if isinstance(var, IndexVariable): return {} @@ -226,7 +226,9 @@ def _get_chunk(var, chunks): chunks.get(dim, None) or preferred_chunk_sizes for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape) ) - chunk_shape = da.core.normalize_chunks( + + # TODO ideally replace this with non-dask version + chunk_shape = normalize_chunks( chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape ) @@ -273,16 +275,21 @@ def _maybe_chunk( inline_array=False, from_array_kwargs=None, ): - from dask.base import tokenize - if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - # when rechunking by different amounts, make sure dask names change - # by providing chunks as an input to tokenize. - # subtle bugs result otherwise. see GH3350 - token2 = tokenize(name, token if token else var._data, chunks) - name2 = f"{name_prefix}{name}-{token2}" + if from_array_kwargs["manager"] == "dask": + from dask.base import tokenize + + # when rechunking by different amounts, make sure dask names change + # by providing chunks as an input to tokenize. + # subtle bugs result otherwise. 
see GH3350 + token2 = tokenize(name, token if token else var._data, chunks) + name2 = f"{name_prefix}{name}-{token2}" + else: + # not used + name2 = None + var = var.chunk( chunks, name=name2, @@ -2268,6 +2275,9 @@ def chunk( f"some chunks keys are not dimensions on this object: {bad_dims}" ) + if from_array_kwargs is None: + from_array_kwargs = {"manager": "dask"} + variables = { k: _maybe_chunk( k, From 7bef18877c7a9ed9505e389c3ba99165159ac3b2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 12:46:50 -0400 Subject: [PATCH 050/158] fix bugs with passing from_array_kwargs down --- xarray/backends/api.py | 2 +- xarray/core/common.py | 2 -- xarray/core/dataset.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8fc81a611e2..17d94ede52c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -323,7 +323,7 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, - from_array_kwargs=from_array_kwargs, + from_array_kwargs=from_array_kwargs.copy(), ) return backend_ds._replace(variables) diff --git a/xarray/core/common.py b/xarray/core/common.py index 532ab136db1..24a2308aea1 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1619,8 +1619,6 @@ def _full_like_variable( if dtype is None: dtype = other.dtype - raise NotImplementedError() - if from_array_kwargs is None: from_array_kwargs = {} diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d7b9b53e677..148e8d08845 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2287,7 +2287,7 @@ def chunk( lock, name_prefix, inline_array=inline_array, - from_array_kwargs=from_array_kwargs, + from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() } From 7af539578054bbf871283a7b454f473a5c6a59c7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 13:33:43 -0400 Subject: [PATCH 051/158] generalise reductions by adding 
to chunkmanager --- xarray/core/dask_array_ops.py | 9 +++--- xarray/core/duck_array_ops.py | 5 +-- xarray/core/parallelcompat.py | 60 ++++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 24c5f698a27..84c99cd4f63 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -5,6 +5,7 @@ from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] from xarray.core import dtypes, nputils +from xarray.core.parallelcompat import get_chunked_array_type def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): @@ -103,16 +104,16 @@ def _first_last_wrapper(array, *, axis, op, keepdims): def _first_or_last(darray, axis, op): - import dask.array + chunkmanager = get_chunked_array_type(darray) # This will raise the same error message seen for numpy axis = normalize_axis_index(axis, darray.ndim) wrapped_op = partial(_first_last_wrapper, op=op) - return dask.array.reduction( + return chunkmanager.reduction( darray, - chunk=wrapped_op, - aggregate=wrapped_op, + func=wrapped_op, + aggregate_func=wrapped_op, axis=axis, dtype=darray.dtype, keepdims=False, # match numpy version diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 84e66803fe8..998d697fa3b 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -33,6 +33,7 @@ from xarray.core import dask_array_ops, dtypes, nputils from xarray.core.nputils import nanfirst, nanlast +from xarray.core.parallelcompat import is_chunked_array from xarray.core.pycompat import array_type, is_duck_dask_array from xarray.core.utils import is_duck_array, module_available @@ -640,7 +641,7 @@ def first(values, axis, skipna=None): """Return the first non-NA elements in this array along the given axis""" if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN - if 
is_duck_dask_array(values): + if is_chunked_array(values): return dask_array_ops.nanfirst(values, axis) else: return nanfirst(values, axis) @@ -651,7 +652,7 @@ def last(values, axis, skipna=None): """Return the last non-NA elements in this array along the given axis""" if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN - if is_duck_dask_array(values): + if is_chunked_array(values): return dask_array_ops.nanlast(values, axis) else: return nanlast(values, axis) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index d178aeb5dfe..8ce8b9d5e8b 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -4,7 +4,8 @@ but for now it is just a private experiment. """ from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar +from collections.abc import Sequence +from typing import Any, Callable, Generic, TypeVar, Union import numpy as np @@ -114,6 +115,19 @@ def array_api(self) -> Any: """Return the array_api namespace following the python array API standard.""" raise NotImplementedError() + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Callable = None, + aggregate_func: Callable = None, + axis: Union[int, Sequence[int]] = None, + dtype: np.dtype = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + """Used in some reductions like nanfirst, which is used by groupby.first""" + raise NotImplementedError() + @abstractmethod def apply_gufunc( self, @@ -240,6 +254,28 @@ def array_api(self) -> Any: return da + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Callable = None, + aggregate_func: Callable = None, + axis: Union[int, Sequence[int]] = None, + dtype: np.dtype = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + from dask.array import reduction + + return reduction( + arr, + chunk=func, + combine=combine_func, + aggregate=aggregate_func, + axis=axis, + dtype=dtype, + 
keepdims=keepdims, + ) + def apply_gufunc( self, func, @@ -394,6 +430,28 @@ def array_api(self) -> Any: return array_api + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Callable = None, + aggregate_func: Callable = None, + axis: Union[int, Sequence[int]] = None, + dtype: np.dtype = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + from cubed.core.ops import reduction + + return reduction( + arr, + func=func, + combine_func=combine_func, + aggegrate_func=aggregate_func, # TODO fix the typo in argument name in cubed + axis=axis, + dtype=dtype, + keepdims=keepdims, + ) + def map_blocks( self, func, From 287e96ca79dd822d585e7079e27a4a88cb0382f9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 13:40:59 -0400 Subject: [PATCH 052/158] moved nanfirst/nanlast to duck_array_ops from dask_array_ops --- xarray/core/dask_array_ops.py | 38 ------------------------------- xarray/core/duck_array_ops.py | 42 ++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 84c99cd4f63..d2d3e4a6d1c 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,11 +1,6 @@ from __future__ import annotations -from functools import partial - -from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] - from xarray.core import dtypes, nputils -from xarray.core.parallelcompat import get_chunked_array_type def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): @@ -97,36 +92,3 @@ def _fill_with_last_one(a, b): axis=axis, dtype=array.dtype, ) - - -def _first_last_wrapper(array, *, axis, op, keepdims): - return op(array, axis, keepdims=keepdims) - - -def _first_or_last(darray, axis, op): - chunkmanager = get_chunked_array_type(darray) - - # This will raise the same error message seen for numpy - axis = normalize_axis_index(axis, darray.ndim) - - wrapped_op = 
partial(_first_last_wrapper, op=op) - return chunkmanager.reduction( - darray, - func=wrapped_op, - aggregate_func=wrapped_op, - axis=axis, - dtype=darray.dtype, - keepdims=False, # match numpy version - ) - - -def nanfirst(darray, axis): - from xarray.core.duck_array_ops import nanfirst - - return _first_or_last(darray, axis, op=nanfirst) - - -def nanlast(darray, axis): - from xarray.core.duck_array_ops import nanlast - - return _first_or_last(darray, axis, op=nanlast) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 998d697fa3b..c54d409585e 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -9,6 +9,7 @@ import datetime import inspect import warnings +from functools import partial from importlib import import_module import numpy as np @@ -29,11 +30,11 @@ zeros_like, # noqa ) from numpy import concatenate as _concatenate +from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] from numpy.lib.stride_tricks import sliding_window_view # noqa from xarray.core import dask_array_ops, dtypes, nputils -from xarray.core.nputils import nanfirst, nanlast -from xarray.core.parallelcompat import is_chunked_array +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.pycompat import array_type, is_duck_dask_array from xarray.core.utils import is_duck_array, module_available @@ -642,9 +643,9 @@ def first(values, axis, skipna=None): if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN if is_chunked_array(values): - return dask_array_ops.nanfirst(values, axis) - else: return nanfirst(values, axis) + else: + return nputils.nanfirst(values, axis) return take(values, 0, axis=axis) @@ -653,9 +654,9 @@ def last(values, axis, skipna=None): if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN if is_chunked_array(values): - return 
dask_array_ops.nanlast(values, axis) - else: return nanlast(values, axis) + else: + return nputils.nanlast(values, axis) return take(values, -1, axis=axis) @@ -674,3 +675,32 @@ def push(array, n, axis): return dask_array_ops.push(array, n, axis) else: return push(array, n, axis) + + +def _first_last_wrapper(array, *, axis, op, keepdims): + return op(array, axis, keepdims=keepdims) + + +def _first_or_last(darray, axis, op): + chunkmanager = get_chunked_array_type(darray) + + # This will raise the same error message seen for numpy + axis = normalize_axis_index(axis, darray.ndim) + + wrapped_op = partial(_first_last_wrapper, op=op) + return chunkmanager.reduction( + darray, + func=wrapped_op, + aggregate_func=wrapped_op, + axis=axis, + dtype=darray.dtype, + keepdims=False, # match numpy version + ) + + +def nanfirst(darray, axis): + return _first_or_last(darray, axis, op=nputils.nanfirst) + + +def nanlast(darray, axis): + return _first_or_last(darray, axis, op=nputils.nanlast) From 8bbc1415cd1be781bdd484c9e6e5d6a5c323e55a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 13:56:24 -0400 Subject: [PATCH 053/158] generalize interp --- xarray/core/missing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 1caa79a7dfd..41b2bc812cd 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -15,7 +15,7 @@ from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import datetime_to_numeric, push, timedelta_to_numeric from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.types import Interp1dOptions, InterpOptions from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import Variable, broadcast_variables @@ -693,8 +693,8 @@ def interp_func(var, x, new_x, method: 
InterpOptions, kwargs): else: func, kwargs = _get_interpolator_nd(method, **kwargs) - if is_duck_dask_array(var): - import dask.array as da + if is_chunked_array(var): + chunkmanager = get_chunked_array_type(var) ndim = var.ndim nconst = ndim - len(x) @@ -716,7 +716,7 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): *new_x_arginds, ) - _, rechunked = da.unify_chunks(*args) + _, rechunked = chunkmanager.unify_chunks(*args) args = tuple(elem for pair in zip(rechunked, args[1::2]) for elem in pair) @@ -741,8 +741,8 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): meta = var._meta - return da.blockwise( - _dask_aware_interpnd, + return chunkmanager.blockwise( + _chunked_aware_interpnd, out_ind, *args, interp_func=func, @@ -785,8 +785,8 @@ def _interpnd(var, x, new_x, func, kwargs): return rslt.reshape(rslt.shape[:-1] + new_x[0].shape) -def _dask_aware_interpnd(var, *coords, interp_func, interp_kwargs, localize=True): - """Wrapper for `_interpnd` through `blockwise` +def _chunked_aware_interpnd(var, *coords, interp_func, interp_kwargs, localize=True): + """Wrapper for `_interpnd` through `blockwise` for chunked arrays. 
The first half arrays in `coords` are original coordinates, the other half are destination coordinates From 6cfe9fa5056b260071cd97b44b4423158ed80a05 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 14:08:41 -0400 Subject: [PATCH 054/158] generalized chunk_hint function inside indexing --- xarray/core/indexing.py | 15 +++++++++------ xarray/core/parallelcompat.py | 4 +++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index becf1554453..428ec63eda0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -17,6 +17,7 @@ from xarray.core import duck_array_ops from xarray.core.nputils import NumpyVIndexAdapter from xarray.core.options import OPTIONS +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.pycompat import array_type, integer_types, is_duck_dask_array from xarray.core.types import T_Xarray from xarray.core.utils import ( @@ -1075,16 +1076,15 @@ def _arrayize_vectorized_indexer(indexer, shape): return VectorizedIndexer(tuple(new_key)) -def _dask_array_with_chunks_hint(array, chunks): - """Create a dask array using the chunks hint for dimensions of size > 1.""" - import dask.array as da +def _chunked_array_with_chunks_hint(array, chunks, chunkmanager): + """Create a chunked array using the chunks hint for dimensions of size > 1.""" if len(chunks) < array.ndim: raise ValueError("not enough chunks in hint") new_chunks = [] for chunk, size in zip(chunks, array.shape): new_chunks.append(chunk if size > 1 else (1,)) - return da.from_array(array, new_chunks) + return chunkmanager.from_array(array, new_chunks) def _logical_any(args): @@ -1098,8 +1098,11 @@ def _masked_result_drop_slice(key, data=None): new_keys = [] for k in key: if isinstance(k, np.ndarray): - if is_duck_dask_array(data): - new_keys.append(_dask_array_with_chunks_hint(k, chunks_hint)) + if is_chunked_array(data): + chunkmanager = 
get_chunked_array_type(data) + new_keys.append( + _chunked_array_with_chunks_hint(k, chunks_hint, chunkmanager) + ) elif isinstance(data, array_type("sparse")): import sparse diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 8ce8b9d5e8b..1aa8c9e15ec 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -9,7 +9,7 @@ import numpy as np -from xarray.core import indexing, utils +from xarray.core import utils from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array from xarray.core.types import T_Chunks @@ -197,6 +197,8 @@ def chunks(self, data: T_DaskArray) -> T_Chunks: def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_DaskArray: import dask.array as da + from xarray.core import indexing + # dask-specific kwargs name = kwargs.pop("name", None) lock = kwargs.pop("lock", False) From 4ca044b3d74be6ec05214fda9c0470b0a007a96f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 14:36:23 -0400 Subject: [PATCH 055/158] DaskIndexingAdapter->ChunkedIndexingAdapter --- xarray/core/indexing.py | 12 ++++++------ xarray/tests/test_backends.py | 2 +- xarray/tests/test_variable.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 428ec63eda0..662a3f198fa 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -18,7 +18,7 @@ from xarray.core.nputils import NumpyVIndexAdapter from xarray.core.options import OPTIONS from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array -from xarray.core.pycompat import array_type, integer_types, is_duck_dask_array +from xarray.core.pycompat import array_type, integer_types from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -676,8 +676,8 @@ def as_indexable(array): return NumpyIndexingAdapter(array) if isinstance(array, pd.Index): return PandasIndexingAdapter(array) - if 
is_duck_dask_array(array): - return DaskIndexingAdapter(array) + if is_chunked_array(array): + return ChunkedIndexingAdapter(array) if hasattr(array, "__array_function__"): return NdArrayLikeIndexingAdapter(array) if hasattr(array, "__array_namespace__"): @@ -1309,7 +1309,7 @@ def __getitem__(self, key): if isinstance(key, BasicIndexer): return self.array[key.tuple] elif isinstance(key, OuterIndexer): - # manual orthogonal indexing (implemented like DaskIndexingAdapter) + # manual orthogonal indexing (implemented like ChunkedIndexingAdapter) key = key.tuple value = self.array for axis, subkey in reversed(list(enumerate(key))): @@ -1335,8 +1335,8 @@ def transpose(self, order): return xp.permute_dims(self.array, order) -class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): - """Wrap a dask array to support explicit indexing.""" +class ChunkedIndexingAdapter(ExplicitlyIndexedNDArrayMixin): + """Wrap a chunked array (e.g. a dask array) to support explicit indexing.""" __slots__ = ("array",) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bc6b095fc4e..ea96f125ac2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -772,7 +772,7 @@ def find_and_validate_array(obj): if isinstance(obj.array, np.ndarray): assert isinstance(obj, indexing.NumpyIndexingAdapter) elif isinstance(obj.array, dask_array_type): - assert isinstance(obj, indexing.DaskIndexingAdapter) + assert isinstance(obj, indexing.ChunkedIndexingAdapter) elif isinstance(obj.array, pd.Index): assert isinstance(obj, indexing.PandasIndexingAdapter) else: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f656818c71f..a80f3866cee 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -16,8 +16,8 @@ from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( BasicIndexer, + ChunkedIndexingAdapter, CopyOnWriteArray, - DaskIndexingAdapter, LazilyIndexedArray, 
MemoryCachedArray, NumpyIndexingAdapter, @@ -2725,15 +2725,15 @@ def test_MemoryCachedArray(self): self.check_vectorized_indexing(v) @requires_dask - def test_DaskIndexingAdapter(self): + def test_ChunkedIndexingAdapter(self): import dask.array as da da = da.asarray(self.d) - v = Variable(dims=("x", "y"), data=DaskIndexingAdapter(da)) + v = Variable(dims=("x", "y"), data=ChunkedIndexingAdapter(da)) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=("x", "y"), data=CopyOnWriteArray(DaskIndexingAdapter(da))) + v = Variable(dims=("x", "y"), data=CopyOnWriteArray(ChunkedIndexingAdapter(da))) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) From 2a4c38bf3531b9891c5eec26983238f8d3183b41 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 15:26:46 -0400 Subject: [PATCH 056/158] Revert "DaskIndexingAdapter->ChunkedIndexingAdapter" This reverts commit 4ca044b3d74be6ec05214fda9c0470b0a007a96f. --- xarray/core/indexing.py | 12 ++++++------ xarray/tests/test_backends.py | 2 +- xarray/tests/test_variable.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 662a3f198fa..428ec63eda0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -18,7 +18,7 @@ from xarray.core.nputils import NumpyVIndexAdapter from xarray.core.options import OPTIONS from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array -from xarray.core.pycompat import array_type, integer_types +from xarray.core.pycompat import array_type, integer_types, is_duck_dask_array from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -676,8 +676,8 @@ def as_indexable(array): return NumpyIndexingAdapter(array) if isinstance(array, pd.Index): return PandasIndexingAdapter(array) - if is_chunked_array(array): - return ChunkedIndexingAdapter(array) + if is_duck_dask_array(array): + return 
DaskIndexingAdapter(array) if hasattr(array, "__array_function__"): return NdArrayLikeIndexingAdapter(array) if hasattr(array, "__array_namespace__"): @@ -1309,7 +1309,7 @@ def __getitem__(self, key): if isinstance(key, BasicIndexer): return self.array[key.tuple] elif isinstance(key, OuterIndexer): - # manual orthogonal indexing (implemented like ChunkedIndexingAdapter) + # manual orthogonal indexing (implemented like DaskIndexingAdapter) key = key.tuple value = self.array for axis, subkey in reversed(list(enumerate(key))): @@ -1335,8 +1335,8 @@ def transpose(self, order): return xp.permute_dims(self.array, order) -class ChunkedIndexingAdapter(ExplicitlyIndexedNDArrayMixin): - """Wrap a chunked array (e.g. a dask array) to support explicit indexing.""" +class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): + """Wrap a dask array to support explicit indexing.""" __slots__ = ("array",) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ea96f125ac2..bc6b095fc4e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -772,7 +772,7 @@ def find_and_validate_array(obj): if isinstance(obj.array, np.ndarray): assert isinstance(obj, indexing.NumpyIndexingAdapter) elif isinstance(obj.array, dask_array_type): - assert isinstance(obj, indexing.ChunkedIndexingAdapter) + assert isinstance(obj, indexing.DaskIndexingAdapter) elif isinstance(obj.array, pd.Index): assert isinstance(obj, indexing.PandasIndexingAdapter) else: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index a80f3866cee..f656818c71f 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -16,8 +16,8 @@ from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( BasicIndexer, - ChunkedIndexingAdapter, CopyOnWriteArray, + DaskIndexingAdapter, LazilyIndexedArray, MemoryCachedArray, NumpyIndexingAdapter, @@ -2725,15 +2725,15 @@ def test_MemoryCachedArray(self): 
self.check_vectorized_indexing(v) @requires_dask - def test_ChunkedIndexingAdapter(self): + def test_DaskIndexingAdapter(self): import dask.array as da da = da.asarray(self.d) - v = Variable(dims=("x", "y"), data=ChunkedIndexingAdapter(da)) + v = Variable(dims=("x", "y"), data=DaskIndexingAdapter(da)) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=("x", "y"), data=CopyOnWriteArray(ChunkedIndexingAdapter(da))) + v = Variable(dims=("x", "y"), data=CopyOnWriteArray(DaskIndexingAdapter(da))) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) From 45a4c98f88a795fe99b506de3724afe5c7d110e1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 15:37:28 -0400 Subject: [PATCH 057/158] pass cubed-related kwargs down through to_zarr by adding .store to ChunkManager --- xarray/backends/api.py | 3 +- xarray/backends/common.py | 16 +++++--- xarray/core/parallelcompat.py | 70 +++++++++++++++++++++++++++++------ xarray/core/types.py | 6 --- 4 files changed, 71 insertions(+), 24 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 17d94ede52c..341b91392c5 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1548,6 +1548,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + store_kwargs: dict[str, Any] = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to a zarr ztore @@ -1669,7 +1670,7 @@ def to_zarr( writer = ArrayWriter() # TODO: figure out how to properly handle unlimited_dims dump_to_store(dataset, zstore, writer, encoding=encoding) - writes = writer.sync(compute=compute) + writes = writer.sync(compute=compute, store_kwargs=store_kwargs) if compute: _finalize_store(writes, zstore) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 050493e3034..65e13b16500 100644 --- 
a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -11,7 +11,8 @@ from xarray.conventions import cf_encoder from xarray.core import indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array from xarray.core.utils import FrozenDict, NdimSizeLenMixin, is_remote_uri if TYPE_CHECKING: @@ -151,7 +152,7 @@ def __init__(self, lock=None): self.lock = lock def add(self, source, target, region=None): - if is_duck_dask_array(source): + if is_chunked_array(source): self.sources.append(source) self.targets.append(target) self.regions.append(region) @@ -161,21 +162,26 @@ def add(self, source, target, region=None): else: target[...] = source - def sync(self, compute=True): + def sync(self, compute=True, store_kwargs=None): if self.sources: - import dask.array as da + print(self.sources) + chunkmanager = get_chunked_array_type(*self.sources) # TODO: consider wrapping targets with dask.delayed, if this makes # for any discernible difference in perforance, e.g., # targets = [dask.delayed(t) for t in self.targets] - delayed_store = da.store( + if store_kwargs is None: + store_kwargs = {} + + delayed_store = chunkmanager.store( self.sources, self.targets, lock=self.lock, compute=compute, flush=True, regions=self.regions, + **store_kwargs, ) self.sources = [] self.targets = [] diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 1aa8c9e15ec..bcfeebc0f0d 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -5,7 +5,7 @@ """ from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import Any, Callable, Generic, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, Union import numpy as np @@ -18,6 +18,17 @@ CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} +if TYPE_CHECKING: + try: + from cubed import Array as CubedArray + except ImportError: + 
CubedArray = Any + + try: + from zarr.core import Array as ZarrArray + except ImportError: + ZarrArray = Any + def get_chunkmanager(name: str) -> "ChunkManager": if name in CHUNK_MANAGERS: @@ -174,6 +185,15 @@ def unify_chunks( """Called by xr.unify_chunks.""" raise NotImplementedError() + def store( + self, + sources: Union[T_ChunkedArray, Sequence[T_ChunkedArray]], + targets: Any, + **kwargs: dict[str, Any], + ): + """Used when writing to any backend.""" + raise NotImplementedError() + T_DaskArray = TypeVar("T_DaskArray", bound="dask.array.Array") @@ -379,6 +399,22 @@ def unify_chunks( return unify_chunks(*args, **kwargs) + def store( + self, + sources: Union[T_DaskArray, Sequence[T_DaskArray]], + targets: Any, + **kwargs: dict[str, Any], + ): + from dask.array import store + + # TODO separate expected store kwargs from other compute kwargs? + + return store( + sources=sources, + targets=targets, + **kwargs, + ) + try: import dask @@ -388,19 +424,16 @@ def unify_chunks( pass -T_CubedArray = TypeVar("T_CubedArray", bound="cubed.Array") - - -class CubedManager(ChunkManager[T_CubedArray]): +class CubedManager(ChunkManager[CubedArray]): def __init__(self): from cubed import Array self.array_cls = Array - def chunks(self, data: T_CubedArray) -> T_Chunks: + def chunks(self, data: CubedArray) -> T_Chunks: return data.chunks - def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: + def from_array(self, data: np.ndarray, chunks, **kwargs) -> CubedArray: import cubed # type: ignore spec = kwargs.pop("spec", None) @@ -418,10 +451,10 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_CubedArray: return data - def rechunk(self, data: T_CubedArray, chunks, **kwargs) -> T_CubedArray: + def rechunk(self, data: CubedArray, chunks, **kwargs) -> CubedArray: return data.rechunk(chunks, **kwargs) - def compute(self, *data: T_CubedArray, **kwargs) -> np.ndarray: + def compute(self, *data: CubedArray, **kwargs) -> np.ndarray: from cubed import compute 
return compute(*data, **kwargs) @@ -543,15 +576,28 @@ def apply_gufunc( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list[T_CubedArray]]: + ) -> tuple[dict[str, T_Chunks], list[CubedArray]]: from cubed.core import unify_chunks return unify_chunks(*args, **kwargs) + def store( + self, + sources: Union[CubedArray, Sequence[CubedArray]], + targets: Union[ZarrArray, Sequence[ZarrArray]], + **kwargs: dict[str, Any], + ): + """Used when writing to any backend.""" + from cubed.core.ops import store -try: - import cubed # type: ignore + return store( + sources, + targets, + **kwargs, + ) + +try: CHUNK_MANAGERS["cubed"] = CubedManager except ImportError: pass diff --git a/xarray/core/types.py b/xarray/core/types.py index 642b4631f5c..7a464f000b1 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -34,11 +34,6 @@ except ImportError: DaskArray = np.ndarray # type: ignore - try: - from cubed import Spec - except ImportError: - Spec = None # type: ignore - # TODO: Turn on when https://github.com/python/mypy/issues/11871 is fixed. # Can be uncommented if using pyright though. # import sys @@ -109,7 +104,6 @@ GroupByIncompatible = Union["Variable", "GroupBy"] T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] 
-T_CubedSpec = TypeVar("T_CubedSpec", bound="Spec") Dims = Union[str, Iterable[Hashable], "ellipsis", None] OrderedDims = Union[str, Sequence[Union[Hashable, "ellipsis"]], "ellipsis", None] From dee5b33628158db411640bdac53a448d813782ba Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:02:56 -0400 Subject: [PATCH 058/158] fix typing_extensions on py3.9 --- xarray/core/types.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/core/types.py b/xarray/core/types.py index 7a464f000b1..caf5814bf62 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -1,6 +1,7 @@ from __future__ import annotations import datetime +import sys from collections.abc import Hashable, Iterable, Sequence from typing import ( TYPE_CHECKING, @@ -15,7 +16,11 @@ import numpy as np import pandas as pd from packaging.version import Version -from typing_extensions import TypeAlias + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias if TYPE_CHECKING: from numpy._typing import _SupportsDType From 176d7facdc9c03e259f49c68d45d6d1b0683325d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:10:45 -0400 Subject: [PATCH 059/158] fix ImportError with cubed array type --- xarray/core/parallelcompat.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index bcfeebc0f0d..c32179e6bf5 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -424,16 +424,16 @@ def store( pass -class CubedManager(ChunkManager[CubedArray]): +class CubedManager(ChunkManager["CubedArray"]): def __init__(self): from cubed import Array self.array_cls = Array - def chunks(self, data: CubedArray) -> T_Chunks: + def chunks(self, data: "CubedArray") -> T_Chunks: return data.chunks - def from_array(self, data: np.ndarray, chunks, **kwargs) -> CubedArray: + def from_array(self, 
data: np.ndarray, chunks, **kwargs) -> "CubedArray": import cubed # type: ignore spec = kwargs.pop("spec", None) @@ -451,10 +451,10 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> CubedArray: return data - def rechunk(self, data: CubedArray, chunks, **kwargs) -> CubedArray: + def rechunk(self, data: "CubedArray", chunks, **kwargs) -> "CubedArray": return data.rechunk(chunks, **kwargs) - def compute(self, *data: CubedArray, **kwargs) -> np.ndarray: + def compute(self, *data: "CubedArray", **kwargs) -> np.ndarray: from cubed import compute return compute(*data, **kwargs) @@ -576,15 +576,15 @@ def apply_gufunc( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list[CubedArray]]: + ) -> tuple[dict[str, T_Chunks], list["CubedArray"]]: from cubed.core import unify_chunks return unify_chunks(*args, **kwargs) def store( self, - sources: Union[CubedArray, Sequence[CubedArray]], - targets: Union[ZarrArray, Sequence[ZarrArray]], + sources: Union["CubedArray", Sequence["CubedArray"]], + targets: Union["ZarrArray", Sequence["ZarrArray"]], **kwargs: dict[str, Any], ): """Used when writing to any backend.""" From 9e58d6d916ea1abda4e92cda6a05200c89a37957 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:27:16 -0400 Subject: [PATCH 060/158] give up trying to import TypeAlias in CI --- xarray/core/parallelcompat.py | 5 ++++- xarray/core/types.py | 8 -------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index c32179e6bf5..e2b6cc45622 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -11,11 +11,14 @@ from xarray.core import utils from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array -from xarray.core.types import T_Chunks T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") +# TODO importing TypeAlias is a pain on python 3.9 
without typing_extensions in the CI +# T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] +T_Chunks = Any + CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} if TYPE_CHECKING: diff --git a/xarray/core/types.py b/xarray/core/types.py index caf5814bf62..0f11b16b003 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -1,7 +1,6 @@ from __future__ import annotations import datetime -import sys from collections.abc import Hashable, Iterable, Sequence from typing import ( TYPE_CHECKING, @@ -17,11 +16,6 @@ import pandas as pd from packaging.version import Version -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - if TYPE_CHECKING: from numpy._typing import _SupportsDType from numpy.typing import ArrayLike @@ -108,8 +102,6 @@ VarCompatible = Union["Variable", "ScalarOrArray"] GroupByIncompatible = Union["Variable", "GroupBy"] -T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] - Dims = Union[str, Iterable[Hashable], "ellipsis", None] OrderedDims = Union[str, Sequence[Union[Hashable, "ellipsis"]], "ellipsis", None] From a6219a0cf04be78c09c74475d0e462d5c5bb5ec2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:40:07 -0400 Subject: [PATCH 061/158] fix import of T_Chunks --- xarray/core/parallelcompat.py | 3 +-- xarray/tests/test_parallelcompat.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index e2b6cc45622..b129c6359aa 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -12,14 +12,13 @@ from xarray.core import utils from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array -T_ChunkManager = TypeVar("T_ChunkManager", bound="ChunkManager") T_ChunkedArray = TypeVar("T_ChunkedArray") # TODO importing TypeAlias is a pain on python 3.9 without typing_extensions in the CI # T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] 
T_Chunks = Any -CHUNK_MANAGERS: dict[str, T_ChunkManager] = {} +CHUNK_MANAGERS: dict[str, type["ChunkManager"]] = {} if TYPE_CHECKING: try: diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index faccabe2515..5a8fddb19c7 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -7,10 +7,10 @@ CHUNK_MANAGERS, ChunkManager, DaskManager, + T_Chunks, get_chunked_array_type, get_chunkmanager, ) -from xarray.core.types import T_Chunks dask = pytest.importorskip("dask") From 9f219940850c18937ef9ecad276d4b9c83932cf5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:43:49 -0400 Subject: [PATCH 062/158] fix no_implicit_optional warnings --- xarray/backends/api.py | 27 ++++++++---- xarray/core/common.py | 78 +++++++++++++++++------------------ xarray/core/parallelcompat.py | 26 ++++++------ 3 files changed, 70 insertions(+), 61 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 341b91392c5..e6389eff010 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -6,7 +6,16 @@ from glob import glob from io import BytesIO from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Final, Literal, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Final, + Literal, + Union, + cast, + overload, +) import numpy as np @@ -371,8 +380,8 @@ def _dataset_from_backend_dataset( def open_dataset( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - engine: T_Engine = None, - chunks: T_Chunks = None, + engine: T_Engine | None = None, + chunks: T_Chunks | None = None, cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, @@ -561,8 +570,8 @@ def open_dataset( def open_dataarray( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - engine: T_Engine = None, - chunks: T_Chunks = None, + engine: T_Engine | None = None, + 
chunks: T_Chunks | None = None, cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, @@ -573,7 +582,7 @@ def open_dataarray( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, - from_array_kwargs: dict[str, Any] = None, + from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataArray: @@ -743,7 +752,7 @@ def open_dataarray( def open_mfdataset( paths: str | NestedSequence[str | os.PathLike], - chunks: T_Chunks = None, + chunks: T_Chunks | None = None, concat_dim: str | DataArray | Index @@ -753,7 +762,7 @@ def open_mfdataset( | None = None, compat: CompatOptions = "no_conflicts", preprocess: Callable[[Dataset], Dataset] | None = None, - engine: T_Engine = None, + engine: T_Engine | None = None, data_vars: Literal["all", "minimal", "different"] | list[str] = "all", coords="different", combine: Literal["by_coords", "nested"] = "by_coords", @@ -1548,7 +1557,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] = None, + store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to a zarr ztore diff --git a/xarray/core/common.py b/xarray/core/common.py index 24a2308aea1..0642edfb35b 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -160,7 +160,7 @@ def __int__(self: Any) -> int: def __complex__(self: Any) -> complex: return complex(self.values) - def __array__(self: Any, dtype: DTypeLike = None) -> np.ndarray: + def __array__(self: Any, dtype: DTypeLike | None = None) -> np.ndarray: return np.asarray(self.values, dtype=dtype) def __repr__(self) -> str: @@ -1397,8 +1397,8 @@ def __getitem__(self, value): def full_like( other: DataArray, fill_value: Any, - dtype: DTypeLikeSave 
= None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1407,8 +1407,8 @@ def full_like( def full_like( other: Dataset, fill_value: Any, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1417,8 +1417,8 @@ def full_like( def full_like( other: Variable, fill_value: Any, - dtype: DTypeLikeSave = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1427,8 +1427,8 @@ def full_like( def full_like( other: Dataset | DataArray, fill_value: Any, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1437,8 +1437,8 @@ def full_like( def full_like( other: Dataset | DataArray | Variable, fill_value: Any, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1446,8 +1446,8 @@ def full_like( def full_like( other: Dataset | DataArray | Variable, fill_value: Any, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object with the same shape and type as a given object. 
@@ -1604,8 +1604,8 @@ def full_like( def _full_like_variable( other: Variable, fill_value: Any, - dtype: DTypeLike = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLike | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: """Inner function of full_like, where other must be a variable""" from xarray.core.variable import Variable @@ -1638,8 +1638,8 @@ def _full_like_variable( @overload def zeros_like( other: DataArray, - dtype: DTypeLikeSave = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1647,8 +1647,8 @@ def zeros_like( @overload def zeros_like( other: Dataset, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1656,8 +1656,8 @@ def zeros_like( @overload def zeros_like( other: Variable, - dtype: DTypeLikeSave = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1665,8 +1665,8 @@ def zeros_like( @overload def zeros_like( other: Dataset | DataArray, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1674,16 +1674,16 @@ def zeros_like( @overload def zeros_like( other: Dataset | DataArray | Variable, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... 
def zeros_like( other: Dataset | DataArray | Variable, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of zeros with the same shape and type as a given dataarray or dataset. @@ -1743,8 +1743,8 @@ def zeros_like( @overload def ones_like( other: DataArray, - dtype: DTypeLikeSave = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1752,8 +1752,8 @@ def ones_like( @overload def ones_like( other: Dataset, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1761,8 +1761,8 @@ def ones_like( @overload def ones_like( other: Variable, - dtype: DTypeLikeSave = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeLikeSave | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1770,8 +1770,8 @@ def ones_like( @overload def ones_like( other: Dataset | DataArray, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1779,16 +1779,16 @@ def ones_like( @overload def ones_like( other: Dataset | DataArray | Variable, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... 
def ones_like( other: Dataset | DataArray | Variable, - dtype: DTypeMaybeMapping = None, - from_array_kwargs: dict[str, Any] = None, + dtype: DTypeMaybeMapping | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of ones with the same shape and type as a given dataarray or dataset. diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index b129c6359aa..b973d4b9fb8 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -5,7 +5,7 @@ """ from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, Union import numpy as np @@ -132,10 +132,10 @@ def reduction( self, arr: T_ChunkedArray, func: Callable, - combine_func: Callable = None, - aggregate_func: Callable = None, - axis: Union[int, Sequence[int]] = None, - dtype: np.dtype = None, + combine_func: Optional[Callable] = None, + aggregate_func: Optional[Callable] = None, + axis: Optional[Union[int, Sequence[int]]] = None, + dtype: Optional[np.dtype] = None, keepdims: bool = False, ) -> T_ChunkedArray: """Used in some reductions like nanfirst, which is used by groupby.first""" @@ -282,10 +282,10 @@ def reduction( self, arr: T_ChunkedArray, func: Callable, - combine_func: Callable = None, - aggregate_func: Callable = None, - axis: Union[int, Sequence[int]] = None, - dtype: np.dtype = None, + combine_func: Optional[Callable] = None, + aggregate_func: Optional[Callable] = None, + axis: Optional[Union[int, Sequence[int]]] = None, + dtype: Optional[np.dtype] = None, keepdims: bool = False, ) -> T_ChunkedArray: from dask.array import reduction @@ -471,10 +471,10 @@ def reduction( self, arr: T_ChunkedArray, func: Callable, - combine_func: Callable = None, - aggregate_func: Callable = None, - axis: Union[int, Sequence[int]] = None, - dtype: 
np.dtype = None, + combine_func: Optional[Callable] = None, + aggregate_func: Optional[Callable] = None, + axis: Optional[Union[int, Sequence[int]]] = None, + dtype: Optional[np.dtype] = None, keepdims: bool = False, ) -> T_ChunkedArray: from cubed.core.ops import reduction From eb7bb0b315d313f986b3426a67d4455fee83d8f9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 16:58:54 -0400 Subject: [PATCH 063/158] don't define CubedManager if cubed can't be imported --- xarray/core/parallelcompat.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index b973d4b9fb8..688034c90bb 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -436,16 +436,16 @@ def chunks(self, data: "CubedArray") -> T_Chunks: return data.chunks def from_array(self, data: np.ndarray, chunks, **kwargs) -> "CubedArray": - import cubed # type: ignore + from cubed import Array, from_array spec = kwargs.pop("spec", None) - if isinstance(data, cubed.Array): + if isinstance(data, Array): data = data.rechunk(chunks) elif is_duck_dask_array(data): raise TypeError("Trying to rechunk a dask array using cubed") else: - data = cubed.from_array( + data = from_array( data, chunks, spec=spec, @@ -600,6 +600,8 @@ def store( try: + import cubed # noqa + CHUNK_MANAGERS["cubed"] = CubedManager except ImportError: pass From 57733de0a9eaa0279b9196ca20d80dd86324161c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 17:38:30 -0400 Subject: [PATCH 064/158] fix local mypy errors --- xarray/backends/plugins.py | 2 +- xarray/core/parallelcompat.py | 35 ++++++++++------------------- xarray/core/pycompat.py | 2 +- xarray/core/types.py | 10 +++++++++ xarray/tests/test_parallelcompat.py | 5 +++++ 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index bae1dcd2225..7a1a4d3d861 100644 --- 
a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -128,7 +128,7 @@ def list_engines() -> dict[str, BackendEntrypoint]: def guess_engine( store_spec: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, -): +) -> str | type[BackendEntrypoint]: engines = list_engines() for engine, backend in engines.items(): diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 688034c90bb..ae506a377ea 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -21,15 +21,7 @@ CHUNK_MANAGERS: dict[str, type["ChunkManager"]] = {} if TYPE_CHECKING: - try: - from cubed import Array as CubedArray - except ImportError: - CubedArray = Any - - try: - from zarr.core import Array as ZarrArray - except ImportError: - ZarrArray = Any + from xarray.core.types import CubedArray, DaskArray, ZarrArray def get_chunkmanager(name: str) -> "ChunkManager": @@ -197,11 +189,8 @@ def store( raise NotImplementedError() -T_DaskArray = TypeVar("T_DaskArray", bound="dask.array.Array") - - -class DaskManager(ChunkManager[T_DaskArray]): - array_cls: T_DaskArray +class DaskManager(ChunkManager["DaskArray"]): + array_cls: type["DaskArray"] def __init__(self): # TODO can we replace this with a class attribute instead? @@ -213,10 +202,10 @@ def __init__(self): def is_chunked_array(self, data: Any) -> bool: return is_duck_dask_array(data) - def chunks(self, data: T_DaskArray) -> T_Chunks: + def chunks(self, data: "DaskArray") -> T_Chunks: return data.chunks - def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_DaskArray: + def from_array(self, data, chunks, **kwargs) -> "DaskArray": import dask.array as da from xarray.core import indexing @@ -264,10 +253,10 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> T_DaskArray: return data # TODO is simple method propagation like this necessary? 
- def rechunk(self, data: T_DaskArray, chunks, **kwargs) -> T_DaskArray: + def rechunk(self, data: "DaskArray", chunks, **kwargs) -> "DaskArray": return data.rechunk(chunks, **kwargs) - def compute(self, *data: T_DaskArray, **kwargs) -> np.ndarray: + def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: from dask.array import compute return compute(*data, **kwargs) @@ -396,16 +385,16 @@ def blockwise( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list[T_DaskArray]]: + ) -> tuple[dict[str, T_Chunks], list["DaskArray"]]: from dask.array.core import unify_chunks return unify_chunks(*args, **kwargs) def store( self, - sources: Union[T_DaskArray, Sequence[T_DaskArray]], + sources: Union["DaskArray", Sequence["DaskArray"]], targets: Any, - **kwargs: dict[str, Any], + **kwargs, ): from dask.array import store @@ -419,14 +408,14 @@ def store( try: - import dask - CHUNK_MANAGERS["dask"] = DaskManager except ImportError: pass class CubedManager(ChunkManager["CubedArray"]): + array_cls: type["CubedArray"] + def __init__(self): from cubed import Array diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index f4ee1422486..5afd7fb90da 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -12,7 +12,7 @@ integer_types = (int, np.integer) if TYPE_CHECKING: - ModType = Literal["dask", "pint", "cupy", "sparse"] + ModType = Literal["dask", "pint", "cupy", "sparse", "cubed"] DuckArrayTypes = tuple[type[Any], ...] # TODO: improve this? 
maybe Generic diff --git a/xarray/core/types.py b/xarray/core/types.py index 0f11b16b003..944646bf116 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -33,6 +33,16 @@ except ImportError: DaskArray = np.ndarray # type: ignore + try: + from cubed import Array as CubedArray + except ImportError: + CubedArray = np.ndarray + + try: + from zarr.core import Array as ZarrArray + except ImportError: + ZarrArray = np.ndarray + # TODO: Turn on when https://github.com/python/mypy/issues/11871 is fixed. # Can be uncommented if using pyright though. # import sys diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 5a8fddb19c7..d45f29c4f63 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -44,6 +44,11 @@ def __array_finalize__(self, obj): return self.chunks = getattr(obj, "chunks", None) + def rechunk(self, chunks, **kwargs): + copied = self.copy() + copied.chunks = chunks + return copied + class DummyChunkManager(ChunkManager): """Mock-up of ChunkManager class for DummyChunkedArray""" From 4c58b28adc2e2a4ff26492ce7933f9b41da482e6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 20:15:32 -0400 Subject: [PATCH 065/158] don't explicitly pass enforce_ndim into dask.array.map_blocks --- xarray/core/parallelcompat.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index ae506a377ea..313fd433389 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -325,29 +325,22 @@ def map_blocks( self, func, *args, - name=None, - token=None, dtype=None, chunks=None, drop_axis=None, new_axis=None, - enforce_ndim=False, - meta=None, **kwargs, ): from dask.array import map_blocks + # pass through name, meta, token as kwargs return map_blocks( func, *args, - name=name, - token=token, dtype=dtype, chunks=chunks, drop_axis=drop_axis, new_axis=new_axis, - 
enforce_ndim=enforce_ndim, - meta=meta, **kwargs, ) From d07830c789c751a3c6b7b73ef4a359ceec1f100b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 14 Mar 2023 20:56:02 -0400 Subject: [PATCH 066/158] fix drop_axis default --- xarray/core/parallelcompat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 313fd433389..a28966e2c05 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -155,6 +155,9 @@ def map_blocks( func, *args, dtype=None, + chunks=None, + drop_axis=[], + new_axis=None, **kwargs, ): """Currently only called in a couple of really niche places in xarray. Not even called in xarray.map_blocks.""" @@ -327,7 +330,7 @@ def map_blocks( *args, dtype=None, chunks=None, - drop_axis=None, + drop_axis=[], new_axis=None, **kwargs, ): From 3ae21d9116162bedcd242b04f25dabb29a5f6c31 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Mar 2023 16:08:04 -0400 Subject: [PATCH 067/158] use indexing adapter on cubed arrays too --- xarray/core/parallelcompat.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index a28966e2c05..17cf78860a4 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -423,6 +423,9 @@ def chunks(self, data: "CubedArray") -> T_Chunks: def from_array(self, data: np.ndarray, chunks, **kwargs) -> "CubedArray": from cubed import Array, from_array + from xarray.core import indexing + + # cubed-specific kwargs spec = kwargs.pop("spec", None) if isinstance(data, Array): @@ -430,6 +433,22 @@ def from_array(self, data: np.ndarray, chunks, **kwargs) -> "CubedArray": elif is_duck_dask_array(data): raise TypeError("Trying to rechunk a dask array using cubed") else: + if isinstance(data, indexing.ExplicitlyIndexed): + # Unambiguously handle array storage backends (like NetCDF4 and h5py) + # that can't handle general array 
indexing. For example, in netCDF4 you + # can do "outer" indexing along two dimensions independent, which works + # differently from how NumPy handles it. + # da.from_array works by using lazy indexing with a tuple of slices. + # Using OuterIndexer is a pragmatic choice: dask does not yet handle + # different indexing types in an explicit way: + # https://github.com/dask/dask/issues/2883 + data = indexing.ImplicitToExplicitIndexingAdapter( + data, indexing.OuterIndexer + ) + + if utils.is_dict_like(chunks): + chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) + data = from_array( data, chunks, From 7ef012998e50fbb146bed1cf24bcfc930c684468 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Mar 2023 17:04:35 -0400 Subject: [PATCH 068/158] use array API-compatible version of astype function --- xarray/coding/variables.py | 4 ++-- xarray/conventions.py | 2 +- xarray/core/computation.py | 2 +- xarray/core/dataset.py | 4 ++-- xarray/core/nanops.py | 5 +++-- xarray/core/rolling.py | 6 +++--- xarray/core/variable.py | 4 ++-- xarray/core/weighted.py | 5 ++++- 8 files changed, 18 insertions(+), 14 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 93cdb4e0ae3..dd657985713 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -264,7 +264,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if "scale_factor" in encoding or "add_offset" in encoding: dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) - data = data.astype(dtype=dtype, copy=True) + data = duck_array_ops.astype(data, dtype=dtype, copy=True) if "add_offset" in encoding: data -= pop_to(encoding, attrs, "add_offset", name=name) if "scale_factor" in encoding: @@ -311,7 +311,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if "_FillValue" in attrs: new_fill = signed_dtype.type(attrs["_FillValue"]) attrs["_FillValue"] = new_fill - data = 
duck_array_ops.around(data).astype(signed_dtype) + data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) return Variable(dims, data, attrs, encoding, fastpath=True) else: diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..e11bf3873f1 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -136,7 +136,7 @@ def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable stacklevel=10, ) data = np.around(data) - data = data.astype(dtype=dtype) + data = duck_array_ops.astype(data, dtype=dtype) var = Variable(dims, data, attrs, encoding, fastpath=True) return var diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 7b775d69e80..ac43637672a 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -2022,7 +2022,7 @@ def to_floatable(x: DataArray) -> DataArray: ) elif x.dtype.kind == "m": # timedeltas - return x.astype(float) + return duck_array_ops.astype(x, dtype=float) return x if isinstance(data, Dataset): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5ae0f7512bb..ee8bc200419 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1589,7 +1589,7 @@ def _setitem_check(self, key, value): val = np.array(val) # type conversion - new_value[name] = val.astype(var_k.dtype, copy=False) + new_value[name] = duck_array_ops.astype(val, dtype=var_k.dtype, copy=False) # check consistency of dimension sizes and dimension coordinates if isinstance(value, DataArray) or isinstance(value, Dataset): @@ -2331,7 +2331,7 @@ def _validate_indexers( if v.dtype.kind in "US": index = self._indexes[k].to_pandas_index() if isinstance(index, pd.DatetimeIndex): - v = v.astype("datetime64[ns]") + v = duck_array_ops.astype(v, dtype="datetime64[ns]") elif isinstance(index, CFTimeIndex): v = _parse_array_of_cftime_strings(v, index.date_type) diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index 022de845c4c..3b8ddfe032d 100644 --- 
a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -6,6 +6,7 @@ from xarray.core import dtypes, nputils, utils from xarray.core.duck_array_ops import ( + astype, count, fillna, isnull, @@ -22,7 +23,7 @@ def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, "ndim", False): null_mask = (np.take(mask.shape, axis).prod() - mask.sum(axis) - min_count) < 0 dtype, fill_value = dtypes.maybe_promote(result.dtype) - result = where(null_mask, fill_value, result.astype(dtype)) + result = where(null_mask, fill_value, astype(result, dtype)) elif getattr(result, "dtype", None) not in dtypes.NAT_TYPES: null_mask = mask.size - mask.sum() @@ -140,7 +141,7 @@ def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs): value_mean = _nanmean_ddof_object( ddof=0, value=value, axis=axis, keepdims=True, **kwargs ) - squared = (value.astype(value_mean.dtype) - value_mean) ** 2 + squared = (astype(value, value_mean.dtype) - value_mean) ** 2 return _nanmean_ddof_object(ddof, squared, axis=axis, keepdims=keepdims, **kwargs) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 8b9f31bfdfd..961468dce03 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -158,9 +158,9 @@ def method(self, keep_attrs=None, **kwargs): return method def _mean(self, keep_attrs, **kwargs): - result = self.sum(keep_attrs=False, **kwargs) / self.count( - keep_attrs=False - ).astype(self.obj.dtype, copy=False) + result = self.sum(keep_attrs=False, **kwargs) / duck_array_ops.astype( + self.count(keep_attrs=False), dtype=self.obj.dtype, copy=False + ) if keep_attrs: result.attrs = self.obj.attrs return result diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3bfda611f02..4a2abc4742c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -205,10 +205,10 @@ def _as_nanosecond_precision(data): nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) else: nanosecond_precision_dtype = "datetime64[ns]" 
- return data.astype(nanosecond_precision_dtype) + return duck_array_ops.astype(data, nanosecond_precision_dtype) elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) - return data.astype("timedelta64[ns]") + return duck_array_ops.astype(data, "timedelta64[ns]") else: return data diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index 904c6a4d980..e21091fad6b 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -238,7 +238,10 @@ def _sum_of_weights(self, da: DataArray, dim: Dims = None) -> DataArray: # (and not 2); GH4074 if self.weights.dtype == bool: sum_of_weights = self._reduce( - mask, self.weights.astype(int), dim=dim, skipna=False + mask, + duck_array_ops.astype(self.weights, dtype=int), + dim=dim, + skipna=False, ) else: sum_of_weights = self._reduce(mask, self.weights, dim=dim, skipna=False) From ec229636f55ca5be4210a67b0c5975ffd621e5a4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Mar 2023 17:24:42 -0400 Subject: [PATCH 069/158] whatsnew --- doc/whats-new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1907a916dbc..0e5e0efeaf0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -69,6 +69,12 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Experimental support for wrapping chunked array libraries other than dask . + A new ABC is defined :py:class:`xr.core.parallelcompat.ChunkManager` which can be overridden and then registered + by chunked array implementations. (:issue:`6807`, :pull:`7019`) + By `Tom Nicholas `_. + + .. 
_whats-new.2023.02.0: From 4c8d773c5c0970f69be1ba1f81a42c4a9c9b448a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 16 Mar 2023 17:25:24 -0400 Subject: [PATCH 070/158] document new kwargs --- xarray/backends/api.py | 14 +++++++++++++- xarray/core/common.py | 16 ++++++++++++++++ xarray/core/dataarray.py | 5 +++++ xarray/core/dataset.py | 12 ++++++++++++ xarray/core/variable.py | 5 +++++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e6389eff010..dda5a98bdcc 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -392,7 +392,7 @@ def open_dataset( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, - from_array_kwargs=None, + from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Dataset: @@ -485,6 +485,11 @@ def open_dataset( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. @@ -677,6 +682,11 @@ def open_dataarray( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. 
+ from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. @@ -1516,6 +1526,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore: ... @@ -1538,6 +1549,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... diff --git a/xarray/core/common.py b/xarray/core/common.py index 0642edfb35b..87fa95c1af7 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1462,6 +1462,12 @@ def full_like( dtype : dtype or dict-like of dtype, optional dtype of the new array. If a dict-like, maps dtypes to variables. If omitted, it defaults to other.dtype. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Returns ------- @@ -1694,6 +1700,11 @@ def zeros_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. 
+ from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1799,6 +1810,11 @@ def ones_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b1037967c4f..c4d9cf4908f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1281,6 +1281,11 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ee8bc200419..9398c754125 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1959,6 +1959,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore: ... @@ -1979,6 +1980,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -1997,6 +1999,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. @@ -2085,6 +2088,10 @@ def to_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + store_kwargs : dict + Additional keyword arguments passed on to the `ChunkManager.store` method used to store + chunked arrays. For example for a dask array additional kwargs will be passed eventually to + :py:func:`dask.array.store()`. Experimental API that should not be relied upon. Returns ------- @@ -2246,6 +2253,11 @@ def chunk( inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. 
One of chunks or chunks_kwargs must be provided diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 4a2abc4742c..5fdea26b934 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1186,6 +1186,11 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. From f4de57769c5d45ce5647731d65a844dfec02f9b8 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 21 Mar 2023 16:02:13 -0400 Subject: [PATCH 071/158] add chunkmanager entrypoint --- setup.cfg | 4 + xarray/core/daskmanager.py | 226 +++++++++++++++++++++++++++ xarray/core/parallelcompat.py | 282 ++++++---------------------------- 3 files changed, 274 insertions(+), 238 deletions(-) create mode 100644 xarray/core/daskmanager.py diff --git a/setup.cfg b/setup.cfg index 759dea985bc..c429b11cd53 100644 --- a/setup.cfg +++ b/setup.cfg @@ -134,6 +134,10 @@ xarray = static/css/* static/html/* +[options.entry_points] +xarray.chunkmanagers = + dask = xarray.core.daskmanager:DaskManager + [tool:pytest] python_files = test_*.py testpaths = xarray/tests properties diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py new file mode 100644 index 00000000000..dbc642272ef --- /dev/null +++ b/xarray/core/daskmanager.py @@ -0,0 +1,226 @@ +from collections.abc import Sequence +from typing import Any, Callable, Optional, Union + +import numpy as np + +from xarray.core import utils +from 
xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks +from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array +from xarray.core.types import DaskArray + + +class DaskManager(ChunkManagerEntrypoint["DaskArray"]): + array_cls: type["DaskArray"] + + def __init__(self): + # TODO can we replace this with a class attribute instead? + + from dask.array import Array + + self.array_cls = Array + + def is_chunked_array(self, data: Any) -> bool: + return is_duck_dask_array(data) + + def chunks(self, data: "DaskArray") -> T_Chunks: + return data.chunks + + def from_array(self, data, chunks, **kwargs) -> "DaskArray": + import dask.array as da + + from xarray.core import indexing + + # dask-specific kwargs + name = kwargs.pop("name", None) + lock = kwargs.pop("lock", False) + inline_array = kwargs.pop("inline_array", False) + + if is_duck_dask_array(data): + data = self.rechunk(data, chunks) + elif isinstance(data, DuckArrayModule("cubed").type): + raise TypeError("Trying to rechunk a cubed array using dask") + else: + if isinstance(data, indexing.ExplicitlyIndexed): + # Unambiguously handle array storage backends (like NetCDF4 and h5py) + # that can't handle general array indexing. For example, in netCDF4 you + # can do "outer" indexing along two dimensions independent, which works + # differently from how NumPy handles it. + # da.from_array works by using lazy indexing with a tuple of slices. + # Using OuterIndexer is a pragmatic choice: dask does not yet handle + # different indexing types in an explicit way: + # https://github.com/dask/dask/issues/2883 + data = indexing.ImplicitToExplicitIndexingAdapter( + data, indexing.OuterIndexer + ) + + # All of our lazily loaded backend array classes should use NumPy + # array operations. 
+ dask_kwargs = {"meta": np.ndarray} + else: + dask_kwargs = {} + + if utils.is_dict_like(chunks): + chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) + + data = da.from_array( + data, + chunks, + name=name, + lock=lock, + inline_array=inline_array, + **dask_kwargs, + ) + return data + + # TODO is simple method propagation like this necessary? + def rechunk(self, data: "DaskArray", chunks, **kwargs) -> "DaskArray": + return data.rechunk(chunks, **kwargs) + + def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: + from dask.array import compute + + return compute(*data, **kwargs) + + @property + def array_api(self) -> Any: + from dask import array as da + + return da + + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Optional[Callable] = None, + aggregate_func: Optional[Callable] = None, + axis: Optional[Union[int, Sequence[int]]] = None, + dtype: Optional[np.dtype] = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + from dask.array import reduction + + return reduction( + arr, + chunk=func, + combine=combine_func, + aggregate=aggregate_func, + axis=axis, + dtype=dtype, + keepdims=keepdims, + ) + + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + axis=None, + keepdims=False, + output_dtypes=None, + output_sizes=None, + vectorize=None, + allow_rechunk=False, + meta=None, + **kwargs, + ): + from dask.array.gufunc import apply_gufunc + + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + keepdims=keepdims, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + allow_rechunk=allow_rechunk, + meta=meta, + **kwargs, + ) + + def map_blocks( + self, + func, + *args, + dtype=None, + chunks=None, + drop_axis=[], + new_axis=None, + **kwargs, + ): + from dask.array import map_blocks + + # pass through name, meta, token as kwargs + return map_blocks( + func, + *args, + dtype=dtype, + chunks=chunks, + drop_axis=drop_axis, + 
new_axis=new_axis, + **kwargs, + ) + + def blockwise( + self, + func, + out_ind, + *args, + name=None, + token=None, + dtype=None, + adjust_chunks=None, + new_axes=None, + align_arrays=True, + concatenate=None, + meta=None, + **kwargs, + ): + from dask.array import blockwise + + return blockwise( + func, + out_ind, + *args, + name=name, + token=token, + dtype=dtype, + adjust_chunks=adjust_chunks, + new_axes=new_axes, + align_arrays=align_arrays, + concatenate=concatenate, + meta=meta, + **kwargs, + ) + + def unify_chunks( + self, *args, **kwargs + ) -> tuple[dict[str, T_Chunks], list["DaskArray"]]: + from dask.array.core import unify_chunks + + return unify_chunks(*args, **kwargs) + + def store( + self, + sources: Union["DaskArray", Sequence["DaskArray"]], + targets: Any, + **kwargs, + ): + from dask.array import store + + # TODO separate expected store kwargs from other compute kwargs? + + return store( + sources=sources, + targets=targets, + **kwargs, + ) + + +# try: +# CHUNK_MANAGERS["dask"] = DaskManager +# except ImportError: +# pass diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 17cf78860a4..76217bb4c1f 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -3,14 +3,17 @@ It could later be used as the basis for a public interface allowing any N frameworks to interoperate with xarray, but for now it is just a private experiment. """ +import functools +import sys from abc import ABC, abstractmethod from collections.abc import Sequence +from importlib.metadata import entry_points from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, Union import numpy as np from xarray.core import utils -from xarray.core.pycompat import DuckArrayModule, is_chunked_array, is_duck_dask_array +from xarray.core.pycompat import is_chunked_array, is_duck_dask_array T_ChunkedArray = TypeVar("T_ChunkedArray") @@ -18,21 +21,49 @@ # T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] 
T_Chunks = Any -CHUNK_MANAGERS: dict[str, type["ChunkManager"]] = {} +CHUNK_MANAGERS: dict[str, type["ChunkManagerEntrypoint"]] = {} if TYPE_CHECKING: - from xarray.core.types import CubedArray, DaskArray, ZarrArray + from xarray.core.types import CubedArray, ZarrArray -def get_chunkmanager(name: str) -> "ChunkManager": - if name in CHUNK_MANAGERS: - chunkmanager_cls = CHUNK_MANAGERS[name] - return chunkmanager_cls() +@functools.lru_cache(maxsize=1) +def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: + """ + Return a dictionary of available engines and their BackendEntrypoint objects. + + Notes + ----- + # New selection mechanism introduced with Python 3.10. See GH6514. + """ + if sys.version_info >= (3, 10): + entrypoints = entry_points(group="xarray.chunkmanagers") else: - raise ImportError(f"ChunkManager {name} has not been defined") + entrypoints = entry_points().get("xarray.chunkmanagers", ()) + + # Load entrypoints and instantiate chunkmanagers only once + return {name: entrypoint.load()() for name, entrypoint in entrypoints} + + +def get_chunkmanager(manager: str) -> "ChunkManagerEntrypoint": + """Get namespace of chunk-handling methods for a specified parallel chunk manager, e.g. dask.""" + + if isinstance(manager, str): + chunkmanagers = list_chunkmanagers() + if manager not in chunkmanagers: + raise ValueError( + f"unrecognized chunk manager {manager} - must be one of: {list(chunkmanagers)}" + ) + + chunkmanager = chunkmanagers[manager] + else: + # TODO should we accept type[ChunkManagerEntrypoint] too? + raise TypeError("manager must be a string") + + return chunkmanager -def get_chunked_array_type(*args) -> "ChunkManager": +def get_chunked_array_type(*args) -> "ChunkManagerEntrypoint": """ Detects which parallel backend should be used for given set of arrays. 
@@ -59,8 +90,8 @@ def get_chunked_array_type(*args) -> "ChunkManager": # iterate over defined chunk managers, seeing if each recognises this array type chunked_arr = chunked_arrays[0] - for chunkmanager_cls in CHUNK_MANAGERS.values(): - chunkmanager = chunkmanager_cls() + chunkmanagers = list_chunkmanagers() + for chunkmanager in chunkmanagers.values(): if chunkmanager.is_chunked_array(chunked_arr): return chunkmanager @@ -73,7 +104,7 @@ class ChunkManagerNotFoundError(Exception): ... -class ChunkManager(ABC, Generic[T_ChunkedArray]): +class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): """ Adapter between a particular parallel computing framework and xarray. @@ -192,224 +223,7 @@ def store( raise NotImplementedError() -class DaskManager(ChunkManager["DaskArray"]): - array_cls: type["DaskArray"] - - def __init__(self): - # TODO can we replace this with a class attribute instead? - - from dask.array import Array - - self.array_cls = Array - - def is_chunked_array(self, data: Any) -> bool: - return is_duck_dask_array(data) - - def chunks(self, data: "DaskArray") -> T_Chunks: - return data.chunks - - def from_array(self, data, chunks, **kwargs) -> "DaskArray": - import dask.array as da - - from xarray.core import indexing - - # dask-specific kwargs - name = kwargs.pop("name", None) - lock = kwargs.pop("lock", False) - inline_array = kwargs.pop("inline_array", False) - - if is_duck_dask_array(data): - data = self.rechunk(data, chunks) - elif isinstance(data, DuckArrayModule("cubed").type): - raise TypeError("Trying to rechunk a cubed array using dask") - else: - if isinstance(data, indexing.ExplicitlyIndexed): - # Unambiguously handle array storage backends (like NetCDF4 and h5py) - # that can't handle general array indexing. For example, in netCDF4 you - # can do "outer" indexing along two dimensions independent, which works - # differently from how NumPy handles it. - # da.from_array works by using lazy indexing with a tuple of slices. 
- # Using OuterIndexer is a pragmatic choice: dask does not yet handle - # different indexing types in an explicit way: - # https://github.com/dask/dask/issues/2883 - data = indexing.ImplicitToExplicitIndexingAdapter( - data, indexing.OuterIndexer - ) - - # All of our lazily loaded backend array classes should use NumPy - # array operations. - dask_kwargs = {"meta": np.ndarray} - else: - dask_kwargs = {} - - if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) - - data = da.from_array( - data, - chunks, - name=name, - lock=lock, - inline_array=inline_array, - **dask_kwargs, - ) - return data - - # TODO is simple method propagation like this necessary? - def rechunk(self, data: "DaskArray", chunks, **kwargs) -> "DaskArray": - return data.rechunk(chunks, **kwargs) - - def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: - from dask.array import compute - - return compute(*data, **kwargs) - - @property - def array_api(self) -> Any: - from dask import array as da - - return da - - def reduction( - self, - arr: T_ChunkedArray, - func: Callable, - combine_func: Optional[Callable] = None, - aggregate_func: Optional[Callable] = None, - axis: Optional[Union[int, Sequence[int]]] = None, - dtype: Optional[np.dtype] = None, - keepdims: bool = False, - ) -> T_ChunkedArray: - from dask.array import reduction - - return reduction( - arr, - chunk=func, - combine=combine_func, - aggregate=aggregate_func, - axis=axis, - dtype=dtype, - keepdims=keepdims, - ) - - def apply_gufunc( - self, - func, - signature, - *args, - axes=None, - axis=None, - keepdims=False, - output_dtypes=None, - output_sizes=None, - vectorize=None, - allow_rechunk=False, - meta=None, - **kwargs, - ): - from dask.array.gufunc import apply_gufunc - - return apply_gufunc( - func, - signature, - *args, - axes=axes, - axis=axis, - keepdims=keepdims, - output_dtypes=output_dtypes, - output_sizes=output_sizes, - vectorize=vectorize, - 
allow_rechunk=allow_rechunk, - meta=meta, - **kwargs, - ) - - def map_blocks( - self, - func, - *args, - dtype=None, - chunks=None, - drop_axis=[], - new_axis=None, - **kwargs, - ): - from dask.array import map_blocks - - # pass through name, meta, token as kwargs - return map_blocks( - func, - *args, - dtype=dtype, - chunks=chunks, - drop_axis=drop_axis, - new_axis=new_axis, - **kwargs, - ) - - def blockwise( - self, - func, - out_ind, - *args, - name=None, - token=None, - dtype=None, - adjust_chunks=None, - new_axes=None, - align_arrays=True, - concatenate=None, - meta=None, - **kwargs, - ): - from dask.array import blockwise - - return blockwise( - func, - out_ind, - *args, - name=name, - token=token, - dtype=dtype, - adjust_chunks=adjust_chunks, - new_axes=new_axes, - align_arrays=align_arrays, - concatenate=concatenate, - meta=meta, - **kwargs, - ) - - def unify_chunks( - self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list["DaskArray"]]: - from dask.array.core import unify_chunks - - return unify_chunks(*args, **kwargs) - - def store( - self, - sources: Union["DaskArray", Sequence["DaskArray"]], - targets: Any, - **kwargs, - ): - from dask.array import store - - # TODO separate expected store kwargs from other compute kwargs? 
- - return store( - sources=sources, - targets=targets, - **kwargs, - ) - - -try: - CHUNK_MANAGERS["dask"] = DaskManager -except ImportError: - pass - - -class CubedManager(ChunkManager["CubedArray"]): +class CubedManager(ChunkManagerEntrypoint["CubedArray"]): array_cls: type["CubedArray"] def __init__(self): @@ -601,11 +415,3 @@ def store( targets, **kwargs, ) - - -try: - import cubed # noqa - - CHUNK_MANAGERS["cubed"] = CubedManager -except ImportError: - pass From 1cd72831f64592c506a43f7da6895eb54ed8296b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 21 Mar 2023 16:32:11 -0400 Subject: [PATCH 072/158] move CubedManager to a separate package --- xarray/core/parallelcompat.py | 211 ++-------------------------------- 1 file changed, 12 insertions(+), 199 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 76217bb4c1f..7b2f7809758 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -8,12 +8,19 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from importlib.metadata import entry_points -from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Optional, + TypeVar, + Union, +) import numpy as np -from xarray.core import utils -from xarray.core.pycompat import is_chunked_array, is_duck_dask_array +from xarray.core.pycompat import is_chunked_array T_ChunkedArray = TypeVar("T_ChunkedArray") @@ -24,7 +31,7 @@ CHUNK_MANAGERS: dict[str, type["ChunkManagerEntrypoint"]] = {} if TYPE_CHECKING: - from xarray.core.types import CubedArray, ZarrArray + pass @functools.lru_cache(maxsize=1) @@ -42,7 +49,7 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: entrypoints = entry_points().get("xarray.chunkmanagers", ()) # Load entrypoints and instantiate chunkmanagers only once - return {name: entrypoint.load()() for name, entrypoint in entrypoints} + return 
{entrypoint.name: entrypoint.load()() for entrypoint in entrypoints} def get_chunkmanager(manager: str) -> "ChunkManagerEntrypoint": @@ -221,197 +228,3 @@ def store( ): """Used when writing to any backend.""" raise NotImplementedError() - - -class CubedManager(ChunkManagerEntrypoint["CubedArray"]): - array_cls: type["CubedArray"] - - def __init__(self): - from cubed import Array - - self.array_cls = Array - - def chunks(self, data: "CubedArray") -> T_Chunks: - return data.chunks - - def from_array(self, data: np.ndarray, chunks, **kwargs) -> "CubedArray": - from cubed import Array, from_array - - from xarray.core import indexing - - # cubed-specific kwargs - spec = kwargs.pop("spec", None) - - if isinstance(data, Array): - data = data.rechunk(chunks) - elif is_duck_dask_array(data): - raise TypeError("Trying to rechunk a dask array using cubed") - else: - if isinstance(data, indexing.ExplicitlyIndexed): - # Unambiguously handle array storage backends (like NetCDF4 and h5py) - # that can't handle general array indexing. For example, in netCDF4 you - # can do "outer" indexing along two dimensions independent, which works - # differently from how NumPy handles it. - # da.from_array works by using lazy indexing with a tuple of slices. 
- # Using OuterIndexer is a pragmatic choice: dask does not yet handle - # different indexing types in an explicit way: - # https://github.com/dask/dask/issues/2883 - data = indexing.ImplicitToExplicitIndexingAdapter( - data, indexing.OuterIndexer - ) - - if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) - - data = from_array( - data, - chunks, - spec=spec, - ) - - return data - - def rechunk(self, data: "CubedArray", chunks, **kwargs) -> "CubedArray": - return data.rechunk(chunks, **kwargs) - - def compute(self, *data: "CubedArray", **kwargs) -> np.ndarray: - from cubed import compute - - return compute(*data, **kwargs) - - @property - def array_api(self) -> Any: - from cubed import array_api - - return array_api - - def reduction( - self, - arr: T_ChunkedArray, - func: Callable, - combine_func: Optional[Callable] = None, - aggregate_func: Optional[Callable] = None, - axis: Optional[Union[int, Sequence[int]]] = None, - dtype: Optional[np.dtype] = None, - keepdims: bool = False, - ) -> T_ChunkedArray: - from cubed.core.ops import reduction - - return reduction( - arr, - func=func, - combine_func=combine_func, - aggegrate_func=aggregate_func, # TODO fix the typo in argument name in cubed - axis=axis, - dtype=dtype, - keepdims=keepdims, - ) - - def map_blocks( - self, - func, - *args, - dtype=None, - chunks=None, - drop_axis=[], - new_axis=None, - **kwargs, - ): - from cubed.core.ops import map_blocks - - return map_blocks( - func, - *args, - dtype=dtype, - chunks=chunks, - drop_axis=drop_axis, - new_axis=new_axis, - **kwargs, - ) - - def blockwise( - self, - func, - out_ind, - *args: Any, - # can't type this as mypy assumes args are all same type, but blockwise args alternate types - dtype=None, - adjust_chunks=None, - new_axes=None, - align_arrays=True, - target_store=None, - **kwargs, - ): - from cubed.core.ops import blockwise - - # TODO where to get the target_store kwarg from? 
Filter down from a blockwise call? Set as attribute on CubedManager? - - return blockwise( - func, - out_ind, - *args, - dtype=dtype, - adjust_chunks=adjust_chunks, - new_axes=new_axes, - align_arrays=align_arrays, - target_store=target_store, - **kwargs, - ) - - def apply_gufunc( - self, - func, - signature, - *args, - axes=None, - axis=None, - keepdims=False, - output_dtypes=None, - output_sizes=None, - vectorize=None, - allow_rechunk=False, - meta=None, - **kwargs, - ): - if allow_rechunk: - raise NotImplementedError( - "cubed.apply_gufunc doesn't support allow_rechunk" - ) - if keepdims: - raise NotImplementedError("cubed.apply_gufunc doesn't support keepdims") - - from cubed import apply_gufunc - - return apply_gufunc( - func, - signature, - *args, - axes=axes, - axis=axis, - output_dtypes=output_dtypes, - output_sizes=output_sizes, - vectorize=vectorize, - **kwargs, - ) - - def unify_chunks( - self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list["CubedArray"]]: - from cubed.core import unify_chunks - - return unify_chunks(*args, **kwargs) - - def store( - self, - sources: Union["CubedArray", Sequence["CubedArray"]], - targets: Union["ZarrArray", Sequence["ZarrArray"]], - **kwargs: dict[str, Any], - ): - """Used when writing to any backend.""" - from cubed.core.ops import store - - return store( - sources, - targets, - **kwargs, - ) From 53867114ba577192549620a88046e4ad9f1f8199 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 21 Mar 2023 17:15:27 -0400 Subject: [PATCH 073/158] guess chunkmanager based on whats available --- xarray/backends/api.py | 2 +- xarray/backends/zarr.py | 8 ++++---- xarray/core/daskmanager.py | 6 ++++-- xarray/core/dataset.py | 2 +- xarray/core/parallelcompat.py | 30 +++++++++++++++++++---------- xarray/core/variable.py | 4 ++-- xarray/tests/test_parallelcompat.py | 6 +++--- 7 files changed, 35 insertions(+), 23 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dda5a98bdcc..d084cd005c5 
100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -534,7 +534,7 @@ def open_dataset( engine = plugins.guess_engine(filename_or_obj) if from_array_kwargs is None: - from_array_kwargs = {"manager": "dask"} + from_array_kwargs = {"manager": None} backend = plugins.get_backend(engine) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e877306842c..b3689b83d23 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -17,7 +17,7 @@ ) from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing -from xarray.core.parallelcompat import get_chunkmanager +from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.pycompat import integer_types from xarray.core.utils import ( FrozenDict, @@ -806,12 +806,12 @@ def open_zarr( from xarray.backends.api import open_dataset if from_array_kwargs is None: - from_array_kwargs = {"manager": "dask"} + from_array_kwargs = {"manager": None} if chunks == "auto": - manager = from_array_kwargs.get("manager", "dask") + manager = from_array_kwargs.get("manager", None) try: - get_chunkmanager(manager) # attempt to import that parallel backend + guess_chunkmanager(manager) # attempt to import that parallel backend chunks = {} except ImportError: diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index dbc642272ef..83e2cb8a122 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -1,12 +1,14 @@ from collections.abc import Sequence -from typing import Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import numpy as np from xarray.core import utils from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array -from xarray.core.types import DaskArray + +if TYPE_CHECKING: + from xarray.core.types import DaskArray class DaskManager(ChunkManagerEntrypoint["DaskArray"]): diff --git 
a/xarray/core/dataset.py b/xarray/core/dataset.py index 9398c754125..374771ba4f4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2293,7 +2293,7 @@ def chunk( ) if from_array_kwargs is None: - from_array_kwargs = {"manager": "dask"} + from_array_kwargs = {"manager": None} variables = { k: _maybe_chunk( diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7b2f7809758..6f78aa8b18f 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -28,8 +28,6 @@ # T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] T_Chunks = Any -CHUNK_MANAGERS: dict[str, type["ChunkManagerEntrypoint"]] = {} - if TYPE_CHECKING: pass @@ -37,7 +35,7 @@ @functools.lru_cache(maxsize=1) def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: """ - Return a dictionary of available engines and their BackendEntrypoint objects. + Return a dictionary of available chunk managers and their ChunkManagerEntrypoint objects. Notes ----- @@ -48,27 +46,39 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: else: entrypoints = entry_points().get("xarray.chunkmanagers", ()) - # Load entrypoints and instantiate chunkmanagers only once + # Load entrypoints and instantiate chunkmanagers only once, return {entrypoint.name: entrypoint.load()() for entrypoint in entrypoints} -def get_chunkmanager(manager: str) -> "ChunkManagerEntrypoint": - """Get namespace of chunk-handling methods for a specified parallel chunk manager, e.g. dask.""" +def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": + """ + Get namespace of chunk-handling methods, guessing from what's available. + + If the name of a specific ChunkManager is given (e.g. "dask"), then use that. + Else use whatever is installed, defaulting to dask if there are multiple options. 
+ """ + + chunkmanagers = list_chunkmanagers() + + if manager is None: + if len(chunkmanagers) == 1: + # use the only option available + manager = next(iter(chunkmanagers.keys())) + else: + # default to trying to use dask + manager = "dask" if isinstance(manager, str): - chunkmanagers = list_chunkmanagers() if manager not in chunkmanagers: raise ValueError( f"unrecognized chunk manager {manager} - must be one of: {list(chunkmanagers)}" ) - chunkmanager = chunkmanagers[manager] + return chunkmanagers[manager] else: # TODO should we accept type[ChunkManagerEntrypoint] too? raise TypeError("manager must be a string") - return chunkmanager - def get_chunked_array_type(*args) -> "ChunkManagerEntrypoint": """ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 5fdea26b934..d51978d8e99 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -27,7 +27,7 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import get_chunkmanager +from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.pycompat import ( DuckArrayModule, array_type, @@ -1209,7 +1209,7 @@ def chunk( if from_array_kwargs is None: from_array_kwargs = {} - chunk_manager = get_chunkmanager(from_array_kwargs.pop("manager", "dask")) + chunk_manager = guess_chunkmanager(from_array_kwargs.pop("manager", None)) _from_array_kwargs = dict( name=name, lock=lock, inline_array=inline_array, **from_array_kwargs diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index d45f29c4f63..c99379be7c8 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -9,7 +9,7 @@ DaskManager, T_Chunks, get_chunked_array_type, - get_chunkmanager, + guess_chunkmanager, ) dask = pytest.importorskip("dask") @@ -116,12 +116,12 @@ class TestGetChunkManager: def test_get_chunkmanger(self): CHUNK_MANAGERS["dummy"] = DummyChunkManager - chunkmanager = get_chunkmanager("dummy") 
+ chunkmanager = guess_chunkmanager("dummy") assert isinstance(chunkmanager, DummyChunkManager) def test_fail_on_nonexistent_chunkmanager(self): with pytest.raises(ImportError, match="nonsense has not been defined"): - get_chunkmanager("nonsense") + guess_chunkmanager("nonsense") class TestGetChunkedArrayType: From c431a5fb6ab871191ef8d61910af80405853477a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 16:17:52 -0400 Subject: [PATCH 074/158] fix bug with tokenizing --- xarray/core/dataset.py | 4 ++-- xarray/core/parallelcompat.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 374771ba4f4..ac8f3c4dea5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -73,7 +73,7 @@ ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager_name from xarray.core.pycompat import ( array_type, is_chunked_array, @@ -2293,7 +2293,7 @@ def chunk( ) if from_array_kwargs is None: - from_array_kwargs = {"manager": None} + from_array_kwargs = {"manager": guess_chunkmanager_name(None)} variables = { k: _maybe_chunk( diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 6f78aa8b18f..d49fc246778 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -50,14 +50,7 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: return {entrypoint.name: entrypoint.load()() for entrypoint in entrypoints} -def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": - """ - Get namespace of chunk-handling methods, guessing from what's available. - - If the name of a specific ChunkManager is given (e.g. "dask"), then use that. 
- Else use whatever is installed, defaulting to dask if there are multiple options. - """ - +def guess_chunkmanager_name(manager: Optional[str]) -> str: chunkmanagers = list_chunkmanagers() if manager is None: @@ -68,6 +61,20 @@ def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": # default to trying to use dask manager = "dask" + return manager + + +def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": + """ + Get namespace of chunk-handling methods, guessing from what's available. + + If the name of a specific ChunkManager is given (e.g. "dask"), then use that. + Else use whatever is installed, defaulting to dask if there are multiple options. + """ + + chunkmanagers = list_chunkmanagers() + manager = guess_chunkmanager_name(manager) + if isinstance(manager, str): if manager not in chunkmanagers: raise ValueError( From 7ab90474a3b03f8da0eb22b42e3d3585d3349a3f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 16:59:41 -0400 Subject: [PATCH 075/158] adapt tests to emulate existence of entrypoint --- xarray/core/parallelcompat.py | 11 +++++++++-- xarray/tests/test_parallelcompat.py | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index d49fc246778..c2fe529c791 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -32,6 +32,10 @@ pass +# Only used for testing purposes, as a real entrypoint is hard to mock +EXAMPLE_CHUNKMANAGERS: dict[str, "ChunkManagerEntrypoint"] = {} + + @functools.lru_cache(maxsize=1) def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: """ @@ -46,8 +50,11 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: else: entrypoints = entry_points().get("xarray.chunkmanagers", ()) - # Load entrypoints and instantiate chunkmanagers only once, - return {entrypoint.name: entrypoint.load()() for entrypoint in entrypoints} + # Load 
entrypoints and instantiate chunkmanagers only once + _example_chunkmanagers = {k: v() for k, v in EXAMPLE_CHUNKMANAGERS.items()} + return { + entrypoint.name: entrypoint.load()() for entrypoint in entrypoints + } | _example_chunkmanagers def guess_chunkmanager_name(manager: Optional[str]) -> str: diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index c99379be7c8..4c1dcf03c01 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -3,10 +3,10 @@ import numpy as np import pytest +from xarray.core.daskmanager import DaskManager from xarray.core.parallelcompat import ( - CHUNK_MANAGERS, - ChunkManager, - DaskManager, + EXAMPLE_CHUNKMANAGERS, + ChunkManagerEntrypoint, T_Chunks, get_chunked_array_type, guess_chunkmanager, @@ -50,7 +50,7 @@ def rechunk(self, chunks, **kwargs): return copied -class DummyChunkManager(ChunkManager): +class DummyChunkManager(ChunkManagerEntrypoint): """Mock-up of ChunkManager class for DummyChunkedArray""" def __init__(self): @@ -114,26 +114,26 @@ class TestGetChunkManager: # TODO do these need setups and teardowns? 
def test_get_chunkmanger(self): - CHUNK_MANAGERS["dummy"] = DummyChunkManager + EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager chunkmanager = guess_chunkmanager("dummy") assert isinstance(chunkmanager, DummyChunkManager) def test_fail_on_nonexistent_chunkmanager(self): - with pytest.raises(ImportError, match="nonsense has not been defined"): - guess_chunkmanager("nonsense") + with pytest.raises(ValueError, match="unrecognized chunk manager foo"): + guess_chunkmanager("foo") class TestGetChunkedArrayType: def test_detect_chunked_arrays(self): - CHUNK_MANAGERS["dummy"] = DummyChunkManager + EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(dummy_arr) assert isinstance(chunk_manager, DummyChunkManager) def test_ignore_inmemory_arrays(self): - CHUNK_MANAGERS["dummy"] = DummyChunkManager + EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(*[dummy_arr, 1.0, np.array([5, 6])]) @@ -149,7 +149,7 @@ def test_detect_dask_by_default(self): assert isinstance(chunk_manager, DaskManager) def test_raise_on_mixed_types(self): - CHUNK_MANAGERS["dummy"] = DummyChunkManager + EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager dummy_arr = DummyChunkedArray([1, 2, 3]) dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) From 72f8f5f766ea36ab8eb3af5356387e718a30d0d6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 17:07:27 -0400 Subject: [PATCH 076/158] use fixture to setup/teardown dummy entrypoint --- xarray/tests/test_parallelcompat.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 4c1dcf03c01..c7bb746948f 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -110,12 +110,16 @@ def apply_gufunc( ) -class TestGetChunkManager: - # TODO do 
these need setups and teardowns? +@pytest.fixture +def register_dummy_chunkmanager(): + """Mocks the registering of an additional ChunkManagerEntrypoint.""" + EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager + yield + del EXAMPLE_CHUNKMANAGERS["dummy"] - def test_get_chunkmanger(self): - EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager +class TestGetChunkManager: + def test_get_chunkmanger(self, register_dummy_chunkmanager): chunkmanager = guess_chunkmanager("dummy") assert isinstance(chunkmanager, DummyChunkManager) @@ -125,15 +129,13 @@ def test_fail_on_nonexistent_chunkmanager(self): class TestGetChunkedArrayType: - def test_detect_chunked_arrays(self): - EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager + def test_detect_chunked_arrays(self, register_dummy_chunkmanager): dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(dummy_arr) assert isinstance(chunk_manager, DummyChunkManager) - def test_ignore_inmemory_arrays(self): - EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager + def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager): dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(*[dummy_arr, 1.0, np.array([5, 6])]) @@ -148,8 +150,7 @@ def test_detect_dask_by_default(self): chunk_manager = get_chunked_array_type(dask_arr) assert isinstance(chunk_manager, DaskManager) - def test_raise_on_mixed_types(self): - EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager + def test_raise_on_mixed_types(self, register_dummy_chunkmanager): dummy_arr = DummyChunkedArray([1, 2, 3]) dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) From 34c6aea60fad7ea7f14bf729277d37959543859f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 17:40:04 -0400 Subject: [PATCH 077/158] refactor to make DaskManager unavailable if dask not installed --- xarray/core/daskmanager.py | 8 ++------ xarray/core/parallelcompat.py | 27 +++++++++++++++++++++------ xarray/tests/test_parallelcompat.py | 13 
+++++++++---- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 83e2cb8a122..c177b08d8b4 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -4,6 +4,7 @@ import numpy as np from xarray.core import utils +from xarray.core.duck_array_ops import dask_available from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array @@ -13,6 +14,7 @@ class DaskManager(ChunkManagerEntrypoint["DaskArray"]): array_cls: type["DaskArray"] + available: bool = dask_available def __init__(self): # TODO can we replace this with a class attribute instead? @@ -220,9 +222,3 @@ def store( targets=targets, **kwargs, ) - - -# try: -# CHUNK_MANAGERS["dask"] = DaskManager -# except ImportError: -# pass diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index c2fe529c791..4de2c578df6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -7,7 +7,7 @@ import sys from abc import ABC, abstractmethod from collections.abc import Sequence -from importlib.metadata import entry_points +from importlib.metadata import EntryPoint, entry_points from typing import ( TYPE_CHECKING, Any, @@ -50,11 +50,25 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: else: entrypoints = entry_points().get("xarray.chunkmanagers", ()) - # Load entrypoints and instantiate chunkmanagers only once - _example_chunkmanagers = {k: v() for k, v in EXAMPLE_CHUNKMANAGERS.items()} - return { - entrypoint.name: entrypoint.load()() for entrypoint in entrypoints - } | _example_chunkmanagers + return load_chunkmanagers(entrypoints) + + +def load_chunkmanagers( + entrypoints: dict[str, EntryPoint] +) -> dict[str, "ChunkManagerEntrypoint"]: + """Load entrypoints and instantiate chunkmanagers only once.""" + + loaded_entrypoints = { + entrypoint.name: entrypoint.load() for entrypoint 
in entrypoints + } + + # TODO will this work if dask is not installed? We don't want to instantiate the chunkmanager if its not available + available_chunkmanagers = { + name: chunkmanager() + for name, chunkmanager in (loaded_entrypoints | EXAMPLE_CHUNKMANAGERS).items() + if chunkmanager.available + } + return available_chunkmanagers def guess_chunkmanager_name(manager: Optional[str]) -> str: @@ -149,6 +163,7 @@ class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): """ array_cls: type[T_ChunkedArray] + available: bool = True @abstractmethod def __init__(self): diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index c7bb746948f..5a1d4fc1d7d 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -11,8 +11,7 @@ get_chunked_array_type, guess_chunkmanager, ) - -dask = pytest.importorskip("dask") +from xarray.tests import requires_dask class DummyChunkedArray(np.ndarray): @@ -144,15 +143,21 @@ def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager): with pytest.raises(TypeError, match="Expected a chunked array"): get_chunked_array_type(5.0) + @requires_dask def test_detect_dask_by_default(self): - dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) + import dask.array as da + + dask_arr = da.from_array([1, 2, 3], chunks=(1,)) chunk_manager = get_chunked_array_type(dask_arr) assert isinstance(chunk_manager, DaskManager) + @requires_dask def test_raise_on_mixed_types(self, register_dummy_chunkmanager): + import dask.array as da + dummy_arr = DummyChunkedArray([1, 2, 3]) - dask_arr = dask.array.from_array([1, 2, 3], chunks=(1,)) + dask_arr = da.from_array([1, 2, 3], chunks=(1,)) with pytest.raises(TypeError, match="received multiple types"): get_chunked_array_type(*[dask_arr, dummy_arr]) From fb9466d9120bd8557dac5f32bbb35e16905c3699 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 17:44:11 -0400 Subject: [PATCH 078/158] typing --- 
xarray/core/parallelcompat.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 4de2c578df6..0571c120144 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -9,7 +9,6 @@ from collections.abc import Sequence from importlib.metadata import EntryPoint, entry_points from typing import ( - TYPE_CHECKING, Any, Callable, Generic, @@ -28,12 +27,9 @@ # T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] T_Chunks = Any -if TYPE_CHECKING: - pass - # Only used for testing purposes, as a real entrypoint is hard to mock -EXAMPLE_CHUNKMANAGERS: dict[str, "ChunkManagerEntrypoint"] = {} +EXAMPLE_CHUNKMANAGERS: dict[str, type["ChunkManagerEntrypoint"]] = {} @functools.lru_cache(maxsize=1) @@ -54,7 +50,7 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: def load_chunkmanagers( - entrypoints: dict[str, EntryPoint] + entrypoints: Sequence[EntryPoint], ) -> dict[str, "ChunkManagerEntrypoint"]: """Load entrypoints and instantiate chunkmanagers only once.""" From 36b2be02a547e2a40960f31c8916c31723b70ccb Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 17:47:34 -0400 Subject: [PATCH 079/158] move whatsnew to latest xarray version --- doc/whats-new.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 645788856c0..c74661f66c7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,6 +45,12 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Experimental support for wrapping chunked array libraries other than dask. + A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntryPoint` - which can be overridden and then + registered by alternative chunked array implementations. (:issue:`6807`, :pull:`7019`) + By `Tom Nicholas `_. + + .. 
_whats-new.2023.03.0: v2023.03.0 (March 22, 2023) @@ -108,11 +114,6 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ -- Experimental support for wrapping chunked array libraries other than dask . - A new ABC is defined :py:class:`xr.core.parallelcompat.ChunkManager` which can be overridden and then registered - by chunked array implementations. (:issue:`6807`, :pull:`7019`) - By `Tom Nicholas `_. - - Pin pandas to ``<2``. By `Deepak Cherian `_. From 77a1e4e4c53af5581e4c4aff4df82841f53e9d52 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 22 Mar 2023 17:48:39 -0400 Subject: [PATCH 080/158] remove superfluous lines from whatsnew --- doc/whats-new.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c74661f66c7..0239bcac2cb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -113,8 +113,6 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ - - - Pin pandas to ``<2``. By `Deepak Cherian `_. .. _whats-new.2023.02.0: From a6222f91cfb15e5bd76c05cbf5b7eac81e840386 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 12:48:46 -0400 Subject: [PATCH 081/158] fix bug where zarr backend attempted to use dask when not installed --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b3689b83d23..07cbe0b95d2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -814,7 +814,7 @@ def open_zarr( guess_chunkmanager(manager) # attempt to import that parallel backend chunks = {} - except ImportError: + except ValueError: chunks = None if kwargs: From 61fe236195714c6e83ecaf76d04987293f1576e4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 23 Mar 2023 15:23:58 -0400 Subject: [PATCH 082/158] Remove rogue print statement Co-authored-by: Deepak Cherian --- xarray/backends/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 
65e13b16500..6142eee39ab 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -164,7 +164,6 @@ def add(self, source, target, region=None): def sync(self, compute=True, store_kwargs=None): if self.sources: - print(self.sources) chunkmanager = get_chunked_array_type(*self.sources) # TODO: consider wrapping targets with dask.delayed, if this makes From a7a6a6ef4b61aafda54e948827361edc3c2ba9b5 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 23 Mar 2023 15:25:21 -0400 Subject: [PATCH 083/158] Clarify what's new Co-authored-by: Deepak Cherian --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d084cd005c5..8e9b0d60ce7 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -489,7 +489,7 @@ def open_dataset( Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + :py:func:`dask.array.from_array`. This is experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. 
From aa649963ced2f09ba508cce126f08bbcb822a26a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 18:18:55 -0400 Subject: [PATCH 084/158] use monkeypatch to mock registering of dummy chunkmanager --- xarray/core/parallelcompat.py | 7 +------ xarray/tests/test_parallelcompat.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 0571c120144..846f0c46a48 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -28,10 +28,6 @@ T_Chunks = Any -# Only used for testing purposes, as a real entrypoint is hard to mock -EXAMPLE_CHUNKMANAGERS: dict[str, type["ChunkManagerEntrypoint"]] = {} - - @functools.lru_cache(maxsize=1) def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: """ @@ -58,10 +54,9 @@ def load_chunkmanagers( entrypoint.name: entrypoint.load() for entrypoint in entrypoints } - # TODO will this work if dask is not installed? 
We don't want to instantiate the chunkmanager if its not available available_chunkmanagers = { name: chunkmanager() - for name, chunkmanager in (loaded_entrypoints | EXAMPLE_CHUNKMANAGERS).items() + for name, chunkmanager in loaded_entrypoints.items() if chunkmanager.available } return available_chunkmanagers diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 5a1d4fc1d7d..7a9bfda267c 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -5,11 +5,11 @@ from xarray.core.daskmanager import DaskManager from xarray.core.parallelcompat import ( - EXAMPLE_CHUNKMANAGERS, ChunkManagerEntrypoint, T_Chunks, get_chunked_array_type, guess_chunkmanager, + list_chunkmanagers, ) from xarray.tests import requires_dask @@ -110,11 +110,24 @@ def apply_gufunc( @pytest.fixture -def register_dummy_chunkmanager(): - """Mocks the registering of an additional ChunkManagerEntrypoint.""" - EXAMPLE_CHUNKMANAGERS["dummy"] = DummyChunkManager +def register_dummy_chunkmanager(monkeypatch): + """ + Mocks the registering of an additional ChunkManagerEntrypoint. + + This preserves the presence of the existing DaskManager, so a test that relies on this and DaskManager both being + returned from list_chunkmanagers() at once would still work. + + The monkeypatching changes the behavior of list_chunkmanagers when called inside xarray.core.parallelcompat, + but not when called from this tests file. 
+ """ + # Should include DaskManager iff dask is available to be imported + preregistered_chunkmanagers = list_chunkmanagers() + + monkeypatch.setattr( + "xarray.core.parallelcompat.list_chunkmanagers", + lambda: {"dummy": DummyChunkManager()} | preregistered_chunkmanagers, + ) yield - del EXAMPLE_CHUNKMANAGERS["dummy"] class TestGetChunkManager: @@ -152,6 +165,8 @@ def test_detect_dask_by_default(self): chunk_manager = get_chunked_array_type(dask_arr) assert isinstance(chunk_manager, DaskManager) + # TODO test that dask is default choice even if other chunkmanagers installed + @requires_dask def test_raise_on_mixed_types(self, register_dummy_chunkmanager): import dask.array as da From db11947f47e227627fe50519adee944a4ad931df Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 18:32:42 -0400 Subject: [PATCH 085/158] more tests for guessing chunkmanager correctly --- xarray/tests/test_parallelcompat.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 7a9bfda267c..cab953798eb 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -11,7 +11,7 @@ guess_chunkmanager, list_chunkmanagers, ) -from xarray.tests import requires_dask +from xarray.tests import has_dask, requires_dask class DummyChunkedArray(np.ndarray): @@ -139,6 +139,21 @@ def test_fail_on_nonexistent_chunkmanager(self): with pytest.raises(ValueError, match="unrecognized chunk manager foo"): guess_chunkmanager("foo") + @requires_dask + def test_get_dask_if_installed(self): + chunk_manager = guess_chunkmanager(None) + assert isinstance(chunk_manager, DaskManager) + + @pytest.mark.skipif(has_dask, reason="requires dask not to be installed") + def test_dont_get_dask_if_not_installed(self): + with pytest.raises(ValueError, match="unrecognized chunk manager dask"): + guess_chunkmanager("dask") + + @requires_dask + def 
test_choose_dask_over_other_chunkmanagers(self, register_dummy_chunkmanager): + chunk_manager = guess_chunkmanager(None) + assert isinstance(chunk_manager, DaskManager) + class TestGetChunkedArrayType: def test_detect_chunked_arrays(self, register_dummy_chunkmanager): @@ -156,8 +171,12 @@ def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager): with pytest.raises(TypeError, match="Expected a chunked array"): get_chunked_array_type(5.0) + def test_raise_if_no_arrays_chunked(self, register_dummy_chunkmanager): + with pytest.raises(TypeError, match="Expected a chunked array "): + get_chunked_array_type(*[1.0, np.array([5, 6])]) + @requires_dask - def test_detect_dask_by_default(self): + def test_detect_dask_if_installed(self): import dask.array as da dask_arr = da.from_array([1, 2, 3], chunks=(1,)) @@ -165,10 +184,8 @@ def test_detect_dask_by_default(self): chunk_manager = get_chunked_array_type(dask_arr) assert isinstance(chunk_manager, DaskManager) - # TODO test that dask is default choice even if other chunkmanagers installed - @requires_dask - def test_raise_on_mixed_types(self, register_dummy_chunkmanager): + def test_raise_on_mixed_array_types(self, register_dummy_chunkmanager): import dask.array as da dummy_arr = DummyChunkedArray([1, 2, 3]) From 2c18df61d8a2676d1a02a5475d70b54484a4b668 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 18:35:39 -0400 Subject: [PATCH 086/158] raise TypeError if no chunkmanager found for array types --- xarray/core/parallelcompat.py | 6 +----- xarray/tests/test_parallelcompat.py | 8 ++++++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 846f0c46a48..1988dfbe4ed 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -131,15 +131,11 @@ def get_chunked_array_type(*args) -> "ChunkManagerEntrypoint": if chunkmanager.is_chunked_array(chunked_arr): return chunkmanager - raise 
ChunkManagerNotFoundError( + raise TypeError( f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}" ) -class ChunkManagerNotFoundError(Exception): - ... - - class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): """ Adapter between a particular parallel computing framework and xarray. diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index cab953798eb..8c93437530d 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -175,6 +175,14 @@ def test_raise_if_no_arrays_chunked(self, register_dummy_chunkmanager): with pytest.raises(TypeError, match="Expected a chunked array "): get_chunked_array_type(*[1.0, np.array([5, 6])]) + def test_raise_if_no_matching_chunkmanagers(self): + dummy_arr = DummyChunkedArray([1, 2, 3]) + + with pytest.raises( + TypeError, match="Could not find a Chunk Manager which recognises" + ): + get_chunked_array_type(dummy_arr) + @requires_dask def test_detect_dask_if_installed(self): import dask.array as da From 2e49154b4664e29efba4e9a022e5f31e1a6f64db Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 23 Mar 2023 19:16:34 -0400 Subject: [PATCH 087/158] Correct is_chunked_array check Co-authored-by: Deepak Cherian --- xarray/core/pycompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index a6c81586aa7..f3f089a924c 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -84,7 +84,7 @@ def is_duck_dask_array(x): def is_chunked_array(x): - return is_duck_dask_array(x) or hasattr(x, "chunks") + return is_duck_dask_array(x) or (is_duck_array(x) and hasattr(x, "chunks")) def is_0d_dask_array(x): From 748e90d54a49d748ad08e253d1d28aada79bbed7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 22:11:36 -0400 Subject: [PATCH 088/158] vendor dask.array.core.normalize_chunks --- xarray/core/daskvendor.py | 480 ++++++++++++++++++++++++++++++++++++++ 
xarray/core/dataset.py | 4 +- 2 files changed, 481 insertions(+), 3 deletions(-) create mode 100644 xarray/core/daskvendor.py diff --git a/xarray/core/daskvendor.py b/xarray/core/daskvendor.py new file mode 100644 index 00000000000..1a0719bec05 --- /dev/null +++ b/xarray/core/daskvendor.py @@ -0,0 +1,480 @@ +"""Functions vendored from dask.""" + +from __future__ import annotations + +import collections +import math +from numbers import Integral, Number + +import numpy as np + + +def is_integer(i) -> bool: + """ + >>> is_integer(6) + True + >>> is_integer(42.0) + True + >>> is_integer("abc") + False + """ + return isinstance(i, Integral) or (isinstance(i, float) and i.is_integer()) + + +def parse_bytes(s: float | str) -> int: + """Parse byte string to numbers + >>> from dask.utils import parse_bytes + >>> parse_bytes("100") + 100 + >>> parse_bytes("100 MB") + 100000000 + >>> parse_bytes("100M") + 100000000 + >>> parse_bytes("5kB") + 5000 + >>> parse_bytes("5.4 kB") + 5400 + >>> parse_bytes("1kiB") + 1024 + >>> parse_bytes("1e6") + 1000000 + >>> parse_bytes("1e6 kB") + 1000000000 + >>> parse_bytes("MB") + 1000000 + >>> parse_bytes(123) + 123 + >>> parse_bytes("5 foos") + Traceback (most recent call last): + ... 
+ ValueError: Could not interpret 'foos' as a byte unit + """ + if isinstance(s, (int, float)): + return int(s) + s = s.replace(" ", "") + if not any(char.isdigit() for char in s): + s = "1" + s + + for i in range(len(s) - 1, -1, -1): + if not s[i].isalpha(): + break + index = i + 1 + + prefix = s[:index] + suffix = s[index:] + + try: + n = float(prefix) + except ValueError as e: + raise ValueError("Could not interpret '%s' as a number" % prefix) from e + + try: + multiplier = byte_sizes[suffix.lower()] + except KeyError as e: + raise ValueError("Could not interpret '%s' as a byte unit" % suffix) from e + + result = n * multiplier + return int(result) + + +byte_sizes = { + "kB": 10**3, + "MB": 10**6, + "GB": 10**9, + "TB": 10**12, + "PB": 10**15, + "KiB": 2**10, + "MiB": 2**20, + "GiB": 2**30, + "TiB": 2**40, + "PiB": 2**50, + "B": 1, + "": 1, +} +byte_sizes = {k.lower(): v for k, v in byte_sizes.items()} +byte_sizes.update({k[0]: v for k, v in byte_sizes.items() if k and "i" not in k}) +byte_sizes.update({k[:-1]: v for k, v in byte_sizes.items() if k and "i" in k}) +unknown_chunk_message = ( + "\n\n" + "A possible solution: " + "https://docs.dask.org/en/latest/array-chunks.html#unknown-chunks\n" + "Summary: to compute chunks sizes, use\n\n" + " x.compute_chunk_sizes() # for Dask Array `x`\n" + " ddf.to_dask_array(lengths=True) # for Dask DataFrame `ddf`" +) + + +def blockdims_from_blockshape(shape, chunks): + """ + Vendored from dask.array.core + + >>> blockdims_from_blockshape((10, 10), (4, 3)) + ((4, 4, 2), (3, 3, 3, 1)) + >>> blockdims_from_blockshape((10, 0), (4, 0)) + ((4, 4, 2), (0,)) + """ + if chunks is None: + raise TypeError("Must supply chunks= keyword argument") + if shape is None: + raise TypeError("Must supply shape= keyword argument") + if np.isnan(sum(shape)) or np.isnan(sum(chunks)): + raise ValueError( + "Array chunk sizes are unknown. 
shape: %s, chunks: %s%s" + % (shape, chunks, unknown_chunk_message) + ) + if not all(map(is_integer, chunks)): + raise ValueError("chunks can only contain integers.") + if not all(map(is_integer, shape)): + raise ValueError("shape can only contain integers.") + shape = tuple(map(int, shape)) + chunks = tuple(map(int, chunks)) + return tuple( + ((bd,) * (d // bd) + ((d % bd,) if d % bd else ()) if d else (0,)) + for d, bd in zip(shape, chunks) + ) + + +CHUNKS_NONE_ERROR_MESSAGE = """ +You must specify a chunks= keyword argument. +This specifies the chunksize of your array blocks. +See the following documentation page for details: + https://docs.dask.org/en/latest/array-creation.html#chunks +""".strip() + + +def normalize_chunks( + chunks, shape=None, limit=None, dtype=None, previous_chunks=None +) -> tuple[tuple[int, ...], ...]: + """ + Normalize chunks to tuple of tuples. + + This takes in a variety of input types and information and produces a full + tuple-of-tuples result for chunks, suitable to be passed to Array or + rechunk or any other operation that creates a Dask array. + + Vendored from dask.array.core + + Parameters + ---------- + chunks: tuple, int, dict, or string + The chunks to be normalized. See examples below for more details + shape: Tuple[int] + The shape of the array + limit: int (optional) + The maximum block size to target in bytes, + if freedom is given to choose + dtype: np.dtype + previous_chunks: Tuple[Tuple[int]] optional + Chunks from a previous array that we should use for inspiration when + rechunking auto dimensions. If not provided but auto-chunking exists + then auto-dimensions will prefer square-like chunk shapes. 
+ + Examples + -------- + Specify uniform chunk sizes + + >>> from dask.array.core import normalize_chunks + >>> normalize_chunks((2, 2), shape=(5, 6)) + ((2, 2, 1), (2, 2, 2)) + + Also passes through fully explicit tuple-of-tuples + + >>> normalize_chunks(((2, 2, 1), (2, 2, 2)), shape=(5, 6)) + ((2, 2, 1), (2, 2, 2)) + + Cleans up lists to tuples + + >>> normalize_chunks([[2, 2], [3, 3]]) + ((2, 2), (3, 3)) + + Expands integer inputs 10 -> (10, 10) + + >>> normalize_chunks(10, shape=(30, 5)) + ((10, 10, 10), (5,)) + + Expands dict inputs + + >>> normalize_chunks({0: 2, 1: 3}, shape=(6, 6)) + ((2, 2, 2), (3, 3)) + + The values -1 and None get mapped to full size + + >>> normalize_chunks((5, -1), shape=(10, 10)) + ((5, 5), (10,)) + + Use the value "auto" to automatically determine chunk sizes along certain + dimensions. This uses the ``limit=`` and ``dtype=`` keywords to + determine how large to make the chunks. The term "auto" can be used + anywhere an integer can be used. See array chunking documentation for more + information. 
+ + >>> normalize_chunks(("auto",), shape=(20,), limit=5, dtype="uint8") + ((5, 5, 5, 5),) + + You can also use byte sizes (see :func:`dask.utils.parse_bytes`) in place of + "auto" to ask for a particular size + + >>> normalize_chunks("1kiB", shape=(2000,), dtype="float32") + ((256, 256, 256, 256, 256, 256, 256, 208),) + + Respects null dimensions + + >>> normalize_chunks((), shape=(0, 0)) + ((0,), (0,)) + """ + if dtype and not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) + if chunks is None: + raise ValueError(CHUNKS_NONE_ERROR_MESSAGE) + if isinstance(chunks, list): + chunks = tuple(chunks) + if isinstance(chunks, (Number, str)): + chunks = (chunks,) * len(shape) + if isinstance(chunks, dict): + chunks = tuple(chunks.get(i, None) for i in range(len(shape))) + if isinstance(chunks, np.ndarray): + chunks = chunks.tolist() + if not chunks and shape and all(s == 0 for s in shape): + chunks = ((0,),) * len(shape) + + if ( + shape + and len(shape) == 1 + and len(chunks) > 1 + and all(isinstance(c, (Number, str)) for c in chunks) + ): + chunks = (chunks,) + + if shape and len(chunks) != len(shape): + raise ValueError( + "Chunks and shape must be of the same length/dimension. " + "Got chunks={}, shape={}".format(chunks, shape) + ) + if -1 in chunks or None in chunks: + chunks = tuple(s if c == -1 or c is None else c for c, s in zip(chunks, shape)) + + # If specifying chunk size in bytes, use that value to set the limit. + # Verify there is only one consistent value of limit or chunk-bytes used. + for c in chunks: + if isinstance(c, str) and c != "auto": + parsed = parse_bytes(c) + if limit is None: + limit = parsed + elif parsed != limit: + raise ValueError( + "Only one consistent value of limit or chunk is allowed." + "Used {} != {}".format(parsed, limit) + ) + # Substitute byte limits with 'auto' now that limit is set. 
+ chunks = tuple("auto" if isinstance(c, str) and c != "auto" else c for c in chunks) + + if any(c == "auto" for c in chunks): + chunks = auto_chunks(chunks, shape, limit, dtype, previous_chunks) + + if shape is not None: + chunks = tuple(c if c not in {None, -1} else s for c, s in zip(chunks, shape)) + + if chunks and shape is not None: + chunks = sum( + ( + blockdims_from_blockshape((s,), (c,)) + if not isinstance(c, (tuple, list)) + else (c,) + for s, c in zip(shape, chunks) + ), + (), + ) + for c in chunks: + if not c: + raise ValueError( + "Empty tuples are not allowed in chunks. Express " + "zero length dimensions with 0(s) in chunks" + ) + + if shape is not None: + if len(chunks) != len(shape): + raise ValueError( + "Input array has %d dimensions but the supplied " + "chunks has only %d dimensions" % (len(shape), len(chunks)) + ) + if not all( + c == s or (math.isnan(c) or math.isnan(s)) + for c, s in zip(map(sum, chunks), shape) + ): + raise ValueError( + "Chunks do not add up to shape. " + "Got chunks={}, shape={}".format(chunks, shape) + ) + + return tuple( + tuple(int(x) if not math.isnan(x) else np.nan for x in c) for c in chunks # type: ignore[misc] + ) + + +def _compute_multiplier(limit: int, dtype, largest_block: int, result): + """ + Utility function for auto_chunk, to fin how much larger or smaller the ideal + chunk size is relative to what we have now. + """ + return ( + limit + / dtype.itemsize + / largest_block + / math.prod(r for r in result.values() if r) + ) + + +def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None): + """Determine automatic chunks + This takes in a chunks value that contains ``"auto"`` values in certain + dimensions and replaces those values with concrete dimension sizes that try + to get chunks to be of a certain size in bytes, provided by the ``limit=`` + keyword. 
If multiple dimensions are marked as ``"auto"`` then they will + all respond to meet the desired byte limit, trying to respect the aspect + ratio of their dimensions in ``previous_chunks=``, if given. + Parameters + ---------- + chunks: Tuple + A tuple of either dimensions or tuples of explicit chunk dimensions + Some entries should be "auto" + shape: Tuple[int] + limit: int, str + The maximum allowable size of a chunk in bytes + previous_chunks: Tuple[Tuple[int]] + See also + -------- + normalize_chunks: for full docstring and parameters + """ + if previous_chunks is not None: + previous_chunks = tuple( + c if isinstance(c, tuple) else (c,) for c in previous_chunks + ) + chunks = list(chunks) + + autos = {i for i, c in enumerate(chunks) if c == "auto"} + if not autos: + return tuple(chunks) + + if limit is None: + limit = "128MiB" # config.get("array.chunk-size") + if isinstance(limit, str): + limit = parse_bytes(limit) + + if dtype is None: + raise TypeError("dtype must be known for auto-chunking") + + if dtype.hasobject: + raise NotImplementedError( + "Can not use auto rechunking with object dtype. 
" + "We are unable to estimate the size in bytes of object data" + ) + + for x in tuple(chunks) + tuple(shape): + if ( + isinstance(x, Number) + and np.isnan(x) + or isinstance(x, tuple) + and np.isnan(x).any() + ): + raise ValueError( + "Can not perform automatic rechunking with unknown " + "(nan) chunk sizes.%s" % unknown_chunk_message + ) + + limit = max(1, limit) + + largest_block = math.prod( + cs if isinstance(cs, Number) else max(cs) for cs in chunks if cs != "auto" + ) + + if previous_chunks: + # Base ideal ratio on the median chunk size of the previous chunks + result = {a: np.median(previous_chunks[a]) for a in autos} + + ideal_shape = [] + for i, s in enumerate(shape): + chunk_frequencies = frequencies(previous_chunks[i]) + mode, count = max(chunk_frequencies.items(), key=lambda kv: kv[1]) + if mode > 1 and count >= len(previous_chunks[i]) / 2: + ideal_shape.append(mode) + else: + ideal_shape.append(s) + + # How much larger or smaller the ideal chunk size is relative to what we have now + multiplier = _compute_multiplier(limit, dtype, largest_block, result) + + last_multiplier = 0 + last_autos = set() + while ( + multiplier != last_multiplier or autos != last_autos + ): # while things change + last_multiplier = multiplier # record previous values + last_autos = set(autos) # record previous values + + # Expand or contract each of the dimensions appropriately + for a in sorted(autos): + if ideal_shape[a] == 0: + result[a] = 0 + continue + proposed = result[a] * multiplier ** (1 / len(autos)) + if proposed > shape[a]: # we've hit the shape boundary + autos.remove(a) + largest_block *= shape[a] + chunks[a] = shape[a] + del result[a] + else: + result[a] = round_to(proposed, ideal_shape[a]) + + # recompute how much multiplier we have left, repeat + multiplier = _compute_multiplier(limit, dtype, largest_block, result) + + for k, v in result.items(): + chunks[k] = v + return tuple(chunks) + + else: + # Check if dtype.itemsize is greater than 0 + if 
dtype.itemsize == 0: + raise ValueError( + "auto-chunking with dtype.itemsize == 0 is not supported, please pass in `chunks` explicitly" + ) + size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos)) + small = [i for i in autos if shape[i] < size] + if small: + for i in small: + chunks[i] = (shape[i],) + return auto_chunks(chunks, shape, limit, dtype) + + for i in autos: + chunks[i] = round_to(size, shape[i]) + + return tuple(chunks) + + +def round_to(c, s): + """Return a chunk dimension that is close to an even multiple or factor + We want values for c that are nicely aligned with s. + If c is smaller than s we use the original chunk size and accept an + uneven chunk at the end. + If c is larger than s then we want the largest multiple of s that is still + smaller than c. + """ + if c <= s: + return max(1, int(c)) + else: + return c // s * s + + +def frequencies(seq): + """ + Find number of occurrences of each value in seq. + + >>> frequencies(["cat", "cat", "ox", "pig", "pig", "cat"]) + {'cat': 3, 'ox': 1, 'pig': 2} + + Vendored from pytoolz. + """ + d = collections.defaultdict(int) + for item in seq: + d[item] += 1 + return dict(d) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ac8f3c4dea5..504f4dae312 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -51,6 +51,7 @@ ) from xarray.core.computation import unify_chunks from xarray.core.coordinates import DatasetCoordinates, assert_coordinate_consistent +from xarray.core.daskvendor import normalize_chunks from xarray.core.duck_array_ops import datetime_to_numeric from xarray.core.indexes import ( Index, @@ -213,8 +214,6 @@ def _get_chunk(var, chunks): Return map from each dim to chunk sizes, accounting for backend's preferred chunks. 
""" - from dask.array.core import normalize_chunks - if isinstance(var, IndexVariable): return {} dims = var.dims @@ -232,7 +231,6 @@ def _get_chunk(var, chunks): for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape) ) - # TODO ideally replace this with non-dask version chunk_shape = normalize_chunks( chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape ) From dae2fe4e41f15285f5be7c07b0ddf112b9171d0f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 22:18:48 -0400 Subject: [PATCH 089/158] add default implementation of rechunk in ABC --- xarray/core/daskmanager.py | 4 ---- xarray/core/parallelcompat.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index c177b08d8b4..1bbea555387 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -76,10 +76,6 @@ def from_array(self, data, chunks, **kwargs) -> "DaskArray": ) return data - # TODO is simple method propagation like this necessary? - def rechunk(self, data: "DaskArray", chunks, **kwargs) -> "DaskArray": - return data.rechunk(chunks, **kwargs) - def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: from dask.array import compute diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 1988dfbe4ed..e2ab7a823d1 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -169,11 +169,11 @@ def from_array( ) -> T_ChunkedArray: ... - @abstractmethod def rechunk( self, data: T_ChunkedArray, chunks: T_Chunks, **kwargs ) -> T_ChunkedArray: - ... 
+ """Called when .chunk is called on an xarray object that is already chunked.""" + return data.rechunk(chunks, **kwargs) @abstractmethod def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: From 4ef500c37fba36febb2e6415224921d5b03f3c6d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 22:22:31 -0400 Subject: [PATCH 090/158] remove cubed-specific type check in daskmanager --- xarray/core/daskmanager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 1bbea555387..5e0d94d1c15 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -6,7 +6,7 @@ from xarray.core import utils from xarray.core.duck_array_ops import dask_available from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks -from xarray.core.pycompat import DuckArrayModule, is_duck_dask_array +from xarray.core.pycompat import is_duck_dask_array if TYPE_CHECKING: from xarray.core.types import DaskArray @@ -41,8 +41,6 @@ def from_array(self, data, chunks, **kwargs) -> "DaskArray": if is_duck_dask_array(data): data = self.rechunk(data, chunks) - elif isinstance(data, DuckArrayModule("cubed").type): - raise TypeError("Trying to rechunk a cubed array using dask") else: if isinstance(data, indexing.ExplicitlyIndexed): # Unambiguously handle array storage backends (like NetCDF4 and h5py) From ba66419d1094c7f8f545bf88b07efc0df8b32689 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 23 Mar 2023 22:27:14 -0400 Subject: [PATCH 091/158] nanfirst->chunked_nanfirst --- xarray/core/duck_array_ops.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index c54d409585e..4d7998e1475 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -643,7 +643,7 @@ def first(values, axis, skipna=None): if (skipna or skipna is None) and 
values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN if is_chunked_array(values): - return nanfirst(values, axis) + return chunked_nanfirst(values, axis) else: return nputils.nanfirst(values, axis) return take(values, 0, axis=axis) @@ -654,7 +654,7 @@ def last(values, axis, skipna=None): if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN if is_chunked_array(values): - return nanlast(values, axis) + return chunked_nanlast(values, axis) else: return nputils.nanlast(values, axis) return take(values, -1, axis=axis) @@ -681,7 +681,7 @@ def _first_last_wrapper(array, *, axis, op, keepdims): return op(array, axis, keepdims=keepdims) -def _first_or_last(darray, axis, op): +def _chunked_first_or_last(darray, axis, op): chunkmanager = get_chunked_array_type(darray) # This will raise the same error message seen for numpy @@ -698,9 +698,9 @@ def _first_or_last(darray, axis, op): ) -def nanfirst(darray, axis): - return _first_or_last(darray, axis, op=nputils.nanfirst) +def chunked_nanfirst(darray, axis): + return _chunked_first_or_last(darray, axis, op=nputils.nanfirst) -def nanlast(darray, axis): - return _first_or_last(darray, axis, op=nputils.nanlast) +def chunked_nanlast(darray, axis): + return _chunked_first_or_last(darray, axis, op=nputils.nanlast) From 7fd4617549fc0e453055a2f7b4825b8bfc020206 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 24 Mar 2023 12:39:59 -0400 Subject: [PATCH 092/158] revert adding cubed to NON_NUMPY_SUPPORTED_ARRAY_TYPES --- xarray/core/utils.py | 4 ++-- xarray/core/variable.py | 19 +++++-------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 78d506296fe..86317272006 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -287,7 +287,7 @@ def either_dict_or_kwargs( def _is_scalar(value, include_0d): - from xarray.core.variable import _get_non_numpy_supported_array_types + from 
xarray.core.variable import NON_NUMPY_SUPPORTED_ARRAY_TYPES if include_0d: include_0d = getattr(value, "ndim", None) == 0 @@ -295,7 +295,7 @@ def _is_scalar(value, include_0d): include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + _get_non_numpy_supported_array_types()) + isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES()) or hasattr(value, "__array_function__") or hasattr(value, "__array_namespace__") ) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3cae34ec14d..a3a9dc43750 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,7 +1,6 @@ from __future__ import annotations import copy -import functools import itertools import math import numbers @@ -29,7 +28,6 @@ from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.pycompat import ( - DuckArrayModule, array_type, integer_types, is_0d_dask_array, @@ -50,17 +48,10 @@ maybe_coerce_to_str, ) - -@functools.cache -def _get_non_numpy_supported_array_types(): - """Required instead of a global to avoid circular import errors with cubed""" - - return ( - indexing.ExplicitlyIndexed, - pd.Index, - ) + DuckArrayModule("cubed").type - - +NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( + indexing.ExplicitlyIndexed, + pd.Index, +) # https://github.com/python/mypy/issues/224 BASIC_INDEXING_TYPES = integer_types + (slice,) @@ -263,7 +254,7 @@ def as_compatible_data(data, fastpath=False): if isinstance(data, (Variable, DataArray)): return data.data - if isinstance(data, _get_non_numpy_supported_array_types()): + if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES): data = _possibly_convert_datetime_or_timedelta_index(data) return _maybe_wrap_data(data) From 69d77c9d4a25483df3b72210213f3a8585e1d390 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 24 Mar 2023 13:09:38 -0400 Subject: [PATCH 093/158] licensing to vendor functions from dask --- README.md | 3 +- 
xarray/core/{daskvendor.py => daskcompat.py} | 32 +++++++++++++++++++- xarray/core/dataset.py | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) rename xarray/core/{daskvendor.py => daskcompat.py} (90%) diff --git a/README.md b/README.md index d5a7bcdaeb2..83c598e83bf 100644 --- a/README.md +++ b/README.md @@ -122,12 +122,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -Xarray bundles portions of pandas, NumPy and Seaborn, all of which are +Xarray bundles portions of pandas, NumPy, Seaborn and dask, all of which are available under a "3-clause BSD" license: - pandas: setup.py, xarray/util/print_versions.py - NumPy: xarray/core/npcompat.py - Seaborn: _determine_cmap_params in xarray/core/plot/utils.py +- Dask: xarray/core/daskcompat.py Xarray also bundles portions of CPython, which is available under the "Python Software Foundation License" in xarray/core/pycompat.py. diff --git a/xarray/core/daskvendor.py b/xarray/core/daskcompat.py similarity index 90% rename from xarray/core/daskvendor.py rename to xarray/core/daskcompat.py index 1a0719bec05..c12404decb4 100644 --- a/xarray/core/daskvendor.py +++ b/xarray/core/daskcompat.py @@ -1,4 +1,34 @@ -"""Functions vendored from dask.""" +# For reference, here is a copy of the dask copyright notice: + +# BSD 3-Clause License + +# Copyright (c) 2014, Anaconda, Inc. and contributors +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+ +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from __future__ import annotations diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 504f4dae312..5ff2b3706f0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -51,7 +51,7 @@ ) from xarray.core.computation import unify_chunks from xarray.core.coordinates import DatasetCoordinates, assert_coordinate_consistent -from xarray.core.daskvendor import normalize_chunks +from xarray.core.daskcompat import normalize_chunks from xarray.core.duck_array_ops import datetime_to_numeric from xarray.core.indexes import ( Index, From 83378578f65f07fbcadcfeddb14ae7b652c75c5f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 24 Mar 2023 13:40:41 -0400 Subject: [PATCH 094/158] fix bug --- xarray/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 86317272006..08625fe7d95 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -295,7 +295,7 @@ def _is_scalar(value, include_0d): include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES()) + isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES) or hasattr(value, "__array_function__") or hasattr(value, "__array_namespace__") ) From 9850a466dab4dcd5f42e8140224979a3d1fb0c85 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 24 Mar 2023 13:40:56 -0400 Subject: [PATCH 095/158] ignore mypy error --- xarray/core/parallelcompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index e2ab7a823d1..68fa0d91538 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -173,7 +173,7 @@ def rechunk( self, data: T_ChunkedArray, chunks: T_Chunks, **kwargs ) -> T_ChunkedArray: """Called when .chunk is called on an xarray object that is already chunked.""" - return data.rechunk(chunks, **kwargs) + return data.rechunk(chunks, **kwargs) # type: 
ignore[attr-defined] @abstractmethod def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: From 488fd5b7fc04c776f3122a2f797ce478fe1929ab Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 29 Mar 2023 16:03:10 -0400 Subject: [PATCH 096/158] separate chunk_manager kwarg from from_array_kwargs dict --- xarray/backends/api.py | 36 ++++++++---- xarray/backends/zarr.py | 19 +++++-- xarray/core/common.py | 119 +++++++++++++++++++++++++++++++-------- xarray/core/dataarray.py | 14 +++-- xarray/core/dataset.py | 21 +++++-- xarray/core/variable.py | 25 +++++--- 6 files changed, 179 insertions(+), 55 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8e9b0d60ce7..a241ac0443a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -307,10 +307,11 @@ def _chunk_ds( chunks, overwrite_encoded_chunks, inline_array, + chunk_manager, from_array_kwargs, **extra_tokens, ): - if from_array_kwargs["manager"] == "dask": + if chunk_manager == "dask": from dask.base import tokenize mtime = _get_mtime(filename_or_obj) @@ -332,6 +333,7 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs.copy(), ) return backend_ds._replace(variables) @@ -345,6 +347,7 @@ def _dataset_from_backend_dataset( cache, overwrite_encoded_chunks, inline_array, + chunk_manager, from_array_kwargs, **extra_tokens, ): @@ -364,6 +367,7 @@ def _dataset_from_backend_dataset( chunks, overwrite_encoded_chunks, inline_array, + chunk_manager, from_array_kwargs, **extra_tokens, ) @@ -392,6 +396,7 @@ def open_dataset( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, @@ -485,11 +490,15 @@ def open_dataset( itself, and each chunk refers to that 
task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. + chunk_manager: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. This is experimental API that should not be relied upon. + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. 
@@ -534,7 +543,7 @@ def open_dataset( engine = plugins.guess_engine(filename_or_obj) if from_array_kwargs is None: - from_array_kwargs = {"manager": None} + from_array_kwargs = {} backend = plugins.get_backend(engine) @@ -564,6 +573,7 @@ def open_dataset( cache, overwrite_encoded_chunks, inline_array, + chunk_manager, from_array_kwargs, drop_variables=drop_variables, **decoders, @@ -587,6 +597,7 @@ def open_dataarray( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, @@ -682,11 +693,15 @@ def open_dataarray( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. + chunk_manager: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. @@ -730,6 +745,7 @@ def open_dataarray( cache=cache, drop_variables=drop_variables, inline_array=inline_array, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, use_cftime=use_cftime, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 07cbe0b95d2..321e18a29cc 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -3,6 +3,7 @@ import json import os import warnings +from typing import Any import numpy as np @@ -703,7 +704,8 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, - from_array_kwargs=None, + chunk_manager: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -788,6 +790,15 @@ def open_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + chunk_manager: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
Returns ------- @@ -806,12 +817,11 @@ def open_zarr( from xarray.backends.api import open_dataset if from_array_kwargs is None: - from_array_kwargs = {"manager": None} + from_array_kwargs = {} if chunks == "auto": - manager = from_array_kwargs.get("manager", None) try: - guess_chunkmanager(manager) # attempt to import that parallel backend + guess_chunkmanager(chunk_manager) # attempt to import that parallel backend chunks = {} except ValueError: @@ -843,6 +853,7 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, decode_timedelta=decode_timedelta, diff --git a/xarray/core/common.py b/xarray/core/common.py index 87fa95c1af7..3ecc713058d 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -13,7 +13,7 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.pycompat import is_chunked_array from xarray.core.utils import ( @@ -1398,6 +1398,8 @@ def full_like( other: DataArray, fill_value: Any, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1408,6 +1410,8 @@ def full_like( other: Dataset, fill_value: Any, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... 
@@ -1418,6 +1422,8 @@ def full_like( other: Variable, fill_value: Any, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1428,6 +1434,8 @@ def full_like( other: Dataset | DataArray, fill_value: Any, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1438,6 +1446,8 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1447,9 +1457,14 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: - """Return a new object with the same shape and type as a given object. + """ + Return a new object with the same shape and type as a given object. + + Returned object will be chunked if if the given object is chunked, or if chunks or chunk_manager are specified. Parameters ---------- @@ -1462,12 +1477,18 @@ def full_like( dtype : dtype or dict-like of dtype, optional dtype of the new array. If a dict-like, maps dtypes to variables. If omitted, it defaults to other.dtype. + chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunk_manager: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. 
from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. - + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1584,6 +1605,8 @@ def full_like( v.variable, fill_value.get(k, dtypes.NA), dtype_.get(k, None), + chunks, + chunk_manager, from_array_kwargs, ) for k, v in other.data_vars.items() @@ -1593,7 +1616,14 @@ def full_like( if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a DataArray") return DataArray( - _full_like_variable(other.variable, fill_value, dtype, from_array_kwargs), + _full_like_variable( + other.variable, + fill_value, + dtype, + chunks, + chunk_manager, + from_array_kwargs, + ), dims=other.dims, coords=other.coords, attrs=other.attrs, @@ -1602,7 +1632,9 @@ def full_like( elif isinstance(other, Variable): if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a Variable") - return _full_like_variable(other, fill_value, dtype, from_array_kwargs) + return _full_like_variable( + other, fill_value, dtype, chunks, chunk_manager, from_array_kwargs + ) else: raise TypeError("Expected DataArray, Dataset, or Variable") @@ -1611,6 +1643,8 @@ def _full_like_variable( other: Variable, fill_value: Any, dtype: DTypeLike | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: 
dict[str, Any] | None = None, ) -> Variable: """Inner function of full_like, where other must be a variable""" @@ -1619,8 +1653,11 @@ def _full_like_variable( if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype) - if is_chunked_array(other.data): - chunkmanager = get_chunked_array_type(other.data) + if is_chunked_array(other.data) or chunk_manager is not None or chunks != {}: + if chunk_manager is None: + chunkmanager = get_chunked_array_type(other.data) + else: + chunkmanager = guess_chunkmanager(chunk_manager) if dtype is None: dtype = other.dtype @@ -1632,7 +1669,7 @@ def _full_like_variable( other.shape, fill_value, dtype=dtype, - chunks=other.data.chunks, + chunks=chunks if chunks else other.data.chunks, **from_array_kwargs, ) else: @@ -1645,6 +1682,8 @@ def _full_like_variable( def zeros_like( other: DataArray, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1654,6 +1693,8 @@ def zeros_like( def zeros_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1663,6 +1704,8 @@ def zeros_like( def zeros_like( other: Variable, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1672,6 +1715,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... 
@@ -1681,6 +1726,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1689,6 +1736,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of zeros with the same shape and @@ -1700,11 +1749,18 @@ def zeros_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. + chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunk_manager: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. 
Experimental API that should not be relied upon. Returns ------- @@ -1748,13 +1804,15 @@ def zeros_like( full_like """ - return full_like(other, 0, dtype, from_array_kwargs) + return full_like(other, 0, dtype, chunks, chunk_manager, from_array_kwargs) @overload def ones_like( other: DataArray, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1764,6 +1822,8 @@ def ones_like( def ones_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1773,6 +1833,8 @@ def ones_like( def ones_like( other: Variable, dtype: DTypeLikeSave | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1782,6 +1844,8 @@ def ones_like( def ones_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1791,6 +1855,8 @@ def ones_like( def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1799,6 +1865,8 @@ def ones_like( def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, + chunks={}, + chunk_manager: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of ones with the same shape and @@ -1810,11 +1878,18 @@ def ones_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. 
+ chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunk_manager: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1850,7 +1925,7 @@ def ones_like( full_like """ - return full_like(other, 1, dtype, from_array_kwargs) + return full_like(other, 1, dtype, chunks, chunk_manager, from_array_kwargs) def get_chunksizes( diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c4d9cf4908f..bc8d65b200b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1253,6 +1253,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + chunk_manager: str | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: @@ -1281,11 +1282,15 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. 
+ chunk_manager: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. 
@@ -1323,6 +1328,7 @@ def chunk( token=token, lock=lock, inline_array=inline_array, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs, ) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5ff2b3706f0..6e01ef55e7c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -276,12 +276,13 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, + chunk_manager=None, from_array_kwargs=None, ): if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - if from_array_kwargs["manager"] == "dask": + if chunk_manager == "dask": from dask.base import tokenize # when rechunking by different amounts, make sure dask names change @@ -298,6 +299,7 @@ def _maybe_chunk( name=name2, lock=lock, inline_array=inline_array, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs, ) @@ -2223,6 +2225,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + chunk_manager: str | None = None, from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: @@ -2251,11 +2254,15 @@ def chunk( inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + chunk_manager: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
+ Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided @@ -2290,8 +2297,9 @@ def chunk( f"some chunks keys are not dimensions on this object: {bad_dims}" ) + chunk_manager = guess_chunkmanager_name(chunk_manager) if from_array_kwargs is None: - from_array_kwargs = {"manager": guess_chunkmanager_name(None)} + from_array_kwargs = {} variables = { k: _maybe_chunk( @@ -2302,6 +2310,7 @@ def chunk( lock, name_prefix, inline_array=inline_array, + chunk_manager=chunk_manager, from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() diff --git a/xarray/core/variable.py b/xarray/core/variable.py index a3a9dc43750..3da3b3c491c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1150,6 +1150,7 @@ def chunk( name: str | None = None, lock: bool = False, inline_array: bool = False, + chunk_manager: str | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: @@ -1177,6 +1178,10 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + chunk_manager: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. 
from_array_kwargs: dict Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. @@ -1198,14 +1203,6 @@ def chunk( dask.array.from_array """ - if from_array_kwargs is None: - from_array_kwargs = {} - chunk_manager = guess_chunkmanager(from_array_kwargs.pop("manager", None)) - - _from_array_kwargs = dict( - name=name, lock=lock, inline_array=inline_array, **from_array_kwargs - ) - if chunks is None: warnings.warn( "None value for 'chunks' is deprecated. " @@ -1215,6 +1212,7 @@ def chunk( chunks = {} if isinstance(chunks, (float, str, int, tuple, list)): + # TODO we shouldn't assume here that other chunkmanagers can handle these types pass # dask.array.from_array can handle these directly else: chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") @@ -1222,7 +1220,15 @@ def chunk( if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} - data = chunk_manager.from_array(self._data, chunks, **_from_array_kwargs) + _chunk_manager = guess_chunkmanager(chunk_manager) + + if from_array_kwargs is None: + from_array_kwargs = {} + _from_array_kwargs = dict( + name=name, lock=lock, inline_array=inline_array, **from_array_kwargs + ) + + data = _chunk_manager.from_array(self._data, chunks, **_from_array_kwargs) return self._replace(data=data) @@ -2878,6 +2884,7 @@ def chunk( name=None, lock=False, inline_array=False, + chunk_manager=None, from_array_kwargs=None, ): # Dummy - do not chunk. This method is invoked e.g. 
by Dataset.chunk() From 00bcf6c90c4caf7dbcde219a38480b9f8ac0a294 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 29 Mar 2023 17:21:00 -0400 Subject: [PATCH 097/158] rename kwarg to chunked_array_type --- xarray/backends/api.py | 22 ++++----- xarray/backends/zarr.py | 16 ++++--- xarray/core/common.py | 74 ++++++++++++++--------------- xarray/core/computation.py | 4 +- xarray/core/dataarray.py | 12 ++--- xarray/core/dataset.py | 20 ++++---- xarray/core/variable.py | 14 +++--- xarray/tests/test_parallelcompat.py | 4 +- 8 files changed, 84 insertions(+), 82 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a241ac0443a..b2f6d850070 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -307,11 +307,11 @@ def _chunk_ds( chunks, overwrite_encoded_chunks, inline_array, - chunk_manager, + chunked_array_type, from_array_kwargs, **extra_tokens, ): - if chunk_manager == "dask": + if chunked_array_type == "dask": from dask.base import tokenize mtime = _get_mtime(filename_or_obj) @@ -333,7 +333,7 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs.copy(), ) return backend_ds._replace(variables) @@ -347,7 +347,7 @@ def _dataset_from_backend_dataset( cache, overwrite_encoded_chunks, inline_array, - chunk_manager, + chunked_array_type, from_array_kwargs, **extra_tokens, ): @@ -367,7 +367,7 @@ def _dataset_from_backend_dataset( chunks, overwrite_encoded_chunks, inline_array, - chunk_manager, + chunked_array_type, from_array_kwargs, **extra_tokens, ) @@ -396,7 +396,7 @@ def open_dataset( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = 
None, **kwargs, @@ -490,7 +490,7 @@ def open_dataset( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. @@ -573,7 +573,7 @@ def open_dataset( cache, overwrite_encoded_chunks, inline_array, - chunk_manager, + chunked_array_type, from_array_kwargs, drop_variables=drop_variables, **decoders, @@ -597,7 +597,7 @@ def open_dataarray( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, @@ -693,7 +693,7 @@ def open_dataarray( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
@@ -745,7 +745,7 @@ def open_dataarray( cache=cache, drop_variables=drop_variables, inline_array=inline_array, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, use_cftime=use_cftime, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 321e18a29cc..769bc3b7cd6 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -704,7 +704,7 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, **kwargs, ): @@ -790,15 +790,15 @@ def open_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
Returns ------- @@ -821,7 +821,9 @@ def open_zarr( if chunks == "auto": try: - guess_chunkmanager(chunk_manager) # attempt to import that parallel backend + guess_chunkmanager( + chunked_array_type + ) # attempt to import that parallel backend chunks = {} except ValueError: @@ -853,7 +855,7 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, decode_timedelta=decode_timedelta, diff --git a/xarray/core/common.py b/xarray/core/common.py index 3ecc713058d..a852113c526 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1399,7 +1399,7 @@ def full_like( fill_value: Any, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1411,7 +1411,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1423,7 +1423,7 @@ def full_like( fill_value: Any, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1435,7 +1435,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1447,7 +1447,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... 
@@ -1458,13 +1458,13 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """ Return a new object with the same shape and type as a given object. - Returned object will be chunked if if the given object is chunked, or if chunks or chunk_manager are specified. + Returned object will be chunked if the given object is chunked, or if chunks or chunked_array_type are specified. Parameters ---------- @@ -1480,7 +1480,7 @@ def full_like( chunks : int, "auto", tuple of int or mapping of Hashable to int, optional Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or ``{"x": 5, "y": 5}``. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
@@ -1606,7 +1606,7 @@ def full_like( fill_value.get(k, dtypes.NA), dtype_.get(k, None), chunks, - chunk_manager, + chunked_array_type, from_array_kwargs, ) for k, v in other.data_vars.items() @@ -1621,7 +1621,7 @@ def full_like( fill_value, dtype, chunks, - chunk_manager, + chunked_array_type, from_array_kwargs, ), dims=other.dims, @@ -1633,7 +1633,7 @@ def full_like( if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a Variable") return _full_like_variable( - other, fill_value, dtype, chunks, chunk_manager, from_array_kwargs + other, fill_value, dtype, chunks, chunked_array_type, from_array_kwargs ) else: raise TypeError("Expected DataArray, Dataset, or Variable") @@ -1644,7 +1644,7 @@ def _full_like_variable( fill_value: Any, dtype: DTypeLike | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: """Inner function of full_like, where other must be a variable""" @@ -1653,11 +1653,11 @@ def _full_like_variable( if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype) - if is_chunked_array(other.data) or chunk_manager is not None or chunks != {}: - if chunk_manager is None: + if is_chunked_array(other.data) or chunked_array_type is not None or chunks != {}: + if chunked_array_type is None: chunkmanager = get_chunked_array_type(other.data) else: - chunkmanager = guess_chunkmanager(chunk_manager) + chunkmanager = guess_chunkmanager(chunked_array_type) if dtype is None: dtype = other.dtype @@ -1683,7 +1683,7 @@ def zeros_like( other: DataArray, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... 
@@ -1694,7 +1694,7 @@ def zeros_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1705,7 +1705,7 @@ def zeros_like( other: Variable, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1716,7 +1716,7 @@ def zeros_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1727,7 +1727,7 @@ def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1737,7 +1737,7 @@ def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of zeros with the same shape and @@ -1752,15 +1752,15 @@ def zeros_like( chunks : int, "auto", tuple of int or mapping of Hashable to int, optional Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or ``{"x": 5, "y": 5}``. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1804,7 +1804,7 @@ def zeros_like( full_like """ - return full_like(other, 0, dtype, chunks, chunk_manager, from_array_kwargs) + return full_like(other, 0, dtype, chunks, chunked_array_type, from_array_kwargs) @overload @@ -1812,7 +1812,7 @@ def ones_like( other: DataArray, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @@ -1823,7 +1823,7 @@ def ones_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @@ -1834,7 +1834,7 @@ def ones_like( other: Variable, dtype: DTypeLikeSave | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @@ -1845,7 +1845,7 @@ def ones_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... 
@@ -1856,7 +1856,7 @@ def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1866,7 +1866,7 @@ def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, chunks={}, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of ones with the same shape and @@ -1881,15 +1881,15 @@ def ones_like( chunks : int, "auto", tuple of int or mapping of Hashable to int, optional Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or ``{"x": 5, "y": 5}``. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
Returns ------- @@ -1925,7 +1925,7 @@ def ones_like( full_like """ - return full_like(other, 1, dtype, chunks, chunk_manager, from_array_kwargs) + return full_like(other, 1, dtype, chunks, chunked_array_type, from_array_kwargs) def get_chunksizes( diff --git a/xarray/core/computation.py b/xarray/core/computation.py index f7349f1882a..0030c0259dd 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -694,7 +694,7 @@ def apply_variable_ufunc( "``.load()`` or ``.compute()``" ) elif dask == "parallelized": - chunk_manager = get_chunked_array_type(*input_data) + chunkmanager = get_chunked_array_type(*input_data) numpy_func = func @@ -743,7 +743,7 @@ def apply_variable_ufunc( ) def func(*arrays): - res = chunk_manager.apply_gufunc( + res = chunkmanager.apply_gufunc( numpy_func, signature.to_gufunc_string(exclude_dims), *arrays, diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bc8d65b200b..e1695b673f8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1253,7 +1253,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: @@ -1282,15 +1282,15 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. @@ -1328,7 +1328,7 @@ def chunk( token=token, lock=lock, inline_array=inline_array, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, ) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6e01ef55e7c..95137354061 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -276,13 +276,13 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, - chunk_manager=None, + chunked_array_type=None, from_array_kwargs=None, ): if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - if chunk_manager == "dask": + if chunked_array_type == "dask": from dask.base import tokenize # when rechunking by different amounts, make sure dask names change @@ -299,7 +299,7 @@ def _maybe_chunk( name=name2, lock=lock, inline_array=inline_array, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, ) @@ -2225,7 +2225,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - 
chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: @@ -2254,15 +2254,15 @@ def chunk( inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. 
One of chunks or chunks_kwargs must be provided @@ -2297,7 +2297,7 @@ def chunk( f"some chunks keys are not dimensions on this object: {bad_dims}" ) - chunk_manager = guess_chunkmanager_name(chunk_manager) + chunked_array_type = guess_chunkmanager_name(chunked_array_type) if from_array_kwargs is None: from_array_kwargs = {} @@ -2310,7 +2310,7 @@ def chunk( lock, name_prefix, inline_array=inline_array, - chunk_manager=chunk_manager, + chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3da3b3c491c..b7f96795aaa 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1150,7 +1150,7 @@ def chunk( name: str | None = None, lock: bool = False, inline_array: bool = False, - chunk_manager: str | None = None, + chunked_array_type: str | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: @@ -1178,13 +1178,13 @@ def chunk( inline_array: optional Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - chunk_manager: str, optional + chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManager.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `manager` kwarg. + Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
**chunks_kwargs : {dim: chunks, ...}, optional @@ -1220,7 +1220,7 @@ def chunk( if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} - _chunk_manager = guess_chunkmanager(chunk_manager) + chunkmanager = guess_chunkmanager(chunked_array_type) if from_array_kwargs is None: from_array_kwargs = {} @@ -1228,7 +1228,7 @@ def chunk( name=name, lock=lock, inline_array=inline_array, **from_array_kwargs ) - data = _chunk_manager.from_array(self._data, chunks, **_from_array_kwargs) + data = chunkmanager.from_array(self._data, chunks, **_from_array_kwargs) return self._replace(data=data) @@ -2884,7 +2884,7 @@ def chunk( name=None, lock=False, inline_array=False, - chunk_manager=None, + chunked_array_type=None, from_array_kwargs=None, ): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 8c93437530d..954625ca56b 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -141,8 +141,8 @@ def test_fail_on_nonexistent_chunkmanager(self): @requires_dask def test_get_dask_if_installed(self): - chunk_manager = guess_chunkmanager(None) - assert isinstance(chunk_manager, DaskManager) + chunkmanager = guess_chunkmanager(None) + assert isinstance(chunkmanager, DaskManager) @pytest.mark.skipif(has_dask, reason="requires dask not to be installed") def test_dont_get_dask_if_not_installed(self): From 844726ddcc9c8b8581977540158fc8afe73fe58f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 4 Apr 2023 16:13:28 -0400 Subject: [PATCH 098/158] refactor from_array_kwargs in .chunk ready for deprecation --- xarray/core/daskmanager.py | 1 + xarray/core/dataset.py | 15 ++++++---- xarray/core/utils.py | 60 ++++++++++++++++++++++++++++++++++++++ xarray/core/variable.py | 25 +++++++++------- 4 files changed, 85 insertions(+), 16 deletions(-) diff --git a/xarray/core/daskmanager.py 
b/xarray/core/daskmanager.py index 5e0d94d1c15..655569a6c82 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -42,6 +42,7 @@ def from_array(self, data, chunks, **kwargs) -> "DaskArray": if is_duck_dask_array(data): data = self.rechunk(data, chunks) else: + # TODO move this up to variable.chunk if isinstance(data, indexing.ExplicitlyIndexed): # Unambiguously handle array storage backends (like NetCDF4 and h5py) # that can't handle general array indexing. For example, in netCDF4 you diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 95137354061..fcb628160ac 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -281,7 +281,9 @@ def _maybe_chunk( ): if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} + if var.ndim: + chunked_array_type = guess_chunkmanager_name(chunked_array_type) if chunked_array_type == "dask": from dask.base import tokenize @@ -290,15 +292,16 @@ def _maybe_chunk( # subtle bugs result otherwise. 
see GH3350 token2 = tokenize(name, token if token else var._data, chunks) name2 = f"{name_prefix}{name}-{token2}" - else: - # not used - name2 = None + from_array_kwargs["name"] = name2 + + from_array_kwargs = utils.consolidate_dask_from_array_kwargs( + from_array_kwargs, + lock=lock, + inline_array=inline_array, + ) var = var.chunk( chunks, - name=name2, - lock=lock, - inline_array=inline_array, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, ) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 08625fe7d95..0a19cfade20 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -1202,3 +1202,63 @@ def emit_user_level_warning(message, category=None): """Emit a warning at the user level by inspecting the stack trace.""" stacklevel = find_stack_level() warnings.warn(message, category=category, stacklevel=stacklevel) + + +def consolidate_dask_from_array_kwargs( + from_array_kwargs, name=None, lock=None, inline_array=None +): + """ + Merge dask-specific kwargs with arbitrary from_array_kwargs dict. + + Temporary function, to be deleted once explicitly passing dask-specific kwargs to .chunk() is deprecated. 
+ """ + + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="name", + passed_kwarg_value=name, + default=None, + err_msg_dict_name="from_array_kwargs", + ) + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="lock", + passed_kwarg_value=lock, + default=False, + err_msg_dict_name="from_array_kwargs", + ) + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="inline_array", + passed_kwarg_value=inline_array, + default=False, + err_msg_dict_name="from_array_kwargs", + ) + + return from_array_kwargs + + +def _resolve_doubly_passed_kwarg( + kwargs_dict, + kwarg_name, + passed_kwarg_value, + default, + err_msg_dict_name, +): + # if in kwargs_dict but not passed explicitly then just pass kwargs_dict through unaltered + if kwarg_name in kwargs_dict and passed_kwarg_value is None: + pass + # if passed explicitly but not in kwargs_dict then use that + elif kwarg_name not in kwargs_dict and passed_kwarg_value is not None: + kwargs_dict[kwarg_name] = passed_kwarg_value + # if in neither then use default + elif kwarg_name not in kwargs_dict and passed_kwarg_value is None: + kwargs_dict[kwarg_name] = default + # if in both then raise + else: + raise ValueError( + f"argument {kwarg_name} cannot be passed both as a keyword argument and within " + f"the {err_msg_dict_name} dictionary" + ) + + return kwargs_dict diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b7f96795aaa..d369b015c0d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1148,8 +1148,8 @@ def chunk( | Mapping[Any, None | int | tuple[int, ...]] ) = {}, name: str | None = None, - lock: bool = False, - inline_array: bool = False, + lock: bool | None = None, + inline_array: bool | None = None, chunked_array_type: str | None = None, from_array_kwargs=None, **chunks_kwargs: Any, @@ -1172,17 +1172,17 @@ def chunk( name : str, optional Used to generate the name for this array in the 
internal dask graph. Does not need not be unique. - lock : optional + lock : bool, optional Passed on to :py:func:`dask.array.from_array`, if the array is not - already as dask array. - inline_array: optional + already as dask array. Default is False. + inline_array : bool, optional Passed on to :py:func:`dask.array.from_array`, if the array is not - already as dask array. + already as dask array. Default is False. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to @@ -1224,8 +1224,13 @@ def chunk( if from_array_kwargs is None: from_array_kwargs = {} - _from_array_kwargs = dict( - name=name, lock=lock, inline_array=inline_array, **from_array_kwargs + + # TODO deprecate passing these dask-specific arguments explicitly. 
In future just pass everything via from_array_kwargs + _from_array_kwargs = utils.consolidate_dask_from_array_kwargs( + from_array_kwargs, + name=name, + lock=lock, + inline_array=inline_array, ) data = chunkmanager.from_array(self._data, chunks, **_from_array_kwargs) From 3d56a3df1100e80f807a216a6ace77fcce10ee84 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Apr 2023 11:23:02 -0400 Subject: [PATCH 099/158] print statements in test so I can comment on them --- xarray/tests/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 12e101a475d..ca5905f78a9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3604,6 +3604,8 @@ def test_open_mfdataset(self) -> None: ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) + print(original) + print(actual) assert_identical(original, actual) with open_mfdataset( [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} From 1952c55826d9cf6ebafd0964e37b3316b06ac52e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Apr 2023 11:27:52 -0400 Subject: [PATCH 100/158] remove print statements now I've commented on them in PR --- xarray/backends/api.py | 4 +++- xarray/tests/test_backends.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 08345d80028..f6ef9a22afe 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -32,6 +32,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk from xarray.core.indexes import Index +from xarray.core.parallelcompat import guess_chunkmanager_name from xarray.core.utils import is_remote_uri if TYPE_CHECKING: @@ -310,12 +311,13 @@ def _chunk_ds( from_array_kwargs, **extra_tokens, ): + chunked_array_type = guess_chunkmanager_name(chunked_array_type) if 
chunked_array_type == "dask": from dask.base import tokenize mtime = _get_mtime(filename_or_obj) token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = f"open_dataset-{token}" + name_prefix = "open_dataset-" else: # not used token = (None,) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ca5905f78a9..12e101a475d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3604,8 +3604,6 @@ def test_open_mfdataset(self) -> None: ) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) - print(original) - print(actual) assert_identical(original, actual) with open_mfdataset( [tmp1, tmp2], concat_dim="x", combine="nested", chunks={"x": 3} From 3ba8d42971bb8468a710e919009a8fac3dbd7288 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Apr 2023 11:33:08 -0400 Subject: [PATCH 101/158] should fix dask naming tests --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fcb628160ac..27a7d528b95 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -292,10 +292,10 @@ def _maybe_chunk( # subtle bugs result otherwise. 
see GH3350 token2 = tokenize(name, token if token else var._data, chunks) name2 = f"{name_prefix}{name}-{token2}" - from_array_kwargs["name"] = name2 from_array_kwargs = utils.consolidate_dask_from_array_kwargs( from_array_kwargs, + name=name2, lock=lock, inline_array=inline_array, ) From 53d6094f9624a459512e9f88a6bb5c55d0e65a19 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Apr 2023 14:53:14 -0400 Subject: [PATCH 102/158] make dask-specific kwargs explicit in from_array --- xarray/core/daskmanager.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 655569a6c82..1b551c0a6a2 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -29,16 +29,13 @@ def is_chunked_array(self, data: Any) -> bool: def chunks(self, data: "DaskArray") -> T_Chunks: return data.chunks - def from_array(self, data, chunks, **kwargs) -> "DaskArray": + def from_array( + self, data, chunks, name=None, lock=False, inline_array=False + ) -> "DaskArray": import dask.array as da from xarray.core import indexing - # dask-specific kwargs - name = kwargs.pop("name", None) - lock = kwargs.pop("lock", False) - inline_array = kwargs.pop("inline_array", False) - if is_duck_dask_array(data): data = self.rechunk(data, chunks) else: From 7dc658186194aa31e788e106c7527a8909de0a2e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 01:51:11 -0400 Subject: [PATCH 103/158] debugging print statements --- xarray/backends/api.py | 2 ++ xarray/core/dataset.py | 2 ++ xarray/core/variable.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f6ef9a22afe..21ecbac05ea 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -323,6 +323,8 @@ def _chunk_ds( token = (None,) name_prefix = None + print(f"inside _chunk_ds chunks={chunks}") + variables = {} for name, var in backend_ds.variables.items(): var_chunks = 
_get_chunk(var, chunks) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d9379fbe56c..2d6ad6ba62d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -282,6 +282,8 @@ def _maybe_chunk( if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} + print(f"inside _maybe_chunk chunks={chunks}") + if var.ndim: chunked_array_type = guess_chunkmanager_name(chunked_array_type) if chunked_array_type == "dask": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 8c84a2914d5..3b2370167c9 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1241,6 +1241,8 @@ def chunk( inline_array=inline_array, ) + print(f"inside variable.chunk chunks={chunks}") + data = chunkmanager.from_array(self._data, chunks, **_from_array_kwargs) return self._replace(data=data) From fcaf49983de7b3bbb27f17de7b52b4fe1ca4a900 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 02:46:45 -0400 Subject: [PATCH 104/158] Revert "debugging print statements" This reverts commit 7dc658186194aa31e788e106c7527a8909de0a2e. 
--- xarray/backends/api.py | 2 -- xarray/core/dataset.py | 2 -- xarray/core/variable.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 21ecbac05ea..f6ef9a22afe 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -323,8 +323,6 @@ def _chunk_ds( token = (None,) name_prefix = None - print(f"inside _chunk_ds chunks={chunks}") - variables = {} for name, var in backend_ds.variables.items(): var_chunks = _get_chunk(var, chunks) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2d6ad6ba62d..d9379fbe56c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -282,8 +282,6 @@ def _maybe_chunk( if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} - print(f"inside _maybe_chunk chunks={chunks}") - if var.ndim: chunked_array_type = guess_chunkmanager_name(chunked_array_type) if chunked_array_type == "dask": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3b2370167c9..8c84a2914d5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1241,8 +1241,6 @@ def chunk( inline_array=inline_array, ) - print(f"inside variable.chunk chunks={chunks}") - data = chunkmanager.from_array(self._data, chunks, **_from_array_kwargs) return self._replace(data=data) From 64df7e8f9962855ff40a0f2118987bc731ff545a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 02:47:36 -0400 Subject: [PATCH 105/158] fix gnarly bug with auto-determining chunksizes caused by not referring to dask.config --- xarray/core/daskcompat.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xarray/core/daskcompat.py b/xarray/core/daskcompat.py index c12404decb4..e35e8924cbf 100644 --- a/xarray/core/daskcompat.py +++ b/xarray/core/daskcompat.py @@ -386,7 +386,13 @@ def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None): return tuple(chunks) if limit is None: - limit = "128MiB" # 
config.get("array.chunk-size") + try: + from dask import config + + # TODO plug this into configuration of other chunk managers + limit = config.get("array.chunk-size") + except ImportError: + limit = "128MiB" if isinstance(limit, str): limit = parse_bytes(limit) From 747ada5357933d2aa2299a18d5ae8fb00d8b8b78 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 02:53:26 -0400 Subject: [PATCH 106/158] hopefully fix broken docstring --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f6ef9a22afe..a3e3a50ada0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -694,7 +694,7 @@ def open_dataarray( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. - chunked_array_type: str, optional + chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. From 9b33ab718e4deb80dcc34efb35600a5f2c657ee4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 02:54:53 -0400 Subject: [PATCH 107/158] Revert "make dask-specific kwargs explicit in from_array" This reverts commit 53d6094f9624a459512e9f88a6bb5c55d0e65a19. 
--- xarray/core/daskmanager.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 1b551c0a6a2..655569a6c82 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -29,13 +29,16 @@ def is_chunked_array(self, data: Any) -> bool: def chunks(self, data: "DaskArray") -> T_Chunks: return data.chunks - def from_array( - self, data, chunks, name=None, lock=False, inline_array=False - ) -> "DaskArray": + def from_array(self, data, chunks, **kwargs) -> "DaskArray": import dask.array as da from xarray.core import indexing + # dask-specific kwargs + name = kwargs.pop("name", None) + lock = kwargs.pop("lock", False) + inline_array = kwargs.pop("inline_array", False) + if is_duck_dask_array(data): data = self.rechunk(data, chunks) else: From 6a7a0430c97fcca93266836070fcb5b7b5bba636 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 11:30:32 -0400 Subject: [PATCH 108/158] show chunksize limit used in failing tests --- xarray/core/daskcompat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/core/daskcompat.py b/xarray/core/daskcompat.py index e35e8924cbf..20622dc64c4 100644 --- a/xarray/core/daskcompat.py +++ b/xarray/core/daskcompat.py @@ -393,6 +393,9 @@ def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None): limit = config.get("array.chunk-size") except ImportError: limit = "128MiB" + + print(f"array.chunk-size limit used = {limit}") + if isinstance(limit, str): limit = parse_bytes(limit) From 20f92c6c8866b3801dc32d49f8c912a74fa36bb8 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 13:10:08 -0400 Subject: [PATCH 109/158] move lazy indexing adapter up out of chunkmanager code --- xarray/core/daskmanager.py | 53 +++++++---------------------------- xarray/core/parallelcompat.py | 1 + xarray/core/variable.py | 26 ++++++++++++++++- 3 files changed, 36 insertions(+), 44 deletions(-) diff --git 
a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 655569a6c82..757f169c00f 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -3,8 +3,8 @@ import numpy as np -from xarray.core import utils from xarray.core.duck_array_ops import dask_available +from xarray.core.indexing import ImplicitToExplicitIndexingAdapter from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks from xarray.core.pycompat import is_duck_dask_array @@ -32,48 +32,15 @@ def chunks(self, data: "DaskArray") -> T_Chunks: def from_array(self, data, chunks, **kwargs) -> "DaskArray": import dask.array as da - from xarray.core import indexing - - # dask-specific kwargs - name = kwargs.pop("name", None) - lock = kwargs.pop("lock", False) - inline_array = kwargs.pop("inline_array", False) - - if is_duck_dask_array(data): - data = self.rechunk(data, chunks) - else: - # TODO move this up to variable.chunk - if isinstance(data, indexing.ExplicitlyIndexed): - # Unambiguously handle array storage backends (like NetCDF4 and h5py) - # that can't handle general array indexing. For example, in netCDF4 you - # can do "outer" indexing along two dimensions independent, which works - # differently from how NumPy handles it. - # da.from_array works by using lazy indexing with a tuple of slices. - # Using OuterIndexer is a pragmatic choice: dask does not yet handle - # different indexing types in an explicit way: - # https://github.com/dask/dask/issues/2883 - data = indexing.ImplicitToExplicitIndexingAdapter( - data, indexing.OuterIndexer - ) - - # All of our lazily loaded backend array classes should use NumPy - # array operations. 
- dask_kwargs = {"meta": np.ndarray} - else: - dask_kwargs = {} - - if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) - - data = da.from_array( - data, - chunks, - name=name, - lock=lock, - inline_array=inline_array, - **dask_kwargs, - ) - return data + if isinstance(data, ImplicitToExplicitIndexingAdapter): + # lazily loaded backend array classes should use NumPy array operations. + kwargs["meta"] = np.ndarray + + return da.from_array( + data, + chunks, + **kwargs, + ) def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: from dask.array import compute diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 68fa0d91538..a0f9eb5b5c5 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -167,6 +167,7 @@ def chunks(self, data: T_ChunkedArray) -> T_Chunks: def from_array( self, data: np.ndarray, chunks: T_Chunks, **kwargs ) -> T_ChunkedArray: + """Called when .chunk is called on an xarray object that is not already chunked.""" ... def rechunk( diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 8c84a2914d5..86dafa0a3fa 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1241,7 +1241,31 @@ def chunk( inline_array=inline_array, ) - data = chunkmanager.from_array(self._data, chunks, **_from_array_kwargs) + data = self._data + if chunkmanager.is_chunked_array(data): + data = chunkmanager.rechunk(data, chunks) + else: + if isinstance(data, indexing.ExplicitlyIndexed): + # Unambiguously handle array storage backends (like NetCDF4 and h5py) + # that can't handle general array indexing. For example, in netCDF4 you + # can do "outer" indexing along two dimensions independent, which works + # differently from how NumPy handles it. + # da.from_array works by using lazy indexing with a tuple of slices. 
+ # Using OuterIndexer is a pragmatic choice: dask does not yet handle + # different indexing types in an explicit way: + # https://github.com/dask/dask/issues/2883 + data = indexing.ImplicitToExplicitIndexingAdapter( + data, indexing.OuterIndexer + ) + + if utils.is_dict_like(chunks): + chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) + + data = chunkmanager.from_array( + data, + chunks, + **_from_array_kwargs, + ) return self._replace(data=data) From 796a577cbccf14701a0da1f5cb8860006b78bba4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 6 Apr 2023 19:34:37 -0400 Subject: [PATCH 110/158] try upgrading minimum version of dask --- ci/requirements/min-all-deps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index e50d08264b8..6c77d6538f1 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -14,7 +14,7 @@ dependencies: - cdms2=3.1 - cftime=1.5 - coveralls - - dask-core=2022.1 + - dask-core=2022.9.2 - distributed=2022.1 - flox=0.5 - h5netcdf=0.13 From 29d0c920d6a70a97c914d52bd73b6237d97295d4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 11 Apr 2023 14:36:42 -0400 Subject: [PATCH 111/158] Revert "try upgrading minimum version of dask" This reverts commit 796a577cbccf14701a0da1f5cb8860006b78bba4. 
--- ci/requirements/min-all-deps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 6c77d6538f1..e50d08264b8 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -14,7 +14,7 @@ dependencies: - cdms2=3.1 - cftime=1.5 - coveralls - - dask-core=2022.9.2 + - dask-core=2022.1 - distributed=2022.1 - flox=0.5 - h5netcdf=0.13 From 031017b3c36fb35498afa600c45e2f432ef33d3a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 11 Apr 2023 16:13:43 -0400 Subject: [PATCH 112/158] un-vendor dask.array.core.normalize_chunks --- xarray/backends/api.py | 7 +- xarray/core/daskcompat.py | 519 ---------------------------- xarray/core/daskmanager.py | 19 + xarray/core/dataset.py | 5 +- xarray/core/parallelcompat.py | 13 + xarray/core/variable.py | 1 + xarray/tests/test_parallelcompat.py | 14 +- 7 files changed, 53 insertions(+), 525 deletions(-) delete mode 100644 xarray/core/daskcompat.py diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a3e3a50ada0..454aa2d50bb 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -32,7 +32,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk from xarray.core.indexes import Index -from xarray.core.parallelcompat import guess_chunkmanager_name +from xarray.core.parallelcompat import guess_chunkmanager, guess_chunkmanager_name from xarray.core.utils import is_remote_uri if TYPE_CHECKING: @@ -311,6 +311,7 @@ def _chunk_ds( from_array_kwargs, **extra_tokens, ): + # TODO refactor this so chunked_array_type = guess_chunkmanager_name(chunked_array_type) if chunked_array_type == "dask": from dask.base import tokenize @@ -323,9 +324,11 @@ def _chunk_ds( token = (None,) name_prefix = None + chunkmanager = guess_chunkmanager(chunked_array_type) + variables = {} for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, 
chunks) + var_chunks = _get_chunk(var, chunks, chunkmanager) variables[name] = _maybe_chunk( name, var, diff --git a/xarray/core/daskcompat.py b/xarray/core/daskcompat.py deleted file mode 100644 index 20622dc64c4..00000000000 --- a/xarray/core/daskcompat.py +++ /dev/null @@ -1,519 +0,0 @@ -# For reference, here is a copy of the dask copyright notice: - -# BSD 3-Clause License - -# Copyright (c) 2014, Anaconda, Inc. and contributors -# All rights reserved. - -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: - -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. - -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. - -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from __future__ import annotations - -import collections -import math -from numbers import Integral, Number - -import numpy as np - - -def is_integer(i) -> bool: - """ - >>> is_integer(6) - True - >>> is_integer(42.0) - True - >>> is_integer("abc") - False - """ - return isinstance(i, Integral) or (isinstance(i, float) and i.is_integer()) - - -def parse_bytes(s: float | str) -> int: - """Parse byte string to numbers - >>> from dask.utils import parse_bytes - >>> parse_bytes("100") - 100 - >>> parse_bytes("100 MB") - 100000000 - >>> parse_bytes("100M") - 100000000 - >>> parse_bytes("5kB") - 5000 - >>> parse_bytes("5.4 kB") - 5400 - >>> parse_bytes("1kiB") - 1024 - >>> parse_bytes("1e6") - 1000000 - >>> parse_bytes("1e6 kB") - 1000000000 - >>> parse_bytes("MB") - 1000000 - >>> parse_bytes(123) - 123 - >>> parse_bytes("5 foos") - Traceback (most recent call last): - ... - ValueError: Could not interpret 'foos' as a byte unit - """ - if isinstance(s, (int, float)): - return int(s) - s = s.replace(" ", "") - if not any(char.isdigit() for char in s): - s = "1" + s - - for i in range(len(s) - 1, -1, -1): - if not s[i].isalpha(): - break - index = i + 1 - - prefix = s[:index] - suffix = s[index:] - - try: - n = float(prefix) - except ValueError as e: - raise ValueError("Could not interpret '%s' as a number" % prefix) from e - - try: - multiplier = byte_sizes[suffix.lower()] - except KeyError as e: - raise ValueError("Could not interpret '%s' as a byte unit" % suffix) from e - - result = n * multiplier - return int(result) - - -byte_sizes = { - "kB": 10**3, - "MB": 10**6, - "GB": 10**9, - "TB": 10**12, - "PB": 10**15, - "KiB": 2**10, - "MiB": 2**20, - "GiB": 2**30, - "TiB": 2**40, - "PiB": 2**50, - "B": 1, - "": 1, -} -byte_sizes = {k.lower(): v for k, v in byte_sizes.items()} -byte_sizes.update({k[0]: v for k, v in byte_sizes.items() if k and "i" not in k}) -byte_sizes.update({k[:-1]: v for k, v in byte_sizes.items() if k and "i" in k}) -unknown_chunk_message = ( - 
"\n\n" - "A possible solution: " - "https://docs.dask.org/en/latest/array-chunks.html#unknown-chunks\n" - "Summary: to compute chunks sizes, use\n\n" - " x.compute_chunk_sizes() # for Dask Array `x`\n" - " ddf.to_dask_array(lengths=True) # for Dask DataFrame `ddf`" -) - - -def blockdims_from_blockshape(shape, chunks): - """ - Vendored from dask.array.core - - >>> blockdims_from_blockshape((10, 10), (4, 3)) - ((4, 4, 2), (3, 3, 3, 1)) - >>> blockdims_from_blockshape((10, 0), (4, 0)) - ((4, 4, 2), (0,)) - """ - if chunks is None: - raise TypeError("Must supply chunks= keyword argument") - if shape is None: - raise TypeError("Must supply shape= keyword argument") - if np.isnan(sum(shape)) or np.isnan(sum(chunks)): - raise ValueError( - "Array chunk sizes are unknown. shape: %s, chunks: %s%s" - % (shape, chunks, unknown_chunk_message) - ) - if not all(map(is_integer, chunks)): - raise ValueError("chunks can only contain integers.") - if not all(map(is_integer, shape)): - raise ValueError("shape can only contain integers.") - shape = tuple(map(int, shape)) - chunks = tuple(map(int, chunks)) - return tuple( - ((bd,) * (d // bd) + ((d % bd,) if d % bd else ()) if d else (0,)) - for d, bd in zip(shape, chunks) - ) - - -CHUNKS_NONE_ERROR_MESSAGE = """ -You must specify a chunks= keyword argument. -This specifies the chunksize of your array blocks. -See the following documentation page for details: - https://docs.dask.org/en/latest/array-creation.html#chunks -""".strip() - - -def normalize_chunks( - chunks, shape=None, limit=None, dtype=None, previous_chunks=None -) -> tuple[tuple[int, ...], ...]: - """ - Normalize chunks to tuple of tuples. - - This takes in a variety of input types and information and produces a full - tuple-of-tuples result for chunks, suitable to be passed to Array or - rechunk or any other operation that creates a Dask array. 
- - Vendored from dask.array.core - - Parameters - ---------- - chunks: tuple, int, dict, or string - The chunks to be normalized. See examples below for more details - shape: Tuple[int] - The shape of the array - limit: int (optional) - The maximum block size to target in bytes, - if freedom is given to choose - dtype: np.dtype - previous_chunks: Tuple[Tuple[int]] optional - Chunks from a previous array that we should use for inspiration when - rechunking auto dimensions. If not provided but auto-chunking exists - then auto-dimensions will prefer square-like chunk shapes. - - Examples - -------- - Specify uniform chunk sizes - - >>> from dask.array.core import normalize_chunks - >>> normalize_chunks((2, 2), shape=(5, 6)) - ((2, 2, 1), (2, 2, 2)) - - Also passes through fully explicit tuple-of-tuples - - >>> normalize_chunks(((2, 2, 1), (2, 2, 2)), shape=(5, 6)) - ((2, 2, 1), (2, 2, 2)) - - Cleans up lists to tuples - - >>> normalize_chunks([[2, 2], [3, 3]]) - ((2, 2), (3, 3)) - - Expands integer inputs 10 -> (10, 10) - - >>> normalize_chunks(10, shape=(30, 5)) - ((10, 10, 10), (5,)) - - Expands dict inputs - - >>> normalize_chunks({0: 2, 1: 3}, shape=(6, 6)) - ((2, 2, 2), (3, 3)) - - The values -1 and None get mapped to full size - - >>> normalize_chunks((5, -1), shape=(10, 10)) - ((5, 5), (10,)) - - Use the value "auto" to automatically determine chunk sizes along certain - dimensions. This uses the ``limit=`` and ``dtype=`` keywords to - determine how large to make the chunks. The term "auto" can be used - anywhere an integer can be used. See array chunking documentation for more - information. 
- - >>> normalize_chunks(("auto",), shape=(20,), limit=5, dtype="uint8") - ((5, 5, 5, 5),) - - You can also use byte sizes (see :func:`dask.utils.parse_bytes`) in place of - "auto" to ask for a particular size - - >>> normalize_chunks("1kiB", shape=(2000,), dtype="float32") - ((256, 256, 256, 256, 256, 256, 256, 208),) - - Respects null dimensions - - >>> normalize_chunks((), shape=(0, 0)) - ((0,), (0,)) - """ - if dtype and not isinstance(dtype, np.dtype): - dtype = np.dtype(dtype) - if chunks is None: - raise ValueError(CHUNKS_NONE_ERROR_MESSAGE) - if isinstance(chunks, list): - chunks = tuple(chunks) - if isinstance(chunks, (Number, str)): - chunks = (chunks,) * len(shape) - if isinstance(chunks, dict): - chunks = tuple(chunks.get(i, None) for i in range(len(shape))) - if isinstance(chunks, np.ndarray): - chunks = chunks.tolist() - if not chunks and shape and all(s == 0 for s in shape): - chunks = ((0,),) * len(shape) - - if ( - shape - and len(shape) == 1 - and len(chunks) > 1 - and all(isinstance(c, (Number, str)) for c in chunks) - ): - chunks = (chunks,) - - if shape and len(chunks) != len(shape): - raise ValueError( - "Chunks and shape must be of the same length/dimension. " - "Got chunks={}, shape={}".format(chunks, shape) - ) - if -1 in chunks or None in chunks: - chunks = tuple(s if c == -1 or c is None else c for c, s in zip(chunks, shape)) - - # If specifying chunk size in bytes, use that value to set the limit. - # Verify there is only one consistent value of limit or chunk-bytes used. - for c in chunks: - if isinstance(c, str) and c != "auto": - parsed = parse_bytes(c) - if limit is None: - limit = parsed - elif parsed != limit: - raise ValueError( - "Only one consistent value of limit or chunk is allowed." - "Used {} != {}".format(parsed, limit) - ) - # Substitute byte limits with 'auto' now that limit is set. 
- chunks = tuple("auto" if isinstance(c, str) and c != "auto" else c for c in chunks) - - if any(c == "auto" for c in chunks): - chunks = auto_chunks(chunks, shape, limit, dtype, previous_chunks) - - if shape is not None: - chunks = tuple(c if c not in {None, -1} else s for c, s in zip(chunks, shape)) - - if chunks and shape is not None: - chunks = sum( - ( - blockdims_from_blockshape((s,), (c,)) - if not isinstance(c, (tuple, list)) - else (c,) - for s, c in zip(shape, chunks) - ), - (), - ) - for c in chunks: - if not c: - raise ValueError( - "Empty tuples are not allowed in chunks. Express " - "zero length dimensions with 0(s) in chunks" - ) - - if shape is not None: - if len(chunks) != len(shape): - raise ValueError( - "Input array has %d dimensions but the supplied " - "chunks has only %d dimensions" % (len(shape), len(chunks)) - ) - if not all( - c == s or (math.isnan(c) or math.isnan(s)) - for c, s in zip(map(sum, chunks), shape) - ): - raise ValueError( - "Chunks do not add up to shape. " - "Got chunks={}, shape={}".format(chunks, shape) - ) - - return tuple( - tuple(int(x) if not math.isnan(x) else np.nan for x in c) for c in chunks # type: ignore[misc] - ) - - -def _compute_multiplier(limit: int, dtype, largest_block: int, result): - """ - Utility function for auto_chunk, to fin how much larger or smaller the ideal - chunk size is relative to what we have now. - """ - return ( - limit - / dtype.itemsize - / largest_block - / math.prod(r for r in result.values() if r) - ) - - -def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None): - """Determine automatic chunks - This takes in a chunks value that contains ``"auto"`` values in certain - dimensions and replaces those values with concrete dimension sizes that try - to get chunks to be of a certain size in bytes, provided by the ``limit=`` - keyword. 
If multiple dimensions are marked as ``"auto"`` then they will - all respond to meet the desired byte limit, trying to respect the aspect - ratio of their dimensions in ``previous_chunks=``, if given. - Parameters - ---------- - chunks: Tuple - A tuple of either dimensions or tuples of explicit chunk dimensions - Some entries should be "auto" - shape: Tuple[int] - limit: int, str - The maximum allowable size of a chunk in bytes - previous_chunks: Tuple[Tuple[int]] - See also - -------- - normalize_chunks: for full docstring and parameters - """ - if previous_chunks is not None: - previous_chunks = tuple( - c if isinstance(c, tuple) else (c,) for c in previous_chunks - ) - chunks = list(chunks) - - autos = {i for i, c in enumerate(chunks) if c == "auto"} - if not autos: - return tuple(chunks) - - if limit is None: - try: - from dask import config - - # TODO plug this into configuration of other chunk managers - limit = config.get("array.chunk-size") - except ImportError: - limit = "128MiB" - - print(f"array.chunk-size limit used = {limit}") - - if isinstance(limit, str): - limit = parse_bytes(limit) - - if dtype is None: - raise TypeError("dtype must be known for auto-chunking") - - if dtype.hasobject: - raise NotImplementedError( - "Can not use auto rechunking with object dtype. 
" - "We are unable to estimate the size in bytes of object data" - ) - - for x in tuple(chunks) + tuple(shape): - if ( - isinstance(x, Number) - and np.isnan(x) - or isinstance(x, tuple) - and np.isnan(x).any() - ): - raise ValueError( - "Can not perform automatic rechunking with unknown " - "(nan) chunk sizes.%s" % unknown_chunk_message - ) - - limit = max(1, limit) - - largest_block = math.prod( - cs if isinstance(cs, Number) else max(cs) for cs in chunks if cs != "auto" - ) - - if previous_chunks: - # Base ideal ratio on the median chunk size of the previous chunks - result = {a: np.median(previous_chunks[a]) for a in autos} - - ideal_shape = [] - for i, s in enumerate(shape): - chunk_frequencies = frequencies(previous_chunks[i]) - mode, count = max(chunk_frequencies.items(), key=lambda kv: kv[1]) - if mode > 1 and count >= len(previous_chunks[i]) / 2: - ideal_shape.append(mode) - else: - ideal_shape.append(s) - - # How much larger or smaller the ideal chunk size is relative to what we have now - multiplier = _compute_multiplier(limit, dtype, largest_block, result) - - last_multiplier = 0 - last_autos = set() - while ( - multiplier != last_multiplier or autos != last_autos - ): # while things change - last_multiplier = multiplier # record previous values - last_autos = set(autos) # record previous values - - # Expand or contract each of the dimensions appropriately - for a in sorted(autos): - if ideal_shape[a] == 0: - result[a] = 0 - continue - proposed = result[a] * multiplier ** (1 / len(autos)) - if proposed > shape[a]: # we've hit the shape boundary - autos.remove(a) - largest_block *= shape[a] - chunks[a] = shape[a] - del result[a] - else: - result[a] = round_to(proposed, ideal_shape[a]) - - # recompute how much multiplier we have left, repeat - multiplier = _compute_multiplier(limit, dtype, largest_block, result) - - for k, v in result.items(): - chunks[k] = v - return tuple(chunks) - - else: - # Check if dtype.itemsize is greater than 0 - if 
dtype.itemsize == 0: - raise ValueError( - "auto-chunking with dtype.itemsize == 0 is not supported, please pass in `chunks` explicitly" - ) - size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos)) - small = [i for i in autos if shape[i] < size] - if small: - for i in small: - chunks[i] = (shape[i],) - return auto_chunks(chunks, shape, limit, dtype) - - for i in autos: - chunks[i] = round_to(size, shape[i]) - - return tuple(chunks) - - -def round_to(c, s): - """Return a chunk dimension that is close to an even multiple or factor - We want values for c that are nicely aligned with s. - If c is smaller than s we use the original chunk size and accept an - uneven chunk at the end. - If c is larger than s then we want the largest multiple of s that is still - smaller than c. - """ - if c <= s: - return max(1, int(c)) - else: - return c // s * s - - -def frequencies(seq): - """ - Find number of occurrences of each value in seq. - - >>> frequencies(["cat", "cat", "ox", "pig", "pig", "cat"]) - {'cat': 3, 'ox': 1, 'pig': 2} - - Vendored from pytoolz. 
- """ - d = collections.defaultdict(int) - for item in seq: - d[item] += 1 - return dict(d) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 757f169c00f..4b9cec29628 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -29,6 +29,25 @@ def is_chunked_array(self, data: Any) -> bool: def chunks(self, data: "DaskArray") -> T_Chunks: return data.chunks + def normalize_chunks( + self, + chunks: Union[tuple, int, dict, str], + shape: Union[tuple[int], None] = None, + limit: Union[int, None] = None, + dtype: Union[np.dtype, None] = None, + previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + ) -> tuple[tuple[int, ...], ...]: + """Called by open_dataset""" + from dask.array.core import normalize_chunks + + return normalize_chunks( + chunks, + shape=shape, + limit=limit, + dtype=dtype, + previous_chunks=previous_chunks, + ) + def from_array(self, data, chunks, **kwargs) -> "DaskArray": import dask.array as da diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d9379fbe56c..a4968bc2d6f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -51,7 +51,6 @@ ) from xarray.core.computation import unify_chunks from xarray.core.coordinates import DatasetCoordinates, assert_coordinate_consistent -from xarray.core.daskcompat import normalize_chunks from xarray.core.duck_array_ops import datetime_to_numeric from xarray.core.indexes import ( Index, @@ -209,7 +208,7 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) -def _get_chunk(var, chunks): +def _get_chunk(var, chunks, chunkmanager): """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. 
""" @@ -231,7 +230,7 @@ def _get_chunk(var, chunks): for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape) ) - chunk_shape = normalize_chunks( + chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape ) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index a0f9eb5b5c5..42f80676383 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -62,6 +62,7 @@ def load_chunkmanagers( return available_chunkmanagers +# TODO refactor to remove this function in favour of allowing passing either a string or the ChunkManager instance def guess_chunkmanager_name(manager: Optional[str]) -> str: chunkmanagers = list_chunkmanagers() @@ -163,6 +164,18 @@ def is_chunked_array(self, data: Any) -> bool: def chunks(self, data: T_ChunkedArray) -> T_Chunks: ... + @abstractmethod + def normalize_chunks( + self, + chunks: Union[tuple, int, dict, str], + shape: Union[tuple[int], None] = None, + limit: Union[int, None] = None, + dtype: Union[np.dtype, None] = None, + previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + ) -> tuple[tuple[int, ...], ...]: + """Called by open_dataset""" + ... + @abstractmethod def from_array( self, data: np.ndarray, chunks: T_Chunks, **kwargs diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 86dafa0a3fa..0c04013778e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1221,6 +1221,7 @@ def chunk( if isinstance(chunks, (float, str, int, tuple, list)): # TODO we shouldn't assume here that other chunkmanagers can handle these types + # TODO should we call normalize_chunks here? 
pass # dask.array.from_array can handle these directly else: chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 954625ca56b..1ef26462622 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -1,4 +1,4 @@ -from typing import Any, Optional +from typing import Any, Optional, Union import numpy as np import pytest @@ -61,6 +61,18 @@ def is_chunked_array(self, data: Any) -> bool: def chunks(self, data: DummyChunkedArray) -> T_Chunks: return data.chunks + def normalize_chunks( + self, + chunks: Union[tuple, int, dict, str], + shape: Union[tuple[int], None] = None, + limit: Union[int, None] = None, + dtype: Union[np.dtype, None] = None, + previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + ) -> tuple[tuple[int, ...], ...]: + from dask.array import normalize_chunks + + return normalize_chunks(chunks, shape, limit, dtype, previous_chunks) + def from_array( self, data: np.ndarray, chunks: T_Chunks, **kwargs ) -> DummyChunkedArray: From 14a122674051fdc2cd00329c2215e3b6802e9293 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 12 Apr 2023 14:40:52 -0400 Subject: [PATCH 113/158] refactored to all passing ChunkManagerEntrypoint objects directly --- xarray/backends/api.py | 14 ++++++------ xarray/core/dataarray.py | 5 ++++- xarray/core/dataset.py | 22 +++++++++++++------ xarray/core/parallelcompat.py | 34 ++++++++++++++--------------- xarray/core/variable.py | 9 ++++++-- xarray/tests/test_parallelcompat.py | 2 +- 6 files changed, 50 insertions(+), 36 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 454aa2d50bb..a23f015d94f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -29,10 +29,11 @@ _nested_combine, combine_by_coords, ) +from xarray.core.daskmanager import DaskManager from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, 
_get_chunk, _maybe_chunk from xarray.core.indexes import Index -from xarray.core.parallelcompat import guess_chunkmanager, guess_chunkmanager_name +from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.utils import is_remote_uri if TYPE_CHECKING: @@ -311,9 +312,10 @@ def _chunk_ds( from_array_kwargs, **extra_tokens, ): - # TODO refactor this so - chunked_array_type = guess_chunkmanager_name(chunked_array_type) - if chunked_array_type == "dask": + chunkmanager = guess_chunkmanager(chunked_array_type) + + # TODO refactor to move this dask-specific logic inside the DaskManager class + if isinstance(chunkmanager, DaskManager): from dask.base import tokenize mtime = _get_mtime(filename_or_obj) @@ -324,8 +326,6 @@ def _chunk_ds( token = (None,) name_prefix = None - chunkmanager = guess_chunkmanager(chunked_array_type) - variables = {} for name, var in backend_ds.variables.items(): var_chunks = _get_chunk(var, chunks, chunkmanager) @@ -337,7 +337,7 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, - chunked_array_type=chunked_array_type, + chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), ) return backend_ds._replace(variables) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d46e17e8124..f60b283c0ad 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -36,6 +36,7 @@ from xarray.core.indexing import is_fancy_indexer, map_index_queries from xarray.core.merge import PANDAS_TYPES, MergeError, _create_indexes_from_coords from xarray.core.options import OPTIONS, _get_keep_attrs +from xarray.core.parallelcompat import ChunkManagerEntrypoint # noqa from xarray.core.utils import ( Default, HybridMappingProxy, @@ -1260,7 +1261,9 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunked_array_type: str | None = None, + chunked_array_type: str + | ChunkManagerEntryPoint + | None = None, # noqa: F821 # type: 
ignore[name-defined] from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0b819cf545a..2a230f82b1c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -51,6 +51,7 @@ ) from xarray.core.computation import unify_chunks from xarray.core.coordinates import DatasetCoordinates, assert_coordinate_consistent +from xarray.core.daskmanager import DaskManager from xarray.core.duck_array_ops import datetime_to_numeric from xarray.core.indexes import ( Index, @@ -73,7 +74,11 @@ ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager_name +from xarray.core.parallelcompat import ( # noqa + ChunkManagerEntrypoint, + get_chunked_array_type, + guess_chunkmanager, +) from xarray.core.pycompat import ( array_type, is_chunked_array, @@ -275,15 +280,16 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, - chunked_array_type=None, + chunked_array_type: str + | ChunkManagerEntryPoint = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, ): if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - chunked_array_type = guess_chunkmanager_name(chunked_array_type) - if chunked_array_type == "dask": + guess_chunkmanager(chunked_array_type) + if isinstance(chunked_array_type, DaskManager): from dask.base import tokenize # when rechunking by different amounts, make sure dask names change @@ -2235,7 +2241,9 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunked_array_type: str | None = None, + chunked_array_type: str + | ChunkManagerEntryPoint + | None = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: @@ -2307,7 +2315,7 @@ 
def chunk( f"some chunks keys are not dimensions on this object: {bad_dims}" ) - chunked_array_type = guess_chunkmanager_name(chunked_array_type) + chunkmanager = guess_chunkmanager(chunked_array_type) if from_array_kwargs is None: from_array_kwargs = {} @@ -2320,7 +2328,7 @@ def chunk( lock, name_prefix, inline_array=inline_array, - chunked_array_type=chunked_array_type, + chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 42f80676383..de74c97091b 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -62,8 +62,16 @@ def load_chunkmanagers( return available_chunkmanagers -# TODO refactor to remove this function in favour of allowing passing either a string or the ChunkManager instance -def guess_chunkmanager_name(manager: Optional[str]) -> str: +def guess_chunkmanager( + manager: Union[str, "ChunkManagerEntrypoint", None] +) -> "ChunkManagerEntrypoint": + """ + Get namespace of chunk-handling methods, guessing from what's available. + + If the name of a specific ChunkManager is given (e.g. "dask"), then use that. + Else use whatever is installed, defaulting to dask if there are multiple options. + """ + chunkmanagers = list_chunkmanagers() if manager is None: @@ -74,20 +82,6 @@ def guess_chunkmanager_name(manager: Optional[str]) -> str: # default to trying to use dask manager = "dask" - return manager - - -def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": - """ - Get namespace of chunk-handling methods, guessing from what's available. - - If the name of a specific ChunkManager is given (e.g. "dask"), then use that. - Else use whatever is installed, defaulting to dask if there are multiple options. 
- """ - - chunkmanagers = list_chunkmanagers() - manager = guess_chunkmanager_name(manager) - if isinstance(manager, str): if manager not in chunkmanagers: raise ValueError( @@ -95,9 +89,13 @@ def guess_chunkmanager(manager: Optional[str]) -> "ChunkManagerEntrypoint": ) return chunkmanagers[manager] + elif isinstance(manager, ChunkManagerEntrypoint): + # already a valid ChunkManager so just pass through + return manager else: - # TODO should we accept type[ChunkManagerEntrypoint] too? - raise TypeError("manager must be a string") + raise TypeError( + f"manager must be a string or instance of ChunkManagerEntryPoint, but received type {type(manager)}" + ) def get_chunked_array_type(*args) -> "ChunkManagerEntrypoint": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0c04013778e..87c567bc6cb 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -26,7 +26,10 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import guess_chunkmanager +from xarray.core.parallelcompat import ( # noqa + ChunkManagerEntrypoint, + guess_chunkmanager, +) from xarray.core.pycompat import ( array_type, integer_types, @@ -1158,7 +1161,9 @@ def chunk( name: str | None = None, lock: bool | None = None, inline_array: bool | None = None, - chunked_array_type: str | None = None, + chunked_array_type: str + | ChunkManagerEntryPoint + | None = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 1ef26462622..0f984ec84ee 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -69,7 +69,7 @@ def normalize_chunks( dtype: Union[np.dtype, None] = None, previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, ) -> tuple[tuple[int, ...], ...]: - from dask.array import normalize_chunks + from dask.array.core import 
normalize_chunks return normalize_chunks(chunks, shape, limit, dtype, previous_chunks) From 5dd9d35639e81873f47695d7a8f333ccd9a0ec2c Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 17 Apr 2023 14:42:34 -0400 Subject: [PATCH 114/158] Remove redundant Nones from types Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/backends/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a23f015d94f..80dc7d31322 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -388,8 +388,8 @@ def _dataset_from_backend_dataset( def open_dataset( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - engine: T_Engine | None = None, - chunks: T_Chunks | None = None, + engine: T_Engine = None, + chunks: T_Chunks = None, cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, From 5a4629428acb490e68453ef2c29325cb683b4ae1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 17 Apr 2023 14:42:50 -0400 Subject: [PATCH 115/158] From future import annotations Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/core/daskmanager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 4b9cec29628..e70447ed908 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Callable, Optional, Union From d6b56c66c405bedfbec0a1bf8056ce90cb8edea4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 18:44:46 +0000 Subject: [PATCH 116/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/daskmanager.py | 32 ++++++++++++++++---------------- 1 
file changed, 16 insertions(+), 16 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index e70447ed908..e9436b5ee50 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable import numpy as np @@ -15,7 +15,7 @@ class DaskManager(ChunkManagerEntrypoint["DaskArray"]): - array_cls: type["DaskArray"] + array_cls: type[DaskArray] available: bool = dask_available def __init__(self): @@ -28,16 +28,16 @@ def __init__(self): def is_chunked_array(self, data: Any) -> bool: return is_duck_dask_array(data) - def chunks(self, data: "DaskArray") -> T_Chunks: + def chunks(self, data: DaskArray) -> T_Chunks: return data.chunks def normalize_chunks( self, - chunks: Union[tuple, int, dict, str], - shape: Union[tuple[int], None] = None, - limit: Union[int, None] = None, - dtype: Union[np.dtype, None] = None, - previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + chunks: tuple | int | dict | str, + shape: tuple[int] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: tuple[tuple[int, ...], ...] 
| None = None, ) -> tuple[tuple[int, ...], ...]: """Called by open_dataset""" from dask.array.core import normalize_chunks @@ -50,7 +50,7 @@ def normalize_chunks( previous_chunks=previous_chunks, ) - def from_array(self, data, chunks, **kwargs) -> "DaskArray": + def from_array(self, data, chunks, **kwargs) -> DaskArray: import dask.array as da if isinstance(data, ImplicitToExplicitIndexingAdapter): @@ -63,7 +63,7 @@ def from_array(self, data, chunks, **kwargs) -> "DaskArray": **kwargs, ) - def compute(self, *data: "DaskArray", **kwargs) -> np.ndarray: + def compute(self, *data: DaskArray, **kwargs) -> np.ndarray: from dask.array import compute return compute(*data, **kwargs) @@ -78,10 +78,10 @@ def reduction( self, arr: T_ChunkedArray, func: Callable, - combine_func: Optional[Callable] = None, - aggregate_func: Optional[Callable] = None, - axis: Optional[Union[int, Sequence[int]]] = None, - dtype: Optional[np.dtype] = None, + combine_func: Callable | None = None, + aggregate_func: Callable | None = None, + axis: int | Sequence[int] | None = None, + dtype: np.dtype | None = None, keepdims: bool = False, ) -> T_ChunkedArray: from dask.array import reduction @@ -185,14 +185,14 @@ def blockwise( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list["DaskArray"]]: + ) -> tuple[dict[str, T_Chunks], list[DaskArray]]: from dask.array.core import unify_chunks return unify_chunks(*args, **kwargs) def store( self, - sources: Union["DaskArray", Sequence["DaskArray"]], + sources: DaskArray | Sequence[DaskArray], targets: Any, **kwargs, ): From 471d22aa07080a65847d652f9e11c76416cc2d1f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 17 Apr 2023 14:48:13 -0400 Subject: [PATCH 117/158] From functools import annotations Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/core/parallelcompat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 
de74c97091b..b9036462bce 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -3,6 +3,8 @@ It could later be used as the basis for a public interface allowing any N frameworks to interoperate with xarray, but for now it is just a private experiment. """ +from __future__ import annotations + import functools import sys from abc import ABC, abstractmethod From 8378f4365ff8e698d9ab4e6039662d66dc7550e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 18:48:52 +0000 Subject: [PATCH 118/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/parallelcompat.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index b9036462bce..8f023c7bfde 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -14,9 +14,7 @@ Any, Callable, Generic, - Optional, TypeVar, - Union, ) import numpy as np @@ -31,7 +29,7 @@ @functools.lru_cache(maxsize=1) -def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: +def list_chunkmanagers() -> dict[str, ChunkManagerEntrypoint]: """ Return a dictionary of available chunk managers and their ChunkManagerEntrypoint objects. @@ -49,7 +47,7 @@ def list_chunkmanagers() -> dict[str, "ChunkManagerEntrypoint"]: def load_chunkmanagers( entrypoints: Sequence[EntryPoint], -) -> dict[str, "ChunkManagerEntrypoint"]: +) -> dict[str, ChunkManagerEntrypoint]: """Load entrypoints and instantiate chunkmanagers only once.""" loaded_entrypoints = { @@ -65,8 +63,8 @@ def load_chunkmanagers( def guess_chunkmanager( - manager: Union[str, "ChunkManagerEntrypoint", None] -) -> "ChunkManagerEntrypoint": + manager: str | ChunkManagerEntrypoint | None, +) -> ChunkManagerEntrypoint: """ Get namespace of chunk-handling methods, guessing from what's available. 
@@ -100,7 +98,7 @@ def guess_chunkmanager( ) -def get_chunked_array_type(*args) -> "ChunkManagerEntrypoint": +def get_chunked_array_type(*args) -> ChunkManagerEntrypoint: """ Detects which parallel backend should be used for given set of arrays. @@ -167,11 +165,11 @@ def chunks(self, data: T_ChunkedArray) -> T_Chunks: @abstractmethod def normalize_chunks( self, - chunks: Union[tuple, int, dict, str], - shape: Union[tuple[int], None] = None, - limit: Union[int, None] = None, - dtype: Union[np.dtype, None] = None, - previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + chunks: tuple | int | dict | str, + shape: tuple[int] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: tuple[tuple[int, ...], ...] | None = None, ) -> tuple[tuple[int, ...], ...]: """Called by open_dataset""" ... @@ -202,10 +200,10 @@ def reduction( self, arr: T_ChunkedArray, func: Callable, - combine_func: Optional[Callable] = None, - aggregate_func: Optional[Callable] = None, - axis: Optional[Union[int, Sequence[int]]] = None, - dtype: Optional[np.dtype] = None, + combine_func: Callable | None = None, + aggregate_func: Callable | None = None, + axis: int | Sequence[int] | None = None, + dtype: np.dtype | None = None, keepdims: bool = False, ) -> T_ChunkedArray: """Used in some reductions like nanfirst, which is used by groupby.first""" @@ -262,7 +260,7 @@ def unify_chunks( def store( self, - sources: Union[T_ChunkedArray, Sequence[T_ChunkedArray]], + sources: T_ChunkedArray | Sequence[T_ChunkedArray], targets: Any, **kwargs: dict[str, Any], ): From f8b1020b114a2c1df1bb21102cbd0be54b303a85 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 17 Apr 2023 14:49:07 -0400 Subject: [PATCH 119/158] From future import annotations Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/tests/test_parallelcompat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_parallelcompat.py 
b/xarray/tests/test_parallelcompat.py index 0f984ec84ee..11502f0e10e 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any, Optional, Union import numpy as np From 11676abe2bdabfec476d76133b05a62f009591fa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 18:50:07 +0000 Subject: [PATCH 120/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_parallelcompat.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 11502f0e10e..8d2a51c3ada 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Optional, Union +from typing import Any import numpy as np import pytest @@ -24,7 +24,7 @@ class DummyChunkedArray(np.ndarray): https://numpy.org/doc/stable/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray """ - chunks: Optional[T_Chunks] + chunks: T_Chunks | None def __new__( cls, @@ -65,11 +65,11 @@ def chunks(self, data: DummyChunkedArray) -> T_Chunks: def normalize_chunks( self, - chunks: Union[tuple, int, dict, str], - shape: Union[tuple[int], None] = None, - limit: Union[int, None] = None, - dtype: Union[np.dtype, None] = None, - previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, + chunks: tuple | int | dict | str, + shape: tuple[int] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: tuple[tuple[int, ...], ...] 
| None = None, ) -> tuple[tuple[int, ...], ...]: from dask.array.core import normalize_chunks From 76ce09e7171f84d1696650e637913d3d2ef5e3f0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 1 May 2023 15:41:38 -0400 Subject: [PATCH 121/158] defined type for NormalizedChunks --- xarray/backends/api.py | 2 +- xarray/core/daskmanager.py | 14 +++++++------- xarray/core/parallelcompat.py | 16 ++++++++-------- xarray/core/types.py | 3 +++ xarray/tests/test_parallelcompat.py | 20 ++++++++++---------- 5 files changed, 29 insertions(+), 26 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 80dc7d31322..9c88befba5c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -49,6 +49,7 @@ CompatOptions, JoinOptions, NestedSequence, + T_Chunks, ) T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"] @@ -59,7 +60,6 @@ str, # no nice typing support for custom backends None, ] - T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None] T_NetcdfTypes = Literal[ "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC" ] diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index e9436b5ee50..6f7ca867695 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -7,11 +7,11 @@ from xarray.core.duck_array_ops import dask_available from xarray.core.indexing import ImplicitToExplicitIndexingAdapter -from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray, T_Chunks +from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray from xarray.core.pycompat import is_duck_dask_array if TYPE_CHECKING: - from xarray.core.types import DaskArray + from xarray.core.types import DaskArray, T_Chunks, T_NormalizedChunks class DaskManager(ChunkManagerEntrypoint["DaskArray"]): @@ -28,17 +28,17 @@ def __init__(self): def is_chunked_array(self, data: Any) -> bool: return is_duck_dask_array(data) - def chunks(self, data: DaskArray) -> T_Chunks: + def chunks(self, data: 
DaskArray) -> T_NormalizedChunks: return data.chunks def normalize_chunks( self, - chunks: tuple | int | dict | str, + chunks: T_Chunks, shape: tuple[int] | None = None, limit: int | None = None, dtype: np.dtype | None = None, - previous_chunks: tuple[tuple[int, ...], ...] | None = None, - ) -> tuple[tuple[int, ...], ...]: + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: """Called by open_dataset""" from dask.array.core import normalize_chunks @@ -185,7 +185,7 @@ def blockwise( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list[DaskArray]]: + ) -> tuple[dict[str, T_NormalizedChunks], list[DaskArray]]: from dask.array.core import unify_chunks return unify_chunks(*args, **kwargs) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 8f023c7bfde..7322ac4b662 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -11,6 +11,7 @@ from collections.abc import Sequence from importlib.metadata import EntryPoint, entry_points from typing import ( + TYPE_CHECKING, Any, Callable, Generic, @@ -23,9 +24,8 @@ T_ChunkedArray = TypeVar("T_ChunkedArray") -# TODO importing TypeAlias is a pain on python 3.9 without typing_extensions in the CI -# T_Chunks: TypeAlias = tuple[tuple[int, ...], ...] -T_Chunks = Any +if TYPE_CHECKING: + from xarray.core.types import T_Chunks, T_NormalizedChunks @functools.lru_cache(maxsize=1) @@ -159,18 +159,18 @@ def is_chunked_array(self, data: Any) -> bool: return isinstance(data, self.array_cls) @abstractmethod - def chunks(self, data: T_ChunkedArray) -> T_Chunks: + def chunks(self, data: T_ChunkedArray) -> T_NormalizedChunks: ... @abstractmethod def normalize_chunks( self, - chunks: tuple | int | dict | str, + chunks: T_Chunks, shape: tuple[int] | None = None, limit: int | None = None, dtype: np.dtype | None = None, - previous_chunks: tuple[tuple[int, ...], ...] 
| None = None, - ) -> tuple[tuple[int, ...], ...]: + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: """Called by open_dataset""" ... @@ -254,7 +254,7 @@ def blockwise( def unify_chunks( self, *args, **kwargs - ) -> tuple[dict[str, T_Chunks], list[T_ChunkedArray]]: + ) -> tuple[dict[str, T_NormalizedChunks], list[T_ChunkedArray]]: """Called by xr.unify_chunks.""" raise NotImplementedError() diff --git a/xarray/core/types.py b/xarray/core/types.py index 944646bf116..f3342071107 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -115,6 +115,9 @@ Dims = Union[str, Iterable[Hashable], "ellipsis", None] OrderedDims = Union[str, Sequence[Union[Hashable, "ellipsis"]], "ellipsis", None] +T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None] +T_NormalizedChunks = tuple[tuple[int, ...], ...] + ErrorOptions = Literal["raise", "ignore"] ErrorOptionsWithWarn = Literal["raise", "warn", "ignore"] diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 11502f0e10e..3edd97e9311 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Optional, Union +from typing import Any import numpy as np import pytest @@ -8,11 +8,11 @@ from xarray.core.daskmanager import DaskManager from xarray.core.parallelcompat import ( ChunkManagerEntrypoint, - T_Chunks, get_chunked_array_type, guess_chunkmanager, list_chunkmanagers, ) +from xarray.core.types import T_Chunks, T_NormalizedChunks from xarray.tests import has_dask, requires_dask @@ -24,7 +24,7 @@ class DummyChunkedArray(np.ndarray): https://numpy.org/doc/stable/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray """ - chunks: Optional[T_Chunks] + chunks: T_NormalizedChunks def __new__( cls, @@ -60,17 +60,17 @@ def __init__(self): def is_chunked_array(self, data: Any) -> bool: return isinstance(data, DummyChunkedArray) - 
def chunks(self, data: DummyChunkedArray) -> T_Chunks: + def chunks(self, data: DummyChunkedArray) -> T_NormalizedChunks: return data.chunks def normalize_chunks( self, - chunks: Union[tuple, int, dict, str], - shape: Union[tuple[int], None] = None, - limit: Union[int, None] = None, - dtype: Union[np.dtype, None] = None, - previous_chunks: Union[tuple[tuple[int, ...], ...], None] = None, - ) -> tuple[tuple[int, ...], ...]: + chunks: T_Chunks, + shape: tuple[int] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: from dask.array.core import normalize_chunks return normalize_chunks(chunks, shape, limit, dtype, previous_chunks) From 604bbf38cfd84a9a208a1de0b8786815a6da4998 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 May 2023 19:51:27 +0000 Subject: [PATCH 122/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/variable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7492fa00098..cc9aee457fe 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -27,7 +27,6 @@ ) from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.parallelcompat import ( - ChunkManagerEntrypoint, guess_chunkmanager, ) from xarray.core.pycompat import ( From 7604594b1d4381b8ee1f028d2d91819d939bd64f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 1 May 2023 16:10:11 -0400 Subject: [PATCH 123/158] standardized capitalization of ChunkManagerEntrypoint --- xarray/backends/zarr.py | 2 +- xarray/core/common.py | 4 ++-- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 6 +++--- xarray/core/parallelcompat.py | 2 +- xarray/core/variable.py | 8 +++----- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/xarray/backends/zarr.py 
b/xarray/backends/zarr.py index 49f266c573a..0c1b1eaa8ec 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -808,7 +808,7 @@ def open_zarr( Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. diff --git a/xarray/core/common.py b/xarray/core/common.py index a852113c526..87593a849e4 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1757,7 +1757,7 @@ def zeros_like( Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. @@ -1886,7 +1886,7 @@ def ones_like( Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f60b283c0ad..e9111e79180 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1262,7 +1262,7 @@ def chunk( lock: bool = False, inline_array: bool = False, chunked_array_type: str - | ChunkManagerEntryPoint + | ChunkManagerEntrypoint | None = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, **chunks_kwargs: Any, @@ -1297,7 +1297,7 @@ def chunk( Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2a230f82b1c..e6266d4422e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -281,7 +281,7 @@ def _maybe_chunk( overwrite_encoded_chunks=False, inline_array=False, chunked_array_type: str - | ChunkManagerEntryPoint = None, # noqa: F821 # type: ignore[name-defined] + | ChunkManagerEntrypoint = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, ): if chunks is not None: @@ -2242,7 +2242,7 @@ def chunk( lock: bool = False, inline_array: bool = False, chunked_array_type: str - | ChunkManagerEntryPoint + | ChunkManagerEntrypoint | None = None, # noqa: F821 # type: ignore[name-defined] from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], @@ -2277,7 +2277,7 @@ def chunk( Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7322ac4b662..0c09db1165a 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -94,7 +94,7 @@ def guess_chunkmanager( return manager else: raise TypeError( - f"manager must be a string or instance of ChunkManagerEntryPoint, but received type {type(manager)}" + f"manager must be a string or instance of ChunkManagerEntrypoint, but received type {type(manager)}" ) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7492fa00098..d03b10b03e5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1161,9 +1161,7 @@ def chunk( name: str | None = None, lock: bool | None = None, inline_array: bool | None = None, - chunked_array_type: str - | ChunkManagerEntryPoint - | None = None, # noqa: F821 # type: ignore[name-defined] + chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: @@ -1193,10 +1191,10 @@ def chunk( already as dask array. Default is False. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict, optional - Additional keyword arguments passed on to the `ChunkManagerEntryPoint.from_array` method used to create + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
From 355555fc49d898341c9b7d81ff4ee4759ebebc4e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 1 May 2023 16:14:14 -0400 Subject: [PATCH 124/158] ensure ruff doesn't remove import --- xarray/core/variable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d5102d1e017..191fa6a8bec 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -58,6 +58,7 @@ BASIC_INDEXING_TYPES = integer_types + (slice,) if TYPE_CHECKING: + from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.types import ( Dims, ErrorOptionsWithWarn, From 6eac87aa3822b78c8111be49f65a6de2606dcb29 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 2 May 2023 11:50:32 -0400 Subject: [PATCH 125/158] ignore remaining typing errors stemming from unclear dask typing for chunks arguments --- xarray/core/dataset.py | 3 +-- xarray/core/parallelcompat.py | 5 ++++- xarray/core/variable.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 053ebb996ff..35835204631 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -280,8 +280,7 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, - chunked_array_type: str - | ChunkManagerEntrypoint = None, # noqa: F821 # type: ignore[name-defined] + chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, ): if chunks is not None: diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 0c09db1165a..cd122dbc399 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -182,7 +182,10 @@ def from_array( ... def rechunk( - self, data: T_ChunkedArray, chunks: T_Chunks, **kwargs + self, + data: T_ChunkedArray, + chunks: T_NormalizedChunks | tuple[int, ...] 
| T_Chunks, + **kwargs, ) -> T_ChunkedArray: """Called when .chunk is called on an xarray object that is already chunked.""" return data.rechunk(chunks, **kwargs) # type: ignore[attr-defined] diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 191fa6a8bec..25dfbf85556 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1259,7 +1259,7 @@ def chunk( data = self._data if chunkmanager.is_chunked_array(data): - data = chunkmanager.rechunk(data, chunks) + data = chunkmanager.rechunk(data, chunks) # type: ignore[arg-type] else: if isinstance(data, indexing.ExplicitlyIndexed): # Unambiguously handle array storage backends (like NetCDF4 and h5py) @@ -1279,7 +1279,7 @@ def chunk( data = chunkmanager.from_array( data, - chunks, + chunks, # type: ignore[arg-type] **_from_array_kwargs, ) From 4ec837019a7103c28f8c036e999f399eeb6ee6be Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 2 May 2023 13:46:34 -0400 Subject: [PATCH 126/158] rename store_kwargs->chunkmanager_store_kwargs --- xarray/backends/api.py | 10 ++++++---- xarray/backends/common.py | 8 ++++---- xarray/core/daskmanager.py | 2 -- xarray/core/dataset.py | 10 +++++----- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9c88befba5c..0157e0d9d66 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1546,7 +1546,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore: ... @@ -1569,7 +1569,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... 
@@ -1589,7 +1589,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to a zarr ztore @@ -1711,7 +1711,9 @@ def to_zarr( writer = ArrayWriter() # TODO: figure out how to properly handle unlimited_dims dump_to_store(dataset, zstore, writer, encoding=encoding) - writes = writer.sync(compute=compute, store_kwargs=store_kwargs) + writes = writer.sync( + compute=compute, chunkmanager_store_kwargs=chunkmanager_store_kwargs + ) if compute: _finalize_store(writes, zstore) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 2c581468c97..50ac606a83e 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -164,7 +164,7 @@ def add(self, source, target, region=None): else: target[...] 
= source - def sync(self, compute=True, store_kwargs=None): + def sync(self, compute=True, chunkmanager_store_kwargs=None): if self.sources: chunkmanager = get_chunked_array_type(*self.sources) @@ -172,8 +172,8 @@ def sync(self, compute=True, store_kwargs=None): # for any discernible difference in perforance, e.g., # targets = [dask.delayed(t) for t in self.targets] - if store_kwargs is None: - store_kwargs = {} + if chunkmanager_store_kwargs is None: + chunkmanager_store_kwargs = {} delayed_store = chunkmanager.store( self.sources, @@ -182,7 +182,7 @@ def sync(self, compute=True, store_kwargs=None): compute=compute, flush=True, regions=self.regions, - **store_kwargs, + **chunkmanager_store_kwargs, ) self.sources = [] self.targets = [] diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 6f7ca867695..cef6c2e3805 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -198,8 +198,6 @@ def store( ): from dask.array import store - # TODO separate expected store kwargs from other compute kwargs? - return store( sources=sources, targets=targets, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 35835204631..3298a30424a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1972,7 +1972,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore: ... @@ -1994,7 +1994,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... 
@@ -2013,7 +2013,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, - store_kwargs: dict[str, Any] | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. @@ -2102,7 +2102,7 @@ def to_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. - store_kwargs : dict + chunkmanager_store_kwargs : dict Additional keyword arguments passed on to the `ChunkManager.store` method used to store chunked arrays. For example for a dask array additional kwargs will be passed eventually to :py:func:`dask.array.store()`. Experimental API that should not be relied upon. @@ -2151,7 +2151,7 @@ def to_zarr( region=region, safe_chunks=safe_chunks, zarr_version=zarr_version, - store_kwargs=store_kwargs, + chunkmanager_store_kwargs=chunkmanager_store_kwargs, ) def __repr__(self) -> str: From 316c63d55f4e2c317b028842f752a40596f16c6d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 2 May 2023 14:31:50 -0400 Subject: [PATCH 127/158] missed return value --- xarray/core/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3298a30424a..318a7f72e71 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -287,7 +287,9 @@ def _maybe_chunk( chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - guess_chunkmanager(chunked_array_type) + chunked_array_type = guess_chunkmanager( + chunked_array_type + ) # coerce string to ChunkManagerEntrypoint type if isinstance(chunked_array_type, DaskManager): from dask.base import tokenize From 9cd9078dfe645543a3f1fff0a776e080800e6820 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 3 May 2023 21:37:59 -0400 Subject: [PATCH 128/158] array API fixes 
for astype --- xarray/core/accessor_str.py | 73 ++++++++++++++++++++++--------------- xarray/core/variable.py | 6 +-- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index c6c4af87d1c..31028f10350 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -51,6 +51,7 @@ import numpy as np +from xarray.core import duck_array_ops from xarray.core.computation import apply_ufunc from xarray.core.types import T_DataArray @@ -2085,13 +2086,16 @@ def _get_res_multi(val, pat): else: # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=_get_res_multi, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxgroups}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=_get_res_multi, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxgroups}, + ), + self._obj.dtype.kind, + ) def extractall( self, @@ -2258,15 +2262,18 @@ def _get_res(val, ipat, imaxcount=maxcount, dtype=self._obj.dtype): return res - return self._apply( - # dtype MUST be object or strings can be truncated - # See: https://github.com/numpy/numpy/issues/8352 - func=_get_res, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[group_dim, match_dim]], - output_sizes={group_dim: maxgroups, match_dim: maxcount}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + # dtype MUST be object or strings can be truncated + # See: https://github.com/numpy/numpy/issues/8352 + func=_get_res, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[group_dim, match_dim]], + output_sizes={group_dim: maxgroups, match_dim: maxcount}, + ), + self._obj.dtype.kind, + ) def findall( self, @@ -2385,13 +2392,16 @@ def _partitioner( # dtype MUST be object or strings can be truncated # See: 
https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=arrfunc, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: 3}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=arrfunc, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: 3}, + ), + self._obj.dtype.kind, + ) def partition( self, @@ -2510,13 +2520,16 @@ def _dosplit(mystr, sep, maxsplit=maxsplit, dtype=self._obj.dtype): # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=_dosplit, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxsplit}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=_dosplit, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxsplit}, + ), + self._obj.dtype.kind, + ) def split( self, diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 25dfbf85556..feedd891c8d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1420,7 +1420,7 @@ def _shift_one_dim(self, dim, count, fill_value=dtypes.NA): pads = [(0, 0) if d != dim else dim_pad for d in self.dims] data = np.pad( - trimmed_data.astype(dtype), + duck_array_ops.astype(trimmed_data, dtype), pads, mode="constant", constant_values=fill_value, @@ -1569,7 +1569,7 @@ def pad( pad_option_kwargs["reflect_type"] = reflect_type array = np.pad( - self.data.astype(dtype, copy=False), + duck_array_ops.astype(self.data, dtype, copy=False), pad_width_by_index, mode=mode, **pad_option_kwargs, @@ -2437,7 +2437,7 @@ def rolling_window( """ if fill_value is dtypes.NA: # np.nan is passed dtype, fill_value = dtypes.maybe_promote(self.dtype) - var = self.astype(dtype, copy=False) + var = duck_array_ops.astype(self, dtype, copy=False) else: dtype = self.dtype var = self From 
5dc20162c95feb099178ebb3acf5e22a8400174d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 4 May 2023 00:25:49 -0400 Subject: [PATCH 129/158] Revert "array API fixes for astype" This reverts commit 9cd9078dfe645543a3f1fff0a776e080800e6820. --- xarray/core/accessor_str.py | 73 +++++++++++++++---------------------- xarray/core/variable.py | 6 +-- 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 31028f10350..c6c4af87d1c 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -51,7 +51,6 @@ import numpy as np -from xarray.core import duck_array_ops from xarray.core.computation import apply_ufunc from xarray.core.types import T_DataArray @@ -2086,16 +2085,13 @@ def _get_res_multi(val, pat): else: # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return duck_array_ops.astype( - self._apply( - func=_get_res_multi, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxgroups}, - ), - self._obj.dtype.kind, - ) + return self._apply( + func=_get_res_multi, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxgroups}, + ).astype(self._obj.dtype.kind) def extractall( self, @@ -2262,18 +2258,15 @@ def _get_res(val, ipat, imaxcount=maxcount, dtype=self._obj.dtype): return res - return duck_array_ops.astype( - self._apply( - # dtype MUST be object or strings can be truncated - # See: https://github.com/numpy/numpy/issues/8352 - func=_get_res, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[group_dim, match_dim]], - output_sizes={group_dim: maxgroups, match_dim: maxcount}, - ), - self._obj.dtype.kind, - ) + return self._apply( + # dtype MUST be object or strings can be truncated + # See: https://github.com/numpy/numpy/issues/8352 + func=_get_res, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[group_dim, 
match_dim]], + output_sizes={group_dim: maxgroups, match_dim: maxcount}, + ).astype(self._obj.dtype.kind) def findall( self, @@ -2392,16 +2385,13 @@ def _partitioner( # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return duck_array_ops.astype( - self._apply( - func=arrfunc, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: 3}, - ), - self._obj.dtype.kind, - ) + return self._apply( + func=arrfunc, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: 3}, + ).astype(self._obj.dtype.kind) def partition( self, @@ -2520,16 +2510,13 @@ def _dosplit(mystr, sep, maxsplit=maxsplit, dtype=self._obj.dtype): # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return duck_array_ops.astype( - self._apply( - func=_dosplit, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxsplit}, - ), - self._obj.dtype.kind, - ) + return self._apply( + func=_dosplit, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxsplit}, + ).astype(self._obj.dtype.kind) def split( self, diff --git a/xarray/core/variable.py b/xarray/core/variable.py index feedd891c8d..25dfbf85556 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1420,7 +1420,7 @@ def _shift_one_dim(self, dim, count, fill_value=dtypes.NA): pads = [(0, 0) if d != dim else dim_pad for d in self.dims] data = np.pad( - duck_array_ops.astype(trimmed_data, dtype), + trimmed_data.astype(dtype), pads, mode="constant", constant_values=fill_value, @@ -1569,7 +1569,7 @@ def pad( pad_option_kwargs["reflect_type"] = reflect_type array = np.pad( - duck_array_ops.astype(self.data, dtype, copy=False), + self.data.astype(dtype, copy=False), pad_width_by_index, mode=mode, **pad_option_kwargs, @@ -2437,7 +2437,7 @@ def rolling_window( """ if fill_value is dtypes.NA: # np.nan is 
passed dtype, fill_value = dtypes.maybe_promote(self.dtype) - var = duck_array_ops.astype(self, dtype, copy=False) + var = self.astype(dtype, copy=False) else: dtype = self.dtype var = self From c8b9ee70bb9d76b5c08b60ef727da185e243dcbb Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 12 May 2023 07:02:54 +0200 Subject: [PATCH 130/158] Apply suggestions from code review --- xarray/tests/test_parallelcompat.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 3edd97e9311..ee4943f2b35 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -145,38 +145,38 @@ def register_dummy_chunkmanager(monkeypatch): class TestGetChunkManager: - def test_get_chunkmanger(self, register_dummy_chunkmanager): + def test_get_chunkmanger(self, register_dummy_chunkmanager) -> None: chunkmanager = guess_chunkmanager("dummy") assert isinstance(chunkmanager, DummyChunkManager) - def test_fail_on_nonexistent_chunkmanager(self): + def test_fail_on_nonexistent_chunkmanager(self) -> None: with pytest.raises(ValueError, match="unrecognized chunk manager foo"): guess_chunkmanager("foo") @requires_dask - def test_get_dask_if_installed(self): + def test_get_dask_if_installed(self) -> None: chunkmanager = guess_chunkmanager(None) assert isinstance(chunkmanager, DaskManager) @pytest.mark.skipif(has_dask, reason="requires dask not to be installed") - def test_dont_get_dask_if_not_installed(self): + def test_dont_get_dask_if_not_installed(self) -> None: with pytest.raises(ValueError, match="unrecognized chunk manager dask"): guess_chunkmanager("dask") @requires_dask - def test_choose_dask_over_other_chunkmanagers(self, register_dummy_chunkmanager): + def test_choose_dask_over_other_chunkmanagers(self, register_dummy_chunkmanager) -> None: chunk_manager = guess_chunkmanager(None) assert 
isinstance(chunk_manager, DaskManager) class TestGetChunkedArrayType: - def test_detect_chunked_arrays(self, register_dummy_chunkmanager): + def test_detect_chunked_arrays(self, register_dummy_chunkmanager) -> None: dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(dummy_arr) assert isinstance(chunk_manager, DummyChunkManager) - def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager): + def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager) -> None: dummy_arr = DummyChunkedArray([1, 2, 3]) chunk_manager = get_chunked_array_type(*[dummy_arr, 1.0, np.array([5, 6])]) @@ -185,11 +185,11 @@ def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager): with pytest.raises(TypeError, match="Expected a chunked array"): get_chunked_array_type(5.0) - def test_raise_if_no_arrays_chunked(self, register_dummy_chunkmanager): + def test_raise_if_no_arrays_chunked(self, register_dummy_chunkmanager) -> None: with pytest.raises(TypeError, match="Expected a chunked array "): get_chunked_array_type(*[1.0, np.array([5, 6])]) - def test_raise_if_no_matching_chunkmanagers(self): + def test_raise_if_no_matching_chunkmanagers(self) -> None: dummy_arr = DummyChunkedArray([1, 2, 3]) with pytest.raises( @@ -198,7 +198,7 @@ def test_raise_if_no_matching_chunkmanagers(self): get_chunked_array_type(dummy_arr) @requires_dask - def test_detect_dask_if_installed(self): + def test_detect_dask_if_installed(self) -> None: import dask.array as da dask_arr = da.from_array([1, 2, 3], chunks=(1,)) From 5c957580f63652c40f341b5c778341743c493dc2 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 12 May 2023 07:03:23 +0200 Subject: [PATCH 131/158] Update xarray/tests/test_parallelcompat.py --- xarray/tests/test_parallelcompat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index ee4943f2b35..0fa9c57f934 100644 
--- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -207,7 +207,7 @@ def test_detect_dask_if_installed(self) -> None: assert isinstance(chunk_manager, DaskManager) @requires_dask - def test_raise_on_mixed_array_types(self, register_dummy_chunkmanager): + def test_raise_on_mixed_array_types(self, register_dummy_chunkmanager) -> None: import dask.array as da dummy_arr = DummyChunkedArray([1, 2, 3]) From a61a30a628f21fe80b8c873f379fe43e88967ab7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 May 2023 05:03:29 +0000 Subject: [PATCH 132/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_parallelcompat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 0fa9c57f934..dbef6fed6e5 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -164,7 +164,9 @@ def test_dont_get_dask_if_not_installed(self) -> None: guess_chunkmanager("dask") @requires_dask - def test_choose_dask_over_other_chunkmanagers(self, register_dummy_chunkmanager) -> None: + def test_choose_dask_over_other_chunkmanagers( + self, register_dummy_chunkmanager + ) -> None: chunk_manager = guess_chunkmanager(None) assert isinstance(chunk_manager, DaskManager) From ea35a3203291ff472ecf8b18340558179fa6d2d0 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 11:40:19 -0400 Subject: [PATCH 133/158] overridden -> subclassed Co-authored-by: Justus Magin --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d5be24e3f90..84bfb4f9720 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -160,7 +160,7 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Experimental support for wrapping chunked array libraries other than 
dask. - A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntryPoint` - which can be overridden and then + A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntryPoint` - which can be subclassed and then registered by alternative chunked array implementations. (:issue:`6807`, :pull:`7019`) By `Tom Nicholas `_. - Don't assume that arrays read from disk will be Numpy arrays. This is a step toward From e68b327e440a7909fcf9c3dde7b308c539b6bbbf Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 11:59:06 -0400 Subject: [PATCH 134/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0c1b1eaa8ec..a4012a8a733 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -807,7 +807,7 @@ def open_zarr( Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. 
Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to From 956c0557c8d9977da132858f43187a0df1b47a71 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 13:59:35 -0400 Subject: [PATCH 135/158] ensured all compute calls go through chunkmanager --- xarray/core/daskmanager.py | 2 +- xarray/core/parallelcompat.py | 3 ++- xarray/core/variable.py | 10 ++++++---- xarray/tests/test_dask.py | 11 +++++------ xarray/tests/test_parallelcompat.py | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index cef6c2e3805..77de8de6080 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -63,7 +63,7 @@ def from_array(self, data, chunks, **kwargs) -> DaskArray: **kwargs, ) - def compute(self, *data: DaskArray, **kwargs) -> np.ndarray: + def compute(self, *data: DaskArray, **kwargs) -> tuple[np.ndarray, ...]: from dask.array import compute return compute(*data, **kwargs) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index cd122dbc399..83eedbdc411 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -191,7 +191,8 @@ def rechunk( return data.rechunk(chunks, **kwargs) # type: ignore[attr-defined] @abstractmethod - def compute(self, data: T_ChunkedArray, **kwargs) -> np.ndarray: + def compute(self, *data: T_ChunkedArray, **kwargs) -> tuple[np.ndarray, ...]: + """Used anytime something needs to computed, including multiple arrays at once.""" ... 
@property diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 25dfbf85556..b940bb494aa 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -27,6 +27,7 @@ ) from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.parallelcompat import ( + get_chunked_array_type, guess_chunkmanager, ) from xarray.core.pycompat import ( @@ -539,7 +540,9 @@ def load(self, **kwargs): dask.array.compute """ if is_chunked_array(self._data): - self._data = as_compatible_data(self._data.compute(**kwargs)) + chunkmanager = get_chunked_array_type(self._data) + loaded_data, *_ = chunkmanager.compute(self._data, **kwargs) + self._data = as_compatible_data(loaded_data) elif isinstance(self._data, indexing.ExplicitlyIndexed): self._data = self._data.get_duck_array() elif not is_duck_array(self._data): @@ -1291,10 +1294,9 @@ def to_numpy(self) -> np.ndarray: data = self.data # TODO first attempt to call .to_numpy() once some libraries implement it - # cubed has to be imported dynamically as cubed imports rechunker which imports xarray - # cubed_array_type = DuckArrayModule("cubed").type if hasattr(data, "chunks"): - data = data.compute() + chunkmanager = get_chunked_array_type(data) + data, *_ = chunkmanager.compute(data) if isinstance(data, array_type("cupy")): data = data.get() # pint has to be imported dynamically as pint imports xarray diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 1171464a962..ed18718043b 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -904,13 +904,12 @@ def test_to_dask_dataframe_dim_order(self): @pytest.mark.parametrize("method", ["load", "compute"]) def test_dask_kwargs_variable(method): - x = Variable("y", da.from_array(np.arange(3), chunks=(2,))) - # args should be passed on to da.Array.compute() - with mock.patch.object( - da.Array, "compute", return_value=np.arange(3) - ) as mock_compute: + chunked_array = da.from_array(np.arange(3), chunks=(2,)) + x = Variable("y", 
chunked_array) + # args should be passed on to dask.compute() (via DaskManager.compute()) + with mock.patch.object(da, "compute", return_value=(np.arange(3),)) as mock_compute: getattr(x, method)(foo="bar") - mock_compute.assert_called_with(foo="bar") + mock_compute.assert_called_with(chunked_array, foo="bar") @pytest.mark.parametrize("method", ["load", "compute", "persist"]) diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 3edd97e9311..ccce651949f 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -85,7 +85,7 @@ def from_array( def rechunk(self, data: DummyChunkedArray, chunks, **kwargs) -> DummyChunkedArray: return data.rechunk(chunks, **kwargs) - def compute(self, *data: DummyChunkedArray, **kwargs) -> np.ndarray: + def compute(self, *data: DummyChunkedArray, **kwargs) -> tuple[np.ndarray, ...]: from dask.array import compute return compute(*data, **kwargs) From cf0c28e5a80a574bdff4267b51ee6f047daa4e38 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 18:02:40 -0400 Subject: [PATCH 136/158] Raise if multiple chunkmanagers recognize array type Co-authored-by: Justus Magin --- xarray/core/parallelcompat.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 83eedbdc411..9c094315309 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -126,13 +126,19 @@ def get_chunked_array_type(*args) -> ChunkManagerEntrypoint: # iterate over defined chunk managers, seeing if each recognises this array type chunked_arr = chunked_arrays[0] chunkmanagers = list_chunkmanagers() - for chunkmanager in chunkmanagers.values(): - if chunkmanager.is_chunked_array(chunked_arr): - return chunkmanager - - raise TypeError( - f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}" - ) + selected = [ + chunkmanager + for chunkmanager in 
chunkmanagers.values() + if chunkmanager.is_chunked_array(chunked_arr) + ] + if not selected: + raise TypeError( + f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}" + ) + elif len(selected) >= 2: + raise TypeError(f"Multiple ChunkManagers recognise type {type(chunked_arr)}") + else: + return selected[0] class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): From 4f2ec2721c24945546a693dd3dcc5f1dcb23ca4d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 21:36:03 -0400 Subject: [PATCH 137/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 335879baa2f..12cc876efbf 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1887,7 +1887,7 @@ def ones_like( Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to From 5f2f56935e6a5280208237a039b0e8ad474f494b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 21:40:20 -0400 Subject: [PATCH 138/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 12cc876efbf..93a8568c89a 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1758,7 +1758,7 @@ def zeros_like( Which chunked array type to coerce the underlying data array to. 
Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to From 929db33b5838615528ca8e8e54956accbb3ca0da Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 21:40:30 -0400 Subject: [PATCH 139/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 93a8568c89a..673a7a6cfc5 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1486,7 +1486,7 @@ def full_like( Which chunked array type to coerce the underlying data array to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. 
For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed From 876f81c499a2439d991508130ca92416f7da3ce7 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 21:41:18 -0400 Subject: [PATCH 140/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 90378758455..60aeb72219a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2104,7 +2104,7 @@ def to_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. - chunkmanager_store_kwargs : dict + chunkmanager_store_kwargs : dict, optional Additional keyword arguments passed on to the `ChunkManager.store` method used to store chunked arrays. For example for a dask array additional kwargs will be passed eventually to :py:func:`dask.array.store()`. Experimental API that should not be relied upon. From ad0a706e0ef2f575734b52d02a16a12cd1af1ac8 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 12 May 2023 21:42:05 -0400 Subject: [PATCH 141/158] from_array_kwargs is optional Co-authored-by: Justus Magin --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 60aeb72219a..4499b4918c9 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2277,7 +2277,7 @@ def chunk( Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. 
- from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to From 115b52bf8efd0cf451293d9ef8fa62621036b9f9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 22:29:24 -0400 Subject: [PATCH 142/158] fixes for chunk methods --- xarray/core/dataarray.py | 12 ++++++------ xarray/core/dataset.py | 6 +++--- xarray/core/variable.py | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 44a68c95ed3..4a4a9063e49 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1290,21 +1290,21 @@ def chunk( Prefix for the name of the new dask array. token : str, optional Token uniquely identifying this array. - lock : optional + lock : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - inline_array: optional + inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. chunked_array_type: str, optional Which chunked array type to coerce the underlying data array to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. 
- Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 90378758455..948a1709901 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2277,11 +2277,11 @@ def chunk( Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. Experimental API that should not be relied upon. - from_array_kwargs: dict + from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b940bb494aa..399c3bddb21 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1198,12 +1198,12 @@ def chunk( name : str, optional Used to generate the name for this array in the internal dask graph. Does not need not be unique. 
- lock : bool, optional + lock : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not - already as dask array. Default is False. - inline_array : bool, optional + already as dask array. + inline_array : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not - already as dask array. Default is False. + already as dask array. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system. @@ -1211,8 +1211,8 @@ def chunk( from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. From bdf7600c3ea08d20bb4deacfd1c8a2d49880e21f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 22:41:05 -0400 Subject: [PATCH 143/158] correct readme to reflect fact we aren't vendoring dask in this PR any more --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index c031653ffcc..41db66fd395 100644 --- a/README.md +++ b/README.md @@ -122,13 +122,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-Xarray bundles portions of pandas, NumPy, Seaborn and dask, all of which are +Xarray bundles portions of pandas, NumPy and Seaborn, all of which are available under a "3-clause BSD" license: - pandas: setup.py, xarray/util/print_versions.py - NumPy: xarray/core/npcompat.py - Seaborn: _determine_cmap_params in xarray/core/plot/utils.py -- Dask: xarray/core/daskcompat.py Xarray also bundles portions of CPython, which is available under the "Python Software Foundation License" in xarray/core/pycompat.py. From 06bb5085cb24a1f439e9353ae7fad0c6f59ef264 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 22:43:17 -0400 Subject: [PATCH 144/158] update whatsnew --- doc/whats-new.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 84bfb4f9720..61344d87863 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -52,6 +52,11 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Experimental support for wrapping chunked array libraries other than dask. + A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntrypoint` - which can be subclassed and then + registered by alternative chunked array implementations. (:issue:`6807`, :pull:`7019`) + By `Tom Nicholas `_. + .. _whats-new.2023.04.2: @@ -159,10 +164,6 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ -- Experimental support for wrapping chunked array libraries other than dask. - A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntryPoint` - which can be subclassed and then - registered by alternative chunked array implementations. (:issue:`6807`, :pull:`7019`) - By `Tom Nicholas `_. - Don't assume that arrays read from disk will be Numpy arrays. This is a step toward enabling reads from a Zarr store using the `Kvikio `_ or `TensorStore `_ libraries. 
From ba00558aef5b39cb4b5fda6917677f93240b7d17 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 23:15:48 -0400 Subject: [PATCH 145/158] more docstring corrections --- xarray/core/common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 673a7a6cfc5..a520a597ae7 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1489,7 +1489,7 @@ def full_like( from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + For example, with dask as the default chunked array type, this method would pass additional kwargs to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns @@ -1761,8 +1761,8 @@ def zeros_like( from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1890,8 +1890,8 @@ def ones_like( from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. 
- Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to - :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- From 6a99454550c81325f5d7181b5addcbf1f1dfd4a7 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 12 May 2023 23:36:50 -0400 Subject: [PATCH 146/158] remove comment --- xarray/core/computation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 5de8bfacdcd..685307fc8c3 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -747,7 +747,6 @@ def func(*arrays): return res elif dask == "allowed": - # TODO Check chunked array types here too? pass else: raise ValueError( From 95d81e866a27129159533b0da4a210c3f56113e5 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 16 May 2023 14:00:45 -0400 Subject: [PATCH 147/158] Raise NotImplementedErrors in all abstract methods Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/core/parallelcompat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 9c094315309..3ba6a071d85 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -159,14 +159,14 @@ class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): @abstractmethod def __init__(self): - ... + raise NotImplementedError() def is_chunked_array(self, data: Any) -> bool: return isinstance(data, self.array_cls) @abstractmethod def chunks(self, data: T_ChunkedArray) -> T_NormalizedChunks: - ... 
+ raise NotImplementedError() @abstractmethod def normalize_chunks( @@ -178,14 +178,14 @@ def normalize_chunks( previous_chunks: T_NormalizedChunks | None = None, ) -> T_NormalizedChunks: """Called by open_dataset""" - ... + raise NotImplementedError() @abstractmethod def from_array( self, data: np.ndarray, chunks: T_Chunks, **kwargs ) -> T_ChunkedArray: """Called when .chunk is called on an xarray object that is not already chunked.""" - ... + raise NotImplementedError() def rechunk( self, @@ -199,7 +199,7 @@ def rechunk( @abstractmethod def compute(self, *data: T_ChunkedArray, **kwargs) -> tuple[np.ndarray, ...]: """Used anytime something needs to computed, including multiple arrays at once.""" - ... + raise NotImplementedError() @property def array_api(self) -> Any: From e5e309694e923b35224701e7071c27bbf7334edc Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 16 May 2023 14:49:55 -0400 Subject: [PATCH 148/158] type hints for every arg in ChunkManagerEntryPOint methods --- xarray/core/parallelcompat.py | 44 ++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 3ba6a071d85..100e72e08db 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -8,7 +8,7 @@ import functools import sys from abc import ABC, abstractmethod -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from importlib.metadata import EntryPoint, entry_points from typing import ( TYPE_CHECKING, @@ -222,13 +222,13 @@ def reduction( @abstractmethod def apply_gufunc( self, - func, - signature, - *args, - axes=None, - keepdims=False, - output_dtypes=None, - vectorize=None, + func: Callable, + signature: str, + *args: Any, + axes: Sequence[tuple[int, ...]] | None = None, + keepdims: bool = False, + output_dtypes: Sequence[np.typing.DTypeLike] | None = None, + vectorize: bool | None = None, **kwargs, ): """ @@ -238,12 +238,12 @@ 
def apply_gufunc( def map_blocks( self, - func, - *args, - dtype=None, - chunks=None, - drop_axis=[], - new_axis=None, + func: Callable, + *args: Any, + dtype: np.typing.DTypeLike | None = None, + chunks: tuple[int, ...] | None = None, + drop_axis: int | Sequence[int] | None = None, + new_axis: int | Sequence[int] | None = None, **kwargs, ): """Currently only called in a couple of really niche places in xarray. Not even called in xarray.map_blocks.""" @@ -251,19 +251,21 @@ def map_blocks( def blockwise( self, - func, - out_ind, - *args, - adjust_chunks=None, - new_axes=None, - align_arrays=True, + func: Callable, + out_ind: Iterable, + *args: Any, # can't type this as mypy assumes args are all same type, but dask blockwise args alternate types + adjust_chunks: dict[Any, Callable] | None = None, + new_axes: dict[Any, int] | None = None, + align_arrays: bool = True, **kwargs, ): """Called by some niche functions in xarray.""" raise NotImplementedError() def unify_chunks( - self, *args, **kwargs + self, + *args: Any, # can't type this as mypy assumes args are all same type, but dask unify_chunks args alternate types + **kwargs, ) -> tuple[dict[str, T_NormalizedChunks], list[T_ChunkedArray]]: """Called by xr.unify_chunks.""" raise NotImplementedError() From a22143647b20004176a829f74220df54f0ae900e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 16 May 2023 16:09:28 -0400 Subject: [PATCH 149/158] more explicit typing + fixes for mypy errors revealed --- xarray/core/daskmanager.py | 67 +++++++++++++++-------------- xarray/core/dataset.py | 6 +-- xarray/core/parallelcompat.py | 6 +-- xarray/core/pycompat.py | 2 +- xarray/core/utils.py | 19 ++++---- xarray/core/variable.py | 4 +- xarray/tests/test_parallelcompat.py | 4 +- 7 files changed, 57 insertions(+), 51 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 77de8de6080..2af9b88f4ba 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -1,6 +1,6 @@ from 
__future__ import annotations -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from typing import TYPE_CHECKING, Any, Callable import numpy as np @@ -18,7 +18,7 @@ class DaskManager(ChunkManagerEntrypoint["DaskArray"]): array_cls: type[DaskArray] available: bool = dask_available - def __init__(self): + def __init__(self) -> None: # TODO can we replace this with a class attribute instead? from dask.array import Array @@ -33,8 +33,8 @@ def chunks(self, data: DaskArray) -> T_NormalizedChunks: def normalize_chunks( self, - chunks: T_Chunks, - shape: tuple[int] | None = None, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] | None = None, limit: int | None = None, dtype: np.dtype | None = None, previous_chunks: T_NormalizedChunks | None = None, @@ -50,7 +50,7 @@ def normalize_chunks( previous_chunks=previous_chunks, ) - def from_array(self, data, chunks, **kwargs) -> DaskArray: + def from_array(self, data: Any, chunks, **kwargs) -> DaskArray: import dask.array as da if isinstance(data, ImplicitToExplicitIndexingAdapter): @@ -98,17 +98,17 @@ def reduction( def apply_gufunc( self, - func, - signature, - *args, - axes=None, - axis=None, - keepdims=False, - output_dtypes=None, - output_sizes=None, - vectorize=None, - allow_rechunk=False, - meta=None, + func: Callable, + signature: str, + *args: Any, + axes: Sequence[tuple[int, ...]] | None = None, + axis: int | None = None, + keepdims: bool = False, + output_dtypes: Sequence[np.typing.DTypeLike] | None = None, + output_sizes: dict[str, int] | None = None, + vectorize: bool | None = None, + allow_rechunk: bool = False, + meta: tuple[np.ndarray, ...] | None = None, **kwargs, ): from dask.array.gufunc import apply_gufunc @@ -130,12 +130,12 @@ def apply_gufunc( def map_blocks( self, - func, - *args, - dtype=None, - chunks=None, - drop_axis=[], - new_axis=None, + func: Callable, + *args: Any, + dtype: np.typing.DTypeLike | None = None, + chunks: tuple[int, ...] 
| None = None, + drop_axis: int | Sequence[int] | None = None, + new_axis: int | Sequence[int] | None = None, **kwargs, ): from dask.array import map_blocks @@ -153,16 +153,17 @@ def map_blocks( def blockwise( self, - func, - out_ind, - *args, - name=None, + func: Callable, + out_ind: Iterable, + *args: Any, + # can't type this as mypy assumes args are all same type, but dask blockwise args alternate types + name: str | None = None, token=None, - dtype=None, - adjust_chunks=None, - new_axes=None, - align_arrays=True, - concatenate=None, + dtype: np.dtype | None = None, + adjust_chunks: dict[Any, Callable] | None = None, + new_axes: dict[Any, int] | None = None, + align_arrays: bool = True, + concatenate: bool | None = None, meta=None, **kwargs, ): @@ -184,7 +185,9 @@ def blockwise( ) def unify_chunks( - self, *args, **kwargs + self, + *args: Any, # can't type this as mypy assumes args are all same type, but dask unify_chunks args alternate types + **kwargs, ) -> tuple[dict[str, T_NormalizedChunks], list[DaskArray]]: from dask.array.core import unify_chunks diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 31cf13f48de..4b26d4a3e5f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -213,7 +213,7 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) -def _get_chunk(var, chunks, chunkmanager): +def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. """ @@ -252,7 +252,7 @@ def _get_chunk(var, chunks, chunkmanager): # expresses the preferred chunks, the sequence sums to the size. 
preferred_stops = ( range(preferred_chunk_sizes, size, preferred_chunk_sizes) - if isinstance(preferred_chunk_sizes, Number) + if isinstance(preferred_chunk_sizes, int) else itertools.accumulate(preferred_chunk_sizes[:-1]) ) # Gather any stop indices of the specified chunks that are not a stop index @@ -263,7 +263,7 @@ def _get_chunk(var, chunks, chunkmanager): ) if breaks: warnings.warn( - "The specified Dask chunks separate the stored chunks along " + "The specified chunks separate the stored chunks along " f'dimension "{dim}" starting at index {min(breaks)}. This could ' "degrade performance. Instead, consider rechunking after loading." ) diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 100e72e08db..7d4458b8ae5 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -158,7 +158,7 @@ class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): available: bool = True @abstractmethod - def __init__(self): + def __init__(self) -> None: raise NotImplementedError() def is_chunked_array(self, data: Any) -> bool: @@ -171,8 +171,8 @@ def chunks(self, data: T_ChunkedArray) -> T_NormalizedChunks: @abstractmethod def normalize_chunks( self, - chunks: T_Chunks, - shape: tuple[int] | None = None, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] 
| None = None, limit: int | None = None, dtype: np.dtype | None = None, previous_chunks: T_NormalizedChunks | None = None, diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index f3f089a924c..db77ef56fd1 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -83,7 +83,7 @@ def is_duck_dask_array(x): return is_duck_array(x) and is_dask_collection(x) -def is_chunked_array(x): +def is_chunked_array(x) -> bool: return is_duck_dask_array(x) or (is_duck_array(x) and hasattr(x, "chunks")) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 7d1d94d60d2..6ed0b2c4318 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -1205,8 +1205,11 @@ def emit_user_level_warning(message, category=None): def consolidate_dask_from_array_kwargs( - from_array_kwargs, name=None, lock=None, inline_array=None -): + from_array_kwargs: dict, + name: str | None = None, + lock: bool | None = None, + inline_array: bool | None = None, +) -> dict: """ Merge dask-specific kwargs with arbitrary from_array_kwargs dict. @@ -1239,12 +1242,12 @@ def consolidate_dask_from_array_kwargs( def _resolve_doubly_passed_kwarg( - kwargs_dict, - kwarg_name, - passed_kwarg_value, - default, - err_msg_dict_name, -): + kwargs_dict: dict, + kwarg_name: str, + passed_kwarg_value: str | bool | None, + default: bool | None, + err_msg_dict_name: str, +) -> dict: # if in kwargs_dict but not passed explicitly then just pass kwargs_dict through unaltered if kwarg_name in kwargs_dict and passed_kwarg_value is None: pass diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 399c3bddb21..1683233c18d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -374,7 +374,7 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): self.encoding = encoding @property - def dtype(self): + def dtype(self) -> np.dtype: """ Data-type of the array’s elements. 
@@ -386,7 +386,7 @@ def dtype(self): return self._data.dtype @property - def shape(self): + def shape(self) -> tuple[int, ...]: """ Tuple of array dimensions. diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py index 88ff7968966..2c3378a2816 100644 --- a/xarray/tests/test_parallelcompat.py +++ b/xarray/tests/test_parallelcompat.py @@ -65,8 +65,8 @@ def chunks(self, data: DummyChunkedArray) -> T_NormalizedChunks: def normalize_chunks( self, - chunks: T_Chunks, - shape: tuple[int] | None = None, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] | None = None, limit: int | None = None, dtype: np.dtype | None = None, previous_chunks: T_NormalizedChunks | None = None, From fe2e9b37929524578403e6517918e319c6d043d4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 16 May 2023 16:16:43 -0400 Subject: [PATCH 150/158] Keyword-only arguments in full_like etc. Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/core/common.py | 56 ++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index a520a597ae7..96391464085 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1400,7 +1400,8 @@ def full_like( other: DataArray, fill_value: Any, dtype: DTypeLikeSave | None = None, - chunks={}, + *, + chunks: T_Chunks={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1412,7 +1413,8 @@ def full_like( other: Dataset, fill_value: Any, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1424,7 +1426,8 @@ def full_like( other: Variable, fill_value: Any, dtype: DTypeLikeSave | None = None, - chunks={}, + *. 
+ chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1436,7 +1439,8 @@ def full_like( other: Dataset | DataArray, fill_value: Any, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks:T_Chunks={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1448,7 +1452,8 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1459,7 +1464,8 @@ def full_like( other: Dataset | DataArray | Variable, fill_value: Any, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1645,7 +1651,7 @@ def _full_like_variable( other: Variable, fill_value: Any, dtype: DTypeLike | None = None, - chunks={}, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1684,7 +1690,8 @@ def _full_like_variable( def zeros_like( other: DataArray, dtype: DTypeLikeSave | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1695,7 +1702,8 @@ def zeros_like( def zeros_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1706,7 +1714,8 @@ def zeros_like( def zeros_like( other: Variable, dtype: DTypeLikeSave | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | 
None = None, ) -> Variable: @@ -1717,7 +1726,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1728,7 +1738,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1738,7 +1749,8 @@ def zeros_like( def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1813,7 +1825,8 @@ def zeros_like( def ones_like( other: DataArray, dtype: DTypeLikeSave | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1824,7 +1837,8 @@ def ones_like( def ones_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1835,7 +1849,8 @@ def ones_like( def ones_like( other: Variable, dtype: DTypeLikeSave | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1846,7 +1861,8 @@ def ones_like( def ones_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1857,7 +1873,8 @@ def ones_like( def ones_like( other: 
Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1867,7 +1884,8 @@ def ones_like( def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, - chunks={}, + *, + chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: From 7bcaecee1cf2df988f665880ff403836a3d292a5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 May 2023 20:17:21 +0000 Subject: [PATCH 151/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 96391464085..13cb421f92e 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1861,7 +1861,7 @@ def ones_like( def ones_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, - *, + *, chunks: T_Chunks ={}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, From fecf7ed9f229fdecaaf0b5996924aed305296b20 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 16 May 2023 16:33:52 -0400 Subject: [PATCH 152/158] None as default instead of {} --- xarray/core/common.py | 67 +++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 96391464085..397d6de226a 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -47,6 +47,7 @@ DTypeLikeSave, ScalarOrArray, SideOptions, + T_Chunks, T_DataWithCoords, T_Variable, ) @@ -1401,7 +1402,7 @@ def full_like( fill_value: Any, dtype: DTypeLikeSave | None = None, *, - chunks: 
T_Chunks={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1414,7 +1415,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1426,8 +1427,8 @@ def full_like( other: Variable, fill_value: Any, dtype: DTypeLikeSave | None = None, - *. - chunks: T_Chunks ={}, + *, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1440,7 +1441,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, *, - chunks:T_Chunks={}, + chunks: T_Chunks = {}, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1453,7 +1454,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1465,7 +1466,7 @@ def full_like( fill_value: Any, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1651,7 +1652,7 @@ def _full_like_variable( other: Variable, fill_value: Any, dtype: DTypeLike | None = None, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1661,7 +1662,11 @@ def _full_like_variable( if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype) - if is_chunked_array(other.data) or chunked_array_type is not None or chunks != {}: + if ( + 
is_chunked_array(other.data) + or chunked_array_type is not None + or chunks is not None + ): if chunked_array_type is None: chunkmanager = get_chunked_array_type(other.data) else: @@ -1691,7 +1696,7 @@ def zeros_like( other: DataArray, dtype: DTypeLikeSave | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1703,7 +1708,7 @@ def zeros_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1715,7 +1720,7 @@ def zeros_like( other: Variable, dtype: DTypeLikeSave | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1727,7 +1732,7 @@ def zeros_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1739,7 +1744,7 @@ def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1750,7 +1755,7 @@ def zeros_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1818,7 +1823,14 @@ def zeros_like( full_like """ - return full_like(other, 0, dtype, chunks, chunked_array_type, from_array_kwargs) + return full_like( + other, + 0, + dtype, + chunks=chunks, + 
chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) @overload @@ -1826,7 +1838,7 @@ def ones_like( other: DataArray, dtype: DTypeLikeSave | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: @@ -1838,7 +1850,7 @@ def ones_like( other: Dataset, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: @@ -1850,7 +1862,7 @@ def ones_like( other: Variable, dtype: DTypeLikeSave | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: @@ -1861,8 +1873,8 @@ def ones_like( def ones_like( other: Dataset | DataArray, dtype: DTypeMaybeMapping | None = None, - *, - chunks: T_Chunks ={}, + *, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: @@ -1874,7 +1886,7 @@ def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1885,7 +1897,7 @@ def ones_like( other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping | None = None, *, - chunks: T_Chunks ={}, + chunks: T_Chunks = None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: @@ -1945,7 +1957,14 @@ def ones_like( full_like """ - return full_like(other, 1, dtype, chunks, chunked_array_type, from_array_kwargs) + return full_like( + other, + 1, + dtype, + chunks=chunks, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) 
def get_chunksizes( From 660ef416338dca2a190c04a35a515dcf47848f71 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 16 May 2023 16:52:54 -0400 Subject: [PATCH 153/158] fix bug apparently introduced by changing default type of drop_axis kwarg to map_blocks --- xarray/core/daskmanager.py | 2 +- xarray/core/parallelcompat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index 2af9b88f4ba..b2bcc04d9d1 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -134,7 +134,7 @@ def map_blocks( *args: Any, dtype: np.typing.DTypeLike | None = None, chunks: tuple[int, ...] | None = None, - drop_axis: int | Sequence[int] | None = None, + drop_axis: int | Sequence[int] = [], new_axis: int | Sequence[int] | None = None, **kwargs, ): diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7d4458b8ae5..cfe1bcbbed0 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -242,7 +242,7 @@ def map_blocks( *args: Any, dtype: np.typing.DTypeLike | None = None, chunks: tuple[int, ...] 
| None = None, - drop_axis: int | Sequence[int] | None = None, + drop_axis: int | Sequence[int] = [], new_axis: int | Sequence[int] | None = None, **kwargs, ): From e6d6f1f7e870bf06b308221728f02add24585508 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 16 May 2023 18:00:50 -0400 Subject: [PATCH 154/158] Removed hopefully-unnecessary mypy ignore Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4b26d4a3e5f..00941db1f83 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2244,7 +2244,7 @@ def chunk( inline_array: bool = False, chunked_array_type: str | ChunkManagerEntrypoint - | None = None, # noqa: F821 # type: ignore[name-defined] + | None = None, from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: From c7fbe79f7164851825af77532d528379658639db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 May 2023 22:01:32 +0000 Subject: [PATCH 155/158] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 00941db1f83..a9dbeb7fc85 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2242,9 +2242,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunked_array_type: str - | ChunkManagerEntrypoint - | None = None, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: From d728427186ea8974ecf1087f4da9f749e84bb53f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 16 May 2023 18:11:44 -0400 Subject: [PATCH 156/158] removed 
unnecessary mypy ignores --- xarray/core/dataarray.py | 6 ++---- xarray/core/dataset.py | 8 +++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4a4a9063e49..bc2450abc9d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -36,7 +36,6 @@ from xarray.core.indexing import is_fancy_indexer, map_index_queries from xarray.core.merge import PANDAS_TYPES, MergeError, _create_indexes_from_coords from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import ChunkManagerEntrypoint # noqa from xarray.core.utils import ( Default, HybridMappingProxy, @@ -78,6 +77,7 @@ from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy + from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling from xarray.core.types import ( @@ -1265,9 +1265,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunked_array_type: str - | ChunkManagerEntrypoint - | None = None, # noqa: F821 # type: ignore[name-defined] + chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 00941db1f83..d2ecd65ba58 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -74,8 +74,7 @@ ) from xarray.core.missing import get_clean_interp_index from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.parallelcompat import ( # noqa - ChunkManagerEntrypoint, +from xarray.core.parallelcompat import ( get_chunked_array_type, guess_chunkmanager, ) @@ -118,6 +117,7 @@ from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy from xarray.core.merge import 
CoercibleMapping + from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling from xarray.core.types import ( @@ -2242,9 +2242,7 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, - chunked_array_type: str - | ChunkManagerEntrypoint - | None = None, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: From 51db5f23e0ffeb3bd1a8e565b4e1a26f1f8ff4e4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 17 May 2023 23:48:40 -0400 Subject: [PATCH 157/158] change default value of drop_axis kwarg in map_blocks and catch when dask version < 2022.9.1 --- xarray/core/daskmanager.py | 9 ++++++++- xarray/core/parallelcompat.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index b2bcc04d9d1..cb90a448c26 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -3,7 +3,9 @@ from collections.abc import Iterable, Sequence from typing import TYPE_CHECKING, Any, Callable +import dask import numpy as np +from packaging.version import Version from xarray.core.duck_array_ops import dask_available from xarray.core.indexing import ImplicitToExplicitIndexingAdapter @@ -134,12 +136,17 @@ def map_blocks( *args: Any, dtype: np.typing.DTypeLike | None = None, chunks: tuple[int, ...] 
| None = None, - drop_axis: int | Sequence[int] = [], + drop_axis: int | Sequence[int] | None = None, new_axis: int | Sequence[int] | None = None, **kwargs, ): from dask.array import map_blocks + if drop_axis is None and Version(dask.__version__) >= Version("2022.9.1"): + # See https://github.com/pydata/xarray/pull/7019#discussion_r1196729489 + # TODO remove once dask minimum version >= 2022.9.1 + drop_axis = [] + # pass through name, meta, token as kwargs return map_blocks( func, diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index cfe1bcbbed0..7d4458b8ae5 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -242,7 +242,7 @@ def map_blocks( *args: Any, dtype: np.typing.DTypeLike | None = None, chunks: tuple[int, ...] | None = None, - drop_axis: int | Sequence[int] = [], + drop_axis: int | Sequence[int] | None = None, new_axis: int | Sequence[int] | None = None, **kwargs, ): From c69c56319dfbe84ac5e59b97a497ed8de799905d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 18 May 2023 12:12:12 -0400 Subject: [PATCH 158/158] fix checking of dask version in map_blocks --- xarray/core/daskmanager.py | 4 ++-- xarray/core/parallelcompat.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py index cb90a448c26..56d8dc9e23a 100644 --- a/xarray/core/daskmanager.py +++ b/xarray/core/daskmanager.py @@ -3,7 +3,6 @@ from collections.abc import Iterable, Sequence from typing import TYPE_CHECKING, Any, Callable -import dask import numpy as np from packaging.version import Version @@ -140,9 +139,10 @@ def map_blocks( new_axis: int | Sequence[int] | None = None, **kwargs, ): + import dask from dask.array import map_blocks - if drop_axis is None and Version(dask.__version__) >= Version("2022.9.1"): + if drop_axis is None and Version(dask.__version__) < Version("2022.9.1"): # See https://github.com/pydata/xarray/pull/7019#discussion_r1196729489 
# TODO remove once dask minimum version >= 2022.9.1 drop_axis = [] diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py index 7d4458b8ae5..4df4ff235c6 100644 --- a/xarray/core/parallelcompat.py +++ b/xarray/core/parallelcompat.py @@ -234,7 +234,7 @@ def apply_gufunc( """ Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. """ - ... + raise NotImplementedError() def map_blocks( self, @@ -246,7 +246,7 @@ def map_blocks( new_axis: int | Sequence[int] | None = None, **kwargs, ): - """Currently only called in a couple of really niche places in xarray. Not even called in xarray.map_blocks.""" + """Called in elementwise operations, but notably not called in xarray.map_blocks.""" raise NotImplementedError() def blockwise(