Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ds.to_dict with data as arrays, not lists #7739

Merged
merged 19 commits into from
Apr 28, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ New Features
- Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`.
(:issue:`7692`, :pull:`7693`) .
By `Joe Hamman <https://github.com/jhamman>`_.
- Keyword argument ``data='array'`` to both :py:meth:`xarray.Dataset.to_dict` and
:py:meth:`xarray.DataArray.to_dict` will now return data as the underlying array type. Python lists are returned for ``data='list'`` or ``data=True``. Supplying ``data=False`` only returns the schema without data. ``encoding=True`` also returns the encoding dictionary for the underlying variable.
(:issue:`1599`, :pull:`7739`) .
By `James McCreight <https://github.com/jmccreight>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
10 changes: 7 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4174,7 +4174,9 @@ def to_zarr(
zarr_version=zarr_version,
)

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
) -> dict[str, Any]:
"""
Convert this xarray.DataArray into a dictionary following xarray
naming conventions.
Expand All @@ -4185,9 +4187,11 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:

Parameters
----------
data : bool, default: True
data : bool or {"list", "array"}, default: "list"
Whether to include the actual data in the dictionary. When set to
False, returns just the schema.
False, returns just the schema. If set to "list" (or True for
backwards compatibility), returns data in lists of Python data types.
If set to "array", returns data as the underlying array type.
encoding : bool, default: False
Whether to include the DataArray's encoding in the dictionary.

Expand Down
13 changes: 9 additions & 4 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6441,7 +6441,9 @@ def to_dask_dataframe(

return df

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
) -> dict[str, Any]:
"""
Convert this dataset to a dictionary following xarray naming
conventions.
Expand All @@ -6452,9 +6454,11 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:

Parameters
----------
data : bool, default: True
data : bool or {"list", "array"}, default: "list"
Whether to include the actual data in the dictionary. When set to
False, returns just the schema.
False, returns just the schema. If set to "list" (or True for
backwards compatibility), returns data in lists of Python data types.
If set to "array", returns data as the underlying array type.
encoding : bool, default: False
Whether to include the Dataset's encoding in the dictionary.

Expand Down Expand Up @@ -6560,7 +6564,8 @@ def from_dict(cls: type[T_Dataset], d: Mapping[Any, Any]) -> T_Dataset:
)
try:
variable_dict = {
k: (v["dims"], v["data"], v.get("attrs")) for k, v in variables
k: (v["dims"], v["data"], v.get("attrs"), v.get("encoding"))
for k, v in variables
}
except KeyError as e:
raise ValueError(
Expand Down
21 changes: 17 additions & 4 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,24 @@ def to_index(self) -> pd.Index:
"""Convert this variable to a pandas.Index"""
return self.to_index_variable().to_index()

def to_dict(self, data: bool = True, encoding: bool = False) -> dict:
def to_dict(
self, data: bool | str = "list", encoding: bool = False
) -> dict[str, Any]:
"""Dictionary representation of variable."""
item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
if data:
item["data"] = ensure_us_time_resolution(self.values).tolist()
item: dict[str, Any] = {
"dims": self.dims,
"attrs": decode_numpy_dict_values(self.attrs),
}
if data is not False:
if data is True or data == "list":
jmccreight marked this conversation as resolved.
Show resolved Hide resolved
item["data"] = ensure_us_time_resolution(self.to_numpy())
item["data"] = item["data"].tolist()
elif data == "array":
item["data"] = ensure_us_time_resolution(self.data)
else:
msg = 'data argument must be bool, "list", or "array"'
raise ValueError(msg)

jmccreight marked this conversation as resolved.
Show resolved Hide resolved
else:
item.update({"dtype": str(self.dtype), "shape": self.shape})

Expand Down
56 changes: 40 additions & 16 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collections.abc import Hashable
from copy import deepcopy
from textwrap import dedent
from typing import Any, Final, cast
from typing import Any, Final, Literal, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -3345,46 +3345,70 @@ def test_series_categorical_index(self) -> None:
arr = DataArray(s)
assert "'a'" in repr(arr) # should not error

@pytest.mark.parametrize("use_dask", [True, False])
@pytest.mark.parametrize("data", ["list", "array", True])
@pytest.mark.parametrize("encoding", [True, False])
def test_to_and_from_dict(self, encoding) -> None:
def test_to_and_from_dict(
self, encoding: bool, data: bool | Literal["list", "array"], use_dask: bool
) -> None:
if use_dask and not has_dask:
pytest.skip("requires dask")
encoding_data = {"bar": "spam"}
array = DataArray(
np.random.randn(2, 3), {"x": ["a", "b"]}, ["x", "y"], name="foo"
)
array.encoding = {"bar": "spam"}
expected = {
array.encoding = encoding_data

return_data = array.values
jmccreight marked this conversation as resolved.
Show resolved Hide resolved
coords_data = np.array(["a", "b"])
if data == "list" or data is True:
return_data = return_data.tolist()
coords_data = coords_data.tolist()

expected: dict[str, Any] = {
"name": "foo",
"dims": ("x", "y"),
"data": array.values.tolist(),
"data": return_data,
"attrs": {},
"coords": {"x": {"dims": ("x",), "data": ["a", "b"], "attrs": {}}},
"coords": {"x": {"dims": ("x",), "data": coords_data, "attrs": {}}},
}
if encoding:
expected["encoding"] = {"bar": "spam"}
actual = array.to_dict(encoding=encoding)
expected["encoding"] = encoding_data

if has_dask:
da = array.chunk()
else:
da = array

if data == "array" or data is False:
with raise_if_dask_computes():
actual = da.to_dict(encoding=encoding, data=data)
else:
actual = da.to_dict(encoding=encoding, data=data)

jmccreight marked this conversation as resolved.
Show resolved Hide resolved
# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)
dcherian marked this conversation as resolved.
Show resolved Hide resolved
jmccreight marked this conversation as resolved.
Show resolved Hide resolved

# check roundtrip
assert_identical(array, DataArray.from_dict(actual))
assert_identical(da, DataArray.from_dict(actual))

# a more bare bones representation still roundtrips
d = {
"name": "foo",
"dims": ("x", "y"),
"data": array.values.tolist(),
"data": da.values.tolist(),
"coords": {"x": {"dims": "x", "data": ["a", "b"]}},
}
assert_identical(array, DataArray.from_dict(d))
assert_identical(da, DataArray.from_dict(d))

# and the most bare bones representation still roundtrips
d = {"name": "foo", "dims": ("x", "y"), "data": array.values}
assert_identical(array.drop_vars("x"), DataArray.from_dict(d))
d = {"name": "foo", "dims": ("x", "y"), "data": da.values}
assert_identical(da.drop_vars("x"), DataArray.from_dict(d))

# missing a dims in the coords
d = {
"dims": ("x", "y"),
"data": array.values,
"data": da.values,
"coords": {"x": {"data": ["a", "b"]}},
}
with pytest.raises(
Expand All @@ -3407,7 +3431,7 @@ def test_to_and_from_dict(self, encoding) -> None:
endiantype = "<U1" if sys.byteorder == "little" else ">U1"
expected_no_data["coords"]["x"].update({"dtype": endiantype, "shape": (2,)})
expected_no_data.update({"dtype": "float64", "shape": (2, 3)})
actual_no_data = array.to_dict(data=False, encoding=encoding)
actual_no_data = da.to_dict(data=False, encoding=encoding)
assert expected_no_data == actual_no_data

def test_to_and_from_dict_with_time_dim(self) -> None:
Expand Down
40 changes: 31 additions & 9 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from copy import copy, deepcopy
from io import StringIO
from textwrap import dedent
from typing import Any
from typing import Any, Literal

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -4596,7 +4596,11 @@ def test_convert_dataframe_with_many_types_and_multiindex(self) -> None:
expected = df.apply(np.asarray)
assert roundtripped.equals(expected)

def test_to_and_from_dict(self) -> None:
@pytest.mark.parametrize("encoding", [True, False])
@pytest.mark.parametrize("data", [True, "list", "array"])
def test_to_and_from_dict(
self, encoding: bool, data: bool | Literal["list", "array"]
) -> None:
# <xarray.Dataset>
# Dimensions: (t: 10)
# Coordinates:
Expand All @@ -4617,14 +4621,25 @@ def test_to_and_from_dict(self) -> None:
"b": {"dims": ("t",), "data": y.tolist(), "attrs": {}},
},
}
if encoding:
ds.t.encoding.update({"foo": "bar"})
expected["encoding"] = {}
expected["coords"]["t"]["encoding"] = ds.t.encoding
for vvs in ["a", "b"]:
expected["data_vars"][vvs]["encoding"] = {}

actual = ds.to_dict()
actual = ds.to_dict(data=data, encoding=encoding)

# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)

# check roundtrip
assert_identical(ds, Dataset.from_dict(actual))
ds_rt = Dataset.from_dict(actual)
assert_identical(ds, ds_rt)
if encoding:
assert set(ds_rt.variables) == set(ds.variables)
for vv in ds.variables:
np.testing.assert_equal(ds_rt[vv].encoding, ds[vv].encoding)

# check the data=False option
expected_no_data = expected.copy()
Expand All @@ -4635,14 +4650,18 @@ def test_to_and_from_dict(self) -> None:
expected_no_data["coords"]["t"].update({"dtype": endiantype, "shape": (10,)})
expected_no_data["data_vars"]["a"].update({"dtype": "float64", "shape": (10,)})
expected_no_data["data_vars"]["b"].update({"dtype": "float64", "shape": (10,)})
actual_no_data = ds.to_dict(data=False)
actual_no_data = ds.to_dict(data=False, encoding=encoding)
assert expected_no_data == actual_no_data

# verify coords are included roundtrip
expected_ds = ds.set_coords("b")
actual2 = Dataset.from_dict(expected_ds.to_dict())
actual2 = Dataset.from_dict(expected_ds.to_dict(data=data, encoding=encoding))

assert_identical(expected_ds, actual2)
if encoding:
assert set(expected_ds.variables) == set(actual2.variables)
for vv in ds.variables:
np.testing.assert_equal(expected_ds[vv].encoding, actual2[vv].encoding)

# test some incomplete dicts:
# this one has no attrs field, the dims are strings, and x, y are
Expand Down Expand Up @@ -4690,7 +4709,10 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
roundtripped = Dataset.from_dict(ds.to_dict())
assert_identical(ds, roundtripped)

def test_to_and_from_dict_with_nan_nat(self) -> None:
@pytest.mark.parametrize("data", [True, "list", "array"])
def test_to_and_from_dict_with_nan_nat(
self, data: bool | Literal["list", "array"]
) -> None:
x = np.random.randn(10, 3)
y = np.random.randn(10, 3)
y[2] = np.nan
Expand All @@ -4706,7 +4728,7 @@ def test_to_and_from_dict_with_nan_nat(self) -> None:
"lat": ("lat", lat),
}
)
roundtripped = Dataset.from_dict(ds.to_dict())
roundtripped = Dataset.from_dict(ds.to_dict(data=data))
assert_identical(ds, roundtripped)

def test_to_dict_with_numpy_attrs(self) -> None:
Expand Down