Skip to content

Commit

Permalink
ds.to_dict with data as arrays, not lists (#7739)
Browse files Browse the repository at this point in the history
* first stab at ds.to_dict giving data as numpy objects

* update whats-new.rst

* mypy flailing: add dict typing to test_dataarray

* mypy flailing 2: add dict typing to core/variable.py

* testing equality of encodings on Ds.from_dict(ds.to_dict()) roundtrips

* .values -> .to_numpy()

* requested changes on 4/19/23

* to_dict kwarg data handles bool and str: "list" and True return lists of Python datatypes, "array" returns numpy.ndarrays, and False returns only the schema

* fix mypy hashable not being string

* touch ups on to_dict() changes

* to_dict with dask, tested. other minor things

* touch up

* finalize to_dict()
  • Loading branch information
jmccreight authored Apr 28, 2023
1 parent a220022 commit 087ebbb
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 36 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ New Features
- Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`.
(:issue:`7692`, :pull:`7693`).
By `Joe Hamman <https://github.com/jhamman>`_.
- Keyword argument ``data='array'`` to both :py:meth:`xarray.Dataset.to_dict` and
:py:meth:`xarray.DataArray.to_dict` will now return data as the underlying array type. Python lists are returned for ``data='list'`` or ``data=True``. Supplying ``data=False`` only returns the schema without data. ``encoding=True`` also returns the encoding dictionary for the underlying variable.
(:issue:`1599`, :pull:`7739`).
By `James McCreight <https://github.com/jmccreight>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
13 changes: 10 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4174,7 +4174,9 @@ def to_zarr(
zarr_version=zarr_version,
)

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
) -> dict[str, Any]:
"""
Convert this xarray.DataArray into a dictionary following xarray
naming conventions.
Expand All @@ -4185,9 +4187,14 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
Parameters
----------
data : bool, default: True
data : bool or {"list", "array"}, default: "list"
Whether to include the actual data in the dictionary. When set to
False, returns just the schema.
False, returns just the schema. If set to "array", returns data as
underlying array type. If set to "list" (or True for backwards
compatibility), returns data in lists of Python data types. Note
that for obtaining the "list" output efficiently, use
`da.compute().to_dict(data="list")`.
encoding : bool, default: False
Whether to include the Dataset's encoding in the dictionary.
Expand Down
16 changes: 12 additions & 4 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6441,7 +6441,9 @@ def to_dask_dataframe(

return df

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool | Literal["list", "array"] = "list", encoding: bool = False
) -> dict[str, Any]:
"""
Convert this dataset to a dictionary following xarray naming
conventions.
Expand All @@ -6452,9 +6454,14 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
Parameters
----------
data : bool, default: True
data : bool or {"list", "array"}, default: "list"
Whether to include the actual data in the dictionary. When set to
False, returns just the schema.
False, returns just the schema. If set to "array", returns data as
underlying array type. If set to "list" (or True for backwards
compatibility), returns data in lists of Python data types. Note
that for obtaining the "list" output efficiently, use
`ds.compute().to_dict(data="list")`.
encoding : bool, default: False
Whether to include the Dataset's encoding in the dictionary.
Expand Down Expand Up @@ -6560,7 +6567,8 @@ def from_dict(cls: type[T_Dataset], d: Mapping[Any, Any]) -> T_Dataset:
)
try:
variable_dict = {
k: (v["dims"], v["data"], v.get("attrs")) for k, v in variables
k: (v["dims"], v["data"], v.get("attrs"), v.get("encoding"))
for k, v in variables
}
except KeyError as e:
raise ValueError(
Expand Down
20 changes: 16 additions & 4 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,23 @@ def to_index(self) -> pd.Index:
"""Convert this variable to a pandas.Index"""
return self.to_index_variable().to_index()

def to_dict(self, data: bool = True, encoding: bool = False) -> dict:
def to_dict(
self, data: bool | str = "list", encoding: bool = False
) -> dict[str, Any]:
"""Dictionary representation of variable."""
item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
if data:
item["data"] = ensure_us_time_resolution(self.values).tolist()
item: dict[str, Any] = {
"dims": self.dims,
"attrs": decode_numpy_dict_values(self.attrs),
}
if data is not False:
if data in [True, "list"]:
item["data"] = ensure_us_time_resolution(self.to_numpy()).tolist()
elif data == "array":
item["data"] = ensure_us_time_resolution(self.data)
else:
msg = 'data argument must be bool, "list", or "array"'
raise ValueError(msg)

else:
item.update({"dtype": str(self.dtype), "shape": self.shape})

Expand Down
56 changes: 40 additions & 16 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collections.abc import Hashable
from copy import deepcopy
from textwrap import dedent
from typing import Any, Final, cast
from typing import Any, Final, Literal, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -3345,46 +3345,70 @@ def test_series_categorical_index(self) -> None:
arr = DataArray(s)
assert "'a'" in repr(arr) # should not error

@pytest.mark.parametrize("use_dask", [True, False])
@pytest.mark.parametrize("data", ["list", "array", True])
@pytest.mark.parametrize("encoding", [True, False])
def test_to_and_from_dict(self, encoding) -> None:
def test_to_and_from_dict(
self, encoding: bool, data: bool | Literal["list", "array"], use_dask: bool
) -> None:
if use_dask and not has_dask:
pytest.skip("requires dask")
encoding_data = {"bar": "spam"}
array = DataArray(
np.random.randn(2, 3), {"x": ["a", "b"]}, ["x", "y"], name="foo"
)
array.encoding = {"bar": "spam"}
expected = {
array.encoding = encoding_data

return_data = array.to_numpy()
coords_data = np.array(["a", "b"])
if data == "list" or data is True:
return_data = return_data.tolist()
coords_data = coords_data.tolist()

expected: dict[str, Any] = {
"name": "foo",
"dims": ("x", "y"),
"data": array.values.tolist(),
"data": return_data,
"attrs": {},
"coords": {"x": {"dims": ("x",), "data": ["a", "b"], "attrs": {}}},
"coords": {"x": {"dims": ("x",), "data": coords_data, "attrs": {}}},
}
if encoding:
expected["encoding"] = {"bar": "spam"}
actual = array.to_dict(encoding=encoding)
expected["encoding"] = encoding_data

if has_dask:
da = array.chunk()
else:
da = array

if data == "array" or data is False:
with raise_if_dask_computes():
actual = da.to_dict(encoding=encoding, data=data)
else:
actual = da.to_dict(encoding=encoding, data=data)

# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)

# check roundtrip
assert_identical(array, DataArray.from_dict(actual))
assert_identical(da, DataArray.from_dict(actual))

# a more bare bones representation still roundtrips
d = {
"name": "foo",
"dims": ("x", "y"),
"data": array.values.tolist(),
"data": da.values.tolist(),
"coords": {"x": {"dims": "x", "data": ["a", "b"]}},
}
assert_identical(array, DataArray.from_dict(d))
assert_identical(da, DataArray.from_dict(d))

# and the most bare bones representation still roundtrips
d = {"name": "foo", "dims": ("x", "y"), "data": array.values}
assert_identical(array.drop_vars("x"), DataArray.from_dict(d))
d = {"name": "foo", "dims": ("x", "y"), "data": da.values}
assert_identical(da.drop_vars("x"), DataArray.from_dict(d))

# missing a dims in the coords
d = {
"dims": ("x", "y"),
"data": array.values,
"data": da.values,
"coords": {"x": {"data": ["a", "b"]}},
}
with pytest.raises(
Expand All @@ -3407,7 +3431,7 @@ def test_to_and_from_dict(self, encoding) -> None:
endiantype = "<U1" if sys.byteorder == "little" else ">U1"
expected_no_data["coords"]["x"].update({"dtype": endiantype, "shape": (2,)})
expected_no_data.update({"dtype": "float64", "shape": (2, 3)})
actual_no_data = array.to_dict(data=False, encoding=encoding)
actual_no_data = da.to_dict(data=False, encoding=encoding)
assert expected_no_data == actual_no_data

def test_to_and_from_dict_with_time_dim(self) -> None:
Expand Down
40 changes: 31 additions & 9 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from copy import copy, deepcopy
from io import StringIO
from textwrap import dedent
from typing import Any
from typing import Any, Literal

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -4596,7 +4596,11 @@ def test_convert_dataframe_with_many_types_and_multiindex(self) -> None:
expected = df.apply(np.asarray)
assert roundtripped.equals(expected)

def test_to_and_from_dict(self) -> None:
@pytest.mark.parametrize("encoding", [True, False])
@pytest.mark.parametrize("data", [True, "list", "array"])
def test_to_and_from_dict(
self, encoding: bool, data: bool | Literal["list", "array"]
) -> None:
# <xarray.Dataset>
# Dimensions: (t: 10)
# Coordinates:
Expand All @@ -4617,14 +4621,25 @@ def test_to_and_from_dict(self) -> None:
"b": {"dims": ("t",), "data": y.tolist(), "attrs": {}},
},
}
if encoding:
ds.t.encoding.update({"foo": "bar"})
expected["encoding"] = {}
expected["coords"]["t"]["encoding"] = ds.t.encoding
for vvs in ["a", "b"]:
expected["data_vars"][vvs]["encoding"] = {}

actual = ds.to_dict()
actual = ds.to_dict(data=data, encoding=encoding)

# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)

# check roundtrip
assert_identical(ds, Dataset.from_dict(actual))
ds_rt = Dataset.from_dict(actual)
assert_identical(ds, ds_rt)
if encoding:
assert set(ds_rt.variables) == set(ds.variables)
for vv in ds.variables:
np.testing.assert_equal(ds_rt[vv].encoding, ds[vv].encoding)

# check the data=False option
expected_no_data = expected.copy()
Expand All @@ -4635,14 +4650,18 @@ def test_to_and_from_dict(self) -> None:
expected_no_data["coords"]["t"].update({"dtype": endiantype, "shape": (10,)})
expected_no_data["data_vars"]["a"].update({"dtype": "float64", "shape": (10,)})
expected_no_data["data_vars"]["b"].update({"dtype": "float64", "shape": (10,)})
actual_no_data = ds.to_dict(data=False)
actual_no_data = ds.to_dict(data=False, encoding=encoding)
assert expected_no_data == actual_no_data

# verify coords are included roundtrip
expected_ds = ds.set_coords("b")
actual2 = Dataset.from_dict(expected_ds.to_dict())
actual2 = Dataset.from_dict(expected_ds.to_dict(data=data, encoding=encoding))

assert_identical(expected_ds, actual2)
if encoding:
assert set(expected_ds.variables) == set(actual2.variables)
for vv in ds.variables:
np.testing.assert_equal(expected_ds[vv].encoding, actual2[vv].encoding)

# test some incomplete dicts:
# this one has no attrs field, the dims are strings, and x, y are
Expand Down Expand Up @@ -4690,7 +4709,10 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
roundtripped = Dataset.from_dict(ds.to_dict())
assert_identical(ds, roundtripped)

def test_to_and_from_dict_with_nan_nat(self) -> None:
@pytest.mark.parametrize("data", [True, "list", "array"])
def test_to_and_from_dict_with_nan_nat(
self, data: bool | Literal["list", "array"]
) -> None:
x = np.random.randn(10, 3)
y = np.random.randn(10, 3)
y[2] = np.nan
Expand All @@ -4706,7 +4728,7 @@ def test_to_and_from_dict_with_nan_nat(self) -> None:
"lat": ("lat", lat),
}
)
roundtripped = Dataset.from_dict(ds.to_dict())
roundtripped = Dataset.from_dict(ds.to_dict(data=data))
assert_identical(ds, roundtripped)

def test_to_dict_with_numpy_attrs(self) -> None:
Expand Down

0 comments on commit 087ebbb

Please sign in to comment.