-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add methods for combining variables of differing dimensionality #1597
Changes from 6 commits
8c947e7
e997f7f
3d757da
151dc71
8a1a8ef
e8594f1
0f1ba22
1e1f4d9
35e0ecf
23d9246
099d440
e40b6a2
35a2365
35715dc
5ca9a1d
2979c75
c17dc09
ce3b52e
4ade43d
6d520c2
2669797
24b2237
13587c2
95e2da9
7aa7095
e08622a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,8 @@ Enhancements | |
- Allow ``expand_dims`` method to support inserting/broadcasting dimensions | ||
with size > 1. (:issue:`2710`) | ||
By `Martin Pletcher <https://github.com/pletchm>`_. | ||
- New methods for reshaping Datasets of variables with different dimensions | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this needs to move up to 0.12.3 now -- sorry for the churn here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. I moved this up to that release and added a new section header
|
||
(:issue:`1317`). By `Noah Brenowitz <https://github.com/nbren12>`_. | ||
|
||
|
||
Bug fixes | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1402,6 +1402,66 @@ def unstack(self, dim=None): | |||||
ds = self._to_temp_dataset().unstack(dim) | ||||||
return self._from_temp_dataset(ds) | ||||||
|
||||||
def to_unstacked_dataset(self, dim, level=0, | ||||||
variable_dim='variable'): | ||||||
"""Unstack DataArray expanding to Dataset along a given level of a | ||||||
stacked coordinate. | ||||||
|
||||||
This is the inverse operation of Dataset.to_stacked_array. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
dim : str | ||||||
Name of existing dimension to unstack | ||||||
level : int | ||||||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
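Index of level to expand to dataset along | ||||||
variable_dim : str, optional | ||||||
Name of the MultiIndex level used to select each variable when | ||||||
splitting the array back into a Dataset. | ||||||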
|
||||||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Returns | ||||||
------- | ||||||
unstacked : Dataset | ||||||
|
||||||
rabernat marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Examples | ||||||
-------- | ||||||
>>> import numpy as np | ||||||
>>> import xarray as xr | ||||||
>>> arr = xr.DataArray(np.arange(6).reshape(2, 3), | ||||||
... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) | ||||||
>>> data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)}) | ||||||
>>> data | ||||||
<xarray.Dataset> | ||||||
Dimensions: (x: 2, y: 3) | ||||||
Coordinates: | ||||||
* x (x) <U1 'a' 'b' | ||||||
* y (y) int64 0 1 2 | ||||||
Data variables: | ||||||
a (x, y) int64 0 1 2 3 4 5 | ||||||
b (x) int64 0 3 | ||||||
>>> stacked = data.to_stacked_array("z", ['y']) | ||||||
>>> stacked.indexes['z'] | ||||||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
MultiIndex(levels=[['a', 'b'], [0, 1, 2]], | ||||||
labels=[[0, 0, 0, 1], [0, 1, 2, -1]], | ||||||
names=['variable', 'y']) | ||||||
>>> roundtripped = stacked.to_unstacked_dataset(dim='z') | ||||||
>>> data.identical(roundtripped) | ||||||
True | ||||||
|
||||||
See Also | ||||||
-------- | ||||||
Dataset.to_stacked_array | ||||||
""" | ||||||
|
||||||
idx = self.indexes[dim] | ||||||
if not isinstance(idx, pd.MultiIndex): | ||||||
raise ValueError(dim, "is not a stacked coordinate") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please add test coverage for this error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure. done |
||||||
variables = idx.levels[level] | ||||||
|
||||||
# pull variables out of the DataArray | ||||||
data_dict = OrderedDict() | ||||||
for k in variables: | ||||||
data_dict[k] = self.sel(**{variable_dim: k}).squeeze(drop=True) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than sending as kwargs, if we send as a dict then this will work with non-str keys (though dim names is only partially supported anyway atm)
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have made this change. |
||||||
|
||||||
# unstacked dataset | ||||||
return Dataset(data_dict) | ||||||
|
||||||
def transpose(self, *dims): | ||||||
"""Return a new DataArray object with transposed dimensions. | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2627,6 +2627,107 @@ def stack(self, dimensions=None, **dimensions_kwargs): | |
result = result._stack_once(dims, new_dim) | ||
return result | ||
|
||
def to_stacked_array(self, new_dim, dims, variable_dim='variable'): | ||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can also use the syntax There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While your suggestion is certainly more analogous to |
||
"""Combine variables of differing dimensionality into a DataArray | ||
without broadcasting. | ||
|
||
This method is similar to Dataset.to_array, but it does not broadcast the | ||
variables. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is a typo: 'a version' maybe reformulate to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. Changed to "This method is similar to Dataset.to_array but does not broadcast variables." |
||
|
||
Parameters | ||
---------- | ||
new_dim : str | ||
Name of the new stacked coordinate | ||
dims : Sequence[str] | ||
Dimensions to be stacked. Not all variables in the dataset need to | ||
have these dimensions. | ||
variable_dim : str, optional | ||
Name of the level in the MultiIndex object which corresponds to | ||
the variables. | ||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Returns | ||
------- | ||
stacked : DataArray | ||
|
||
See Also | ||
-------- | ||
Dataset.to_array | ||
Dataset.stack | ||
DataArray.to_unstacked_dataset | ||
|
||
Examples | ||
-------- | ||
|
||
>>> arr = DataArray(np.arange(6).reshape(2, 3), | ||
... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) | ||
>>> data = Dataset({'a': arr, 'b': arr.isel(y=0)}) | ||
>>> data | ||
|
||
<xarray.Dataset> | ||
Dimensions: (x: 2, y: 3) | ||
Coordinates: | ||
* x (x) <U1 'a' 'b' | ||
* y (y) int64 0 1 2 | ||
Data variables: | ||
a (x, y) int64 0 1 2 3 4 5 | ||
b (x) int64 0 3 | ||
>>> stacked = data.to_stacked_array("z", ['y']) | ||
>>> stacked.indexes['z'] | ||
|
||
MultiIndex(levels=[['a', 'b'], [0, 1, 2]], | ||
labels=[[0, 0, 0, 1], [0, 1, 2, -1]], | ||
names=['variable', 'y']) | ||
>>> stacked | ||
|
||
<xarray.DataArray 'a' (x: 2, z: 4)> | ||
array([[0, 1, 2, 0], | ||
[3, 4, 5, 3]]) | ||
Coordinates: | ||
* x (x) <U1 'a' 'b' | ||
* z (z) MultiIndex | ||
- variable (z) object 'a' 'a' 'a' 'b' | ||
- y (z) object 0 1 2 nan | ||
|
||
""" | ||
dims = tuple(dims) | ||
|
||
def f(val): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please give this a sensible name rather than `f`. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. I changed it to |
||
# ensure square output | ||
|
||
assign_coords = {variable_dim: val.name} | ||
for dim in dims: | ||
if (dim not in val.dims): | ||
jhamman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assign_coords[dim] = None | ||
|
||
expand_dims = set(dims).difference(set(val.dims)) | ||
expand_dims.add(variable_dim) | ||
# must be list for .expand_dims | ||
expand_dims = list(expand_dims) | ||
|
||
return val.assign_coords(**assign_coords) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: here and below, per PEP8, prefer using parentheses rather than `\` line continuations: return (val.assign_coords(**assign_coords)
.expand_dims(expand_dims)
.stack(**{new_dim: (variable_dim,) + stacking_dims})) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. done. |
||
.expand_dims(expand_dims) \ | ||
.stack(**{new_dim: (variable_dim,) + dims}) | ||
|
||
# concatenate the arrays | ||
Xs = [f(self[key]) for key in self.data_vars] | ||
dataset = xr.concat(Xs, dim=new_dim) | ||
jhamman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# coerce the levels of the MultiIndex to have the same type as the | ||
# input dimensions. This code is messy, so it might be better to just | ||
# input a dummy value for the singleton dimension. | ||
idx = dataset.indexes[new_dim] | ||
levels = [idx.levels[0]]\ | ||
+ [level.astype(self[level.name].dtype) | ||
for level in idx.levels[1:]] | ||
new_idx = idx.set_levels(levels) | ||
# patch in the new index object by assigning a fresh IndexVariable built | ||
# from the re-typed MultiIndex. Patching the existing IndexVariable in | ||
# place (dataset[new_dim].variable._data.array = new_idx) was considered | ||
# so that attrs and encoding stay the same, but is not what is done here. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this comment still accurate? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so. I think it is cleaner to declare a new |
||
dataset[new_dim] = IndexVariable(new_dim, new_idx) | ||
return dataset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: "data_array" might be a better name for this variable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed this |
||
|
||
def _unstack_once(self, dim): | ||
index = self.get_index(dim) | ||
# GH2619. For MultiIndex, we need to call remove_unused. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,14 @@ def create_test_multiindex(): | |
return Dataset({}, {'x': mindex}) | ||
|
||
|
||
def create_test_stacked_array(): | ||
x = DataArray(pd.Index(np.r_[:10], name='x')) | ||
y = DataArray(pd.Index(np.r_[:20], name='y')) | ||
a = x * y | ||
b = x * y * y | ||
return a, b | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need to change, but this would be ideal as a test fixture |
||
|
||
|
||
class InaccessibleVariableDataStore(backends.InMemoryDataStore): | ||
def __init__(self): | ||
super(InaccessibleVariableDataStore, self).__init__() | ||
|
@@ -2252,6 +2260,54 @@ def test_stack_unstack_slow(self): | |
actual = stacked.isel(z=slice(None, None, -1)).unstack('z') | ||
assert actual.identical(ds[['b']]) | ||
|
||
def test_to_stacked_array_dtype_dims(self): | ||
# make a two dimensional dataset | ||
a, b = create_test_stacked_array() | ||
D = xr.Dataset({'a': a, 'b': b}) | ||
feature_dims = ['y'] | ||
y = D.to_stacked_array('features', feature_dims) | ||
assert y.indexes['features'].levels[1].dtype == D.y.dtype | ||
assert y.dims == ('x', 'features') | ||
|
||
def test_to_stacked_array_to_unstacked_dataset(self): | ||
# make a two dimensional dataset | ||
a, b = create_test_stacked_array() | ||
D = xr.Dataset({'a': a, 'b': b}) | ||
feature_dims = ['y'] | ||
y = D.to_stacked_array('features', feature_dims)\ | ||
.transpose("x", "features") | ||
|
||
x = y.to_unstacked_dataset("features") | ||
assert_identical(D, x) | ||
|
||
# test on just one sample | ||
x0 = y[0].to_unstacked_dataset("features") | ||
d0 = D.isel(x=0) | ||
assert_identical(d0, x0) | ||
|
||
def test_to_stacked_array_to_unstacked_dataset_different_dimension(self): | ||
# test when variables have different dimensionality | ||
a, b = create_test_stacked_array() | ||
feature_dims = ['y'] | ||
D = xr.Dataset({'a': a, 'b': b.isel(y=0)}) | ||
|
||
y = D.to_stacked_array('features', feature_dims) | ||
x = y.to_unstacked_dataset('features') | ||
assert_identical(D, x) | ||
|
||
# another test | ||
ds = D.isel(x=0) | ||
ds_flat = ds.to_stacked_array('features', ['y']) | ||
ds_comp = ds_flat.to_unstacked_dataset('features') | ||
assert_identical(ds, ds_comp) | ||
|
||
def test_to_stacked_array_to_unstacked_dataset_scalar(self): | ||
a = xr.DataArray(np.r_[:6], dims=('x', ), coords={'x': np.r_[:6]}) | ||
ds = xr.Dataset({'a': a, 'b': 1.0}) | ||
ds_flat = ds.to_stacked_array('features', ['x']) | ||
ds_comp = ds_flat.to_unstacked_dataset('features') | ||
assert_identical(ds, ds_comp) | ||
|
||
def test_update(self): | ||
data = create_test_data(seed=0) | ||
expected = data.copy() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here I would mention explicitly that it does this by using a MultiIndex in the output. (Is that a correct interpretation?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.