pydata · shoyer · Jul 5, 2019 · Sep 27, 2017 · Apr 1, 2019 · Apr 1, 2019
diff --git a/doc/api.rst b/doc/api.rst
@@ -199,6 +199,7 @@ Reshaping and reorganizing
    Dataset.transpose
    Dataset.stack
    Dataset.unstack
+   Dataset.to_stacked_array
    Dataset.shift
    Dataset.roll
    Dataset.sortby
@@ -370,6 +371,7 @@ Reshaping and reorganizing
    DataArray.transpose
    DataArray.stack
    DataArray.unstack
+   DataArray.to_unstacked_dataset
    DataArray.shift
    DataArray.roll
    DataArray.sortby

diff --git a/doc/reshaping.rst b/doc/reshaping.rst
@@ -133,6 +133,36 @@ pandas, it does not automatically drop missing values. Compare:
 We departed from pandas's behavior here because predictable shapes for new
 array dimensions is necessary for :ref:`dask`.
 
+Stacking different variables together
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These stacking and unstacking operations are particularly useful for reshaping
+xarray objects for use in machine learning packages, such as `scikit-learn
+<http://scikit-learn.org/stable/>`_, that usually require two-dimensional numpy
+arrays as inputs. For datasets with only one variable, we only need ``stack``
+and ``unstack``, but combining multiple variables in a
+:py:class:`xarray.Dataset` is more complicated. If the variables in the dataset
+have matching numbers of dimensions, we can call
+:py:meth:`~xarray.Dataset.to_array` and then stack along the new coordinate.
+But :py:meth:`~xarray.Dataset.to_array` will broadcast the dataarrays together,
+which will effectively tile the lower dimensional variable along the missing
+dimensions. The method :py:meth:`xarray.Dataset.to_stacked_array` allows
+combining variables of differing dimensions without this wasteful copying while
+:py:meth:`xarray.DataArray.to_unstacked_dataset` reverses this operation. These
+methods are used like this:
+
+.. ipython:: python
+
+        arr = xr.DataArray(np.arange(6).reshape(2, 3),
+                        coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
+        data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)})
+        stacked = data.to_stacked_array("z", ['y'])
+        stacked
+        unstacked = stacked.to_unstacked_dataset("z")
+        unstacked
+
+In this example, ``stacked`` is a two-dimensional array that we can easily pass to scikit-learn or another generic numerical method.
+
 .. _reshape.set_index:
 
 Set and reset index

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -24,6 +24,8 @@ Enhancements
 - Allow ``expand_dims`` method to support inserting/broadcasting dimensions
   with size > 1. (:issue:`2710`)
   By `Martin Pletcher <https://github.com/pletchm>`_.
+- New methods :py:meth:`Dataset.to_stacked_array` and
+  :py:meth:`DataArray.to_unstacked_dataset` for reshaping Datasets of variables
+  with different dimensions (:issue:`1317`).
+  By `Noah Brenowitz <https://github.com/nbren12>`_.
 
 
 Bug fixes

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -1402,6 +1402,66 @@ def unstack(self, dim=None):
         ds = self._to_temp_dataset().unstack(dim)
         return self._from_temp_dataset(ds)
 
+    def to_unstacked_dataset(self, dim, level=0,
+                             variable_dim='variable'):
+        """Unstack DataArray expanding to Dataset along a given level of a
+        stacked coordinate.
+
+        This is the inverse operation of Dataset.to_stacked_array.
+
+        Parameters
+        ----------
+        dim : str
+            Name of existing dimension to unstack
+        level : int
+            Index of the MultiIndex level whose values become the names of
+            the variables in the returned Dataset
+        variable_dim : str, optional
+            Name of the MultiIndex level used to select each variable out of
+            this array. It should name the same level that ``level`` refers
+            to.
+
+        Returns
+        -------
+        unstacked: Dataset
+
+        Raises
+        ------
+        ValueError
+            If the index of ``dim`` is not a pandas.MultiIndex.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import xarray as xr
+        >>> arr = xr.DataArray(np.arange(6).reshape(2, 3),
+        ...                    coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
+        >>> data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)})
+        >>> data
+        <xarray.Dataset>
+        Dimensions:  (x: 2, y: 3)
+        Coordinates:
+          * x        (x) <U1 'a' 'b'
+          * y        (y) int64 0 1 2
+        Data variables:
+            a        (x, y) int64 0 1 2 3 4 5
+            b        (x) int64 0 3
+        >>> stacked = data.to_stacked_array("z", ['y'])
+        >>> stacked.indexes['z']
+        MultiIndex(levels=[['a', 'b'], [0, 1, 2]],
+                labels=[[0, 0, 0, 1], [0, 1, 2, -1]],
+                names=['variable', 'y'])
+        >>> roundtripped = stacked.to_unstacked_dataset(dim='z')
+        >>> data.identical(roundtripped)
+        True
+
+        See Also
+        --------
+        Dataset.to_stacked_array
+        """
+
+        idx = self.indexes[dim]
+        if not isinstance(idx, pd.MultiIndex):
+            # a single formatted string, so the message does not render as
+            # a tuple of exception arguments
+            raise ValueError(
+                "'{}' is not a stacked coordinate".format(dim))
+        variables = idx.levels[level]
+
+        # pull variables out of the data array; a dict indexer (rather than
+        # keyword expansion) avoids clashes with other keyword arguments
+        data_dict = OrderedDict()
+        for k in variables:
+            data_dict[k] = self.sel({variable_dim: k}).squeeze(drop=True)
+
+        # unstacked dataset
+        return Dataset(data_dict)
+
     def transpose(self, *dims):
         """Return a new DataArray object with transposed dimensions.
 

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -2627,6 +2627,107 @@ def stack(self, dimensions=None, **dimensions_kwargs):
             result = result._stack_once(dims, new_dim)
         return result
 
+    def to_stacked_array(self, new_dim, dims, variable_dim='variable'):
+        """Combine variables of differing dimensionality into a DataArray
+        without broadcasting.
+
+        This method is similar to Dataset.to_array, but it does not
+        broadcast the variables against each other.
+
+        Parameters
+        ----------
+        new_dim : str
+            Name of the new stacked coordinate
+        dims : Sequence[str]
+            Dimensions to be stacked. Not all variables in the dataset need to
+            have these dimensions.
+        variable_dim : str, optional
+            Name of the level in the MultiIndex object which corresponds to
+            the variables.
+
+        Returns
+        -------
+        stacked : DataArray
+
+        See Also
+        --------
+        Dataset.to_array
+        Dataset.stack
+        DataArray.to_unstacked_dataset
+
+        Examples
+        --------
+        >>> arr = DataArray(np.arange(6).reshape(2, 3),
+        ...                 coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
+        >>> data = Dataset({'a': arr, 'b': arr.isel(y=0)})
+        >>> data
+        <xarray.Dataset>
+        Dimensions:  (x: 2, y: 3)
+        Coordinates:
+          * x        (x) <U1 'a' 'b'
+          * y        (y) int64 0 1 2
+        Data variables:
+            a        (x, y) int64 0 1 2 3 4 5
+            b        (x) int64 0 3
+        >>> stacked = data.to_stacked_array("z", ['y'])
+        >>> stacked.indexes['z']
+        MultiIndex(levels=[['a', 'b'], [0, 1, 2]],
+                labels=[[0, 0, 0, 1], [0, 1, 2, -1]],
+                names=['variable', 'y'])
+        >>> stacked
+        <xarray.DataArray 'a' (x: 2, z: 4)>
+        array([[0, 1, 2, 0],
+               [3, 4, 5, 3]])
+        Coordinates:
+          * x         (x) <U1 'a' 'b'
+          * z         (z) MultiIndex
+          - variable  (z) object 'a' 'a' 'a' 'b'
+          - y         (z) object 0 1 2 nan
+
+        """
+        dims = tuple(dims)
+
+        def ensure_stackable(val):
+            # Give every variable the full set of stacked dimensions (plus
+            # the variable level) so that each one can be stacked the same way.
+            coords = {variable_dim: val.name}
+            coords.update(
+                {dim: None for dim in dims if dim not in val.dims})
+
+            missing = set(dims).difference(set(val.dims))
+            missing.add(variable_dim)
+
+            labelled = val.assign_coords(**coords)
+            # .expand_dims needs a list, not a set
+            expanded = labelled.expand_dims(list(missing))
+            return expanded.stack(**{new_dim: (variable_dim,) + dims})
+
+        # concatenate the stacked variables along the new dimension
+        stackable_vars = [ensure_stackable(self[key])
+                          for key in self.data_vars]
+        data_array = xr.concat(stackable_vars, dim=new_dim)
+
+        # Coerce the non-variable levels of the MultiIndex to the dtype of
+        # the corresponding input dimensions. Inserting a dummy value for the
+        # singleton dimension might be a cleaner alternative.
+        idx = data_array.indexes[new_dim]
+        levels = [idx.levels[0]]
+        levels.extend(lvl.astype(self[lvl.name].dtype)
+                      for lvl in idx.levels[1:])
+        new_idx = idx.set_levels(levels)
+
+        # replace the stacked coordinate with one built from the coerced index
+        data_array[new_dim] = IndexVariable(new_dim, new_idx)
+        return data_array
+
     def _unstack_once(self, dim):
         index = self.get_index(dim)
         # GH2619. For MultiIndex, we need to call remove_unused.

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -58,6 +58,14 @@ def create_test_multiindex():
     return Dataset({}, {'x': mindex})
 
 
+def create_test_stacked_array():
+    """Return the arrays ``x * y`` and ``x * y * y``, where ``x`` and ``y``
+    wrap integer indexes of length 10 and 20 named 'x' and 'y'."""
+    x_index = DataArray(pd.Index(np.r_[:10], name='x'))
+    y_index = DataArray(pd.Index(np.r_[:20], name='y'))
+    product = x_index * y_index
+    return product, product * y_index
+
+
 class InaccessibleVariableDataStore(backends.InMemoryDataStore):
     def __init__(self):
         super(InaccessibleVariableDataStore, self).__init__()
@@ -2252,6 +2260,54 @@ def test_stack_unstack_slow(self):
         actual = stacked.isel(z=slice(None, None, -1)).unstack('z')
         assert actual.identical(ds[['b']])
 
+    def test_to_stacked_array_dtype_dims(self):
+        """The stacked level keeps the dtype of ``y`` and the new dimension
+        comes after ``x``."""
+        a, b = create_test_stacked_array()
+        ds = xr.Dataset({'a': a, 'b': b})
+        stacked = ds.to_stacked_array('features', ['y'])
+        stacked_level = stacked.indexes['features'].levels[1]
+        assert stacked_level.dtype == ds['y'].dtype
+        assert stacked.dims == ('x', 'features')
+
+    def test_to_stacked_array_to_unstacked_dataset(self):
+        """Stacking then unstacking returns the original dataset, both for
+        the full array and for a single sample along ``x``."""
+        a, b = create_test_stacked_array()
+        expected = xr.Dataset({'a': a, 'b': b})
+        stacked = expected.to_stacked_array('features', ['y'])
+        stacked = stacked.transpose("x", "features")
+
+        roundtripped = stacked.to_unstacked_dataset("features")
+        assert_identical(expected, roundtripped)
+
+        # repeat the round trip on just the first sample
+        first_roundtripped = stacked[0].to_unstacked_dataset("features")
+        first_expected = expected.isel(x=0)
+        assert_identical(first_expected, first_roundtripped)
+
+    def test_to_stacked_array_to_unstacked_dataset_different_dimension(self):
+        """Round trip when the variables do not share the same dimensions."""
+        a, b = create_test_stacked_array()
+        expected = xr.Dataset({'a': a, 'b': b.isel(y=0)})
+
+        stacked = expected.to_stacked_array('features', ['y'])
+        roundtripped = stacked.to_unstacked_dataset('features')
+        assert_identical(expected, roundtripped)
+
+        # same round trip after selecting a single sample along x
+        sample = expected.isel(x=0)
+        sample_flat = sample.to_stacked_array('features', ['y'])
+        sample_roundtripped = sample_flat.to_unstacked_dataset('features')
+        assert_identical(sample, sample_roundtripped)
+
+    def test_to_stacked_array_to_unstacked_dataset_scalar(self):
+        """Round trip for a dataset that mixes a 1-D variable and a scalar."""
+        a = xr.DataArray(np.r_[:6], dims=('x', ), coords={'x': np.r_[:6]})
+        expected = xr.Dataset({'a': a, 'b': 1.0})
+        flattened = expected.to_stacked_array('features', ['x'])
+        roundtripped = flattened.to_unstacked_dataset('features')
+        assert_identical(expected, roundtripped)
+
     def test_update(self):
         data = create_test_data(seed=0)
         expected = data.copy()