pydata · shoyer · Jul 5, 2019 · Sep 27, 2017 · Apr 1, 2019 · Apr 1, 2019
diff --git a/doc/reshaping.rst b/doc/reshaping.rst
@@ -154,16 +154,26 @@ represented by a :py:class:`pandas.MultiIndex` object. These methods are used
 like this:
 
 .. ipython:: python
-
-        arr = xr.DataArray(np.arange(6).reshape(2, 3),
-                        coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
-        data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)})
-        stacked = data.to_stacked_array("z", ['y'])
+        data = xr.Dataset(
+            data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+                      'b': ('x', [6, 7])},
+            coords={'y': ['u', 'v', 'w']}
+        )
+        stacked = data.to_stacked_array("z", sample_dims=['x'])
         stacked
         unstacked = stacked.to_unstacked_dataset("z")
         unstacked
 
-In this example, ``stacked`` is a two dimensional array that we can easily pass to a scikit-learn or another generic numerical method.
+In this example, ``stacked`` is a two-dimensional array that we can easily pass to a scikit-learn routine or another
+generic numerical method.
+
+.. note::
+
+    Unlike with ``stack``, in ``to_stacked_array`` the user specifies the dimensions they **do not** want stacked.
+    For a machine learning task, these unstacked dimensions can be interpreted as the dimensions over which samples are
+    drawn, whereas the stacked coordinates are the features. Naturally, all variables should possess these sampling
+    dimensions.
+
 
 .. _reshape.set_index:
 

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -1408,8 +1408,7 @@ def unstack(self, dim=None):
         ds = self._to_temp_dataset().unstack(dim)
         return self._from_temp_dataset(ds)
 
-    def to_unstacked_dataset(self, dim, level=0,
-                             variable_dim='variable'):
+    def to_unstacked_dataset(self, dim, level=0):
         """Unstack DataArray expanding to Dataset along a given level of a
         stacked coordinate.
 
@@ -1419,8 +1418,12 @@ def to_unstacked_dataset(self, dim, level=0,
         ----------
         dim : str
             Name of existing dimension to unstack
-        level : int
-            Index of level to expand to dataset along
+        level : int or str
+            The MultiIndex level to expand to a dataset along. Can either be
+            the integer index of the level or its name.
+        label : int, optional
+            Label of the level to expand dataset along. Overrides the label
+            argument if given.
 
         Returns
         -------
@@ -1458,7 +1461,10 @@ def to_unstacked_dataset(self, dim, level=0,
         idx = self.indexes[dim]
         if not isinstance(idx, pd.MultiIndex):
             raise ValueError(dim, "is not a stacked coordinate")
-        variables = idx.levels[level]
+
+        level_number = idx._get_level_number(level)
+        variables = idx.levels[level_number]
+        variable_dim = idx.names[level_number]
 
         # pull variables out of datarray
         data_dict = OrderedDict()
@@ -1468,7 +1474,7 @@ def to_unstacked_dataset(self, dim, level=0,
         # unstacked dataset
         return Dataset(data_dict)
 
-    def transpose(self, *dims) -> 'DataArray':
+    def transpose(self, *dims, transpose_coords=None) -> 'DataArray':
         """Return a new DataArray object with transposed dimensions.
 
         Parameters

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -2650,23 +2650,27 @@ def stack(self, dimensions=None, **dimensions_kwargs):
             result = result._stack_once(dims, new_dim)
         return result
 
-    def to_stacked_array(self, new_dim, dims, variable_dim='variable'):
+    def to_stacked_array(self, new_dim, sample_dims, variable_dim='variable',
+                         name=None):
         """Combine variables of differing dimensionality into a DataArray
         without broadcasting.
 
-        This function is basically version of Dataset.to_array which does not
-        broadcast the variables.
+        This method is similar to Dataset.to_array but does not broadcast the
+        variables.
 
         Parameters
         ----------
         new_dim : str
             Name of the new stacked coordinate
-        dims : Sequence[str]
-            Dimensions to be stacked. Not all variables in the dataset need to
-            have these dimensions.
+        sample_dims : Sequence[str]
+            Dimensions that **will not** be stacked. Each array in the dataset
+            must share these dimensions. For machine learning applications,
+            these define the dimensions over which samples are drawn.
         variable_dim : str, optional
             Name of the level in the stacked coordinate which corresponds to
             the variables.
+        name : str, optional
+            Name of the new data array.
 
         Returns
         -------
@@ -2685,56 +2689,60 @@ def to_stacked_array(self, new_dim, dims, variable_dim='variable'):
 
         Examples
         --------
+        >>> data = Dataset(
+        ...     data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+        ...                'b': ('x', [6, 7])},
+        ...     coords={'y': ['u', 'v', 'w']}
+        ... )
 
-        >>> arr = DataArray(np.arange(6).reshape(2, 3),
-        ...                 coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
-        >>> data = Dataset({'a': arr, 'b': arr.isel(y=0)})
         >>> data
-
         <xarray.Dataset>
         Dimensions:  (x: 2, y: 3)
         Coordinates:
-        * x        (x) <U1 'a' 'b'
-        * y        (y) int64 0 1 2
+        * y        (y) <U1 'u' 'v' 'w'
+        Dimensions without coordinates: x
         Data variables:
             a        (x, y) int64 0 1 2 3 4 5
-            b        (x) int64 0 3
-        >>> stacked = data.to_stacked_array("z", ['y'])
-        >>> stacked.indexes['z']
-
-        MultiIndex(levels=[['a', 'b'], [0, 1, 2]],
-                labels=[[0, 0, 0, 1], [0, 1, 2, -1]],
-                names=['variable', 'y'])
-        >>> stacked
-
-        <xarray.DataArray 'a' (x: 2, z: 4)>
-        array([[0, 1, 2, 0],
-            [3, 4, 5, 3]])
+            b        (x) int64 6 7
+
+        >>> data.to_stacked_array("z", ['x'])
+        <xarray.DataArray (x: 2, z: 4)>
+        array([[0, 1, 2, 6],
+            [3, 4, 5, 7]])
         Coordinates:
-        * x         (x) <U1 'a' 'b'
         * z         (z) MultiIndex
         - variable  (z) object 'a' 'a' 'a' 'b'
-        - y         (z) object 0 1 2 nan
+        - y         (z) object 'u' 'v' 'w' nan
+        Dimensions without coordinates: x
 
         """
-        dims = tuple(dims)
+        stacking_dims = tuple(dim for dim in self.dims
+                              if dim not in sample_dims)
+
+        for variable in self:
+            dims = self[variable].dims
+            dims_include_sample_dims = set(sample_dims) <= set(dims)
+            if not dims_include_sample_dims:
+                raise ValueError(
+                    "All DataArrays must share the dims: {}. ".format(dims)
+                )
 
         def f(val):
             # ensure square output
 
             assign_coords = {variable_dim: val.name}
-            for dim in dims:
+            for dim in stacking_dims:
                 if (dim not in val.dims):
                     assign_coords[dim] = None
 
-            expand_dims = set(dims).difference(set(val.dims))
+            expand_dims = set(stacking_dims).difference(set(val.dims))
             expand_dims.add(variable_dim)
             # must be list for .expand_dims
             expand_dims = list(expand_dims)
 
             return val.assign_coords(**assign_coords) \
                 .expand_dims(expand_dims) \
-                .stack(**{new_dim: (variable_dim,) + dims})
+                .stack(**{new_dim: (variable_dim,) + stacking_dims})
 
         # concatenate the arrays
         Xs = [f(self[key]) for key in self.data_vars]
@@ -2749,6 +2757,10 @@ def f(val):
                for level in idx.levels[1:]]
         new_idx = idx.set_levels(levels)
         dataset[new_dim] = IndexVariable(new_dim, new_idx)
+
+        if name is not None:
+            dataset.name = name
+
         return dataset
 
     def _unstack_once(self, dim):

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -2405,21 +2405,41 @@ def test_stack_unstack_slow(self):
         actual = stacked.isel(z=slice(None, None, -1)).unstack('z')
         assert actual.identical(ds[['b']])
 
+    def test_to_stacked_array_invalid_sample_dims(self):
+        data = xr.Dataset(
+            data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+                       'b': ('x', [6, 7])},
+            coords={'y': ['u', 'v', 'w']}
+        )
+        with pytest.raises(ValueError):
+            data.to_stacked_array("features", sample_dims=['y'])
+
+    def test_to_stacked_array_name(self):
+        name = 'adf9d'
+
+        # make a two dimensional dataset
+        a, b = create_test_stacked_array()
+        D = xr.Dataset({'a': a, 'b': b})
+        sample_dims = ['x']
+
+        y = D.to_stacked_array('features', sample_dims, name=name)
+        assert y.name == name
+
     def test_to_stacked_array_dtype_dims(self):
         # make a two dimensional dataset
         a, b = create_test_stacked_array()
         D = xr.Dataset({'a': a, 'b': b})
-        feature_dims = ['y']
-        y = D.to_stacked_array('features', feature_dims)
+        sample_dims = ['x']
+        y = D.to_stacked_array('features', sample_dims)
         assert y.indexes['features'].levels[1].dtype == D.y.dtype
         assert y.dims == ('x', 'features')
 
     def test_to_stacked_array_to_unstacked_dataset(self):
         # make a two dimensional dataset
         a, b = create_test_stacked_array()
         D = xr.Dataset({'a': a, 'b': b})
-        feature_dims = ['y']
-        y = D.to_stacked_array('features', feature_dims)\
+        sample_dims = ['x']
+        y = D.to_stacked_array('features', sample_dims)\
             .transpose("x", "features")
 
         x = y.to_unstacked_dataset("features")
@@ -2433,26 +2453,13 @@ def test_to_stacked_array_to_unstacked_dataset(self):
     def test_to_stacked_array_to_unstacked_dataset_different_dimension(self):
         # test when variables have different dimensionality
         a, b = create_test_stacked_array()
-        feature_dims = ['y']
+        sample_dims = ['x']
         D = xr.Dataset({'a': a, 'b': b.isel(y=0)})
 
-        y = D.to_stacked_array('features', feature_dims)
+        y = D.to_stacked_array('features', sample_dims)
         x = y.to_unstacked_dataset('features')
         assert_identical(D, x)
 
-        # another test
-        ds = D.isel(x=0)
-        ds_flat = ds.to_stacked_array('features', ['y'])
-        ds_comp = ds_flat.to_unstacked_dataset('features')
-        assert_identical(ds, ds_comp)
-
-    def test_to_stacked_array_to_unstacked_dataset_scalar(self):
-        a = xr.DataArray(np.r_[:6], dims=('x', ), coords={'x': np.r_[:6]})
-        ds = xr.Dataset({'a': a, 'b': 1.0})
-        ds_flat = ds.to_stacked_array('features', ['x'])
-        ds_comp = ds_flat.to_unstacked_dataset('features')
-        assert_identical(ds, ds_comp)
-
     def test_update(self):
         data = create_test_data(seed=0)
         expected = data.copy()