From 216605216b1669628075a747a6b96028bf9b8051 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Tue, 28 Apr 2020 17:48:34 +0100 Subject: [PATCH] Remove "blocked" argument from train_test_split It was a bit redundant since we have to specify `shape` or `spacing` along with it. So we can just ask for `spacing` or `shape` and assume `blocked=True` if they are provided. Also fix the missing description in the gallery example and make slight changes to its wording. --- examples/train_test_split.py | 44 +++++++++++++++++---------- verde/model_selection.py | 58 ++++++++++++++++++++++-------------- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/examples/train_test_split.py b/examples/train_test_split.py index 0ec9c06e5..88942fb97 100644 --- a/examples/train_test_split.py +++ b/examples/train_test_split.py @@ -2,7 +2,26 @@ Splitting data into train and test sets ======================================= +Verde gridders are mostly linear models that are used to predict data at new +locations. As such, they are subject to *over-fitting* and we should always +strive to quantify the quality of the model predictions (see +:ref:`model_evaluation`). Common practice for +doing this is to split the data into training (the one that is used to fit the +model) and testing (the one that is used to validate the predictions) datasets. +These two datasets can be generated by splitting the data randomly (without +regard for their positions in space). This is the default behaviour of function +:func:`verde.train_test_split`, which is based on the scikit-learn function +:func:`sklearn.model_selection.train_test_split`. This can be problematic if +the data points are autocorrelated (values close to each other spatially tend +to have similar values). In these cases, splitting the data randomly can +overestimate the prediction quality [Roberts_etal2017]_. + +Alternatively, Verde allows splitting the data along *spatial blocks*. 
In this +case, the data are first grouped into blocks with a given size and then the +blocks are split randomly between training and testing sets. + +This example compares splitting our sample dataset using both methods. """ import matplotlib.pyplot as plt import cartopy.crs as ccrs @@ -24,20 +43,15 @@ # train and test are tuples = (coordinates, data, weights). print("Train and test size for random splits:", train[0][0].size, test[0][0].size) -# A better strategy for spatial data is to first assign the data to blocks and -# then split the blocks randomly. The size of the blocks is controlled by the +# A different strategy is to first assign the data to blocks and then split the +# blocks randomly. To do this, specify the size of the blocks using the # 'spacing' argument. train_block, test_block = vd.train_test_split( - coordinates, - values, - blocked=True, - spacing=0.5, - test_size=test_size, - random_state=213, + coordinates, values, spacing=10 / 60, test_size=test_size, random_state=213, ) -# Verde will attempt to balance the data between the splits so that the desired -# amount is assigned to the test set. It won't be exact since blocks contain -# different amounts of data points. +# Verde will automatically attempt to balance the data between the splits so +# that the desired amount is assigned to the test set. It won't be exact since +# blocks contain different amounts of data points. 
print( "Train and test size for block splits: ", train_block[0][0].size, test_block[0][0].size, ) @@ -59,12 +73,12 @@ vd.datasets.setup_baja_bathymetry_map(ax2) ax1.set_title("Random splitting") -ax1.plot(*train[0], ".b", markersize=1, transform=crs, label="Train") -ax1.plot(*test[0], ".r", markersize=1, transform=crs, label="Test", alpha=0.5) +ax1.plot(*train[0], ".b", markersize=2, transform=crs, label="Train") +ax1.plot(*test[0], ".r", markersize=2, transform=crs, label="Test", alpha=0.5) ax2.set_title("Blocked random splitting") -ax2.plot(*train_block[0], ".b", markersize=1, transform=crs, label="Train") -ax2.plot(*test_block[0], ".r", markersize=1, transform=crs, label="Test") +ax2.plot(*train_block[0], ".b", markersize=2, transform=crs, label="Train") +ax2.plot(*test_block[0], ".r", markersize=2, transform=crs, label="Test") ax2.legend(loc="upper right") plt.subplots_adjust(wspace=0.15, top=1, bottom=0, left=0.05, right=0.95) diff --git a/verde/model_selection.py b/verde/model_selection.py index 217cc87e8..45a0136ae 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -278,25 +278,32 @@ def get_n_splits(self, X=None, y=None, groups=None): # pylint: enable=invalid-name,unused-argument -def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): +def train_test_split( + coordinates, data, weights=None, spacing=None, shape=None, **kwargs +): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to work on single- or multi-component spatial data with optional weights. - Extra keyword arguments will be passed to the cross-validation class: - :class:`sklearn.model_selection.ShuffleSplit` (random splits) if - ``block=False`` or :class:`verde.BlockShuffleSplit` (spatially blocked - random splits) if ``block=True``. The exception is ``n_splits`` which is - always 1. 
+ If arguments *shape* or *spacing* are provided, will group the data by + spatial blocks before random splitting (using + :class:`verde.BlockShuffleSplit` instead of + :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing* + specifies the size of the spatial blocks. Alternatively, use *shape* to + specify the number of blocks in each dimension. - Using ``block=True`` is preferred over plain random splits for spatial data - to avoid overestimating validation scores. This can happen because of the - inherent autocorrelation that is usually associated with this type of data - (points that are close together are more likely to have similar values). - See [Roberts_etal2017]_ for an overview of this topic. In this case, you - **must provide** a *spacing* or *shape* argument as well (see below). + Extra keyword arguments will be passed to the cross-validation class. The + exception is ``n_splits`` which is always 1. + + Grouping by spatial blocks is preferred over plain random splits for + spatial data to avoid overestimating validation scores. This can happen + because of the inherent autocorrelation that is usually associated with + this type of data (points that are close together are more likely to have + similar values). See [Roberts_etal2017]_ for an overview of this topic. To + use spatial blocking, you **must provide** a *spacing* or *shape* argument + (see below). Parameters ---------- @@ -310,12 +317,15 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). - block : bool - If True, will use :class:`verde.BlockShuffleSplit` as a cross-validator - to first split the data into spatial blocks and then split the blocks - randomly into training and testing sets. 
When using this option, a - *spacing* or *shape* must be provided as well to specify the size (or - number) of the spatial blocks. + spacing : float, tuple = (s_north, s_east), or None + The spatial block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* must be provided in order to use + spatial blocking. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* must be provided in order to use + spatial blocking. Returns ------- @@ -394,7 +404,7 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): >>> # We must specify the size of the blocks via the spacing argument. >>> # Blocks of 1.5 will split the domain into 4 blocks. >>> train, test = train_test_split( - ... coordinates, data, random_state=0, blocked=True, spacing=1.5, + ... coordinates, data, random_state=0, spacing=1.5, ... ) >>> # The training set: >>> print("coords:", train[0][0], train[0][1], sep="\n") @@ -411,12 +421,14 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): """ args = check_fit_input(coordinates, data, weights, unpack=False) - if blocked: - feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) - shuffle = BlockShuffleSplit(n_splits=1, **kwargs).split(feature_matrix) - else: + if spacing is None and shape is None: indices = np.arange(args[1][0].size) shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) + else: + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) + shuffle = BlockShuffleSplit( + n_splits=1, spacing=spacing, shape=shape, **kwargs + ).split(feature_matrix) split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test