From 216605216b1669628075a747a6b96028bf9b8051 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Tue, 28 Apr 2020 17:48:34 +0100 Subject: [PATCH] Remove "blocked" argument from train_test_split It was a bit redundant since we have to specify `shape` or `spacing` along with it. So we can just ask for `spacing` or `shape` and assume `blocked=True` if they are provided. Also fix the missing description in the gallery example and make slight changes to its wording. --- examples/train_test_split.py | 44 +++++++++++++++++---------- verde/model_selection.py | 58 ++++++++++++++++++++++-------------- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/examples/train_test_split.py b/examples/train_test_split.py index 0ec9c06e5..88942fb97 100644 --- a/examples/train_test_split.py +++ b/examples/train_test_split.py @@ -2,7 +2,26 @@ Splitting data into train and test sets ======================================= +Verde gridders are mostly linear models that are used to predict data at new +locations. As such, they are subject to *over-fitting* and we should always +strive to quantify the quality of the model predictions (see +:ref:`model_evaluation`). Common practice for +doing this is to split the data into training (the one that is used to fit the +model) and testing (the one that is used to validate the predictions) datasets. +These two datasets can be generated by splitting the data randomly (without +regard for their positions in space). This is the default behaviour of function +:func:`verde.train_test_split`, which is based on the scikit-learn function +:func:`sklearn.model_selection.train_test_split`. This can be problematic if +the data points are autocorrelated (values close to each other spatially tend +to have similar values). In these cases, splitting the data randomly can +overestimate the prediction quality [Roberts_etal2017]_. + +Alternatively, Verde allows splitting the data along *spatial blocks*. 
In this +case, the data are first grouped into blocks with a given size and then the +blocks are split randomly between training and testing sets. + +This example compares splitting our sample dataset using both methods. """ import matplotlib.pyplot as plt import cartopy.crs as ccrs @@ -24,20 +43,15 @@ # train and test are tuples = (coordinates, data, weights). print("Train and test size for random splits:", train[0][0].size, test[0][0].size) -# A better strategy for spatial data is to first assign the data to blocks and -# then split the blocks randomly. The size of the blocks is controlled by the +# A different strategy is to first assign the data to blocks and then split the +# blocks randomly. To do this, specify the size of the blocks using the # 'spacing' argument. train_block, test_block = vd.train_test_split( - coordinates, - values, - blocked=True, - spacing=0.5, - test_size=test_size, - random_state=213, + coordinates, values, spacing=10 / 60, test_size=test_size, random_state=213, ) -# Verde will attempt to balance the data between the splits so that the desired -# amount is assigned to the test set. It won't be exact since blocks contain -# different amounts of data points. +# Verde will automatically attempt to balance the data between the splits so +# that the desired amount is assigned to the test set. It won't be exact since +# blocks contain different amounts of data points. 
print( "Train and test size for block splits: ", train_block[0][0].size, test_block[0][0].size, ) @@ -59,12 +73,12 @@ vd.datasets.setup_baja_bathymetry_map(ax2) ax1.set_title("Random splitting") -ax1.plot(*train[0], ".b", markersize=1, transform=crs, label="Train") -ax1.plot(*test[0], ".r", markersize=1, transform=crs, label="Test", alpha=0.5) +ax1.plot(*train[0], ".b", markersize=2, transform=crs, label="Train") +ax1.plot(*test[0], ".r", markersize=2, transform=crs, label="Test", alpha=0.5) ax2.set_title("Blocked random splitting") -ax2.plot(*train_block[0], ".b", markersize=1, transform=crs, label="Train") -ax2.plot(*test_block[0], ".r", markersize=1, transform=crs, label="Test") +ax2.plot(*train_block[0], ".b", markersize=2, transform=crs, label="Train") +ax2.plot(*test_block[0], ".r", markersize=2, transform=crs, label="Test") ax2.legend(loc="upper right") plt.subplots_adjust(wspace=0.15, top=1, bottom=0, left=0.05, right=0.95) diff --git a/verde/model_selection.py b/verde/model_selection.py index 217cc87e8..45a0136ae 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -278,25 +278,32 @@ def get_n_splits(self, X=None, y=None, groups=None): # pylint: enable=invalid-name,unused-argument -def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): +def train_test_split( + coordinates, data, weights=None, spacing=None, shape=None, **kwargs +): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to work on single- or multi-component spatial data with optional weights. - Extra keyword arguments will be passed to the cross-validation class: - :class:`sklearn.model_selection.ShuffleSplit` (random splits) if - ``block=False`` or :class:`verde.BlockShuffleSplit` (spatially blocked - random splits) if ``block=True``. The exception is ``n_splits`` which is - always 1. 
+ If arguments *shape* or *spacing* are provided, will group the data by + spatial blocks before random splitting (using + :class:`verde.BlockShuffleSplit` instead of + :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing* + specifies the size of the spatial blocks. Alternatively, use *shape* to + specify the number of blocks in each dimension. - Using ``block=True`` is preferred over plain random splits for spatial data - to avoid overestimating validation scores. This can happen because of the - inherent autocorrelation that is usually associated with this type of data - (points that are close together are more likely to have similar values). - See [Roberts_etal2017]_ for an overview of this topic. In this case, you - **must provide** a *spacing* or *shape* argument as well (see below). + Extra keyword arguments will be passed to the cross-validation class. The + exception is ``n_splits`` which is always 1. + + Grouping by spatial blocks is preferred over plain random splits for + spatial data to avoid overestimating validation scores. This can happen + because of the inherent autocorrelation that is usually associated with + this type of data (points that are close together are more likely to have + similar values). See [Roberts_etal2017]_ for an overview of this topic. To + use spatial blocking, you **must provide** a *spacing* or *shape* argument + (see below). Parameters ---------- @@ -310,12 +317,15 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). - block : bool - If True, will use :class:`verde.BlockShuffleSplit` as a cross-validator - to first split the data into spatial blocks and then split the blocks - randomly into training and testing sets. 
When using this option, a - *spacing* or *shape* must be provided as well to specify the size (or - number) of the spatial blocks. + spacing : float, tuple = (s_north, s_east), or None + The spatial block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* must be provided in order to use + spatial blocking. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* must be provided in order to use + spatial blocking. Returns ------- @@ -394,7 +404,7 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): >>> # We must specify the size of the blocks via the spacing argument. >>> # Blocks of 1.5 will split the domain into 4 blocks. >>> train, test = train_test_split( - ... coordinates, data, random_state=0, blocked=True, spacing=1.5, + ... coordinates, data, random_state=0, spacing=1.5, ... ) >>> # The training set: >>> print("coords:", train[0][0], train[0][1], sep="\n") @@ -411,12 +421,14 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): """ args = check_fit_input(coordinates, data, weights, unpack=False) - if blocked: - feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) - shuffle = BlockShuffleSplit(n_splits=1, **kwargs).split(feature_matrix) - else: + if spacing is None and shape is None: indices = np.arange(args[1][0].size) shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) + else: + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) + shuffle = BlockShuffleSplit( + n_splits=1, spacing=spacing, shape=shape, **kwargs + ).split(feature_matrix) split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test