From 0c314c68f67a0f5ddd1292988dbbb87ac24283cc Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 27 Feb 2020 11:23:31 +0000 Subject: [PATCH 01/38] Bug fix in grid_to_table for wrong order of Dataset coords (#229) We were relying on the order of the coords attribute of the data set to meshgrid the coordinates. This is wrong because xarray takes the coordinate order from dims instead, which is what we should also do. Took this chance to refactor the code and included a test that would have caught the bug. --- verde/tests/test_utils.py | 25 ++++++++++++++++++++++++- verde/utils.py | 33 ++++++++++++++++----------------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/verde/tests/test_utils.py b/verde/tests/test_utils.py index 0f8f8722e..0d6119b99 100644 --- a/verde/tests/test_utils.py +++ b/verde/tests/test_utils.py @@ -5,11 +5,12 @@ import numpy as np import numpy.testing as npt +import xarray as xr from scipy.spatial import cKDTree # pylint: disable=no-name-in-module import pytest from ..coordinates import grid_coordinates -from ..utils import parse_engine, dummy_jit, kdtree +from ..utils import parse_engine, dummy_jit, kdtree, grid_to_table from .. import utils @@ -52,3 +53,25 @@ def test_kdtree(): npt.assert_allclose(dist, 0.1) if not use_pykdtree: assert isinstance(tree, cKDTree) + + +def test_grid_to_table_order(): + "Check that coordinates are in the right order when converting to tables" + lon, lat = grid_coordinates(region=(1, 10, -10, -1), shape=(3, 4)) + data = lon ** 2 + # If the DataArray is created with coords in an order that doesn't match + # the dims (which is valid), we were getting it wrong because we were + # relying on the order of the coords instead of dims. This test would have + # caught that bug. + grid = xr.DataArray( + data=data, + coords={"longitude": lon[0, :], "latitude": lat[:, 0]}, + dims=("latitude", "longitude"), + ).to_dataset(name="field") + table = grid_to_table(grid) + true_lat = [-10, -10, -10, -10, -5.5, -5.5, -5.5, -5.5, -1, -1, -1, -1] + true_lon = [1, 4, 7, 10, 1, 4, 7, 10, 1, 4, 7, 10] + true_field = [1, 16, 49, 100, 1, 16, 49, 100, 1, 16, 49, 100] + npt.assert_allclose(true_lat, table.latitude) + npt.assert_allclose(true_lon, table.longitude) + npt.assert_allclose(true_field, table.field) diff --git a/verde/utils.py b/verde/utils.py index 42ee7359e..bb24c7ac9 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -237,6 +237,11 @@ def grid_to_table(grid): ... coords=(np.arange(4), np.arange(5, 10)), ... dims=['northing', 'easting'] ... 
) + >>> print(temperature.values) + [[ 0 1 2 3 4] + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19]] >>> grid = xr.Dataset({"temperature": temperature}) >>> table = grid_to_table(grid) >>> list(sorted(table.columns)) @@ -267,23 +272,17 @@ def grid_to_table(grid): [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] """ - coordinate_names = [*grid.coords.keys()] - coord_north = grid.coords[coordinate_names[0]].values - coord_east = grid.coords[coordinate_names[1]].values - coordinates = [i.ravel() for i in np.meshgrid(coord_east, coord_north)] - coord_dict = { - coordinate_names[0]: coordinates[1], - coordinate_names[1]: coordinates[0], - } - variable_name = [*grid.data_vars.keys()] - variable_data = grid.to_array().values - variable_arrays = variable_data.reshape( - len(variable_name), int(len(variable_data.ravel()) / len(variable_name)) - ) - var_dict = dict(zip(variable_name, variable_arrays)) - coord_dict.update(var_dict) - data = pd.DataFrame(coord_dict) - return data + data_names = list(grid.data_vars.keys()) + data_arrays = [grid[name].values.ravel() for name in data_names] + coordinate_names = list(grid[data_names[0]].dims) + north = grid.coords[coordinate_names[0]].values + east = grid.coords[coordinate_names[1]].values + # Need to flip the coordinates because the names are in northing and + # easting order + coordinates = [i.ravel() for i in np.meshgrid(east, north)][::-1] + data_dict = dict(zip(coordinate_names, coordinates)) + data_dict.update(dict(zip(data_names, data_arrays))) + return pd.DataFrame(data_dict) def kdtree(coordinates, use_pykdtree=True, **kwargs): From a939d8213d3e8603ad41495902d2b991e6b8d02e Mon Sep 17 00:00:00 2001 From: Fatiando a Terra Bot <50936856+fatiando-bot@users.noreply.github.com> Date: Thu, 27 Feb 2020 14:08:45 +0000 Subject: [PATCH 02/38] Update files from fatiando/contributing (#233) Changes have been made in https://github.com/fatiando/contributing to: MAINTENANCE.md Update the copies in this repository to match. See https://github.com/fatiando/contributing/commit/91699a0f86a96da32e98cf5abed785d9ba1bd379 --- MAINTENANCE.md | 128 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 31 deletions(-) diff --git a/MAINTENANCE.md b/MAINTENANCE.md index 810fc862c..8a8ba6b83 100644 --- a/MAINTENANCE.md +++ b/MAINTENANCE.md @@ -1,12 +1,27 @@ # Maintainers Guide -This page contains instructions for project maintainers about how our setup works, -making releases, creating packages, etc. +This page contains instructions for project maintainers about how our setup +works, making releases, creating packages, etc. If you want to make a contribution to the project, see the [Contributing Guide](CONTRIBUTING.md) instead. +## Contents + +* [Branches](#branches) +* [Reviewing and merging pull requests](#reviewing-and-merging-pull-requests) +* [Continuous Integration](#continuous-integration) +* [Citations](#citations) +* [Making a Release](#making-a-release) + * [Draft a new Zenodo release](#draft-a-new-zenodo-release) + * [Update the changelog](#update-the-changelog) + * [Check the README syntax](#check-the-readme-syntax) + * [Release](#release) + * [Archive on Zenodo](#archive-on-zenodo) + * [Update the conda package](#update-the-conda-package) + + ## Branches * *master*: Always tested and ready to become a new version. Don't push directly to this @@ -62,6 +77,43 @@ If you find any problems with the test setup and deployment, please create issue submit pull requests to that repository. 
+## Citations
+
+The citation for a package that doesn't have an associated paper will be the
+Zenodo DOI for all versions. This citation will include everyone who has
+contributed to the project and met our [authorship criteria](AUTHORSHIP.md).
+
+Include the following text in the `CITATION.rst` file:
+
+```
+This is research software **made by scientists**. Citations help us justify the
+effort that goes into building and maintaining this project.
+
+If you used this software in your research, please consider
+citing the following source: https://doi.org/10.5281/zenodo.3530749
+
+The link above includes full citation information and export formats (BibTeX,
+Mendeley, etc).
+```
+
+If the project has been published as an academic paper (for example, on
+[JOSS](https://joss.theoj.org)), **update the `CITATION.rst` to point to the
+paper instead of the Zenodo archive**.
+
+```
+If you used this software in your research, please consider citing the
+following publication:
+
+    <full citation>
+
+This is an open-access publication. The paper and the associated reviews can be
+freely accessed at: https://doi.org/<DOI>
+
+The link above includes full citation information and export formats (BibTeX,
+Mendeley, etc).
+```
+
+
 ## Making a Release
 
 We try to automate the release process as much as possible.
@@ -70,7 +122,7 @@ The version number is set automatically using versioneer based information it ge
 git.
 There are a few steps that still must be done manually, though.
 
-### Drafting a new Zenodo release
+### Draft a new Zenodo release
 
 If the project already has releases on [Zenodo](https://zenodo.org/), you need to create
 a **New version** of it. Find the link to the latest Zenodo release on the `README.md`
@@ -79,6 +131,7 @@ file of your project. Then:
 
 1. Delete all existing files (they will be replaced with the new version).
 2. Reserve a DOI and save the release draft.
 3. Add as authors any new contributors who have added themselves to `AUTHORS.md`.
+4. Update the release date.
 
 On the other hand, if you're making the first release of the project, you need to create
 a **New upload** for it inside the
@@ -86,7 +139,7 @@ a **New upload** for it inside the
 Make sure the Fatiando a Terra community is chosen when filling the release draft.
 The rest of the process is the same as above.
 
-### Updating the changelog
+### Update the changelog
 
 1. Generate a list of commits between the last release tag and now:
 
@@ -126,39 +179,52 @@ python setup.py --long-description | rst2html.py --no-raw > index.html
 
 Open `index.html` and check for any flaws or error messages.
 
-### Pushing to PyPI and updating the documentation
+### Release
 
-After the changelog is updated, making a release should be as simple as creating a new
-git tag and pushing it to Github:
-
-```bash
-git tag v0.2.0
-git push --tags
-```
+After the changelog is updated, making a release should be as simple as
+creating a new git tag.
+The continuous integration services will take care of pushing the package to
+PyPI and creating a new version of the documentation.
+A new folder with version number containing the HTML documentation will be
+pushed to *gh-pages*, and the `latest` link will be updated to point to this
+new folder.
 
 The tag should be version number (following [Semantic Versioning](https://semver.org/))
-with a leading `v`.
-This should trigger Travis to do all the work for us.
-A new source distribution will be uploaded to PyPI, a new folder with the documentation -HTML will be pushed to *gh-pages*, and the `latest` link will be updated to point to -this new folder. - -### Archiving on Zenodo +with a leading `v` (`v1.5.7`). + +To create a new tag, go to `https://github.com/fatiando/PROJECT/releases` and +click on "Draft a new release": + +1. Use the version number (including the `v`) as the "Tag version" and "Release + title". +2. The first line of the release description should be the DOI of the release: + ``` + DOI: https://doi.org/ + ``` +3. Fill the release description with a Markdown version of the **latest** + changelog entry. The `doc/changes.rst` file can be converted to Markdown + using `pandoc`: + ``` + pandoc -s doc/changes.rst -o changes.md + ``` +4. Publish the release. + +### Archive on Zenodo Grab a zip file from the Github release and upload to Zenodo using the previously reserved DOI. -### Updating the conda package +### Update the conda package -After Travis is done building the tag and all builds pass, we need to update the conda -package. -Unfortunately, this needs to be done manually for now. +After Travis is done building the tag and all builds pass, we need to update +the conda package. +Fortunately, the conda-forge bot will submit a PR updating the package for us +(it may take a couple of hours to do so). +Most releases can be merged right away but some might need further changes to +the conda recipe: -1. Fork the feedstock repository (https://github.com/conda-forge/PROJECT-feedstock) if - you haven't already. If you have a fork, update it. -2. Update the version number and sha256 hash on `recipe/meta.yaml`. You can get the hash - from the PyPI "Download files" section. -3. Add or remove any new dependencies (most are probably only `run` dependencies). -4. Make a new branch, commit, and push your changes **to your fork**. -5. Create a PR against the original feedstock master. -6. Once the CIs are passing, merge or as a maintainer to do so. +1. If the release added new dependencies, make sure they are included in the + `meta.yaml` file. +2. If dropping/adding support for Python versions (or version of other + packages) make sure the correct version restrictions are applied in + `meta.yaml`. From a878b0b60634acb20df58c40d12b8e06a7a274b1 Mon Sep 17 00:00:00 2001 From: Jesse Pisel Date: Thu, 27 Feb 2020 11:01:28 -0600 Subject: [PATCH 03/38] Use newer Mac on Azure Pipelines (#234) Changed from 10.13 (High Sierra) to 10.15 (Catalina) because Azure will remove the 10.13 VMs soon. --- .azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 0592ed71d..5429b68b3 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -71,7 +71,7 @@ jobs: displayName: 'Mac' pool: - vmImage: 'macOS-10.13' + vmImage: 'macOS-10.15' variables: CONDA_REQUIREMENTS: requirements.txt From 1725ef91f500f6df094d8cf8395a09a8a94107f3 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Fri, 6 Mar 2020 11:27:02 +0000 Subject: [PATCH 04/38] Allow grid_to_table to take DataArrays (#235) Previously would fail because we rely on getting the data variable name from the Dataset. Need to check if it's a Dataset (has the `data_vars` attribute) and set the data column name to "scalars" if it's not. 
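For illustration, a minimal sketch of the resulting behaviour (this assumes a verde
build that already includes the change; the example grid itself is made up):

```python
import numpy as np
import xarray as xr
import verde as vd

temperature = xr.DataArray(
    np.arange(20).reshape((4, 5)),
    coords=(np.arange(4), np.arange(5, 10)),
    dims=["northing", "easting"],
)
# A bare DataArray with no name falls back to a generic "scalars" data column
print(vd.grid_to_table(temperature).columns.tolist())
# Wrapping it in a Dataset keeps the data variable name instead
print(vd.grid_to_table(temperature.to_dataset(name="temperature")).columns.tolist())
```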
Fixes #225 --- verde/utils.py | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/verde/utils.py b/verde/utils.py index bb24c7ac9..aaaca9b22 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -218,13 +218,16 @@ def grid_to_table(grid): Parameters ---------- - grid : :class:`xarray.Dataset` + grid : :class:`xarray.Dataset` or :class:`xarray.DataArray` A 2D grid with one or more data variables. Returns ------- table : :class:`pandas.DataFrame` Table with coordinates and variable values for each point in the grid. + Column names are taken from the grid. If *grid* is a + :class:`xarray.DataArray` that doesn't have a ``name`` attribute + defined, the column with data values will be called ``"scalars"``. Examples -------- @@ -242,16 +245,32 @@ def grid_to_table(grid): [ 5 6 7 8 9] [10 11 12 13 14] [15 16 17 18 19]] + >>> # For DataArrays, the data column will be "scalars" by default + >>> table = grid_to_table(temperature) + >>> list(sorted(table.columns)) + ['easting', 'northing', 'scalars'] + >>> print(table.scalars.values) + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] + >>> print(table.northing.values) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3] + >>> print(table.easting.values) + [5 6 7 8 9 5 6 7 8 9 5 6 7 8 9 5 6 7 8 9] + >>> # If the DataArray defines a "name", we will use that instead + >>> temperature.name = "temperature_K" + >>> table = grid_to_table(temperature) + >>> list(sorted(table.columns)) + ['easting', 'northing', 'temperature_K'] + >>> # Conversion of Datasets will preserve the data variable names >>> grid = xr.Dataset({"temperature": temperature}) >>> table = grid_to_table(grid) >>> list(sorted(table.columns)) ['easting', 'northing', 'temperature'] + >>> print(table.temperature.values) + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] >>> print(table.northing.values) [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3] >>> print(table.easting.values) [5 6 7 8 9 5 6 7 8 9 5 6 7 8 9 5 6 7 8 9] - >>> print(table.temperature.values) - [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] >>> # Grids with multiple data variables will have more columns. >>> wind_speed = xr.DataArray( ... np.arange(20, 40).reshape((4, 5)), @@ -272,9 +291,16 @@ def grid_to_table(grid): [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] """ - data_names = list(grid.data_vars.keys()) - data_arrays = [grid[name].values.ravel() for name in data_names] - coordinate_names = list(grid[data_names[0]].dims) + if hasattr(grid, "data_vars"): + # It's a Dataset + data_names = list(grid.data_vars.keys()) + data_arrays = [grid[name].values.ravel() for name in data_names] + coordinate_names = list(grid[data_names[0]].dims) + else: + # It's a DataArray + data_names = [grid.name if grid.name is not None else "scalars"] + data_arrays = [grid.values.ravel()] + coordinate_names = list(grid.dims) north = grid.coords[coordinate_names[0]].values east = grid.coords[coordinate_names[1]].values # Need to flip the coordinates because the names are in northing and From 9238d8da2f1bce4942ae19d0c153ac95af94eede Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Fri, 13 Mar 2020 08:05:23 +0000 Subject: [PATCH 05/38] Add function for rolling windows on irregular data (#236) The `rolling_window` function returns the indices of points falling on rolling/moving windows. It works for irregularly sampled data and is a nice complement to `xarray.DataArray.rolling` (which only works on grids). 
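As a rough usage sketch on scattered (non-gridded) data (assuming a verde build that
includes this change; the region, window size, and spacing below are arbitrary):

```python
import verde as vd

# Random scattered points instead of a regular grid
coordinates = vd.scatter_points(region=(0, 10, -10, 0), size=500, random_state=0)
# 5-unit windows whose centers are spaced 2 units apart (so the windows overlap)
window_centers, indices = vd.rolling_window(coordinates, size=5, spacing=2)
# There is one entry of `indices` per window; use it to slice the original points
points_in_first_window = coordinates[0][indices[0, 0]]
print(window_centers[0].shape, points_in_first_window.size)
```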
Uses a KDTree to select points but is not compatible with pykdtree (it's missing the method we need). --- doc/api/index.rst | 1 + verde/__init__.py | 1 + verde/coordinates.py | 236 +++++++++++++++++++++++++++++++- verde/tests/test_coordinates.py | 44 ++++++ 4 files changed, 276 insertions(+), 6 deletions(-) diff --git a/doc/api/index.rst b/doc/api/index.rst index 4f1e0708f..384d9750b 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -60,6 +60,7 @@ Coordinate Manipulation project_region inside block_split + rolling_window Utilities --------- diff --git a/verde/__init__.py b/verde/__init__.py index d1fd2ee62..7f565917f 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -7,6 +7,7 @@ grid_coordinates, inside, block_split, + rolling_window, profile_coordinates, get_region, pad_region, diff --git a/verde/coordinates.py b/verde/coordinates.py index b7e912d57..02532ddea 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -1,6 +1,8 @@ """ Functions for generating and manipulating coordinates. """ +import warnings + import numpy as np from sklearn.utils import check_random_state @@ -735,6 +737,7 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= See also -------- BlockReduce : Apply a reduction operation to the data in blocks (windows). + rolling_window : Select points on a rolling (moving) window. Examples -------- @@ -770,15 +773,236 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= """ if region is None: region = get_region(coordinates) - block_coords = tuple( - i.ravel() - for i in grid_coordinates( - region, spacing=spacing, shape=shape, adjust=adjust, pixel_register=True - ) + block_coords = grid_coordinates( + region, spacing=spacing, shape=shape, adjust=adjust, pixel_register=True ) tree = kdtree(block_coords) labels = tree.query(np.transpose(n_1d_arrays(coordinates, 2)))[1] - return block_coords, labels + return n_1d_arrays(block_coords, len(block_coords)), labels + + +def rolling_window( + coordinates, size, spacing=None, shape=None, region=None, adjust="spacing" +): + """ + Select points on a rolling (moving) window. + + A window of the given size is moved across the region at a given step + (specified by *spacing* or *shape*). Returns the indices of points falling + inside each window step. You can use the indices to select points falling + inside a given window. + + The size of the step when moving the windows can be specified by the + *spacing* parameter. Alternatively, the number of windows in the + South-North and West-East directions can be specified using the *shape* + parameter. **One of the two must be given.** + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). + size : float + The size of the windows. Units should match the units of *coordinates*. + spacing : float, tuple = (s_north, s_east), or None + The window size in the South-North and West-East directions, + respectively. A single value means that the size is equal in both + directions. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. If not region is given, will use the bounding region of + the given points. + adjust : {'spacing', 'region'} + Whether to adjust the spacing or the region if required. 
Ignored if
+        *shape* is given instead of *spacing*. Defaults to adjusting the
+        spacing.
+
+    Returns
+    -------
+    window_coordinates : tuple of arrays
+        Coordinate arrays for the center of each window.
+    indices : array
+        Each element of the array corresponds to the indices of points falling
+        inside a window. The array will have the same shape as the
+        *window_coordinates*. Use the array elements to index the coordinates
+        for each window. The indices will depend on the number of dimensions in
+        the input coordinates. For example, if the coordinates are 2D arrays,
+        each window will contain indices for 2 dimensions (row, column).
+
+    See also
+    --------
+    block_split : Split a region into blocks and label points accordingly.
+
+    Examples
+    --------
+
+    Generate a set of sample coordinates on a grid and determine the indices
+    of points for each rolling window:
+
+    >>> from verde import grid_coordinates
+    >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1)
+    >>> print(coords[0])
+    [[-5. -4. -3. -2. -1.]
+     [-5. -4. -3. -2. -1.]
+     [-5. -4. -3. -2. -1.]
+     [-5. -4. -3. -2. -1.]
+     [-5. -4. -3. -2. -1.]]
+    >>> print(coords[1])
+    [[ 6.  6.  6.  6.  6.]
+     [ 7.  7.  7.  7.  7.]
+     [ 8.  8.  8.  8.  8.]
+     [ 9.  9.  9.  9.  9.]
+     [10. 10. 10. 10. 10.]]
+    >>> # Get the rolling window indices
+    >>> window_coords, indices = rolling_window(coords, size=2, spacing=2)
+    >>> # Window coordinates will be 2D arrays. Their shape is the number of
+    >>> # windows in each dimension
+    >>> print(window_coords[0].shape, window_coords[1].shape)
+    (2, 2) (2, 2)
+    >>> # These are the easting and northing coordinates for the center of
+    >>> # each rolling window
+    >>> for coord in window_coords:
+    ...     print(coord)
+    [[-4. -2.]
+     [-4. -2.]]
+    [[7. 7.]
+     [9. 9.]]
+    >>> # The indices of points falling on each window will have the same shape
+    >>> # as the window center coordinates
+    >>> print(indices.shape)
+    (2, 2)
+    >>> # The points in the first window. Indices are 2D positions because the
+    >>> # coordinate arrays are 2D.
+    >>> print(len(indices[0, 0]))
+    2
+    >>> for dimension in indices[0, 0]:
+    ...     print(dimension)
+    [0 0 0 1 1 1 2 2 2]
+    [0 1 2 0 1 2 0 1 2]
+    >>> for dimension in indices[0, 1]:
+    ...     print(dimension)
+    [0 0 0 1 1 1 2 2 2]
+    [2 3 4 2 3 4 2 3 4]
+    >>> for dimension in indices[1, 0]:
+    ...     print(dimension)
+    [2 2 2 3 3 3 4 4 4]
+    [0 1 2 0 1 2 0 1 2]
+    >>> for dimension in indices[1, 1]:
+    ...     print(dimension)
+    [2 2 2 3 3 3 4 4 4]
+    [2 3 4 2 3 4 2 3 4]
+    >>> # To get the coordinates for each window, use indexing
+    >>> print(coords[0][indices[0, 0]])
+    [-5. -4. -3. -5. -4. -3. -5. -4. -3.]
+    >>> print(coords[1][indices[0, 0]])
+    [6. 6. 6. 7. 7. 7. 8. 8. 8.]
+
+    If the coordinates are 1D, the indices will also be 1D:
+
+    >>> coords1d = [coord.ravel() for coord in coords]
+    >>> window_coords, indices = rolling_window(coords1d, size=2, spacing=2)
+    >>> print(len(indices[0, 0]))
+    1
+    >>> print(indices[0, 0][0])
+    [ 0  1  2  5  6  7 10 11 12]
+    >>> print(indices[0, 1][0])
+    [ 2  3  4  7  8  9 12 13 14]
+    >>> print(indices[1, 0][0])
+    [10 11 12 15 16 17 20 21 22]
+    >>> print(indices[1, 1][0])
+    [12 13 14 17 18 19 22 23 24]
+    >>> # The returned indices can be used in the same way as before
+    >>> print(coords1d[0][indices[0, 0]])
+    [-5. -4. -3. -5. -4. -3. -5. -4. -3.]
+    >>> print(coords1d[1][indices[0, 0]])
+    [6. 6. 6. 7. 7. 7. 8. 8. 8.]
+
+    By default, the windows will span the entire data region. 
You can also + control the specific region you'd like the windows to cover: + + >>> # Coordinates on a larger region but with the same spacing as before + >>> coords = grid_coordinates((-10, 5, 0, 20), spacing=1) + >>> # Get the rolling window indices but limited to the region from before + >>> window_coords, indices = rolling_window( + ... coords, size=2, spacing=2, region=(-5, -1, 6, 10), + ... ) + >>> # The windows should still be in the same place as before + >>> for coord in window_coords: + ... print(coord) + [[-4. -2.] + [-4. -2.]] + [[7. 7.] + [9. 9.]] + >>> # And indexing the coordinates should also provide the same result + >>> print(coords[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + + """ + shapes = [coord.shape for coord in coordinates] + if not all(shape == shapes[0] for shape in shapes): + raise ValueError( + "Coordinate arrays must have the same shape. Given shapes: {}".format( + shapes + ) + ) + if region is None: + region = get_region(coordinates) + # Calculate the region spanning the centers of the rolling windows + window_region = [ + dimension + (-1) ** (i % 2) * size / 2 for i, dimension in enumerate(region) + ] + _check_rolling_window_overlap(window_region, size, shape, spacing) + centers = grid_coordinates( + window_region, spacing=spacing, shape=shape, adjust=adjust + ) + # pykdtree doesn't support query_ball_point yet and we need that + tree = kdtree(coordinates, use_pykdtree=False) + # Coordinates must be transposed because the kd-tree wants them as columns + # of a matrix + # Use p=inf (infinity norm) to get square windows instead of circular ones + indices1d = tree.query_ball_point( + np.transpose(n_1d_arrays(centers, 2)), r=size / 2, p=np.inf + ) + # Make the indices array the same shape as the center coordinates array. + # That preserves the information of the number of windows in each + # dimension. Need to first create an empty array of object type because + # otherwise numpy tries to use the index tuples as dimensions (even if + # given ndim=1 explicitly). Can't make it 1D and then reshape because the + # reshape is ignored for some reason. The workaround is to create the array + # with the correct shape and assign the values to a raveled view of the + # array. + indices = np.empty(centers[0].shape, dtype="object") + # Need to convert the indices to int arrays because unravel_index doesn't + # like empty lists but can handle empty integer arrays in case a window has + # no points inside it. + indices.ravel()[:] = [ + np.unravel_index(np.array(i, dtype="int"), shape=shapes[0]) for i in indices1d + ] + return centers, indices + + +def _check_rolling_window_overlap(region, size, shape, spacing): + """ + Warn the user if there is no overlap between neighboring windows. + """ + if shape is not None: + ndims = len(shape) + dimensions = [region[i * ndims + 1] - region[i * ndims] for i in range(ndims)] + # The - 1 is because we need to divide by the number of intervals, not + # the number of nodes. + spacing = tuple(dim / (n - 1) for dim, n in zip(dimensions, shape)) + spacing = np.atleast_1d(spacing) + if np.any(spacing > size): + warnings.warn( + f"Rolling windows do not overlap (size '{size}' and spacing '{spacing}'). " + "Some data points may not be included in any window. " + "Increase size or decrease spacing to avoid this." 
+ ) def longitude_continuity(coordinates, region): diff --git a/verde/tests/test_coordinates.py b/verde/tests/test_coordinates.py index 701273285..ec38c469b 100644 --- a/verde/tests/test_coordinates.py +++ b/verde/tests/test_coordinates.py @@ -1,6 +1,8 @@ """ Test the coordinate generation functions """ +import warnings + import numpy as np import numpy.testing as npt import pytest @@ -11,9 +13,51 @@ profile_coordinates, grid_coordinates, longitude_continuity, + rolling_window, ) +def test_rolling_window_invalid_coordinate_shapes(): + "Shapes of input coordinates must all be the same" + coordinates = [np.arange(10), np.arange(10).reshape((5, 2))] + with pytest.raises(ValueError): + rolling_window(coordinates, size=2, spacing=1) + + +def test_rolling_window_empty(): + "Make sure empty windows return an empty index" + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + # Use a larger region to make sure the first window is empty + # Doing this will raise a warning for non-overlapping windows. Capture it + # so it doesn't pollute the test log. + with warnings.catch_warnings(record=True): + windows = rolling_window(coords, size=0.001, spacing=1, region=(-7, 1, 4, 12))[ + 1 + ] + assert windows[0, 0][0].size == 0 and windows[0, 0][1].size == 0 + # Make sure we can still index with an empty array + assert coords[0][windows[0, 0]].size == 0 + + +def test_rolling_window_warnings(): + "Should warn users if the windows don't overlap" + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + # For exact same size there will be 1 point overlapping so should not warn + with warnings.catch_warnings(record=True) as warn: + rolling_window(coords, size=2, spacing=2) + assert not any(issubclass(w.category, UserWarning) for w in warn) + args = [dict(spacing=3), dict(spacing=(4, 1)), dict(shape=(1, 2))] + for arg in args: + with warnings.catch_warnings(record=True) as warn: + rolling_window(coords, size=2, **arg) + # Filter out the user warnings from some deprecation warnings that + # might be thrown by other packages. + userwarnings = [w for w in warn if issubclass(w.category, UserWarning)] + assert len(userwarnings) == 1 + assert issubclass(userwarnings[-1].category, UserWarning) + assert str(userwarnings[-1].message).split()[0] == "Rolling" + + def test_spacing_to_shape(): "Check that correct spacing and region are returned" region = (-10, 0, 0, 5) From 62cc8da9a765797d8c96aab20bb37b63f522bb79 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Fri, 13 Mar 2020 17:51:47 +0000 Subject: [PATCH 06/38] Add function to split data on an expanding window (#238) For selecting data in windows of various size around a center point. Sizes do not have to be increasing necessarily. Returns only the indices of points falling inside each window. 
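A minimal usage sketch (assuming a verde build that includes this change; the center
point and window sizes below are arbitrary):

```python
import verde as vd

coordinates = vd.grid_coordinates(region=(-5, -1, 6, 10), spacing=0.5)
# One set of indices per window size, all centered on the same point
indices = vd.expanding_window(coordinates, center=(-3, 8), sizes=[1, 2, 4])
for size, index in zip([1, 2, 4], indices):
    # Index into the coordinate arrays to get the points inside each window
    print(size, coordinates[0][index].size)
```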
--- doc/api/index.rst | 1 + verde/__init__.py | 1 + verde/base/utils.py | 24 ++++++- verde/coordinates.py | 135 ++++++++++++++++++++++++++++++++++++--- verde/tests/test_base.py | 16 ++++- 5 files changed, 165 insertions(+), 12 deletions(-) diff --git a/doc/api/index.rst b/doc/api/index.rst index 384d9750b..71a1b9c8c 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -61,6 +61,7 @@ Coordinate Manipulation inside block_split rolling_window + expanding_window Utilities --------- diff --git a/verde/__init__.py b/verde/__init__.py index 7f565917f..b7df9b76a 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -8,6 +8,7 @@ inside, block_split, rolling_window, + expanding_window, profile_coordinates, get_region, pad_region, diff --git a/verde/base/utils.py b/verde/base/utils.py index d35c216a8..4c44c4ca0 100644 --- a/verde/base/utils.py +++ b/verde/base/utils.py @@ -25,6 +25,21 @@ def check_data(data): return data +def check_coordinates(coordinates): + """ + Check that the given coordinate arrays are what we expect them to be. + Should be a tuple with arrays of the same shape. + """ + shapes = [coord.shape for coord in coordinates] + if not all(shape == shapes[0] for shape in shapes): + raise ValueError( + "Coordinate arrays must have the same shape. Coordinate shapes: {}".format( + shapes + ) + ) + return coordinates + + def check_fit_input(coordinates, data, weights, unpack=True): """ Validate the inputs to the fit method of gridders. @@ -59,8 +74,13 @@ def check_fit_input(coordinates, data, weights, unpack=True): """ data = check_data(data) weights = check_data(weights) - if any(i.shape != j.shape for i in coordinates for j in data): - raise ValueError("Coordinate and data arrays must have the same shape.") + coordinates = check_coordinates(coordinates) + if any(i.shape != coordinates[0].shape for i in data): + raise ValueError( + "Data arrays must have the same shape {} as coordinates. Data shapes: {}.".format( + coordinates[0].shape, [i.shape for i in data] + ) + ) if any(w is not None for w in weights): if len(weights) != len(data): raise ValueError( diff --git a/verde/coordinates.py b/verde/coordinates.py index 02532ddea..d29d70eac 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -6,7 +6,7 @@ import numpy as np from sklearn.utils import check_random_state -from .base.utils import n_1d_arrays +from .base.utils import n_1d_arrays, check_coordinates from .utils import kdtree @@ -738,6 +738,7 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= -------- BlockReduce : Apply a reduction operation to the data in blocks (windows). rolling_window : Select points on a rolling (moving) window. + expanding_window : Select points on windows of changing size. Examples -------- @@ -771,6 +772,7 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= [6 6 6 7 7 7]] """ + coordinates = check_coordinates(coordinates) if region is None: region = get_region(coordinates) block_coords = grid_coordinates( @@ -835,6 +837,7 @@ def rolling_window( See also -------- block_split : Split a region into blocks and label points accordingly. + expanding_window : Select points on windows of changing size. Examples -------- @@ -943,13 +946,7 @@ def rolling_window( [6. 6. 6. 7. 7. 7. 8. 8. 8.] """ - shapes = [coord.shape for coord in coordinates] - if not all(shape == shapes[0] for shape in shapes): - raise ValueError( - "Coordinate arrays must have the same shape. 
Given shapes: {}".format( - shapes - ) - ) + coordinates = check_coordinates(coordinates) if region is None: region = get_region(coordinates) # Calculate the region spanning the centers of the rolling windows @@ -981,7 +978,8 @@ def rolling_window( # like empty lists but can handle empty integer arrays in case a window has # no points inside it. indices.ravel()[:] = [ - np.unravel_index(np.array(i, dtype="int"), shape=shapes[0]) for i in indices1d + np.unravel_index(np.array(i, dtype="int"), shape=coordinates[0].shape) + for i in indices1d ] return centers, indices @@ -1005,6 +1003,125 @@ def _check_rolling_window_overlap(region, size, shape, spacing): ) +def expanding_window(coordinates, center, sizes): + """ + Select points on windows of changing size around a center point. + + Returns the indices of points falling inside each window. + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). + center : tuple + The coordinates of the center of the window. Should be in the + following order: (easting, northing, vertical, ...). + sizes : array + The sizes of the windows. Does not have to be in any particular order. + The order of indices returned will match the order of window sizes + given. Units should match the units of *coordinates* and *center*. + + Returns + ------- + indices : list + Each element of the list corresponds to the indices of points falling + inside a window. Use them to index the coordinates for each window. The + indices will depend on the number of dimensions in the input + coordinates. For example, if the coordinates are 2D arrays, each window + will contain indices for 2 dimensions (row, column). + + See also + -------- + block_split : Split a region into blocks and label points accordingly. + rolling_window : Select points on a rolling (moving) window. + + Examples + -------- + + Generate a set of sample coordinates on a grid and determine the indices + of points for each expanding window: + + >>> from verde import grid_coordinates + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + >>> print(coords[0]) + [[-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.]] + >>> print(coords[1]) + [[ 6. 6. 6. 6. 6.] + [ 7. 7. 7. 7. 7.] + [ 8. 8. 8. 8. 8.] + [ 9. 9. 9. 9. 9.] + [10. 10. 10. 10. 10.]] + >>> # Get the expanding window indices + >>> indices = expanding_window(coords, center=(-3, 8), sizes=[1, 2, 4]) + >>> # There is one index per window + >>> print(len(indices)) + 3 + >>> # The points in the first window. Indices are 2D positions because the + >>> # coordinate arrays are 2D. + >>> print(len(indices[0])) + 2 + >>> for dimension in indices[0]: + ... print(dimension) + [2] + [2] + >>> for dimension in indices[1]: + ... print(dimension) + [1 1 1 2 2 2 3 3 3] + [1 2 3 1 2 3 1 2 3] + >>> for dimension in indices[2]: + ... print(dimension) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] + [0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] + >>> # To get the coordinates for each window, use indexing + >>> print(coords[0][indices[0]]) + [-3.] + >>> print(coords[1][indices[0]]) + [8.] + >>> print(coords[0][indices[1]]) + [-4. -3. -2. -4. -3. -2. -4. -3. -2.] + >>> print(coords[1][indices[1]]) + [7. 7. 7. 8. 8. 8. 9. 9. 9.] 
+ + If the coordinates are 1D, the indices will also be 1D: + + >>> coords1d = [coord.ravel() for coord in coords] + >>> indices = expanding_window(coords1d, center=(-3, 8), sizes=[1, 2, 4]) + >>> print(len(indices)) + 3 + >>> # Since coordinates are 1D, there is only one index + >>> print(len(indices[0])) + 1 + >>> print(indices[0][0]) + [12] + >>> print(indices[1][0]) + [ 6 7 8 11 12 13 16 17 18] + >>> # The returned indices can be used in the same way as before + >>> print(coords1d[0][indices[0]]) + [-3.] + >>> print(coords1d[1][indices[0]]) + [8.] + + """ + coordinates = check_coordinates(coordinates) + shape = coordinates[0].shape + center = np.atleast_2d(center) + # pykdtree doesn't support query_ball_point yet and we need that + tree = kdtree(coordinates, use_pykdtree=False) + indices = [] + for size in sizes: + # Use p=inf (infinity norm) to get square windows instead of circular + index1d = tree.query_ball_point(center, r=size / 2, p=np.inf)[0] + # Convert indices to an array to avoid errors when the index is empty + # (no points in the window). unravel_index doesn't like empty lists. + indices.append(np.unravel_index(np.array(index1d, dtype="int"), shape=shape)) + return indices + + def longitude_continuity(coordinates, region): """ Modify coordinates and region boundaries to ensure longitude continuity. diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index b46884b8f..184f65561 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -6,7 +6,7 @@ import numpy.testing as npt import pytest -from ..base.utils import check_fit_input +from ..base.utils import check_fit_input, check_coordinates from ..base.base_classes import ( BaseGridder, get_dims, @@ -16,6 +16,20 @@ from ..coordinates import grid_coordinates, scatter_points +def test_check_coordinates(): + "Should raise a ValueError is the coordinates have different shapes." + # Should not raise an error + check_coordinates([np.arange(10), np.arange(10)]) + check_coordinates([np.arange(10).reshape((5, 2)), np.arange(10).reshape((5, 2))]) + # Should raise an error + with pytest.raises(ValueError): + check_coordinates([np.arange(10), np.arange(10).reshape((5, 2))]) + with pytest.raises(ValueError): + check_coordinates( + [np.arange(10).reshape((2, 5)), np.arange(10).reshape((5, 2))] + ) + + def test_get_dims(): "Tests that get_dims returns the expected results" assert get_dims(dims=None) == ("northing", "easting") From ac0c162f3db70649798abe672ba4b2f8578105e2 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 16 Mar 2020 18:18:44 +0000 Subject: [PATCH 07/38] Explicitly set PR trigger for all branches on Azure (#244) For some reason, Azure Pipelines CI wasn't starting up PRs, only on master and tags. Explicitly setting a trigger for PRs against any branch seems to work. --- .azure-pipelines.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 5429b68b3..914cc9397 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -9,6 +9,12 @@ trigger: - master - refs/tags/* +# Make sure triggers are set for PRs to any branch. +pr: + branches: + include: + - '*' + jobs: From 516f21bc23eefaecb7067828a3b2ae6436da278f Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 16 Mar 2020 18:47:14 +0000 Subject: [PATCH 08/38] Fix bug for 3+ coordinates in windowing functions (#243) The `rolling_window` and `expanding_window` functions fail if given more 2 coordinates. 
If we had full support for N-D coordinates, they could be N-D windowing functions but until that time, they will have to be 2D (horizontal) only functions. Add test cases using `extra_coords` in `grid_coordinates` to generate a third coordinate. Fixes #242 --- verde/coordinates.py | 96 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/verde/coordinates.py b/verde/coordinates.py index d29d70eac..ea6da7cdd 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -772,7 +772,9 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= [6 6 6 7 7 7]] """ - coordinates = check_coordinates(coordinates) + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] if region is None: region = get_region(coordinates) block_coords = grid_coordinates( @@ -803,7 +805,8 @@ def rolling_window( ---------- coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the - following order: (easting, northing, vertical, ...). + following order: (easting, northing, vertical, ...). Only easting and + northing will be used, all subsequent coordinates will be ignored. size : float The size of the windows. Units should match the units of *coordinates*. spacing : float, tuple = (s_north, s_east), or None @@ -945,8 +948,54 @@ def rolling_window( >>> print(coords[1][indices[0, 0]]) [6. 6. 6. 7. 7. 7. 8. 8. 8.] + Only the first 2 coordinates are considered (assumed to be the horizontal + ones). All others will be ignored by the function. + + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1, extra_coords=20) + >>> print(coords[2]) + [[20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.]] + >>> window_coords, indices = rolling_window(coords, size=2, spacing=2) + >>> # The windows would be the same in this case since coords[2] is ignored + >>> for coord in window_coords: + ... print(coord) + [[-4. -2.] + [-4. -2.]] + [[7. 7.] + [9. 9.]] + >>> print(indices.shape) + (2, 2) + >>> for dimension in indices[0, 0]: + ... print(dimension) + [0 0 0 1 1 1 2 2 2] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[0, 1]: + ... print(dimension) + [0 0 0 1 1 1 2 2 2] + [2 3 4 2 3 4 2 3 4] + >>> for dimension in indices[1, 0]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[1, 1]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [2 3 4 2 3 4 2 3 4] + >>> # The indices can still be used with the third coordinate + >>> print(coords[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + >>> print(coords[2][indices[0, 0]]) + [20. 20. 20. 20. 20. 20. 20. 20. 20.] + """ - coordinates = check_coordinates(coordinates) + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] if region is None: region = get_region(coordinates) # Calculate the region spanning the centers of the rolling windows @@ -1013,7 +1062,8 @@ def expanding_window(coordinates, center, sizes): ---------- coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the - following order: (easting, northing, vertical, ...). + following order: (easting, northing, vertical, ...). 
Only easting and + northing will be used, all subsequent coordinates will be ignored. center : tuple The coordinates of the center of the window. Should be in the following order: (easting, northing, vertical, ...). @@ -1106,8 +1156,44 @@ def expanding_window(coordinates, center, sizes): >>> print(coords1d[1][indices[0]]) [8.] + Only the first 2 coordinates are considered (assumed to be the horizontal + ones). All others will be ignored by the function. + + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1, extra_coords=15) + >>> print(coords[2]) + [[15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.]] + >>> indices = expanding_window(coords, center=(-3, 8), sizes=[1, 2, 4]) + >>> # The returned indices should be the same as before, ignoring coords[2] + >>> print(len(indices[0])) + 2 + >>> for dimension in indices[0]: + ... print(dimension) + [2] + [2] + >>> for dimension in indices[1]: + ... print(dimension) + [1 1 1 2 2 2 3 3 3] + [1 2 3 1 2 3 1 2 3] + >>> for dimension in indices[2]: + ... print(dimension) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] + [0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] + >>> # The indices can be used to index all 3 coordinates + >>> print(coords[0][indices[0]]) + [-3.] + >>> print(coords[1][indices[0]]) + [8.] + >>> print(coords[2][indices[0]]) + [15.] + """ - coordinates = check_coordinates(coordinates) + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] shape = coordinates[0].shape center = np.atleast_2d(center) # pykdtree doesn't support query_ball_point yet and we need that From 072e9df774fb63b0362bd6c95b3e45b7a1dc240c Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 17 Mar 2020 07:07:04 -0300 Subject: [PATCH 09/38] Generalize coordinate systems in BaseGridder docs (#240) Make docstrings of `BaseGridder` more general, without specifying a coordinate system. Add `dims` as a class variable that will be used by default on `grid`, `scatter` and `profile` methods. --- verde/base/base_classes.py | 62 ++++++++++++++++++++------------------ verde/tests/test_base.py | 10 +++--- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 503f9abbe..bc581c8c3 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -83,6 +83,10 @@ class BaseGridder(BaseEstimator): """ + # The default dimension names for generated outputs + # (pd.DataFrame, xr.Dataset, etc) + dims = ("northing", "easting") + def predict(self, coordinates): """ Predict data on the given coordinate values. NOT IMPLEMENTED. @@ -148,6 +152,8 @@ def filter(self, coordinates, data, weights=None): coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. data : array or tuple of arrays The data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each @@ -190,6 +196,8 @@ def score(self, coordinates, data, weights=None): coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. 
data : array or tuple of arrays The data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each @@ -246,8 +254,7 @@ def grid( Parameters ---------- region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. + The west, east, south, and north boundaries of a given region. shape : tuple = (n_north, n_east) or None The number of points in the South-North and West-East directions, respectively. @@ -256,8 +263,10 @@ def grid( respectively. dims : list or None The names of the northing and easting data dimensions, - respectively, in the output grid. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output grid. Default is determined from the + ``dims`` attribute of the class. Must be defined in the following + order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** data_names : list of None The name(s) of the data variables in the output grid. Defaults to @@ -285,7 +294,7 @@ def grid( verde.grid_coordinates : Generate the coordinate values for the grid. """ - dims = get_dims(dims) + dims = self._get_dims(dims) region = get_instance_region(self, region) coordinates = grid_coordinates(region, shape=shape, spacing=spacing, **kwargs) if projection is None: @@ -327,8 +336,7 @@ def scatter( Parameters ---------- region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. + The west, east, south, and north boundaries of a given region. size : int The number of points to generate. random_state : numpy.random.RandomState or an int seed @@ -338,8 +346,10 @@ def scatter( (resulting in different numbers with each run). dims : list or None The names of the northing and easting data dimensions, - respectively, in the output dataframe. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output dataframe. Default is determined from + the ``dims`` attribute of the class. Must be defined in the + following order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** data_names : list of None The name(s) of the data variables in the output dataframe. Defaults @@ -362,7 +372,7 @@ def scatter( The interpolated values on a random set of points. """ - dims = get_dims(dims) + dims = self._get_dims(dims) region = get_instance_region(self, region) coordinates = scatter_points(region, size, random_state=random_state, **kwargs) if projection is None: @@ -411,8 +421,10 @@ def profile( The number of points to generate. dims : list or None The names of the northing and easting data dimensions, - respectively, in the output dataframe. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output dataframe. Default is determined from + the ``dims`` attribute of the class. Must be defined in the + following order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** data_names : list of None The name(s) of the data variables in the output dataframe. Defaults @@ -435,7 +447,7 @@ def profile( The interpolated values along the profile. 
""" - dims = get_dims(dims) + dims = self._get_dims(dims) coordinates, distances = profile_coordinates(point1, point2, size, **kwargs) if projection is None: data = check_data(self.predict(coordinates)) @@ -450,23 +462,13 @@ def profile( columns.extend(zip(data_names, data)) return pd.DataFrame(dict(columns), columns=[c[0] for c in columns]) - -def get_dims(dims): - """ - Get default dimension names. - - Examples - -------- - - >>> get_dims(dims=None) - ('northing', 'easting') - >>> get_dims(dims=('john', 'paul')) - ('john', 'paul') - - """ - if dims is not None: - return dims - return ("northing", "easting") + def _get_dims(self, dims): + """ + Get default dimension names. + """ + if dims is not None: + return dims + return self.dims def get_data_names(data, data_names): diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 184f65561..c95eef98e 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -1,4 +1,4 @@ -# pylint: disable=unused-argument,too-many-locals +# pylint: disable=unused-argument,too-many-locals,protected-access """ Test the base classes and their utility functions. """ @@ -9,7 +9,6 @@ from ..base.utils import check_fit_input, check_coordinates from ..base.base_classes import ( BaseGridder, - get_dims, get_data_names, get_instance_region, ) @@ -32,8 +31,11 @@ def test_check_coordinates(): def test_get_dims(): "Tests that get_dims returns the expected results" - assert get_dims(dims=None) == ("northing", "easting") - assert get_dims(dims=("john", "paul")) == ("john", "paul") + gridder = BaseGridder() + assert gridder._get_dims(dims=None) == ("northing", "easting") + assert gridder._get_dims(dims=("john", "paul")) == ("john", "paul") + gridder.dims = ("latitude", "longitude") + assert gridder._get_dims(dims=None) == ("latitude", "longitude") def test_get_data_names(): From 5c300d4349c09d6e7be119a8236de327e1ac13b2 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Tue, 17 Mar 2020 15:42:47 +0000 Subject: [PATCH 10/38] Add a convex hull masking function (#237) Use scipy's Delaunay triangulation to determine points that are outside the convex hull of the data points. Useful to mask out wildly extrapolated points from grids. It's easier to use than `distance_mask` because it doesn't require distance calculations (and the associated projections). Use the new function in some examples and tutorials. Normalize the coordinates prior to calculations to avoid errors with floating point conversion when triangulating (as suggested by Ari Hartikainen). Fixes #126 --- doc/api/index.rst | 10 ++- examples/convex_hull_mask.py | 51 +++++++++++++ examples/spline_weights.py | 7 +- examples/vector_uncoupled.py | 6 +- tutorials/weights.py | 5 ++ verde/__init__.py | 2 +- verde/mask.py | 139 ++++++++++++++++++++++++++++++++--- verde/tests/test_mask.py | 42 ++++++++++- 8 files changed, 238 insertions(+), 24 deletions(-) create mode 100644 examples/convex_hull_mask.py diff --git a/doc/api/index.rst b/doc/api/index.rst index 71a1b9c8c..bcc49d8bc 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -63,6 +63,15 @@ Coordinate Manipulation rolling_window expanding_window +Masking +------- + +.. 
autosummary:: + :toctree: generated/ + + distance_mask + convexhull_mask + Utilities --------- @@ -71,7 +80,6 @@ Utilities test maxabs - distance_mask variance_to_weights grid_to_table median_distance diff --git a/examples/convex_hull_mask.py b/examples/convex_hull_mask.py new file mode 100644 index 000000000..438e55842 --- /dev/null +++ b/examples/convex_hull_mask.py @@ -0,0 +1,51 @@ +""" +Mask grid points by convex hull +=============================== + +Sometimes, data points are unevenly distributed. In such cases, we might not +want to have interpolated grid points that are too far from any data point. +Function :func:`verde.convexhull_mask` allows us to set grid points that fall +outside of the convex hull of the data points to NaN or some other value. +""" +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import pyproj +import numpy as np +import verde as vd + +# The Baja California bathymetry dataset has big gaps on land. We want to mask +# these gaps on a dummy grid that we'll generate over the region just to show +# what that looks like. +data = vd.datasets.fetch_baja_bathymetry() +region = vd.get_region((data.longitude, data.latitude)) + +# Generate the coordinates for a regular grid mask +spacing = 10 / 60 +coordinates = vd.grid_coordinates(region, spacing=spacing) + +# Generate a mask for points. The mask is True for points that are within the +# convex hull. We can provide a projection function to convert the coordinates +# before the convex hull is calculated (Mercator in this case). +mask = vd.convexhull_mask( + data_coordinates=(data.longitude, data.latitude), + coordinates=coordinates, + projection=pyproj.Proj(proj="merc", lat_ts=data.latitude.mean()), +) +print(mask) + +# Create a dummy grid with ones that we can mask to show the results. Turn +# points that are outside of the convex hull into NaNs so they won't show up in +# our plot. +dummy_data = np.ones_like(coordinates[0]) +dummy_data[~mask] = np.nan + +# Make a plot of the masked data and the data locations. +crs = ccrs.PlateCarree() +plt.figure(figsize=(7, 6)) +ax = plt.axes(projection=ccrs.Mercator()) +ax.set_title("Only keep grid points that inside of the convex hull") +ax.plot(data.longitude, data.latitude, ".y", markersize=0.5, transform=crs) +ax.pcolormesh(*coordinates, dummy_data, transform=crs) +vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.tight_layout() +plt.show() diff --git a/examples/spline_weights.py b/examples/spline_weights.py index 73de4fd52..171fa83fd 100644 --- a/examples/spline_weights.py +++ b/examples/spline_weights.py @@ -62,11 +62,8 @@ dims=["latitude", "longitude"], data_names=["velocity"], ) -grid = vd.distance_mask( - (data.longitude, data.latitude), - maxdist=5 * spacing * 111e3, - grid=grid_full, - projection=projection, +grid = vd.convexhull_mask( + (data.longitude, data.latitude), grid=grid_full, projection=projection ) fig, axes = plt.subplots( diff --git a/examples/vector_uncoupled.py b/examples/vector_uncoupled.py index acc6db163..6f641940a 100644 --- a/examples/vector_uncoupled.py +++ b/examples/vector_uncoupled.py @@ -61,13 +61,11 @@ print("Cross-validation R^2 score: {:.2f}".format(score)) # Interpolate the wind speed onto a regular geographic grid and mask the data that are -# far from the observation points +# outside of the convex hull of the data points. 
grid_full = chain.grid( region, spacing=spacing, projection=projection, dims=["latitude", "longitude"] ) -grid = vd.distance_mask( - coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection -) +grid = vd.convexhull_mask(coordinates, grid=grid_full, projection=projection) # Make maps of the original and gridded wind speed plt.figure(figsize=(6, 6)) diff --git a/tutorials/weights.py b/tutorials/weights.py index 099f1f216..74f5c5c01 100644 --- a/tutorials/weights.py +++ b/tutorials/weights.py @@ -246,6 +246,8 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): dims=["latitude", "longitude"], data_names=["velocity"], ) +# Avoid showing interpolation outside of the convex hull of the data points. +grid = vd.convexhull_mask(coordinates, grid=grid, projection=projection) ######################################################################################## # Calculate an unweighted spline as well for comparison. @@ -262,6 +264,9 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): dims=["latitude", "longitude"], data_names=["velocity"], ) +grid_unweighted = vd.convexhull_mask( + coordinates, grid=grid_unweighted, projection=projection +) ######################################################################################## # Finally, plot the weighted and unweighted grids side by side. diff --git a/verde/__init__.py b/verde/__init__.py index b7df9b76a..a77e65cdc 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -15,7 +15,7 @@ project_region, longitude_continuity, ) -from .mask import distance_mask +from .mask import distance_mask, convexhull_mask from .utils import variance_to_weights, maxabs, grid_to_table from .io import load_surfer from .distances import median_distance diff --git a/verde/mask.py b/verde/mask.py index 9c06ca29f..6d92cb46d 100644 --- a/verde/mask.py +++ b/verde/mask.py @@ -3,7 +3,10 @@ """ import numpy as np -from .base import n_1d_arrays +# pylint doesn't pick up on this import for some reason +from scipy.spatial import Delaunay # pylint: disable=no-name-in-module + +from .base.utils import n_1d_arrays, check_coordinates from .utils import kdtree @@ -43,9 +46,10 @@ def distance_mask( northing will be used, all subsequent coordinates will be ignored. grid : None or :class:`xarray.Dataset` 2D grid with values to be masked. Will use the first two dimensions of - the grid as northing and easting coordinates, respectively. The mask - will be applied to *grid* using the :meth:`xarray.Dataset.where` - method. + the grid as northing and easting coordinates, respectively. For this to + work, the grid dimensions **must be ordered as northing then easting**. + The mask will be applied to *grid* using the + :meth:`xarray.Dataset.where` method. 
projection : callable or None If not None, then should be a callable object ``projection(easting, northing) -> (proj_easting, proj_northing)`` that takes in easting and @@ -93,14 +97,7 @@ def distance_mask( [nan nan nan nan nan nan]] """ - if coordinates is None and grid is None: - raise ValueError("Either coordinates or grid must be given.") - if coordinates is None: - dims = [grid[var].dims for var in grid.data_vars][0] - coordinates = np.meshgrid(grid.coords[dims[1]], grid.coords[dims[0]]) - if len(set(i.shape for i in coordinates)) != 1: - raise ValueError("Coordinate arrays must have the same shape.") - shape = coordinates[0].shape + coordinates, shape = _get_grid_coordinates(coordinates, grid) if projection is not None: data_coordinates = projection(*n_1d_arrays(data_coordinates, 2)) coordinates = projection(*n_1d_arrays(coordinates, 2)) @@ -110,3 +107,121 @@ def distance_mask( if grid is not None: return grid.where(mask) return mask + + +def convexhull_mask( + data_coordinates, coordinates=None, grid=None, projection=None, +): + """ + Mask grid points that are outside the convex hull of the given data points. + + Either *coordinates* or *grid* must be given: + + * If *coordinates* is not None, produces an array that is False when a + point is outside the convex hull and True otherwise. + * If *grid* is not None, produces a mask and applies it to *grid* (an + :class:`xarray.Dataset`). + + Parameters + ---------- + data_coordinates : tuple of arrays + Same as *coordinates* but for the data points. + coordinates : None or tuple of arrays + Arrays with the coordinates of each point that will be masked. Should + be in the following order: (easting, northing, ...). Only easting and + northing will be used, all subsequent coordinates will be ignored. + grid : None or :class:`xarray.Dataset` + 2D grid with values to be masked. Will use the first two dimensions of + the grid as northing and easting coordinates, respectively. For this to + work, the grid dimensions **must be ordered as northing then easting**. + The mask will be applied to *grid* using the + :meth:`xarray.Dataset.where` method. + projection : callable or None + If not None, then should be a callable object ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected easting and northing + coordinate arrays. This function will be used to project the given + coordinates (or the ones extracted from the grid) before calculating + distances. + + Returns + ------- + mask : array or :class:`xarray.Dataset` + If *coordinates* was given, then a boolean array with the same shape as + the elements of *coordinates*. If *grid* was given, then an + :class:`xarray.Dataset` with the mask applied to it. 
+ + Examples + -------- + + >>> from verde import grid_coordinates + >>> region = (0, 5, -10, -4) + >>> spacing = 1 + >>> coords = grid_coordinates(region, spacing=spacing) + >>> data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + >>> mask = convexhull_mask(data_coords, coordinates=coords) + >>> print(mask) + [[False False False False False False] + [False False True True False False] + [False False True True False False] + [False False True True False False] + [False False True True False False] + [False False False False False False] + [False False False False False False]] + >>> # Mask an xarray.Dataset directly + >>> import xarray as xr + >>> coords_dict = {"easting": coords[0][0, :], "northing": coords[1][:, 0]} + >>> data_vars = {"scalars": (["northing", "easting"], np.ones(mask.shape))} + >>> grid = xr.Dataset(data_vars, coords=coords_dict) + >>> masked = convexhull_mask(data_coords, grid=grid) + >>> print(masked.scalars.values) + [[nan nan nan nan nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan nan nan nan nan] + [nan nan nan nan nan nan]] + + """ + coordinates, shape = _get_grid_coordinates(coordinates, grid) + n_coordinates = 2 + # Make sure they are arrays so we can normalize + data_coordinates = n_1d_arrays(data_coordinates, n_coordinates) + coordinates = n_1d_arrays(coordinates, n_coordinates) + if projection is not None: + data_coordinates = projection(*data_coordinates) + coordinates = projection(*coordinates) + # Normalize the coordinates to avoid errors from qhull when values are very + # large (as occurs when projections are used). + means = [coord.mean() for coord in data_coordinates] + stds = [coord.std() for coord in data_coordinates] + data_coordinates = tuple( + (coord - mean) / std for coord, mean, std in zip(data_coordinates, means, stds) + ) + coordinates = tuple( + (coord - mean) / std for coord, mean, std in zip(coordinates, means, stds) + ) + triangles = Delaunay(np.transpose(data_coordinates)) + # Find the triangle that contains each grid point. + # -1 indicates that it's not in any triangle. + in_triangle = triangles.find_simplex(np.transpose(coordinates)) + mask = (in_triangle != -1).reshape(shape) + if grid is not None: + return grid.where(mask) + return mask + + +def _get_grid_coordinates(coordinates, grid): + """ + If coordinates is given, return it and their shape. Otherwise, get + coordinate arrays from the grid. 
+ """ + if coordinates is None and grid is None: + raise ValueError("Either coordinates or grid must be given.") + if coordinates is None: + dims = [grid[var].dims for var in grid.data_vars][0] + coordinates = np.meshgrid(grid.coords[dims[1]], grid.coords[dims[0]]) + check_coordinates(coordinates) + shape = coordinates[0].shape + return coordinates, shape diff --git a/verde/tests/test_mask.py b/verde/tests/test_mask.py index cfb80ed65..5a7bd2577 100644 --- a/verde/tests/test_mask.py +++ b/verde/tests/test_mask.py @@ -6,10 +6,50 @@ import xarray as xr import pytest -from ..mask import distance_mask +from ..mask import distance_mask, convexhull_mask from ..coordinates import grid_coordinates +def test_convexhull_mask(): + "Check that the mask works for basic input" + region = (0, 5, -10, -4) + coords = grid_coordinates(region, spacing=1) + data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + mask = convexhull_mask(data_coords, coordinates=coords) + true = [ + [False, False, False, False, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, False, False, False, False], + [False, False, False, False, False, False], + ] + assert mask.tolist() == true + + +def test_convexhull_mask_projection(): + "Check that the mask works when given a projection" + region = (0, 5, -10, -4) + coords = grid_coordinates(region, spacing=1) + data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + # For a linear projection, the result should be the same since there is no + # area change in the data. + mask = convexhull_mask( + data_coords, coordinates=coords, projection=lambda e, n: (10 * e, 10 * n), + ) + true = [ + [False, False, False, False, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, False, False, False, False], + [False, False, False, False, False, False], + ] + assert mask.tolist() == true + + def test_distance_mask(): "Check that the mask works for basic input" region = (0, 5, -10, -4) From 8996a7d89fce16c104efa7e9d841fe24bcbdef12 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 23 Mar 2020 12:24:54 +0000 Subject: [PATCH 11/38] Fix bug with profile distances being unprojected (#231) `BaseGridder.profile` has the option to apply a projection to the coordinates before predicting so we can pass geographic coordinates to cartesian gridders. In these cases, the distance along the profile is calculated by `profile_coordinates` with the unprojected coordinates (in the geographic case it would be degrees). The profile point calculation is also done assuming that coordinates are Cartesian, which is clearly wrong if inputs are longitude and latitude. To fix this, project the input points prior to passing to `profile_coordinates`. This means that the distances are Cartesian and generation of profile points is also Cartesian (as is assumed by `profile_coordinates`). The generated coordinates are projected back so that the user gets longitude and latitude but distances are still projected Cartesian. Add a warning in the docstring of the modified behaviour due to this bug and a section in the tutorial on projections. 
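A minimal sketch of the intended usage (the `CheckerBoard` gridder and the
Mercator projection below are only stand-ins; any fitted Verde gridder and any
pyproj-style projection that accepts an `inverse=True` keyword would work):

    import pyproj
    import verde as vd

    # Stand-in for a gridder fitted on projected (Cartesian) coordinates.
    gridder = vd.datasets.CheckerBoard()
    projection = pyproj.Proj(proj="merc", lat_ts=-22)
    # point1/point2 are geographic. They are projected *before* the evenly
    # spaced profile points are generated, so "distance" comes back in
    # projected meters while longitude/latitude are returned unprojected.
    profile = gridder.profile(
        point1=(-45, -24),
        point2=(-42, -20),
        size=50,
        projection=projection,
        dims=("latitude", "longitude"),
    )
    print(profile.distance.max())  # Cartesian meters, not degrees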
--- tutorials/projections.py | 61 ++++++++++++++++++++++++++++++++++ verde/base/base_classes.py | 57 +++++++++++++++++++++++++------- verde/tests/test_base.py | 67 +++++++++++++++++++++++++++----------- 3 files changed, 155 insertions(+), 30 deletions(-) diff --git a/tutorials/projections.py b/tutorials/projections.py index 16edce9d4..0d6f34b56 100644 --- a/tutorials/projections.py +++ b/tutorials/projections.py @@ -132,5 +132,66 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.show() + +######################################################################################## +# Profiles +# -------- +# +# For profiles, things are a bit different. The projection is applied to the +# input points before coordinates are generated. So the profile will be evenly +# spaced in *projected coordinates*, not geographic coordinates. This is to +# avoid issues with calculating distances on a sphere. +# +# The coordinates returned by the ``profile`` method will be in geographic +# coordinates, so projections given to ``profile`` must take an ``inverse`` +# argument so we can undo the projection. +# +# The main utility of using a projection with ``profile`` is being able to pass +# in points in geographic coordinates and get coordinates back in that same +# system (making it easier to plot on a map). +# +# To generate a profile cutting across our bathymetry data, we can use +# longitude and latitude points taken from the map above). + +start = (-114.5, 24.7) +end = (-110, 20.5) +profile = spline.profile( + point1=start, + point2=end, + size=200, + projection=projection, + dims=("latitude", "longitude"), + data_names=["bathymetry"], +) +print(profile) + +######################################################################################## +# Plot the profile location on our geographic grid from above. + +plt.figure(figsize=(7, 6)) +ax = plt.axes(projection=ccrs.Mercator()) +ax.set_title("Profile location") +pc = grid_geo.bathymetry.plot.pcolormesh( + ax=ax, transform=ccrs.PlateCarree(), vmax=0, zorder=-1, add_colorbar=False +) +plt.colorbar(pc).set_label("meters") +ax.plot(profile.longitude, profile.latitude, "-k", transform=ccrs.PlateCarree()) +ax.text(start[0], start[1], "A", transform=ccrs.PlateCarree()) +ax.text(end[0], end[1], "B", transform=ccrs.PlateCarree()) +vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.show() + +######################################################################################## +# And finally plot the profile. + +plt.figure(figsize=(8, 3)) +ax = plt.axes() +ax.set_title("Profile of bathymetry (A-B)") +ax.plot(profile.distance, profile.bathymetry, "-k") +ax.set_xlabel("Distance (m)") +ax.set_ylabel("Bathymetry (m)") +ax.set_xlim(profile.distance.min(), profile.distance.max()) +ax.grid() plt.tight_layout() plt.show() diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index bc581c8c3..cc988d015 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -409,6 +409,30 @@ def profile( Includes the calculated Cartesian distance from *point1* for each data point in the profile. + To specify *point1* and *point2* in a coordinate system that would + require projection to Cartesian (geographic longitude and latitude, for + example), use the ``projection`` argument. With this option, the input + points will be projected using the given projection function prior to + computations. The generated Cartesian profile coordinates will be + projected back to the original coordinate system. 
**Note that the + profile points are evenly spaced in projected coordinates, not the + original system (e.g., geographic)**. + + .. warning:: + + **The profile calculation method with a projection has changed in + Verde 1.4.0**. Previous versions generated coordinates (assuming + they were Cartesian) and projected them afterwards. This led to + "distances" being incorrectly handled and returned in unprojected + coordinates. For example, if ``projection`` is from geographic to + Mercator, the distances would be "angles" (incorrectly calculated + as if they were Cartesian). After 1.4.0, *point1* and *point2* are + projected prior to generating coordinates for the profile, + guaranteeing that distances are properly handled in a Cartesian + system. **With this change, the profile points are now evenly + spaced in projected coordinates and the distances are returned in + projected coordinates as well.** + Parameters ---------- point1 : tuple @@ -433,13 +457,16 @@ def profile( ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. projection : callable or None - If not None, then should be a callable object - ``projection(easting, northing) -> (proj_easting, proj_northing)`` - that takes in easting and northing coordinate arrays and returns - projected northing and easting coordinate arrays. This function - will be used to project the generated profile coordinates before - passing them into ``predict``. For example, you can use this to - generate a geographic profile from a Cartesian gridder. + If not None, then should be a callable object ``projection(easting, + northing, inverse=False) -> (proj_easting, proj_northing)`` that + takes in easting and northing coordinate arrays and returns + projected northing and easting coordinate arrays. Should also take + an optional keyword argument ``inverse`` (default to False) that if + True will calculate the inverse transform instead. This function + will be used to project the profile end points before generating + coordinates and passing them into ``predict``. It will also be used + to undo the projection of the coordinates before returning the + results. Returns ------- @@ -448,11 +475,19 @@ def profile( """ dims = self._get_dims(dims) + # Project the input points to generate the profile in Cartesian + # coordinates (the distance calculation doesn't make sense in + # geographic coordinates since we don't do actual distances on a + # sphere). + if projection is not None: + point1 = projection(*point1) + point2 = projection(*point2) coordinates, distances = profile_coordinates(point1, point2, size, **kwargs) - if projection is None: - data = check_data(self.predict(coordinates)) - else: - data = check_data(self.predict(projection(*coordinates))) + data = check_data(self.predict(coordinates)) + # Project the coordinates back to have geographic coordinates for the + # profile but Cartesian distances. 
+ if projection is not None: + coordinates = projection(*coordinates, inverse=True) data_names = get_data_names(data, data_names) columns = [ (dims[0], coordinates[1]), diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index c95eef98e..4f2f61d70 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -128,6 +128,7 @@ def test_basegridder(): coordinates_true = grid_coordinates(region, shape) data_true = angular * coordinates_true[0] + linear grid = grd.grid(region, shape) + prof = grd.profile((0, -10), (10, -10), 30) npt.assert_allclose(grd.coefs_, [linear, angular]) npt.assert_allclose(grid.scalars.values, data_true) @@ -135,40 +136,68 @@ def test_basegridder(): npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) npt.assert_allclose(grd.scatter(region, 1000, random_state=0).scalars, data) npt.assert_allclose( - grd.profile((0, 0), (10, 0), 30).scalars, - angular * coordinates_true[0][0, :] + linear, + prof.scalars, angular * coordinates_true[0][0, :] + linear, ) + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][0, :]) + npt.assert_allclose(prof.distance, coordinates_true[0][0, :]) def test_basegridder_projection(): "Test basic functionality of BaseGridder when passing in a projection" + # Lets say the projection is doubling the coordinates + def proj(lon, lat, inverse=False): + "Project from the new coordinates to the original" + if inverse: + return (lon / 2, lat / 2) + return (lon * 2, lat * 2) + + # Values in "geographic" coordinates region = (0, 10, -10, -5) - shape = (50, 30) + shape = (51, 31) angular, linear = 2, 100 coordinates = scatter_points(region, 1000, random_state=0) data = angular * coordinates[0] + linear + # Project before passing to our Cartesian gridder + grd = PolyGridder().fit(proj(coordinates[0], coordinates[1]), data) + + # Check the estimated coefficients + # The grid is estimated in projected coordinates (which are twice as large) + # so the rate of change (angular) should be half to get the same values. + npt.assert_allclose(grd.coefs_, [linear, angular / 2]) + + # The actual values for a grid coordinates_true = grid_coordinates(region, shape) data_true = angular * coordinates_true[0] + linear - grd = PolyGridder().fit(coordinates, data) - # Lets say we want to specify the region for a grid using a coordinate - # system that is lon/2, lat/2. 
- def proj(lon, lat): - "Project from the new coordinates to the original" - return (lon * 2, lat * 2) - - proj_region = [i / 2 for i in region] - grid = grd.grid(proj_region, shape, projection=proj) - scat = grd.scatter(proj_region, 1000, random_state=0, projection=proj) - prof = grd.profile((0, 0), (5, 0), 30, projection=proj) + # Check the scatter + scat = grd.scatter(region, 1000, random_state=0, projection=proj) + npt.assert_allclose(scat.scalars, data) + npt.assert_allclose(scat.easting, coordinates[0]) + npt.assert_allclose(scat.northing, coordinates[1]) - npt.assert_allclose(grd.coefs_, [linear, angular]) + # Check the grid + grid = grd.grid(region, shape, projection=proj) npt.assert_allclose(grid.scalars.values, data_true) - npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :] / 2) - npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0] / 2) - npt.assert_allclose(scat.scalars, data) - npt.assert_allclose(prof.scalars, angular * coordinates_true[0][0, :] + linear) + npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :]) + npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) + + # Check the profile + prof = grd.profile( + (region[0], region[-1]), (region[1], region[-1]), shape[1], projection=proj + ) + npt.assert_allclose(prof.scalars, data_true[-1, :]) + # Coordinates should still be evenly spaced since the projection is a + # multiplication. + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][-1, :]) + # Distance should still be in the projected coordinates. If the projection + # is from geographic, we shouldn't be returning distances in degrees but in + # projected meters. The distances will be evenly spaced in unprojected + # coordinates. + distance_true = np.linspace(region[0] * 2, region[1] * 2, shape[1]) + npt.assert_allclose(prof.distance, distance_true) def test_check_fit_input(): From f8e593a3a48b3c1f4e2343061b02c2b1e25eb6eb Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Fri, 27 Mar 2020 19:56:05 +0000 Subject: [PATCH 12/38] Update TravisCI configuration to new formats (#247) Travis started warning that some options in the config were deprecated. Update them to the new format to avoid crashing builds later on. --- .travis.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 42de0e256..69abef85e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ # We use miniconda for Python so don't need any Python specific tools language: generic -# Use the container builds so we don't need sudo priviledges -sudo: false +os: linux +dist: xenial # Only build pushes to the master branch and tags. This avoids the double # builds than happen when working on a branch instead of a fork. @@ -33,7 +33,7 @@ env: - DEPLOY_PYPI=false # Specify the build configurations. Be sure to only deploy from a single build. 
-matrix:
+jobs:
   include:
     - name: "Linux - Python 3.8"
       os: linux
@@ -109,14 +109,14 @@ deploy:
   # Push the built HTML in doc/_build/html to the gh-pages branch
   - provider: script
     script: continuous-integration/travis/deploy-gh-pages.sh
-    skip_cleanup: true
+    cleanup: false
     on:
       branch: master
       condition: '$DEPLOY_DOCS == "true"'
   # Push HTML when building tags as well
   - provider: script
     script: continuous-integration/travis/deploy-gh-pages.sh
-    skip_cleanup: true
+    cleanup: false
     on:
       tags: true
       condition: '$DEPLOY_DOCS == "true"'

From ff64a27fd17a49df70397a95d740909a40416768 Mon Sep 17 00:00:00 2001
From: Leonardo Uieda
Date: Fri, 3 Apr 2020 18:54:12 +0100
Subject: [PATCH 13/38] Add function to project 2D gridded data (#246)

The `project_grid` function transforms a grid to the given projection. It
re-samples the data using `ScipyGridder` (by default) and runs a blocked mean
(optional) to avoid aliasing when the points aren't evenly distributed in the
projected coordinates (like in polar projections). It always applies a
`convexhull_mask` to the grid to avoid extrapolation to points that had no
original data (scipy already does this for linear and cubic interpolation).
The function only works for `xarray.DataArray`s (not `Dataset`) because I
couldn't find an easy way to guarantee that all variables have the same
dimensions.

Move the new function and `project_region` to the new module
`verde/projections.py`.
---
 doc/api/index.rst               |  10 +-
 examples/project_grid.py        |  78 ++++++++++++++++
 verde/__init__.py               |   2 +-
 verde/coordinates.py            |  90 +++++++++++-------
 verde/projections.py            | 161 ++++++++++++++++++++++++++++++++
 verde/tests/test_projections.py | 115 +++++++++++++++++++++++
 6 files changed, 420 insertions(+), 36 deletions(-)
 create mode 100644 examples/project_grid.py
 create mode 100644 verde/projections.py
 create mode 100644 verde/tests/test_projections.py

diff --git a/doc/api/index.rst b/doc/api/index.rst
index bcc49d8bc..39a5b530a 100644
--- a/doc/api/index.rst
+++ b/doc/api/index.rst
@@ -57,12 +57,20 @@ Coordinate Manipulation
     profile_coordinates
     get_region
     pad_region
-    project_region
     inside
     block_split
     rolling_window
     expanding_window

+Projection
+----------
+
+.. autosummary::
+    :toctree: generated/
+
+    project_region
+    project_grid
+
 Masking
 -------

diff --git a/examples/project_grid.py b/examples/project_grid.py
new file mode 100644
index 000000000..5e80b4915
--- /dev/null
+++ b/examples/project_grid.py
@@ -0,0 +1,78 @@
+"""
+Projection of gridded data
+==========================
+
+Sometimes gridded data products need to be projected before they can be used.
+For example, you might want to project the topography of Antarctica from
+geographic latitude and longitude to a planar polar stereographic projection
+before doing your analysis. When projecting, the data points will likely not
+fall on a regular grid anymore and must be interpolated (re-sampled) onto a new
+grid.
+
+The :func:`verde.project_grid` function automates this process using the
+interpolation methods available in Verde. An input grid
+(:class:`xarray.DataArray`) is interpolated onto a new grid in the given
+`pyproj `__ projection. The function takes
+care of choosing a default grid spacing and region, running a blocked mean to
+avoid spatial aliasing (using :class:`~verde.BlockReduce`), and masking the
+points in the new grid that aren't constrained by the original data (using
+:func:`~verde.convexhull_mask`).
+
+In this example, we'll generate a synthetic geographic grid with a checkerboard
+pattern around the South pole.
We'll project the grid to South Polar +Stereographic, revealing the checkered pattern of the data. + +.. note:: + + The interpolation methods are limited to what is available in Verde and + there is only support for single 2D grids. For more sophisticated use + cases, you might want to try + `pyresample `__ instead. + +""" +import numpy as np +import matplotlib.pyplot as plt +import pyproj +import verde as vd + + +# We'll use synthetic data near the South pole to highlight the effects of the +# projection. EPSG 3031 is a South Polar Stereographic projection. +projection = pyproj.Proj("epsg:3031") + +# Create a synthetic geographic grid using a checkerboard pattern +region = (0, 360, -90, -60) +spacing = 0.25 +wavelength = 10 * 1e5 # The size of the cells in the checkerboard +checkerboard = vd.datasets.CheckerBoard( + region=vd.project_region(region, projection), w_east=wavelength, w_north=wavelength +) +data = checkerboard.grid( + region=region, + spacing=spacing, + projection=projection, + data_names=["checkerboard"], + dims=("latitude", "longitude"), +) +print("Geographic grid:") +print(data) + +# Do the projection while setting the output grid spacing (in projected meters). Set +# the coordinates names to x and y since they aren't really "northing" or +# "easting". +polar_data = vd.project_grid( + data.checkerboard, projection, spacing=0.5 * 1e5, dims=("y", "x") +) +print("\nProjected grid:") +print(polar_data) + +# Plot the original and projected grids +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) +data.checkerboard.plot( + ax=ax1, cbar_kwargs=dict(orientation="horizontal", aspect=50, pad=0.1) +) +ax1.set_title("Geographic Grid") +polar_data.plot(ax=ax2, cbar_kwargs=dict(orientation="horizontal", aspect=50, pad=0.1)) +ax2.set_title("Polar Stereographic Grid") +plt.tight_layout() +plt.show() diff --git a/verde/__init__.py b/verde/__init__.py index a77e65cdc..c0b5cf5bb 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -12,7 +12,6 @@ profile_coordinates, get_region, pad_region, - project_region, longitude_continuity, ) from .mask import distance_mask, convexhull_mask @@ -26,6 +25,7 @@ from .spline import Spline, SplineCV from .model_selection import cross_val_score, train_test_split from .vector import Vector, VectorSpline2D +from .projections import project_region, project_grid def test(doctest=True, verbose=True, coverage=False, figures=True): diff --git a/verde/coordinates.py b/verde/coordinates.py index ea6da7cdd..b43ebd3ca 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -110,40 +110,6 @@ def pad_region(region, pad): return padded -def project_region(region, projection): - """ - Calculate the bounding box of a region in projected coordinates. - - Parameters - ---------- - region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. - projection : callable or None - If not None, then should be a callable object (like a function) - ``projection(easting, northing) -> (proj_easting, proj_northing)`` that - takes in easting and northing coordinate arrays and returns projected - northing and easting coordinate arrays. - - Returns - ------- - proj_region : list = [W, E, S, N] - The bounding box of the projected region. - - Examples - -------- - - >>> def projection(x, y): - ... 
return (2*x, -1*y) - >>> project_region((3, 5, -9, -4), projection) - (6.0, 10.0, 4.0, 9.0) - - """ - east, north = grid_coordinates(region, shape=(101, 101)) - east, north = projection(east.ravel(), north.ravel()) - return (east.min(), east.max(), north.min(), north.max()) - - def scatter_points(region, size, random_state=None, extra_coords=None): """ Generate the coordinates for a random scatter of points. @@ -536,6 +502,62 @@ def spacing_to_shape(region, spacing, adjust): return (nnorth, neast), (w, e, s, n) +def shape_to_spacing(region, shape, pixel_register=False): + """ + Calculate the spacing of a grid given region and shape. + + Parameters + ---------- + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. + shape : tuple = (n_north, n_east) or None + The number of points in the South-North and West-East directions, + respectively. + pixel_register : bool + If True, the coordinates will refer to the center of each grid pixel + instead of the grid lines. In practice, this means that there will be + one less element per dimension of the grid when compared to grid line + registered (only if given *spacing* and not *shape*). Default is False. + + Returns + ------- + spacing : tuple = (s_north, s_east) + The grid spacing in the South-North and West-East directions, + respectively. + + Examples + -------- + + >>> spacing = shape_to_spacing([0, 10, -5, 1], (7, 11)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 1.0 + >>> spacing = shape_to_spacing([0, 10, -5, 1], (14, 11)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 0.5, 1.0 + >>> spacing = shape_to_spacing([0, 10, -5, 1], (7, 21)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 0.5 + >>> spacing = shape_to_spacing( + ... [-0.5, 10.5, -5.5, 1.5], (7, 11), pixel_register=True, + ... ) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 1.0 + >>> spacing = shape_to_spacing( + ... [-0.25, 10.25, -5.5, 1.5], (7, 21), pixel_register=True, + ... ) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 0.5 + + """ + spacing = [] + for i, n_points in enumerate(reversed(shape)): + if not pixel_register: + n_points -= 1 + spacing.append((region[2 * i + 1] - region[2 * i]) / n_points) + return tuple(reversed(spacing)) + + def profile_coordinates(point1, point2, size, extra_coords=None): """ Coordinates for a profile along a straight line between two points. diff --git a/verde/projections.py b/verde/projections.py new file mode 100644 index 000000000..a8a580fc4 --- /dev/null +++ b/verde/projections.py @@ -0,0 +1,161 @@ +""" +Operations with projections for grids, regions, etc. +""" +import numpy as np + +from .coordinates import grid_coordinates, get_region, shape_to_spacing, check_region +from .utils import grid_to_table +from .scipygridder import ScipyGridder +from .blockreduce import BlockReduce +from .chain import Chain +from .mask import convexhull_mask + + +def project_region(region, projection): + """ + Calculate the bounding box of a region in projected coordinates. + + Parameters + ---------- + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. + projection : callable + Should be a callable object (like a function) ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected northing and easting + coordinate arrays. + + Returns + ------- + proj_region : list = [W, E, S, N] + The bounding box of the projected region. 
+ + Examples + -------- + + >>> def projection(x, y): + ... return (2*x, -1*y) + >>> project_region((3, 5, -9, -4), projection) + (6.0, 10.0, 4.0, 9.0) + + """ + east, north = grid_coordinates(region, shape=(101, 101)) + east, north = projection(east.ravel(), north.ravel()) + return (east.min(), east.max(), north.min(), north.max()) + + +def project_grid(grid, projection, method="linear", antialias=True, **kwargs): + """ + Apply the given map projection to a grid and re-sample it. + + Creates a new grid in the projected coordinates by interpolating the + original values using the chosen *method* (linear by default). Before + interpolation, apply a blocked mean operation (:class:`~verde.BlockReduce`) + to avoid aliasing when the projected coordinates become oversampled in some + regions (which would cause the interpolation to down-sample the original + data). For example, applying a polar projection results in oversampled data + close to the pole. + + Points that fall outside the convex hull of the original data will be + masked (see :func:`~verde.convexhull_mask`) since they are not constrained + by any data points. + + Any arguments that can be passed to the + :meth:`~verde.base.BaseGridder.grid` method of Verde gridders can be passed + to this function as well. Use this to set a region and spacing (or shape) + for the projected grid. The region and spacing must be in *projected + coordinates*. + + If no region is provided, the bounding box of the projected data will be + used. If no spacing or shape is provided, the shape of the input *grid* + will be used for the projected grid. + + By default, the ``data_names`` argument will be set to the name of the data + variable of the input *grid* (if it has been set). + + .. note:: + + The interpolation methods are limited to what is available in Verde and + there is only support for single 2D grids. For more sophisticated use + cases, you might want to try + `pyresample `__ instead. + + Parameters + ---------- + grid : :class:`xarray.DataArray` + A single 2D grid of values. The first dimension is assumed to be the + northing/latitude dimension while the second is assumed to be the + easting/longitude dimension. + projection : callable + Should be a callable object (like a function) ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected northing and easting + coordinate arrays. + method : string or Verde gridder + If a string, will use it to create a :class:`~verde.ScipyGridder` with + the corresponding method (nearest, linear, or cubic). Otherwise, should + be a gridder/estimator object, like :class:`~verde.Spline`. Default is + ``"linear"``. + antialias : bool + If True, will run a :class:`~verde.BlockReduce` with a mean function to + avoid aliasing when the projection results in oversampling of the data + in some areas (for example, in polar projections). If False, will not + run the blocked mean. + + Returns + ------- + projected_grid : :class:`xarray.DataArray` + The projected grid, interpolated with the given parameters. + + """ + if hasattr(grid, "data_vars"): + raise ValueError( + "Projecting xarray.Dataset is not currently supported. " + "Please provide a DataArray instead." 
+ ) + if len(grid.dims) != 2: + raise ValueError( + "Projecting grids with number of dimensions other than 2 is not " + "currently supported (dimensions of the given DataArray: {}).".format( + len(grid.dims) + ) + ) + + # Can be set to None for some data arrays depending on how they are created + # so we can't just rely on the default value for getattr. + name = getattr(grid, "name", None) + if name is None: + name = "scalars" + + data = grid_to_table(grid).dropna() + coordinates = projection(data[grid.dims[1]].values, data[grid.dims[0]].values) + data_region = get_region(coordinates) + + region = kwargs.pop("region", data_region) + shape = kwargs.pop("shape", grid.shape) + spacing = kwargs.pop("spacing", shape_to_spacing(region, shape)) + + check_region(region) + + steps = [] + if antialias: + steps.append( + ("mean", BlockReduce(np.mean, spacing=spacing, region=data_region)) + ) + if isinstance(method, str): + steps.append(("spline", ScipyGridder(method))) + else: + steps.append(("spline", method)) + interpolator = Chain(steps) + interpolator.fit(coordinates, data[name]) + + projected = interpolator.grid( + region=region, + spacing=spacing, + data_names=kwargs.pop("data_names", [name]), + **kwargs + ) + if method not in ["linear", "cubic"]: + projected = convexhull_mask(coordinates, grid=projected) + return projected[name] diff --git a/verde/tests/test_projections.py b/verde/tests/test_projections.py new file mode 100644 index 000000000..ed6af1858 --- /dev/null +++ b/verde/tests/test_projections.py @@ -0,0 +1,115 @@ +""" +Test the projection functions. +""" +import numpy.testing as npt +import numpy as np +import xarray as xr +import pytest + +from ..scipygridder import ScipyGridder +from ..projections import project_grid + + +def projection(longitude, latitude): + "Dummy projection" + return longitude ** 2, latitude ** 2 + + +@pytest.mark.parametrize( + "method", + ["nearest", "linear", "cubic", ScipyGridder("nearest")], + ids=["nearest", "linear", "cubic", "gridder"], +) +def test_project_grid(method): + "Use a simple projection to test that the output is as expected" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + proj = project_grid(grid, projection, method=method) + assert proj.dims == ("northing", "easting") + assert proj.name == "scalars" + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + # Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +def test_project_grid_name(): + "Check that grid name is kept" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray( + data, coords=[lons, lats], dims=("latitude", "longitude"), name="yara" + ) + proj = project_grid(grid, projection) + assert proj.name == "yara" + assert proj.dims == ("northing", "easting") + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + 
# Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +@pytest.mark.parametrize("antialias", [True, False]) +def test_project_grid_antialias(antialias): + "Check if antialias is being used" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + proj = project_grid(grid, projection, antialias=antialias) + if antialias: + assert "BlockReduce" in proj.attrs["metadata"] + else: + assert "BlockReduce" not in proj.attrs["metadata"] + assert proj.dims == ("northing", "easting") + assert proj.name == "scalars" + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + # Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +def test_project_grid_fails_dataset(): + "Should raise an exception when given a Datatset" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + grid = grid.to_dataset(name="scalars") + with pytest.raises(ValueError): + project_grid(grid, projection) + + +@pytest.mark.parametrize("ndims", [1, 3]) +def test_project_grid_fails_dimensions(ndims): + "Should raise an exception when given more or less than 2 dimensions" + shape = (10, 20, 12) + coords = [ + np.linspace(-10, 2, shape[0]), + np.linspace(2, 10, shape[1]), + np.linspace(0, 100, shape[2]), + ] + dims = ("height", "latitude", "longitude") + data = np.ones(shape[:ndims], dtype="float") + grid = xr.DataArray(data, coords=coords[:ndims], dims=dims[:ndims]) + with pytest.raises(ValueError): + project_grid(grid, projection) From 116e8a146ea8ac3d3267f851bc686ea8ea196016 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 6 Apr 2020 17:33:55 +0100 Subject: [PATCH 14/38] Changelog entry for v1.4.0 (#248) New minor release with new features and bug fixes. No breaking changes or deprecations. --- README.rst | 1 + doc/changes.rst | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/README.rst b/README.rst index bba8ab287..ed747ef24 100644 --- a/README.rst +++ b/README.rst @@ -159,6 +159,7 @@ Documentation for other versions * `Development `__ (reflects the *master* branch on Github) * `Latest release `__ +* `v1.4.0 `__ * `v1.3.0 `__ * `v1.2.0 `__ * `v1.1.0 `__ diff --git a/doc/changes.rst b/doc/changes.rst index c05852597..13191b2a8 100644 --- a/doc/changes.rst +++ b/doc/changes.rst @@ -3,6 +3,81 @@ Changelog ========= +Version 1.4.0 +------------- + +*Released on: 2020/04/06* + +.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3739449.svg + :target: https://doi.org/10.5281/zenodo.3739449 + +Bug fixes: + +* **Profile distances are now returned in projected (Cartesian) coordinates by + the** ``profile`` **method of gridders if a projection is given.** The method + has the option to apply a projection to the coordinates before predicting so + we can pass geographic coordinates to Cartesian gridders. 
In these cases, the + distance along the profile is calculated by the ``profile_coordinates`` + function with the unprojected coordinates (in the geographic case it would be + degrees). The profile point calculation is also done assuming that + coordinates are Cartesian, which is clearly wrong if inputs are longitude and + latitude. To fix this, we now project the input points prior to passing them + to ``profile_coordinates``. This means that the distances are Cartesian and + generation of profile points is also Cartesian (as is assumed by the + function). The generated coordinates are projected back so that the user gets + longitude and latitude but distances are still projected Cartesian meters. + (`#231 `__) +* **Function** ``verde.grid_to_table`` **now sets the correct order for + coordinates.** We were relying on the order of the ``coords`` attribute of + the ``xarray.Dataset`` for the order of the coordinates. This is wrong + because xarray takes the coordinate order from the ``dims`` attribute + instead, which is what we should also have been doing. + (`#229 `__) + +Documentation: + +* Generalize coordinate system specifications in ``verde.base.BaseGridder`` + docstrings. Most methods don't really depend on the coordinate system so use + a more generic language to allow derived classes to specify their coordinate + systems without having to overload the base methods just to rewrite the + docstrings. + (`#240 `__) + +New features: + +* New function ``verde.convexhull_mask`` to mask points in a grid that fall + outside the convex hull defined by data points. + (`#237 `__) +* New function ``verde.project_grid`` that transforms 2D gridded data using a + given projection. It re-samples the data using ``ScipyGridder`` (by default) + and runs a blocked mean (optional) to avoid aliasing when the points aren't + evenly distributed in the projected coordinates (like in polar projections). + Finally, it applies a ``convexhull_mask`` to the grid to avoid extrapolation + to points that had no original data. + (`#246 `__) +* New function ``verde.expanding_window`` for selecting data that falls inside + of an expanding window around a central point. + (`#238 `__) +* New function ``verde.rolling_window`` for rolling window selections of + irregularly sampled data. + (`#236 `__) + +Improvements: + +* Allow ``verde.grid_to_table`` to take ``xarray.DataArray`` as input. + (`#235 `__) + +Maintenance: + +* Use newer MacOS images on Azure Pipelines. + (`#234 `__) + +This release contains contributions from: + +* Leonardo Uieda +* Santiago Soler +* Jesse Pisel + Version 1.3.0 ------------- From 9884a274fc7cc49e213345d21f17d960082eaef0 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 6 Apr 2020 18:18:49 +0100 Subject: [PATCH 15/38] Fix broken TravisCI deploy (#250) The docs and configuration validation tool had recommended using `cleanup: false` instead of `skip_cleanup: true` when deploying. Turns out Travis decided to ignore the new parameter and our Github Pages deploy was broken since the docs were deleted before we could deploy. Revert back to the old argument until Travis really removes it. 
--- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 69abef85e..f417298e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -109,14 +109,14 @@ deploy: # Push the built HTML in doc/_build/html to the gh-pages branch - provider: script script: continuous-integration/travis/deploy-gh-pages.sh - cleanup: false + skip_cleanup: true on: branch: master condition: '$DEPLOY_DOCS == "true"' # Push HTML when building tags as well - provider: script script: continuous-integration/travis/deploy-gh-pages.sh - cleanup: false + skip_cleanup: true on: tags: true condition: '$DEPLOY_DOCS == "true"' From f768bab72da5f85fd7b8e662de9c255e672ff229 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Fri, 10 Apr 2020 09:41:46 +0100 Subject: [PATCH 16/38] Add a block version of ShuffleSplit cross-validator (#251) Cross-validation of spatial data with random splits can often lead to overestimation of accuracy (see https://doi.org/10.1111/ecog.02881 for a nice overview). To account for this, the splits can be done by spatial blocks. In this case, the data are partitioned into blocks and blocks are split randomly. That guarantees some independence of measurements between blocks assigned to test/train sets. This change adds the `BlockShuffleSplit` cross-validator which does the random splitting of blocks in a scikit-learn compatible class. It can be used with `verde.cross_val_score` like any other scikit-learn cross- validator. In fact, it can be used entirely outside of Verde for any type of machine learning on spatial data. The class takes care of balancing the random splits so that the right amount of train/test data are included in each (since blocks can have wildly different numbers of points). --- doc/api/index.rst | 1 + doc/references.rst | 2 + verde/__init__.py | 2 +- verde/model_selection.py | 307 +++++++++++++++++++++++++++- verde/tests/test_model_selection.py | 45 +++- 5 files changed, 346 insertions(+), 11 deletions(-) diff --git a/doc/api/index.rst b/doc/api/index.rst index 39a5b530a..9a2b3b1a6 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -45,6 +45,7 @@ Model Selection train_test_split cross_val_score + BlockShuffleSplit Coordinate Manipulation ----------------------- diff --git a/doc/references.rst b/doc/references.rst index 38a4bbcf1..10ae0bc7e 100644 --- a/doc/references.rst +++ b/doc/references.rst @@ -1,5 +1,7 @@ References ========== +.. [Roberts_etal2017] Roberts, D. R., Bahn, V., Ciuti, S., Boyce, M. S., Elith, J., Guillera‐Arroita, G., et al. (2017). Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure. Ecography, 40(8), 913–929. https://doi.org/10.1111/ecog.02881 .. [Sandwell1987] Sandwell, D. T. (1987). Biharmonic spline interpolation of GEOS-3 and SEASAT altimeter data. Geophysical Research Letters, 14(2), 139–142. https://doi.org/10.1029/GL014i002p00139 .. [SandwellWessel2016] Sandwell, D. T., & Wessel, P. (2016). Interpolation of 2-D vector data using constraints from elasticity. Geophysical Research Letters, 43(20), 2016GL070340. https://doi.org/10.1002/2016GL070340 +.. [Valavi_etal2019] Valavi, R., Elith, J., Lahoz‐Monfort, J. J., & Guillera‐Arroita, G. (2019). blockCV: An r package for generating spatially or environmentally separated folds for k-fold cross-validation of species distribution models. Methods in Ecology and Evolution, 10(2), 225–232. 
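A rough sketch of how the new cross-validator plugs into `verde.cross_val_score`
(the synthetic data and parameter values below are only for illustration):

    import numpy as np
    import verde as vd

    # Synthetic scattered data that follow a simple linear trend.
    coordinates = vd.scatter_points(region=(0, 10, -10, 0), size=500, random_state=0)
    data = 0.5 * coordinates[0] - 0.2 * coordinates[1]
    # Shuffle 2 x 2 (data units) blocks of points into train/test sets instead
    # of shuffling individual points, which reduces the optimistic bias caused
    # by spatial autocorrelation.
    cross_validator = vd.BlockShuffleSplit(spacing=2, n_splits=5, random_state=0)
    scores = vd.cross_val_score(
        vd.Trend(degree=1), coordinates, data, cv=cross_validator
    )
    print(np.mean(scores))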
https://doi.org/10.1111/2041-210X.13107 diff --git a/verde/__init__.py b/verde/__init__.py index c0b5cf5bb..0fa4bee34 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -23,7 +23,7 @@ from .trend import Trend from .chain import Chain from .spline import Spline, SplineCV -from .model_selection import cross_val_score, train_test_split +from .model_selection import cross_val_score, train_test_split, BlockShuffleSplit from .vector import Vector, VectorSpline2D from .projections import project_region, project_grid diff --git a/verde/model_selection.py b/verde/model_selection.py index bc4c3958d..d692c5537 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -1,16 +1,14 @@ """ Functions for automating model selection through cross-validation. - -Supports using a dask.distributed.Client object for parallelism. The -DummyClient is used as a serial version of the parallel client. """ import warnings import numpy as np -from sklearn.model_selection import KFold, ShuffleSplit +from sklearn.model_selection import KFold, ShuffleSplit, BaseCrossValidator from sklearn.base import clone -from .base import check_fit_input +from .base import check_fit_input, n_1d_arrays +from .coordinates import block_split from .utils import dispatch @@ -18,6 +16,268 @@ warnings.simplefilter("default") +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class BlockShuffleSplit(BaseCrossValidator): + """ + Random permutation of spatial blocks cross-validator. + + Yields indices to split data into training and test sets. Data are first + grouped into rectangular blocks of size given by the *spacing* argument. + Alternatively, blocks can be defined by the number of blocks in each + dimension using the *shape* argument instead of *spacing*. The blocks are + then split into testing and training sets randomly. + + The proportion of blocks assigned to each set is controlled by *test_size* + and/or *train_size*. However, the total amount of actual data points in + each set could be different from these values since blocks can have + a different number of data points inside them. To guarantee that the + proportion of actual data is as close as possible to the proportion of + blocks, this cross-validator generates an extra number of splits and + selects the one with proportion of data points in each set closer to the + desired amount [Valavi_etal2019]_. The number of balancing splits per + iteration is controlled by the *balancing* argument. + + This cross-validator is preferred over + :class:`sklearn.model_selection.ShuffleSplit` for spatial data to avoid + overestimating cross-validation scores. This can happen because of the + inherent autocorrelation that is usually associated with this type of data + (points that are close together are more likely to have similar values). + See [Roberts_etal2017]_ for an overview of this topic. + + .. note:: + + Like :class:`sklearn.model_selection.ShuffleSplit`, this + cross-validator cannot guarantee that all folds will be different, + although this is still very likely for sizeable datasets. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. 
If None, then *spacing* **must be provided**. + n_splits : int, default 10 + Number of re-shuffling & splitting iterations. + test_size : float, int, None, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + train_size : float, int, or None, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + balancing : int + The number of splits generated per iteration to try to balance the + amount of data in each set so that *test_size* and *train_size* are + respected. If 1, then no extra splits are generated (essentially + disabling the balacing). Must be >= 1. + + See also + -------- + train_test_split : Split a dataset into a training and a testing set. + cross_val_score : Score an estimator/gridder using cross-validation. + + Examples + -------- + + >>> from verde import grid_coordinates + >>> import numpy as np + >>> # Make a regular grid of data points + >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) + >>> # Need to convert the coordinates into a feature matrix + >>> X = np.transpose([i.ravel() for i in coords]) + >>> shuffle = BlockShuffleSplit(spacing=1.5, n_splits=3, random_state=0) + >>> # These are the 1D indices of the points belonging to each set + >>> for train, test in shuffle.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + >>> # A better way to visualize this is to create a 2D array and put + >>> # "train" or "test" in the corresponding locations. + >>> shape = coords[0].shape + >>> mask = np.full(shape=shape, fill_value=" ") + >>> for iteration, (train, test) in enumerate(shuffle.split(X)): + ... # The index needs to be converted to 2D so we can index our matrix. + ... mask[np.unravel_index(train, shape)] = "train" + ... mask[np.unravel_index(test, shape)] = " test" + ... print("Iteration {}:".format(iteration)) + ... 
print(mask) + Iteration 0: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + [' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train']] + Iteration 1: + [[' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 2: + [['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + + """ + + def __init__( + self, + spacing=None, + shape=None, + n_splits=10, + test_size=0.1, + train_size=None, + random_state=None, + balancing=10, + ): + if spacing is None and shape is None: + raise ValueError("Either 'spacing' or 'shape' must be provided.") + if balancing < 1: + raise ValueError( + "The *balancing* argument must be >= 1. To disable balancing, use 1." + ) + self.spacing = spacing + self.shape = shape + self.n_splits = n_splits + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self.balancing = balancing + + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Generates integer indices corresponding to test sets. + + Runs several iterations until a split is found that yields blocks with + the right amount of data points in it. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + test : ndarray + The testing set indices for that split. + + """ + labels = block_split( + coordinates=(X[:, 0], X[:, 1]), + spacing=self.spacing, + shape=self.shape, + region=None, + adjust="spacing", + )[1] + block_ids = np.unique(labels) + # Generate many more splits so that we can pick and choose the ones + # that have the right balance of training and testing data. + shuffle = ShuffleSplit( + n_splits=self.n_splits * self.balancing, + test_size=self.test_size, + train_size=self.train_size, + random_state=self.random_state, + ).split(block_ids) + for _ in range(self.n_splits): + test_sets, balance = [], [] + for _ in range(self.balancing): + # This is a false positive in pylint which is why the warning + # is disabled at the top of this file: + # https://github.com/PyCQA/pylint/issues/1830 + # pylint: disable=stop-iteration-return + train_blocks, test_blocks = next(shuffle) + # pylint: enable=stop-iteration-return + train_points = np.where(np.isin(labels, block_ids[train_blocks]))[0] + test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] + # The proportion of data points assigned to each group should + # be close the proportion of blocks assigned to each group. + balance.append( + abs( + train_points.size / test_points.size + - train_blocks.size / test_blocks.size + ) + ) + test_sets.append(test_points) + best = np.argmin(balance) + yield test_sets[best] + + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. 
+ groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. + + """ + if X.shape[1] != 2: + raise ValueError( + "X must have exactly 2 columns ({} given).".format(X.shape[1]) + ) + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """ + Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + y : object + Always ignored, exists for compatibility. + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +# pylint: enable=invalid-name,unused-argument + + def train_test_split(coordinates, data, weights=None, **kwargs): r""" Split a dataset into a training and a testing set for cross-validation. @@ -48,6 +308,11 @@ def train_test_split(coordinates, data, weights=None, **kwargs): Each is a tuple = (coordinates, data, weights) generated by separating the input values randomly. + See also + -------- + cross_val_score : Score an estimator/gridder using cross-validation. + BlockShuffleSplit : Random permutation of spatial blocks cross-validator. + Examples -------- @@ -106,7 +371,8 @@ def cross_val_score( By default, will use :class:`sklearn.model_selection.KFold` with ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other - cross-validation class can be passed in through the *cv* argument. + cross-validation class from scikit-learn or Verde can be passed in through + the *cv* argument. Can optionally run in parallel using :mod:`dask`. To do this, use ``delayed=True`` to dispatch computations with :func:`dask.delayed` instead @@ -135,7 +401,7 @@ def cross_val_score( one data component is provided, you must provide a weights array for each data component (if not none). cv : None or cross-validation generator - Any scikit-learn cross-validation generator. Defaults to + Any scikit-learn or Verde cross-validation generator. Defaults to :class:`sklearn.model_selection.KFold`. client : None or dask.distributed.Client **DEPRECATED:** This option is deprecated and will be removed in Verde @@ -155,6 +421,11 @@ def cross_val_score( *delayed*, will be a list of Dask delayed objects (see the *delayed* option). If *client* is not None, then the scores will be futures. + See also + -------- + train_test_split : Split a dataset into a training and a testing set. + BlockShuffleSplit : Random permutation of spatial blocks cross-validator. + Examples -------- @@ -186,6 +457,24 @@ def cross_val_score( >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00 + Often, spatial data are autocorrelated (points that are close together are + more likely to have similar values), which can cause cross-validation with + random splits to overestimate the prediction accuracy [Roberts_etal2017]_. + To account for the autocorrelation, we can split the data in blocks rather + than randomly with :class:`verde.BlockShuffleSplit`: + + >>> from verde import BlockShuffleSplit + >>> # spacing controls the size of the spatial blocks + >>> cross_validator = BlockShuffleSplit( + ... spacing=2, n_splits=3, random_state=0 + ... 
) + >>> scores = cross_val_score(model, coords, data, cv=cross_validator) + >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) + 1.00, 1.00, 1.00 + + We didn't see a difference here since our model and data are perfect. See + :ref:`model_evaluation` for an example with real data. + If using many splits, we can speed up computations by running them in parallel with Dask: @@ -216,10 +505,10 @@ def cross_val_score( ) if cv is None: cv = KFold(shuffle=True, random_state=0, n_splits=5) - ndata = data[0].size + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) fit_args = (coordinates, data, weights) scores = [] - for train_index, test_index in cv.split(np.arange(ndata)): + for train_index, test_index in cv.split(feature_matrix): train = tuple(select(i, train_index) for i in fit_args) test = tuple(select(i, test_index) for i in fit_args) # Clone the estimator to avoid fitting the same object simultaneously diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py index 224a4993b..4093c4fca 100644 --- a/verde/tests/test_model_selection.py +++ b/verde/tests/test_model_selection.py @@ -1,12 +1,14 @@ """ Test the model selection code (cross-validation, etc). """ +import pytest from sklearn.model_selection import ShuffleSplit +import numpy as np import numpy.testing as npt from dask.distributed import Client from .. import Trend, grid_coordinates -from ..model_selection import cross_val_score +from ..model_selection import cross_val_score, BlockShuffleSplit def test_cross_val_score_client(): @@ -22,3 +24,44 @@ def test_cross_val_score_client(): client.close() assert len(scores) == nsplits npt.assert_allclose(scores, 1) + + +def test_blockshufflesplit_n_splits(): + "Make sure get_n_splits returns the correct value" + cv = BlockShuffleSplit(spacing=1, n_splits=14) + assert cv.get_n_splits() == 14 + + +def test_blockshufflesplit_fails_balancing(): + "Should raise an exception if balancing < 1." + with pytest.raises(ValueError): + BlockShuffleSplit(spacing=1, balancing=0) + + +def test_blockshufflesplit_fails_spacing_shape(): + "Should raise an exception if not given spacing or shape." + with pytest.raises(ValueError): + BlockShuffleSplit() + + +def test_blockshufflesplit_fails_data_shape(): + "Should raise an exception if the X array doesn't have 2 columns." + cv = BlockShuffleSplit(spacing=1) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 4)))) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 1)))) + + +@pytest.mark.parametrize("test_size", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9]) +def test_blockshufflesplit_balancing(test_size): + "Make sure that the sets have the right number of points" + coords = np.random.RandomState(seed=0).multivariate_normal( + mean=[5, -7.5], cov=[[4, 0], [0, 9]], size=1000, + ) + npoints = coords.shape[0] + train_size = 1 - test_size + cv = BlockShuffleSplit(spacing=1, random_state=0, test_size=test_size, balancing=20) + for train, test in cv.split(coords): + npt.assert_allclose(train.size / npoints, train_size, atol=0.01) + npt.assert_allclose(test.size / npoints, test_size, atol=0.01) From affe4c10d783f6df93f24c051a888184faf9b522 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 14 Apr 2020 11:38:38 -0300 Subject: [PATCH 17/38] Add optional argument to least_squares to copy Jacobian matrix (#255) Add new copy_jacobian optional argument to least_squares to avoid scaling Jacobian matrix inplace. 
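For reference, a minimal sketch of the intended usage (mirroring the new test; `least_squares` lives in `verde/base/least_squares.py`):

    import numpy as np
    from verde.base.least_squares import least_squares

    jacobian = np.identity(5)
    data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    # copy_jacobian=True keeps the original matrix intact (at the cost of
    # extra memory); the default False scales it inplace as before.
    least_squares(jacobian, data, weights=None, copy_jacobian=True)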
Add warning on its docstring warning about memory consumption when copy_jacobian is True. Add test function for this new feature. --- verde/base/least_squares.py | 13 +++++++++++-- verde/tests/test_base.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/verde/base/least_squares.py b/verde/base/least_squares.py index 81ec3b318..e2b738ad0 100644 --- a/verde/base/least_squares.py +++ b/verde/base/least_squares.py @@ -7,7 +7,7 @@ from sklearn.linear_model import LinearRegression, Ridge -def least_squares(jacobian, data, weights, damping=None): +def least_squares(jacobian, data, weights, damping=None, copy_jacobian=False): """ Solve a weighted least-squares problem with optional damping regularization @@ -17,6 +17,12 @@ def least_squares(jacobian, data, weights, damping=None): required for predictions. Doesn't normalize the column means because that operation can't be undone. + .. warning:: + + Setting `copy_jacobian` to True will copy the Jacobian matrix, doubling + the memory required. Use it only if the Jacobian matrix is needed + afterwards. + Parameters ---------- jacobian : 2d-array @@ -31,6 +37,9 @@ def least_squares(jacobian, data, weights, damping=None): damping : None or float The positive damping (Tikhonov 0th order) regularization parameter. If ``damping=None``, will use a regular least-squares fit. + copy_jacobian: bool + If False, the Jacobian matrix will be scaled inplace. If True, the + Jacobian matrix will be copied before scaling. Default False. Returns ------- @@ -44,7 +53,7 @@ def least_squares(jacobian, data, weights, damping=None): jacobian.shape ) ) - scaler = StandardScaler(copy=False, with_mean=False, with_std=True) + scaler = StandardScaler(copy=copy_jacobian, with_mean=False, with_std=True) jacobian = scaler.fit_transform(jacobian) if damping is None: regr = LinearRegression(fit_intercept=False, normalize=False) diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 4f2f61d70..88ba36949 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -6,6 +6,7 @@ import numpy.testing as npt import pytest +from ..base.least_squares import least_squares from ..base.utils import check_fit_input, check_coordinates from ..base.base_classes import ( BaseGridder, @@ -232,3 +233,16 @@ def test_check_fit_input_fails_weights(): check_fit_input(coords, data, weights) with pytest.raises(ValueError): check_fit_input(coords, (data, data), weights) + + +def test_least_squares_copy_jacobian(): + """ + Test if Jacobian matrix is copied or scaled inplace + """ + jacobian = np.identity(5) + original_jacobian = jacobian.copy() + data = np.array([1, 2, 3, 4, 5], dtype=float) + least_squares(jacobian, data, weights=None, copy_jacobian=True) + npt.assert_allclose(jacobian, original_jacobian) + least_squares(jacobian, data, weights=None) + assert not np.allclose(jacobian, original_jacobian) From 336d7e4386227f1ed5c62238cd341de292c08aa9 Mon Sep 17 00:00:00 2001 From: Fatiando a Terra Bot <50936856+fatiando-bot@users.noreply.github.com> Date: Wed, 15 Apr 2020 11:27:46 +0100 Subject: [PATCH 18/38] Update files from fatiando/contributing (#256) Changes have been made in https://github.com/fatiando/contributing to: MAINTENANCE.md Update the copies in this repository to match. 
See https://github.com/fatiando/contributing/commit/340dafd30d9d92b7bd61c57ffbe4009732b8b058 --- MAINTENANCE.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/MAINTENANCE.md b/MAINTENANCE.md index 8a8ba6b83..4858b1a8c 100644 --- a/MAINTENANCE.md +++ b/MAINTENANCE.md @@ -130,8 +130,11 @@ file of your project. Then: 1. Delete all existing files (they will be replaced with the new version). 2. Reserve a DOI and save the release draft. -3. Add as authors any new contributors who have added themselves to `AUTHORS.md`. -4. Update release date. +3. Add as authors any new contributors who have added themselves to + [`AUTHORS.md`](AUTHORS.md). +4. Review author order to make sure it follows the guidelines on our + [Authorship Guide](AUTHORSHIP.md) +5. Update release date. On the other hand, if you're making the first release of the project, you need to create a **New upload** for it inside the @@ -197,17 +200,13 @@ click on "Draft a new release": 1. Use the version number (including the `v`) as the "Tag version" and "Release title". -2. The first line of the release description should be the DOI of the release: +2. Fill the release description with a Markdown version of the **latest** + changelog entry (including the DOI badge). The `doc/changes.rst` file can be + converted to Markdown using `pandoc`: ``` - DOI: https://doi.org/ + pandoc -s doc/changes.rst -o changes.md --wrap=none ``` -3. Fill the release description with a Markdown version of the **latest** - changelog entry. The `doc/changes.rst` file can be converted to Markdown - using `pandoc`: - ``` - pandoc -s doc/changes.rst -o changes.md - ``` -4. Publish the release. +3. Publish the release. ### Archive on Zenodo From ba2131d028f943c9f48ecba2171e90d8be6471be Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Tue, 28 Apr 2020 16:29:00 +0100 Subject: [PATCH 19/38] Add a blocked option for train_test_split (#253) Add option `blocked=False` to `verde.train_test_split` to split along blocks instead of randomly. Uses `BlockShuffleSplit` if `True` instead of the scikit-learn `ShuffleSplit`. Add a gallery example comparing the two options. --- examples/train_test_split.py | 71 ++++++++++++++++++ verde/model_selection.py | 142 +++++++++++++++++++++++++---------- 2 files changed, 173 insertions(+), 40 deletions(-) create mode 100644 examples/train_test_split.py diff --git a/examples/train_test_split.py b/examples/train_test_split.py new file mode 100644 index 000000000..0ec9c06e5 --- /dev/null +++ b/examples/train_test_split.py @@ -0,0 +1,71 @@ +""" +Splitting data into train and test sets +======================================= + + +""" +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import verde as vd + +# Let's split the Baja California shipborne bathymetry data +data = vd.datasets.fetch_baja_bathymetry() +coordinates = (data.longitude, data.latitude) +values = data.bathymetry_m + +# Assign 20% of the data to the testing set. +test_size = 0.2 + +# Split the data randomly into training and testing. Set the random state +# (seed) so that we get the same result if running this code again. +train, test = vd.train_test_split( + coordinates, values, test_size=test_size, random_state=123 +) +# train and test are tuples = (coordinates, data, weights). +print("Train and test size for random splits:", train[0][0].size, test[0][0].size) + +# A better strategy for spatial data is to first assign the data to blocks and +# then split the blocks randomly. 
The size of the blocks is controlled by the +# 'spacing' argument. +train_block, test_block = vd.train_test_split( + coordinates, + values, + blocked=True, + spacing=0.5, + test_size=test_size, + random_state=213, +) +# Verde will attempt to balance the data between the splits so that the desired +# amount is assigned to the test set. It won't be exact since blocks contain +# different amounts of data points. +print( + "Train and test size for block splits: ", + train_block[0][0].size, + test_block[0][0].size, +) + +# Cartopy requires setting the coordinate reference system (CRS) of the +# original data through the transform argument. Their docs say to use +# PlateCarree to represent geographic data. +crs = ccrs.PlateCarree() + +# Make Mercator maps of the two different ways of splitting +fig, (ax1, ax2) = plt.subplots( + 1, 2, figsize=(10, 6), subplot_kw=dict(projection=ccrs.Mercator()) +) + +# Use an utility function to setup the tick labels and the land feature +vd.datasets.setup_baja_bathymetry_map(ax1) +vd.datasets.setup_baja_bathymetry_map(ax2) + +ax1.set_title("Random splitting") +ax1.plot(*train[0], ".b", markersize=1, transform=crs, label="Train") +ax1.plot(*test[0], ".r", markersize=1, transform=crs, label="Test", alpha=0.5) + +ax2.set_title("Blocked random splitting") +ax2.plot(*train_block[0], ".b", markersize=1, transform=crs, label="Train") +ax2.plot(*test_block[0], ".r", markersize=1, transform=crs, label="Test") +ax2.legend(loc="upper right") + +plt.subplots_adjust(wspace=0.15, top=1, bottom=0, left=0.05, right=0.95) +plt.show() diff --git a/verde/model_selection.py b/verde/model_selection.py index d692c5537..217cc87e8 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -278,16 +278,25 @@ def get_n_splits(self, X=None, y=None, groups=None): # pylint: enable=invalid-name,unused-argument -def train_test_split(coordinates, data, weights=None, **kwargs): +def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to - work on multi-component spatial data with optional weights. + work on single- or multi-component spatial data with optional weights. - Extra keyword arguments will be passed to - :class:`sklearn.model_selection.ShuffleSplit`, except for ``n_splits`` - which is always 1. + Extra keyword arguments will be passed to the cross-validation class: + :class:`sklearn.model_selection.ShuffleSplit` (random splits) if + ``block=False`` or :class:`verde.BlockShuffleSplit` (spatially blocked + random splits) if ``block=True``. The exception is ``n_splits`` which is + always 1. + + Using ``block=True`` is preferred over plain random splits for spatial data + to avoid overestimating validation scores. This can happen because of the + inherent autocorrelation that is usually associated with this type of data + (points that are close together are more likely to have similar values). + See [Roberts_etal2017]_ for an overview of this topic. In this case, you + **must provide** a *spacing* or *shape* argument as well (see below). Parameters ---------- @@ -301,6 +310,12 @@ def train_test_split(coordinates, data, weights=None, **kwargs): if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). 
+ block : bool + If True, will use :class:`verde.BlockShuffleSplit` as a cross-validator + to first split the data into spatial blocks and then split the blocks + randomly into training and testing sets. When using this option, a + *spacing* or *shape* must be provided as well to specify the size (or + number) of the spatial blocks. Returns ------- @@ -316,46 +331,93 @@ def train_test_split(coordinates, data, weights=None, **kwargs): Examples -------- + To randomly split the data between training and testing sets: + >>> import numpy as np - >>> # Split 2-component data with weights - >>> data = (np.array([1, 3, 5, 7]), np.array([0, 2, 4, 6])) + >>> # Make some data + >>> data = np.array([10, 30, 50, 70]) >>> coordinates = (np.arange(4), np.arange(-4, 0)) - >>> weights = (np.array([1, 1, 2, 1]), np.array([1, 2, 1, 1])) - >>> train, test = train_test_split(coordinates, data, weights, - ... random_state=0) - >>> print("Coordinates:", train[0], test[0], sep='\n ') - Coordinates: - (array([3, 1, 0]), array([-1, -3, -4])) - (array([2]), array([-2])) - >>> print("Data:", train[1], test[1], sep='\n ') - Data: - (array([7, 3, 1]), array([6, 2, 0])) - (array([5]), array([4])) - >>> print("Weights:", train[2], test[2], sep='\n ') - Weights: - (array([1, 1, 1]), array([1, 2, 1])) - (array([2]), array([1])) - >>> # Split single component data without weights - >>> train, test = train_test_split(coordinates, data[0], None, - ... random_state=0) - >>> print("Coordinates:", train[0], test[0], sep='\n ') - Coordinates: - (array([3, 1, 0]), array([-1, -3, -4])) - (array([2]), array([-2])) - >>> print("Data:", train[1], test[1], sep='\n ') - Data: - (array([7, 3, 1]),) - (array([5]),) - >>> print("Weights:", train[2], test[2], sep='\n ') - Weights: - (None,) - (None,) + >>> train, test = train_test_split(coordinates, data, random_state=0) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]),) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]),) + + If weights are given, they will also be split among the sets: + + >>> weights = np.array([4, 3, 2, 5]) + >>> train, test = train_test_split( + ... coordinates, data, weights, random_state=0, + ... 
) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]),) + >>> print("weights:", train[2]) + weights: (array([5, 3, 4]),) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]),) + >>> print("weights:", test[2]) + weights: (array([2]),) + + Data with multiple components can also be split: + + >>> data = (np.array([10, 30, 50, 70]), np.array([-70, -50, -30, -10])) + >>> train, test = train_test_split(coordinates, data, random_state=0) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]), array([-10, -50, -70])) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]), array([-30])) + + To split data grouped in spatial blocks: + + >>> from verde import grid_coordinates + >>> # Make a regular grid of data points + >>> coordinates = grid_coordinates(region=(0, 3, 4, 7), spacing=1) + >>> data = np.arange(16).reshape((4, 4)) + >>> # We must specify the size of the blocks via the spacing argument. + >>> # Blocks of 1.5 will split the domain into 4 blocks. + >>> train, test = train_test_split( + ... coordinates, data, random_state=0, blocked=True, spacing=1.5, + ... ) + >>> # The training set: + >>> print("coords:", train[0][0], train[0][1], sep="\n") + coords: + [0. 1. 2. 3. 0. 1. 2. 3. 2. 3. 2. 3.] + [4. 4. 4. 4. 5. 5. 5. 5. 6. 6. 7. 7.] + >>> print("data:", train[1]) + data: (array([ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15]),) + >>> # The testing set: + >>> print("coords:", test[0][0], test[0][1]) + coords: [0. 1. 0. 1.] [6. 6. 7. 7.] + >>> print("data:", test[1]) + data: (array([ 8, 9, 12, 13]),) """ args = check_fit_input(coordinates, data, weights, unpack=False) - ndata = args[1][0].size - indices = np.arange(ndata) - split = next(ShuffleSplit(n_splits=1, **kwargs).split(indices)) + if blocked: + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) + shuffle = BlockShuffleSplit(n_splits=1, **kwargs).split(feature_matrix) + else: + indices = np.arange(args[1][0].size) + shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) + split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test From 0fe83dc906138c56cb5d0d3bfe65c4ac1ee99055 Mon Sep 17 00:00:00 2001 From: Rowan Cockett Date: Wed, 6 May 2020 05:20:13 -0600 Subject: [PATCH 20/38] Fix typo in README contributing section (#258) Equality -> Equally --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index ed747ef24..e4b4bf2bf 100644 --- a/README.rst +++ b/README.rst @@ -130,7 +130,7 @@ What could you possibly offer? We assure you that the little voice in your head is wrong. **Being a contributor doesn't just mean writing code**. -Equality important contributions include: +Equally important contributions include: writing or proof-reading documentation, suggesting or implementing tests, or even giving feedback about the project (including giving feedback about the contribution process). From 644037ee5f6890f54430289eac5f5b0f654140f6 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 7 May 2020 09:28:38 +0100 Subject: [PATCH 21/38] Pin pylint to 2.4.* (#259) Version 2.5.0 introduced a bug with the multiprocessing option. 
This happens often enough that we should have pylint pinned and only upgrade when we choose to do it. --- .azure-pipelines.yml | 2 +- environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 914cc9397..f00e588e4 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -27,7 +27,7 @@ jobs: vmImage: 'ubuntu-16.04' variables: - CONDA_INSTALL_EXTRA: "black flake8 pylint" + CONDA_INSTALL_EXTRA: "black flake8 pylint==2.4.*" PYTHON: '3.7' steps: diff --git a/environment.yml b/environment.yml index 52801fc3e..346df383a 100644 --- a/environment.yml +++ b/environment.yml @@ -24,7 +24,7 @@ dependencies: - pytest-mpl - coverage - black - - pylint + - pylint=2.4.* - flake8 - sphinx=2.2.1 - sphinx_rtd_theme=0.4.3 From 491608ea3b4a4783cc47b79f8544facad3b6c0de Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Wed, 3 Jun 2020 15:30:23 -0300 Subject: [PATCH 22/38] Update tests to latest release of Scikit Learn (#267) Change the expected representation of any gridder based on the new default behaviour of Scikit Learn. After 0.23, Scikit Learn only shows those parameters whose default value has been changed when giving a string representation of the gridder. See scikit-learn/scikit-learn#17061 and its changelog for more information. --- verde/base/base_classes.py | 2 +- verde/tests/test_base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index cc988d015..c0e944f57 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -69,7 +69,7 @@ class BaseGridder(BaseEstimator): >>> # Fit the gridder to our synthetic data >>> grd = MeanGridder().fit((data.easting, data.northing), data.scalars) >>> grd - MeanGridder(multiplier=1) + MeanGridder() >>> # Interpolate on a regular grid >>> grid = grd.grid(region=(0, 5, -10, -8), shape=(30, 20)) >>> np.allclose(grid.scalars, -32.2182) diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 88ba36949..d637bd4dc 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -111,7 +111,7 @@ def test_basegridder(): BaseGridder().fit(None, None) grd = PolyGridder() - assert repr(grd) == "PolyGridder(degree=1)" + assert repr(grd) == "PolyGridder()" grd.degree = 2 assert repr(grd) == "PolyGridder(degree=2)" From c1ab70e9f73c03476f27c959b8fcd0daf0c971c1 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Wed, 3 Jun 2020 16:15:19 -0300 Subject: [PATCH 23/38] Gridders apply projections using only the first two coordinates (#264) Create a private function used by BaseGridder that applies projections only on the first two elements of the coordinates, but returns the whole set of coordinates, replacing the first two with their projected versions. Add a test function that passes extra_coords to include more than two coordinates when predicting through the grid, scatter and profile methods. 
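A rough sketch of what the new helper does, adapted from its doctest (`project_coordinates` is an internal function in `verde/base/base_classes.py`):

    from verde.base.base_classes import project_coordinates

    def projection(longitude, latitude, inverse=False):
        "Toy projection that only scales the horizontal coordinates"
        radius = 1000
        if inverse:
            return (longitude / radius, latitude / radius)
        return (longitude * radius, latitude * radius)

    # Only the first two coordinates are projected; any extra coordinates
    # (a height, for example) are passed through untouched.
    print(project_coordinates((1.0, -2.0, 3.0), projection))
    # (1000.0, -2000.0, 3.0)
    print(project_coordinates((-500.0, 1500.0, -19.0), projection, inverse=True))
    # (-0.5, 1.5, -19.0)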
--- verde/base/base_classes.py | 50 ++++++++++++++++++++++++++--- verde/tests/test_base.py | 64 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 5 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index c0e944f57..40e15fbf4 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -300,7 +300,9 @@ def grid( if projection is None: data = check_data(self.predict(coordinates)) else: - data = check_data(self.predict(projection(*coordinates))) + data = check_data( + self.predict(project_coordinates(coordinates, projection)) + ) data_names = get_data_names(data, data_names) coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} attrs = {"metadata": "Generated by {}".format(repr(self))} @@ -378,7 +380,9 @@ def scatter( if projection is None: data = check_data(self.predict(coordinates)) else: - data = check_data(self.predict(projection(*coordinates))) + data = check_data( + self.predict(project_coordinates(coordinates, projection)) + ) data_names = get_data_names(data, data_names) columns = [(dims[0], coordinates[1]), (dims[1], coordinates[0])] columns.extend(zip(data_names, data)) @@ -480,14 +484,14 @@ def profile( # geographic coordinates since we don't do actual distances on a # sphere). if projection is not None: - point1 = projection(*point1) - point2 = projection(*point2) + point1 = project_coordinates(point1, projection) + point2 = project_coordinates(point2, projection) coordinates, distances = profile_coordinates(point1, point2, size, **kwargs) data = check_data(self.predict(coordinates)) # Project the coordinates back to have geographic coordinates for the # profile but Cartesian distances. if projection is not None: - coordinates = projection(*coordinates, inverse=True) + coordinates = project_coordinates(coordinates, projection, inverse=True) data_names = get_data_names(data, data_names) columns = [ (dims[0], coordinates[1]), @@ -506,6 +510,42 @@ def _get_dims(self, dims): return self.dims +def project_coordinates(coordinates, projection, **kwargs): + """ + Apply projection to given coordiantes + + Allows to apply projections to any number of coordinates, assuming + that the first ones are ``longitude`` and ``latitude``. + + Examples + -------- + + >>> # Define a custom projection function + >>> def projection(lon, lat, inverse=False): + ... "Simple projection of geographic coordinates" + ... radius = 1000 + ... if inverse: + ... return (lon / radius, lat / radius) + ... return (lon * radius, lat * radius) + + >>> # Apply the projection to a set of coordinates containing: + >>> # longitude, latitude and height + >>> coordinates = (1., -2., 3.) + >>> project_coordinates(coordinates, projection) + (1000.0, -2000.0, 3.0) + + >>> # Apply the inverse projection + >>> coordinates = (-500.0, 1500.0, -19.0) + >>> project_coordinates(coordinates, projection, inverse=True) + (-0.5, 1.5, -19.0) + + """ + proj_coordinates = projection(*coordinates[:2], **kwargs) + if len(coordinates) > 2: + proj_coordinates += tuple(coordinates[2:]) + return proj_coordinates + + def get_data_names(data, data_names): """ Get default names for data fields if none are given based on the data. 
diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index d637bd4dc..ad8ea59c8 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -201,6 +201,70 @@ def proj(lon, lat, inverse=False): npt.assert_allclose(prof.distance, distance_true) +def test_basegridder_projection_multiple_coordinates(): + "Test BaseGridder when passing in a projection with multiple coordinates" + + # Lets say the projection is doubling the coordinates + def proj(lon, lat, inverse=False): + "Project from the new coordinates to the original" + if inverse: + return (lon / 2, lat / 2) + return (lon * 2, lat * 2) + + # Values in "geographic" coordinates + region = (0, 10, -10, -5) + shape = (51, 31) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0, extra_coords=(1, 2)) + data = angular * coordinates[0] + linear + # Project before passing to our Cartesian gridder + proj_coordinates = proj(coordinates[0], coordinates[1]) + coordinates[2:] + grd = PolyGridder().fit(proj_coordinates, data) + + # Check the estimated coefficients + # The grid is estimated in projected coordinates (which are twice as large) + # so the rate of change (angular) should be half to get the same values. + npt.assert_allclose(grd.coefs_, [linear, angular / 2]) + + # The actual values for a grid + coordinates_true = grid_coordinates(region, shape, extra_coords=(13, 17)) + data_true = angular * coordinates_true[0] + linear + + # Check the scatter + scat = grd.scatter( + region, 1000, random_state=0, projection=proj, extra_coords=(13, 17) + ) + npt.assert_allclose(scat.scalars, data) + npt.assert_allclose(scat.easting, coordinates[0]) + npt.assert_allclose(scat.northing, coordinates[1]) + + # Check the grid + grid = grd.grid(region, shape, projection=proj, extra_coords=(13, 17)) + npt.assert_allclose(grid.scalars.values, data_true) + npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :]) + npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) + + # Check the profile + prof = grd.profile( + (region[0], region[-1]), + (region[1], region[-1]), + shape[1], + projection=proj, + extra_coords=(13, 17), + ) + npt.assert_allclose(prof.scalars, data_true[-1, :]) + # Coordinates should still be evenly spaced since the projection is a + # multiplication. + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][-1, :]) + # Distance should still be in the projected coordinates. If the projection + # is from geographic, we shouldn't be returning distances in degrees but in + # projected meters. The distances will be evenly spaced in unprojected + # coordinates. + distance_true = np.linspace(region[0] * 2, region[1] * 2, shape[1]) + npt.assert_allclose(prof.distance, distance_true) + + def test_check_fit_input(): "Make sure no exceptions are raised for standard cases" size = 20 From 2c260c585e88fd72fd9f3ba7c58b1a536bff43ba Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 4 Jun 2020 14:22:32 +0100 Subject: [PATCH 24/38] Remove "blocked" argument from train_test_split (#257) It was a bit redundant since we have to specify `shape` or `spacing` along with it. So we can just ask for `spacing` or `shape` and assume `blocked=True` if they are provided. Also fix the missing description in the gallery example and make slight changes to its wording. 
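With this change, a blocked split is requested simply by passing the block size, as in this quick sketch adapted from the updated doctest:

    import numpy as np
    import verde as vd

    coordinates = vd.grid_coordinates(region=(0, 3, 4, 7), spacing=1)
    data = np.arange(16).reshape((4, 4))
    # Providing spacing (or shape) implies splitting over spatial blocks,
    # so a separate blocked=True flag is no longer needed.
    train, test = vd.train_test_split(
        coordinates, data, spacing=1.5, random_state=0
    )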
Co-authored-by: Santiago Soler --- examples/train_test_split.py | 44 +++++++++++++++++---------- verde/model_selection.py | 58 ++++++++++++++++++++++-------------- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/examples/train_test_split.py b/examples/train_test_split.py index 0ec9c06e5..88942fb97 100644 --- a/examples/train_test_split.py +++ b/examples/train_test_split.py @@ -2,7 +2,26 @@ Splitting data into train and test sets ======================================= +Verde gridders are mostly linear models that are used to predict data at new +locations. As such, they are subject to *over-fitting* and we should always +strive to quantify the quality of the model predictions (see +:ref:`model_evaluation`). Common practice for +doing this is to split the data into training (the one that is used to fit the +model) and testing (the one that is used to validate the predictions) datasets. +These two datasets can be generated by splitting the data randomly (without +regard for their positions in space). This is the default behaviour of function +:func:`verde.train_test_split`, which is based on the scikit-learn function +:func:`sklearn.model_selection.train_test_split`. This can be problematic if +the data points are autocorrelated (values close to each other spatially tend +to have similar values). In these cases, splitting the data randomly can +overestimate the prediction quality [Roberts_etal2017]_. + +Alternatively, Verde allows splitting the data along *spatial blocks*. In this +case, the data are first grouped into blocks with a given size and then the +blocks are split randomly between training and testing sets. + +This example compares splitting our sample dataset using both methods. """ import matplotlib.pyplot as plt import cartopy.crs as ccrs @@ -24,20 +43,15 @@ # train and test are tuples = (coordinates, data, weights). print("Train and test size for random splits:", train[0][0].size, test[0][0].size) -# A better strategy for spatial data is to first assign the data to blocks and -# then split the blocks randomly. The size of the blocks is controlled by the +# A different strategy is to first assign the data to blocks and then split the +# blocks randomly. To do this, specify the size of the blocks using the # 'spacing' argument. train_block, test_block = vd.train_test_split( - coordinates, - values, - blocked=True, - spacing=0.5, - test_size=test_size, - random_state=213, + coordinates, values, spacing=10 / 60, test_size=test_size, random_state=213, ) -# Verde will attempt to balance the data between the splits so that the desired -# amount is assigned to the test set. It won't be exact since blocks contain -# different amounts of data points. +# Verde will automatically attempt to balance the data between the splits so +# that the desired amount is assigned to the test set. It won't be exact since +# blocks contain different amounts of data points. 
print( "Train and test size for block splits: ", train_block[0][0].size, @@ -59,12 +73,12 @@ vd.datasets.setup_baja_bathymetry_map(ax2) ax1.set_title("Random splitting") -ax1.plot(*train[0], ".b", markersize=1, transform=crs, label="Train") -ax1.plot(*test[0], ".r", markersize=1, transform=crs, label="Test", alpha=0.5) +ax1.plot(*train[0], ".b", markersize=2, transform=crs, label="Train") +ax1.plot(*test[0], ".r", markersize=2, transform=crs, label="Test", alpha=0.5) ax2.set_title("Blocked random splitting") -ax2.plot(*train_block[0], ".b", markersize=1, transform=crs, label="Train") -ax2.plot(*test_block[0], ".r", markersize=1, transform=crs, label="Test") +ax2.plot(*train_block[0], ".b", markersize=2, transform=crs, label="Train") +ax2.plot(*test_block[0], ".r", markersize=2, transform=crs, label="Test") ax2.legend(loc="upper right") plt.subplots_adjust(wspace=0.15, top=1, bottom=0, left=0.05, right=0.95) diff --git a/verde/model_selection.py b/verde/model_selection.py index 217cc87e8..45a0136ae 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -278,25 +278,32 @@ def get_n_splits(self, X=None, y=None, groups=None): # pylint: enable=invalid-name,unused-argument -def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): +def train_test_split( + coordinates, data, weights=None, spacing=None, shape=None, **kwargs +): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to work on single- or multi-component spatial data with optional weights. - Extra keyword arguments will be passed to the cross-validation class: - :class:`sklearn.model_selection.ShuffleSplit` (random splits) if - ``block=False`` or :class:`verde.BlockShuffleSplit` (spatially blocked - random splits) if ``block=True``. The exception is ``n_splits`` which is - always 1. + If arguments *shape* or *spacing* are provided, will group the data by + spatial blocks before random splitting (using + :class:`verde.BlockShuffleSplit` instead of + :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing* + specifies the size of the spatial blocks. Alternatively, use *shape* to + specify the number of blocks in each dimension. - Using ``block=True`` is preferred over plain random splits for spatial data - to avoid overestimating validation scores. This can happen because of the - inherent autocorrelation that is usually associated with this type of data - (points that are close together are more likely to have similar values). - See [Roberts_etal2017]_ for an overview of this topic. In this case, you - **must provide** a *spacing* or *shape* argument as well (see below). + Extra keyword arguments will be passed to the cross-validation class. The + exception is ``n_splits`` which is always 1. + + Grouping by spatial blocks is preferred over plain random splits for + spatial data to avoid overestimating validation scores. This can happen + because of the inherent autocorrelation that is usually associated with + this type of data (points that are close together are more likely to have + similar values). See [Roberts_etal2017]_ for an overview of this topic. To + use spatial blocking, you **must provide** a *spacing* or *shape* argument + (see below). Parameters ---------- @@ -310,12 +317,15 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): if not none, then the weights assigned to each data point. 
If more than one data component is provided, you must provide a weights array for each data component (if not none). - block : bool - If True, will use :class:`verde.BlockShuffleSplit` as a cross-validator - to first split the data into spatial blocks and then split the blocks - randomly into training and testing sets. When using this option, a - *spacing* or *shape* must be provided as well to specify the size (or - number) of the spatial blocks. + spacing : float, tuple = (s_north, s_east), or None + The spatial block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* must be provided in order to use + spatial blocking. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* must be provided in order to use + spatial blocking. Returns ------- @@ -394,7 +404,7 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): >>> # We must specify the size of the blocks via the spacing argument. >>> # Blocks of 1.5 will split the domain into 4 blocks. >>> train, test = train_test_split( - ... coordinates, data, random_state=0, blocked=True, spacing=1.5, + ... coordinates, data, random_state=0, spacing=1.5, ... ) >>> # The training set: >>> print("coords:", train[0][0], train[0][1], sep="\n") @@ -411,12 +421,14 @@ def train_test_split(coordinates, data, weights=None, blocked=False, **kwargs): """ args = check_fit_input(coordinates, data, weights, unpack=False) - if blocked: - feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) - shuffle = BlockShuffleSplit(n_splits=1, **kwargs).split(feature_matrix) - else: + if spacing is None and shape is None: indices = np.arange(args[1][0].size) shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) + else: + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) + shuffle = BlockShuffleSplit( + n_splits=1, spacing=spacing, shape=shape, **kwargs + ).split(feature_matrix) split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test From 8bc94e79c5562cf7636c06501dc38977c1e9c7c3 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Thu, 4 Jun 2020 11:56:30 -0300 Subject: [PATCH 25/38] Add extra_coords on BaseGridder generated objects (#265) After predicting through the profile, scatter and grid methods, BaseGridder returns generated objects like xr.Dataset or pd.DataFrame. Any extra coordinate passed through the extra_coords keyword argument is now included on those objects as new columns or non-dimension coordinates (depending on the generated object). --- verde/base/base_classes.py | 43 ++++++++++++++++++++++++++- verde/tests/test_base.py | 59 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 40e15fbf4..2d5d9007d 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -87,6 +87,12 @@ class BaseGridder(BaseEstimator): # (pd.DataFrame, xr.Dataset, etc) dims = ("northing", "easting") + # The default name for any extra coordinates given to methods below + # through the `extra_coords` keyword argument. Coordinates are + # included in the outputs (pandas.DataFrame or xarray.Dataset) + # using this name as a basis. + extra_coords_name = "extra_coord" + def predict(self, coordinates): """ Predict data on the given coordinate values. NOT IMPLEMENTED. 
@@ -234,7 +240,7 @@ def grid( data_names=None, projection=None, **kwargs - ): + ): # pylint: disable=too-many-locals """ Interpolate the data onto a regular grid. @@ -305,6 +311,10 @@ def grid( ) data_names = get_data_names(data, data_names) coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} + # Add extra coordinates to xr.Dataset as non-dimension coordinates + extra_coords_names = self._get_extra_coords_names(coordinates) + for name, extra_coord in zip(extra_coords_names, coordinates[2:]): + coords[name] = (dims, extra_coord) attrs = {"metadata": "Generated by {}".format(repr(self))} data_vars = { name: (dims, value, attrs) for name, value in zip(data_names, data) @@ -385,6 +395,8 @@ def scatter( ) data_names = get_data_names(data, data_names) columns = [(dims[0], coordinates[1]), (dims[1], coordinates[0])] + extra_coords_names = self._get_extra_coords_names(coordinates) + columns.extend(zip(extra_coords_names, coordinates[2:])) columns.extend(zip(data_names, data)) return pd.DataFrame(dict(columns), columns=[c[0] for c in columns]) @@ -498,6 +510,8 @@ def profile( (dims[1], coordinates[0]), ("distance", distances), ] + extra_coords_names = self._get_extra_coords_names(coordinates) + columns.extend(zip(extra_coords_names, coordinates[2:])) columns.extend(zip(data_names, data)) return pd.DataFrame(dict(columns), columns=[c[0] for c in columns]) @@ -509,6 +523,33 @@ def _get_dims(self, dims): return dims return self.dims + def _get_extra_coords_names(self, coordinates): + """ + Return names for extra coordinates + + Examples + -------- + + >>> coordinates = (-5, 4, 3, 5, 1) + >>> grd = BaseGridder() + >>> grd._get_extra_coords_names(coordinates) + ['extra_coord', 'extra_coord_1', 'extra_coord_2'] + + >>> coordinates = (-5, 4, 3) + >>> grd = BaseGridder() + >>> grd.extra_coords_name = "upward" + >>> grd._get_extra_coords_names(coordinates) + ['upward'] + + """ + names = [] + for i in range(len(coordinates[2:])): + name = self.extra_coords_name + if i > 0: + name += "_{}".format(i) + names.append(name) + return names + def project_coordinates(coordinates, projection, **kwargs): """ diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index ad8ea59c8..28ab0a260 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -201,6 +201,65 @@ def proj(lon, lat, inverse=False): npt.assert_allclose(prof.distance, distance_true) +def test_basegridder_extra_coords(): + "Test if BaseGridder adds extra_coords to generated outputs" + + grd = PolyGridder() + region = (0, 10, -10, -5) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0) + data = angular * coordinates[0] + linear + grd = PolyGridder().fit(coordinates, data) + + # Test grid with a single extra coord + extra_coords = 9 + grid = grd.grid(region=region, spacing=1, extra_coords=extra_coords) + assert "extra_coord" in grid.coords + assert grid.coords["extra_coord"].dims == ("northing", "easting") + npt.assert_allclose(grid.coords["extra_coord"], extra_coords) + + # Test grid with multiple extra coords + extra_coords = [9, 18, 27] + grid = grd.grid(region=region, spacing=1, extra_coords=extra_coords) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in grid.coords + assert grid.coords[name].dims == ("northing", "easting") + npt.assert_allclose(grid.coords[name], coord) + + # Test scatter with a single extra coord + extra_coords = 9 + scat = grd.scatter(region, 1000, 
extra_coords=extra_coords) + assert "extra_coord" in scat.columns + npt.assert_allclose(scat["extra_coord"], extra_coords) + + # Test scatter with multiple extra coord + extra_coords = [9, 18, 27] + scat = grd.scatter(region, 1000, extra_coords=extra_coords) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in scat.columns + npt.assert_allclose(scat[name], coord) + + # Test profile with a single extra coord + extra_coords = 9 + prof = grd.profile( + (region[0], region[-1]), (region[1], region[-1]), 51, extra_coords=extra_coords, + ) + assert "extra_coord" in prof.columns + npt.assert_allclose(prof["extra_coord"], extra_coords) + + # Test profile with multiple extra coord + extra_coords = [9, 18, 27] + prof = grd.profile( + (region[0], region[-1]), (region[1], region[-1]), 51, extra_coords=extra_coords, + ) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in prof.columns + npt.assert_allclose(prof[name], coord) + + def test_basegridder_projection_multiple_coordinates(): "Test BaseGridder when passing in a projection with multiple coordinates" From 263b29a6e1ee7decb5746d8c8e9fcf11342d6d99 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 4 Jun 2020 16:31:55 +0100 Subject: [PATCH 26/38] Add a blocked K-Fold cross-validator (#254) The `BlockKFold` class splits the data into spatial blocks and does K-fold cross-validation splits on the blocks. Refactor code from `BlockShuffleSplit` into a base class that can be shared with the new class. Folds are balanced by making splits based on the number of data points in each fold instead of the number of blocks. --- doc/api/index.rst | 2 + examples/blockkfold.py | 102 +++++++++++++ verde/__init__.py | 7 +- verde/base/__init__.py | 2 +- verde/base/base_classes.py | 115 ++++++++++++++ verde/model_selection.py | 227 +++++++++++++++++++++++----- verde/tests/test_base.py | 41 +++++ verde/tests/test_model_selection.py | 61 +++++--- verde/tests/test_utils.py | 16 +- verde/utils.py | 118 +++++++++++++++ 10 files changed, 628 insertions(+), 63 deletions(-) create mode 100644 examples/blockkfold.py diff --git a/doc/api/index.rst b/doc/api/index.rst index 9a2b3b1a6..cbe7a9286 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -46,6 +46,7 @@ Model Selection train_test_split cross_val_score BlockShuffleSplit + BlockKFold Coordinate Manipulation ----------------------- @@ -130,6 +131,7 @@ Base Classes and Functions :toctree: generated/ base.BaseGridder + base.BaseBlockCrossValidator base.n_1d_arrays base.check_fit_input base.least_squares diff --git a/examples/blockkfold.py b/examples/blockkfold.py new file mode 100644 index 000000000..15ae001a7 --- /dev/null +++ b/examples/blockkfold.py @@ -0,0 +1,102 @@ +""" +K-Fold cross-validation with blocks +=================================== + +Cross-validation scores for spatial data can be biased because observations are +commonly spatially autocorrelated (closer data points have similar values). One +strategy to reduce the bias is to split data along spatial blocks +[Roberts_etal2017]_. Verde offers the cross-validator +:class:`verde.BlockKFold`, which is a scikit-learn compatible version of k-fold +cross-validation using spatial blocks. + +When splitting the data into training and testing sets, +:class:`~verde.BlockKFold` first splits the data into spatial blocks and then +splits the blocks into folds. 
During k-fold cross-validation, one fold is used +as the testing set while the rest are used for training. Since each block can +have a different number of data points, assigning the same number of blocks to +each fold can lead to folds with very different numbers of data points. +By default, :class:`~verde.BlockKFold` takes care to balance the folds to have +approximately equal number of data points. +Alternatively, you can turn off balancing to have each fold contain the same +number of blocks. + +This example shows the data assigned to each of the first 3 folds of a blocked +k-fold iteration, with and without balancing. Notice that the unbalanced folds +have very different numbers of data points. +""" +import numpy as np +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import verde as vd + +# Let's split the Baja California shipborne bathymetry data +data = vd.datasets.fetch_baja_bathymetry() +coordinates = (data.longitude, data.latitude) + +# Create cross-validators with blocks of 30 arc-minutes with shuffling enabled. +spacing = 30 / 60 +# Set the random state so that these plots don't vary when we rerun the example +random_state = 10 +kfold = vd.BlockKFold(spacing=spacing, shuffle=True, random_state=random_state) +# Make another cross-validator with fold balancing turned off. Without +# balancing, the folds can have very different number of data points in them +# (which may bias the scores). +kfold_unbalanced = vd.BlockKFold( + spacing=spacing, shuffle=True, random_state=random_state, balance=False, +) + +# The BlockKFold is compatible with scikit-learn, so instead of giving it a +# coordinates tuple (like we use in Verde), we have to put the coordinates in a +# feature matrix (X in scikit-learn jargon). Each column will have one of the +# coordinate values. This is usually not required if using this cross-validator +# with Verde functions and classes. You can pass it to verde.cross_val_score, +# for example. +feature_matrix = np.transpose(coordinates) + +# Create the folds for the balanced and unbalanced cross-validators to show the +# difference. +balanced = kfold.split(feature_matrix) +unbalanced = kfold_unbalanced.split(feature_matrix) + +# Cartopy requires setting the coordinate reference system (CRS) of the +# original data through the transform argument. Their docs say to use +# PlateCarree to represent geographic data. 
+crs = ccrs.PlateCarree() + +# Make Mercator maps of the two cross-validator folds +fig, axes = plt.subplots( + 2, + 3, + figsize=(12, 10), + subplot_kw=dict(projection=ccrs.Mercator()), + sharex=True, + sharey=True, +) +for row, title, folds in zip(axes, ["Balanced", "Unbalanced"], [balanced, unbalanced]): + for i, (ax, fold) in enumerate(zip(row, folds)): + train, test = fold + ax.set_title("{} fold {} ({} testing points)".format(title, i, test.size)) + # Use an utility function to setup the tick labels and the land feature + vd.datasets.setup_baja_bathymetry_map(ax) + ax.plot( + coordinates[0][train], + coordinates[1][train], + ".b", + markersize=1, + transform=crs, + label="Train", + ) + ax.plot( + coordinates[0][test], + coordinates[1][test], + ".r", + markersize=1, + transform=crs, + label="Test", + ) +# Place a legend on the first plot +axes[0, 0].legend(loc="upper right", markerscale=5) +plt.subplots_adjust( + hspace=0.1, wspace=0.05, top=0.95, bottom=0.05, left=0.05, right=0.95 +) +plt.show() diff --git a/verde/__init__.py b/verde/__init__.py index 0fa4bee34..f80b3e34b 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -23,7 +23,12 @@ from .trend import Trend from .chain import Chain from .spline import Spline, SplineCV -from .model_selection import cross_val_score, train_test_split, BlockShuffleSplit +from .model_selection import ( + cross_val_score, + train_test_split, + BlockShuffleSplit, + BlockKFold, +) from .vector import Vector, VectorSpline2D from .projections import project_region, project_grid diff --git a/verde/base/__init__.py b/verde/base/__init__.py index 9e246d929..f792ee4f7 100644 --- a/verde/base/__init__.py +++ b/verde/base/__init__.py @@ -1,4 +1,4 @@ # pylint: disable=missing-docstring -from .base_classes import BaseGridder +from .base_classes import BaseGridder, BaseBlockCrossValidator from .least_squares import least_squares from .utils import n_1d_arrays, check_fit_input diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 2d5d9007d..db952d837 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -1,16 +1,131 @@ """ Base classes for all gridders. """ +from abc import ABCMeta, abstractmethod + import xarray as xr import pandas as pd import numpy as np from sklearn.base import BaseEstimator +from sklearn.model_selection import BaseCrossValidator from sklearn.metrics import r2_score from ..coordinates import grid_coordinates, profile_coordinates, scatter_points from .utils import check_data, check_fit_input +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class BaseBlockCrossValidator(BaseCrossValidator, metaclass=ABCMeta): + """ + Base class for spatially blocked cross-validators. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* **must be provided**. + n_splits : int + Number of splitting iterations. 
+ + """ + + def __init__( + self, spacing=None, shape=None, n_splits=10, + ): + if spacing is None and shape is None: + raise ValueError("Either 'spacing' or 'shape' must be provided.") + self.spacing = spacing + self.shape = shape + self.n_splits = n_splits + + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. + + """ + if X.shape[1] != 2: + raise ValueError( + "X must have exactly 2 columns ({} given).".format(X.shape[1]) + ) + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """ + Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + y : object + Always ignored, exists for compatibility. + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + @abstractmethod + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Generates integer indices corresponding to test sets. + + MUST BE IMPLEMENTED BY DERIVED CLASSES. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + test : ndarray + The testing set indices for that split. + + """ + + +# pylint: enable=invalid-name,unused-argument + + class BaseGridder(BaseEstimator): """ Base class for gridders. diff --git a/verde/model_selection.py b/verde/model_selection.py index 45a0136ae..59602f7f4 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -4,12 +4,13 @@ import warnings import numpy as np -from sklearn.model_selection import KFold, ShuffleSplit, BaseCrossValidator +from sklearn.model_selection import KFold, ShuffleSplit from sklearn.base import clone +from sklearn.utils import check_random_state -from .base import check_fit_input, n_1d_arrays +from .base import check_fit_input, n_1d_arrays, BaseBlockCrossValidator from .coordinates import block_split -from .utils import dispatch +from .utils import dispatch, partition_by_sum # Otherwise, DeprecationWarning won't be shown, kind of defeating the purpose. @@ -20,7 +21,7 @@ # pylint: disable=invalid-name,unused-argument -class BlockShuffleSplit(BaseCrossValidator): +class BlockShuffleSplit(BaseBlockCrossValidator): """ Random permutation of spatial blocks cross-validator. 
@@ -145,15 +146,11 @@ def __init__( random_state=None, balancing=10, ): - if spacing is None and shape is None: - raise ValueError("Either 'spacing' or 'shape' must be provided.") + super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) if balancing < 1: raise ValueError( "The *balancing* argument must be >= 1. To disable balancing, use 1." ) - self.spacing = spacing - self.shape = shape - self.n_splits = n_splits self.test_size = test_size self.train_size = train_size self.random_state = random_state @@ -223,9 +220,158 @@ def _iter_test_indices(self, X=None, y=None, groups=None): best = np.argmin(balance) yield test_sets[best] - def split(self, X, y=None, groups=None): + +class BlockKFold(BaseBlockCrossValidator): + """ + K-Folds over spatial blocks cross-validator. + + Yields indices to split data into training and test sets. Data are first + grouped into rectangular blocks of size given by the *spacing* argument. + Alternatively, blocks can be defined by the number of blocks in each + dimension using the *shape* argument instead of *spacing*. The blocks are + then split into testing and training sets iteratively along k folds of the + data (k is given by *n_splits*). + + By default, the blocks are split into folds in a way that makes each fold + have approximately the same number of data points. Sometimes this might not + be possible, which can happen if the number of splits is close to the + number of blocks. In these cases, each fold will have the same number of + blocks, regardless of how many data points are in each block. This + behaviour can also be disabled by setting ``balance=False``. + + Shuffling the blocks prior to splitting is strongly encouraged. Not + shuffling will essentially lead to the creation of *n_splits* large blocks + since blocks are spatially adjacent when not shuffled. The default + behaviour is not to shuffle for compatibility with similar cross-validators + in scikit-learn. + + This cross-validator is preferred over + :class:`sklearn.model_selection.KFold` for spatial data to avoid + overestimating cross-validation scores. This can happen because of the + inherent autocorrelation that is usually associated with this type of data + (points that are close together are more likely to have similar values). + See [Roberts_etal2017]_ for an overview of this topic. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* **must be provided**. + n_splits : int + Number of folds. Must be at least 2. + shuffle : bool + Whether to shuffle the data before splitting into batches. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + balance : bool + Whether or not to split blocks into fold with approximately equal + number of data points. If False, each fold will have the same number of + blocks (which can have different number of data points in them). + + See also + -------- + train_test_split : Split a dataset into a training and a testing set. 
+ cross_val_score : Score an estimator/gridder using cross-validation. + + Examples + -------- + + >>> from verde import grid_coordinates + >>> import numpy as np + >>> # Make a regular grid of data points + >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) + >>> # Need to convert the coordinates into a feature matrix + >>> X = np.transpose([i.ravel() for i in coords]) + >>> kfold = BlockKFold(spacing=1.5, n_splits=4) + >>> # These are the 1D indices of the points belonging to each set + >>> for train, test in kfold.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] + >>> # A better way to visualize this is to create a 2D array and put + >>> # "train" or "test" in the corresponding locations. + >>> shape = coords[0].shape + >>> mask = np.full(shape=shape, fill_value=" ") + >>> for iteration, (train, test) in enumerate(kfold.split(X)): + ... # The index needs to be converted to 2D so we can index our matrix. + ... mask[np.unravel_index(train, shape)] = "train" + ... mask[np.unravel_index(test, shape)] = " test" + ... print("Iteration {}:".format(iteration)) + ... print(mask) + Iteration 0: + [[' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 1: + [['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 2: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + [' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train']] + Iteration 3: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test']] + + For spatial data, it's often good to shuffle the blocks before assigning + them to folds: + + >>> # Set the random_state to make sure we always get the same result. + >>> kfold = BlockKFold( + ... spacing=1.5, n_splits=4, shuffle=True, random_state=123, + ... ) + >>> for train, test in kfold.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + + These should be the same splits as we got before but in a different order. + This only happens because in this example we have the number of splits + equal to the number of blocks (4). With real data the effects would be more + dramatic. + + """ + + def __init__( + self, + spacing=None, + shape=None, + n_splits=5, + shuffle=False, + random_state=None, + balance=True, + ): + super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) + if n_splits < 2: + raise ValueError( + "Number of splits must be >=2 for BlockKFold. Given {}.".format( + n_splits + ) + ) + self.shuffle = shuffle + self.random_state = random_state + self.balance = balance + + def _iter_test_indices(self, X=None, y=None, groups=None): """ - Generate indices to split data into training and test set. + Generates integer indices corresponding to test sets. 
Parameters ---------- @@ -241,38 +387,45 @@ def split(self, X, y=None, groups=None): Yields ------ - train : ndarray - The training set indices for that split. test : ndarray The testing set indices for that split. """ - if X.shape[1] != 2: + labels = block_split( + coordinates=(X[:, 0], X[:, 1]), + spacing=self.spacing, + shape=self.shape, + region=None, + adjust="spacing", + )[1] + block_ids = np.unique(labels) + if self.n_splits > block_ids.size: raise ValueError( - "X must have exactly 2 columns ({} given).".format(X.shape[1]) + "Number of k-fold splits ({}) cannot be greater than the number of " + "blocks ({}). Either decrease n_splits or increase the number of " + "blocks.".format(self.n_splits, block_ids.size) ) - for train, test in super().split(X, y, groups): - yield train, test - - def get_n_splits(self, X=None, y=None, groups=None): - """ - Returns the number of splitting iterations in the cross-validator - - Parameters - ---------- - X : object - Always ignored, exists for compatibility. - y : object - Always ignored, exists for compatibility. - groups : object - Always ignored, exists for compatibility. - - Returns - ------- - n_splits : int - Returns the number of splitting iterations in the cross-validator. - """ - return self.n_splits + if self.shuffle: + check_random_state(self.random_state).shuffle(block_ids) + if self.balance: + block_sizes = [np.isin(labels, i).sum() for i in block_ids] + try: + split_points = partition_by_sum(block_sizes, parts=self.n_splits) + folds = np.split(np.arange(block_ids.size), split_points) + except ValueError: + warnings.warn( + "Could not balance folds to have approximately the same " + "number of data points. Dividing into folds with equal " + "number of blocks instead. Decreasing n_splits or increasing " + "the number of blocks may help.", + UserWarning, + ) + folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] + else: + folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] + for test_blocks in folds: + test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] + yield test_points # pylint: enable=invalid-name,unused-argument diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 28ab0a260..7d8f7cdea 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -10,6 +10,7 @@ from ..base.utils import check_fit_input, check_coordinates from ..base.base_classes import ( BaseGridder, + BaseBlockCrossValidator, get_data_names, get_instance_region, ) @@ -358,6 +359,46 @@ def test_check_fit_input_fails_weights(): check_fit_input(coords, (data, data), weights) +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class DummyCrossValidator(BaseBlockCrossValidator): + """ + Dummy class to test the base cross-validator. + """ + + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Yields a list of indices for the entire X. + """ + yield list(range(X.shape[0])) + + +# pylint: enable=invalid-name,unused-argument + + +def test_baseblockedcrossvalidator_n_splits(): + "Make sure get_n_splits returns the correct value" + cv = DummyCrossValidator(spacing=1, n_splits=14) + assert cv.get_n_splits() == 14 + + +def test_baseblockedcrossvalidator_fails_spacing_shape(): + "Should raise an exception if not given spacing or shape." + with pytest.raises(ValueError): + DummyCrossValidator() + + +def test_baseblockedcrossvalidator_fails_data_shape(): + "Should raise an exception if the X array doesn't have 2 columns." 
+ cv = DummyCrossValidator(spacing=1) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 4)))) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 1)))) + + def test_least_squares_copy_jacobian(): """ Test if Jacobian matrix is copied or scaled inplace diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py index 4093c4fca..a8cd3da7a 100644 --- a/verde/tests/test_model_selection.py +++ b/verde/tests/test_model_selection.py @@ -1,14 +1,16 @@ """ Test the model selection code (cross-validation, etc). """ +import warnings + import pytest from sklearn.model_selection import ShuffleSplit import numpy as np import numpy.testing as npt from dask.distributed import Client -from .. import Trend, grid_coordinates -from ..model_selection import cross_val_score, BlockShuffleSplit +from .. import Trend, grid_coordinates, scatter_points +from ..model_selection import cross_val_score, BlockShuffleSplit, BlockKFold def test_cross_val_score_client(): @@ -26,33 +28,12 @@ def test_cross_val_score_client(): npt.assert_allclose(scores, 1) -def test_blockshufflesplit_n_splits(): - "Make sure get_n_splits returns the correct value" - cv = BlockShuffleSplit(spacing=1, n_splits=14) - assert cv.get_n_splits() == 14 - - def test_blockshufflesplit_fails_balancing(): "Should raise an exception if balancing < 1." with pytest.raises(ValueError): BlockShuffleSplit(spacing=1, balancing=0) -def test_blockshufflesplit_fails_spacing_shape(): - "Should raise an exception if not given spacing or shape." - with pytest.raises(ValueError): - BlockShuffleSplit() - - -def test_blockshufflesplit_fails_data_shape(): - "Should raise an exception if the X array doesn't have 2 columns." - cv = BlockShuffleSplit(spacing=1) - with pytest.raises(ValueError): - next(cv.split(np.ones(shape=(10, 4)))) - with pytest.raises(ValueError): - next(cv.split(np.ones(shape=(10, 1)))) - - @pytest.mark.parametrize("test_size", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9]) def test_blockshufflesplit_balancing(test_size): "Make sure that the sets have the right number of points" @@ -65,3 +46,37 @@ def test_blockshufflesplit_balancing(test_size): for train, test in cv.split(coords): npt.assert_allclose(train.size / npoints, train_size, atol=0.01) npt.assert_allclose(test.size / npoints, test_size, atol=0.01) + + +def test_blockkfold_fails_n_splits_too_small(): + "Should raise an exception if n_splits < 2." + BlockKFold(spacing=1, n_splits=2) + with pytest.raises(ValueError): + BlockKFold(spacing=1, n_splits=1) + + +def test_blockkfold_fails_n_splits_too_large(): + "Should raise an exception if n_splits < number of blocks." 
+ coords = grid_coordinates(region=(0, 3, -10, -7), shape=(4, 4)) + features = np.transpose([i.ravel() for i in coords]) + next(BlockKFold(shape=(2, 2), n_splits=4).split(features)) + with pytest.raises(ValueError) as error: + next(BlockKFold(shape=(2, 2), n_splits=5).split(features)) + assert "Number of k-fold splits (5) cannot be greater" in str(error) + + +def test_blockkfold_cant_balance(): + "Should fall back to regular split if can't balance and print a warning" + coords = scatter_points(region=(0, 3, -10, -7), size=10, random_state=2) + features = np.transpose([i.ravel() for i in coords]) + cv = BlockKFold(shape=(4, 4), n_splits=8) + with warnings.catch_warnings(record=True) as warn: + splits = [i for _, i in cv.split(features)] + assert len(warn) == 1 + assert issubclass(warn[-1].category, UserWarning) + assert "Could not balance folds" in str(warn[-1].message) + # Should revert to the unbalanced version + cv_unbalanced = BlockKFold(shape=(4, 4), n_splits=8, balance=False) + splits_unbalanced = [i for _, i in cv_unbalanced.split(features)] + for balanced, unbalanced in zip(splits, splits_unbalanced): + npt.assert_allclose(balanced, unbalanced) diff --git a/verde/tests/test_utils.py b/verde/tests/test_utils.py index 0d6119b99..544260616 100644 --- a/verde/tests/test_utils.py +++ b/verde/tests/test_utils.py @@ -10,7 +10,7 @@ import pytest from ..coordinates import grid_coordinates -from ..utils import parse_engine, dummy_jit, kdtree, grid_to_table +from ..utils import parse_engine, dummy_jit, kdtree, grid_to_table, partition_by_sum from .. import utils @@ -75,3 +75,17 @@ def test_grid_to_table_order(): npt.assert_allclose(true_lat, table.latitude) npt.assert_allclose(true_lon, table.longitude) npt.assert_allclose(true_field, table.field) + + +def test_partition_by_sum_fails_size(): + "Should raise an exception if given more parts than elements." + with pytest.raises(ValueError) as error: + partition_by_sum(np.arange(10), 11) + assert "array of size 10 into 11 parts" in str(error) + + +def test_partition_by_sum_fails_no_partitions(): + "Should raise an exception if could not find unique partition points" + with pytest.raises(ValueError) as error: + partition_by_sum(np.arange(10), 8) + assert "Could not find partition points" in str(error) diff --git a/verde/utils.py b/verde/utils.py index aaaca9b22..ac6639c8c 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -348,3 +348,121 @@ def kdtree(coordinates, use_pykdtree=True, **kwargs): else: tree = cKDTree(points, **kwargs) return tree + + +def partition_by_sum(array, parts): + """ + Partition an array into parts of approximately equal sum. + + Does not change the order of the array elements. + + Produces the partition indices on the array. Use :func:`numpy.split` to + divide the array along these indices. + + .. warning:: + + Depending on the input and number of parts, there might not exist + partition points. In these cases, the function will raise + ``ValueError``. This is more likely to happen as the number of parts + approaches the number of elements in the array. + + Parameters + ---------- + array : array or array-like + The 1D array that will be partitioned. The array will be raveled before + computations. + parts : int + Number of parts to split the array. Can be at most the number of + elements in the array. + + Returns + ------- + indices : array + The indices in which the array should be split. 
+ + Notes + ----- + + Solution from https://stackoverflow.com/a/54024280 + + Examples + -------- + + >>> import numpy as np + >>> array = np.arange(10) + >>> split_points = partition_by_sum(array, parts=2) + >>> print(split_points) + [7] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [0 1 2 3 4 5 6] 21 + [7 8 9] 24 + >>> split_points = partition_by_sum(array, parts=3) + >>> print(split_points) + [6 8] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [0 1 2 3 4 5] 15 + [6 7] 13 + [8 9] 17 + >>> split_points = partition_by_sum(array, parts=5) + >>> print(split_points) + [4 6 7 9] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [0 1 2 3] 6 + [4 5] 9 + [6] 6 + [7 8] 15 + [9] 9 + >>> # Use an array with a random looking element order + >>> array = [5, 6, 4, 6, 8, 1, 2, 6, 3, 3] + >>> split_points = partition_by_sum(array, parts=2) + >>> print(split_points) + [4] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [5 6 4 6] 21 + [8 1 2 6 3 3] 23 + >>> # Splits can have very different sums but this is best that can be done + >>> # without changing the order of the array. + >>> split_points = partition_by_sum(array, parts=5) + >>> print(split_points) + [1 3 4 7] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [5] 5 + [6 4] 10 + [6] 6 + [8 1 2] 11 + [6 3 3] 12 + + """ + array = np.atleast_1d(array).ravel() + if parts > array.size: + raise ValueError( + "Cannot partition an array of size {} into {} parts of equal sum.".format( + array.size, parts + ) + ) + cumulative_sum = array.cumsum() + # Ideally, we want each part to have the same number of points (total / + # parts). + ideal_sum = cumulative_sum[-1] // parts + # If the parts are ideal, the cumulative sum of each part will be this + ideal_cumsum = np.arange(1, parts) * ideal_sum + # Find the places in the real cumulative sum where the ideal values would + # be. These are the split points. Between each split point, the sum of + # elements will be approximately the ideal sum. Need to insert to the right + # side so that we find cumsum[i - 1] <= ideal < cumsum[i]. This way, if a + # part has ideal sum, the last element (i - 1) will be included. Otherwise, + # we would never have ideal sums. + indices = np.searchsorted(cumulative_sum, ideal_cumsum, side="right") + # Check for repeated split points, which indicates that there is no way to + # split the array. + if np.unique(indices).size != indices.size: + raise ValueError( + "Could not find partition points to split the array into {} parts " + "of equal sum.".format(parts) + ) + return indices From b35724f193bbf4a01d66361a8aa69dbf80d0e8fb Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 4 Jun 2020 19:58:47 +0100 Subject: [PATCH 27/38] Pin Cartopy to 0.17.* until we sort out problems (#270) For some reason, the coast lines features are plotting on the entire figure instead of only the land or ocean. Might be an error on our side. For now, pin cartopy on CI to fix our docs. 
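The feature calls in question are the filled land/ocean polygons that verde.datasets._setup_map adds to the sample-data maps (they appear as removed lines in the Cartopy 0.18 patch further down). A rough sketch of that pattern, reusing the Baja California extent from the sample-data helpers; this is an illustration of the affected calls, not a confirmed reproduction of the bug:

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt

# Same kind of setup done by verde.datasets.setup_baja_bathymetry_map().
ax = plt.axes(projection=ccrs.Mercator())
ax.set_extent((245.0, 254.705, 20.0, 29.99), crs=ccrs.PlateCarree())
# Per the message above, these filled features were being drawn over the
# whole figure instead of only the land and ocean areas.
ax.add_feature(cfeature.LAND, facecolor="gray")
ax.add_feature(cfeature.OCEAN, facecolor="skyblue")
plt.show()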
--- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 346df383a..ec8804f8c 100644 --- a/environment.yml +++ b/environment.yml @@ -18,7 +18,7 @@ dependencies: # Development requirements - pyproj - matplotlib - - cartopy + - cartopy==0.17.* - pytest - pytest-cov - pytest-mpl diff --git a/requirements-dev.txt b/requirements-dev.txt index f81cae355..85d0f586a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # Development requirements pyproj matplotlib -cartopy +cartopy=0.17.* pytest pytest-cov pytest-mpl From d870b41c8ea20ca4862bcbe13ed8767ef01c908e Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 4 Jun 2020 20:18:24 +0100 Subject: [PATCH 28/38] Changelog entry for v1.5.0 (#271) Includes the new cross-validators and fixes to our base classes to accommodate Harmonica. --- README.rst | 1 + doc/changes.rst | 56 ++++++++++++++++++++++++++++++++++++ verde/tests/test_datasets.py | 3 ++ 3 files changed, 60 insertions(+) diff --git a/README.rst b/README.rst index e4b4bf2bf..1c3e70794 100644 --- a/README.rst +++ b/README.rst @@ -159,6 +159,7 @@ Documentation for other versions * `Development `__ (reflects the *master* branch on Github) * `Latest release `__ +* `v1.5.0 `__ * `v1.4.0 `__ * `v1.3.0 `__ * `v1.2.0 `__ diff --git a/doc/changes.rst b/doc/changes.rst index 13191b2a8..8938a2969 100644 --- a/doc/changes.rst +++ b/doc/changes.rst @@ -3,6 +3,62 @@ Changelog ========= +Version 1.5.0 +------------- + +*Released on: 2020/06/04* + +.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3877060.svg + :target: https://doi.org/10.5281/zenodo.3877060 + +Bug fixes: + +* Apply projections using only the first two coordinates instead all given + coordinates. Projections only really involve the first two (horizontal) + coordinates. Only affects users passing ``extra_coords`` to gridder methods. + (`#264 `__) + +New features: + +* **New** blocked cross-validation classes ``BlockShuffleSplit`` and + ``BlockKFold``. These are scikit-learn compatible cross-validators that split + the data into spatial blocks before assigning them to folds. Blocked + cross-validation can help avoid overestimation of prediction accuracy for + spatial data (see [Roberts_etal2017]_). The classes work with + ``verde.cross_val_score`` and any other function/method/class that accepts a + scikit-learn cross-validator. + (`#251 `__ and + `#254 `__) +* Add the option for block-wise splitting in ``verde.train_test_split`` by + passing in a ``spacing`` or ``shape`` parameters. + (`#253 `__ and + `#257 `__) + +Base classes: + +* Add optional argument to ``verde.base.least_squares`` to copy Jacobian + matrix. + (`#255 `__) +* Add extra coordinates (specified by the ``extra_coords`` keyword argument + to outputs of ``BaseGridder`` methods. + (`#265 `__) + +Maintenance: + +* Update tests to ``repr`` changes in scikit-learn 0.23.0. + (`#267 `__) + +Documentation: + +* Fix typo in README contributing section. + (`#258 `__) + +This release contains contributions from: + +* Leonardo Uieda +* Santiago Soler +* Rowan Cockett + Version 1.4.0 ------------- diff --git a/verde/tests/test_datasets.py b/verde/tests/test_datasets.py index 9bdb4711f..f7311e713 100644 --- a/verde/tests/test_datasets.py +++ b/verde/tests/test_datasets.py @@ -23,6 +23,9 @@ def test_datasets_locate(): "Make sure the data cache location has the right package name" + # Fetch a dataset first to make sure that the cache folder exists. 
Since + # Pooch 1.1.1 the cache isn't created until a download is requested. + fetch_texas_wind() path = locate() assert os.path.exists(path) # This is the most we can check in a platform independent way without From d289bc4563d31bb2d9aa045b9c9e733aad3f8d5e Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 25 Jun 2020 11:37:24 +0100 Subject: [PATCH 29/38] Allow specifing the scoring function in cross_val_score (#273) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, `cross_val_score` only uses the estimator's `.score` method (which is R² for all our gridders). But sometimes we want to use a different metric, like MSE or MAE. These changes allow passing in any scikit-learn compatible scorer through a new `scoring` parameter. This can be a string or callable, like in scikit-learn's `cross_val_score`. Needed to delegate score computation to a separate function in `verde.base.utils` to reuse in `cross_val_score` and a dummy class to be able to use the scikit-learn scorers correctly. --- verde/base/base_classes.py | 16 +----- verde/base/utils.py | 79 +++++++++++++++++++++++++++++ verde/model_selection.py | 51 ++++++++++++++++--- verde/tests/test_model_selection.py | 43 ++++++++++++++-- 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index db952d837..5400102f5 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -5,13 +5,11 @@ import xarray as xr import pandas as pd -import numpy as np from sklearn.base import BaseEstimator from sklearn.model_selection import BaseCrossValidator -from sklearn.metrics import r2_score from ..coordinates import grid_coordinates, profile_coordinates, scatter_points -from .utils import check_data, check_fit_input +from .utils import check_data, score_estimator # Pylint doesn't like X, y scikit-learn argument names. @@ -334,17 +332,7 @@ def score(self, coordinates, data, weights=None): The R^2 score """ - coordinates, data, weights = check_fit_input( - coordinates, data, weights, unpack=False - ) - pred = check_data(self.predict(coordinates)) - result = np.mean( - [ - r2_score(datai.ravel(), predi.ravel(), sample_weight=weighti) - for datai, predi, weighti in zip(data, pred, weights) - ] - ) - return result + return score_estimator("r2", self, coordinates, data, weights=weights) def grid( self, diff --git a/verde/base/utils.py b/verde/base/utils.py index 4c44c4ca0..0adf31e6d 100644 --- a/verde/base/utils.py +++ b/verde/base/utils.py @@ -2,6 +2,85 @@ Utility functions for building gridders and checking arguments. """ import numpy as np +from sklearn.metrics import check_scoring + + +def score_estimator(scoring, estimator, coordinates, data, weights=None): + """ + Score the given gridder against the given data using the given metric. + + If the data and predictions have more than 1 component, the scores of each + component will be averaged. + + Parameters + ---------- + scoring : str or callable + A scoring specification known to scikit-learn. See + :func:`sklearn.metrics.check_scoring`. + estimator : a Verde gridder + The gridder to score. Usually derived from + :class:`verde.base.BaseGridder`. + coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. 
+ data : array or tuple of arrays + The data values of each data point. If the data has more than one + component, *data* must be a tuple of arrays (one for each + component). + weights : None or array or tuple of arrays + If not None, then the weights assigned to each data point. If more + than one data component is provided, you must provide a weights + array for each data component (if not None). + + Returns + ------- + score : float + The score. + + """ + coordinates, data, weights = check_fit_input( + coordinates, data, weights, unpack=False + ) + predicted = check_data(estimator.predict(coordinates)) + scorer = check_scoring(DummyEstimator, scoring=scoring) + result = np.mean( + [ + scorer( + DummyEstimator(pred.ravel()), + coordinates, + data[i].ravel(), + sample_weight=weights[i], + ) + for i, pred in enumerate(predicted) + ] + ) + return result + + +class DummyEstimator: + """ + Dummy estimator that does nothing but pass along the predicted data. + Used to fool the scikit-learn scorer functions to fit our API + (multi-component estimators return a tuple on .predict). + + >>> est = DummyEstimator([1, 2, 3]) + >>> print(est.fit().predict()) + [1, 2, 3] + + """ + + def __init__(self, predicted): + self._predicted = predicted + + def predict(self, *args, **kwargs): # pylint: disable=unused-argument + "Return the stored predicted values" + return self._predicted + + def fit(self, *args, **kwards): # pylint: disable=unused-argument + "Does nothing. Just here to satisfy the API." + return self def check_data(data): diff --git a/verde/model_selection.py b/verde/model_selection.py index 59602f7f4..c230e5991 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -9,6 +9,7 @@ from sklearn.utils import check_random_state from .base import check_fit_input, n_1d_arrays, BaseBlockCrossValidator +from .base.utils import score_estimator from .coordinates import block_split from .utils import dispatch, partition_by_sum @@ -588,7 +589,14 @@ def train_test_split( def cross_val_score( - estimator, coordinates, data, weights=None, cv=None, client=None, delayed=False + estimator, + coordinates, + data, + weights=None, + cv=None, + client=None, + delayed=False, + scoring=None, ): """ Score an estimator/gridder using cross-validation. @@ -601,6 +609,10 @@ def cross_val_score( cross-validation class from scikit-learn or Verde can be passed in through the *cv* argument. + The score is calculated using the estimator/gridder's ``.score`` method by + default. Alternatively, use the *scoring* parameter to specify a different + scoring function (e.g., mean square error, mean absolute error, etc). + Can optionally run in parallel using :mod:`dask`. To do this, use ``delayed=True`` to dispatch computations with :func:`dask.delayed` instead of running them. The returned scores will be "lazy" objects instead of the @@ -640,6 +652,11 @@ def cross_val_score( actually executing them. The returned scores will be a list of delayed objects. Call `.compute()` on each score or :func:`dask.compute` on the entire list to trigger the actual computations. + scoring : None, str, or callable + A scoring function (or name of a function) known to scikit-learn. See + the description of *scoring* in + :func:`sklearn.model_selection.cross_val_score` for details. If None, + will fall back to the estimator's ``.score`` method. Returns ------- @@ -674,6 +691,22 @@ def cross_val_score( There are 5 scores because the default cross-validator is :class:`sklearn.model_selection.KFold` with ``n_splits=5``. 
+ To calculate the score with a different metric, use the *scoring* argument: + + >>> scores = cross_val_score( + ... model, coords, data, scoring="neg_mean_squared_error", + ... ) + >>> print(', '.join(['{:.2f}'.format(-score) for score in scores])) + 0.00, 0.00, 0.00, 0.00, 0.00 + + In this case, we calculated the (negative) mean squared error (MSE) which + measures the distance between test data and predictions. This way, 0 is the + best possible value meaning that the data and prediction are the same. The + "neg" part indicates that this is the negative mean square error. This is + required because scikit-learn assumes that higher scores are always treated + as better (which is the opposite for MSE). For display, we take the + negative of the score to get the actual MSE. + We can use different cross-validators by assigning them to the ``cv`` argument: @@ -736,12 +769,13 @@ def cross_val_score( fit_args = (coordinates, data, weights) scores = [] for train_index, test_index in cv.split(feature_matrix): - train = tuple(select(i, train_index) for i in fit_args) - test = tuple(select(i, test_index) for i in fit_args) # Clone the estimator to avoid fitting the same object simultaneously # when delayed=True. score = dispatch(fit_score, client=client, delayed=delayed)( - clone(estimator), train, test + clone(estimator), + tuple(select(i, train_index) for i in fit_args), + tuple(select(i, test_index) for i in fit_args), + scoring, ) scores.append(score) if not delayed and client is None: @@ -749,11 +783,16 @@ def cross_val_score( return scores -def fit_score(estimator, train_data, test_data): +def fit_score(estimator, train_data, test_data, scoring): """ Fit an estimator on the training data and then score it on the testing data """ - return estimator.fit(*train_data).score(*test_data) + estimator.fit(*train_data) + if scoring is None: + score = estimator.score(*test_data) + else: + score = score_estimator(scoring, estimator, *test_data) + return score def select(arrays, index): diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py index a8cd3da7a..5b52c1f59 100644 --- a/verde/tests/test_model_selection.py +++ b/verde/tests/test_model_selection.py @@ -5,18 +5,53 @@ import pytest from sklearn.model_selection import ShuffleSplit +from sklearn.metrics import get_scorer import numpy as np import numpy.testing as npt from dask.distributed import Client -from .. import Trend, grid_coordinates, scatter_points +from .. 
import Vector, Trend, grid_coordinates, scatter_points
+from ..model_selection import cross_val_score, BlockShuffleSplit, BlockKFold
 
 
-def test_cross_val_score_client():
-    "Test the deprecated dask Client interface"
+@pytest.fixture(name="trend")
+def fixture_trend():
+    "Coordinates and data for a 1-degree trend"
     coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
-    data = 10 - coords[0] + 0.5 * coords[1]
+    coefs = (10, -1, 0.5)
+    data = coefs[0] + coefs[1] * coords[0] + coefs[2] * coords[1]
+    return coords, data, coefs
+
+
+@pytest.mark.parametrize(
+    "metric,expected",
+    [(None, 1), ("r2", 1), (get_scorer("neg_mean_squared_error"), 0)],
+    ids=["none", "R2", "MSE"],
+)
+def test_cross_val_score(trend, metric, expected):
+    "Check that CV scores are perfect on a perfect model"
+    coords, data = trend[:2]
+    model = Trend(degree=1)
+    scores = cross_val_score(model, coords, data, scoring=metric)
+    npt.assert_allclose(scores, expected, atol=1e-10)
+
+
+@pytest.mark.parametrize(
+    "metric,expected",
+    [(None, 1), ("r2", 1), (get_scorer("neg_mean_squared_error"), 0)],
+    ids=["none", "R2", "MSE"],
+)
+def test_cross_val_score_vector(trend, metric, expected):
+    "Check that CV works on Vector data types as well"
+    coords, data = trend[:2]
+    model = Vector([Trend(degree=1), Trend(degree=1)])
+    scores = cross_val_score(model, coords, (data, data), scoring=metric)
+    npt.assert_allclose(scores, expected, atol=1e-10)
+
+
+def test_cross_val_score_client(trend):
+    "Test the deprecated dask Client interface"
+    coords, data = trend[:2]
     model = Trend(degree=1)
     nsplits = 5
     cross_validator = ShuffleSplit(n_splits=nsplits, random_state=0)

From 8de5e993ee6da0484488694090f528c88e31715f Mon Sep 17 00:00:00 2001
From: Santiago Soler
Date: Mon, 20 Jul 2020 07:27:27 -0300
Subject: [PATCH 30/38] Format the doc/conf.py file with Black (#275)

Ran black on `doc/conf.py` and added it to the `black` and `flake8` runs in
the main Makefile. Removed the setting of `sys.path` since we're installing
the package in edit mode so it should be importable without that hack.
--- Makefile | 4 +- doc/conf.py | 126 ++++++++++++++++++++++++++++------------------------ 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/Makefile b/Makefile index 62064b27f..9fc244be8 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ PROJECT=verde TESTDIR=tmp-test-dir-with-unique-name PYTEST_ARGS=--cov-config=../.coveragerc --cov-report=term-missing --cov=$(PROJECT) --doctest-modules -v --pyargs LINT_FILES=setup.py $(PROJECT) -BLACK_FILES=setup.py $(PROJECT) examples data/examples tutorials -FLAKE8_FILES=setup.py $(PROJECT) examples data/examples +BLACK_FILES=setup.py $(PROJECT) examples data/examples doc/conf.py tutorials +FLAKE8_FILES=setup.py $(PROJECT) examples data/examples doc/conf.py help: @echo "Commands:" diff --git a/doc/conf.py b/doc/conf.py index eca990d99..01a5dfc0d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -5,25 +5,20 @@ import sphinx_rtd_theme import sphinx_gallery from sphinx_gallery.sorting import FileNameSortKey - -# Sphinx needs to be able to import the package to use autodoc and get the -# version number -sys.path.append(os.path.pardir) - from verde.version import full_version extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.doctest', - 'sphinx.ext.viewcode', - 'sphinx.ext.extlinks', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.doctest", + "sphinx.ext.viewcode", + "sphinx.ext.extlinks", "sphinx.ext.intersphinx", - 'matplotlib.sphinxext.plot_directive', - 'sphinx.ext.napoleon', - 'sphinx_gallery.gen_gallery', + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.napoleon", + "sphinx_gallery.gen_gallery", ] # intersphinx configuration @@ -45,62 +40,65 @@ # Otherwise, the Return parameter list looks different from the Parameters list napoleon_use_rtype = False -# Otherwise, the Attributes parameter list looks different from the Parameters list +# Otherwise, the Attributes parameter list looks different from the Parameters +# list napoleon_use_ivar = True sphinx_gallery_conf = { # path to your examples scripts - 'examples_dirs': ['../examples', '../tutorials', '../data/examples'], + "examples_dirs": ["../examples", "../tutorials", "../data/examples"], # path where to save gallery generated examples - 'gallery_dirs': ['gallery', 'tutorials', 'sample_data'], - 'filename_pattern': '\.py', + "gallery_dirs": ["gallery", "tutorials", "sample_data"], + "filename_pattern": r"\.py", # Remove the "Download all examples" button from the top level gallery - 'download_all_examples': False, + "download_all_examples": False, # Sort gallery example by file name instead of number of lines (default) - 'within_subsection_order': FileNameSortKey, + "within_subsection_order": FileNameSortKey, # directory where function granular galleries are stored - 'backreferences_dir': 'api/generated/backreferences', + "backreferences_dir": "api/generated/backreferences", # Modules for which function level galleries are created. In # this case sphinx_gallery and numpy in a tuple of strings. 
- 'doc_module': 'verde', + "doc_module": "verde", # Insert links to documentation of objects in the examples - 'reference_url': {'verde': None}, + "reference_url": {"verde": None}, } # Always show the source code that generates a plot plot_include_source = True -plot_formats = ['png'] +plot_formats = ["png"] # Sphinx project configuration -templates_path = ['_templates'] -exclude_patterns = ['_build', '**.ipynb_checkpoints'] -source_suffix = '.rst' +templates_path = ["_templates"] +exclude_patterns = ["_build", "**.ipynb_checkpoints"] +source_suffix = ".rst" # The encoding of source files. -source_encoding = 'utf-8-sig' -master_doc = 'index' +source_encoding = "utf-8-sig" +master_doc = "index" # General information about the project year = datetime.date.today().year -project = 'Verde' -copyright = '2017-{}, The Verde Developers'.format(year) -if len(full_version.split('+')) > 1 or full_version == 'unknown': - version = 'dev' +project = "Verde" +copyright = "2017-{}, The Verde Developers".format(year) +if len(full_version.split("+")) > 1 or full_version == "unknown": + version = "dev" else: version = full_version # These enable substitutions using |variable| in the rst files rst_epilog = """ .. |year| replace:: {year} -""".format(year=year) +""".format( + year=year +) -html_last_updated_fmt = '%b %d, %Y' -html_title = 'Verde' -html_short_title = 'Verde' -html_logo = '_static/verde-logo.png' -html_favicon = '_static/favicon.png' -html_static_path = ['_static'] +html_last_updated_fmt = "%b %d, %Y" +html_title = "Verde" +html_short_title = "Verde" +html_logo = "_static/verde-logo.png" +html_favicon = "_static/favicon.png" +html_static_path = ["_static"] html_extra_path = [] -pygments_style = 'default' +pygments_style = "default" add_function_parentheses = False html_show_sourcelink = False html_show_sphinx = True @@ -109,28 +107,42 @@ # Theme config html_theme = "sphinx_rtd_theme" html_theme_options = { - 'logo_only': True, - 'display_version': True, + "logo_only": True, + "display_version": True, } html_context = { - 'menu_links_name': 'Getting help and contributing', - 'menu_links': [ - (' Fatiando a Terra', 'https://www.fatiando.org'), - (' Contributing', 'https://github.com/fatiando/verde/blob/master/CONTRIBUTING.md'), - (' Code of Conduct', 'https://github.com/fatiando/verde/blob/master/CODE_OF_CONDUCT.md'), - (' Contact', 'http://contact.fatiando.org'), - (' Source Code', 'https://github.com/fatiando/verde'), + "menu_links_name": "Getting help and contributing", + "menu_links": [ + ( + ' Fatiando a Terra', + "https://www.fatiando.org", + ), + ( + ' Contributing', + "https://github.com/fatiando/verde/blob/master/CONTRIBUTING.md", + ), + ( + ' Code of Conduct', + "https://github.com/fatiando/verde/blob/master/CODE_OF_CONDUCT.md", + ), + (' Contact', "http://contact.fatiando.org"), + ( + ' Source Code', + "https://github.com/fatiando/verde", + ), ], # Custom variables to enable "Improve this page"" and "Download notebook" # links - 'doc_path': 'doc', - 'galleries': sphinx_gallery_conf['gallery_dirs'], - 'gallery_dir': dict(zip(sphinx_gallery_conf['gallery_dirs'], - sphinx_gallery_conf['examples_dirs'])), - 'github_repo': 'fatiando/verde', - 'github_version': 'master', + "doc_path": "doc", + "galleries": sphinx_gallery_conf["gallery_dirs"], + "gallery_dir": dict( + zip(sphinx_gallery_conf["gallery_dirs"], sphinx_gallery_conf["examples_dirs"]) + ), + "github_repo": "fatiando/verde", + "github_version": "master", } + # Load the custom CSS files (needs sphinx >= 1.6 for this 
to work) def setup(app): app.add_stylesheet("style.css") From a1a4a3fcb633db34c488208f20b605b813a835e0 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Thu, 10 Sep 2020 14:16:04 -0300 Subject: [PATCH 31/38] Require Black >=20.8b1 (#284) After version 19, black changed the formatting slightly in a few files. This updates the format and sets a minimum version of black in the development environment and the CI. --- .azure-pipelines.yml | 2 +- environment.yml | 2 +- examples/blockkfold.py | 5 ++++- examples/train_test_split.py | 6 +++++- examples/trend.py | 6 +++++- tutorials/model_selection.py | 5 ++++- verde/base/base_classes.py | 5 ++++- verde/mask.py | 5 ++++- verde/tests/test_base.py | 13 ++++++++++--- verde/tests/test_mask.py | 4 +++- verde/tests/test_model_selection.py | 4 +++- 11 files changed, 44 insertions(+), 13 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index f00e588e4..c31749e1c 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -27,7 +27,7 @@ jobs: vmImage: 'ubuntu-16.04' variables: - CONDA_INSTALL_EXTRA: "black flake8 pylint==2.4.*" + CONDA_INSTALL_EXTRA: "black>=20.8b1 flake8 pylint==2.4.*" PYTHON: '3.7' steps: diff --git a/environment.yml b/environment.yml index ec8804f8c..7546ce8f4 100644 --- a/environment.yml +++ b/environment.yml @@ -23,7 +23,7 @@ dependencies: - pytest-cov - pytest-mpl - coverage - - black + - black>=20.8b1 - pylint=2.4.* - flake8 - sphinx=2.2.1 diff --git a/examples/blockkfold.py b/examples/blockkfold.py index 15ae001a7..3cb8df5f9 100644 --- a/examples/blockkfold.py +++ b/examples/blockkfold.py @@ -42,7 +42,10 @@ # balancing, the folds can have very different number of data points in them # (which may bias the scores). kfold_unbalanced = vd.BlockKFold( - spacing=spacing, shuffle=True, random_state=random_state, balance=False, + spacing=spacing, + shuffle=True, + random_state=random_state, + balance=False, ) # The BlockKFold is compatible with scikit-learn, so instead of giving it a diff --git a/examples/train_test_split.py b/examples/train_test_split.py index 88942fb97..8e3ce4d15 100644 --- a/examples/train_test_split.py +++ b/examples/train_test_split.py @@ -47,7 +47,11 @@ # blocks randomly. To do this, specify the size of the blocks using the # 'spacing' argument. train_block, test_block = vd.train_test_split( - coordinates, values, spacing=10 / 60, test_size=test_size, random_state=213, + coordinates, + values, + spacing=10 / 60, + test_size=test_size, + random_state=213, ) # Verde will automatically attempt to balance the data between the splits so # that the desired amount is assigned to the test set. It won't be exact since diff --git a/examples/trend.py b/examples/trend.py index f301150de..788615ef2 100644 --- a/examples/trend.py +++ b/examples/trend.py @@ -72,7 +72,11 @@ def plot_data(column, i, title): # Add a single colorbar on top of the histogram plot where there is some space cax = plt.axes((0.35, 0.44, 0.10, 0.01)) -cb = plt.colorbar(mappable, cax=cax, orientation="horizontal",) +cb = plt.colorbar( + mappable, + cax=cax, + orientation="horizontal", +) cb.set_label("C") plt.tight_layout() diff --git a/tutorials/model_selection.py b/tutorials/model_selection.py index 3cc5c1925..8f0bbefa2 100644 --- a/tutorials/model_selection.py +++ b/tutorials/model_selection.py @@ -100,7 +100,10 @@ # automatically when fitting a dataset. 
The only difference is that you must provide a # list of ``damping`` and ``mindist`` parameters to try instead of only a single value: -spline = vd.SplineCV(dampings=dampings, mindists=mindists,) +spline = vd.SplineCV( + dampings=dampings, + mindists=mindists, +) ######################################################################################## # Calling :meth:`~verde.SplineCV.fit` will run a grid search over all parameter diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 5400102f5..cfff11e6c 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -35,7 +35,10 @@ class BaseBlockCrossValidator(BaseCrossValidator, metaclass=ABCMeta): """ def __init__( - self, spacing=None, shape=None, n_splits=10, + self, + spacing=None, + shape=None, + n_splits=10, ): if spacing is None and shape is None: raise ValueError("Either 'spacing' or 'shape' must be provided.") diff --git a/verde/mask.py b/verde/mask.py index 6d92cb46d..c023d5957 100644 --- a/verde/mask.py +++ b/verde/mask.py @@ -110,7 +110,10 @@ def distance_mask( def convexhull_mask( - data_coordinates, coordinates=None, grid=None, projection=None, + data_coordinates, + coordinates=None, + grid=None, + projection=None, ): """ Mask grid points that are outside the convex hull of the given data points. diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 7d8f7cdea..9a148b63e 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -138,7 +138,8 @@ def test_basegridder(): npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) npt.assert_allclose(grd.scatter(region, 1000, random_state=0).scalars, data) npt.assert_allclose( - prof.scalars, angular * coordinates_true[0][0, :] + linear, + prof.scalars, + angular * coordinates_true[0][0, :] + linear, ) npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) npt.assert_allclose(prof.northing, coordinates_true[1][0, :]) @@ -245,7 +246,10 @@ def test_basegridder_extra_coords(): # Test profile with a single extra coord extra_coords = 9 prof = grd.profile( - (region[0], region[-1]), (region[1], region[-1]), 51, extra_coords=extra_coords, + (region[0], region[-1]), + (region[1], region[-1]), + 51, + extra_coords=extra_coords, ) assert "extra_coord" in prof.columns npt.assert_allclose(prof["extra_coord"], extra_coords) @@ -253,7 +257,10 @@ def test_basegridder_extra_coords(): # Test profile with multiple extra coord extra_coords = [9, 18, 27] prof = grd.profile( - (region[0], region[-1]), (region[1], region[-1]), 51, extra_coords=extra_coords, + (region[0], region[-1]), + (region[1], region[-1]), + 51, + extra_coords=extra_coords, ) extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] for name, coord in zip(extra_coord_names, extra_coords): diff --git a/verde/tests/test_mask.py b/verde/tests/test_mask.py index 5a7bd2577..bd9b53837 100644 --- a/verde/tests/test_mask.py +++ b/verde/tests/test_mask.py @@ -36,7 +36,9 @@ def test_convexhull_mask_projection(): # For a linear projection, the result should be the same since there is no # area change in the data. 
mask = convexhull_mask( - data_coords, coordinates=coords, projection=lambda e, n: (10 * e, 10 * n), + data_coords, + coordinates=coords, + projection=lambda e, n: (10 * e, 10 * n), ) true = [ [False, False, False, False, False, False], diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py index 5b52c1f59..9c2c7995a 100644 --- a/verde/tests/test_model_selection.py +++ b/verde/tests/test_model_selection.py @@ -73,7 +73,9 @@ def test_blockshufflesplit_fails_balancing(): def test_blockshufflesplit_balancing(test_size): "Make sure that the sets have the right number of points" coords = np.random.RandomState(seed=0).multivariate_normal( - mean=[5, -7.5], cov=[[4, 0], [0, 9]], size=1000, + mean=[5, -7.5], + cov=[[4, 0], [0, 9]], + size=1000, ) npoints = coords.shape[0] train_size = 1 - test_size From 90ac8425b6cf35959141c8f2bd3200001b7dac4a Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Thu, 10 Sep 2020 15:51:55 -0300 Subject: [PATCH 32/38] Fix Cartopy issue and require >= 0.18 (#283) Unpin Cartopy because Cartopy 0.17.0 has a compatibility issue with Matplotlib > 3.3.0. Remove the unneeded tight_layout that was causing errors. Replace the filled polygons used for plotting land and ocean for the coastlines. --- data/examples/baja_bathymetry.py | 1 - data/examples/california_gps.py | 1 - data/examples/rio_magnetic.py | 1 - data/examples/texas-wind.py | 1 - environment.yml | 2 +- examples/blockreduce.py | 1 - examples/blockreduce_weights.py | 1 - examples/blockreduce_weights_mean.py | 1 - examples/convex_hull_mask.py | 1 - examples/distance_mask.py | 1 - examples/scipygridder.py | 1 - examples/spline.py | 1 - examples/spline_cv.py | 1 - examples/spline_weights.py | 1 - examples/trend.py | 1 - examples/vector_trend.py | 1 - examples/vector_uncoupled.py | 1 - requirements-dev.txt | 2 +- tutorials/chain.py | 4 -- tutorials/decimation.py | 3 - tutorials/model_evaluation.py | 2 - tutorials/model_selection.py | 1 - tutorials/trends.py | 1 - tutorials/vectors.py | 5 -- tutorials/weights.py | 2 - verde/datasets/sample_data.py | 91 ++++++++++++++-------------- verde/tests/test_datasets.py | 19 ++++++ 27 files changed, 66 insertions(+), 82 deletions(-) diff --git a/data/examples/baja_bathymetry.py b/data/examples/baja_bathymetry.py index 6671478a4..192376e73 100644 --- a/data/examples/baja_bathymetry.py +++ b/data/examples/baja_bathymetry.py @@ -32,5 +32,4 @@ plt.colorbar().set_label("meters") # Use an utility function to add tick labels and land and ocean features to the map. 
vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/data/examples/california_gps.py b/data/examples/california_gps.py index ef5cbbb0e..319cccb1b 100644 --- a/data/examples/california_gps.py +++ b/data/examples/california_gps.py @@ -51,5 +51,4 @@ ) plt.colorbar(tmp, ax=ax).set_label("meters/year") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout(w_pad=0) plt.show() diff --git a/data/examples/rio_magnetic.py b/data/examples/rio_magnetic.py index c8618a43d..c731bc5d1 100644 --- a/data/examples/rio_magnetic.py +++ b/data/examples/rio_magnetic.py @@ -47,5 +47,4 @@ plt.colorbar(pad=0.01).set_label("nT") # Set the proper ticks for a Cartopy map vd.datasets.setup_rio_magnetic_map(ax) -plt.tight_layout() plt.show() diff --git a/data/examples/texas-wind.py b/data/examples/texas-wind.py index 300bf9dd1..c01aa2f86 100644 --- a/data/examples/texas-wind.py +++ b/data/examples/texas-wind.py @@ -39,5 +39,4 @@ ) # Use an utility function to add tick labels and land and ocean features to the map. vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() diff --git a/environment.yml b/environment.yml index 7546ce8f4..e5b5547a1 100644 --- a/environment.yml +++ b/environment.yml @@ -18,7 +18,7 @@ dependencies: # Development requirements - pyproj - matplotlib - - cartopy==0.17.* + - cartopy>=0.18 - pytest - pytest-cov - pytest-mpl diff --git a/examples/blockreduce.py b/examples/blockreduce.py index c9d7880d0..35fc1be83 100644 --- a/examples/blockreduce.py +++ b/examples/blockreduce.py @@ -36,5 +36,4 @@ plt.colorbar().set_label("meters") # Use a utility function to setup the tick labels and land feature vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/blockreduce_weights.py b/examples/blockreduce_weights.py index 481a45943..c24da5294 100644 --- a/examples/blockreduce_weights.py +++ b/examples/blockreduce_weights.py @@ -67,5 +67,4 @@ cb.set_label("vertical velocity [m/yr]") vd.datasets.setup_california_gps_map(ax) ax.legend(loc="lower left") -plt.tight_layout() plt.show() diff --git a/examples/blockreduce_weights_mean.py b/examples/blockreduce_weights_mean.py index bef4cced9..de2887d26 100644 --- a/examples/blockreduce_weights_mean.py +++ b/examples/blockreduce_weights_mean.py @@ -83,5 +83,4 @@ ) plt.colorbar(pc, ax=ax, orientation="horizontal", pad=0.05).set_label("mm/yr") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/convex_hull_mask.py b/examples/convex_hull_mask.py index 438e55842..e2286fc16 100644 --- a/examples/convex_hull_mask.py +++ b/examples/convex_hull_mask.py @@ -47,5 +47,4 @@ ax.plot(data.longitude, data.latitude, ".y", markersize=0.5, transform=crs) ax.pcolormesh(*coordinates, dummy_data, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax, land=None) -plt.tight_layout() plt.show() diff --git a/examples/distance_mask.py b/examples/distance_mask.py index 4ef7e3c1a..bcc7e85cc 100644 --- a/examples/distance_mask.py +++ b/examples/distance_mask.py @@ -48,5 +48,4 @@ ax.plot(data.longitude, data.latitude, ".y", markersize=0.5, transform=crs) ax.pcolormesh(*coordinates, dummy_data, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax, land=None) -plt.tight_layout() plt.show() diff --git a/examples/scipygridder.py b/examples/scipygridder.py index c8bfe364e..3bbd8e83c 100644 --- a/examples/scipygridder.py +++ b/examples/scipygridder.py @@ -81,5 +81,4 @@ ax.plot(*coordinates, ".k", markersize=0.5, transform=crs) # Use an utility function to setup the tick 
labels and the land feature vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/spline.py b/examples/spline.py index 13f8b4cd1..bf5e27228 100644 --- a/examples/spline.py +++ b/examples/spline.py @@ -84,5 +84,4 @@ plt.colorbar(tmp).set_label("Air temperature (C)") # Use an utility function to add tick labels and land and ocean features to the map. vd.datasets.setup_texas_wind_map(ax, region=region) -plt.tight_layout() plt.show() diff --git a/examples/spline_cv.py b/examples/spline_cv.py index a5bea4b29..9818d933e 100644 --- a/examples/spline_cv.py +++ b/examples/spline_cv.py @@ -76,5 +76,4 @@ plt.colorbar(tmp).set_label("Air temperature (C)") # Use an utility function to add tick labels and land and ocean features to the map. vd.datasets.setup_texas_wind_map(ax, region=region) -plt.tight_layout() plt.show() diff --git a/examples/spline_weights.py b/examples/spline_weights.py index 171fa83fd..b246d67ef 100644 --- a/examples/spline_weights.py +++ b/examples/spline_weights.py @@ -104,5 +104,4 @@ ax.scatter(*coordinates, c="black", s=0.5, alpha=0.1, transform=crs) vd.datasets.setup_california_gps_map(ax, region=region) ax.coastlines() -plt.tight_layout() plt.show() diff --git a/examples/trend.py b/examples/trend.py index 788615ef2..36431b351 100644 --- a/examples/trend.py +++ b/examples/trend.py @@ -79,5 +79,4 @@ def plot_data(column, i, title): ) cb.set_label("C") -plt.tight_layout() plt.show() diff --git a/examples/vector_trend.py b/examples/vector_trend.py index b59f555ab..0384609d2 100644 --- a/examples/vector_trend.py +++ b/examples/vector_trend.py @@ -86,5 +86,4 @@ ) ax.coastlines(color="white") axes[0].legend(loc="lower left") -plt.tight_layout() plt.show() diff --git a/examples/vector_uncoupled.py b/examples/vector_uncoupled.py index 6f641940a..05e160c84 100644 --- a/examples/vector_uncoupled.py +++ b/examples/vector_uncoupled.py @@ -96,5 +96,4 @@ ax.legend(loc="lower left") # Use an utility function to add tick labels and land and ocean features to the map. 
vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() diff --git a/requirements-dev.txt b/requirements-dev.txt index 85d0f586a..d96576a4e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # Development requirements pyproj matplotlib -cartopy=0.17.* +cartopy>=0.18 pytest pytest-cov pytest-mpl diff --git a/tutorials/chain.py b/tutorials/chain.py index aa3ed1d93..f4d1c46b7 100644 --- a/tutorials/chain.py +++ b/tutorials/chain.py @@ -54,7 +54,6 @@ ) plt.colorbar().set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -95,7 +94,6 @@ plt.hist(residuals, bins="auto", density=True) plt.xlabel("residuals (m)") plt.xlim(-1500, 1500) -plt.tight_layout() plt.show() ######################################################################################## @@ -123,7 +121,6 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -155,5 +152,4 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/tutorials/decimation.py b/tutorials/decimation.py index 5b0b23335..fb9e72c78 100644 --- a/tutorials/decimation.py +++ b/tutorials/decimation.py @@ -24,7 +24,6 @@ # Plot the bathymetry data locations as black dots plt.plot(data.longitude, data.latitude, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -57,7 +56,6 @@ # Plot the bathymetry data locations as black dots plt.plot(*coordinates, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() @@ -79,7 +77,6 @@ # Plot the bathymetry data locations as black dots plt.plot(*coordinates_center, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/model_evaluation.py b/tutorials/model_evaluation.py index 42f2623c6..5e9674db6 100644 --- a/tutorials/model_evaluation.py +++ b/tutorials/model_evaluation.py @@ -74,7 +74,6 @@ ax.plot(test[0][0], test[0][1], ".b", label="test") ax.legend() ax.set_aspect("equal") -plt.tight_layout() plt.show() ######################################################################################## @@ -120,7 +119,6 @@ plt.colorbar(pc).set_label("C") ax.plot(data.longitude, data.latitude, ".k", markersize=1, transform=ccrs.PlateCarree()) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/model_selection.py b/tutorials/model_selection.py index 8f0bbefa2..ebcbe2cd4 100644 --- a/tutorials/model_selection.py +++ b/tutorials/model_selection.py @@ -208,7 +208,6 @@ data.longitude, data.latitude, ".k", markersize=1, transform=ccrs.PlateCarree() ) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/trends.py b/tutorials/trends.py index 644c0bce3..35533c6e9 100644 --- a/tutorials/trends.py +++ b/tutorials/trends.py @@ -77,7 +77,6 @@ ) plt.colorbar(tmp, 
ax=ax, orientation="horizontal", pad=0.08) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/vectors.py b/tutorials/vectors.py index 63797fe61..bff5f0721 100644 --- a/tutorials/vectors.py +++ b/tutorials/vectors.py @@ -35,7 +35,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("GPS horizontal velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() @@ -87,7 +86,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("Block mean velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -142,7 +140,6 @@ ax.hist(data.velocity_east - pred_east, bins="auto", label="East", alpha=0.7) ax.legend() ax.set_xlabel("Velocity (m/yr)") -plt.tight_layout() plt.show() ######################################################################################## @@ -190,7 +187,6 @@ cb.set_label("meters/year") vd.datasets.setup_california_gps_map(ax, land=None, ocean=None) ax.coastlines(color="white") -plt.tight_layout() plt.show() ######################################################################################## @@ -277,7 +273,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("Gridded velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/weights.py b/tutorials/weights.py index 74f5c5c01..46569e07a 100644 --- a/tutorials/weights.py +++ b/tutorials/weights.py @@ -53,7 +53,6 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): ) plt.colorbar(pc, ax=ax, orientation="horizontal", pad=0.05) vd.datasets.setup_california_gps_map(ax) - plt.tight_layout() plt.show() @@ -305,5 +304,4 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): ax.plot(data.longitude, data.latitude, ".k", markersize=0.1, transform=crs) ax.coastlines() vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() diff --git a/verde/datasets/sample_data.py b/verde/datasets/sample_data.py index 76f7196ec..67ac00d7f 100644 --- a/verde/datasets/sample_data.py +++ b/verde/datasets/sample_data.py @@ -9,7 +9,6 @@ import pooch try: - import cartopy.feature as cfeature import cartopy.crs as ccrs from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter except ImportError: @@ -54,20 +53,18 @@ def locate(): def _setup_map( - ax, xticks, yticks, crs, region, land=None, ocean=None, borders=None, states=None + ax, + xticks, + yticks, + crs, + region, + coastlines=False, ): """ - Setup a Cartopy map with land and ocean features and proper tick labels. + Setup a Cartopy map with coastlines and proper tick labels. 
""" - - if land is not None: - ax.add_feature(cfeature.LAND, facecolor=land) - if ocean is not None: - ax.add_feature(cfeature.OCEAN, facecolor=ocean) - if borders is not None: - ax.add_feature(cfeature.BORDERS, linewidth=borders) - if states is not None: - ax.add_feature(cfeature.STATES, linewidth=states) + if coastlines: + ax.coastlines() ax.set_extent(region, crs=crs) # Set the proper ticks for a Cartopy map ax.set_xticks(xticks, crs=crs) @@ -103,7 +100,7 @@ def fetch_baja_bathymetry(): def setup_baja_bathymetry_map( - ax, region=(245.0, 254.705, 20.0, 29.99), land="gray", ocean=None + ax, region=(245.0, 254.705, 20.0, 29.99), coastlines=True, **kwargs ): """ Setup a Cartopy map for the Baja California bathymetry dataset. @@ -114,22 +111,27 @@ def setup_baja_bathymetry_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_baja_bathymetry: Sample bathymetry data from Baja California. """ + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." + ) _setup_map( ax, xticks=np.arange(-114, -105, 2), yticks=np.arange(21, 30, 2), - land=land, - ocean=ocean, + coastlines=coastlines, region=region, crs=ccrs.PlateCarree(), ) @@ -205,10 +207,6 @@ def setup_rio_magnetic_map(ax, region=(-42.6, -42, -22.5, -22)): The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. See also -------- @@ -224,8 +222,6 @@ def setup_rio_magnetic_map(ax, region=(-42.6, -42, -22.5, -22)): ax, xticks=np.arange(-42.5, -42, 0.1), yticks=np.arange(-22.5, -21.99, 0.1), - land=None, - ocean=None, region=region, crs=ccrs.PlateCarree(), ) @@ -268,7 +264,7 @@ def fetch_california_gps(): def setup_california_gps_map( - ax, region=(235.2, 245.3, 31.9, 42.3), land="gray", ocean="skyblue" + ax, region=(235.2, 245.3, 31.9, 42.3), coastlines=True, **kwargs ): """ Setup a Cartopy map for the California GPS velocity dataset. @@ -279,22 +275,27 @@ def setup_california_gps_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_california_gps: Sample GPS velocity data from California. """ + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." 
+ ) _setup_map( ax, xticks=np.arange(-124, -115, 4), yticks=np.arange(33, 42, 2), - land=land, - ocean=ocean, + coastlines=coastlines, region=region, crs=ccrs.PlateCarree(), ) @@ -328,9 +329,7 @@ def fetch_texas_wind(): return data -def setup_texas_wind_map( - ax, region=(-107, -93, 25.5, 37), land="#dddddd", borders=0.5, states=0.1 -): +def setup_texas_wind_map(ax, region=(-107, -93, 25.5, 37), coastlines=True, **kwargs): """ Setup a Cartopy map for the Texas wind speed and air temperature dataset. @@ -340,27 +339,27 @@ def setup_texas_wind_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - borders : float or None - Line width of the country borders. - states : float or None - Line width of the state borders. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_texas_wind: Sample wind speed and air temperature data for Texas. """ - + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." + ) _setup_map( ax, xticks=np.arange(-106, -92, 3), yticks=np.arange(27, 38, 3), - land=land, - ocean=None, + coastlines=coastlines, region=region, - borders=borders, - states=states, crs=ccrs.PlateCarree(), ) diff --git a/verde/tests/test_datasets.py b/verde/tests/test_datasets.py index f7311e713..ed285c0fc 100644 --- a/verde/tests/test_datasets.py +++ b/verde/tests/test_datasets.py @@ -2,6 +2,7 @@ Test data fetching routines. """ import os +import warnings import matplotlib.pyplot as plt import cartopy.crs as ccrs @@ -130,3 +131,21 @@ def test_setup_california_gps(): ax = plt.subplot(111, projection=ccrs.Mercator()) setup_california_gps_map(ax) return fig + + +def test_setup_cartopy_backward(): + """ + Test backward compatibility of setup map functions + + Check if a warning is raise after passing deprecated parameters like ocean, + land, borders and states to functions to setup maps. + """ + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_texas_wind_map(ax, land="#dddddd", borders=0.5, states=0.1) + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_california_gps_map(ax, land="gray", ocean="skyblue") + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_baja_bathymetry_map(ax, land="gray", ocean=None) From b807423014d603746eb10778d919bde30a831e41 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Thu, 10 Sep 2020 17:06:39 -0300 Subject: [PATCH 33/38] Fix typos on docstrings (#281) Fix typo on description of data_names parameters on BaseGridder.grid method. Fix typo on description of BaseGridder.project_coordinates method. --- verde/base/base_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index cfff11e6c..cc28f1644 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -380,7 +380,7 @@ def grid( order: northing dimension, easting dimension. 
**NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : list or None The name(s) of the data variables in the output grid. Defaults to ``['scalars']`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and @@ -659,7 +659,7 @@ def _get_extra_coords_names(self, coordinates): def project_coordinates(coordinates, projection, **kwargs): """ - Apply projection to given coordiantes + Apply projection to given coordinates Allows to apply projections to any number of coordinates, assuming that the first ones are ``longitude`` and ``latitude``. From 8302ca390658d7212fd5c527feeae23365c58948 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 11 Sep 2020 10:02:40 -0300 Subject: [PATCH 34/38] Raise errors for invalid rolling windows arguments (#280) Raise a specific error if no spacing or shape is passed to rolling_window(). Raise a specific error if window size is larger than the smaller dimension of the region. Add tests for these new methods that include matching error messages to identify which error has been raised. --- verde/coordinates.py | 10 ++++++++++ verde/tests/test_coordinates.py | 34 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/verde/coordinates.py b/verde/coordinates.py index b43ebd3ca..941df0aeb 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -1015,11 +1015,21 @@ def rolling_window( [20. 20. 20. 20. 20. 20. 20. 20. 20.] """ + # Check if shape or spacing were passed + if shape is None and spacing is None: + raise ValueError("Either a shape or a spacing must be provided.") # Select the coordinates after checking to make sure indexing will still # work on the ignored coordinates. coordinates = check_coordinates(coordinates)[:2] if region is None: region = get_region(coordinates) + # Check if window size is bigger than the minimum dimension of the region + region_min_width = min(region[1] - region[0], region[3] - region[2]) + if region_min_width < size: + raise ValueError( + "Window size '{}' is larger ".format(size) + + "than dimensions of the region '{}'.".format(region) + ) # Calculate the region spanning the centers of the rolling windows window_region = [ dimension + (-1) ** (i % 2) * size / 2 for i, dimension in enumerate(region) diff --git a/verde/tests/test_coordinates.py b/verde/tests/test_coordinates.py index ec38c469b..029933729 100644 --- a/verde/tests/test_coordinates.py +++ b/verde/tests/test_coordinates.py @@ -58,6 +58,40 @@ def test_rolling_window_warnings(): assert str(userwarnings[-1].message).split()[0] == "Rolling" +def test_rolling_window_no_shape_or_spacing(): + """ + Check if error is raise if no shape or spacing is passed + """ + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + err_msg = "Either a shape or a spacing must be provided." 
+ with pytest.raises(ValueError, match=err_msg): + rolling_window(coords, size=2) + + +def test_rolling_window_oversized_window(): + """ + Check if error is raised if size larger than region is passed + """ + oversize = 5 + regions = [ + (-5, -1, 6, 20), # window larger than west-east + (-20, -1, 6, 10), # window larger than south-north + (-5, -1, 6, 10), # window larger than both dims + ] + for region in regions: + coords = grid_coordinates(region, spacing=1) + # The expected error message with regex + # (the long expression intends to capture floats and ints) + float_regex = r"[+-]?([0-9]*[.])?[0-9]+" + err_msg = ( + r"Window size '{}' is larger ".format(float_regex) + + r"than dimensions of the region " + + r"'\({0}, {0}, {0}, {0}\)'.".format(float_regex) + ) + with pytest.raises(ValueError, match=err_msg): + rolling_window(coords, size=oversize, spacing=2) + + def test_spacing_to_shape(): "Check that correct spacing and region are returned" region = (-10, 0, 0, 5) From bfe030d780b0a5a243c570f5a9293ac7886309b0 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 19 Oct 2020 10:45:29 +0100 Subject: [PATCH 35/38] Remove configuration files for unused CI (#292) We no longer use Stickler, Codacy, or CodeClimate. So there is no reason to keep these files around. --- .codacy.yml | 6 ------ .codeclimate.yml | 35 ----------------------------------- .stickler.yml | 12 ------------ 3 files changed, 53 deletions(-) delete mode 100644 .codacy.yml delete mode 100644 .codeclimate.yml delete mode 100644 .stickler.yml diff --git a/.codacy.yml b/.codacy.yml deleted file mode 100644 index 807a14cd1..000000000 --- a/.codacy.yml +++ /dev/null @@ -1,6 +0,0 @@ -exclude_paths: - - 'doc/**' - - 'versioneer.py' - - '**/_version.py' - - '**/__init__.py' - - '**/tests/**' diff --git a/.codeclimate.yml b/.codeclimate.yml deleted file mode 100644 index 8396c0c64..000000000 --- a/.codeclimate.yml +++ /dev/null @@ -1,35 +0,0 @@ -version: "2" - -checks: - file-lines: - enabled: false - similar-code: - enabled: false - method-lines: - enabled: false - argument-count: - config: - threshold: 10 - method-complexity: - config: - threshold: 6 - -plugins: - sonar-python: - enabled: true - config: - test_patterns: - - "verde/**/tests/*.py" - checks: - python:S3776: - enabled: false - python:S107: - enabled: false - python:S1542: - enabled: false - -exclude_paths: -- "versioneer.py" -- "verde/_version.py" -- "doc/**/*" -- "verde/tests/data/*" diff --git a/.stickler.yml b/.stickler.yml deleted file mode 100644 index 3ddff7ba9..000000000 --- a/.stickler.yml +++ /dev/null @@ -1,12 +0,0 @@ -linters: - flake8: - python: 3 - enable: true - ignore: E203, E266, E501, W503, F401, E741 - max-line-length: 88 - shellcheck: - shell: bash - csslint: - enable: false -files: - ignore: ['versioneer.py', 'verde/_version.py', 'doc/conf.py'] From c2a2fc87223b5159dd164ece33b692e161e26bb9 Mon Sep 17 00:00:00 2001 From: Fatiando a Terra Bot <50936856+fatiando-bot@users.noreply.github.com> Date: Tue, 27 Oct 2020 13:35:52 +0000 Subject: [PATCH 36/38] Update files from fatiando/contributing (#294) Changes have been made in https://github.com/fatiando/contributing to: CONTRIBUTING.md Update the copies in this repository to match. 
See https://github.com/fatiando/contributing/commit/e899f3f4f4b4823b2f877405a2f8a2e3999b3890 --- CONTRIBUTING.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6099ac79..ad43271f4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,6 +125,15 @@ hesitate to [ask questions](#how-can-i-talk-to-you)): * Aaron Meurer's [tutorial on the git workflow](http://www.asmeurer.com/git-workflow/) * [How to Contribute to an Open Source Project on GitHub](https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github) +If you're new to working with git, GitHub, and the Unix Shell, we recommend +starting with the [Software Carpentry](https://software-carpentry.org/) lessons, +which are available in English and Spanish: + +* :gb: [Version Control with Git](http://swcarpentry.github.io/git-novice/) / :es: [Control de +versiones con Git](https://swcarpentry.github.io/git-novice-es/) +* :gb: [The Unix Shell](http://swcarpentry.github.io/shell-novice/) / :es: +[La Terminal de Unix](https://swcarpentry.github.io/shell-novice-es/) + ### General guidelines We follow the [git pull request workflow](http://www.asmeurer.com/git-workflow/) to From 5868d08f23b2d4a2594dd909bc6084177577a69c Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 27 Oct 2020 15:00:55 -0300 Subject: [PATCH 37/38] Add function to convert grid to Dataset (#282) Add a function that converts coordinates and data as np.arrays to xarray.Dataset. This would make handling grids easier, with the option of efficiently saving them as netCDF files. Make BaseGridder.grid method to use this function instead of building the xarray.Dataset by itself. Add new check_data_names function that converts a str into a tuple with a single str. Make BaseGridder.grid, BaseGridder.scatter and BaseGridder.profile able to take a single str for the data_names argument. Add test function for default data_names. Change data_names on all examples and tutorials to a single str. 
--- examples/project_grid.py | 2 +- examples/scipygridder.py | 2 +- examples/spline.py | 2 +- examples/spline_cv.py | 2 +- tutorials/chain.py | 4 +- tutorials/model_evaluation.py | 2 +- tutorials/model_selection.py | 4 +- tutorials/overview.py | 2 +- tutorials/projections.py | 4 +- tutorials/weights.py | 4 +- verde/__init__.py | 2 +- verde/base/base_classes.py | 43 ++++++---- verde/base/utils.py | 24 ++++++ verde/tests/test_base.py | 17 ++++ verde/tests/test_utils.py | 122 ++++++++++++++++++++++++++- verde/utils.py | 150 +++++++++++++++++++++++++++++++++- 16 files changed, 352 insertions(+), 34 deletions(-) diff --git a/examples/project_grid.py b/examples/project_grid.py index 5e80b4915..2ac9affa4 100644 --- a/examples/project_grid.py +++ b/examples/project_grid.py @@ -51,7 +51,7 @@ region=region, spacing=spacing, projection=projection, - data_names=["checkerboard"], + data_names="checkerboard", dims=("latitude", "longitude"), ) print("Geographic grid:") diff --git a/examples/scipygridder.py b/examples/scipygridder.py index 3bbd8e83c..fbc57cee4 100644 --- a/examples/scipygridder.py +++ b/examples/scipygridder.py @@ -58,7 +58,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry_m"], + data_names="bathymetry_m", ) print("Generated geographic grid:") print(grid) diff --git a/examples/spline.py b/examples/spline.py index bf5e27228..19e42dee9 100644 --- a/examples/spline.py +++ b/examples/spline.py @@ -66,7 +66,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) grid = vd.distance_mask( coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection diff --git a/examples/spline_cv.py b/examples/spline_cv.py index 9818d933e..a880fdd77 100644 --- a/examples/spline_cv.py +++ b/examples/spline_cv.py @@ -59,7 +59,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) grid = vd.distance_mask( coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection diff --git a/tutorials/chain.py b/tutorials/chain.py index f4d1c46b7..f0ac27ecc 100644 --- a/tutorials/chain.py +++ b/tutorials/chain.py @@ -106,7 +106,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print(grid) @@ -140,7 +140,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print(grid_trend) diff --git a/tutorials/model_evaluation.py b/tutorials/model_evaluation.py index 5e9674db6..8a8252686 100644 --- a/tutorials/model_evaluation.py +++ b/tutorials/model_evaluation.py @@ -91,7 +91,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) print(grid) diff --git a/tutorials/model_selection.py b/tutorials/model_selection.py index ebcbe2cd4..cb0571a83 100644 --- a/tutorials/model_selection.py +++ b/tutorials/model_selection.py @@ -128,7 +128,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) print(grid) @@ -174,7 +174,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) ######################################################################################## diff --git a/tutorials/overview.py 
b/tutorials/overview.py index 9ff5f6947..ea252bade 100644 --- a/tutorials/overview.py +++ b/tutorials/overview.py @@ -121,7 +121,7 @@ # and "northing") and data variables ("scalars"). You can overwrite these names by # setting the ``dims`` and ``data_names`` arguments. -grid = spline.grid(spacing=30, dims=["latitude", "longitude"], data_names=["gravity"]) +grid = spline.grid(spacing=30, dims=["latitude", "longitude"], data_names="gravity") print(grid) plt.figure() diff --git a/tutorials/projections.py b/tutorials/projections.py index 0d6f34b56..008f9f1b0 100644 --- a/tutorials/projections.py +++ b/tutorials/projections.py @@ -60,7 +60,7 @@ ######################################################################################## # If we now call :meth:`verde.Spline.grid` we'll get back a grid evenly spaced in # projected Cartesian coordinates. -grid = spline.grid(spacing=spacing * 111e3, data_names=["bathymetry"]) +grid = spline.grid(spacing=spacing * 111e3, data_names="bathymetry") print("Cartesian grid:") print(grid) @@ -105,7 +105,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print("Geographic grid:") print(grid_geo) diff --git a/tutorials/weights.py b/tutorials/weights.py index 46569e07a..073f84558 100644 --- a/tutorials/weights.py +++ b/tutorials/weights.py @@ -243,7 +243,7 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["velocity"], + data_names="velocity", ) # Avoid showing interpolation outside of the convex hull of the data points. grid = vd.convexhull_mask(coordinates, grid=grid, projection=projection) @@ -261,7 +261,7 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["velocity"], + data_names="velocity", ) grid_unweighted = vd.convexhull_mask( coordinates, grid=grid_unweighted, projection=projection diff --git a/verde/__init__.py b/verde/__init__.py index f80b3e34b..9f59575e2 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -15,7 +15,7 @@ longitude_continuity, ) from .mask import distance_mask, convexhull_mask -from .utils import variance_to_weights, maxabs, grid_to_table +from .utils import variance_to_weights, maxabs, grid_to_table, make_xarray_grid from .io import load_surfer from .distances import median_distance from .blockreduce import BlockReduce, BlockMean diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index cc28f1644..18f599b68 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -3,13 +3,13 @@ """ from abc import ABCMeta, abstractmethod -import xarray as xr import pandas as pd from sklearn.base import BaseEstimator from sklearn.model_selection import BaseCrossValidator from ..coordinates import grid_coordinates, profile_coordinates, scatter_points -from .utils import check_data, score_estimator +from .utils import check_data, check_data_names, score_estimator +from ..utils import make_xarray_grid # Pylint doesn't like X, y scikit-learn argument names. @@ -380,9 +380,9 @@ def grid( order: northing dimension, easting dimension. **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list or None + data_names : str, list or None The name(s) of the data variables in the output grid. 
Defaults to - ``['scalars']`` for scalar data, + ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. @@ -415,17 +415,24 @@ def grid( data = check_data( self.predict(project_coordinates(coordinates, projection)) ) + # Get names for data and any extra coordinates + data_names = check_data_names(data_names) data_names = get_data_names(data, data_names) - coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} - # Add extra coordinates to xr.Dataset as non-dimension coordinates extra_coords_names = self._get_extra_coords_names(coordinates) - for name, extra_coord in zip(extra_coords_names, coordinates[2:]): - coords[name] = (dims, extra_coord) - attrs = {"metadata": "Generated by {}".format(repr(self))} - data_vars = { - name: (dims, value, attrs) for name, value in zip(data_names, data) - } - return xr.Dataset(data_vars, coords=coords, attrs=attrs) + # Create xarray.Dataset + dataset = make_xarray_grid( + coordinates, + data, + data_names, + dims=dims, + extra_coords_names=extra_coords_names, + ) + # Add metadata as attrs + metadata = "Generated by {}".format(repr(self)) + dataset.attrs["metadata"] = metadata + for array in dataset: + dataset[array].attrs["metadata"] = metadata + return dataset def scatter( self, @@ -469,9 +476,9 @@ def scatter( following order: northing dimension, easting dimension. **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : str, list or None The name(s) of the data variables in the output dataframe. Defaults - to ``['scalars']`` for scalar data, + to ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. @@ -499,6 +506,7 @@ def scatter( data = check_data( self.predict(project_coordinates(coordinates, projection)) ) + data_names = check_data_names(data_names) data_names = get_data_names(data, data_names) columns = [(dims[0], coordinates[1]), (dims[1], coordinates[0])] extra_coords_names = self._get_extra_coords_names(coordinates) @@ -572,9 +580,9 @@ def profile( following order: northing dimension, easting dimension. **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : str, list or None The name(s) of the data variables in the output dataframe. Defaults - to ``['scalars']`` for scalar data, + to ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. @@ -610,6 +618,7 @@ def profile( # profile but Cartesian distances. if projection is not None: coordinates = project_coordinates(coordinates, projection, inverse=True) + data_names = check_data_names(data_names) data_names = get_data_names(data, data_names) columns = [ (dims[0], coordinates[1]), diff --git a/verde/base/utils.py b/verde/base/utils.py index 0adf31e6d..8cee2407e 100644 --- a/verde/base/utils.py +++ b/verde/base/utils.py @@ -104,6 +104,30 @@ def check_data(data): return data +def check_data_names(data_names): + """ + Check the *data_names* argument and make sure it's a tuple. + If ``data_names`` is a single string, return it as a tuple with a single + element. 
+ + This is the default form accepted by gridders and functions that require + the ``data_names`` argument. + + Examples + -------- + + >>> check_data_names("dummy") + ('dummy',) + >>> check_data_names(("component_x", "component_y")) + ('component_x', 'component_y') + >>> check_data_names(["dummy"]) + ['dummy'] + """ + if isinstance(data_names, str): + data_names = (data_names,) + return data_names + + def check_coordinates(coordinates): """ Check that the given coordinate arrays are what we expect them to be. diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 9a148b63e..7a0ee1e23 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -146,6 +146,23 @@ def test_basegridder(): npt.assert_allclose(prof.distance, coordinates_true[0][0, :]) +def test_basegridder_data_names(): + """ + Test default values for data_names + """ + region = (0, 10, -10, -5) + shape = (50, 30) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0) + data = angular * coordinates[0] + linear + grd = PolyGridder().fit(coordinates, data) + grid = grd.grid(region, shape) + prof = grd.profile((0, -10), (10, -10), 30) + # Check if default name for data_names was applied correctly + assert "scalars" in grid + assert "scalars" in prof + + def test_basegridder_projection(): "Test basic functionality of BaseGridder when passing in a projection" diff --git a/verde/tests/test_utils.py b/verde/tests/test_utils.py index 544260616..3553c3360 100644 --- a/verde/tests/test_utils.py +++ b/verde/tests/test_utils.py @@ -10,7 +10,14 @@ import pytest from ..coordinates import grid_coordinates -from ..utils import parse_engine, dummy_jit, kdtree, grid_to_table, partition_by_sum +from ..utils import ( + parse_engine, + dummy_jit, + kdtree, + grid_to_table, + partition_by_sum, + make_xarray_grid, +) from .. 
import utils @@ -89,3 +96,116 @@ def test_partition_by_sum_fails_no_partitions(): with pytest.raises(ValueError) as error: partition_by_sum(np.arange(10), 8) assert "Could not find partition points" in str(error) + + +def test_build_grid(): + """ + Check if xarray.Dataset is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + data = np.ones_like(coordinates[0]) + grid = make_xarray_grid(coordinates, data, data_names="dummy") + npt.assert_allclose(grid.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(grid.northing, [6, 7, 8, 9, 10]) + npt.assert_allclose(grid.dummy, 1) + assert grid.dummy.shape == (5, 6) + # Change dims + grid = make_xarray_grid( + coordinates, data, data_names="dummy", dims=("latitude", "longitude") + ) + npt.assert_allclose(grid.longitude, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(grid.latitude, [6, 7, 8, 9, 10]) + npt.assert_allclose(grid.dummy, 1) + assert grid.dummy.shape == (5, 6) + + +def test_build_grid_multiple_data(): + """ + Check if xarray.Dataset with multiple data is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + data_arrays = tuple(i * np.ones_like(coordinates[0]) for i in range(1, 4)) + data_names = list("data_{}".format(i) for i in range(1, 4)) + dataset = make_xarray_grid(coordinates, data_arrays, data_names=data_names) + npt.assert_allclose(dataset.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(dataset.northing, [6, 7, 8, 9, 10]) + for i in range(1, 4): + npt.assert_allclose(dataset["data_{}".format(i)], i) + assert dataset["data_{}".format(i)].shape == (5, 6) + + +def test_build_grid_extra_coords(): + """ + Check if xarray.Dataset with extra coords is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + extra_coords = [1, 2] + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=extra_coords) + data = np.ones_like(coordinates[0]) + dataset = make_xarray_grid( + coordinates, + data, + data_names="dummy", + extra_coords_names=["upward", "time"], + ) + npt.assert_allclose(dataset.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(dataset.northing, [6, 7, 8, 9, 10]) + npt.assert_allclose(dataset.upward, 1) + npt.assert_allclose(dataset.time, 2) + npt.assert_allclose(dataset.dummy, 1) + assert dataset.dummy.shape == (5, 6) + assert dataset.upward.shape == (5, 6) + assert dataset.time.shape == (5, 6) + + +def test_build_grid_invalid_names(): + """ + Check if errors are raise after invalid data names + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + # Single data, multiple data_name + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names=["bla_1", "bla_2"]) + # Multiple data, single data_name + data = tuple(i * np.ones_like(coordinates[0]) for i in (1, 2)) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names="blabla") + + +def test_build_grid_invalid_extra_coords(): + """ + Check if errors are raise after invalid extra coords + """ + region = (-10, -5, 6, 10) + spacing = 1 + # No extra coords, extra_coords_name should be ignored + coordinates = grid_coordinates(region, spacing=spacing) + data = np.ones_like(coordinates[0]) + make_xarray_grid(coordinates, data, data_names="dummy", extra_coords_names="upward") + # Single extra coords, extra_coords_name equal to None + coordinates = 
grid_coordinates(region, spacing=spacing, extra_coords=1) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names="dummy", extra_coords_names=None) + # Multiple extra coords, single extra_coords_name as a str + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=[1, 2]) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid( + coordinates, data, data_names="dummy", extra_coords_names="upward" + ) + # Multiple extra coords, multiple extra_coords_name but not equal + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=[1, 2, 3]) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid( + coordinates, data, data_names="dummy", extra_coords_names=["upward", "time"] + ) diff --git a/verde/utils.py b/verde/utils.py index ac6639c8c..3c40859fc 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -6,6 +6,7 @@ import dask import numpy as np import pandas as pd +import xarray as xr from scipy.spatial import cKDTree # pylint: disable=no-name-in-module try: @@ -18,7 +19,8 @@ except ImportError: numba = None -from .base.utils import check_data, n_1d_arrays +from .base.utils import check_data, check_data_names, n_1d_arrays +from .coordinates import check_coordinates def dispatch(function, delayed=False, client=None): @@ -207,6 +209,152 @@ def maxabs(*args, nan=True): return npmax(absolute) +def make_xarray_grid( + coordinates, + data, + data_names, + dims=("northing", "easting"), + extra_coords_names=None, +): + """ + Create an :class:`xarray.Dataset` grid from numpy arrays + + This functions creates an :class:`xarray.Dataset` out of 2d gridded data + including easting and northing coordinates, any extra coordinates (like + upward elevation, time, etc) and data arrays. + + Use this to transform the outputs of :func:`verde.grid_coordinates` and + the ``predict`` method of a gridder into an :class:`xarray.Dataset`. + + .. note:: + + This is a utility function to help create 2D grids (i.e., grids with + two ``dims`` coordinates). For arbitrary N-dimensional arrays, use + :mod:`xarray` directly. + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with coordinates of each point in the grid. Each array must + contain values for a dimension in the order: easting, northing, + vertical, etc. All arrays must be 2d and need to have the same *shape*. + These coordinates can be generated through + :func:`verde.grid_coordinates`. + data : array or tuple of arrays + Array or tuple of arrays with data values on each point in the grid. + Each array must contain values for a dimension in the same order as + the coordinates. All arrays need to have the same *shape*. + data_names : str, list or None + The name(s) of the data variables in the output grid. + dims : list or None + The names of the northing and easting data dimensions, respectively, + in the output grid. Must be defined in the following order: northing + dimension, easting dimension. + **NOTE: This is an exception to the "easting" then + "northing" pattern but is required for compatibility with xarray.** + The easting and northing coordinates in the :class:`xarray.Dataset` + will have the same names as the passed dimensions. + extra_coords_names : str or list + Name or list of names for any additional coordinates besides the + easting and northing ones. Ignored if coordinates has + only two elements. The extra coordinates are non-index coordinates of + the grid array. 
+ + Returns + ------- + grid : :class:`xarray.Dataset` + A 2D grid with one or more data variables. + + Examples + -------- + + >>> import numpy as np + >>> import verde as vd + >>> # Create the coordinates of the regular grid + >>> coordinates = vd.grid_coordinates((-10, -6, 8, 10), spacing=2) + >>> # And some dummy data for each point of the grid + >>> data = np.ones_like(coordinates[0]) + >>> # Create the grid + >>> grid = make_xarray_grid(coordinates, data, data_names="dummy") + >>> print(grid) + + Dimensions: (easting: 3, northing: 2) + Coordinates: + * easting (easting) float64 -10.0 -8.0 -6.0 + * northing (northing) float64 8.0 10.0 + Data variables: + dummy (northing, easting) float64 1.0 1.0 1.0 1.0 1.0 1.0 + + >>> # Create a grid with an extra coordinate + >>> coordinates = vd.grid_coordinates( + ... (-10, -6, 8, 10), spacing=2, extra_coords=5 + ... ) + >>> # And some dummy data for each point of the grid + >>> data = np.ones_like(coordinates[0]) + >>> # Create the grid + >>> grid = make_xarray_grid( + ... coordinates, data, data_names="dummy", extra_coords_names="upward" + ... ) + >>> print(grid) + + Dimensions: (easting: 3, northing: 2) + Coordinates: + * easting (easting) float64 -10.0 -8.0 -6.0 + * northing (northing) float64 8.0 10.0 + upward (northing, easting) float64 5.0 5.0 5.0 5.0 5.0 5.0 + Data variables: + dummy (northing, easting) float64 1.0 1.0 1.0 1.0 1.0 1.0 + + """ + coordinates = check_coordinates(coordinates) + data = check_data(data) + data_names = check_data_names(data_names) + # dims is like shape with order (rows, cols) for the array + # so the first element is northing and second is easting + coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} + # Extra coordinates are handled like 2D data arrays with + # the same dims and the data. + if coordinates[2:]: + extra_coords_names = _check_extra_coords_names(coordinates, extra_coords_names) + for name, extra_coord in zip(extra_coords_names, coordinates[2:]): + coords[name] = (dims, extra_coord) + # Generate object + if len(data) != len(data_names): + raise ValueError( + "Invalid data_names '{}'. ".format(data_names) + + "Number of data names must match the number of " + + "data arrays ('{}').".format(len(data)) + ) + data_vars = {name: (dims, value) for name, value in zip(data_names, data)} + return xr.Dataset(data_vars, coords) + + +def _check_extra_coords_names(coordinates, extra_coords_names): + """ + Sanity checks for extra_coords_names against coordinates + + Assuming that there are extra coordinates on the ``coordinates`` tuple. + """ + # Convert to tuple if it's a str + if isinstance(extra_coords_names, str): + extra_coords_names = (extra_coords_names,) + # Check if it's not None + if extra_coords_names is None: + raise ValueError( + "Invalid extra_coords_names equal to None. " + + "When passing one or more extra coordinate, " + + "extra_coords_names cannot be None." + ) + # Check if there are the same number of extra_coords than extra_coords_name + if len(coordinates[2:]) != len(extra_coords_names): + raise ValueError( + "Invalid extra_coords_names '{}'. ".format(extra_coords_names) + + "Number of extra coordinates names must match the number of " + + "additional coordinates ('{}').".format(len(coordinates[2:])) + ) + return extra_coords_names + + def grid_to_table(grid): """ Convert a grid to a table with the values and coordinates of each point. 
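The doctests above cover single-variable grids. The sketch below is
illustrative only (it assumes verde with this patch installed; the variable
names and values are arbitrary) and packs two data arrays plus an extra
non-dimension coordinate into one Dataset, which can then be written straight
to netCDF as mentioned in the commit message.

import numpy as np
import verde as vd

# Regular grid plus a constant extra coordinate (e.g. an observation height).
coordinates = vd.grid_coordinates((-10, -6, 8, 10), spacing=2, extra_coords=100)
east_velocity = np.ones_like(coordinates[0])
north_velocity = 2 * np.ones_like(coordinates[0])

grid = vd.make_xarray_grid(
    coordinates,
    (east_velocity, north_velocity),
    data_names=("east_velocity", "north_velocity"),
    extra_coords_names="upward",
)
print(grid)
# Save efficiently to disk if needed: grid.to_netcdf("velocity.nc")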
From 64dff1b3a15911daa380463c7e4e6428bfc26d68 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Wed, 28 Oct 2020 03:33:39 -0300 Subject: [PATCH 38/38] Refactor get_data_names and related check functions (#295) Refactor `get_data_names` as a private method of `BaseGridder`: simplify `get_data_names` logic, make `get_data_names` to use `check_data_names` to convert single string to a tuple and check that `data_names` has the same number of elements as data. Add `data_names_defaults` as a class attribute of `BaseGridder`. Move `check_extra_coords_names` to `verde/base/utils.py` along with the other check functions. Make `check_data_names` to raise error if `data_names` is `None`. But `BaseGridder.get_data_names` will return default values if `data_names` is `None`. Rename test functions for `make_xarray_grid`. --- verde/base/base_classes.py | 101 +++++++++++++++++++------------------ verde/base/utils.py | 72 ++++++++++++++++++++++---- verde/tests/test_base.py | 19 +++---- verde/tests/test_utils.py | 13 +++-- verde/utils.py | 50 ++++-------------- 5 files changed, 142 insertions(+), 113 deletions(-) diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 18f599b68..0061025be 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -209,6 +209,14 @@ class BaseGridder(BaseEstimator): # using this name as a basis. extra_coords_name = "extra_coord" + # Define default values for data_names depending on the number of data + # arrays returned by predict method. + data_names_defaults = [ + ("scalars",), + ("east_component", "north_component"), + ("east_component", "north_component", "vertical_component"), + ] + def predict(self, coordinates): """ Predict data on the given coordinate values. NOT IMPLEMENTED. @@ -416,8 +424,7 @@ def grid( self.predict(project_coordinates(coordinates, projection)) ) # Get names for data and any extra coordinates - data_names = check_data_names(data_names) - data_names = get_data_names(data, data_names) + data_names = self._get_data_names(data, data_names) extra_coords_names = self._get_extra_coords_names(coordinates) # Create xarray.Dataset dataset = make_xarray_grid( @@ -506,8 +513,7 @@ def scatter( data = check_data( self.predict(project_coordinates(coordinates, projection)) ) - data_names = check_data_names(data_names) - data_names = get_data_names(data, data_names) + data_names = self._get_data_names(data, data_names) columns = [(dims[0], coordinates[1]), (dims[1], coordinates[0])] extra_coords_names = self._get_extra_coords_names(coordinates) columns.extend(zip(extra_coords_names, coordinates[2:])) @@ -618,8 +624,7 @@ def profile( # profile but Cartesian distances. if projection is not None: coordinates = project_coordinates(coordinates, projection, inverse=True) - data_names = check_data_names(data_names) - data_names = get_data_names(data, data_names) + data_names = self._get_data_names(data, data_names) columns = [ (dims[0], coordinates[1]), (dims[1], coordinates[0]), @@ -665,6 +670,46 @@ def _get_extra_coords_names(self, coordinates): names.append(name) return names + def _get_data_names(self, data, data_names): + """ + Get default names for data fields if none are given based on the data. 
+ + Examples + -------- + + >>> import numpy as np + >>> east, north, up = [np.arange(10)]*3 + >>> gridder = BaseGridder() + >>> gridder._get_data_names((east,), data_names=None) + ('scalars',) + >>> gridder._get_data_names((east, north), data_names=None) + ('east_component', 'north_component') + >>> gridder._get_data_names((east, north, up), data_names=None) + ('east_component', 'north_component', 'vertical_component') + >>> gridder._get_data_names((east,), data_names="john") + ('john',) + >>> gridder._get_data_names((east,), data_names=("paul",)) + ('paul',) + >>> gridder._get_data_names( + ... (up, north), data_names=('ringo', 'george') + ... ) + ('ringo', 'george') + >>> gridder._get_data_names((north,), data_names=["brian"]) + ['brian'] + + """ + # Return the defaults data_names for the class + if data_names is None: + if len(data) > len(self.data_names_defaults): + raise ValueError( + "Default data names only available for up to 3 components. " + + "Must provide custom names through the 'data_names' argument." + ) + return self.data_names_defaults[len(data) - 1] + # Return the passed data_names if valid + data_names = check_data_names(data, data_names) + return data_names + def project_coordinates(coordinates, projection, **kwargs): """ @@ -702,50 +747,6 @@ def project_coordinates(coordinates, projection, **kwargs): return proj_coordinates -def get_data_names(data, data_names): - """ - Get default names for data fields if none are given based on the data. - - Examples - -------- - - >>> import numpy as np - >>> east, north, up = [np.arange(10)]*3 - >>> get_data_names((east,), data_names=None) - ('scalars',) - >>> get_data_names((east, north), data_names=None) - ('east_component', 'north_component') - >>> get_data_names((east, north, up), data_names=None) - ('east_component', 'north_component', 'vertical_component') - >>> get_data_names((up, north), data_names=('ringo', 'george')) - ('ringo', 'george') - - """ - if data_names is not None: - if len(data) != len(data_names): - raise ValueError( - "Data has {} components but only {} names provided: {}".format( - len(data), len(data_names), str(data_names) - ) - ) - return data_names - data_types = [ - ("scalars",), - ("east_component", "north_component"), - ("east_component", "north_component", "vertical_component"), - ] - if len(data) > len(data_types): - raise ValueError( - " ".join( - [ - "Default data names only available for up to 3 components.", - "Must provide custom names through the 'data_names' argument.", - ] - ) - ) - return data_types[len(data) - 1] - - def get_instance_region(instance, region): """ Get the region attribute stored in instance if one is not provided. diff --git a/verde/base/utils.py b/verde/base/utils.py index 8cee2407e..3f40f1e4a 100644 --- a/verde/base/utils.py +++ b/verde/base/utils.py @@ -104,27 +104,39 @@ def check_data(data): return data -def check_data_names(data_names): +def check_data_names(data, data_names): """ - Check the *data_names* argument and make sure it's a tuple. - If ``data_names`` is a single string, return it as a tuple with a single - element. + Check *data_names* against *data*. - This is the default form accepted by gridders and functions that require - the ``data_names`` argument. + Also, convert ``data_names`` to a tuple if it's a single string. 
Examples -------- - >>> check_data_names("dummy") + >>> import numpy as np + >>> east, north, scalar = [np.array(10)]*3 + >>> check_data_names((scalar,), "dummy") ('dummy',) - >>> check_data_names(("component_x", "component_y")) - ('component_x', 'component_y') - >>> check_data_names(["dummy"]) + >>> check_data_names((scalar,), ("dummy",)) + ('dummy',) + >>> check_data_names((scalar,), ["dummy"]) ['dummy'] + >>> check_data_names((east, north), ("component_x", "component_y")) + ('component_x', 'component_y') """ + # Convert single string to tuple if isinstance(data_names, str): data_names = (data_names,) + # Raise error if data_names is None + if data_names is None: + raise ValueError("Invalid data_names equal to None.") + # Raise error if data and data_names don't have the same number of elements + if len(data) != len(data_names): + raise ValueError( + "Data has {} components but only {} names provided: {}".format( + len(data), len(data_names), str(data_names) + ) + ) return data_names @@ -143,6 +155,46 @@ def check_coordinates(coordinates): return coordinates +def check_extra_coords_names(coordinates, extra_coords_names): + """ + Check extra_coords_names against coordiantes. + + Also, convert ``extra_coords_names`` to a tuple if it's a single string. + Assume that there are extra coordinates on the ``coordinates`` tuple. + + Examples + -------- + + >>> import numpy as np + >>> coordinates = [np.array(10)]*3 + >>> check_extra_coords_names(coordinates, "upward") + ('upward',) + >>> check_extra_coords_names(coordinates, ("upward",)) + ('upward',) + >>> coordinates = [np.array(10)]*4 + >>> check_extra_coords_names(coordinates, ("upward", "time")) + ('upward', 'time') + """ + # Convert single string to a tuple + if isinstance(extra_coords_names, str): + extra_coords_names = (extra_coords_names,) + # Check if it's not None + if extra_coords_names is None: + raise ValueError( + "Invalid extra_coords_names equal to None. " + + "When passing one or more extra coordinate, " + + "extra_coords_names cannot be None." + ) + # Check if there are the same number of extra_coords than extra_coords_name + if len(coordinates[2:]) != len(extra_coords_names): + raise ValueError( + "Invalid extra_coords_names '{}'. ".format(extra_coords_names) + + "Number of extra coordinates names must match the number of " + + "additional coordinates ('{}').".format(len(coordinates[2:])) + ) + return extra_coords_names + + def check_fit_input(coordinates, data, weights, unpack=True): """ Validate the inputs to the fit method of gridders. 
diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index 7a0ee1e23..5baa58b36 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -11,7 +11,6 @@ from ..base.base_classes import ( BaseGridder, BaseBlockCrossValidator, - get_data_names, get_instance_region, ) from ..coordinates import grid_coordinates, scatter_points @@ -46,28 +45,30 @@ def test_get_data_names(): data2 = tuple([np.arange(10)] * 2) data3 = tuple([np.arange(10)] * 3) # Test the default names - assert get_data_names(data1, data_names=None) == ("scalars",) - assert get_data_names(data2, data_names=None) == ( + gridder = BaseGridder() + assert gridder._get_data_names(data1, data_names=None) == ("scalars",) + assert gridder._get_data_names(data2, data_names=None) == ( "east_component", "north_component", ) - assert get_data_names(data3, data_names=None) == ( + assert gridder._get_data_names(data3, data_names=None) == ( "east_component", "north_component", "vertical_component", ) # Test custom names - assert get_data_names(data1, data_names=("a",)) == ("a",) - assert get_data_names(data2, data_names=("a", "b")) == ("a", "b") - assert get_data_names(data3, data_names=("a", "b", "c")) == ("a", "b", "c") + assert gridder._get_data_names(data1, data_names=("a",)) == ("a",) + assert gridder._get_data_names(data2, data_names=("a", "b")) == ("a", "b") + assert gridder._get_data_names(data3, data_names=("a", "b", "c")) == ("a", "b", "c") def test_get_data_names_fails(): "Check if fails for invalid data types" + gridder = BaseGridder() with pytest.raises(ValueError): - get_data_names(tuple([np.arange(5)] * 4), data_names=None) + gridder._get_data_names(tuple([np.arange(5)] * 4), data_names=None) with pytest.raises(ValueError): - get_data_names(tuple([np.arange(5)] * 2), data_names=("meh",)) + gridder._get_data_names(tuple([np.arange(5)] * 2), data_names=("meh",)) def test_get_instance_region(): diff --git a/verde/tests/test_utils.py b/verde/tests/test_utils.py index 3553c3360..f1ea64559 100644 --- a/verde/tests/test_utils.py +++ b/verde/tests/test_utils.py @@ -98,7 +98,7 @@ def test_partition_by_sum_fails_no_partitions(): assert "Could not find partition points" in str(error) -def test_build_grid(): +def test_make_xarray_grid(): """ Check if xarray.Dataset is correctly created """ @@ -121,7 +121,7 @@ def test_build_grid(): assert grid.dummy.shape == (5, 6) -def test_build_grid_multiple_data(): +def test_make_xarray_grid_multiple_data(): """ Check if xarray.Dataset with multiple data is correctly created """ @@ -138,7 +138,7 @@ def test_build_grid_multiple_data(): assert dataset["data_{}".format(i)].shape == (5, 6) -def test_build_grid_extra_coords(): +def test_make_xarray_grid_extra_coords(): """ Check if xarray.Dataset with extra coords is correctly created """ @@ -163,7 +163,7 @@ def test_build_grid_extra_coords(): assert dataset.time.shape == (5, 6) -def test_build_grid_invalid_names(): +def test_make_xarray_grid_invalid_names(): """ Check if errors are raise after invalid data names """ @@ -174,13 +174,16 @@ def test_build_grid_invalid_names(): data = np.ones_like(coordinates[0]) with pytest.raises(ValueError): make_xarray_grid(coordinates, data, data_names=["bla_1", "bla_2"]) + # data_names equal to None + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names=None) # Multiple data, single data_name data = tuple(i * np.ones_like(coordinates[0]) for i in (1, 2)) with pytest.raises(ValueError): make_xarray_grid(coordinates, data, data_names="blabla") -def 
test_build_grid_invalid_extra_coords(): +def test_make_xarray_grid_invalid_extra_coords(): """ Check if errors are raise after invalid extra coords """ diff --git a/verde/utils.py b/verde/utils.py index 3c40859fc..6f1874094 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -19,8 +19,13 @@ except ImportError: numba = None -from .base.utils import check_data, check_data_names, n_1d_arrays -from .coordinates import check_coordinates +from .base.utils import ( + check_coordinates, + check_extra_coords_names, + check_data, + check_data_names, + n_1d_arrays, +) def dispatch(function, delayed=False, client=None): @@ -244,9 +249,9 @@ def make_xarray_grid( Array or tuple of arrays with data values on each point in the grid. Each array must contain values for a dimension in the same order as the coordinates. All arrays need to have the same *shape*. - data_names : str, list or None + data_names : str or list The name(s) of the data variables in the output grid. - dims : list or None + dims : list The names of the northing and easting data dimensions, respectively, in the output grid. Must be defined in the following order: northing dimension, easting dimension. @@ -308,53 +313,20 @@ def make_xarray_grid( """ coordinates = check_coordinates(coordinates) data = check_data(data) - data_names = check_data_names(data_names) + data_names = check_data_names(data, data_names) # dims is like shape with order (rows, cols) for the array # so the first element is northing and second is easting coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} # Extra coordinates are handled like 2D data arrays with # the same dims and the data. if coordinates[2:]: - extra_coords_names = _check_extra_coords_names(coordinates, extra_coords_names) + extra_coords_names = check_extra_coords_names(coordinates, extra_coords_names) for name, extra_coord in zip(extra_coords_names, coordinates[2:]): coords[name] = (dims, extra_coord) - # Generate object - if len(data) != len(data_names): - raise ValueError( - "Invalid data_names '{}'. ".format(data_names) - + "Number of data names must match the number of " - + "data arrays ('{}').".format(len(data)) - ) data_vars = {name: (dims, value) for name, value in zip(data_names, data)} return xr.Dataset(data_vars, coords) -def _check_extra_coords_names(coordinates, extra_coords_names): - """ - Sanity checks for extra_coords_names against coordinates - - Assuming that there are extra coordinates on the ``coordinates`` tuple. - """ - # Convert to tuple if it's a str - if isinstance(extra_coords_names, str): - extra_coords_names = (extra_coords_names,) - # Check if it's not None - if extra_coords_names is None: - raise ValueError( - "Invalid extra_coords_names equal to None. " - + "When passing one or more extra coordinate, " - + "extra_coords_names cannot be None." - ) - # Check if there are the same number of extra_coords than extra_coords_name - if len(coordinates[2:]) != len(extra_coords_names): - raise ValueError( - "Invalid extra_coords_names '{}'. ".format(extra_coords_names) - + "Number of extra coordinates names must match the number of " - + "additional coordinates ('{}').".format(len(coordinates[2:])) - ) - return extra_coords_names - - def grid_to_table(grid): """ Convert a grid to a table with the values and coordinates of each point.
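To make the refactor concrete, here is a minimal sketch of how the relocated
check functions behave after this patch. Treat it purely as illustration:
check_data_names and check_extra_coords_names live in the internal
verde.base.utils module, and the arrays below are arbitrary.

import numpy as np
from verde.base.utils import check_data_names, check_extra_coords_names

east, north = np.ones(10), np.ones(10)

# A single string is converted to a tuple and validated against the data.
print(check_data_names((east,), "dummy"))             # ('dummy',)
print(check_data_names((east, north), ("vx", "vy")))  # ('vx', 'vy')

# A mismatch between the number of names and data arrays raises a ValueError.
try:
    check_data_names((east, north), "vx")
except ValueError as error:
    print(error)

# Any coordinate beyond easting and northing must be given a name.
coordinates = (np.zeros((3, 4)), np.zeros((3, 4)), np.full((3, 4), 100.0))
print(check_extra_coords_names(coordinates, "upward"))  # ('upward',)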