diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 0592ed71d..c31749e1c 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -9,6 +9,12 @@ trigger: - master - refs/tags/* +# Make sure triggers are set for PRs to any branch. +pr: + branches: + include: + - '*' + jobs: @@ -21,7 +27,7 @@ jobs: vmImage: 'ubuntu-16.04' variables: - CONDA_INSTALL_EXTRA: "black flake8 pylint" + CONDA_INSTALL_EXTRA: "black>=20.8b1 flake8 pylint==2.4.*" PYTHON: '3.7' steps: @@ -71,7 +77,7 @@ jobs: displayName: 'Mac' pool: - vmImage: 'macOS-10.13' + vmImage: 'macOS-10.15' variables: CONDA_REQUIREMENTS: requirements.txt diff --git a/.codacy.yml b/.codacy.yml deleted file mode 100644 index 807a14cd1..000000000 --- a/.codacy.yml +++ /dev/null @@ -1,6 +0,0 @@ -exclude_paths: - - 'doc/**' - - 'versioneer.py' - - '**/_version.py' - - '**/__init__.py' - - '**/tests/**' diff --git a/.codeclimate.yml b/.codeclimate.yml deleted file mode 100644 index 8396c0c64..000000000 --- a/.codeclimate.yml +++ /dev/null @@ -1,35 +0,0 @@ -version: "2" - -checks: - file-lines: - enabled: false - similar-code: - enabled: false - method-lines: - enabled: false - argument-count: - config: - threshold: 10 - method-complexity: - config: - threshold: 6 - -plugins: - sonar-python: - enabled: true - config: - test_patterns: - - "verde/**/tests/*.py" - checks: - python:S3776: - enabled: false - python:S107: - enabled: false - python:S1542: - enabled: false - -exclude_paths: -- "versioneer.py" -- "verde/_version.py" -- "doc/**/*" -- "verde/tests/data/*" diff --git a/.stickler.yml b/.stickler.yml deleted file mode 100644 index 3ddff7ba9..000000000 --- a/.stickler.yml +++ /dev/null @@ -1,12 +0,0 @@ -linters: - flake8: - python: 3 - enable: true - ignore: E203, E266, E501, W503, F401, E741 - max-line-length: 88 - shellcheck: - shell: bash - csslint: - enable: false -files: - ignore: ['versioneer.py', 'verde/_version.py', 'doc/conf.py'] diff --git a/.travis.yml b/.travis.yml index 42de0e256..f417298e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ # We use miniconda for Python so don't need any Python specific tools language: generic -# Use the container builds so we don't need sudo priviledges -sudo: false +os: linux +dist: xenial # Only build pushes to the master branch and tags. This avoids the double # builds than happen when working on a branch instead of a fork. @@ -33,7 +33,7 @@ env: - DEPLOY_PYPI=false # Specify the build configurations. Be sure to only deploy from a single build. 
-matrix: +jobs: include: - name: "Linux - Python 3.8" os: linux diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6099ac79..ad43271f4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,6 +125,15 @@ hesitate to [ask questions](#how-can-i-talk-to-you)): * Aaron Meurer's [tutorial on the git workflow](http://www.asmeurer.com/git-workflow/) * [How to Contribute to an Open Source Project on GitHub](https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github) +If you're new to working with git, GitHub, and the Unix Shell, we recommend +starting with the [Software Carpentry](https://software-carpentry.org/) lessons, +which are available in English and Spanish: + +* :gb: [Version Control with Git](http://swcarpentry.github.io/git-novice/) / :es: [Control de +versiones con Git](https://swcarpentry.github.io/git-novice-es/) +* :gb: [The Unix Shell](http://swcarpentry.github.io/shell-novice/) / :es: +[La Terminal de Unix](https://swcarpentry.github.io/shell-novice-es/) + ### General guidelines We follow the [git pull request workflow](http://www.asmeurer.com/git-workflow/) to diff --git a/MAINTENANCE.md b/MAINTENANCE.md index 810fc862c..4858b1a8c 100644 --- a/MAINTENANCE.md +++ b/MAINTENANCE.md @@ -1,12 +1,27 @@ # Maintainers Guide -This page contains instructions for project maintainers about how our setup works, -making releases, creating packages, etc. +This page contains instructions for project maintainers about how our setup +works, making releases, creating packages, etc. If you want to make a contribution to the project, see the [Contributing Guide](CONTRIBUTING.md) instead. +## Contents + +* [Branches](#branches) +* [Reviewing and merging pull requests](#reviewing-and-merging-pull-requests) +* [Continuous Integration](#continuous-integration) +* [Citations](#citations) +* [Making a Release](#making-a-release) + * [Draft a new Zenodo release](#draft-a-new-zenodo-release) + * [Update the changelog](#update-the-changelog) + * [Check the README syntax](#check-the-readme-syntax) + * [Release](#release) + * [Archive on Zenodo](#archive-on-zenodo) + * [Update the conda package](#update-the-conda-package) + + ## Branches * *master*: Always tested and ready to become a new version. Don't push directly to this @@ -62,6 +77,43 @@ If you find any problems with the test setup and deployment, please create issue submit pull requests to that repository. +## Citations + +The citation for a package that doesn't have an associated paper will be the +Zenodo DOI for all versions. This citation will include everyone who has +contributed to the project and met our [authorship criteria](AUTHORSHIP.md). + +Include the following text in the `CITATION.rst` file: + +``` +This is research software **made by scientists**. Citations help us justify the +effort that goes into building and maintaining this project. + +If you used this software in your research, please consider +citing the following source: https://doi.org/10.5281/zenodo.3530749 + +The link above includes full citation information and export formats (BibTeX, +Mendeley, etc). +``` + +If the project has been published as an academic paper (for example, on +[JOSS](https://joss.theoj.org)), **update the `CITATION.rst` to point to the +paper instead of the Zenodo archive**. + +``` +If you used this software in your research, please consider citing the +following publication: + + + +This is an open-access publication.
The paper and the associated reviews can be +freely accessed at: https://doi.org/ + +The link above includes full citation information and export formats (BibTeX, +Mendeley, etc). +``` + + ## Making a Release We try to automate the release process as much as possible. @@ -70,7 +122,7 @@ The version number is set automatically using versioneer based information it ge git. There are a few steps that still must be done manually, though. -### Drafting a new Zenodo release +### Draft a new Zenodo release If the project already has releases on [Zenodo](https://zenodo.org/), you need to create a **New version** of it. Find the link to the latest Zenodo release on the `README.md` @@ -78,7 +130,11 @@ file of your project. Then: 1. Delete all existing files (they will be replaced with the new version). 2. Reserve a DOI and save the release draft. -3. Add as authors any new contributors who have added themselves to `AUTHORS.md`. +3. Add as authors any new contributors who have added themselves to + [`AUTHORS.md`](AUTHORS.md). +4. Review author order to make sure it follows the guidelines on our + [Authorship Guide](AUTHORSHIP.md) +5. Update release date. On the other hand, if you're making the first release of the project, you need to create a **New upload** for it inside the @@ -86,7 +142,7 @@ a **New upload** for it inside the Make sure the Fatiando a Terra community is chosen when filling the release draft. The rest of the process is the same as above. -### Updating the changelog +### Update the changelog 1. Generate a list of commits between the last release tag and now: @@ -126,39 +182,48 @@ python setup.py --long-description | rst2html.py --no-raw > index.html Open `index.html` and check for any flaws or error messages. -### Pushing to PyPI and updating the documentation - -After the changelog is updated, making a release should be as simple as creating a new -git tag and pushing it to Github: +### Release -```bash -git tag v0.2.0 -git push --tags -``` +After the changelog is updated, making a release should be as simple as +creating a new git tag. +The continuous integration services will take care of pushing the package to +PyPI and creating a new version of the documentation. +A new folder with version number containing the HTML documentation will be +pushed to *gh-pages*, and the `latest` link will be updated to point to this +new folder. The tag should be version number (following [Semantic Versioning](https://semver.org/)) -with a leading `v`. -This should trigger Travis to do all the work for us. -A new source distribution will be uploaded to PyPI, a new folder with the documentation -HTML will be pushed to *gh-pages*, and the `latest` link will be updated to point to -this new folder. +with a leading `v` (`v1.5.7`). + +To create a new tag, go to `https://github.com/fatiando/PROJECT/releases` and +click on "Draft a new release": + +1. Use the version number (including the `v`) as the "Tag version" and "Release + title". +2. Fill the release description with a Markdown version of the **latest** + changelog entry (including the DOI badge). The `doc/changes.rst` file can be + converted to Markdown using `pandoc`: + ``` + pandoc -s doc/changes.rst -o changes.md --wrap=none + ``` +3. Publish the release. -### Archiving on Zenodo +### Archive on Zenodo Grab a zip file from the Github release and upload to Zenodo using the previously reserved DOI. 
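For example, a minimal sketch of grabbing the source zip of a tagged release from Github (the `fatiando/verde` repository and the `v1.5.0` tag below are only placeholders; substitute your project and the tag you just published):

```bash
# Download the auto-generated source zip for the tag from Github
wget https://github.com/fatiando/verde/archive/v1.5.0.zip -O verde-v1.5.0.zip
```

The downloaded zip can then be uploaded through the Zenodo web interface to the draft that holds the reserved DOI.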
-### Updating the conda package +### Update the conda package -After Travis is done building the tag and all builds pass, we need to update the conda -package. -Unfortunately, this needs to be done manually for now. +After Travis is done building the tag and all builds pass, we need to update +the conda package. +Fortunately, the conda-forge bot will submit a PR updating the package for us +(it may take a couple of hours to do so). +Most releases can be merged right away but some might need further changes to +the conda recipe: -1. Fork the feedstock repository (https://github.com/conda-forge/PROJECT-feedstock) if - you haven't already. If you have a fork, update it. -2. Update the version number and sha256 hash on `recipe/meta.yaml`. You can get the hash - from the PyPI "Download files" section. -3. Add or remove any new dependencies (most are probably only `run` dependencies). -4. Make a new branch, commit, and push your changes **to your fork**. -5. Create a PR against the original feedstock master. -6. Once the CIs are passing, merge or as a maintainer to do so. +1. If the release added new dependencies, make sure they are included in the + `meta.yaml` file. +2. If dropping/adding support for Python versions (or version of other + packages) make sure the correct version restrictions are applied in + `meta.yaml`. diff --git a/Makefile b/Makefile index 62064b27f..9fc244be8 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ PROJECT=verde TESTDIR=tmp-test-dir-with-unique-name PYTEST_ARGS=--cov-config=../.coveragerc --cov-report=term-missing --cov=$(PROJECT) --doctest-modules -v --pyargs LINT_FILES=setup.py $(PROJECT) -BLACK_FILES=setup.py $(PROJECT) examples data/examples tutorials -FLAKE8_FILES=setup.py $(PROJECT) examples data/examples +BLACK_FILES=setup.py $(PROJECT) examples data/examples doc/conf.py tutorials +FLAKE8_FILES=setup.py $(PROJECT) examples data/examples doc/conf.py help: @echo "Commands:" diff --git a/README.rst b/README.rst index bba8ab287..1c3e70794 100644 --- a/README.rst +++ b/README.rst @@ -130,7 +130,7 @@ What could you possibly offer? We assure you that the little voice in your head is wrong. **Being a contributor doesn't just mean writing code**. -Equality important contributions include: +Equally important contributions include: writing or proof-reading documentation, suggesting or implementing tests, or even giving feedback about the project (including giving feedback about the contribution process). @@ -159,6 +159,8 @@ Documentation for other versions * `Development `__ (reflects the *master* branch on Github) * `Latest release `__ +* `v1.5.0 `__ +* `v1.4.0 `__ * `v1.3.0 `__ * `v1.2.0 `__ * `v1.1.0 `__ diff --git a/data/examples/baja_bathymetry.py b/data/examples/baja_bathymetry.py index 6671478a4..192376e73 100644 --- a/data/examples/baja_bathymetry.py +++ b/data/examples/baja_bathymetry.py @@ -32,5 +32,4 @@ plt.colorbar().set_label("meters") # Use an utility function to add tick labels and land and ocean features to the map. 
vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/data/examples/california_gps.py b/data/examples/california_gps.py index ef5cbbb0e..319cccb1b 100644 --- a/data/examples/california_gps.py +++ b/data/examples/california_gps.py @@ -51,5 +51,4 @@ ) plt.colorbar(tmp, ax=ax).set_label("meters/year") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout(w_pad=0) plt.show() diff --git a/data/examples/rio_magnetic.py b/data/examples/rio_magnetic.py index c8618a43d..c731bc5d1 100644 --- a/data/examples/rio_magnetic.py +++ b/data/examples/rio_magnetic.py @@ -47,5 +47,4 @@ plt.colorbar(pad=0.01).set_label("nT") # Set the proper ticks for a Cartopy map vd.datasets.setup_rio_magnetic_map(ax) -plt.tight_layout() plt.show() diff --git a/data/examples/texas-wind.py b/data/examples/texas-wind.py index 300bf9dd1..c01aa2f86 100644 --- a/data/examples/texas-wind.py +++ b/data/examples/texas-wind.py @@ -39,5 +39,4 @@ ) # Use an utility function to add tick labels and land and ocean features to the map. vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() diff --git a/doc/api/index.rst b/doc/api/index.rst index 4f1e0708f..cbe7a9286 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -45,6 +45,8 @@ Model Selection train_test_split cross_val_score + BlockShuffleSplit + BlockKFold Coordinate Manipulation ----------------------- @@ -57,9 +59,28 @@ Coordinate Manipulation profile_coordinates get_region pad_region - project_region inside block_split + rolling_window + expanding_window + +Projection +---------- + +.. autosummary:: + :toctree: generated/ + + project_region + project_grid + +Masking +------- + +.. autosummary:: + :toctree: generated/ + + distance_mask + convexhull_mask Utilities --------- @@ -69,7 +90,6 @@ Utilities test maxabs - distance_mask variance_to_weights grid_to_table median_distance @@ -111,6 +131,7 @@ Base Classes and Functions :toctree: generated/ base.BaseGridder + base.BaseBlockCrossValidator base.n_1d_arrays base.check_fit_input base.least_squares diff --git a/doc/changes.rst b/doc/changes.rst index c05852597..8938a2969 100644 --- a/doc/changes.rst +++ b/doc/changes.rst @@ -3,6 +3,137 @@ Changelog ========= +Version 1.5.0 +------------- + +*Released on: 2020/06/04* + +.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3877060.svg + :target: https://doi.org/10.5281/zenodo.3877060 + +Bug fixes: + +* Apply projections using only the first two coordinates instead of all given + coordinates. Projections only really involve the first two (horizontal) + coordinates. Only affects users passing ``extra_coords`` to gridder methods. + (`#264 `__) + +New features: + +* **New** blocked cross-validation classes ``BlockShuffleSplit`` and + ``BlockKFold``. These are scikit-learn compatible cross-validators that split + the data into spatial blocks before assigning them to folds. Blocked + cross-validation can help avoid overestimation of prediction accuracy for + spatial data (see [Roberts_etal2017]_). The classes work with + ``verde.cross_val_score`` and any other function/method/class that accepts a + scikit-learn cross-validator. + (`#251 `__ and + `#254 `__) +* Add the option for block-wise splitting in ``verde.train_test_split`` by + passing in a ``spacing`` or ``shape`` parameter. + (`#253 `__ and + `#257 `__) + +Base classes: + +* Add optional argument to ``verde.base.least_squares`` to copy Jacobian + matrix.
+ (`#255 `__) +* Add extra coordinates (specified by the ``extra_coords`` keyword argument) + to outputs of ``BaseGridder`` methods. + (`#265 `__) + +Maintenance: + +* Update tests to ``repr`` changes in scikit-learn 0.23.0. + (`#267 `__) + +Documentation: + +* Fix typo in README contributing section. + (`#258 `__) + +This release contains contributions from: + +* Leonardo Uieda +* Santiago Soler +* Rowan Cockett + +Version 1.4.0 +------------- + +*Released on: 2020/04/06* + +.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3739449.svg + :target: https://doi.org/10.5281/zenodo.3739449 + +Bug fixes: + +* **Profile distances are now returned in projected (Cartesian) coordinates by + the** ``profile`` **method of gridders if a projection is given.** The method + has the option to apply a projection to the coordinates before predicting so + we can pass geographic coordinates to Cartesian gridders. In these cases, the + distance along the profile is calculated by the ``profile_coordinates`` + function with the unprojected coordinates (in the geographic case it would be + degrees). The profile point calculation is also done assuming that + coordinates are Cartesian, which is clearly wrong if inputs are longitude and + latitude. To fix this, we now project the input points prior to passing them + to ``profile_coordinates``. This means that the distances are Cartesian and + generation of profile points is also Cartesian (as is assumed by the + function). The generated coordinates are projected back so that the user gets + longitude and latitude but distances are still projected Cartesian meters. + (`#231 `__) +* **Function** ``verde.grid_to_table`` **now sets the correct order for + coordinates.** We were relying on the order of the ``coords`` attribute of + the ``xarray.Dataset`` for the order of the coordinates. This is wrong + because xarray takes the coordinate order from the ``dims`` attribute + instead, which is what we should also have been doing. + (`#229 `__) + +Documentation: + +* Generalize coordinate system specifications in ``verde.base.BaseGridder`` + docstrings. Most methods don't really depend on the coordinate system so use + a more generic language to allow derived classes to specify their coordinate + systems without having to overload the base methods just to rewrite the + docstrings. + (`#240 `__) + +New features: + +* New function ``verde.convexhull_mask`` to mask points in a grid that fall + outside the convex hull defined by data points. + (`#237 `__) +* New function ``verde.project_grid`` that transforms 2D gridded data using a + given projection. It re-samples the data using ``ScipyGridder`` (by default) + and runs a blocked mean (optional) to avoid aliasing when the points aren't + evenly distributed in the projected coordinates (like in polar projections). + Finally, it applies a ``convexhull_mask`` to the grid to avoid extrapolation + to points that had no original data. + (`#246 `__) +* New function ``verde.expanding_window`` for selecting data that falls inside + of an expanding window around a central point. + (`#238 `__) +* New function ``verde.rolling_window`` for rolling window selections of + irregularly sampled data. + (`#236 `__) + +Improvements: + +* Allow ``verde.grid_to_table`` to take ``xarray.DataArray`` as input. + (`#235 `__) + +Maintenance: + +* Use newer MacOS images on Azure Pipelines.
+ (`#234 `__) + +This release contains contributions from: + +* Leonardo Uieda +* Santiago Soler +* Jesse Pisel + Version 1.3.0 ------------- diff --git a/doc/conf.py b/doc/conf.py index eca990d99..01a5dfc0d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -5,25 +5,20 @@ import sphinx_rtd_theme import sphinx_gallery from sphinx_gallery.sorting import FileNameSortKey - -# Sphinx needs to be able to import the package to use autodoc and get the -# version number -sys.path.append(os.path.pardir) - from verde.version import full_version extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.doctest', - 'sphinx.ext.viewcode', - 'sphinx.ext.extlinks', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.doctest", + "sphinx.ext.viewcode", + "sphinx.ext.extlinks", "sphinx.ext.intersphinx", - 'matplotlib.sphinxext.plot_directive', - 'sphinx.ext.napoleon', - 'sphinx_gallery.gen_gallery', + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.napoleon", + "sphinx_gallery.gen_gallery", ] # intersphinx configuration @@ -45,62 +40,65 @@ # Otherwise, the Return parameter list looks different from the Parameters list napoleon_use_rtype = False -# Otherwise, the Attributes parameter list looks different from the Parameters list +# Otherwise, the Attributes parameter list looks different from the Parameters +# list napoleon_use_ivar = True sphinx_gallery_conf = { # path to your examples scripts - 'examples_dirs': ['../examples', '../tutorials', '../data/examples'], + "examples_dirs": ["../examples", "../tutorials", "../data/examples"], # path where to save gallery generated examples - 'gallery_dirs': ['gallery', 'tutorials', 'sample_data'], - 'filename_pattern': '\.py', + "gallery_dirs": ["gallery", "tutorials", "sample_data"], + "filename_pattern": r"\.py", # Remove the "Download all examples" button from the top level gallery - 'download_all_examples': False, + "download_all_examples": False, # Sort gallery example by file name instead of number of lines (default) - 'within_subsection_order': FileNameSortKey, + "within_subsection_order": FileNameSortKey, # directory where function granular galleries are stored - 'backreferences_dir': 'api/generated/backreferences', + "backreferences_dir": "api/generated/backreferences", # Modules for which function level galleries are created. In # this case sphinx_gallery and numpy in a tuple of strings. - 'doc_module': 'verde', + "doc_module": "verde", # Insert links to documentation of objects in the examples - 'reference_url': {'verde': None}, + "reference_url": {"verde": None}, } # Always show the source code that generates a plot plot_include_source = True -plot_formats = ['png'] +plot_formats = ["png"] # Sphinx project configuration -templates_path = ['_templates'] -exclude_patterns = ['_build', '**.ipynb_checkpoints'] -source_suffix = '.rst' +templates_path = ["_templates"] +exclude_patterns = ["_build", "**.ipynb_checkpoints"] +source_suffix = ".rst" # The encoding of source files. 
-source_encoding = 'utf-8-sig' -master_doc = 'index' +source_encoding = "utf-8-sig" +master_doc = "index" # General information about the project year = datetime.date.today().year -project = 'Verde' -copyright = '2017-{}, The Verde Developers'.format(year) -if len(full_version.split('+')) > 1 or full_version == 'unknown': - version = 'dev' +project = "Verde" +copyright = "2017-{}, The Verde Developers".format(year) +if len(full_version.split("+")) > 1 or full_version == "unknown": + version = "dev" else: version = full_version # These enable substitutions using |variable| in the rst files rst_epilog = """ .. |year| replace:: {year} -""".format(year=year) +""".format( + year=year +) -html_last_updated_fmt = '%b %d, %Y' -html_title = 'Verde' -html_short_title = 'Verde' -html_logo = '_static/verde-logo.png' -html_favicon = '_static/favicon.png' -html_static_path = ['_static'] +html_last_updated_fmt = "%b %d, %Y" +html_title = "Verde" +html_short_title = "Verde" +html_logo = "_static/verde-logo.png" +html_favicon = "_static/favicon.png" +html_static_path = ["_static"] html_extra_path = [] -pygments_style = 'default' +pygments_style = "default" add_function_parentheses = False html_show_sourcelink = False html_show_sphinx = True @@ -109,28 +107,42 @@ # Theme config html_theme = "sphinx_rtd_theme" html_theme_options = { - 'logo_only': True, - 'display_version': True, + "logo_only": True, + "display_version": True, } html_context = { - 'menu_links_name': 'Getting help and contributing', - 'menu_links': [ - (' Fatiando a Terra', 'https://www.fatiando.org'), - (' Contributing', 'https://github.com/fatiando/verde/blob/master/CONTRIBUTING.md'), - (' Code of Conduct', 'https://github.com/fatiando/verde/blob/master/CODE_OF_CONDUCT.md'), - (' Contact', 'http://contact.fatiando.org'), - (' Source Code', 'https://github.com/fatiando/verde'), + "menu_links_name": "Getting help and contributing", + "menu_links": [ + ( + ' Fatiando a Terra', + "https://www.fatiando.org", + ), + ( + ' Contributing', + "https://github.com/fatiando/verde/blob/master/CONTRIBUTING.md", + ), + ( + ' Code of Conduct', + "https://github.com/fatiando/verde/blob/master/CODE_OF_CONDUCT.md", + ), + (' Contact', "http://contact.fatiando.org"), + ( + ' Source Code', + "https://github.com/fatiando/verde", + ), ], # Custom variables to enable "Improve this page"" and "Download notebook" # links - 'doc_path': 'doc', - 'galleries': sphinx_gallery_conf['gallery_dirs'], - 'gallery_dir': dict(zip(sphinx_gallery_conf['gallery_dirs'], - sphinx_gallery_conf['examples_dirs'])), - 'github_repo': 'fatiando/verde', - 'github_version': 'master', + "doc_path": "doc", + "galleries": sphinx_gallery_conf["gallery_dirs"], + "gallery_dir": dict( + zip(sphinx_gallery_conf["gallery_dirs"], sphinx_gallery_conf["examples_dirs"]) + ), + "github_repo": "fatiando/verde", + "github_version": "master", } + # Load the custom CSS files (needs sphinx >= 1.6 for this to work) def setup(app): app.add_stylesheet("style.css") diff --git a/doc/references.rst b/doc/references.rst index 38a4bbcf1..10ae0bc7e 100644 --- a/doc/references.rst +++ b/doc/references.rst @@ -1,5 +1,7 @@ References ========== +.. [Roberts_etal2017] Roberts, D. R., Bahn, V., Ciuti, S., Boyce, M. S., Elith, J., Guillera‐Arroita, G., et al. (2017). Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure. Ecography, 40(8), 913–929. https://doi.org/10.1111/ecog.02881 .. [Sandwell1987] Sandwell, D. T. (1987). 
Biharmonic spline interpolation of GEOS-3 and SEASAT altimeter data. Geophysical Research Letters, 14(2), 139–142. https://doi.org/10.1029/GL014i002p00139 .. [SandwellWessel2016] Sandwell, D. T., & Wessel, P. (2016). Interpolation of 2-D vector data using constraints from elasticity. Geophysical Research Letters, 43(20), 2016GL070340. https://doi.org/10.1002/2016GL070340 +.. [Valavi_etal2019] Valavi, R., Elith, J., Lahoz‐Monfort, J. J., & Guillera‐Arroita, G. (2019). blockCV: An r package for generating spatially or environmentally separated folds for k-fold cross-validation of species distribution models. Methods in Ecology and Evolution, 10(2), 225–232. https://doi.org/10.1111/2041-210X.13107 diff --git a/environment.yml b/environment.yml index 52801fc3e..e5b5547a1 100644 --- a/environment.yml +++ b/environment.yml @@ -18,13 +18,13 @@ dependencies: # Development requirements - pyproj - matplotlib - - cartopy + - cartopy>=0.18 - pytest - pytest-cov - pytest-mpl - coverage - - black - - pylint + - black>=20.8b1 + - pylint=2.4.* - flake8 - sphinx=2.2.1 - sphinx_rtd_theme=0.4.3 diff --git a/examples/blockkfold.py b/examples/blockkfold.py new file mode 100644 index 000000000..3cb8df5f9 --- /dev/null +++ b/examples/blockkfold.py @@ -0,0 +1,105 @@ +""" +K-Fold cross-validation with blocks +=================================== + +Cross-validation scores for spatial data can be biased because observations are +commonly spatially autocorrelated (closer data points have similar values). One +strategy to reduce the bias is to split data along spatial blocks +[Roberts_etal2017]_. Verde offers the cross-validator +:class:`verde.BlockKFold`, which is a scikit-learn compatible version of k-fold +cross-validation using spatial blocks. + +When splitting the data into training and testing sets, +:class:`~verde.BlockKFold` first splits the data into spatial blocks and then +splits the blocks into folds. During k-fold cross-validation, one fold is used +as the testing set while the rest are used for training. Since each block can +have a different number of data points, assigning the same number of blocks to +each fold can lead to folds with very different numbers of data points. +By default, :class:`~verde.BlockKFold` takes care to balance the folds to have +approximately equal number of data points. +Alternatively, you can turn off balancing to have each fold contain the same +number of blocks. + +This example shows the data assigned to each of the first 3 folds of a blocked +k-fold iteration, with and without balancing. Notice that the unbalanced folds +have very different numbers of data points. +""" +import numpy as np +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import verde as vd + +# Let's split the Baja California shipborne bathymetry data +data = vd.datasets.fetch_baja_bathymetry() +coordinates = (data.longitude, data.latitude) + +# Create cross-validators with blocks of 30 arc-minutes with shuffling enabled. +spacing = 30 / 60 +# Set the random state so that these plots don't vary when we rerun the example +random_state = 10 +kfold = vd.BlockKFold(spacing=spacing, shuffle=True, random_state=random_state) +# Make another cross-validator with fold balancing turned off. Without +# balancing, the folds can have very different number of data points in them +# (which may bias the scores). 
+kfold_unbalanced = vd.BlockKFold( + spacing=spacing, + shuffle=True, + random_state=random_state, + balance=False, +) + +# The BlockKFold is compatible with scikit-learn, so instead of giving it a +# coordinates tuple (like we use in Verde), we have to put the coordinates in a +# feature matrix (X in scikit-learn jargon). Each column will have one of the +# coordinate values. This is usually not required if using this cross-validator +# with Verde functions and classes. You can pass it to verde.cross_val_score, +# for example. +feature_matrix = np.transpose(coordinates) + +# Create the folds for the balanced and unbalanced cross-validators to show the +# difference. +balanced = kfold.split(feature_matrix) +unbalanced = kfold_unbalanced.split(feature_matrix) + +# Cartopy requires setting the coordinate reference system (CRS) of the +# original data through the transform argument. Their docs say to use +# PlateCarree to represent geographic data. +crs = ccrs.PlateCarree() + +# Make Mercator maps of the two cross-validator folds +fig, axes = plt.subplots( + 2, + 3, + figsize=(12, 10), + subplot_kw=dict(projection=ccrs.Mercator()), + sharex=True, + sharey=True, +) +for row, title, folds in zip(axes, ["Balanced", "Unbalanced"], [balanced, unbalanced]): + for i, (ax, fold) in enumerate(zip(row, folds)): + train, test = fold + ax.set_title("{} fold {} ({} testing points)".format(title, i, test.size)) + # Use an utility function to setup the tick labels and the land feature + vd.datasets.setup_baja_bathymetry_map(ax) + ax.plot( + coordinates[0][train], + coordinates[1][train], + ".b", + markersize=1, + transform=crs, + label="Train", + ) + ax.plot( + coordinates[0][test], + coordinates[1][test], + ".r", + markersize=1, + transform=crs, + label="Test", + ) +# Place a legend on the first plot +axes[0, 0].legend(loc="upper right", markerscale=5) +plt.subplots_adjust( + hspace=0.1, wspace=0.05, top=0.95, bottom=0.05, left=0.05, right=0.95 +) +plt.show() diff --git a/examples/blockreduce.py b/examples/blockreduce.py index c9d7880d0..35fc1be83 100644 --- a/examples/blockreduce.py +++ b/examples/blockreduce.py @@ -36,5 +36,4 @@ plt.colorbar().set_label("meters") # Use a utility function to setup the tick labels and land feature vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/blockreduce_weights.py b/examples/blockreduce_weights.py index 481a45943..c24da5294 100644 --- a/examples/blockreduce_weights.py +++ b/examples/blockreduce_weights.py @@ -67,5 +67,4 @@ cb.set_label("vertical velocity [m/yr]") vd.datasets.setup_california_gps_map(ax) ax.legend(loc="lower left") -plt.tight_layout() plt.show() diff --git a/examples/blockreduce_weights_mean.py b/examples/blockreduce_weights_mean.py index bef4cced9..de2887d26 100644 --- a/examples/blockreduce_weights_mean.py +++ b/examples/blockreduce_weights_mean.py @@ -83,5 +83,4 @@ ) plt.colorbar(pc, ax=ax, orientation="horizontal", pad=0.05).set_label("mm/yr") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/convex_hull_mask.py b/examples/convex_hull_mask.py new file mode 100644 index 000000000..e2286fc16 --- /dev/null +++ b/examples/convex_hull_mask.py @@ -0,0 +1,50 @@ +""" +Mask grid points by convex hull +=============================== + +Sometimes, data points are unevenly distributed. In such cases, we might not +want to have interpolated grid points that are too far from any data point. 
+Function :func:`verde.convexhull_mask` allows us to set grid points that fall +outside of the convex hull of the data points to NaN or some other value. +""" +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import pyproj +import numpy as np +import verde as vd + +# The Baja California bathymetry dataset has big gaps on land. We want to mask +# these gaps on a dummy grid that we'll generate over the region just to show +# what that looks like. +data = vd.datasets.fetch_baja_bathymetry() +region = vd.get_region((data.longitude, data.latitude)) + +# Generate the coordinates for a regular grid mask +spacing = 10 / 60 +coordinates = vd.grid_coordinates(region, spacing=spacing) + +# Generate a mask for points. The mask is True for points that are within the +# convex hull. We can provide a projection function to convert the coordinates +# before the convex hull is calculated (Mercator in this case). +mask = vd.convexhull_mask( + data_coordinates=(data.longitude, data.latitude), + coordinates=coordinates, + projection=pyproj.Proj(proj="merc", lat_ts=data.latitude.mean()), +) +print(mask) + +# Create a dummy grid with ones that we can mask to show the results. Turn +# points that are outside of the convex hull into NaNs so they won't show up in +# our plot. +dummy_data = np.ones_like(coordinates[0]) +dummy_data[~mask] = np.nan + +# Make a plot of the masked data and the data locations. +crs = ccrs.PlateCarree() +plt.figure(figsize=(7, 6)) +ax = plt.axes(projection=ccrs.Mercator()) +ax.set_title("Only keep grid points that fall inside of the convex hull") +ax.plot(data.longitude, data.latitude, ".y", markersize=0.5, transform=crs) +ax.pcolormesh(*coordinates, dummy_data, transform=crs) +vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.show() diff --git a/examples/project_grid.py b/examples/project_grid.py new file mode 100644 index 000000000..2ac9affa4 --- /dev/null +++ b/examples/project_grid.py @@ -0,0 +1,78 @@ +""" +Projection of gridded data +========================== + +Sometimes gridded data products need to be projected before they can be used. +For example, you might want to project the topography of Antarctica from +geographic latitude and longitude to a planar polar stereographic projection +before doing your analysis. When projecting, the data points will likely not +fall on a regular grid anymore and must be interpolated (re-sampled) onto a new +grid. + +The :func:`verde.project_grid` function automates this process using the +interpolation methods available in Verde. An input grid +(:class:`xarray.DataArray`) is interpolated onto a new grid in the given +`pyproj `__ projection. The function takes +care of choosing a default grid spacing and region, running a blocked mean to +avoid spatial aliasing (using :class:`~verde.BlockReduce`), and masking the +points in the new grid that aren't constrained by the original data (using +:func:`~verde.convexhull_mask`). + +In this example, we'll generate a synthetic geographic grid with a checkerboard +pattern around the South pole. We'll project the grid to South Polar +Stereographic, revealing the checkered pattern of the data. + +..
note:: + + The interpolation methods are limited to what is available in Verde and + there is only support for single 2D grids. For more sophisticated use + cases, you might want to try + `pyresample `__ instead. + +""" +import numpy as np +import matplotlib.pyplot as plt +import pyproj +import verde as vd + + +# We'll use synthetic data near the South pole to highlight the effects of the +# projection. EPSG 3031 is a South Polar Stereographic projection. +projection = pyproj.Proj("epsg:3031") + +# Create a synthetic geographic grid using a checkerboard pattern +region = (0, 360, -90, -60) +spacing = 0.25 +wavelength = 10 * 1e5 # The size of the cells in the checkerboard +checkerboard = vd.datasets.CheckerBoard( + region=vd.project_region(region, projection), w_east=wavelength, w_north=wavelength +) +data = checkerboard.grid( + region=region, + spacing=spacing, + projection=projection, + data_names="checkerboard", + dims=("latitude", "longitude"), +) +print("Geographic grid:") +print(data) + +# Do the projection while setting the output grid spacing (in projected meters). Set +# the coordinates names to x and y since they aren't really "northing" or +# "easting". +polar_data = vd.project_grid( + data.checkerboard, projection, spacing=0.5 * 1e5, dims=("y", "x") +) +print("\nProjected grid:") +print(polar_data) + +# Plot the original and projected grids +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) +data.checkerboard.plot( + ax=ax1, cbar_kwargs=dict(orientation="horizontal", aspect=50, pad=0.1) +) +ax1.set_title("Geographic Grid") +polar_data.plot(ax=ax2, cbar_kwargs=dict(orientation="horizontal", aspect=50, pad=0.1)) +ax2.set_title("Polar Stereographic Grid") +plt.tight_layout() +plt.show() diff --git a/examples/scipygridder.py b/examples/scipygridder.py index c8bfe364e..fbc57cee4 100644 --- a/examples/scipygridder.py +++ b/examples/scipygridder.py @@ -58,7 +58,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry_m"], + data_names="bathymetry_m", ) print("Generated geographic grid:") print(grid) @@ -81,5 +81,4 @@ ax.plot(*coordinates, ".k", markersize=0.5, transform=crs) # Use an utility function to setup the tick labels and the land feature vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/examples/spline.py b/examples/spline.py index 13f8b4cd1..19e42dee9 100644 --- a/examples/spline.py +++ b/examples/spline.py @@ -66,7 +66,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) grid = vd.distance_mask( coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection @@ -84,5 +84,4 @@ plt.colorbar(tmp).set_label("Air temperature (C)") # Use an utility function to add tick labels and land and ocean features to the map. vd.datasets.setup_texas_wind_map(ax, region=region) -plt.tight_layout() plt.show() diff --git a/examples/spline_cv.py b/examples/spline_cv.py index a5bea4b29..a880fdd77 100644 --- a/examples/spline_cv.py +++ b/examples/spline_cv.py @@ -59,7 +59,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) grid = vd.distance_mask( coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection @@ -76,5 +76,4 @@ plt.colorbar(tmp).set_label("Air temperature (C)") # Use an utility function to add tick labels and land and ocean features to the map. 
vd.datasets.setup_texas_wind_map(ax, region=region) -plt.tight_layout() plt.show() diff --git a/examples/spline_weights.py b/examples/spline_weights.py index 73de4fd52..b246d67ef 100644 --- a/examples/spline_weights.py +++ b/examples/spline_weights.py @@ -62,11 +62,8 @@ dims=["latitude", "longitude"], data_names=["velocity"], ) -grid = vd.distance_mask( - (data.longitude, data.latitude), - maxdist=5 * spacing * 111e3, - grid=grid_full, - projection=projection, +grid = vd.convexhull_mask( + (data.longitude, data.latitude), grid=grid_full, projection=projection ) fig, axes = plt.subplots( @@ -107,5 +104,4 @@ ax.scatter(*coordinates, c="black", s=0.5, alpha=0.1, transform=crs) vd.datasets.setup_california_gps_map(ax, region=region) ax.coastlines() -plt.tight_layout() plt.show() diff --git a/examples/train_test_split.py b/examples/train_test_split.py new file mode 100644 index 000000000..8e3ce4d15 --- /dev/null +++ b/examples/train_test_split.py @@ -0,0 +1,89 @@ +""" +Splitting data into train and test sets +======================================= + +Verde gridders are mostly linear models that are used to predict data at new +locations. As such, they are subject to *over-fitting* and we should always +strive to quantify the quality of the model predictions (see +:ref:`model_evaluation`). Common practice for +doing this is to split the data into training (the one that is used to fit the +model) and testing (the one that is used to validate the predictions) datasets. + +These two datasets can be generated by splitting the data randomly (without +regard for their positions in space). This is the default behaviour of function +:func:`verde.train_test_split`, which is based on the scikit-learn function +:func:`sklearn.model_selection.train_test_split`. This can be problematic if +the data points are autocorrelated (values close to each other spatially tend +to have similar values). In these cases, splitting the data randomly can +overestimate the prediction quality [Roberts_etal2017]_. + +Alternatively, Verde allows splitting the data along *spatial blocks*. In this +case, the data are first grouped into blocks with a given size and then the +blocks are split randomly between training and testing sets. + +This example compares splitting our sample dataset using both methods. +""" +import matplotlib.pyplot as plt +import cartopy.crs as ccrs +import verde as vd + +# Let's split the Baja California shipborne bathymetry data +data = vd.datasets.fetch_baja_bathymetry() +coordinates = (data.longitude, data.latitude) +values = data.bathymetry_m + +# Assign 20% of the data to the testing set. +test_size = 0.2 + +# Split the data randomly into training and testing. Set the random state +# (seed) so that we get the same result if running this code again. +train, test = vd.train_test_split( + coordinates, values, test_size=test_size, random_state=123 +) +# train and test are tuples = (coordinates, data, weights). +print("Train and test size for random splits:", train[0][0].size, test[0][0].size) + +# A different strategy is to first assign the data to blocks and then split the +# blocks randomly. To do this, specify the size of the blocks using the +# 'spacing' argument. +train_block, test_block = vd.train_test_split( + coordinates, + values, + spacing=10 / 60, + test_size=test_size, + random_state=213, +) +# Verde will automatically attempt to balance the data between the splits so +# that the desired amount is assigned to the test set. 
It won't be exact since +# blocks contain different amounts of data points. +print( + "Train and test size for block splits: ", + train_block[0][0].size, + test_block[0][0].size, +) + +# Cartopy requires setting the coordinate reference system (CRS) of the +# original data through the transform argument. Their docs say to use +# PlateCarree to represent geographic data. +crs = ccrs.PlateCarree() + +# Make Mercator maps of the two different ways of splitting +fig, (ax1, ax2) = plt.subplots( + 1, 2, figsize=(10, 6), subplot_kw=dict(projection=ccrs.Mercator()) +) + +# Use an utility function to setup the tick labels and the land feature +vd.datasets.setup_baja_bathymetry_map(ax1) +vd.datasets.setup_baja_bathymetry_map(ax2) + +ax1.set_title("Random splitting") +ax1.plot(*train[0], ".b", markersize=2, transform=crs, label="Train") +ax1.plot(*test[0], ".r", markersize=2, transform=crs, label="Test", alpha=0.5) + +ax2.set_title("Blocked random splitting") +ax2.plot(*train_block[0], ".b", markersize=2, transform=crs, label="Train") +ax2.plot(*test_block[0], ".r", markersize=2, transform=crs, label="Test") +ax2.legend(loc="upper right") + +plt.subplots_adjust(wspace=0.15, top=1, bottom=0, left=0.05, right=0.95) +plt.show() diff --git a/examples/trend.py b/examples/trend.py index f301150de..36431b351 100644 --- a/examples/trend.py +++ b/examples/trend.py @@ -72,8 +72,11 @@ def plot_data(column, i, title): # Add a single colorbar on top of the histogram plot where there is some space cax = plt.axes((0.35, 0.44, 0.10, 0.01)) -cb = plt.colorbar(mappable, cax=cax, orientation="horizontal",) +cb = plt.colorbar( + mappable, + cax=cax, + orientation="horizontal", +) cb.set_label("C") -plt.tight_layout() plt.show() diff --git a/examples/vector_trend.py b/examples/vector_trend.py index b59f555ab..0384609d2 100644 --- a/examples/vector_trend.py +++ b/examples/vector_trend.py @@ -86,5 +86,4 @@ ) ax.coastlines(color="white") axes[0].legend(loc="lower left") -plt.tight_layout() plt.show() diff --git a/examples/vector_uncoupled.py b/examples/vector_uncoupled.py index acc6db163..05e160c84 100644 --- a/examples/vector_uncoupled.py +++ b/examples/vector_uncoupled.py @@ -61,13 +61,11 @@ print("Cross-validation R^2 score: {:.2f}".format(score)) # Interpolate the wind speed onto a regular geographic grid and mask the data that are -# far from the observation points +# outside of the convex hull of the data points. grid_full = chain.grid( region, spacing=spacing, projection=projection, dims=["latitude", "longitude"] ) -grid = vd.distance_mask( - coordinates, maxdist=3 * spacing * 111e3, grid=grid_full, projection=projection -) +grid = vd.convexhull_mask(coordinates, grid=grid_full, projection=projection) # Make maps of the original and gridded wind speed plt.figure(figsize=(6, 6)) @@ -98,5 +96,4 @@ ax.legend(loc="lower left") # Use an utility function to add tick labels and land and ocean features to the map. 
vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() diff --git a/requirements-dev.txt b/requirements-dev.txt index f81cae355..d96576a4e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # Development requirements pyproj matplotlib -cartopy +cartopy>=0.18 pytest pytest-cov pytest-mpl diff --git a/tutorials/chain.py b/tutorials/chain.py index aa3ed1d93..f0ac27ecc 100644 --- a/tutorials/chain.py +++ b/tutorials/chain.py @@ -54,7 +54,6 @@ ) plt.colorbar().set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -95,7 +94,6 @@ plt.hist(residuals, bins="auto", density=True) plt.xlabel("residuals (m)") plt.xlim(-1500, 1500) -plt.tight_layout() plt.show() ######################################################################################## @@ -108,7 +106,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print(grid) @@ -123,7 +121,6 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -143,7 +140,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print(grid_trend) @@ -155,5 +152,4 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() diff --git a/tutorials/decimation.py b/tutorials/decimation.py index 5b0b23335..fb9e72c78 100644 --- a/tutorials/decimation.py +++ b/tutorials/decimation.py @@ -24,7 +24,6 @@ # Plot the bathymetry data locations as black dots plt.plot(data.longitude, data.latitude, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -57,7 +56,6 @@ # Plot the bathymetry data locations as black dots plt.plot(*coordinates, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() @@ -79,7 +77,6 @@ # Plot the bathymetry data locations as black dots plt.plot(*coordinates_center, ".k", markersize=1, transform=crs) vd.datasets.setup_baja_bathymetry_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/model_evaluation.py b/tutorials/model_evaluation.py index 42f2623c6..8a8252686 100644 --- a/tutorials/model_evaluation.py +++ b/tutorials/model_evaluation.py @@ -74,7 +74,6 @@ ax.plot(test[0][0], test[0][1], ".b", label="test") ax.legend() ax.set_aspect("equal") -plt.tight_layout() plt.show() ######################################################################################## @@ -92,7 +91,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) print(grid) @@ -120,7 +119,6 @@ plt.colorbar(pc).set_label("C") ax.plot(data.longitude, data.latitude, ".k", markersize=1, transform=ccrs.PlateCarree()) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/model_selection.py b/tutorials/model_selection.py index 3cc5c1925..cb0571a83 100644 --- 
a/tutorials/model_selection.py +++ b/tutorials/model_selection.py @@ -100,7 +100,10 @@ # automatically when fitting a dataset. The only difference is that you must provide a # list of ``damping`` and ``mindist`` parameters to try instead of only a single value: -spline = vd.SplineCV(dampings=dampings, mindists=mindists,) +spline = vd.SplineCV( + dampings=dampings, + mindists=mindists, +) ######################################################################################## # Calling :meth:`~verde.SplineCV.fit` will run a grid search over all parameter @@ -125,7 +128,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) print(grid) @@ -171,7 +174,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["temperature"], + data_names="temperature", ) ######################################################################################## @@ -205,7 +208,6 @@ data.longitude, data.latitude, ".k", markersize=1, transform=ccrs.PlateCarree() ) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/overview.py b/tutorials/overview.py index 9ff5f6947..ea252bade 100644 --- a/tutorials/overview.py +++ b/tutorials/overview.py @@ -121,7 +121,7 @@ # and "northing") and data variables ("scalars"). You can overwrite these names by # setting the ``dims`` and ``data_names`` arguments. -grid = spline.grid(spacing=30, dims=["latitude", "longitude"], data_names=["gravity"]) +grid = spline.grid(spacing=30, dims=["latitude", "longitude"], data_names="gravity") print(grid) plt.figure() diff --git a/tutorials/projections.py b/tutorials/projections.py index 16edce9d4..008f9f1b0 100644 --- a/tutorials/projections.py +++ b/tutorials/projections.py @@ -60,7 +60,7 @@ ######################################################################################## # If we now call :meth:`verde.Spline.grid` we'll get back a grid evenly spaced in # projected Cartesian coordinates. -grid = spline.grid(spacing=spacing * 111e3, data_names=["bathymetry"]) +grid = spline.grid(spacing=spacing * 111e3, data_names="bathymetry") print("Cartesian grid:") print(grid) @@ -105,7 +105,7 @@ spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["bathymetry"], + data_names="bathymetry", ) print("Geographic grid:") print(grid_geo) @@ -132,5 +132,66 @@ ) plt.colorbar(pc).set_label("meters") vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.show() + +######################################################################################## +# Profiles +# -------- +# +# For profiles, things are a bit different. The projection is applied to the +# input points before coordinates are generated. So the profile will be evenly +# spaced in *projected coordinates*, not geographic coordinates. This is to +# avoid issues with calculating distances on a sphere. +# +# The coordinates returned by the ``profile`` method will be in geographic +# coordinates, so projections given to ``profile`` must take an ``inverse`` +# argument so we can undo the projection. +# +# The main utility of using a projection with ``profile`` is being able to pass +# in points in geographic coordinates and get coordinates back in that same +# system (making it easier to plot on a map). 
+# +# To generate a profile cutting across our bathymetry data, we can use +# longitude and latitude points taken from the map above). + +start = (-114.5, 24.7) +end = (-110, 20.5) +profile = spline.profile( + point1=start, + point2=end, + size=200, + projection=projection, + dims=("latitude", "longitude"), + data_names=["bathymetry"], +) +print(profile) + +######################################################################################## +# Plot the profile location on our geographic grid from above. + +plt.figure(figsize=(7, 6)) +ax = plt.axes(projection=ccrs.Mercator()) +ax.set_title("Profile location") +pc = grid_geo.bathymetry.plot.pcolormesh( + ax=ax, transform=ccrs.PlateCarree(), vmax=0, zorder=-1, add_colorbar=False +) +plt.colorbar(pc).set_label("meters") +ax.plot(profile.longitude, profile.latitude, "-k", transform=ccrs.PlateCarree()) +ax.text(start[0], start[1], "A", transform=ccrs.PlateCarree()) +ax.text(end[0], end[1], "B", transform=ccrs.PlateCarree()) +vd.datasets.setup_baja_bathymetry_map(ax, land=None) +plt.show() + +######################################################################################## +# And finally plot the profile. + +plt.figure(figsize=(8, 3)) +ax = plt.axes() +ax.set_title("Profile of bathymetry (A-B)") +ax.plot(profile.distance, profile.bathymetry, "-k") +ax.set_xlabel("Distance (m)") +ax.set_ylabel("Bathymetry (m)") +ax.set_xlim(profile.distance.min(), profile.distance.max()) +ax.grid() plt.tight_layout() plt.show() diff --git a/tutorials/trends.py b/tutorials/trends.py index 644c0bce3..35533c6e9 100644 --- a/tutorials/trends.py +++ b/tutorials/trends.py @@ -77,7 +77,6 @@ ) plt.colorbar(tmp, ax=ax, orientation="horizontal", pad=0.08) vd.datasets.setup_texas_wind_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/vectors.py b/tutorials/vectors.py index 63797fe61..bff5f0721 100644 --- a/tutorials/vectors.py +++ b/tutorials/vectors.py @@ -35,7 +35,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("GPS horizontal velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() @@ -87,7 +86,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("Block mean velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() ######################################################################################## @@ -142,7 +140,6 @@ ax.hist(data.velocity_east - pred_east, bins="auto", label="East", alpha=0.7) ax.legend() ax.set_xlabel("Velocity (m/yr)") -plt.tight_layout() plt.show() ######################################################################################## @@ -190,7 +187,6 @@ cb.set_label("meters/year") vd.datasets.setup_california_gps_map(ax, land=None, ocean=None) ax.coastlines(color="white") -plt.tight_layout() plt.show() ######################################################################################## @@ -277,7 +273,6 @@ ax.quiverkey(tmp, 0.2, 0.15, 0.05, label="0.05 m/yr", coordinates="figure") ax.set_title("Gridded velocities") vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() ######################################################################################## diff --git a/tutorials/weights.py b/tutorials/weights.py index 099f1f216..073f84558 100644 --- a/tutorials/weights.py +++ b/tutorials/weights.py @@ -53,7 +53,6 @@ def plot_data(coordinates, velocity, weights, title_data, 
title_weights): ) plt.colorbar(pc, ax=ax, orientation="horizontal", pad=0.05) vd.datasets.setup_california_gps_map(ax) - plt.tight_layout() plt.show() @@ -244,8 +243,10 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["velocity"], + data_names="velocity", ) +# Avoid showing interpolation outside of the convex hull of the data points. +grid = vd.convexhull_mask(coordinates, grid=grid, projection=projection) ######################################################################################## # Calculate an unweighted spline as well for comparison. @@ -260,7 +261,10 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): spacing=spacing, projection=projection, dims=["latitude", "longitude"], - data_names=["velocity"], + data_names="velocity", +) +grid_unweighted = vd.convexhull_mask( + coordinates, grid=grid_unweighted, projection=projection ) ######################################################################################## @@ -300,5 +304,4 @@ def plot_data(coordinates, velocity, weights, title_data, title_weights): ax.plot(data.longitude, data.latitude, ".k", markersize=0.1, transform=crs) ax.coastlines() vd.datasets.setup_california_gps_map(ax) -plt.tight_layout() plt.show() diff --git a/verde/__init__.py b/verde/__init__.py index d1fd2ee62..9f59575e2 100644 --- a/verde/__init__.py +++ b/verde/__init__.py @@ -7,14 +7,15 @@ grid_coordinates, inside, block_split, + rolling_window, + expanding_window, profile_coordinates, get_region, pad_region, - project_region, longitude_continuity, ) -from .mask import distance_mask -from .utils import variance_to_weights, maxabs, grid_to_table +from .mask import distance_mask, convexhull_mask +from .utils import variance_to_weights, maxabs, grid_to_table, make_xarray_grid from .io import load_surfer from .distances import median_distance from .blockreduce import BlockReduce, BlockMean @@ -22,8 +23,14 @@ from .trend import Trend from .chain import Chain from .spline import Spline, SplineCV -from .model_selection import cross_val_score, train_test_split +from .model_selection import ( + cross_val_score, + train_test_split, + BlockShuffleSplit, + BlockKFold, +) from .vector import Vector, VectorSpline2D +from .projections import project_region, project_grid def test(doctest=True, verbose=True, coverage=False, figures=True): diff --git a/verde/base/__init__.py b/verde/base/__init__.py index 9e246d929..f792ee4f7 100644 --- a/verde/base/__init__.py +++ b/verde/base/__init__.py @@ -1,4 +1,4 @@ # pylint: disable=missing-docstring -from .base_classes import BaseGridder +from .base_classes import BaseGridder, BaseBlockCrossValidator from .least_squares import least_squares from .utils import n_1d_arrays, check_fit_input diff --git a/verde/base/base_classes.py b/verde/base/base_classes.py index 503f9abbe..0061025be 100644 --- a/verde/base/base_classes.py +++ b/verde/base/base_classes.py @@ -1,14 +1,130 @@ """ Base classes for all gridders. 
""" -import xarray as xr +from abc import ABCMeta, abstractmethod + import pandas as pd -import numpy as np from sklearn.base import BaseEstimator -from sklearn.metrics import r2_score +from sklearn.model_selection import BaseCrossValidator from ..coordinates import grid_coordinates, profile_coordinates, scatter_points -from .utils import check_data, check_fit_input +from .utils import check_data, check_data_names, score_estimator +from ..utils import make_xarray_grid + + +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class BaseBlockCrossValidator(BaseCrossValidator, metaclass=ABCMeta): + """ + Base class for spatially blocked cross-validators. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* **must be provided**. + n_splits : int + Number of splitting iterations. + + """ + + def __init__( + self, + spacing=None, + shape=None, + n_splits=10, + ): + if spacing is None and shape is None: + raise ValueError("Either 'spacing' or 'shape' must be provided.") + self.spacing = spacing + self.shape = shape + self.n_splits = n_splits + + def split(self, X, y=None, groups=None): + """ + Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. + + """ + if X.shape[1] != 2: + raise ValueError( + "X must have exactly 2 columns ({} given).".format(X.shape[1]) + ) + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """ + Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + y : object + Always ignored, exists for compatibility. + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + @abstractmethod + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Generates integer indices corresponding to test sets. + + MUST BE IMPLEMENTED BY DERIVED CLASSES. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + test : ndarray + The testing set indices for that split. 
+ + """ + + +# pylint: enable=invalid-name,unused-argument class BaseGridder(BaseEstimator): @@ -69,7 +185,7 @@ class BaseGridder(BaseEstimator): >>> # Fit the gridder to our synthetic data >>> grd = MeanGridder().fit((data.easting, data.northing), data.scalars) >>> grd - MeanGridder(multiplier=1) + MeanGridder() >>> # Interpolate on a regular grid >>> grid = grd.grid(region=(0, 5, -10, -8), shape=(30, 20)) >>> np.allclose(grid.scalars, -32.2182) @@ -83,6 +199,24 @@ class BaseGridder(BaseEstimator): """ + # The default dimension names for generated outputs + # (pd.DataFrame, xr.Dataset, etc) + dims = ("northing", "easting") + + # The default name for any extra coordinates given to methods below + # through the `extra_coords` keyword argument. Coordinates are + # included in the outputs (pandas.DataFrame or xarray.Dataset) + # using this name as a basis. + extra_coords_name = "extra_coord" + + # Define default values for data_names depending on the number of data + # arrays returned by predict method. + data_names_defaults = [ + ("scalars",), + ("east_component", "north_component"), + ("east_component", "north_component", "vertical_component"), + ] + def predict(self, coordinates): """ Predict data on the given coordinate values. NOT IMPLEMENTED. @@ -148,6 +282,8 @@ def filter(self, coordinates, data, weights=None): coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. data : array or tuple of arrays The data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each @@ -190,6 +326,8 @@ def score(self, coordinates, data, weights=None): coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. data : array or tuple of arrays The data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each @@ -205,17 +343,7 @@ def score(self, coordinates, data, weights=None): The R^2 score """ - coordinates, data, weights = check_fit_input( - coordinates, data, weights, unpack=False - ) - pred = check_data(self.predict(coordinates)) - result = np.mean( - [ - r2_score(datai.ravel(), predi.ravel(), sample_weight=weighti) - for datai, predi, weighti in zip(data, pred, weights) - ] - ) - return result + return score_estimator("r2", self, coordinates, data, weights=weights) def grid( self, @@ -226,7 +354,7 @@ def grid( data_names=None, projection=None, **kwargs - ): + ): # pylint: disable=too-many-locals """ Interpolate the data onto a regular grid. @@ -246,8 +374,7 @@ def grid( Parameters ---------- region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. + The west, east, south, and north boundaries of a given region. shape : tuple = (n_north, n_east) or None The number of points in the South-North and West-East directions, respectively. @@ -256,12 +383,14 @@ def grid( respectively. dims : list or None The names of the northing and easting data dimensions, - respectively, in the output grid. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output grid. 
Default is determined from the + ``dims`` attribute of the class. Must be defined in the following + order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : str, list or None The name(s) of the data variables in the output grid. Defaults to - ``['scalars']`` for scalar data, + ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. @@ -285,20 +414,32 @@ def grid( verde.grid_coordinates : Generate the coordinate values for the grid. """ - dims = get_dims(dims) + dims = self._get_dims(dims) region = get_instance_region(self, region) coordinates = grid_coordinates(region, shape=shape, spacing=spacing, **kwargs) if projection is None: data = check_data(self.predict(coordinates)) else: - data = check_data(self.predict(projection(*coordinates))) - data_names = get_data_names(data, data_names) - coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} - attrs = {"metadata": "Generated by {}".format(repr(self))} - data_vars = { - name: (dims, value, attrs) for name, value in zip(data_names, data) - } - return xr.Dataset(data_vars, coords=coords, attrs=attrs) + data = check_data( + self.predict(project_coordinates(coordinates, projection)) + ) + # Get names for data and any extra coordinates + data_names = self._get_data_names(data, data_names) + extra_coords_names = self._get_extra_coords_names(coordinates) + # Create xarray.Dataset + dataset = make_xarray_grid( + coordinates, + data, + data_names, + dims=dims, + extra_coords_names=extra_coords_names, + ) + # Add metadata as attrs + metadata = "Generated by {}".format(repr(self)) + dataset.attrs["metadata"] = metadata + for array in dataset: + dataset[array].attrs["metadata"] = metadata + return dataset def scatter( self, @@ -327,8 +468,7 @@ def scatter( Parameters ---------- region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. + The west, east, south, and north boundaries of a given region. size : int The number of points to generate. random_state : numpy.random.RandomState or an int seed @@ -338,12 +478,14 @@ def scatter( (resulting in different numbers with each run). dims : list or None The names of the northing and easting data dimensions, - respectively, in the output dataframe. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output dataframe. Default is determined from + the ``dims`` attribute of the class. Must be defined in the + following order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : str, list or None The name(s) of the data variables in the output dataframe. Defaults - to ``['scalars']`` for scalar data, + to ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. @@ -362,15 +504,19 @@ def scatter( The interpolated values on a random set of points. 
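# Illustrative call of the grid method above, showing that data_names now
# also accepts a single string. The sample dataset, spacing, and dimension
# names are arbitrary choices for this sketch.
import verde as vd

data = vd.datasets.fetch_texas_wind()
spline = vd.Spline().fit((data.longitude, data.latitude), data.air_temperature_c)
grid = spline.grid(
    spacing=0.5, data_names="temperature", dims=("latitude", "longitude")
)
print(grid.temperature.dims)  # ('latitude', 'longitude')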
""" - dims = get_dims(dims) + dims = self._get_dims(dims) region = get_instance_region(self, region) coordinates = scatter_points(region, size, random_state=random_state, **kwargs) if projection is None: data = check_data(self.predict(coordinates)) else: - data = check_data(self.predict(projection(*coordinates))) - data_names = get_data_names(data, data_names) + data = check_data( + self.predict(project_coordinates(coordinates, projection)) + ) + data_names = self._get_data_names(data, data_names) columns = [(dims[0], coordinates[1]), (dims[1], coordinates[0])] + extra_coords_names = self._get_extra_coords_names(coordinates) + columns.extend(zip(extra_coords_names, coordinates[2:])) columns.extend(zip(data_names, data)) return pd.DataFrame(dict(columns), columns=[c[0] for c in columns]) @@ -399,6 +545,30 @@ def profile( Includes the calculated Cartesian distance from *point1* for each data point in the profile. + To specify *point1* and *point2* in a coordinate system that would + require projection to Cartesian (geographic longitude and latitude, for + example), use the ``projection`` argument. With this option, the input + points will be projected using the given projection function prior to + computations. The generated Cartesian profile coordinates will be + projected back to the original coordinate system. **Note that the + profile points are evenly spaced in projected coordinates, not the + original system (e.g., geographic)**. + + .. warning:: + + **The profile calculation method with a projection has changed in + Verde 1.4.0**. Previous versions generated coordinates (assuming + they were Cartesian) and projected them afterwards. This led to + "distances" being incorrectly handled and returned in unprojected + coordinates. For example, if ``projection`` is from geographic to + Mercator, the distances would be "angles" (incorrectly calculated + as if they were Cartesian). After 1.4.0, *point1* and *point2* are + projected prior to generating coordinates for the profile, + guaranteeing that distances are properly handled in a Cartesian + system. **With this change, the profile points are now evenly + spaced in projected coordinates and the distances are returned in + projected coordinates as well.** + Parameters ---------- point1 : tuple @@ -411,23 +581,28 @@ def profile( The number of points to generate. dims : list or None The names of the northing and easting data dimensions, - respectively, in the output dataframe. Defaults to ``['northing', - 'easting']``. **NOTE: This is an exception to the "easting" then + respectively, in the output dataframe. Default is determined from + the ``dims`` attribute of the class. Must be defined in the + following order: northing dimension, easting dimension. + **NOTE: This is an exception to the "easting" then "northing" pattern but is required for compatibility with xarray.** - data_names : list of None + data_names : str, list or None The name(s) of the data variables in the output dataframe. Defaults - to ``['scalars']`` for scalar data, + to ``'scalars'`` for scalar data, ``['east_component', 'north_component']`` for 2D vector data, and ``['east_component', 'north_component', 'vertical_component']`` for 3D vector data. projection : callable or None - If not None, then should be a callable object - ``projection(easting, northing) -> (proj_easting, proj_northing)`` - that takes in easting and northing coordinate arrays and returns - projected northing and easting coordinate arrays. 
This function - will be used to project the generated profile coordinates before - passing them into ``predict``. For example, you can use this to - generate a geographic profile from a Cartesian gridder. + If not None, then should be a callable object ``projection(easting, + northing, inverse=False) -> (proj_easting, proj_northing)`` that + takes in easting and northing coordinate arrays and returns + projected northing and easting coordinate arrays. Should also take + an optional keyword argument ``inverse`` (default to False) that if + True will calculate the inverse transform instead. This function + will be used to project the profile end points before generating + coordinates and passing them into ``predict``. It will also be used + to undo the projection of the coordinates before returning the + results. Returns ------- @@ -435,82 +610,141 @@ def profile( The interpolated values along the profile. """ - dims = get_dims(dims) + dims = self._get_dims(dims) + # Project the input points to generate the profile in Cartesian + # coordinates (the distance calculation doesn't make sense in + # geographic coordinates since we don't do actual distances on a + # sphere). + if projection is not None: + point1 = project_coordinates(point1, projection) + point2 = project_coordinates(point2, projection) coordinates, distances = profile_coordinates(point1, point2, size, **kwargs) - if projection is None: - data = check_data(self.predict(coordinates)) - else: - data = check_data(self.predict(projection(*coordinates))) - data_names = get_data_names(data, data_names) + data = check_data(self.predict(coordinates)) + # Project the coordinates back to have geographic coordinates for the + # profile but Cartesian distances. + if projection is not None: + coordinates = project_coordinates(coordinates, projection, inverse=True) + data_names = self._get_data_names(data, data_names) columns = [ (dims[0], coordinates[1]), (dims[1], coordinates[0]), ("distance", distances), ] + extra_coords_names = self._get_extra_coords_names(coordinates) + columns.extend(zip(extra_coords_names, coordinates[2:])) columns.extend(zip(data_names, data)) return pd.DataFrame(dict(columns), columns=[c[0] for c in columns]) + def _get_dims(self, dims): + """ + Get default dimension names. + """ + if dims is not None: + return dims + return self.dims -def get_dims(dims): - """ - Get default dimension names. + def _get_extra_coords_names(self, coordinates): + """ + Return names for extra coordinates - Examples - -------- + Examples + -------- - >>> get_dims(dims=None) - ('northing', 'easting') - >>> get_dims(dims=('john', 'paul')) - ('john', 'paul') + >>> coordinates = (-5, 4, 3, 5, 1) + >>> grd = BaseGridder() + >>> grd._get_extra_coords_names(coordinates) + ['extra_coord', 'extra_coord_1', 'extra_coord_2'] - """ - if dims is not None: - return dims - return ("northing", "easting") + >>> coordinates = (-5, 4, 3) + >>> grd = BaseGridder() + >>> grd.extra_coords_name = "upward" + >>> grd._get_extra_coords_names(coordinates) + ['upward'] + + """ + names = [] + for i in range(len(coordinates[2:])): + name = self.extra_coords_name + if i > 0: + name += "_{}".format(i) + names.append(name) + return names + + def _get_data_names(self, data, data_names): + """ + Get default names for data fields if none are given based on the data. 
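# A small sketch of the profile-with-projection behavior described above:
# end points are given in geographic coordinates, the projection is applied
# before the profile is generated, and distances come back in projected
# (Cartesian) units. The use of pyproj and the synthetic data are assumptions
# made for this sketch.
import numpy as np
import pyproj
import verde as vd

projection = pyproj.Proj(proj="merc", lat_ts=0.0)
lon, lat = vd.grid_coordinates((-10, 10, -10, 10), spacing=1)
values = 100 * np.cos(np.radians(lat.ravel()))
trend = vd.Trend(degree=2).fit(projection(lon.ravel(), lat.ravel()), values)
profile = trend.profile(
    point1=(-5, -5),
    point2=(5, 5),
    size=50,
    projection=projection,
    dims=("latitude", "longitude"),
    data_names="values",
)
print(profile.distance.max())  # meters along the projected line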
+ + Examples + -------- + >>> import numpy as np + >>> east, north, up = [np.arange(10)]*3 + >>> gridder = BaseGridder() + >>> gridder._get_data_names((east,), data_names=None) + ('scalars',) + >>> gridder._get_data_names((east, north), data_names=None) + ('east_component', 'north_component') + >>> gridder._get_data_names((east, north, up), data_names=None) + ('east_component', 'north_component', 'vertical_component') + >>> gridder._get_data_names((east,), data_names="john") + ('john',) + >>> gridder._get_data_names((east,), data_names=("paul",)) + ('paul',) + >>> gridder._get_data_names( + ... (up, north), data_names=('ringo', 'george') + ... ) + ('ringo', 'george') + >>> gridder._get_data_names((north,), data_names=["brian"]) + ['brian'] -def get_data_names(data, data_names): + """ + # Return the defaults data_names for the class + if data_names is None: + if len(data) > len(self.data_names_defaults): + raise ValueError( + "Default data names only available for up to 3 components. " + + "Must provide custom names through the 'data_names' argument." + ) + return self.data_names_defaults[len(data) - 1] + # Return the passed data_names if valid + data_names = check_data_names(data, data_names) + return data_names + + +def project_coordinates(coordinates, projection, **kwargs): """ - Get default names for data fields if none are given based on the data. + Apply projection to given coordinates + + Allows to apply projections to any number of coordinates, assuming + that the first ones are ``longitude`` and ``latitude``. Examples -------- - >>> import numpy as np - >>> east, north, up = [np.arange(10)]*3 - >>> get_data_names((east,), data_names=None) - ('scalars',) - >>> get_data_names((east, north), data_names=None) - ('east_component', 'north_component') - >>> get_data_names((east, north, up), data_names=None) - ('east_component', 'north_component', 'vertical_component') - >>> get_data_names((up, north), data_names=('ringo', 'george')) - ('ringo', 'george') + >>> # Define a custom projection function + >>> def projection(lon, lat, inverse=False): + ... "Simple projection of geographic coordinates" + ... radius = 1000 + ... if inverse: + ... return (lon / radius, lat / radius) + ... return (lon * radius, lat * radius) + + >>> # Apply the projection to a set of coordinates containing: + >>> # longitude, latitude and height + >>> coordinates = (1., -2., 3.) 
+ >>> project_coordinates(coordinates, projection) + (1000.0, -2000.0, 3.0) + + >>> # Apply the inverse projection + >>> coordinates = (-500.0, 1500.0, -19.0) + >>> project_coordinates(coordinates, projection, inverse=True) + (-0.5, 1.5, -19.0) """ - if data_names is not None: - if len(data) != len(data_names): - raise ValueError( - "Data has {} components but only {} names provided: {}".format( - len(data), len(data_names), str(data_names) - ) - ) - return data_names - data_types = [ - ("scalars",), - ("east_component", "north_component"), - ("east_component", "north_component", "vertical_component"), - ] - if len(data) > len(data_types): - raise ValueError( - " ".join( - [ - "Default data names only available for up to 3 components.", - "Must provide custom names through the 'data_names' argument.", - ] - ) - ) - return data_types[len(data) - 1] + proj_coordinates = projection(*coordinates[:2], **kwargs) + if len(coordinates) > 2: + proj_coordinates += tuple(coordinates[2:]) + return proj_coordinates def get_instance_region(instance, region): diff --git a/verde/base/least_squares.py b/verde/base/least_squares.py index 81ec3b318..e2b738ad0 100644 --- a/verde/base/least_squares.py +++ b/verde/base/least_squares.py @@ -7,7 +7,7 @@ from sklearn.linear_model import LinearRegression, Ridge -def least_squares(jacobian, data, weights, damping=None): +def least_squares(jacobian, data, weights, damping=None, copy_jacobian=False): """ Solve a weighted least-squares problem with optional damping regularization @@ -17,6 +17,12 @@ def least_squares(jacobian, data, weights, damping=None): required for predictions. Doesn't normalize the column means because that operation can't be undone. + .. warning:: + + Setting `copy_jacobian` to True will copy the Jacobian matrix, doubling + the memory required. Use it only if the Jacobian matrix is needed + afterwards. + Parameters ---------- jacobian : 2d-array @@ -31,6 +37,9 @@ def least_squares(jacobian, data, weights, damping=None): damping : None or float The positive damping (Tikhonov 0th order) regularization parameter. If ``damping=None``, will use a regular least-squares fit. + copy_jacobian: bool + If False, the Jacobian matrix will be scaled inplace. If True, the + Jacobian matrix will be copied before scaling. Default False. Returns ------- @@ -44,7 +53,7 @@ def least_squares(jacobian, data, weights, damping=None): jacobian.shape ) ) - scaler = StandardScaler(copy=False, with_mean=False, with_std=True) + scaler = StandardScaler(copy=copy_jacobian, with_mean=False, with_std=True) jacobian = scaler.fit_transform(jacobian) if damping is None: regr = LinearRegression(fit_intercept=False, normalize=False) diff --git a/verde/base/utils.py b/verde/base/utils.py index d35c216a8..3f40f1e4a 100644 --- a/verde/base/utils.py +++ b/verde/base/utils.py @@ -2,6 +2,85 @@ Utility functions for building gridders and checking arguments. """ import numpy as np +from sklearn.metrics import check_scoring + + +def score_estimator(scoring, estimator, coordinates, data, weights=None): + """ + Score the given gridder against the given data using the given metric. + + If the data and predictions have more than 1 component, the scores of each + component will be averaged. + + Parameters + ---------- + scoring : str or callable + A scoring specification known to scikit-learn. See + :func:`sklearn.metrics.check_scoring`. + estimator : a Verde gridder + The gridder to score. Usually derived from + :class:`verde.base.BaseGridder`. 
+ coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). + For the specific definition of coordinate systems and what these + names mean, see the class docstring. + data : array or tuple of arrays + The data values of each data point. If the data has more than one + component, *data* must be a tuple of arrays (one for each + component). + weights : None or array or tuple of arrays + If not None, then the weights assigned to each data point. If more + than one data component is provided, you must provide a weights + array for each data component (if not None). + + Returns + ------- + score : float + The score. + + """ + coordinates, data, weights = check_fit_input( + coordinates, data, weights, unpack=False + ) + predicted = check_data(estimator.predict(coordinates)) + scorer = check_scoring(DummyEstimator, scoring=scoring) + result = np.mean( + [ + scorer( + DummyEstimator(pred.ravel()), + coordinates, + data[i].ravel(), + sample_weight=weights[i], + ) + for i, pred in enumerate(predicted) + ] + ) + return result + + +class DummyEstimator: + """ + Dummy estimator that does nothing but pass along the predicted data. + Used to fool the scikit-learn scorer functions to fit our API + (multi-component estimators return a tuple on .predict). + + >>> est = DummyEstimator([1, 2, 3]) + >>> print(est.fit().predict()) + [1, 2, 3] + + """ + + def __init__(self, predicted): + self._predicted = predicted + + def predict(self, *args, **kwargs): # pylint: disable=unused-argument + "Return the stored predicted values" + return self._predicted + + def fit(self, *args, **kwards): # pylint: disable=unused-argument + "Does nothing. Just here to satisfy the API." + return self def check_data(data): @@ -25,6 +104,97 @@ def check_data(data): return data +def check_data_names(data, data_names): + """ + Check *data_names* against *data*. + + Also, convert ``data_names`` to a tuple if it's a single string. + + Examples + -------- + + >>> import numpy as np + >>> east, north, scalar = [np.array(10)]*3 + >>> check_data_names((scalar,), "dummy") + ('dummy',) + >>> check_data_names((scalar,), ("dummy",)) + ('dummy',) + >>> check_data_names((scalar,), ["dummy"]) + ['dummy'] + >>> check_data_names((east, north), ("component_x", "component_y")) + ('component_x', 'component_y') + """ + # Convert single string to tuple + if isinstance(data_names, str): + data_names = (data_names,) + # Raise error if data_names is None + if data_names is None: + raise ValueError("Invalid data_names equal to None.") + # Raise error if data and data_names don't have the same number of elements + if len(data) != len(data_names): + raise ValueError( + "Data has {} components but only {} names provided: {}".format( + len(data), len(data_names), str(data_names) + ) + ) + return data_names + + +def check_coordinates(coordinates): + """ + Check that the given coordinate arrays are what we expect them to be. + Should be a tuple with arrays of the same shape. + """ + shapes = [coord.shape for coord in coordinates] + if not all(shape == shapes[0] for shape in shapes): + raise ValueError( + "Coordinate arrays must have the same shape. Coordinate shapes: {}".format( + shapes + ) + ) + return coordinates + + +def check_extra_coords_names(coordinates, extra_coords_names): + """ + Check extra_coords_names against coordiantes. + + Also, convert ``extra_coords_names`` to a tuple if it's a single string. 
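# Referring back to score_estimator defined earlier in this module: any
# scoring specification known to scikit-learn can be passed, not only "r2".
# The Trend model and synthetic data below are illustrative assumptions.
import numpy as np
from verde import Trend
from verde.base.utils import score_estimator

east, north = np.meshgrid(np.linspace(0, 10, 50), np.linspace(-5, 5, 50))
data = 10 - 0.5 * east + 2 * north
trend = Trend(degree=1).fit((east, north), data)
print(score_estimator("neg_mean_squared_error", trend, (east, north), data))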
+ Assume that there are extra coordinates on the ``coordinates`` tuple. + + Examples + -------- + + >>> import numpy as np + >>> coordinates = [np.array(10)]*3 + >>> check_extra_coords_names(coordinates, "upward") + ('upward',) + >>> check_extra_coords_names(coordinates, ("upward",)) + ('upward',) + >>> coordinates = [np.array(10)]*4 + >>> check_extra_coords_names(coordinates, ("upward", "time")) + ('upward', 'time') + """ + # Convert single string to a tuple + if isinstance(extra_coords_names, str): + extra_coords_names = (extra_coords_names,) + # Check if it's not None + if extra_coords_names is None: + raise ValueError( + "Invalid extra_coords_names equal to None. " + + "When passing one or more extra coordinate, " + + "extra_coords_names cannot be None." + ) + # Check if there are the same number of extra_coords than extra_coords_name + if len(coordinates[2:]) != len(extra_coords_names): + raise ValueError( + "Invalid extra_coords_names '{}'. ".format(extra_coords_names) + + "Number of extra coordinates names must match the number of " + + "additional coordinates ('{}').".format(len(coordinates[2:])) + ) + return extra_coords_names + + def check_fit_input(coordinates, data, weights, unpack=True): """ Validate the inputs to the fit method of gridders. @@ -59,8 +229,13 @@ def check_fit_input(coordinates, data, weights, unpack=True): """ data = check_data(data) weights = check_data(weights) - if any(i.shape != j.shape for i in coordinates for j in data): - raise ValueError("Coordinate and data arrays must have the same shape.") + coordinates = check_coordinates(coordinates) + if any(i.shape != coordinates[0].shape for i in data): + raise ValueError( + "Data arrays must have the same shape {} as coordinates. Data shapes: {}.".format( + coordinates[0].shape, [i.shape for i in data] + ) + ) if any(w is not None for w in weights): if len(weights) != len(data): raise ValueError( diff --git a/verde/coordinates.py b/verde/coordinates.py index b7e912d57..941df0aeb 100644 --- a/verde/coordinates.py +++ b/verde/coordinates.py @@ -1,10 +1,12 @@ """ Functions for generating and manipulating coordinates. """ +import warnings + import numpy as np from sklearn.utils import check_random_state -from .base.utils import n_1d_arrays +from .base.utils import n_1d_arrays, check_coordinates from .utils import kdtree @@ -108,40 +110,6 @@ def pad_region(region, pad): return padded -def project_region(region, projection): - """ - Calculate the bounding box of a region in projected coordinates. - - Parameters - ---------- - region : list = [W, E, S, N] - The boundaries of a given region in Cartesian or geographic - coordinates. - projection : callable or None - If not None, then should be a callable object (like a function) - ``projection(easting, northing) -> (proj_easting, proj_northing)`` that - takes in easting and northing coordinate arrays and returns projected - northing and easting coordinate arrays. - - Returns - ------- - proj_region : list = [W, E, S, N] - The bounding box of the projected region. - - Examples - -------- - - >>> def projection(x, y): - ... return (2*x, -1*y) - >>> project_region((3, 5, -9, -4), projection) - (6.0, 10.0, 4.0, 9.0) - - """ - east, north = grid_coordinates(region, shape=(101, 101)) - east, north = projection(east.ravel(), north.ravel()) - return (east.min(), east.max(), north.min(), north.max()) - - def scatter_points(region, size, random_state=None, extra_coords=None): """ Generate the coordinates for a random scatter of points. 
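# project_region is removed from verde.coordinates above; per the package
# __init__ changes it now lives in verde.projections and is still re-exported
# at the top level. A minimal check of that, reusing the old doctest values:
import verde as vd

def projection(x, y):
    return (2 * x, -1 * y)

print(vd.project_region((3, 5, -9, -4), projection))
# Expected: (6.0, 10.0, 4.0, 9.0)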
@@ -534,6 +502,62 @@ def spacing_to_shape(region, spacing, adjust): return (nnorth, neast), (w, e, s, n) +def shape_to_spacing(region, shape, pixel_register=False): + """ + Calculate the spacing of a grid given region and shape. + + Parameters + ---------- + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. + shape : tuple = (n_north, n_east) or None + The number of points in the South-North and West-East directions, + respectively. + pixel_register : bool + If True, the coordinates will refer to the center of each grid pixel + instead of the grid lines. In practice, this means that there will be + one less element per dimension of the grid when compared to grid line + registered (only if given *spacing* and not *shape*). Default is False. + + Returns + ------- + spacing : tuple = (s_north, s_east) + The grid spacing in the South-North and West-East directions, + respectively. + + Examples + -------- + + >>> spacing = shape_to_spacing([0, 10, -5, 1], (7, 11)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 1.0 + >>> spacing = shape_to_spacing([0, 10, -5, 1], (14, 11)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 0.5, 1.0 + >>> spacing = shape_to_spacing([0, 10, -5, 1], (7, 21)) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 0.5 + >>> spacing = shape_to_spacing( + ... [-0.5, 10.5, -5.5, 1.5], (7, 11), pixel_register=True, + ... ) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 1.0 + >>> spacing = shape_to_spacing( + ... [-0.25, 10.25, -5.5, 1.5], (7, 21), pixel_register=True, + ... ) + >>> print("{:.1f}, {:.1f}".format(*spacing)) + 1.0, 0.5 + + """ + spacing = [] + for i, n_points in enumerate(reversed(shape)): + if not pixel_register: + n_points -= 1 + spacing.append((region[2 * i + 1] - region[2 * i]) / n_points) + return tuple(reversed(spacing)) + + def profile_coordinates(point1, point2, size, extra_coords=None): """ Coordinates for a profile along a straight line between two points. @@ -735,6 +759,8 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= See also -------- BlockReduce : Apply a reduction operation to the data in blocks (windows). + rolling_window : Select points on a rolling (moving) window. + expanding_window : Select points on windows of changing size. Examples -------- @@ -768,17 +794,450 @@ def block_split(coordinates, spacing=None, adjust="spacing", region=None, shape= [6 6 6 7 7 7]] """ + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] if region is None: region = get_region(coordinates) - block_coords = tuple( - i.ravel() - for i in grid_coordinates( - region, spacing=spacing, shape=shape, adjust=adjust, pixel_register=True - ) + block_coords = grid_coordinates( + region, spacing=spacing, shape=shape, adjust=adjust, pixel_register=True ) tree = kdtree(block_coords) labels = tree.query(np.transpose(n_1d_arrays(coordinates, 2)))[1] - return block_coords, labels + return n_1d_arrays(block_coords, len(block_coords)), labels + + +def rolling_window( + coordinates, size, spacing=None, shape=None, region=None, adjust="spacing" +): + """ + Select points on a rolling (moving) window. + + A window of the given size is moved across the region at a given step + (specified by *spacing* or *shape*). Returns the indices of points falling + inside each window step. You can use the indices to select points falling + inside a given window. 
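# Quick sketch of the block_split tweak above: only the first two coordinates
# (easting, northing) are used, and any extra coordinates are validated and
# then ignored, so they can be passed along safely. Values are illustrative.
import verde as vd

coords = vd.grid_coordinates((-5, 0, 5, 10), spacing=1, extra_coords=100)
block_coords, labels = vd.block_split(coords, spacing=2.5)
print(len(block_coords), labels.shape)  # 2 (36,)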
+ + The size of the step when moving the windows can be specified by the + *spacing* parameter. Alternatively, the number of windows in the + South-North and West-East directions can be specified using the *shape* + parameter. **One of the two must be given.** + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). Only easting and + northing will be used, all subsequent coordinates will be ignored. + size : float + The size of the windows. Units should match the units of *coordinates*. + spacing : float, tuple = (s_north, s_east), or None + The window size in the South-North and West-East directions, + respectively. A single value means that the size is equal in both + directions. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. If not region is given, will use the bounding region of + the given points. + adjust : {'spacing', 'region'} + Whether to adjust the spacing or the region if required. Ignored if + *shape* is given instead of *spacing*. Defaults to adjusting the + spacing. + + Returns + ------- + window_coordinates : tuple of arrays + Coordinate arrays for the center of each window. + indices : array + Each element of the array corresponds the indices of points falling + inside a window. The array will have the same shape as the + *window_coordinates*. Use the array elements to index the coordinates + for each window. The indices will depend on the number of dimensions in + the input coordinates. For example, if the coordinates are 2D arrays, + each window will contain indices for 2 dimensions (row, column). + + See also + -------- + block_split : Split a region into blocks and label points accordingly. + expanding_window : Select points on windows of changing size. + + Examples + -------- + + Generate a set of sample coordinates on a grid and determine the indices + of points for each rolling window: + + >>> from verde import grid_coordinates + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + >>> print(coords[0]) + [[-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.]] + >>> print(coords[1]) + [[ 6. 6. 6. 6. 6.] + [ 7. 7. 7. 7. 7.] + [ 8. 8. 8. 8. 8.] + [ 9. 9. 9. 9. 9.] + [10. 10. 10. 10. 10.]] + >>> # Get the rolling window indices + >>> window_coords, indices = rolling_window(coords, size=2, spacing=2) + >>> # Window coordinates will be 2D arrays. Their shape is the number of + >>> # windows in each dimension + >>> print(window_coords[0].shape, window_coords[1].shape) + (2, 2) (2, 2) + >>> # The there are the easting and northing coordinates for the center of + >>> # each rolling window + >>> for coord in window_coords: + ... print(coord) + [[-4. -2.] + [-4. -2.]] + [[7. 7.] + [9. 9.]] + >>> # The indices of points falling on each window will have the same shape + >>> # as the window center coordinates + >>> print(indices.shape) + (2, 2) + >>> # The points in the first window. Indices are 2D positions because the + >>> # coordinate arrays are 2D. + >>> print(len(indices[0, 0])) + 2 + >>> for dimension in indices[0, 0]: + ... print(dimension) + [0 0 0 1 1 1 2 2 2] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[0, 1]: + ... 
print(dimension) + [0 0 0 1 1 1 2 2 2] + [2 3 4 2 3 4 2 3 4] + >>> for dimension in indices[1, 0]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[1, 1]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [2 3 4 2 3 4 2 3 4] + >>> # To get the coordinates for each window, use indexing + >>> print(coords[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + + If the coordinates are 1D, the indices will also be 1D: + + >>> coords1d = [coord.ravel() for coord in coords] + >>> window_coords, indices = rolling_window(coords1d, size=2, spacing=2) + >>> print(len(indices[0, 0])) + 1 + >>> print(indices[0, 0][0]) + [ 0 1 2 5 6 7 10 11 12] + >>> print(indices[0, 1][0]) + [ 2 3 4 7 8 9 12 13 14] + >>> print(indices[1, 0][0]) + [10 11 12 15 16 17 20 21 22] + >>> print(indices[1, 1][0]) + [12 13 14 17 18 19 22 23 24] + >>> # The returned indices can be used in the same way as before + >>> print(coords1d[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords1d[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + + By default, the windows will span the entire data region. You can also + control the specific region you'd like the windows to cover: + + >>> # Coordinates on a larger region but with the same spacing as before + >>> coords = grid_coordinates((-10, 5, 0, 20), spacing=1) + >>> # Get the rolling window indices but limited to the region from before + >>> window_coords, indices = rolling_window( + ... coords, size=2, spacing=2, region=(-5, -1, 6, 10), + ... ) + >>> # The windows should still be in the same place as before + >>> for coord in window_coords: + ... print(coord) + [[-4. -2.] + [-4. -2.]] + [[7. 7.] + [9. 9.]] + >>> # And indexing the coordinates should also provide the same result + >>> print(coords[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + + Only the first 2 coordinates are considered (assumed to be the horizontal + ones). All others will be ignored by the function. + + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1, extra_coords=20) + >>> print(coords[2]) + [[20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.] + [20. 20. 20. 20. 20.]] + >>> window_coords, indices = rolling_window(coords, size=2, spacing=2) + >>> # The windows would be the same in this case since coords[2] is ignored + >>> for coord in window_coords: + ... print(coord) + [[-4. -2.] + [-4. -2.]] + [[7. 7.] + [9. 9.]] + >>> print(indices.shape) + (2, 2) + >>> for dimension in indices[0, 0]: + ... print(dimension) + [0 0 0 1 1 1 2 2 2] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[0, 1]: + ... print(dimension) + [0 0 0 1 1 1 2 2 2] + [2 3 4 2 3 4 2 3 4] + >>> for dimension in indices[1, 0]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [0 1 2 0 1 2 0 1 2] + >>> for dimension in indices[1, 1]: + ... print(dimension) + [2 2 2 3 3 3 4 4 4] + [2 3 4 2 3 4 2 3 4] + >>> # The indices can still be used with the third coordinate + >>> print(coords[0][indices[0, 0]]) + [-5. -4. -3. -5. -4. -3. -5. -4. -3.] + >>> print(coords[1][indices[0, 0]]) + [6. 6. 6. 7. 7. 7. 8. 8. 8.] + >>> print(coords[2][indices[0, 0]]) + [20. 20. 20. 20. 20. 20. 20. 20. 20.] 
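# A practical usage sketch for rolling_window beyond the doctests above:
# compute a statistic per moving window. The synthetic data, window size,
# and spacing are illustrative only.
import numpy as np
import verde as vd

coords = vd.grid_coordinates((0, 10, 0, 10), spacing=0.5)
values = np.sin(coords[0]) + np.cos(coords[1])
window_coords, indices = vd.rolling_window(coords, size=2, spacing=1)
medians = np.array(
    [np.median(values[index]) for index in indices.ravel()]
).reshape(window_coords[0].shape)
print(medians.shape)  # one median per window center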
+ + """ + # Check if shape or spacing were passed + if shape is None and spacing is None: + raise ValueError("Either a shape or a spacing must be provided.") + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] + if region is None: + region = get_region(coordinates) + # Check if window size is bigger than the minimum dimension of the region + region_min_width = min(region[1] - region[0], region[3] - region[2]) + if region_min_width < size: + raise ValueError( + "Window size '{}' is larger ".format(size) + + "than dimensions of the region '{}'.".format(region) + ) + # Calculate the region spanning the centers of the rolling windows + window_region = [ + dimension + (-1) ** (i % 2) * size / 2 for i, dimension in enumerate(region) + ] + _check_rolling_window_overlap(window_region, size, shape, spacing) + centers = grid_coordinates( + window_region, spacing=spacing, shape=shape, adjust=adjust + ) + # pykdtree doesn't support query_ball_point yet and we need that + tree = kdtree(coordinates, use_pykdtree=False) + # Coordinates must be transposed because the kd-tree wants them as columns + # of a matrix + # Use p=inf (infinity norm) to get square windows instead of circular ones + indices1d = tree.query_ball_point( + np.transpose(n_1d_arrays(centers, 2)), r=size / 2, p=np.inf + ) + # Make the indices array the same shape as the center coordinates array. + # That preserves the information of the number of windows in each + # dimension. Need to first create an empty array of object type because + # otherwise numpy tries to use the index tuples as dimensions (even if + # given ndim=1 explicitly). Can't make it 1D and then reshape because the + # reshape is ignored for some reason. The workaround is to create the array + # with the correct shape and assign the values to a raveled view of the + # array. + indices = np.empty(centers[0].shape, dtype="object") + # Need to convert the indices to int arrays because unravel_index doesn't + # like empty lists but can handle empty integer arrays in case a window has + # no points inside it. + indices.ravel()[:] = [ + np.unravel_index(np.array(i, dtype="int"), shape=coordinates[0].shape) + for i in indices1d + ] + return centers, indices + + +def _check_rolling_window_overlap(region, size, shape, spacing): + """ + Warn the user if there is no overlap between neighboring windows. + """ + if shape is not None: + ndims = len(shape) + dimensions = [region[i * ndims + 1] - region[i * ndims] for i in range(ndims)] + # The - 1 is because we need to divide by the number of intervals, not + # the number of nodes. + spacing = tuple(dim / (n - 1) for dim, n in zip(dimensions, shape)) + spacing = np.atleast_1d(spacing) + if np.any(spacing > size): + warnings.warn( + f"Rolling windows do not overlap (size '{size}' and spacing '{spacing}'). " + "Some data points may not be included in any window. " + "Increase size or decrease spacing to avoid this." + ) + + +def expanding_window(coordinates, center, sizes): + """ + Select points on windows of changing size around a center point. + + Returns the indices of points falling inside each window. + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with the coordinates of each data point. Should be in the + following order: (easting, northing, vertical, ...). Only easting and + northing will be used, all subsequent coordinates will be ignored. 
+ center : tuple + The coordinates of the center of the window. Should be in the + following order: (easting, northing, vertical, ...). + sizes : array + The sizes of the windows. Does not have to be in any particular order. + The order of indices returned will match the order of window sizes + given. Units should match the units of *coordinates* and *center*. + + Returns + ------- + indices : list + Each element of the list corresponds to the indices of points falling + inside a window. Use them to index the coordinates for each window. The + indices will depend on the number of dimensions in the input + coordinates. For example, if the coordinates are 2D arrays, each window + will contain indices for 2 dimensions (row, column). + + See also + -------- + block_split : Split a region into blocks and label points accordingly. + rolling_window : Select points on a rolling (moving) window. + + Examples + -------- + + Generate a set of sample coordinates on a grid and determine the indices + of points for each expanding window: + + >>> from verde import grid_coordinates + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + >>> print(coords[0]) + [[-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.] + [-5. -4. -3. -2. -1.]] + >>> print(coords[1]) + [[ 6. 6. 6. 6. 6.] + [ 7. 7. 7. 7. 7.] + [ 8. 8. 8. 8. 8.] + [ 9. 9. 9. 9. 9.] + [10. 10. 10. 10. 10.]] + >>> # Get the expanding window indices + >>> indices = expanding_window(coords, center=(-3, 8), sizes=[1, 2, 4]) + >>> # There is one index per window + >>> print(len(indices)) + 3 + >>> # The points in the first window. Indices are 2D positions because the + >>> # coordinate arrays are 2D. + >>> print(len(indices[0])) + 2 + >>> for dimension in indices[0]: + ... print(dimension) + [2] + [2] + >>> for dimension in indices[1]: + ... print(dimension) + [1 1 1 2 2 2 3 3 3] + [1 2 3 1 2 3 1 2 3] + >>> for dimension in indices[2]: + ... print(dimension) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] + [0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] + >>> # To get the coordinates for each window, use indexing + >>> print(coords[0][indices[0]]) + [-3.] + >>> print(coords[1][indices[0]]) + [8.] + >>> print(coords[0][indices[1]]) + [-4. -3. -2. -4. -3. -2. -4. -3. -2.] + >>> print(coords[1][indices[1]]) + [7. 7. 7. 8. 8. 8. 9. 9. 9.] + + If the coordinates are 1D, the indices will also be 1D: + + >>> coords1d = [coord.ravel() for coord in coords] + >>> indices = expanding_window(coords1d, center=(-3, 8), sizes=[1, 2, 4]) + >>> print(len(indices)) + 3 + >>> # Since coordinates are 1D, there is only one index + >>> print(len(indices[0])) + 1 + >>> print(indices[0][0]) + [12] + >>> print(indices[1][0]) + [ 6 7 8 11 12 13 16 17 18] + >>> # The returned indices can be used in the same way as before + >>> print(coords1d[0][indices[0]]) + [-3.] + >>> print(coords1d[1][indices[0]]) + [8.] + + Only the first 2 coordinates are considered (assumed to be the horizontal + ones). All others will be ignored by the function. + + >>> coords = grid_coordinates((-5, -1, 6, 10), spacing=1, extra_coords=15) + >>> print(coords[2]) + [[15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.] + [15. 15. 15. 15. 15.]] + >>> indices = expanding_window(coords, center=(-3, 8), sizes=[1, 2, 4]) + >>> # The returned indices should be the same as before, ignoring coords[2] + >>> print(len(indices[0])) + 2 + >>> for dimension in indices[0]: + ... 
print(dimension) + [2] + [2] + >>> for dimension in indices[1]: + ... print(dimension) + [1 1 1 2 2 2 3 3 3] + [1 2 3 1 2 3 1 2 3] + >>> for dimension in indices[2]: + ... print(dimension) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] + [0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] + >>> # The indices can be used to index all 3 coordinates + >>> print(coords[0][indices[0]]) + [-3.] + >>> print(coords[1][indices[0]]) + [8.] + >>> print(coords[2][indices[0]]) + [15.] + + """ + # Select the coordinates after checking to make sure indexing will still + # work on the ignored coordinates. + coordinates = check_coordinates(coordinates)[:2] + shape = coordinates[0].shape + center = np.atleast_2d(center) + # pykdtree doesn't support query_ball_point yet and we need that + tree = kdtree(coordinates, use_pykdtree=False) + indices = [] + for size in sizes: + # Use p=inf (infinity norm) to get square windows instead of circular + index1d = tree.query_ball_point(center, r=size / 2, p=np.inf)[0] + # Convert indices to an array to avoid errors when the index is empty + # (no points in the window). unravel_index doesn't like empty lists. + indices.append(np.unravel_index(np.array(index1d, dtype="int"), shape=shape)) + return indices def longitude_continuity(coordinates, region): diff --git a/verde/datasets/sample_data.py b/verde/datasets/sample_data.py index 76f7196ec..67ac00d7f 100644 --- a/verde/datasets/sample_data.py +++ b/verde/datasets/sample_data.py @@ -9,7 +9,6 @@ import pooch try: - import cartopy.feature as cfeature import cartopy.crs as ccrs from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter except ImportError: @@ -54,20 +53,18 @@ def locate(): def _setup_map( - ax, xticks, yticks, crs, region, land=None, ocean=None, borders=None, states=None + ax, + xticks, + yticks, + crs, + region, + coastlines=False, ): """ - Setup a Cartopy map with land and ocean features and proper tick labels. + Setup a Cartopy map with coastlines and proper tick labels. """ - - if land is not None: - ax.add_feature(cfeature.LAND, facecolor=land) - if ocean is not None: - ax.add_feature(cfeature.OCEAN, facecolor=ocean) - if borders is not None: - ax.add_feature(cfeature.BORDERS, linewidth=borders) - if states is not None: - ax.add_feature(cfeature.STATES, linewidth=states) + if coastlines: + ax.coastlines() ax.set_extent(region, crs=crs) # Set the proper ticks for a Cartopy map ax.set_xticks(xticks, crs=crs) @@ -103,7 +100,7 @@ def fetch_baja_bathymetry(): def setup_baja_bathymetry_map( - ax, region=(245.0, 254.705, 20.0, 29.99), land="gray", ocean=None + ax, region=(245.0, 254.705, 20.0, 29.99), coastlines=True, **kwargs ): """ Setup a Cartopy map for the Baja California bathymetry dataset. @@ -114,22 +111,27 @@ def setup_baja_bathymetry_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_baja_bathymetry: Sample bathymetry data from Baja California. """ + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." 
+ ) _setup_map( ax, xticks=np.arange(-114, -105, 2), yticks=np.arange(21, 30, 2), - land=land, - ocean=ocean, + coastlines=coastlines, region=region, crs=ccrs.PlateCarree(), ) @@ -205,10 +207,6 @@ def setup_rio_magnetic_map(ax, region=(-42.6, -42, -22.5, -22)): The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. See also -------- @@ -224,8 +222,6 @@ def setup_rio_magnetic_map(ax, region=(-42.6, -42, -22.5, -22)): ax, xticks=np.arange(-42.5, -42, 0.1), yticks=np.arange(-22.5, -21.99, 0.1), - land=None, - ocean=None, region=region, crs=ccrs.PlateCarree(), ) @@ -268,7 +264,7 @@ def fetch_california_gps(): def setup_california_gps_map( - ax, region=(235.2, 245.3, 31.9, 42.3), land="gray", ocean="skyblue" + ax, region=(235.2, 245.3, 31.9, 42.3), coastlines=True, **kwargs ): """ Setup a Cartopy map for the California GPS velocity dataset. @@ -279,22 +275,27 @@ def setup_california_gps_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - ocean : str or None - The name of the color of the ocean feature or None to omit it. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_california_gps: Sample GPS velocity data from California. """ + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." + ) _setup_map( ax, xticks=np.arange(-124, -115, 4), yticks=np.arange(33, 42, 2), - land=land, - ocean=ocean, + coastlines=coastlines, region=region, crs=ccrs.PlateCarree(), ) @@ -328,9 +329,7 @@ def fetch_texas_wind(): return data -def setup_texas_wind_map( - ax, region=(-107, -93, 25.5, 37), land="#dddddd", borders=0.5, states=0.1 -): +def setup_texas_wind_map(ax, region=(-107, -93, 25.5, 37), coastlines=True, **kwargs): """ Setup a Cartopy map for the Texas wind speed and air temperature dataset. @@ -340,27 +339,27 @@ def setup_texas_wind_map( The axes where the map is being plotted. region : list = [W, E, S, N] The boundaries of the map region in the coordinate system of the data. - land : str or None - The name of the color of the land feature or None to omit it. - borders : float or None - Line width of the country borders. - states : float or None - Line width of the state borders. + coastlines : bool + If True the coastlines will be added to the plot. + kwargs : + All additional key-word arguments will be ignored. ``kwargs`` are + accepted to guarantee backward compatibility. See also -------- fetch_texas_wind: Sample wind speed and air temperature data for Texas. """ - + if kwargs: + warnings.warn( + "All kwargs are being ignored. They are accepted to " + + "guarantee backward compatibility." 
+ ) _setup_map( ax, xticks=np.arange(-106, -92, 3), yticks=np.arange(27, 38, 3), - land=land, - ocean=None, + coastlines=coastlines, region=region, - borders=borders, - states=states, crs=ccrs.PlateCarree(), ) diff --git a/verde/mask.py b/verde/mask.py index 9c06ca29f..c023d5957 100644 --- a/verde/mask.py +++ b/verde/mask.py @@ -3,7 +3,10 @@ """ import numpy as np -from .base import n_1d_arrays +# pylint doesn't pick up on this import for some reason +from scipy.spatial import Delaunay # pylint: disable=no-name-in-module + +from .base.utils import n_1d_arrays, check_coordinates from .utils import kdtree @@ -43,9 +46,10 @@ def distance_mask( northing will be used, all subsequent coordinates will be ignored. grid : None or :class:`xarray.Dataset` 2D grid with values to be masked. Will use the first two dimensions of - the grid as northing and easting coordinates, respectively. The mask - will be applied to *grid* using the :meth:`xarray.Dataset.where` - method. + the grid as northing and easting coordinates, respectively. For this to + work, the grid dimensions **must be ordered as northing then easting**. + The mask will be applied to *grid* using the + :meth:`xarray.Dataset.where` method. projection : callable or None If not None, then should be a callable object ``projection(easting, northing) -> (proj_easting, proj_northing)`` that takes in easting and @@ -93,14 +97,7 @@ def distance_mask( [nan nan nan nan nan nan]] """ - if coordinates is None and grid is None: - raise ValueError("Either coordinates or grid must be given.") - if coordinates is None: - dims = [grid[var].dims for var in grid.data_vars][0] - coordinates = np.meshgrid(grid.coords[dims[1]], grid.coords[dims[0]]) - if len(set(i.shape for i in coordinates)) != 1: - raise ValueError("Coordinate arrays must have the same shape.") - shape = coordinates[0].shape + coordinates, shape = _get_grid_coordinates(coordinates, grid) if projection is not None: data_coordinates = projection(*n_1d_arrays(data_coordinates, 2)) coordinates = projection(*n_1d_arrays(coordinates, 2)) @@ -110,3 +107,124 @@ def distance_mask( if grid is not None: return grid.where(mask) return mask + + +def convexhull_mask( + data_coordinates, + coordinates=None, + grid=None, + projection=None, +): + """ + Mask grid points that are outside the convex hull of the given data points. + + Either *coordinates* or *grid* must be given: + + * If *coordinates* is not None, produces an array that is False when a + point is outside the convex hull and True otherwise. + * If *grid* is not None, produces a mask and applies it to *grid* (an + :class:`xarray.Dataset`). + + Parameters + ---------- + data_coordinates : tuple of arrays + Same as *coordinates* but for the data points. + coordinates : None or tuple of arrays + Arrays with the coordinates of each point that will be masked. Should + be in the following order: (easting, northing, ...). Only easting and + northing will be used, all subsequent coordinates will be ignored. + grid : None or :class:`xarray.Dataset` + 2D grid with values to be masked. Will use the first two dimensions of + the grid as northing and easting coordinates, respectively. For this to + work, the grid dimensions **must be ordered as northing then easting**. + The mask will be applied to *grid* using the + :meth:`xarray.Dataset.where` method. 
+ projection : callable or None + If not None, then should be a callable object ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected easting and northing + coordinate arrays. This function will be used to project the given + coordinates (or the ones extracted from the grid) before calculating + distances. + + Returns + ------- + mask : array or :class:`xarray.Dataset` + If *coordinates* was given, then a boolean array with the same shape as + the elements of *coordinates*. If *grid* was given, then an + :class:`xarray.Dataset` with the mask applied to it. + + Examples + -------- + + >>> from verde import grid_coordinates + >>> region = (0, 5, -10, -4) + >>> spacing = 1 + >>> coords = grid_coordinates(region, spacing=spacing) + >>> data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + >>> mask = convexhull_mask(data_coords, coordinates=coords) + >>> print(mask) + [[False False False False False False] + [False False True True False False] + [False False True True False False] + [False False True True False False] + [False False True True False False] + [False False False False False False] + [False False False False False False]] + >>> # Mask an xarray.Dataset directly + >>> import xarray as xr + >>> coords_dict = {"easting": coords[0][0, :], "northing": coords[1][:, 0]} + >>> data_vars = {"scalars": (["northing", "easting"], np.ones(mask.shape))} + >>> grid = xr.Dataset(data_vars, coords=coords_dict) + >>> masked = convexhull_mask(data_coords, grid=grid) + >>> print(masked.scalars.values) + [[nan nan nan nan nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan 1. 1. nan nan] + [nan nan nan nan nan nan] + [nan nan nan nan nan nan]] + + """ + coordinates, shape = _get_grid_coordinates(coordinates, grid) + n_coordinates = 2 + # Make sure they are arrays so we can normalize + data_coordinates = n_1d_arrays(data_coordinates, n_coordinates) + coordinates = n_1d_arrays(coordinates, n_coordinates) + if projection is not None: + data_coordinates = projection(*data_coordinates) + coordinates = projection(*coordinates) + # Normalize the coordinates to avoid errors from qhull when values are very + # large (as occurs when projections are used). + means = [coord.mean() for coord in data_coordinates] + stds = [coord.std() for coord in data_coordinates] + data_coordinates = tuple( + (coord - mean) / std for coord, mean, std in zip(data_coordinates, means, stds) + ) + coordinates = tuple( + (coord - mean) / std for coord, mean, std in zip(coordinates, means, stds) + ) + triangles = Delaunay(np.transpose(data_coordinates)) + # Find the triangle that contains each grid point. + # -1 indicates that it's not in any triangle. + in_triangle = triangles.find_simplex(np.transpose(coordinates)) + mask = (in_triangle != -1).reshape(shape) + if grid is not None: + return grid.where(mask) + return mask + + +def _get_grid_coordinates(coordinates, grid): + """ + If coordinates is given, return it and their shape. Otherwise, get + coordinate arrays from the grid. 
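An aside on the new ``convexhull_mask``: besides the grid workflow shown in its doctest, it also accepts a ``projection`` callable, which is only exercised by the test suite further down in this patch. A minimal sketch of that usage (the scaling lambda is an arbitrary stand-in for a real projection, not part of the library):

```python
from verde import grid_coordinates
from verde.mask import convexhull_mask

# Same layout as the doctest above: a regular grid and four data points
coords = grid_coordinates(region=(0, 5, -10, -4), spacing=1)
data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6))

# Any callable projection(easting, northing) -> (proj_easting, proj_northing)
# works; both sets of coordinates are projected before the hull is computed.
mask = convexhull_mask(
    data_coords,
    coordinates=coords,
    projection=lambda e, n: (10 * e, 10 * n),
)
print(mask)
```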
+ """ + if coordinates is None and grid is None: + raise ValueError("Either coordinates or grid must be given.") + if coordinates is None: + dims = [grid[var].dims for var in grid.data_vars][0] + coordinates = np.meshgrid(grid.coords[dims[1]], grid.coords[dims[0]]) + check_coordinates(coordinates) + shape = coordinates[0].shape + return coordinates, shape diff --git a/verde/model_selection.py b/verde/model_selection.py index bc4c3958d..c230e5991 100644 --- a/verde/model_selection.py +++ b/verde/model_selection.py @@ -1,33 +1,463 @@ """ Functions for automating model selection through cross-validation. - -Supports using a dask.distributed.Client object for parallelism. The -DummyClient is used as a serial version of the parallel client. """ import warnings import numpy as np from sklearn.model_selection import KFold, ShuffleSplit from sklearn.base import clone +from sklearn.utils import check_random_state -from .base import check_fit_input -from .utils import dispatch +from .base import check_fit_input, n_1d_arrays, BaseBlockCrossValidator +from .base.utils import score_estimator +from .coordinates import block_split +from .utils import dispatch, partition_by_sum # Otherwise, DeprecationWarning won't be shown, kind of defeating the purpose. warnings.simplefilter("default") -def train_test_split(coordinates, data, weights=None, **kwargs): +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class BlockShuffleSplit(BaseBlockCrossValidator): + """ + Random permutation of spatial blocks cross-validator. + + Yields indices to split data into training and test sets. Data are first + grouped into rectangular blocks of size given by the *spacing* argument. + Alternatively, blocks can be defined by the number of blocks in each + dimension using the *shape* argument instead of *spacing*. The blocks are + then split into testing and training sets randomly. + + The proportion of blocks assigned to each set is controlled by *test_size* + and/or *train_size*. However, the total amount of actual data points in + each set could be different from these values since blocks can have + a different number of data points inside them. To guarantee that the + proportion of actual data is as close as possible to the proportion of + blocks, this cross-validator generates an extra number of splits and + selects the one with proportion of data points in each set closer to the + desired amount [Valavi_etal2019]_. The number of balancing splits per + iteration is controlled by the *balancing* argument. + + This cross-validator is preferred over + :class:`sklearn.model_selection.ShuffleSplit` for spatial data to avoid + overestimating cross-validation scores. This can happen because of the + inherent autocorrelation that is usually associated with this type of data + (points that are close together are more likely to have similar values). + See [Roberts_etal2017]_ for an overview of this topic. + + .. note:: + + Like :class:`sklearn.model_selection.ShuffleSplit`, this + cross-validator cannot guarantee that all folds will be different, + although this is still very likely for sizeable datasets. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. 
+ shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* **must be provided**. + n_splits : int, default 10 + Number of re-shuffling & splitting iterations. + test_size : float, int, None, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + train_size : float, int, or None, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + balancing : int + The number of splits generated per iteration to try to balance the + amount of data in each set so that *test_size* and *train_size* are + respected. If 1, then no extra splits are generated (essentially + disabling the balacing). Must be >= 1. + + See also + -------- + train_test_split : Split a dataset into a training and a testing set. + cross_val_score : Score an estimator/gridder using cross-validation. + + Examples + -------- + + >>> from verde import grid_coordinates + >>> import numpy as np + >>> # Make a regular grid of data points + >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) + >>> # Need to convert the coordinates into a feature matrix + >>> X = np.transpose([i.ravel() for i in coords]) + >>> shuffle = BlockShuffleSplit(spacing=1.5, n_splits=3, random_state=0) + >>> # These are the 1D indices of the points belonging to each set + >>> for train, test in shuffle.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + >>> # A better way to visualize this is to create a 2D array and put + >>> # "train" or "test" in the corresponding locations. + >>> shape = coords[0].shape + >>> mask = np.full(shape=shape, fill_value=" ") + >>> for iteration, (train, test) in enumerate(shuffle.split(X)): + ... # The index needs to be converted to 2D so we can index our matrix. + ... mask[np.unravel_index(train, shape)] = "train" + ... mask[np.unravel_index(test, shape)] = " test" + ... print("Iteration {}:".format(iteration)) + ... 
print(mask) + Iteration 0: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + [' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train']] + Iteration 1: + [[' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 2: + [['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + + """ + + def __init__( + self, + spacing=None, + shape=None, + n_splits=10, + test_size=0.1, + train_size=None, + random_state=None, + balancing=10, + ): + super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) + if balancing < 1: + raise ValueError( + "The *balancing* argument must be >= 1. To disable balancing, use 1." + ) + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self.balancing = balancing + + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Generates integer indices corresponding to test sets. + + Runs several iterations until a split is found that yields blocks with + the right amount of data points in it. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + test : ndarray + The testing set indices for that split. + + """ + labels = block_split( + coordinates=(X[:, 0], X[:, 1]), + spacing=self.spacing, + shape=self.shape, + region=None, + adjust="spacing", + )[1] + block_ids = np.unique(labels) + # Generate many more splits so that we can pick and choose the ones + # that have the right balance of training and testing data. + shuffle = ShuffleSplit( + n_splits=self.n_splits * self.balancing, + test_size=self.test_size, + train_size=self.train_size, + random_state=self.random_state, + ).split(block_ids) + for _ in range(self.n_splits): + test_sets, balance = [], [] + for _ in range(self.balancing): + # This is a false positive in pylint which is why the warning + # is disabled at the top of this file: + # https://github.com/PyCQA/pylint/issues/1830 + # pylint: disable=stop-iteration-return + train_blocks, test_blocks = next(shuffle) + # pylint: enable=stop-iteration-return + train_points = np.where(np.isin(labels, block_ids[train_blocks]))[0] + test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] + # The proportion of data points assigned to each group should + # be close the proportion of blocks assigned to each group. + balance.append( + abs( + train_points.size / test_points.size + - train_blocks.size / test_blocks.size + ) + ) + test_sets.append(test_points) + best = np.argmin(balance) + yield test_sets[best] + + +class BlockKFold(BaseBlockCrossValidator): + """ + K-Folds over spatial blocks cross-validator. + + Yields indices to split data into training and test sets. Data are first + grouped into rectangular blocks of size given by the *spacing* argument. + Alternatively, blocks can be defined by the number of blocks in each + dimension using the *shape* argument instead of *spacing*. 
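Before moving on to ``BlockKFold``: the splitting mechanism in ``_iter_test_indices`` above boils down to labelling every point with ``block_split`` and then shuffling whole blocks instead of individual points. The sketch below illustrates just that idea, without the balancing step, reusing the same calls that appear in the class:

```python
import numpy as np
from sklearn.model_selection import ShuffleSplit
from verde import grid_coordinates
from verde.coordinates import block_split

coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1)
easting, northing = coords[0].ravel(), coords[1].ravel()

# Label every point with the block it falls in (same call used in the class)
labels = block_split(
    coordinates=(easting, northing),
    spacing=1.5,
    shape=None,
    region=None,
    adjust="spacing",
)[1]
block_ids = np.unique(labels)

# Shuffle whole blocks, then expand the block indices back to point indices
train_blocks, test_blocks = next(
    ShuffleSplit(n_splits=1, test_size=0.25, random_state=0).split(block_ids)
)
test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0]
print(test_points)
```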
The blocks are + then split into testing and training sets iteratively along k folds of the + data (k is given by *n_splits*). + + By default, the blocks are split into folds in a way that makes each fold + have approximately the same number of data points. Sometimes this might not + be possible, which can happen if the number of splits is close to the + number of blocks. In these cases, each fold will have the same number of + blocks, regardless of how many data points are in each block. This + behaviour can also be disabled by setting ``balance=False``. + + Shuffling the blocks prior to splitting is strongly encouraged. Not + shuffling will essentially lead to the creation of *n_splits* large blocks + since blocks are spatially adjacent when not shuffled. The default + behaviour is not to shuffle for compatibility with similar cross-validators + in scikit-learn. + + This cross-validator is preferred over + :class:`sklearn.model_selection.KFold` for spatial data to avoid + overestimating cross-validation scores. This can happen because of the + inherent autocorrelation that is usually associated with this type of data + (points that are close together are more likely to have similar values). + See [Roberts_etal2017]_ for an overview of this topic. + + Parameters + ---------- + spacing : float, tuple = (s_north, s_east), or None + The block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* **must be provided**. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* **must be provided**. + n_splits : int + Number of folds. Must be at least 2. + shuffle : bool + Whether to shuffle the data before splitting into batches. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + balance : bool + Whether or not to split blocks into fold with approximately equal + number of data points. If False, each fold will have the same number of + blocks (which can have different number of data points in them). + + See also + -------- + train_test_split : Split a dataset into a training and a testing set. + cross_val_score : Score an estimator/gridder using cross-validation. + + Examples + -------- + + >>> from verde import grid_coordinates + >>> import numpy as np + >>> # Make a regular grid of data points + >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) + >>> # Need to convert the coordinates into a feature matrix + >>> X = np.transpose([i.ravel() for i in coords]) + >>> kfold = BlockKFold(spacing=1.5, n_splits=4) + >>> # These are the 1D indices of the points belonging to each set + >>> for train, test in kfold.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] + >>> # A better way to visualize this is to create a 2D array and put + >>> # "train" or "test" in the corresponding locations. 
+ >>> shape = coords[0].shape + >>> mask = np.full(shape=shape, fill_value=" ") + >>> for iteration, (train, test) in enumerate(kfold.split(X)): + ... # The index needs to be converted to 2D so we can index our matrix. + ... mask[np.unravel_index(train, shape)] = "train" + ... mask[np.unravel_index(test, shape)] = " test" + ... print("Iteration {}:".format(iteration)) + ... print(mask) + Iteration 0: + [[' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 1: + [['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test'] + ['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train']] + Iteration 2: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + [' test' ' test' 'train' 'train'] + [' test' ' test' 'train' 'train']] + Iteration 3: + [['train' 'train' 'train' 'train'] + ['train' 'train' 'train' 'train'] + ['train' 'train' ' test' ' test'] + ['train' 'train' ' test' ' test']] + + For spatial data, it's often good to shuffle the blocks before assigning + them to folds: + + >>> # Set the random_state to make sure we always get the same result. + >>> kfold = BlockKFold( + ... spacing=1.5, n_splits=4, shuffle=True, random_state=123, + ... ) + >>> for train, test in kfold.split(X): + ... print("Train: {} Test: {}".format(train, test)) + Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] + Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] + Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] + Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] + + These should be the same splits as we got before but in a different order. + This only happens because in this example we have the number of splits + equal to the number of blocks (4). With real data the effects would be more + dramatic. + + """ + + def __init__( + self, + spacing=None, + shape=None, + n_splits=5, + shuffle=False, + random_state=None, + balance=True, + ): + super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) + if n_splits < 2: + raise ValueError( + "Number of splits must be >=2 for BlockKFold. Given {}.".format( + n_splits + ) + ) + self.shuffle = shuffle + self.random_state = random_state + self.balance = balance + + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Generates integer indices corresponding to test sets. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Columns should be the easting and northing coordinates of data + points, respectively. + y : array-like, shape (n_samples,) + The target variable for supervised learning problems. Always + ignored. + groups : array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Always ignored. + + Yields + ------ + test : ndarray + The testing set indices for that split. + + """ + labels = block_split( + coordinates=(X[:, 0], X[:, 1]), + spacing=self.spacing, + shape=self.shape, + region=None, + adjust="spacing", + )[1] + block_ids = np.unique(labels) + if self.n_splits > block_ids.size: + raise ValueError( + "Number of k-fold splits ({}) cannot be greater than the number of " + "blocks ({}). 
Either decrease n_splits or increase the number of " + "blocks.".format(self.n_splits, block_ids.size) + ) + if self.shuffle: + check_random_state(self.random_state).shuffle(block_ids) + if self.balance: + block_sizes = [np.isin(labels, i).sum() for i in block_ids] + try: + split_points = partition_by_sum(block_sizes, parts=self.n_splits) + folds = np.split(np.arange(block_ids.size), split_points) + except ValueError: + warnings.warn( + "Could not balance folds to have approximately the same " + "number of data points. Dividing into folds with equal " + "number of blocks instead. Decreasing n_splits or increasing " + "the number of blocks may help.", + UserWarning, + ) + folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] + else: + folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] + for test_blocks in folds: + test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] + yield test_points + + +# pylint: enable=invalid-name,unused-argument + + +def train_test_split( + coordinates, data, weights=None, spacing=None, shape=None, **kwargs +): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to - work on multi-component spatial data with optional weights. - - Extra keyword arguments will be passed to - :class:`sklearn.model_selection.ShuffleSplit`, except for ``n_splits`` - which is always 1. + work on single- or multi-component spatial data with optional weights. + + If arguments *shape* or *spacing* are provided, will group the data by + spatial blocks before random splitting (using + :class:`verde.BlockShuffleSplit` instead of + :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing* + specifies the size of the spatial blocks. Alternatively, use *shape* to + specify the number of blocks in each dimension. + + Extra keyword arguments will be passed to the cross-validation class. The + exception is ``n_splits`` which is always 1. + + Grouping by spatial blocks is preferred over plain random splits for + spatial data to avoid overestimating validation scores. This can happen + because of the inherent autocorrelation that is usually associated with + this type of data (points that are close together are more likely to have + similar values). See [Roberts_etal2017]_ for an overview of this topic. To + use spatial blocking, you **must provide** a *spacing* or *shape* argument + (see below). Parameters ---------- @@ -41,6 +471,15 @@ def train_test_split(coordinates, data, weights=None, **kwargs): if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). + spacing : float, tuple = (s_north, s_east), or None + The spatial block size in the South-North and West-East directions, + respectively. A single value means that the spacing is equal in both + directions. If None, then *shape* must be provided in order to use + spatial blocking. + shape : tuple = (n_north, n_east) or None + The number of blocks in the South-North and West-East directions, + respectively. If None, then *spacing* must be provided in order to use + spatial blocking. Returns ------- @@ -48,55 +487,116 @@ def train_test_split(coordinates, data, weights=None, **kwargs): Each is a tuple = (coordinates, data, weights) generated by separating the input values randomly. + See also + -------- + cross_val_score : Score an estimator/gridder using cross-validation. 
+ BlockShuffleSplit : Random permutation of spatial blocks cross-validator. + Examples -------- + To randomly split the data between training and testing sets: + >>> import numpy as np - >>> # Split 2-component data with weights - >>> data = (np.array([1, 3, 5, 7]), np.array([0, 2, 4, 6])) + >>> # Make some data + >>> data = np.array([10, 30, 50, 70]) >>> coordinates = (np.arange(4), np.arange(-4, 0)) - >>> weights = (np.array([1, 1, 2, 1]), np.array([1, 2, 1, 1])) - >>> train, test = train_test_split(coordinates, data, weights, - ... random_state=0) - >>> print("Coordinates:", train[0], test[0], sep='\n ') - Coordinates: - (array([3, 1, 0]), array([-1, -3, -4])) - (array([2]), array([-2])) - >>> print("Data:", train[1], test[1], sep='\n ') - Data: - (array([7, 3, 1]), array([6, 2, 0])) - (array([5]), array([4])) - >>> print("Weights:", train[2], test[2], sep='\n ') - Weights: - (array([1, 1, 1]), array([1, 2, 1])) - (array([2]), array([1])) - >>> # Split single component data without weights - >>> train, test = train_test_split(coordinates, data[0], None, - ... random_state=0) - >>> print("Coordinates:", train[0], test[0], sep='\n ') - Coordinates: - (array([3, 1, 0]), array([-1, -3, -4])) - (array([2]), array([-2])) - >>> print("Data:", train[1], test[1], sep='\n ') - Data: - (array([7, 3, 1]),) - (array([5]),) - >>> print("Weights:", train[2], test[2], sep='\n ') - Weights: - (None,) - (None,) + >>> train, test = train_test_split(coordinates, data, random_state=0) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]),) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]),) + + If weights are given, they will also be split among the sets: + + >>> weights = np.array([4, 3, 2, 5]) + >>> train, test = train_test_split( + ... coordinates, data, weights, random_state=0, + ... ) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]),) + >>> print("weights:", train[2]) + weights: (array([5, 3, 4]),) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]),) + >>> print("weights:", test[2]) + weights: (array([2]),) + + Data with multiple components can also be split: + + >>> data = (np.array([10, 30, 50, 70]), np.array([-70, -50, -30, -10])) + >>> train, test = train_test_split(coordinates, data, random_state=0) + >>> # The training set: + >>> print("coords:", train[0]) + coords: (array([3, 1, 0]), array([-1, -3, -4])) + >>> print("data:", train[1]) + data: (array([70, 30, 10]), array([-10, -50, -70])) + >>> # The testing set: + >>> print("coords:", test[0]) + coords: (array([2]), array([-2])) + >>> print("data:", test[1]) + data: (array([50]), array([-30])) + + To split data grouped in spatial blocks: + + >>> from verde import grid_coordinates + >>> # Make a regular grid of data points + >>> coordinates = grid_coordinates(region=(0, 3, 4, 7), spacing=1) + >>> data = np.arange(16).reshape((4, 4)) + >>> # We must specify the size of the blocks via the spacing argument. + >>> # Blocks of 1.5 will split the domain into 4 blocks. + >>> train, test = train_test_split( + ... coordinates, data, random_state=0, spacing=1.5, + ... 
) + >>> # The training set: + >>> print("coords:", train[0][0], train[0][1], sep="\n") + coords: + [0. 1. 2. 3. 0. 1. 2. 3. 2. 3. 2. 3.] + [4. 4. 4. 4. 5. 5. 5. 5. 6. 6. 7. 7.] + >>> print("data:", train[1]) + data: (array([ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15]),) + >>> # The testing set: + >>> print("coords:", test[0][0], test[0][1]) + coords: [0. 1. 0. 1.] [6. 6. 7. 7.] + >>> print("data:", test[1]) + data: (array([ 8, 9, 12, 13]),) """ args = check_fit_input(coordinates, data, weights, unpack=False) - ndata = args[1][0].size - indices = np.arange(ndata) - split = next(ShuffleSplit(n_splits=1, **kwargs).split(indices)) + if spacing is None and shape is None: + indices = np.arange(args[1][0].size) + shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) + else: + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) + shuffle = BlockShuffleSplit( + n_splits=1, spacing=spacing, shape=shape, **kwargs + ).split(feature_matrix) + split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test def cross_val_score( - estimator, coordinates, data, weights=None, cv=None, client=None, delayed=False + estimator, + coordinates, + data, + weights=None, + cv=None, + client=None, + delayed=False, + scoring=None, ): """ Score an estimator/gridder using cross-validation. @@ -106,7 +606,12 @@ def cross_val_score( By default, will use :class:`sklearn.model_selection.KFold` with ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other - cross-validation class can be passed in through the *cv* argument. + cross-validation class from scikit-learn or Verde can be passed in through + the *cv* argument. + + The score is calculated using the estimator/gridder's ``.score`` method by + default. Alternatively, use the *scoring* parameter to specify a different + scoring function (e.g., mean square error, mean absolute error, etc). Can optionally run in parallel using :mod:`dask`. To do this, use ``delayed=True`` to dispatch computations with :func:`dask.delayed` instead @@ -135,7 +640,7 @@ def cross_val_score( one data component is provided, you must provide a weights array for each data component (if not none). cv : None or cross-validation generator - Any scikit-learn cross-validation generator. Defaults to + Any scikit-learn or Verde cross-validation generator. Defaults to :class:`sklearn.model_selection.KFold`. client : None or dask.distributed.Client **DEPRECATED:** This option is deprecated and will be removed in Verde @@ -147,6 +652,11 @@ def cross_val_score( actually executing them. The returned scores will be a list of delayed objects. Call `.compute()` on each score or :func:`dask.compute` on the entire list to trigger the actual computations. + scoring : None, str, or callable + A scoring function (or name of a function) known to scikit-learn. See + the description of *scoring* in + :func:`sklearn.model_selection.cross_val_score` for details. If None, + will fall back to the estimator's ``.score`` method. Returns ------- @@ -155,6 +665,11 @@ def cross_val_score( *delayed*, will be a list of Dask delayed objects (see the *delayed* option). If *client* is not None, then the scores will be futures. + See also + -------- + train_test_split : Split a dataset into a training and a testing set. + BlockShuffleSplit : Random permutation of spatial blocks cross-validator. 
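The examples that follow use ``BlockShuffleSplit`` as the cross-validator; ``BlockKFold`` (added earlier in this patch) can be passed through ``cv`` in exactly the same way. A short sketch, using the same synthetic trend data as the tests added below:

```python
import numpy as np
from verde import Trend, grid_coordinates
from verde.model_selection import cross_val_score, BlockKFold

coords = grid_coordinates(region=(0, 10, -10, -5), spacing=0.1)
data = 10 - coords[0] + 0.5 * coords[1]

# Shuffling blocks before assigning them to folds is recommended for k-fold
kfold = BlockKFold(spacing=2, n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(Trend(degree=1), coords, data, cv=kfold)
print(np.mean(scores))  # a perfect model scores 1
```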
+ Examples -------- @@ -176,6 +691,22 @@ def cross_val_score( There are 5 scores because the default cross-validator is :class:`sklearn.model_selection.KFold` with ``n_splits=5``. + To calculate the score with a different metric, use the *scoring* argument: + + >>> scores = cross_val_score( + ... model, coords, data, scoring="neg_mean_squared_error", + ... ) + >>> print(', '.join(['{:.2f}'.format(-score) for score in scores])) + 0.00, 0.00, 0.00, 0.00, 0.00 + + In this case, we calculated the (negative) mean squared error (MSE) which + measures the distance between test data and predictions. This way, 0 is the + best possible value meaning that the data and prediction are the same. The + "neg" part indicates that this is the negative mean square error. This is + required because scikit-learn assumes that higher scores are always treated + as better (which is the opposite for MSE). For display, we take the + negative of the score to get the actual MSE. + We can use different cross-validators by assigning them to the ``cv`` argument: @@ -186,6 +717,24 @@ def cross_val_score( >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00 + Often, spatial data are autocorrelated (points that are close together are + more likely to have similar values), which can cause cross-validation with + random splits to overestimate the prediction accuracy [Roberts_etal2017]_. + To account for the autocorrelation, we can split the data in blocks rather + than randomly with :class:`verde.BlockShuffleSplit`: + + >>> from verde import BlockShuffleSplit + >>> # spacing controls the size of the spatial blocks + >>> cross_validator = BlockShuffleSplit( + ... spacing=2, n_splits=3, random_state=0 + ... ) + >>> scores = cross_val_score(model, coords, data, cv=cross_validator) + >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) + 1.00, 1.00, 1.00 + + We didn't see a difference here since our model and data are perfect. See + :ref:`model_evaluation` for an example with real data. + If using many splits, we can speed up computations by running them in parallel with Dask: @@ -216,16 +765,17 @@ def cross_val_score( ) if cv is None: cv = KFold(shuffle=True, random_state=0, n_splits=5) - ndata = data[0].size + feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) fit_args = (coordinates, data, weights) scores = [] - for train_index, test_index in cv.split(np.arange(ndata)): - train = tuple(select(i, train_index) for i in fit_args) - test = tuple(select(i, test_index) for i in fit_args) + for train_index, test_index in cv.split(feature_matrix): # Clone the estimator to avoid fitting the same object simultaneously # when delayed=True. 
score = dispatch(fit_score, client=client, delayed=delayed)( - clone(estimator), train, test + clone(estimator), + tuple(select(i, train_index) for i in fit_args), + tuple(select(i, test_index) for i in fit_args), + scoring, ) scores.append(score) if not delayed and client is None: @@ -233,11 +783,16 @@ def cross_val_score( return scores -def fit_score(estimator, train_data, test_data): +def fit_score(estimator, train_data, test_data, scoring): """ Fit an estimator on the training data and then score it on the testing data """ - return estimator.fit(*train_data).score(*test_data) + estimator.fit(*train_data) + if scoring is None: + score = estimator.score(*test_data) + else: + score = score_estimator(scoring, estimator, *test_data) + return score def select(arrays, index): diff --git a/verde/projections.py b/verde/projections.py new file mode 100644 index 000000000..a8a580fc4 --- /dev/null +++ b/verde/projections.py @@ -0,0 +1,161 @@ +""" +Operations with projections for grids, regions, etc. +""" +import numpy as np + +from .coordinates import grid_coordinates, get_region, shape_to_spacing, check_region +from .utils import grid_to_table +from .scipygridder import ScipyGridder +from .blockreduce import BlockReduce +from .chain import Chain +from .mask import convexhull_mask + + +def project_region(region, projection): + """ + Calculate the bounding box of a region in projected coordinates. + + Parameters + ---------- + region : list = [W, E, S, N] + The boundaries of a given region in Cartesian or geographic + coordinates. + projection : callable + Should be a callable object (like a function) ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected northing and easting + coordinate arrays. + + Returns + ------- + proj_region : list = [W, E, S, N] + The bounding box of the projected region. + + Examples + -------- + + >>> def projection(x, y): + ... return (2*x, -1*y) + >>> project_region((3, 5, -9, -4), projection) + (6.0, 10.0, 4.0, 9.0) + + """ + east, north = grid_coordinates(region, shape=(101, 101)) + east, north = projection(east.ravel(), north.ravel()) + return (east.min(), east.max(), north.min(), north.max()) + + +def project_grid(grid, projection, method="linear", antialias=True, **kwargs): + """ + Apply the given map projection to a grid and re-sample it. + + Creates a new grid in the projected coordinates by interpolating the + original values using the chosen *method* (linear by default). Before + interpolation, apply a blocked mean operation (:class:`~verde.BlockReduce`) + to avoid aliasing when the projected coordinates become oversampled in some + regions (which would cause the interpolation to down-sample the original + data). For example, applying a polar projection results in oversampled data + close to the pole. + + Points that fall outside the convex hull of the original data will be + masked (see :func:`~verde.convexhull_mask`) since they are not constrained + by any data points. + + Any arguments that can be passed to the + :meth:`~verde.base.BaseGridder.grid` method of Verde gridders can be passed + to this function as well. Use this to set a region and spacing (or shape) + for the projected grid. The region and spacing must be in *projected + coordinates*. + + If no region is provided, the bounding box of the projected data will be + used. If no spacing or shape is provided, the shape of the input *grid* + will be used for the projected grid. 
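Since ``project_grid`` ships without a doctest in this patch, here is a rough usage sketch based on its signature and docstring. The synthetic grid and the doubling projection are invented purely for illustration, and the sketch assumes the internal helpers (``grid_to_table`` and friends) accept a plain 2D ``xarray.DataArray``, as the implementation relies on:

```python
import numpy as np
import xarray as xr
from verde.projections import project_grid

# A small synthetic grid with dims (northing, easting), as the docstring expects
easting = np.linspace(0, 10, 21)
northing = np.linspace(-10, -5, 11)
values = np.outer(np.ones_like(northing), easting)  # simple east-west gradient
grid = xr.DataArray(
    values,
    coords={"northing": northing, "easting": easting},
    dims=("northing", "easting"),
    name="scalars",
)

# Any callable projection(easting, northing) -> (proj_easting, proj_northing)
def projection(east, north):
    return (2 * east, 2 * north)

projected = project_grid(grid, projection, method="linear")
print(projected.dims)
```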
+ + By default, the ``data_names`` argument will be set to the name of the data + variable of the input *grid* (if it has been set). + + .. note:: + + The interpolation methods are limited to what is available in Verde and + there is only support for single 2D grids. For more sophisticated use + cases, you might want to try + `pyresample `__ instead. + + Parameters + ---------- + grid : :class:`xarray.DataArray` + A single 2D grid of values. The first dimension is assumed to be the + northing/latitude dimension while the second is assumed to be the + easting/longitude dimension. + projection : callable + Should be a callable object (like a function) ``projection(easting, + northing) -> (proj_easting, proj_northing)`` that takes in easting and + northing coordinate arrays and returns projected northing and easting + coordinate arrays. + method : string or Verde gridder + If a string, will use it to create a :class:`~verde.ScipyGridder` with + the corresponding method (nearest, linear, or cubic). Otherwise, should + be a gridder/estimator object, like :class:`~verde.Spline`. Default is + ``"linear"``. + antialias : bool + If True, will run a :class:`~verde.BlockReduce` with a mean function to + avoid aliasing when the projection results in oversampling of the data + in some areas (for example, in polar projections). If False, will not + run the blocked mean. + + Returns + ------- + projected_grid : :class:`xarray.DataArray` + The projected grid, interpolated with the given parameters. + + """ + if hasattr(grid, "data_vars"): + raise ValueError( + "Projecting xarray.Dataset is not currently supported. " + "Please provide a DataArray instead." + ) + if len(grid.dims) != 2: + raise ValueError( + "Projecting grids with number of dimensions other than 2 is not " + "currently supported (dimensions of the given DataArray: {}).".format( + len(grid.dims) + ) + ) + + # Can be set to None for some data arrays depending on how they are created + # so we can't just rely on the default value for getattr. + name = getattr(grid, "name", None) + if name is None: + name = "scalars" + + data = grid_to_table(grid).dropna() + coordinates = projection(data[grid.dims[1]].values, data[grid.dims[0]].values) + data_region = get_region(coordinates) + + region = kwargs.pop("region", data_region) + shape = kwargs.pop("shape", grid.shape) + spacing = kwargs.pop("spacing", shape_to_spacing(region, shape)) + + check_region(region) + + steps = [] + if antialias: + steps.append( + ("mean", BlockReduce(np.mean, spacing=spacing, region=data_region)) + ) + if isinstance(method, str): + steps.append(("spline", ScipyGridder(method))) + else: + steps.append(("spline", method)) + interpolator = Chain(steps) + interpolator.fit(coordinates, data[name]) + + projected = interpolator.grid( + region=region, + spacing=spacing, + data_names=kwargs.pop("data_names", [name]), + **kwargs + ) + if method not in ["linear", "cubic"]: + projected = convexhull_mask(coordinates, grid=projected) + return projected[name] diff --git a/verde/tests/test_base.py b/verde/tests/test_base.py index b46884b8f..5baa58b36 100644 --- a/verde/tests/test_base.py +++ b/verde/tests/test_base.py @@ -1,4 +1,4 @@ -# pylint: disable=unused-argument,too-many-locals +# pylint: disable=unused-argument,too-many-locals,protected-access """ Test the base classes and their utility functions. 
""" @@ -6,20 +6,37 @@ import numpy.testing as npt import pytest -from ..base.utils import check_fit_input +from ..base.least_squares import least_squares +from ..base.utils import check_fit_input, check_coordinates from ..base.base_classes import ( BaseGridder, - get_dims, - get_data_names, + BaseBlockCrossValidator, get_instance_region, ) from ..coordinates import grid_coordinates, scatter_points +def test_check_coordinates(): + "Should raise a ValueError is the coordinates have different shapes." + # Should not raise an error + check_coordinates([np.arange(10), np.arange(10)]) + check_coordinates([np.arange(10).reshape((5, 2)), np.arange(10).reshape((5, 2))]) + # Should raise an error + with pytest.raises(ValueError): + check_coordinates([np.arange(10), np.arange(10).reshape((5, 2))]) + with pytest.raises(ValueError): + check_coordinates( + [np.arange(10).reshape((2, 5)), np.arange(10).reshape((5, 2))] + ) + + def test_get_dims(): "Tests that get_dims returns the expected results" - assert get_dims(dims=None) == ("northing", "easting") - assert get_dims(dims=("john", "paul")) == ("john", "paul") + gridder = BaseGridder() + assert gridder._get_dims(dims=None) == ("northing", "easting") + assert gridder._get_dims(dims=("john", "paul")) == ("john", "paul") + gridder.dims = ("latitude", "longitude") + assert gridder._get_dims(dims=None) == ("latitude", "longitude") def test_get_data_names(): @@ -28,28 +45,30 @@ def test_get_data_names(): data2 = tuple([np.arange(10)] * 2) data3 = tuple([np.arange(10)] * 3) # Test the default names - assert get_data_names(data1, data_names=None) == ("scalars",) - assert get_data_names(data2, data_names=None) == ( + gridder = BaseGridder() + assert gridder._get_data_names(data1, data_names=None) == ("scalars",) + assert gridder._get_data_names(data2, data_names=None) == ( "east_component", "north_component", ) - assert get_data_names(data3, data_names=None) == ( + assert gridder._get_data_names(data3, data_names=None) == ( "east_component", "north_component", "vertical_component", ) # Test custom names - assert get_data_names(data1, data_names=("a",)) == ("a",) - assert get_data_names(data2, data_names=("a", "b")) == ("a", "b") - assert get_data_names(data3, data_names=("a", "b", "c")) == ("a", "b", "c") + assert gridder._get_data_names(data1, data_names=("a",)) == ("a",) + assert gridder._get_data_names(data2, data_names=("a", "b")) == ("a", "b") + assert gridder._get_data_names(data3, data_names=("a", "b", "c")) == ("a", "b", "c") def test_get_data_names_fails(): "Check if fails for invalid data types" + gridder = BaseGridder() with pytest.raises(ValueError): - get_data_names(tuple([np.arange(5)] * 4), data_names=None) + gridder._get_data_names(tuple([np.arange(5)] * 4), data_names=None) with pytest.raises(ValueError): - get_data_names(tuple([np.arange(5)] * 2), data_names=("meh",)) + gridder._get_data_names(tuple([np.arange(5)] * 2), data_names=("meh",)) def test_get_instance_region(): @@ -94,7 +113,7 @@ def test_basegridder(): BaseGridder().fit(None, None) grd = PolyGridder() - assert repr(grd) == "PolyGridder(degree=1)" + assert repr(grd) == "PolyGridder()" grd.degree = 2 assert repr(grd) == "PolyGridder(degree=2)" @@ -112,6 +131,7 @@ def test_basegridder(): coordinates_true = grid_coordinates(region, shape) data_true = angular * coordinates_true[0] + linear grid = grd.grid(region, shape) + prof = grd.profile((0, -10), (10, -10), 30) npt.assert_allclose(grd.coefs_, [linear, angular]) npt.assert_allclose(grid.scalars.values, data_true) @@ -119,40 +139,215 
@@ def test_basegridder(): npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) npt.assert_allclose(grd.scatter(region, 1000, random_state=0).scalars, data) npt.assert_allclose( - grd.profile((0, 0), (10, 0), 30).scalars, + prof.scalars, angular * coordinates_true[0][0, :] + linear, ) + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][0, :]) + npt.assert_allclose(prof.distance, coordinates_true[0][0, :]) + + +def test_basegridder_data_names(): + """ + Test default values for data_names + """ + region = (0, 10, -10, -5) + shape = (50, 30) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0) + data = angular * coordinates[0] + linear + grd = PolyGridder().fit(coordinates, data) + grid = grd.grid(region, shape) + prof = grd.profile((0, -10), (10, -10), 30) + # Check if default name for data_names was applied correctly + assert "scalars" in grid + assert "scalars" in prof def test_basegridder_projection(): "Test basic functionality of BaseGridder when passing in a projection" + # Lets say the projection is doubling the coordinates + def proj(lon, lat, inverse=False): + "Project from the new coordinates to the original" + if inverse: + return (lon / 2, lat / 2) + return (lon * 2, lat * 2) + + # Values in "geographic" coordinates region = (0, 10, -10, -5) - shape = (50, 30) + shape = (51, 31) angular, linear = 2, 100 coordinates = scatter_points(region, 1000, random_state=0) data = angular * coordinates[0] + linear + # Project before passing to our Cartesian gridder + grd = PolyGridder().fit(proj(coordinates[0], coordinates[1]), data) + + # Check the estimated coefficients + # The grid is estimated in projected coordinates (which are twice as large) + # so the rate of change (angular) should be half to get the same values. + npt.assert_allclose(grd.coefs_, [linear, angular / 2]) + + # The actual values for a grid coordinates_true = grid_coordinates(region, shape) data_true = angular * coordinates_true[0] + linear + + # Check the scatter + scat = grd.scatter(region, 1000, random_state=0, projection=proj) + npt.assert_allclose(scat.scalars, data) + npt.assert_allclose(scat.easting, coordinates[0]) + npt.assert_allclose(scat.northing, coordinates[1]) + + # Check the grid + grid = grd.grid(region, shape, projection=proj) + npt.assert_allclose(grid.scalars.values, data_true) + npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :]) + npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) + + # Check the profile + prof = grd.profile( + (region[0], region[-1]), (region[1], region[-1]), shape[1], projection=proj + ) + npt.assert_allclose(prof.scalars, data_true[-1, :]) + # Coordinates should still be evenly spaced since the projection is a + # multiplication. + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][-1, :]) + # Distance should still be in the projected coordinates. If the projection + # is from geographic, we shouldn't be returning distances in degrees but in + # projected meters. The distances will be evenly spaced in unprojected + # coordinates. 
+ distance_true = np.linspace(region[0] * 2, region[1] * 2, shape[1]) + npt.assert_allclose(prof.distance, distance_true) + + +def test_basegridder_extra_coords(): + "Test if BaseGridder adds extra_coords to generated outputs" + + grd = PolyGridder() + region = (0, 10, -10, -5) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0) + data = angular * coordinates[0] + linear grd = PolyGridder().fit(coordinates, data) - # Lets say we want to specify the region for a grid using a coordinate - # system that is lon/2, lat/2. - def proj(lon, lat): + # Test grid with a single extra coord + extra_coords = 9 + grid = grd.grid(region=region, spacing=1, extra_coords=extra_coords) + assert "extra_coord" in grid.coords + assert grid.coords["extra_coord"].dims == ("northing", "easting") + npt.assert_allclose(grid.coords["extra_coord"], extra_coords) + + # Test grid with multiple extra coords + extra_coords = [9, 18, 27] + grid = grd.grid(region=region, spacing=1, extra_coords=extra_coords) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in grid.coords + assert grid.coords[name].dims == ("northing", "easting") + npt.assert_allclose(grid.coords[name], coord) + + # Test scatter with a single extra coord + extra_coords = 9 + scat = grd.scatter(region, 1000, extra_coords=extra_coords) + assert "extra_coord" in scat.columns + npt.assert_allclose(scat["extra_coord"], extra_coords) + + # Test scatter with multiple extra coord + extra_coords = [9, 18, 27] + scat = grd.scatter(region, 1000, extra_coords=extra_coords) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in scat.columns + npt.assert_allclose(scat[name], coord) + + # Test profile with a single extra coord + extra_coords = 9 + prof = grd.profile( + (region[0], region[-1]), + (region[1], region[-1]), + 51, + extra_coords=extra_coords, + ) + assert "extra_coord" in prof.columns + npt.assert_allclose(prof["extra_coord"], extra_coords) + + # Test profile with multiple extra coord + extra_coords = [9, 18, 27] + prof = grd.profile( + (region[0], region[-1]), + (region[1], region[-1]), + 51, + extra_coords=extra_coords, + ) + extra_coord_names = ["extra_coord", "extra_coord_1", "extra_coord_2"] + for name, coord in zip(extra_coord_names, extra_coords): + assert name in prof.columns + npt.assert_allclose(prof[name], coord) + + +def test_basegridder_projection_multiple_coordinates(): + "Test BaseGridder when passing in a projection with multiple coordinates" + + # Lets say the projection is doubling the coordinates + def proj(lon, lat, inverse=False): "Project from the new coordinates to the original" + if inverse: + return (lon / 2, lat / 2) return (lon * 2, lat * 2) - proj_region = [i / 2 for i in region] - grid = grd.grid(proj_region, shape, projection=proj) - scat = grd.scatter(proj_region, 1000, random_state=0, projection=proj) - prof = grd.profile((0, 0), (5, 0), 30, projection=proj) + # Values in "geographic" coordinates + region = (0, 10, -10, -5) + shape = (51, 31) + angular, linear = 2, 100 + coordinates = scatter_points(region, 1000, random_state=0, extra_coords=(1, 2)) + data = angular * coordinates[0] + linear + # Project before passing to our Cartesian gridder + proj_coordinates = proj(coordinates[0], coordinates[1]) + coordinates[2:] + grd = PolyGridder().fit(proj_coordinates, data) - npt.assert_allclose(grd.coefs_, [linear, 
angular]) - npt.assert_allclose(grid.scalars.values, data_true) - npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :] / 2) - npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0] / 2) + # Check the estimated coefficients + # The grid is estimated in projected coordinates (which are twice as large) + # so the rate of change (angular) should be half to get the same values. + npt.assert_allclose(grd.coefs_, [linear, angular / 2]) + + # The actual values for a grid + coordinates_true = grid_coordinates(region, shape, extra_coords=(13, 17)) + data_true = angular * coordinates_true[0] + linear + + # Check the scatter + scat = grd.scatter( + region, 1000, random_state=0, projection=proj, extra_coords=(13, 17) + ) npt.assert_allclose(scat.scalars, data) - npt.assert_allclose(prof.scalars, angular * coordinates_true[0][0, :] + linear) + npt.assert_allclose(scat.easting, coordinates[0]) + npt.assert_allclose(scat.northing, coordinates[1]) + + # Check the grid + grid = grd.grid(region, shape, projection=proj, extra_coords=(13, 17)) + npt.assert_allclose(grid.scalars.values, data_true) + npt.assert_allclose(grid.easting.values, coordinates_true[0][0, :]) + npt.assert_allclose(grid.northing.values, coordinates_true[1][:, 0]) + + # Check the profile + prof = grd.profile( + (region[0], region[-1]), + (region[1], region[-1]), + shape[1], + projection=proj, + extra_coords=(13, 17), + ) + npt.assert_allclose(prof.scalars, data_true[-1, :]) + # Coordinates should still be evenly spaced since the projection is a + # multiplication. + npt.assert_allclose(prof.easting, coordinates_true[0][0, :]) + npt.assert_allclose(prof.northing, coordinates_true[1][-1, :]) + # Distance should still be in the projected coordinates. If the projection + # is from geographic, we shouldn't be returning distances in degrees but in + # projected meters. The distances will be evenly spaced in unprojected + # coordinates. + distance_true = np.linspace(region[0] * 2, region[1] * 2, shape[1]) + npt.assert_allclose(prof.distance, distance_true) def test_check_fit_input(): @@ -187,3 +382,56 @@ def test_check_fit_input_fails_weights(): check_fit_input(coords, data, weights) with pytest.raises(ValueError): check_fit_input(coords, (data, data), weights) + + +# Pylint doesn't like X, y scikit-learn argument names. +# pylint: disable=invalid-name,unused-argument + + +class DummyCrossValidator(BaseBlockCrossValidator): + """ + Dummy class to test the base cross-validator. + """ + + def _iter_test_indices(self, X=None, y=None, groups=None): + """ + Yields a list of indices for the entire X. + """ + yield list(range(X.shape[0])) + + +# pylint: enable=invalid-name,unused-argument + + +def test_baseblockedcrossvalidator_n_splits(): + "Make sure get_n_splits returns the correct value" + cv = DummyCrossValidator(spacing=1, n_splits=14) + assert cv.get_n_splits() == 14 + + +def test_baseblockedcrossvalidator_fails_spacing_shape(): + "Should raise an exception if not given spacing or shape." + with pytest.raises(ValueError): + DummyCrossValidator() + + +def test_baseblockedcrossvalidator_fails_data_shape(): + "Should raise an exception if the X array doesn't have 2 columns." 
+ cv = DummyCrossValidator(spacing=1) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 4)))) + with pytest.raises(ValueError): + next(cv.split(np.ones(shape=(10, 1)))) + + +def test_least_squares_copy_jacobian(): + """ + Test if Jacobian matrix is copied or scaled inplace + """ + jacobian = np.identity(5) + original_jacobian = jacobian.copy() + data = np.array([1, 2, 3, 4, 5], dtype=float) + least_squares(jacobian, data, weights=None, copy_jacobian=True) + npt.assert_allclose(jacobian, original_jacobian) + least_squares(jacobian, data, weights=None) + assert not np.allclose(jacobian, original_jacobian) diff --git a/verde/tests/test_coordinates.py b/verde/tests/test_coordinates.py index 701273285..029933729 100644 --- a/verde/tests/test_coordinates.py +++ b/verde/tests/test_coordinates.py @@ -1,6 +1,8 @@ """ Test the coordinate generation functions """ +import warnings + import numpy as np import numpy.testing as npt import pytest @@ -11,9 +13,85 @@ profile_coordinates, grid_coordinates, longitude_continuity, + rolling_window, ) +def test_rolling_window_invalid_coordinate_shapes(): + "Shapes of input coordinates must all be the same" + coordinates = [np.arange(10), np.arange(10).reshape((5, 2))] + with pytest.raises(ValueError): + rolling_window(coordinates, size=2, spacing=1) + + +def test_rolling_window_empty(): + "Make sure empty windows return an empty index" + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + # Use a larger region to make sure the first window is empty + # Doing this will raise a warning for non-overlapping windows. Capture it + # so it doesn't pollute the test log. + with warnings.catch_warnings(record=True): + windows = rolling_window(coords, size=0.001, spacing=1, region=(-7, 1, 4, 12))[ + 1 + ] + assert windows[0, 0][0].size == 0 and windows[0, 0][1].size == 0 + # Make sure we can still index with an empty array + assert coords[0][windows[0, 0]].size == 0 + + +def test_rolling_window_warnings(): + "Should warn users if the windows don't overlap" + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + # For exact same size there will be 1 point overlapping so should not warn + with warnings.catch_warnings(record=True) as warn: + rolling_window(coords, size=2, spacing=2) + assert not any(issubclass(w.category, UserWarning) for w in warn) + args = [dict(spacing=3), dict(spacing=(4, 1)), dict(shape=(1, 2))] + for arg in args: + with warnings.catch_warnings(record=True) as warn: + rolling_window(coords, size=2, **arg) + # Filter out the user warnings from some deprecation warnings that + # might be thrown by other packages. + userwarnings = [w for w in warn if issubclass(w.category, UserWarning)] + assert len(userwarnings) == 1 + assert issubclass(userwarnings[-1].category, UserWarning) + assert str(userwarnings[-1].message).split()[0] == "Rolling" + + +def test_rolling_window_no_shape_or_spacing(): + """ + Check if error is raise if no shape or spacing is passed + """ + coords = grid_coordinates((-5, -1, 6, 10), spacing=1) + err_msg = "Either a shape or a spacing must be provided." 
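The tests above exercise ``rolling_window`` mostly through its error and warning paths. For context, a plain call looks like the sketch below; judging from how the tests index its output, the second return value is a 2D array of point indices, one entry per window:

```python
from verde import grid_coordinates
from verde.coordinates import rolling_window

coords = grid_coordinates((-5, -1, 6, 10), spacing=1)
# Windows of size 2 stepping every 2 units, as in the no-warning test above
indices = rolling_window(coords, size=2, spacing=2)[1]
# Select the coordinates of the points that fall inside the first window
print(coords[0][indices[0, 0]])
```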
+ with pytest.raises(ValueError, match=err_msg): + rolling_window(coords, size=2) + + +def test_rolling_window_oversized_window(): + """ + Check if error is raised if size larger than region is passed + """ + oversize = 5 + regions = [ + (-5, -1, 6, 20), # window larger than west-east + (-20, -1, 6, 10), # window larger than south-north + (-5, -1, 6, 10), # window larger than both dims + ] + for region in regions: + coords = grid_coordinates(region, spacing=1) + # The expected error message with regex + # (the long expression intends to capture floats and ints) + float_regex = r"[+-]?([0-9]*[.])?[0-9]+" + err_msg = ( + r"Window size '{}' is larger ".format(float_regex) + + r"than dimensions of the region " + + r"'\({0}, {0}, {0}, {0}\)'.".format(float_regex) + ) + with pytest.raises(ValueError, match=err_msg): + rolling_window(coords, size=oversize, spacing=2) + + def test_spacing_to_shape(): "Check that correct spacing and region are returned" region = (-10, 0, 0, 5) diff --git a/verde/tests/test_datasets.py b/verde/tests/test_datasets.py index 9bdb4711f..ed285c0fc 100644 --- a/verde/tests/test_datasets.py +++ b/verde/tests/test_datasets.py @@ -2,6 +2,7 @@ Test data fetching routines. """ import os +import warnings import matplotlib.pyplot as plt import cartopy.crs as ccrs @@ -23,6 +24,9 @@ def test_datasets_locate(): "Make sure the data cache location has the right package name" + # Fetch a dataset first to make sure that the cache folder exists. Since + # Pooch 1.1.1 the cache isn't created until a download is requested. + fetch_texas_wind() path = locate() assert os.path.exists(path) # This is the most we can check in a platform independent way without @@ -127,3 +131,21 @@ def test_setup_california_gps(): ax = plt.subplot(111, projection=ccrs.Mercator()) setup_california_gps_map(ax) return fig + + +def test_setup_cartopy_backward(): + """ + Test backward compatibility of setup map functions + + Check if a warning is raise after passing deprecated parameters like ocean, + land, borders and states to functions to setup maps. 
+ """ + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_texas_wind_map(ax, land="#dddddd", borders=0.5, states=0.1) + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_california_gps_map(ax, land="gray", ocean="skyblue") + ax = plt.subplot(111, projection=ccrs.Mercator()) + with warnings.catch_warnings(record=True): + setup_baja_bathymetry_map(ax, land="gray", ocean=None) diff --git a/verde/tests/test_mask.py b/verde/tests/test_mask.py index cfb80ed65..bd9b53837 100644 --- a/verde/tests/test_mask.py +++ b/verde/tests/test_mask.py @@ -6,10 +6,52 @@ import xarray as xr import pytest -from ..mask import distance_mask +from ..mask import distance_mask, convexhull_mask from ..coordinates import grid_coordinates +def test_convexhull_mask(): + "Check that the mask works for basic input" + region = (0, 5, -10, -4) + coords = grid_coordinates(region, spacing=1) + data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + mask = convexhull_mask(data_coords, coordinates=coords) + true = [ + [False, False, False, False, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, False, False, False, False], + [False, False, False, False, False, False], + ] + assert mask.tolist() == true + + +def test_convexhull_mask_projection(): + "Check that the mask works when given a projection" + region = (0, 5, -10, -4) + coords = grid_coordinates(region, spacing=1) + data_coords = ((2, 3, 2, 3), (-9, -9, -6, -6)) + # For a linear projection, the result should be the same since there is no + # area change in the data. + mask = convexhull_mask( + data_coords, + coordinates=coords, + projection=lambda e, n: (10 * e, 10 * n), + ) + true = [ + [False, False, False, False, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, True, True, False, False], + [False, False, False, False, False, False], + [False, False, False, False, False, False], + ] + assert mask.tolist() == true + + def test_distance_mask(): "Check that the mask works for basic input" region = (0, 5, -10, -4) diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py index 224a4993b..9c2c7995a 100644 --- a/verde/tests/test_model_selection.py +++ b/verde/tests/test_model_selection.py @@ -1,18 +1,57 @@ """ Test the model selection code (cross-validation, etc). """ +import warnings + +import pytest from sklearn.model_selection import ShuffleSplit +from sklearn.metrics import get_scorer +import numpy as np import numpy.testing as npt from dask.distributed import Client -from .. import Trend, grid_coordinates -from ..model_selection import cross_val_score +from .. 
import Vector, Trend, grid_coordinates, scatter_points +from ..model_selection import cross_val_score, BlockShuffleSplit, BlockKFold -def test_cross_val_score_client(): - "Test the deprecated dask Client interface" +@pytest.fixture(name="trend") +def fixture_trend(): + "Coordinates and data for a 1-degree trend" coords = grid_coordinates((0, 10, -10, -5), spacing=0.1) - data = 10 - coords[0] + 0.5 * coords[1] + coefs = (10, -1, 0.5) + data = coefs[0] + coefs[1] * coords[0] + coefs[2] * coords[1] + return coords, data, coefs + + +@pytest.mark.parametrize( + "metric,expected", + [(None, 1), ("r2", 1), (get_scorer("neg_mean_squared_error"), 0)], + ids=["none", "R2", "MSE"], +) +def test_cross_val_score(trend, metric, expected): + "Check that CV scores are perfect on a perfect model" + coords, data = trend[:2] + model = Trend(degree=1) + scores = cross_val_score(model, coords, data, scoring=metric) + npt.assert_allclose(scores, expected, atol=1e-10) + + +@pytest.mark.parametrize( + "metric,expected", + [(None, 1), ("r2", 1), (get_scorer("neg_mean_squared_error"), 0)], + ids=["none", "R2", "MSE"], +) +def test_cross_val_score_vector(trend, metric, expected): + "Check that CV works on Vector data types as well" + coords, data = trend[:2] + model = Vector([Trend(degree=1), Trend(degree=1)]) + scores = cross_val_score(model, coords, (data, data), scoring=metric) + npt.assert_allclose(scores, expected, atol=1e-10) + + +def test_cross_val_score_client(trend): + "Test the deprecated dask Client interface" + coords, data = trend[:2] model = Trend(degree=1) nsplits = 5 cross_validator = ShuffleSplit(n_splits=nsplits, random_state=0) @@ -22,3 +61,59 @@ def test_cross_val_score_client(): client.close() assert len(scores) == nsplits npt.assert_allclose(scores, 1) + + +def test_blockshufflesplit_fails_balancing(): + "Should raise an exception if balancing < 1." + with pytest.raises(ValueError): + BlockShuffleSplit(spacing=1, balancing=0) + + +@pytest.mark.parametrize("test_size", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9]) +def test_blockshufflesplit_balancing(test_size): + "Make sure that the sets have the right number of points" + coords = np.random.RandomState(seed=0).multivariate_normal( + mean=[5, -7.5], + cov=[[4, 0], [0, 9]], + size=1000, + ) + npoints = coords.shape[0] + train_size = 1 - test_size + cv = BlockShuffleSplit(spacing=1, random_state=0, test_size=test_size, balancing=20) + for train, test in cv.split(coords): + npt.assert_allclose(train.size / npoints, train_size, atol=0.01) + npt.assert_allclose(test.size / npoints, test_size, atol=0.01) + + +def test_blockkfold_fails_n_splits_too_small(): + "Should raise an exception if n_splits < 2." + BlockKFold(spacing=1, n_splits=2) + with pytest.raises(ValueError): + BlockKFold(spacing=1, n_splits=1) + + +def test_blockkfold_fails_n_splits_too_large(): + "Should raise an exception if n_splits > number of blocks."
+ coords = grid_coordinates(region=(0, 3, -10, -7), shape=(4, 4)) + features = np.transpose([i.ravel() for i in coords]) + next(BlockKFold(shape=(2, 2), n_splits=4).split(features)) + with pytest.raises(ValueError) as error: + next(BlockKFold(shape=(2, 2), n_splits=5).split(features)) + assert "Number of k-fold splits (5) cannot be greater" in str(error) + + +def test_blockkfold_cant_balance(): + "Should fall back to regular split if can't balance and print a warning" + coords = scatter_points(region=(0, 3, -10, -7), size=10, random_state=2) + features = np.transpose([i.ravel() for i in coords]) + cv = BlockKFold(shape=(4, 4), n_splits=8) + with warnings.catch_warnings(record=True) as warn: + splits = [i for _, i in cv.split(features)] + assert len(warn) == 1 + assert issubclass(warn[-1].category, UserWarning) + assert "Could not balance folds" in str(warn[-1].message) + # Should revert to the unbalanced version + cv_unbalanced = BlockKFold(shape=(4, 4), n_splits=8, balance=False) + splits_unbalanced = [i for _, i in cv_unbalanced.split(features)] + for balanced, unbalanced in zip(splits, splits_unbalanced): + npt.assert_allclose(balanced, unbalanced) diff --git a/verde/tests/test_projections.py b/verde/tests/test_projections.py new file mode 100644 index 000000000..ed6af1858 --- /dev/null +++ b/verde/tests/test_projections.py @@ -0,0 +1,115 @@ +""" +Test the projection functions. +""" +import numpy.testing as npt +import numpy as np +import xarray as xr +import pytest + +from ..scipygridder import ScipyGridder +from ..projections import project_grid + + +def projection(longitude, latitude): + "Dummy projection" + return longitude ** 2, latitude ** 2 + + +@pytest.mark.parametrize( + "method", + ["nearest", "linear", "cubic", ScipyGridder("nearest")], + ids=["nearest", "linear", "cubic", "gridder"], +) +def test_project_grid(method): + "Use a simple projection to test that the output is as expected" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + proj = project_grid(grid, projection, method=method) + assert proj.dims == ("northing", "easting") + assert proj.name == "scalars" + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + # Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +def test_project_grid_name(): + "Check that grid name is kept" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray( + data, coords=[lons, lats], dims=("latitude", "longitude"), name="yara" + ) + proj = project_grid(grid, projection) + assert proj.name == "yara" + assert proj.dims == ("northing", "easting") + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + # Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +@pytest.mark.parametrize("antialias", [True, False]) +def 
test_project_grid_antialias(antialias): + "Check if antialias is being used" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + proj = project_grid(grid, projection, antialias=antialias) + if antialias: + assert "BlockReduce" in proj.attrs["metadata"] + else: + assert "BlockReduce" not in proj.attrs["metadata"] + assert proj.dims == ("northing", "easting") + assert proj.name == "scalars" + assert proj.shape == shape + # Check the grid spacing is constant + spacing_east = proj.easting[1:] - proj.easting[0:-1] + npt.assert_allclose(spacing_east, spacing_east[0]) + spacing_north = proj.northing[1:] - proj.northing[0:-1] + npt.assert_allclose(spacing_north, spacing_north[0]) + # Check that the values are all 1 + npt.assert_allclose(proj.values[~np.isnan(proj.values)], 1) + + +def test_project_grid_fails_dataset(): + "Should raise an exception when given a Dataset" + shape = (50, 40) + lats = np.linspace(2, 10, shape[1]) + lons = np.linspace(-10, 2, shape[0]) + data = np.ones(shape, dtype="float") + grid = xr.DataArray(data, coords=[lons, lats], dims=("latitude", "longitude")) + grid = grid.to_dataset(name="scalars") + with pytest.raises(ValueError): + project_grid(grid, projection) + + +@pytest.mark.parametrize("ndims", [1, 3]) +def test_project_grid_fails_dimensions(ndims): + "Should raise an exception when given more or fewer than 2 dimensions" + shape = (10, 20, 12) + coords = [ + np.linspace(-10, 2, shape[0]), + np.linspace(2, 10, shape[1]), + np.linspace(0, 100, shape[2]), + ] + dims = ("height", "latitude", "longitude") + data = np.ones(shape[:ndims], dtype="float") + grid = xr.DataArray(data, coords=coords[:ndims], dims=dims[:ndims]) + with pytest.raises(ValueError): + project_grid(grid, projection) diff --git a/verde/tests/test_utils.py b/verde/tests/test_utils.py index 0f8f8722e..f1ea64559 100644 --- a/verde/tests/test_utils.py +++ b/verde/tests/test_utils.py @@ -5,11 +5,19 @@ import numpy as np import numpy.testing as npt +import xarray as xr from scipy.spatial import cKDTree # pylint: disable=no-name-in-module import pytest from ..coordinates import grid_coordinates -from ..utils import parse_engine, dummy_jit, kdtree +from ..utils import ( + parse_engine, + dummy_jit, + kdtree, + grid_to_table, + partition_by_sum, + make_xarray_grid, +) from .. import utils @@ -52,3 +60,155 @@ def test_kdtree(): npt.assert_allclose(dist, 0.1) if not use_pykdtree: assert isinstance(tree, cKDTree) + + +def test_grid_to_table_order(): + "Check that coordinates are in the right order when converting to tables" + lon, lat = grid_coordinates(region=(1, 10, -10, -1), shape=(3, 4)) + data = lon ** 2 + # If the DataArray is created with coords in an order that doesn't match + # the dims (which is valid), we were getting it wrong because we were + # relying on the order of the coords instead of dims. This test would have + # caught that bug.
+ grid = xr.DataArray( + data=data, + coords={"longitude": lon[0, :], "latitude": lat[:, 0]}, + dims=("latitude", "longitude"), + ).to_dataset(name="field") + table = grid_to_table(grid) + true_lat = [-10, -10, -10, -10, -5.5, -5.5, -5.5, -5.5, -1, -1, -1, -1] + true_lon = [1, 4, 7, 10, 1, 4, 7, 10, 1, 4, 7, 10] + true_field = [1, 16, 49, 100, 1, 16, 49, 100, 1, 16, 49, 100] + npt.assert_allclose(true_lat, table.latitude) + npt.assert_allclose(true_lon, table.longitude) + npt.assert_allclose(true_field, table.field) + + +def test_partition_by_sum_fails_size(): + "Should raise an exception if given more parts than elements." + with pytest.raises(ValueError) as error: + partition_by_sum(np.arange(10), 11) + assert "array of size 10 into 11 parts" in str(error) + + +def test_partition_by_sum_fails_no_partitions(): + "Should raise an exception if it cannot find unique partition points" + with pytest.raises(ValueError) as error: + partition_by_sum(np.arange(10), 8) + assert "Could not find partition points" in str(error) + + +def test_make_xarray_grid(): + """ + Check if xarray.Dataset is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + data = np.ones_like(coordinates[0]) + grid = make_xarray_grid(coordinates, data, data_names="dummy") + npt.assert_allclose(grid.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(grid.northing, [6, 7, 8, 9, 10]) + npt.assert_allclose(grid.dummy, 1) + assert grid.dummy.shape == (5, 6) + # Change dims + grid = make_xarray_grid( + coordinates, data, data_names="dummy", dims=("latitude", "longitude") + ) + npt.assert_allclose(grid.longitude, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(grid.latitude, [6, 7, 8, 9, 10]) + npt.assert_allclose(grid.dummy, 1) + assert grid.dummy.shape == (5, 6) + + +def test_make_xarray_grid_multiple_data(): + """ + Check if xarray.Dataset with multiple data variables is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + data_arrays = tuple(i * np.ones_like(coordinates[0]) for i in range(1, 4)) + data_names = list("data_{}".format(i) for i in range(1, 4)) + dataset = make_xarray_grid(coordinates, data_arrays, data_names=data_names) + npt.assert_allclose(dataset.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(dataset.northing, [6, 7, 8, 9, 10]) + for i in range(1, 4): + npt.assert_allclose(dataset["data_{}".format(i)], i) + assert dataset["data_{}".format(i)].shape == (5, 6) + + +def test_make_xarray_grid_extra_coords(): + """ + Check if xarray.Dataset with extra coords is correctly created + """ + region = (-10, -5, 6, 10) + spacing = 1 + extra_coords = [1, 2] + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=extra_coords) + data = np.ones_like(coordinates[0]) + dataset = make_xarray_grid( + coordinates, + data, + data_names="dummy", + extra_coords_names=["upward", "time"], + ) + npt.assert_allclose(dataset.easting, [-10, -9, -8, -7, -6, -5]) + npt.assert_allclose(dataset.northing, [6, 7, 8, 9, 10]) + npt.assert_allclose(dataset.upward, 1) + npt.assert_allclose(dataset.time, 2) + npt.assert_allclose(dataset.dummy, 1) + assert dataset.dummy.shape == (5, 6) + assert dataset.upward.shape == (5, 6) + assert dataset.time.shape == (5, 6) + + +def test_make_xarray_grid_invalid_names(): + """ + Check if errors are raised for invalid data names + """ + region = (-10, -5, 6, 10) + spacing = 1 + coordinates = grid_coordinates(region, spacing=spacing) + # Single data,
multiple data_names + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names=["bla_1", "bla_2"]) + # data_names equal to None + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names=None) + # Multiple data, single data_names string + data = tuple(i * np.ones_like(coordinates[0]) for i in (1, 2)) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names="blabla") + + +def test_make_xarray_grid_invalid_extra_coords(): + """ + Check if errors are raised for invalid extra coords + """ + region = (-10, -5, 6, 10) + spacing = 1 + # No extra coords, extra_coords_names should be ignored + coordinates = grid_coordinates(region, spacing=spacing) + data = np.ones_like(coordinates[0]) + make_xarray_grid(coordinates, data, data_names="dummy", extra_coords_names="upward") + # Single extra coord, extra_coords_names equal to None + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=1) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid(coordinates, data, data_names="dummy", extra_coords_names=None) + # Multiple extra coords, single extra_coords_names as a str + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=[1, 2]) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid( + coordinates, data, data_names="dummy", extra_coords_names="upward" + ) + # Multiple extra coords, number of extra_coords_names doesn't match + coordinates = grid_coordinates(region, spacing=spacing, extra_coords=[1, 2, 3]) + data = np.ones_like(coordinates[0]) + with pytest.raises(ValueError): + make_xarray_grid( + coordinates, data, data_names="dummy", extra_coords_names=["upward", "time"] + ) diff --git a/verde/utils.py b/verde/utils.py index 42ee7359e..6f1874094 100644 --- a/verde/utils.py +++ b/verde/utils.py @@ -6,6 +6,7 @@ import dask import numpy as np import pandas as pd +import xarray as xr from scipy.spatial import cKDTree # pylint: disable=no-name-in-module try: @@ -18,7 +19,13 @@ except ImportError: numba = None -from .base.utils import check_data, n_1d_arrays +from .base.utils import ( + check_coordinates, + check_extra_coords_names, + check_data, + check_data_names, + n_1d_arrays, +) def dispatch(function, delayed=False, client=None): @@ -207,6 +214,119 @@ def maxabs(*args, nan=True): return npmax(absolute) +def make_xarray_grid( + coordinates, + data, + data_names, + dims=("northing", "easting"), + extra_coords_names=None, +): + """ + Create an :class:`xarray.Dataset` grid from numpy arrays + + This function creates an :class:`xarray.Dataset` out of 2D gridded data + including easting and northing coordinates, any extra coordinates (like + upward elevation, time, etc.) and data arrays. + + Use this to transform the outputs of :func:`verde.grid_coordinates` and + the ``predict`` method of a gridder into an :class:`xarray.Dataset`. + + .. note:: + + This is a utility function to help create 2D grids (i.e., grids with + two ``dims`` coordinates). For arbitrary N-dimensional arrays, use + :mod:`xarray` directly. + + Parameters + ---------- + coordinates : tuple of arrays + Arrays with coordinates of each point in the grid. Each array must + contain values for a dimension in the order: easting, northing, + vertical, etc. All arrays must be 2D and need to have the same *shape*. + These coordinates can be generated through + :func:`verde.grid_coordinates`.
+ data : array or tuple of arrays + Array or tuple of arrays with data values on each point in the grid. + Each array must contain values for a dimension in the same order as + the coordinates. All arrays need to have the same *shape*. + data_names : str or list + The name(s) of the data variables in the output grid. + dims : list + The names of the northing and easting data dimensions, respectively, + in the output grid. Must be defined in the following order: northing + dimension, easting dimension. + **NOTE: This is an exception to the "easting" then + "northing" pattern but is required for compatibility with xarray.** + The easting and northing coordinates in the :class:`xarray.Dataset` + will have the same names as the passed dimensions. + extra_coords_names : str or list + Name or list of names for any additional coordinates besides the + easting and northing ones. Ignored if *coordinates* has + only two elements. The extra coordinates are non-index coordinates of + the grid array. + + Returns + ------- + grid : :class:`xarray.Dataset` + A 2D grid with one or more data variables. + + Examples + -------- + + >>> import numpy as np + >>> import verde as vd + >>> # Create the coordinates of the regular grid + >>> coordinates = vd.grid_coordinates((-10, -6, 8, 10), spacing=2) + >>> # And some dummy data for each point of the grid + >>> data = np.ones_like(coordinates[0]) + >>> # Create the grid + >>> grid = make_xarray_grid(coordinates, data, data_names="dummy") + >>> print(grid) + <xarray.Dataset> + Dimensions: (easting: 3, northing: 2) + Coordinates: + * easting (easting) float64 -10.0 -8.0 -6.0 + * northing (northing) float64 8.0 10.0 + Data variables: + dummy (northing, easting) float64 1.0 1.0 1.0 1.0 1.0 1.0 + + >>> # Create a grid with an extra coordinate + >>> coordinates = vd.grid_coordinates( + ... (-10, -6, 8, 10), spacing=2, extra_coords=5 + ... ) + >>> # And some dummy data for each point of the grid + >>> data = np.ones_like(coordinates[0]) + >>> # Create the grid + >>> grid = make_xarray_grid( + ... coordinates, data, data_names="dummy", extra_coords_names="upward" + ... ) + >>> print(grid) + <xarray.Dataset> + Dimensions: (easting: 3, northing: 2) + Coordinates: + * easting (easting) float64 -10.0 -8.0 -6.0 + * northing (northing) float64 8.0 10.0 + upward (northing, easting) float64 5.0 5.0 5.0 5.0 5.0 5.0 + Data variables: + dummy (northing, easting) float64 1.0 1.0 1.0 1.0 1.0 1.0 + + """ + coordinates = check_coordinates(coordinates) + data = check_data(data) + data_names = check_data_names(data, data_names) + # dims is like shape with order (rows, cols) for the array + # so the first element is northing and the second is easting + coords = {dims[1]: coordinates[0][0, :], dims[0]: coordinates[1][:, 0]} + # Extra coordinates are handled like 2D data arrays with + # the same dims as the data. + if coordinates[2:]: + extra_coords_names = check_extra_coords_names(coordinates, extra_coords_names) + for name, extra_coord in zip(extra_coords_names, coordinates[2:]): + coords[name] = (dims, extra_coord) + data_vars = {name: (dims, value) for name, value in zip(data_names, data)} + return xr.Dataset(data_vars, coords) + + def grid_to_table(grid): """ Convert a grid to a table with the values and coordinates of each point. @@ -218,13 +338,16 @@ def grid_to_table(grid): Parameters ---------- - grid : :class:`xarray.Dataset` + grid : :class:`xarray.Dataset` or :class:`xarray.DataArray` + A 2D grid with one or more data variables.
Returns ------- table : :class:`pandas.DataFrame` Table with coordinates and variable values for each point in the grid. + Column names are taken from the grid. If *grid* is a + :class:`xarray.DataArray` that doesn't have a ``name`` attribute + defined, the column with data values will be called ``"scalars"``. Examples -------- @@ -237,16 +360,37 @@ def grid_to_table(grid): ... coords=(np.arange(4), np.arange(5, 10)), ... dims=['northing', 'easting'] ... ) + >>> print(temperature.values) + [[ 0 1 2 3 4] + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19]] + >>> # For DataArrays, the data column will be "scalars" by default + >>> table = grid_to_table(temperature) + >>> list(sorted(table.columns)) + ['easting', 'northing', 'scalars'] + >>> print(table.scalars.values) + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] + >>> print(table.northing.values) + [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3] + >>> print(table.easting.values) + [5 6 7 8 9 5 6 7 8 9 5 6 7 8 9 5 6 7 8 9] + >>> # If the DataArray defines a "name", we will use that instead + >>> temperature.name = "temperature_K" + >>> table = grid_to_table(temperature) + >>> list(sorted(table.columns)) + ['easting', 'northing', 'temperature_K'] + >>> # Conversion of Datasets will preserve the data variable names >>> grid = xr.Dataset({"temperature": temperature}) >>> table = grid_to_table(grid) >>> list(sorted(table.columns)) ['easting', 'northing', 'temperature'] + >>> print(table.temperature.values) + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] >>> print(table.northing.values) [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3] >>> print(table.easting.values) [5 6 7 8 9 5 6 7 8 9 5 6 7 8 9 5 6 7 8 9] - >>> print(table.temperature.values) - [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] >>> # Grids with multiple data variables will have more columns. >>> wind_speed = xr.DataArray( ... 
np.arange(20, 40).reshape((4, 5)), @@ -267,23 +411,24 @@ def grid_to_table(grid): [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] """ - coordinate_names = [*grid.coords.keys()] - coord_north = grid.coords[coordinate_names[0]].values - coord_east = grid.coords[coordinate_names[1]].values - coordinates = [i.ravel() for i in np.meshgrid(coord_east, coord_north)] - coord_dict = { - coordinate_names[0]: coordinates[1], - coordinate_names[1]: coordinates[0], - } - variable_name = [*grid.data_vars.keys()] - variable_data = grid.to_array().values - variable_arrays = variable_data.reshape( - len(variable_name), int(len(variable_data.ravel()) / len(variable_name)) - ) - var_dict = dict(zip(variable_name, variable_arrays)) - coord_dict.update(var_dict) - data = pd.DataFrame(coord_dict) - return data + if hasattr(grid, "data_vars"): + # It's a Dataset + data_names = list(grid.data_vars.keys()) + data_arrays = [grid[name].values.ravel() for name in data_names] + coordinate_names = list(grid[data_names[0]].dims) + else: + # It's a DataArray + data_names = [grid.name if grid.name is not None else "scalars"] + data_arrays = [grid.values.ravel()] + coordinate_names = list(grid.dims) + north = grid.coords[coordinate_names[0]].values + east = grid.coords[coordinate_names[1]].values + # Need to flip the coordinates because the names are in northing and + # easting order + coordinates = [i.ravel() for i in np.meshgrid(east, north)][::-1] + data_dict = dict(zip(coordinate_names, coordinates)) + data_dict.update(dict(zip(data_names, data_arrays))) + return pd.DataFrame(data_dict) def kdtree(coordinates, use_pykdtree=True, **kwargs): @@ -323,3 +468,121 @@ def kdtree(coordinates, use_pykdtree=True, **kwargs): else: tree = cKDTree(points, **kwargs) return tree + + +def partition_by_sum(array, parts): + """ + Partition an array into parts of approximately equal sum. + + Does not change the order of the array elements. + + Produces the partition indices on the array. Use :func:`numpy.split` to + divide the array along these indices. + + .. warning:: + + Depending on the input and number of parts, there might not exist + partition points. In these cases, the function will raise + ``ValueError``. This is more likely to happen as the number of parts + approaches the number of elements in the array. + + Parameters + ---------- + array : array or array-like + The 1D array that will be partitioned. The array will be raveled before + computations. + parts : int + Number of parts to split the array. Can be at most the number of + elements in the array. + + Returns + ------- + indices : array + The indices in which the array should be split. + + Notes + ----- + + Solution from https://stackoverflow.com/a/54024280 + + Examples + -------- + + >>> import numpy as np + >>> array = np.arange(10) + >>> split_points = partition_by_sum(array, parts=2) + >>> print(split_points) + [7] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [0 1 2 3 4 5 6] 21 + [7 8 9] 24 + >>> split_points = partition_by_sum(array, parts=3) + >>> print(split_points) + [6 8] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [0 1 2 3 4 5] 15 + [6 7] 13 + [8 9] 17 + >>> split_points = partition_by_sum(array, parts=5) + >>> print(split_points) + [4 6 7 9] + >>> for part in np.split(array, split_points): + ... 
print(part, part.sum()) + [0 1 2 3] 6 + [4 5] 9 + [6] 6 + [7 8] 15 + [9] 9 + >>> # Use an array with a random-looking element order + >>> array = [5, 6, 4, 6, 8, 1, 2, 6, 3, 3] + >>> split_points = partition_by_sum(array, parts=2) + >>> print(split_points) + [4] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [5 6 4 6] 21 + [8 1 2 6 3 3] 23 + >>> # Splits can have very different sums but this is the best that can be done + >>> # without changing the order of the array. + >>> split_points = partition_by_sum(array, parts=5) + >>> print(split_points) + [1 3 4 7] + >>> for part in np.split(array, split_points): + ... print(part, part.sum()) + [5] 5 + [6 4] 10 + [6] 6 + [8 1 2] 11 + [6 3 3] 12 + + """ + array = np.atleast_1d(array).ravel() + if parts > array.size: + raise ValueError( + "Cannot partition an array of size {} into {} parts of equal sum.".format( + array.size, parts + ) + ) + cumulative_sum = array.cumsum() + # Ideally, we want each part to have the same sum (total / + # parts). + ideal_sum = cumulative_sum[-1] // parts + # If the parts are ideal, the cumulative sum of each part will be this + ideal_cumsum = np.arange(1, parts) * ideal_sum + # Find the places in the real cumulative sum where the ideal values would + # be. These are the split points. Between each split point, the sum of + # elements will be approximately the ideal sum. Need to insert to the right + # side so that we find cumsum[i - 1] <= ideal < cumsum[i]. This way, if a + # part has ideal sum, the last element (i - 1) will be included. Otherwise, + # we would never have ideal sums. + indices = np.searchsorted(cumulative_sum, ideal_cumsum, side="right") + # Check for repeated split points, which indicates that there is no way to + # split the array. + if np.unique(indices).size != indices.size: + raise ValueError( + "Could not find partition points to split the array into {} parts " + "of equal sum.".format(parts) + ) + return indices
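The two utilities added above are designed to compose: `make_xarray_grid` packs the arrays from `grid_coordinates` (or a gridder's `predict` output) into an `xarray.Dataset`, and `grid_to_table` flattens that grid back into one row per point. A minimal sketch of the round trip, based only on the signatures documented above (the region and spacing values are arbitrary):

```
import numpy as np

from verde.coordinates import grid_coordinates
from verde.utils import make_xarray_grid, grid_to_table

# Build a small regular grid and constant dummy data
coordinates = grid_coordinates((-10, -6, 8, 10), spacing=2)
data = np.ones_like(coordinates[0])

# numpy arrays -> xarray.Dataset with named dimensions and data variables
grid = make_xarray_grid(coordinates, data, data_names="dummy")

# xarray.Dataset -> pandas.DataFrame with one row per grid point
table = grid_to_table(grid)
print(sorted(table.columns))  # ['dummy', 'easting', 'northing']
```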