diff --git a/.pep8speaks.yml b/.pep8speaks.yml index aedce6e44eb..018003f2223 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -1,12 +1,16 @@ # File : .pep8speaks.yml +# This should be kept in sync with the duplicate config in the [pycodestyle] +# block of setup.cfg. + scanner: - diff_only: True # If True, errors caused by only the patch are shown + diff_only: False # If True, errors caused by only the patch are shown pycodestyle: max-line-length: 79 ignore: # Errors and warnings to ignore - - E402, # module level import not at top of file - - E731, # do not assign a lambda expression, use a def - - W503 # line break before binary operator - - W504 # line break after binary operator + - E402 # module level import not at top of file + - E731 # do not assign a lambda expression, use a def + - E741 # ambiguous variable name + - W503 # line break before binary operator + - W504 # line break after binary operator diff --git a/.travis.yml b/.travis.yml index fbb7221d7ea..8e1866de8d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,52 +17,35 @@ matrix: - env: - CONDA_ENV=py36 - EXTRA_FLAGS="--run-flaky --run-network-tests" - - env: CONDA_ENV=py36-netcdf4-dev - addons: - apt_packages: - - libhdf5-serial-dev - - netcdf-bin - - libnetcdf-dev - env: CONDA_ENV=py36-dask-dev - env: CONDA_ENV=py36-pandas-dev - env: CONDA_ENV=py36-bottleneck-dev - - env: CONDA_ENV=py36-condaforge-rc - - env: CONDA_ENV=py36-pynio-dev - env: CONDA_ENV=py36-rasterio - env: CONDA_ENV=py36-zarr-dev - env: CONDA_ENV=docs - - env: CONDA_ENV=flake8 + - env: CONDA_ENV=lint - env: CONDA_ENV=py36-hypothesis allow_failures: - env: - CONDA_ENV=py36 - EXTRA_FLAGS="--run-flaky --run-network-tests" - - env: CONDA_ENV=py36-netcdf4-dev - addons: - apt_packages: - - libhdf5-serial-dev - - netcdf-bin - - libnetcdf-dev - env: CONDA_ENV=py36-pandas-dev - env: CONDA_ENV=py36-bottleneck-dev - - env: CONDA_ENV=py36-condaforge-rc - - env: CONDA_ENV=py36-pynio-dev - env: CONDA_ENV=py36-zarr-dev before_install: - - wget http://repo.continuum.io/miniconda/Miniconda3-3.16.0-Linux-x86_64.sh -O miniconda.sh; + - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - hash -r - conda config --set always_yes yes --set changeps1 no --set show_channel_urls true - - conda update -q conda - conda info -a install: - if [[ "$CONDA_ENV" == "docs" ]]; then conda env create -n test_env --file doc/environment.yml; - elif [[ "$CONDA_ENV" == "flake8" ]]; then + elif [[ "$CONDA_ENV" == "lint" ]]; then conda env create -n test_env --file ci/requirements-py37.yml; else conda env create -n test_env --file ci/requirements-$CONDA_ENV.yml; @@ -79,8 +62,8 @@ script: - if [[ "$CONDA_ENV" == "docs" ]]; then conda install -c conda-forge sphinx sphinx_rtd_theme sphinx-gallery numpydoc; sphinx-build -n -j auto -b html -d _build/doctrees doc _build/html; - elif [[ "$CONDA_ENV" == "flake8" ]]; then - flake8 xarray ; + elif [[ "$CONDA_ENV" == "lint" ]]; then + pycodestyle xarray ; elif [[ "$CONDA_ENV" == "py36-hypothesis" ]]; then pytest properties ; else diff --git a/README.rst b/README.rst index 0ac71d33954..a4c8f6d200b 100644 --- a/README.rst +++ b/README.rst @@ -18,20 +18,34 @@ xarray: N-D labeled arrays and datasets .. 
image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A :target: http://numfocus.org -**xarray** (formerly **xray**) is an open source project and Python package that aims to bring the -labeled data power of pandas_ to the physical sciences, by providing -N-dimensional variants of the core pandas data structures. - -Our goal is to provide a pandas-like and pandas-compatible toolkit for -analytics on multi-dimensional arrays, rather than the tabular data for which -pandas excels. Our approach adopts the `Common Data Model`_ for self- -describing scientific data in widespread use in the Earth sciences: -``xarray.Dataset`` is an in-memory representation of a netCDF file. - +**xarray** (formerly **xray**) is an open source project and Python package +that makes working with labelled multi-dimensional arrays simple, +efficient, and fun! + +Multi-dimensional (a.k.a. N-dimensional, ND) arrays (sometimes called +"tensors") are an essential part of computational science. +They are encountered in a wide range of fields, including physics, astronomy, +geoscience, bioinformatics, engineering, finance, and deep learning. +In Python, NumPy_ provides the fundamental data structure and API for +working with raw ND arrays. +However, real-world datasets are usually more than just raw numbers; +they have labels which encode information about how the array values map +to locations in space, time, etc. + +By introducing *dimensions*, *coordinates*, and *attributes* on top of raw +NumPy-like arrays, xarray is able to understand these labels and use them to +provide a more intuitive, more concise, and less error-prone experience. +Xarray also provides a large and growing library of functions for advanced +analytics and visualization with these data structures. +Xarray was inspired by and borrows heavily from pandas_, the popular data +analysis package focused on labelled tabular data. +Xarray can read and write data from most common labeled ND-array storage +formats and is particularly tailored to working with netCDF_ files, which were +the source of xarray's data model. + +.. _NumPy: http://www.numpy.org/ .. _pandas: http://pandas.pydata.org -.. _Common Data Model: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM .. _netCDF: http://www.unidata.ucar.edu/software/netcdf -.. _OPeNDAP: http://www.opendap.org/ Why xarray? 
----------- diff --git a/ci/install_python.ps1 b/ci/install_python.ps1 index 63c476e65e5..e9cfae01bde 100644 --- a/ci/install_python.ps1 +++ b/ci/install_python.ps1 @@ -2,16 +2,16 @@ # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ -$MINICONDA_URL = "http://repo.continuum.io/miniconda/" +$MINICONDA_URL = "https://repo.anaconda.com/miniconda/" $BASE_URL = "https://www.python.org/ftp/python/" function DownloadMiniconda ($python_version, $platform_suffix) { $webclient = New-Object System.Net.WebClient - if ($python_version -match "3.6") { - $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" - } else { + if ($python_version -match "2.7") { $filename = "Miniconda2-latest-Windows-" + $platform_suffix + ".exe" + } else { + $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" } $url = $MINICONDA_URL + $filename diff --git a/ci/requirements-py35.yml b/ci/requirements-py35.yml index 29f4bb020fc..a71434865cc 100644 --- a/ci/requirements-py35.yml +++ b/ci/requirements-py35.yml @@ -10,8 +10,8 @@ dependencies: - matplotlib=1.5 - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - flake8 - numpy diff --git a/ci/requirements-py36-bottleneck-dev.yml b/ci/requirements-py36-bottleneck-dev.yml index bdf0349b5c0..3f08648be32 100644 --- a/ci/requirements-py36-bottleneck-dev.yml +++ b/ci/requirements-py36-bottleneck-dev.yml @@ -11,8 +11,8 @@ dependencies: - matplotlib - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - flake8 - numpy diff --git a/ci/requirements-py36-condaforge-rc.yml b/ci/requirements-py36-condaforge-rc.yml deleted file mode 100644 index ba980deeeea..00000000000 --- a/ci/requirements-py36-condaforge-rc.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: test_env -channels: - - conda-forge/label/rc - - conda-forge -dependencies: - - python=3.6 - - cftime - - dask - - distributed - - h5py - - h5netcdf - - matplotlib - - netcdf4 - - pytest - - pytest-env - - pytest-cov - - coveralls - - flake8 - - numpy - - pandas - - seaborn - - scipy - - toolz diff --git a/ci/requirements-py36-dask-dev.yml b/ci/requirements-py36-dask-dev.yml index 20b10fe29ee..32d01765439 100644 --- a/ci/requirements-py36-dask-dev.yml +++ b/ci/requirements-py36-dask-dev.yml @@ -9,8 +9,8 @@ dependencies: - matplotlib - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - flake8 - numpy diff --git a/ci/requirements-py36-hypothesis.yml b/ci/requirements-py36-hypothesis.yml index c5c228095a4..8066a53b6bc 100644 --- a/ci/requirements-py36-hypothesis.yml +++ b/ci/requirements-py36-hypothesis.yml @@ -10,8 +10,8 @@ dependencies: - matplotlib - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - hypothesis - flake8 diff --git a/ci/requirements-py36-netcdf4-dev.yml b/ci/requirements-py36-netcdf4-dev.yml deleted file mode 100644 index 2616a113fa4..00000000000 --- a/ci/requirements-py36-netcdf4-dev.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: test_env -channels: - - conda-forge -dependencies: - - python=3.6 - - cython - - dask - - distributed - - h5py - - h5netcdf - - matplotlib - - pytest - - pytest-env - - pytest-cov - - coveralls - - flake8 - - numpy - - pandas - - scipy - - toolz - - pip: - - git+https://github.com/Unidata/netcdf4-python.git - - git+https://github.com/Unidata/cftime.git diff --git a/ci/requirements-py36-pandas-dev.yml b/ci/requirements-py36-pandas-dev.yml index 2b914f746ab..bc0e5d0de09 100644 --- 
a/ci/requirements-py36-pandas-dev.yml +++ b/ci/requirements-py36-pandas-dev.yml @@ -12,8 +12,8 @@ dependencies: - matplotlib - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - flake8 - numpy diff --git a/ci/requirements-py36-pynio-dev.yml b/ci/requirements-py36-pynio-dev.yml deleted file mode 100644 index b8987611a6e..00000000000 --- a/ci/requirements-py36-pynio-dev.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: test_env -channels: - - conda-forge - - conda-forge/label/dev -dependencies: - - python=3.6 - - cftime - - dask - - distributed - - h5py - - h5netcdf - - matplotlib - - netcdf4 - - pynio=dev - - pytest - - pytest-env - - pytest-cov - - coveralls - - numpy - - pandas - - scipy - - seaborn - - toolz - - rasterio - - bottleneck - - pydap diff --git a/ci/requirements-py36-rasterio.yml b/ci/requirements-py36-rasterio.yml index dda9ea8cd29..e5ef1d29777 100644 --- a/ci/requirements-py36-rasterio.yml +++ b/ci/requirements-py36-rasterio.yml @@ -11,8 +11,8 @@ dependencies: - matplotlib - netcdf4 - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - numpy - pandas diff --git a/ci/requirements-py36-zarr-dev.yml b/ci/requirements-py36-zarr-dev.yml index 9966cf74815..94bdc50fbfe 100644 --- a/ci/requirements-py36-zarr-dev.yml +++ b/ci/requirements-py36-zarr-dev.yml @@ -8,8 +8,8 @@ dependencies: - distributed - matplotlib - pytest - - pytest-env - pytest-cov + - pytest-env - coveralls - flake8 - numpy diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index 2986dc33adb..311e4a275a8 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -14,7 +14,7 @@ dependencies: - pytest-cov - pytest-env - coveralls - - flake8 + - pycodestyle - numpy - pandas - scipy @@ -32,3 +32,4 @@ dependencies: - lxml - pip: - cfgrib>=0.9.2 + - mypy==0.650 diff --git a/ci/requirements-py37.yml b/ci/requirements-py37.yml index 86a44ed5398..1a98e6b285c 100644 --- a/ci/requirements-py37.yml +++ b/ci/requirements-py37.yml @@ -14,7 +14,7 @@ dependencies: - pytest-cov - pytest-env - coveralls - - flake8 + - pycodestyle - numpy - pandas - scipy @@ -28,4 +28,5 @@ dependencies: - eccodes - pydap - pip: - - cfgrib>=0.9.2 \ No newline at end of file + - cfgrib>=0.9.2 + - mypy==0.650 diff --git a/doc/api.rst b/doc/api.rst index 72979db3213..6fd3a737617 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -147,6 +147,7 @@ Computation Dataset.groupby Dataset.groupby_bins Dataset.rolling + Dataset.coarsen Dataset.resample Dataset.diff Dataset.quantile @@ -312,6 +313,7 @@ Computation DataArray.groupby DataArray.groupby_bins DataArray.rolling + DataArray.coarsen DataArray.dt DataArray.resample DataArray.get_axis_num diff --git a/doc/computation.rst b/doc/computation.rst index f1d1450a6dc..412f24eee6a 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -199,6 +199,47 @@ You can also use ``construct`` to compute a weighted rolling sum: To avoid this, use ``skipna=False`` as the above example. +.. _comput.coarsen: + +Coarsen large arrays +==================== + +``DataArray`` and ``Dataset`` objects include a +:py:meth:`~xarray.DataArray.coarsen` and :py:meth:`~xarray.Dataset.coarsen` +methods. This supports the block aggregation along multiple dimensions, + +.. 
ipython:: python + + x = np.linspace(0, 10, 300) + t = pd.date_range('15/12/1999', periods=364) + da = xr.DataArray(np.sin(x) * np.cos(np.linspace(0, 1, 364)[:, np.newaxis]), + dims=['time', 'x'], coords={'time': t, 'x': x}) + da + +In order to take a block mean for every 7 days along ``time`` dimension and +every 2 points along ``x`` dimension, + +.. ipython:: python + + da.coarsen(time=7, x=2).mean() + +:py:meth:`~xarray.DataArray.coarsen` raises an ``ValueError`` if the data +length is not a multiple of the corresponding window size. +You can choose ``boundary='trim'`` or ``boundary='pad'`` options for trimming +the excess entries or padding ``nan`` to insufficient entries, + +.. ipython:: python + + da.coarsen(time=30, x=2, boundary='trim').mean() + +If you want to apply a specific function to coordinate, you can pass the +function or method name to ``coord_func`` option, + +.. ipython:: python + + da.coarsen(time=7, x=2, coord_func={'time': 'min'}).mean() + + Computation using Coordinates ============================= diff --git a/doc/contributing.rst b/doc/contributing.rst index ceba81d9319..da9c89234a3 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -345,19 +345,26 @@ the more common ``PEP8`` issues: - passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` :ref:`Continuous Integration ` will run -the `flake8 `_ tool +the `pycodestyle `_ tool and report any stylistic errors in your code. Therefore, it is helpful before submitting code to run the check yourself:: - flake8 + pycodestyle xarray -If you install `isort `_ and -`flake8-isort `_, this will also show -any errors from incorrectly sorted imports. These aren't currently enforced in -CI. To automatically sort imports, you can run:: +Other recommended but optional tools for checking code quality (not currently +enforced in CI): - isort -y +- `mypy `_ performs static type checking, which can + make it easier to catch bugs. Please run ``mypy xarray`` if you annotate any + code with `type hints `_. +- `flake8 `_ includes a few more automated + checks than those enforced by pycodestyle. +- `isort `_ will highlight + incorrectly sorted imports. ``isort -y`` will automatically fix them. See + also `flake8-isort `_. +Note that your code editor probably supports extensions that can show results +of these checks inline as you type. Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -365,8 +372,7 @@ Backwards Compatibility Please try to maintain backward compatibility. *xarray* has growing number of users with lots of existing code, so don't break it if at all possible. If you think breakage is required, clearly state why as part of the pull request. Also, be careful when changing -method signatures and add deprecation warnings where needed. Also, add the deprecated -sphinx directive to the deprecated functions or methods. +method signatures and add deprecation warnings where needed. .. _contributing.ci: diff --git a/doc/index.rst b/doc/index.rst index 45897f4bccb..fe6d2874953 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -2,19 +2,33 @@ xarray: N-D labeled arrays and datasets in Python ================================================= **xarray** (formerly **xray**) is an open source project and Python package -that aims to bring the labeled data power of pandas_ to the physical sciences, -by providing N-dimensional variants of the core pandas data structures. 
- -Our goal is to provide a pandas-like and pandas-compatible toolkit for -analytics on multi-dimensional arrays, rather than the tabular data for which -pandas excels. Our approach adopts the `Common Data Model`_ for self- -describing scientific data in widespread use in the Earth sciences: -``xarray.Dataset`` is an in-memory representation of a netCDF file. - +that makes working with labelled multi-dimensional arrays simple, +efficient, and fun! + +Multi-dimensional (a.k.a. N-dimensional, ND) arrays (sometimes called +"tensors") are an essential part of computational science. +They are encountered in a wide range of fields, including physics, astronomy, +geoscience, bioinformatics, engineering, finance, and deep learning. +In Python, NumPy_ provides the fundamental data structure and API for +working with raw ND arrays. +However, real-world datasets are usually more than just raw numbers; +they have labels which encode information about how the array values map +to locations in space, time, etc. + +By introducing *dimensions*, *coordinates*, and *attributes* on top of raw +NumPy-like arrays, xarray is able to understand these labels and use them to +provide a more intuitive, more concise, and less error-prone experience. +Xarray also provides a large and growing library of functions for advanced +analytics and visualization with these data structures. +Xarray was inspired by and borrows heavily from pandas_, the popular data +analysis package focused on labelled tabular data. +Xarray can read and write data from most common labeled ND-array storage +formats and is particularly tailored to working with netCDF_ files, which were +the source of xarray's data model. + +.. _NumPy: http://www.numpy.org/ .. _pandas: http://pandas.pydata.org -.. _Common Data Model: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM .. _netCDF: http://www.unidata.ucar.edu/software/netcdf -.. _OPeNDAP: http://www.opendap.org/ Documentation ------------- @@ -106,7 +120,7 @@ See also .. _2015 Unidata Users Workshop talk: https://www.youtube.com/watch?v=J9ypQOnt5l8 .. _tutorial: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial.ipynb .. _with answers: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial-with-answers.ipynb -.. _Nicolas Fauchereau's tutorial: http://nbviewer.ipython.org/github/nicolasfauchereau/metocean/blob/master/notebooks/xray.ipynb +.. _Nicolas Fauchereau's tutorial: http://nbviewer.iPython.org/github/nicolasfauchereau/metocean/blob/master/notebooks/xray.ipynb Get in touch ------------ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abbe9da29fa..cb6344cceab 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,9 +13,9 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.11.2: +.. _whats-new.0.11.3: -v0.11.2 (unreleased) +v0.11.3 (unreleased) -------------------- Breaking changes @@ -28,16 +28,34 @@ Breaking changes Enhancements ~~~~~~~~~~~~ +- :py:meth:`~xarray.DataArray.coarsen` and + :py:meth:`~xarray.Dataset.coarsen` are newly added. + See :ref:`comput.coarsen` for details. + (:issue:`2525`) + By `Keisuke Fujii `_. +- Upsampling an array via interpolation with resample is now dask-compatible, + as long as the array is not chunked along the resampling dimension. + By `Spencer Clark `_. + Bug fixes ~~~~~~~~~ -.. 
_whats-new.0.11.1: +- Interpolating via resample now internally specifies ``bounds_error=False`` + as an argument to ``scipy.interpolate.interp1d``, allowing for interpolation + from higher frequencies to lower frequencies. Datapoints outside the bounds + of the original time coordinate are now filled with NaN (:issue:`2197`). By + `Spencer Clark `_. +- Saving files with times encoded with reference dates with timezones + (e.g. '2000-01-01T00:00:00-05:00') no longer raises an error + (:issue:`2649`). By `Spencer Clark `_. -v0.11.1 (29 December 2018) --------------------------- +.. _whats-new.0.11.2: -This minor release includes a number of enhancements and bug fixes, and two -(slightly) breaking changes. +v0.11.2 (2 January 2019) +------------------------ + +Removes inadvertently introduced setup dependency on pytest-runner +(:issue:`2641`). Otherwise, this release is exactly equivalent to 0.11.1. .. warning:: @@ -49,14 +67,22 @@ This minor release includes a number of enhancements and bug fixes, and two - `Python 3 Statement `__ - `Tips on porting to Python 3 `__ +.. _whats-new.0.11.1: + +v0.11.1 (29 December 2018) +-------------------------- + +This minor release includes a number of enhancements and bug fixes, and two +(slightly) breaking changes. + Breaking changes ~~~~~~~~~~~~~~~~ - Minimum rasterio version increased from 0.36 to 1.0 (for ``open_rasterio``) - Time bounds variables are now also decoded according to CF conventions (:issue:`2565`). The previous behavior was to decode them only if they - had specific time attributes, now these attributes are copied - automatically from the corresponding time coordinate. This might + had specific time attributes, now these attributes are copied + automatically from the corresponding time coordinate. This might brake downstream code that was relying on these variables to be not decoded. By `Fabien Maussion `_. diff --git a/setup.cfg b/setup.cfg index 847415ac04d..c80ff300a60 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,21 +10,74 @@ filterwarnings = env = UVCDAT_ANONYMOUS_LOG=no -[flake8] +# This should be kept in sync with .pep8speaks.yml +[pycodestyle] max-line-length=79 -ignore= - E402 # module level import not at top of file - E731 # do not assign a lambda expression, use a def - W503 # line break before binary operator - W504 # line break after binary operator -exclude= - doc/ +ignore=E402,E731,E741,W503,W504 [isort] default_section=THIRDPARTY known_first_party=xarray multi_line_output=4 +# Most of the numerical computing stack doesn't have type annotations yet. 
+[mypy-bottleneck.*] +ignore_missing_imports = True +[mypy-cdms2.*] +ignore_missing_imports = True +[mypy-cf_units.*] +ignore_missing_imports = True +[mypy-cfgrib.*] +ignore_missing_imports = True +[mypy-cftime.*] +ignore_missing_imports = True +[mypy-dask.*] +ignore_missing_imports = True +[mypy-distributed.*] +ignore_missing_imports = True +[mypy-h5netcdf.*] +ignore_missing_imports = True +[mypy-h5py.*] +ignore_missing_imports = True +[mypy-iris.*] +ignore_missing_imports = True +[mypy-matplotlib.*] +ignore_missing_imports = True +[mypy-Nio.*] +ignore_missing_imports = True +[mypy-numpy.*] +ignore_missing_imports = True +[mypy-netCDF4.*] +ignore_missing_imports = True +[mypy-netcdftime.*] +ignore_missing_imports = True +[mypy-pandas.*] +ignore_missing_imports = True +[mypy-PseudoNetCDF.*] +ignore_missing_imports = True +[mypy-pydap.*] +ignore_missing_imports = True +[mypy-pytest.*] +ignore_missing_imports = True +[mypy-rasterio.*] +ignore_missing_imports = True +[mypy-scipy.*] +ignore_missing_imports = True +[mypy-seaborn.*] +ignore_missing_imports = True +[mypy-toolz.*] +ignore_missing_imports = True +[mypy-zarr.*] +ignore_missing_imports = True + +# written by versioneer +[mypy-xarray._version] +ignore_errors = True +# version spanning code is hard to type annotate (and most of this module will +# be going away soon anyways) +[mypy-xarray.core.pycompat] +ignore_errors = True + [versioneer] VCS = git style = pep440 diff --git a/setup.py b/setup.py index ac226c4cab5..8c0c98ab33d 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,8 @@ PYTHON_REQUIRES = '>=3.5' INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] -SETUP_REQUIRES = ['pytest-runner >= 4.2'] +needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv) +SETUP_REQUIRES = ['pytest-runner >= 4.2'] if needs_pytest else [] TESTS_REQUIRE = ['pytest >= 2.7.1'] if sys.version_info[0] < 3: TESTS_REQUIRE.append('mock') @@ -34,19 +35,33 @@ DESCRIPTION = "N-D labeled arrays and datasets in Python" LONG_DESCRIPTION = """ **xarray** (formerly **xray**) is an open source project and Python package -that aims to bring the labeled data power of pandas_ to the physical sciences, -by providing N-dimensional variants of the core pandas data structures. +that makes working with labelled multi-dimensional arrays simple, +efficient, and fun! -Our goal is to provide a pandas-like and pandas-compatible toolkit for -analytics on multi-dimensional arrays, rather than the tabular data for which -pandas excels. Our approach adopts the `Common Data Model`_ for self- -describing scientific data in widespread use in the Earth sciences: -``xarray.Dataset`` is an in-memory representation of a netCDF file. +Multi-dimensional (a.k.a. N-dimensional, ND) arrays (sometimes called +"tensors") are an essential part of computational science. +They are encountered in a wide range of fields, including physics, astronomy, +geoscience, bioinformatics, engineering, finance, and deep learning. +In Python, NumPy_ provides the fundamental data structure and API for +working with raw ND arrays. +However, real-world datasets are usually more than just raw numbers; +they have labels which encode information about how the array values map +to locations in space, time, etc. +By introducing *dimensions*, *coordinates*, and *attributes* on top of raw +NumPy-like arrays, xarray is able to understand these labels and use them to +provide a more intuitive, more concise, and less error-prone experience. 
+Xarray also provides a large and growing library of functions for advanced +analytics and visualization with these data structures. +Xarray was inspired by and borrows heavily from pandas_, the popular data +analysis package focused on labelled tabular data. +Xarray can read and write data from most common labeled ND-array storage +formats and is particularly tailored to working with netCDF_ files, which were +the source of xarray's data model. + +.. _NumPy: http://www.numpy.org/ .. _pandas: http://pandas.pydata.org -.. _Common Data Model: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM .. _netCDF: http://www.unidata.ucar.edu/software/netcdf -.. _OPeNDAP: http://www.opendap.org/ Important links --------------- diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3cd62e8264a..5f88783bb2e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,8 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import _auto_combine, _infer_concat_order_from_positions +from ..core.combine import ( + _CONCAT_DIM_DEFAULT, _auto_combine, _infer_concat_order_from_positions) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter @@ -483,9 +484,6 @@ def close(self): f.close() -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' - - def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', @@ -606,7 +604,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # Coerce 1D input into ND to maintain backwards-compatible API until API # for N-D combine decided # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746) - if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT: + if concat_dim is None or concat_dim is _CONCAT_DIM_DEFAULT: concat_dims = concat_dim elif not isinstance(concat_dim, list): concat_dims = [concat_dim] diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index d329f9e734f..d0efba86bf9 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -1,5 +1,6 @@ import contextlib import threading +from typing import Any, Dict import warnings from ..core import utils @@ -13,7 +14,7 @@ assert FILE_CACHE.maxsize, 'file cache must be at least size one' -REF_COUNTS = {} +REF_COUNTS = {} # type: Dict[Any, int] _DEFAULT_MODE = utils.ReprObject('') diff --git a/xarray/backends/locks.py b/xarray/backends/locks.py index 6c135fd1240..bca27a0bbc1 100644 --- a/xarray/backends/locks.py +++ b/xarray/backends/locks.py @@ -1,5 +1,6 @@ import multiprocessing import threading +from typing import Any, MutableMapping import weakref try: @@ -20,7 +21,7 @@ NETCDFC_LOCK = SerializableLock() -_FILE_LOCKS = weakref.WeakValueDictionary() +_FILE_LOCKS = weakref.WeakValueDictionary() # type: MutableMapping[Any, threading.Lock] # noqa def _get_threaded_lock(key): diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index f8e1cfa6718..98571c9a995 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -43,6 +43,7 @@ import re from datetime import timedelta from functools import partial +from typing import ClassVar, Optional import numpy as np @@ -74,7 +75,7 @@ def get_date_type(calendar): class BaseCFTimeOffset(object): - _freq = None + _freq = None # type: ClassVar[str] def 
__init__(self, n=1): if not isinstance(n, int): @@ -254,9 +255,9 @@ def onOffset(self, date): class YearOffset(BaseCFTimeOffset): - _freq = None - _day_option = None - _default_month = None + _freq = None # type: ClassVar[str] + _day_option = None # type: ClassVar[str] + _default_month = None # type: ClassVar[int] def __init__(self, n=1, month=None): BaseCFTimeOffset.__init__(self, n) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 0f2045cf356..c337a42e3b4 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -359,6 +359,11 @@ def encode_cf_datetime(dates, units=None, calendar=None): time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]') ref_date = pd.Timestamp(ref_date) + # If the ref_date Timestamp is timezone-aware, convert to UTC and + # make it timezone-naive (GH 2649). + if ref_date.tz is not None: + ref_date = ref_date.tz_convert(None) + # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272). diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index d5963b0e94f..d8453a95fad 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -1,6 +1,7 @@ """Coders for individual Variable objects.""" from __future__ import absolute_import, division, print_function +from typing import Any import warnings from functools import partial @@ -126,11 +127,12 @@ def pop_to(source, dest, key, name=None): return value -def _apply_mask(data, # type: np.ndarray - encoded_fill_values, # type: list - decoded_fill_value, # type: Any - dtype, # type: Any - ): # type: np.ndarray +def _apply_mask( + data: np.ndarray, + encoded_fill_values: list, + decoded_fill_value: Any, + dtype: Any, +) -> np.ndarray: """Mask all matching values in a NumPy arrays.""" data = np.asarray(data, dtype=dtype) condition = False diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 33902abaf3e..789bea90b55 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -31,7 +31,7 @@ def _get_joiner(join): raise ValueError('invalid value for join: %s' % join) -_DEFAULT_EXCLUDE = frozenset() +_DEFAULT_EXCLUDE = frozenset() # type: frozenset def align(*objects, **kwargs): diff --git a/xarray/core/combine.py b/xarray/core/combine.py index abb2e2c1306..e552d8d900c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -368,7 +368,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +_CONCAT_DIM_DEFAULT = utils.ReprObject('') def _infer_concat_order_from_positions(datasets, concat_dims): diff --git a/xarray/core/common.py b/xarray/core/common.py index 5b090bf0d2f..d272115f492 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -24,7 +24,7 @@ def wrapped_func(self, dim=None, axis=None, skipna=None, return self.reduce(func, dim, axis, skipna=skipna, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=None, axis=None, + def wrapped_func(self, dim=None, axis=None, # type: ignore **kwargs): return self.reduce(func, dim, axis, allow_lazy=True, **kwargs) @@ -56,7 +56,7 @@ def wrapped_func(self, dim=None, skipna=None, numeric_only=numeric_only, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=None, **kwargs): + def wrapped_func(self, dim=None, **kwargs): # type: ignore return self.reduce(func, dim, numeric_only=numeric_only, allow_lazy=True, 
**kwargs) @@ -590,6 +590,66 @@ def rolling(self, dim=None, min_periods=None, center=False, **dim_kwargs): return self._rolling_cls(self, dim, min_periods=min_periods, center=center) + def coarsen(self, dim=None, boundary='exact', side='left', + coord_func='mean', **dim_kwargs): + """ + Coarsen object. + + Parameters + ---------- + dim: dict, optional + Mapping from the dimension name to the window size. + dim : str + Name of the dimension to create the rolling iterator + along (e.g., `time`). + window : int + Size of the moving window. + boundary : 'exact' | 'trim' | 'pad' + If 'exact', a ValueError will be raised if dimension size is not a + multiple of the window size. If 'trim', the excess entries are + dropped. If 'pad', NA will be padded. + side : 'left' or 'right' or mapping from dimension to 'left' or 'right' + coord_func: function (name) that is applied to the coordintes, + or a mapping from coordinate name to function (name). + + Returns + ------- + Coarsen object (core.rolling.DataArrayCoarsen for DataArray, + core.rolling.DatasetCoarsen for Dataset.) + + Examples + -------- + Coarsen the long time series by averaging over every four days. + + >>> da = xr.DataArray(np.linspace(0, 364, num=364), + ... dims='time', + ... coords={'time': pd.date_range( + ... '15/12/1999', periods=364)}) + >>> da + + array([ 0. , 1.002755, 2.00551 , ..., 361.99449 , 362.997245, + 364. ]) + Coordinates: + * time (time) datetime64[ns] 1999-12-15 1999-12-16 ... 2000-12-12 + >>> + >>> da.coarsen(time=3, boundary='trim').mean() + + array([ 1.002755, 4.011019, 7.019284, ..., 358.986226, + 361.99449 ]) + Coordinates: + * time (time) datetime64[ns] 1999-12-16 1999-12-19 ... 2000-12-10 + >>> + + See Also + -------- + core.rolling.DataArrayCoarsen + core.rolling.DatasetCoarsen + """ + dim = either_dict_or_kwargs(dim, dim_kwargs, 'coarsen') + return self._coarsen_cls( + self, dim, boundary=boundary, side=side, + coord_func=coord_func) + def resample(self, indexer=None, skipna=None, closed=None, label=None, base=0, keep_attrs=None, loffset=None, **indexer_kwargs): """Returns a Resample object for performing resampling operations. @@ -806,7 +866,7 @@ def close(self): self._file_obj = None def isin(self, test_elements): - """Tests each value in the array for whether it is in the supplied list. + """Tests each value in the array for whether it is in test elements. 
Parameters ---------- diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 7998cc4f72f..bf9ab56bbb4 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -8,6 +8,10 @@ import operator from collections import Counter from distutils.version import LooseVersion +from typing import ( + AbstractSet, Any, Dict, Iterable, List, Mapping, Union, Tuple, + TYPE_CHECKING, TypeVar +) import numpy as np @@ -16,8 +20,11 @@ from .merge import expand_and_merge_variables from .pycompat import OrderedDict, basestring, dask_array_type from .utils import is_dict_like +from .variable import Variable +if TYPE_CHECKING: + from .dataset import Dataset -_DEFAULT_FROZEN_SET = frozenset() +_DEFAULT_FROZEN_SET = frozenset() # type: frozenset _NO_FILL_VALUE = utils.ReprObject('') _DEFAULT_NAME = utils.ReprObject('') _JOINS_WITHOUT_FILL_VALUES = frozenset({'inner', 'exact'}) @@ -111,8 +118,7 @@ def to_gufunc_string(self): return str(alt_signature) -def result_name(objects): - # type: List[object] -> Any +def result_name(objects: list) -> Any: # use the same naming heuristics as pandas: # https://github.com/blaze/blaze/issues/458#issuecomment-51936356 names = {getattr(obj, 'name', _DEFAULT_NAME) for obj in objects} @@ -138,10 +144,10 @@ def _get_coord_variables(args): def build_output_coords( - args, # type: list - signature, # type: _UFuncSignature - exclude_dims=frozenset(), # type: set -): + args: list, + signature: _UFuncSignature, + exclude_dims: AbstractSet = frozenset(), +) -> 'List[OrderedDict[Any, Variable]]': """Build output coordinates for an operation. Parameters @@ -159,7 +165,6 @@ def build_output_coords( ------- OrderedDict of Variable objects with merged coordinates. """ - # type: (...) -> List[OrderedDict[Any, Variable]] input_coords = _get_coord_variables(args) if exclude_dims: @@ -220,8 +225,7 @@ def apply_dataarray_ufunc(func, *args, **kwargs): return out -def ordered_set_union(all_keys): - # type: List[Iterable] -> Iterable +def ordered_set_union(all_keys: List[Iterable]) -> Iterable: result_dict = OrderedDict() for keys in all_keys: for key in keys: @@ -229,8 +233,7 @@ def ordered_set_union(all_keys): return result_dict.keys() -def ordered_set_intersection(all_keys): - # type: List[Iterable] -> Iterable +def ordered_set_intersection(all_keys: List[Iterable]) -> Iterable: intersection = set(all_keys[0]) for keys in all_keys[1:]: intersection.intersection_update(keys) @@ -284,9 +287,9 @@ def _as_variables_or_variable(arg): def _unpack_dict_tuples( result_vars, # type: Mapping[Any, Tuple[Variable]] - num_outputs, # type: int + num_outputs, # type: int ): - # type: (...) -> Tuple[Dict[Any, Variable]] + # type: (...) -> Tuple[Dict[Any, Variable], ...] 
out = tuple(OrderedDict() for _ in range(num_outputs)) for name, values in result_vars.items(): for value, results_dict in zip(values, out): @@ -438,8 +441,11 @@ def apply_groupby_ufunc(func, *args): return combined -def unified_dim_sizes(variables, exclude_dims=frozenset()): - # type: Iterable[Variable] -> OrderedDict[Any, int] +def unified_dim_sizes( + variables: Iterable[Variable], + exclude_dims: AbstractSet = frozenset(), +) -> 'OrderedDict[Any, int]': + dim_sizes = OrderedDict() for var in variables: @@ -460,11 +466,9 @@ def unified_dim_sizes(variables, exclude_dims=frozenset()): SLICE_NONE = slice(None) -# A = TypeVar('A', numpy.ndarray, dask.array.Array) - def broadcast_compat_data(variable, broadcast_dims, core_dims): - # type: (Variable[A], tuple, tuple) -> A + # type: (Variable, tuple, tuple) -> Any data = variable.data old_dims = variable.dims diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index efe8affb2a3..820937dae6a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -196,6 +196,7 @@ def _update_coords(self, coords): self._data._variables = variables self._data._coord_names.update(new_coord_names) self._data._dims = dict(dims) + self._data._indexes = None def __delitem__(self, key): if key in self: @@ -276,44 +277,6 @@ def __iter__(self): return iter(self._data._level_coords) -class Indexes(Mapping, formatting.ReprMixin): - """Ordered Mapping[str, pandas.Index] for xarray objects. - """ - - def __init__(self, variables, sizes): - """Not for public consumption. - - Parameters - ---------- - variables : OrderedDict[Any, Variable] - Reference to OrderedDict holding variable objects. Should be the - same dictionary used by the source object. - sizes : OrderedDict[Any, int] - Map from dimension names to sizes. - """ - self._variables = variables - self._sizes = sizes - - def __iter__(self): - for key in self._sizes: - if key in self._variables: - yield key - - def __len__(self): - return sum(key in self._variables for key in self._sizes) - - def __contains__(self, key): - return key in self._sizes and key in self._variables - - def __getitem__(self, key): - if key not in self._sizes: - raise KeyError(key) - return self._variables[key].to_index() - - def __unicode__(self): - return formatting.indexes_repr(self) - - def assert_coordinate_consistent(obj, coords): """ Maeke sure the dimension coordinate of obj is consistent with coords. 
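The ``Indexes`` class deleted from ``xarray/core/coordinates.py`` in the hunk above is not dropped for good: later in this diff it reappears, in simplified form, as an immutable mapping proxy in the new file ``xarray/core/indexes.py``, and ``Dataset.indexes`` / ``DataArray.indexes`` then build the underlying dict lazily via ``default_indexes`` and cache it on ``self._indexes``. Below is a minimal standalone sketch of that pattern; the names ``FrozenIndexes`` and ``build_default_indexes`` are illustrative stand-ins rather than xarray's actual API, and the real ``default_indexes`` calls ``coords[key].to_index()`` on Variable objects instead of constructing ``pandas.Index`` directly:

    from collections import OrderedDict
    from collections.abc import Mapping

    import pandas as pd


    class FrozenIndexes(Mapping):
        """Read-only view over an already-built dict of pandas.Index objects,
        in the spirit of the new xarray.core.indexes.Indexes."""

        def __init__(self, indexes):
            self._indexes = indexes

        def __getitem__(self, key):
            return self._indexes[key]

        def __iter__(self):
            return iter(self._indexes)

        def __len__(self):
            return len(self._indexes)


    def build_default_indexes(coords, dims):
        """Build an index for every dimension that has a coordinate,
        mirroring what xarray.core.indexes.default_indexes does."""
        return OrderedDict((dim, pd.Index(coords[dim]))
                           for dim in dims if dim in coords)


    # Indexes are computed once and then reused; Dataset/DataArray cache the
    # dict on self._indexes so repeated .indexes access stays cheap.
    coords = {'time': pd.date_range('2000-01-01', periods=3), 'x': [10, 20, 30]}
    indexes = FrozenIndexes(build_default_indexes(coords, dims=('time', 'x', 'y')))
    print(list(indexes))       # ['time', 'x']: 'y' has no coordinate, so no index
    print(indexes['time'][0])  # Timestamp('2000-01-01 00:00:00')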
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 25a66e529ae..f27958b1c77 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -13,10 +13,11 @@ from .alignment import align, reindex_like_indexers from .common import AbstractArray, DataWithCoords from .coordinates import ( - DataArrayCoordinates, Indexes, LevelCoordinatesSource, + DataArrayCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers) from .dataset import Dataset, merge_indexes, split_indexes from .formatting import format_item +from .indexes import default_indexes, Indexes from .options import OPTIONS from .pycompat import OrderedDict, basestring, iteritems, range, zip from .utils import ( @@ -160,12 +161,13 @@ class DataArray(AbstractArray, DataWithCoords): """ _groupby_cls = groupby.DataArrayGroupBy _rolling_cls = rolling.DataArrayRolling + _coarsen_cls = rolling.DataArrayCoarsen _resample_cls = resample.DataArrayResample dt = property(DatetimeAccessor) def __init__(self, data, coords=None, dims=None, name=None, - attrs=None, encoding=None, fastpath=False): + attrs=None, encoding=None, indexes=None, fastpath=False): """ Parameters ---------- @@ -237,6 +239,10 @@ def __init__(self, data, coords=None, dims=None, name=None, self._coords = coords self._name = name + # TODO(shoyer): document this argument, once it becomes part of the + # public interface. + self._indexes = indexes + self._file_obj = None self._initialized = True @@ -534,9 +540,11 @@ def encoding(self, value): @property def indexes(self): - """OrderedDict of pandas.Index objects used for label based indexing + """Mapping of pandas.Index objects used for label based indexing """ - return Indexes(self._coords, self.sizes) + if self._indexes is None: + self._indexes = default_indexes(self._coords, self.dims) + return Indexes(self._indexes) @property def coords(self): @@ -763,7 +771,8 @@ def __deepcopy__(self, memo=None): return self.copy(deep=True) # mutable objects should not be hashable - __hash__ = None + # https://github.com/python/mypy/issues/4266 + __hash__ = None # type: ignore @property def chunks(self): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5ffc47cd17e..ef73f3cef12 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6,6 +6,7 @@ from collections import Mapping, defaultdict from distutils.version import LooseVersion from numbers import Number +from typing import Any, Dict, List, Set, Tuple, Union import numpy as np import pandas as pd @@ -13,16 +14,17 @@ import xarray as xr from . import ( - alignment, dtypes, duck_array_ops, formatting, groupby, indexing, ops, - pdcompat, resample, rolling, utils) + alignment, dtypes, duck_array_ops, formatting, groupby, + indexing, ops, pdcompat, resample, rolling, utils) from ..coding.cftimeindex import _parse_array_of_cftime_strings from .alignment import align from .common import ( ALL_DIMS, DataWithCoords, ImplementsDatasetReduce, _contains_datetime_like_objects) from .coordinates import ( - DatasetCoordinates, Indexes, LevelCoordinatesSource, + DatasetCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers) +from .indexes import Indexes, default_indexes from .merge import ( dataset_merge_method, dataset_update_method, merge_data_and_coords, merge_variables) @@ -124,14 +126,14 @@ def merge_indexes( Not public API. Used in Dataset and DataArray set_index methods. 
""" - vars_to_replace = {} - vars_to_remove = [] + vars_to_replace = {} # Dict[Any, Variable] + vars_to_remove = [] # type: list for dim, var_names in indexes.items(): if isinstance(var_names, basestring): var_names = [var_names] - names, labels, levels = [], [], [] + names, labels, levels = [], [], [] # type: (list, list, list) current_index_variable = variables.get(dim) for n in var_names: @@ -195,7 +197,7 @@ def split_indexes( if isinstance(dims_or_levels, basestring): dims_or_levels = [dims_or_levels] - dim_levels = defaultdict(list) + dim_levels = defaultdict(list) # type: Dict[Any, list] dims = [] for k in dims_or_levels: if k in level_coords: @@ -317,6 +319,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords, """ _groupby_cls = groupby.DatasetGroupBy _rolling_cls = rolling.DatasetRolling + _coarsen_cls = rolling.DatasetCoarsen _resample_cls = resample.DatasetResample def __init__(self, data_vars=None, coords=None, attrs=None, @@ -365,6 +368,10 @@ def __init__(self, data_vars=None, coords=None, attrs=None, coords = {} if data_vars is not None or coords is not None: self._set_init_vars_and_dims(data_vars, coords, compat) + + # TODO(shoyer): expose indexes as a public argument in __init__ + self._indexes = None + if attrs is not None: self.attrs = attrs self._encoding = None @@ -643,7 +650,7 @@ def persist(self, **kwargs): @classmethod def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, - file_obj=None, encoding=None): + indexes=None, file_obj=None, encoding=None): """Shortcut around __init__ for internal use when we want to skip costly validation """ @@ -651,6 +658,7 @@ def _construct_direct(cls, variables, coord_names, dims=None, attrs=None, obj._variables = variables obj._coord_names = coord_names obj._dims = dims + obj._indexes = indexes obj._attrs = attrs obj._file_obj = file_obj obj._encoding = encoding @@ -665,7 +673,8 @@ def _from_vars_and_coord_names(cls, variables, coord_names, attrs=None): return cls._construct_direct(variables, coord_names, dims, attrs) def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, - attrs=__default_attrs, inplace=False): + attrs=__default_attrs, indexes=None, + inplace=False): """Fastpath constructor for internal use. Preserves coord names and attributes. 
If not provided explicitly, @@ -694,13 +703,15 @@ def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, self._coord_names = coord_names if attrs is not self.__default_attrs: self._attrs = attrs + self._indexes = indexes obj = self else: if coord_names is None: coord_names = self._coord_names.copy() if attrs is self.__default_attrs: attrs = self._attrs_copy() - obj = self._construct_direct(variables, coord_names, dims, attrs) + obj = self._construct_direct( + variables, coord_names, dims, attrs, indexes) return obj def _replace_indexes(self, indexes): @@ -996,7 +1007,8 @@ def __delitem__(self, key): self._coord_names.discard(key) # mutable objects should not be hashable - __hash__ = None + # https://github.com/python/mypy/issues/4266 + __hash__ = None # type: ignore def _all_compat(self, other, compat_str): """Helper function for equals and identical""" @@ -1065,9 +1077,11 @@ def identical(self, other): @property def indexes(self): - """OrderedDict of pandas.Index objects used for label based indexing + """Mapping of pandas.Index objects used for label based indexing """ - return Indexes(self._variables, self._dims) + if self._indexes is None: + self._indexes = default_indexes(self._variables, self._dims) + return Indexes(self._indexes) @property def coords(self): @@ -1078,7 +1092,7 @@ def coords(self): @property def data_vars(self): - """Dictionary of xarray.DataArray objects corresponding to data variables + """Dictionary of DataArray objects corresponding to data variables """ return DataVariables(self) @@ -1672,7 +1686,8 @@ def relevant_keys(mapping): if any(d in indexer_dims for d in v.dims)] coords = relevant_keys(self.coords) - indexers = [(k, np.asarray(v)) for k, v in iteritems(indexers)] + indexers = [(k, np.asarray(v)) # type: ignore + for k, v in iteritems(indexers)] indexers_dict = dict(indexers) non_indexed_dims = set(self.dims) - indexer_dims non_indexed_coords = set(self.coords) - set(coords) @@ -1682,9 +1697,9 @@ def relevant_keys(mapping): for k, v in indexers: if k not in self.dims: raise ValueError("dimension %s does not exist" % k) - if v.dtype.kind != 'i': + if v.dtype.kind != 'i': # type: ignore raise TypeError('Indexers must be integers') - if v.ndim != 1: + if v.ndim != 1: # type: ignore raise ValueError('Indexers must be 1 dimensional') # all the indexers should have the same length @@ -2172,8 +2187,8 @@ def swap_dims(self, dims_dict, inplace=None): inplace=inplace) def expand_dims(self, dim, axis=None): - """Return a new object with an additional axis (or axes) inserted at the - corresponding position in the array shape. + """Return a new object with an additional axis (or axes) inserted at + the corresponding position in the array shape. If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. @@ -2257,8 +2272,8 @@ def expand_dims(self, dim, axis=None): def set_index(self, indexes=None, append=False, inplace=None, **indexes_kwargs): - """Set Dataset (multi-)indexes using one or more existing coordinates or - variables. + """Set Dataset (multi-)indexes using one or more existing coordinates + or variables. 
Parameters ---------- diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index a2f11728b4d..00ff7958183 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -42,29 +42,6 @@ def __eq__(self, other): ] -@functools.total_ordering -class AlwaysGreaterThan(object): - def __gt__(self, other): - return True - - def __eq__(self, other): - return isinstance(other, type(self)) - - -@functools.total_ordering -class AlwaysLessThan(object): - def __lt__(self, other): - return True - - def __eq__(self, other): - return isinstance(other, type(self)) - - -# Equivalence to np.inf (-np.inf) for object-type -INF = AlwaysGreaterThan() -NINF = AlwaysLessThan() - - def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ef89dba2ab8..54fd8881a56 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -6,14 +6,14 @@ from __future__ import absolute_import, division, print_function import contextlib +from functools import partial import inspect import warnings -from functools import partial import numpy as np import pandas as pd -from . import dask_array_ops, dtypes, npcompat, nputils +from . import dask_array_ops, dtypes, npcompat, nputils, utils from .nputils import nanfirst, nanlast from .pycompat import dask_array_type @@ -21,8 +21,8 @@ import dask.array as dask_array from . import dask_array_compat except ImportError: - dask_array = None - dask_array_compat = None + dask_array = None # type: ignore + dask_array_compat = None # type: ignore def _dask_or_eager_func(name, eager_module=np, dask_module=dask_array, @@ -43,10 +43,10 @@ def f(*args, **kwargs): (e, requires_dask)) else: wrapped = getattr(eager_module, name) - return wrapped(*args, ** kwargs) + return wrapped(*args, **kwargs) else: - def f(data, *args, **kwargs): - return getattr(eager_module, name)(data, *args, **kwargs) + def f(*args, **kwargs): + return getattr(eager_module, name)(*args, **kwargs) return f @@ -261,8 +261,6 @@ def f(values, axis=None, skipna=None, **kwargs): sum = _create_nan_agg_method('sum') sum.numeric_only = True sum.available_min_count = True -mean = _create_nan_agg_method('mean') -mean.numeric_only = True std = _create_nan_agg_method('std') std.numeric_only = True var = _create_nan_agg_method('var') @@ -278,6 +276,25 @@ def f(values, axis=None, skipna=None, **kwargs): cumsum_1d.numeric_only = True +_mean = _create_nan_agg_method('mean') + + +def mean(array, axis=None, skipna=None, **kwargs): + """ inhouse mean that can handle datatime dtype """ + array = asarray(array) + if array.dtype.kind == 'M': + offset = min(array) + # xarray always uses datetime[ns] for datetime + dtype = 'timedelta64[ns]' + return _mean(utils.datetime_to_numeric(array, offset), axis=axis, + skipna=skipna, **kwargs).astype(dtype) + offset + else: + return _mean(array, axis=axis, skipna=skipna, **kwargs) + + +mean.numeric_only = True + + def _nd_cum_func(cum_func, array, axis, **kwargs): array = asarray(array) if axis is None: diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index ec8329d6805..aa8ced5adab 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -630,7 +630,7 @@ def wrapped_func(self, dim=DEFAULT_DIMS, axis=None, skipna=None, return self.reduce(func, dim, axis, keep_attrs=keep_attrs, skipna=skipna, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=DEFAULT_DIMS, axis=None, + def wrapped_func(self, dim=DEFAULT_DIMS, axis=None, # type: ignore 
keep_attrs=None, **kwargs): return self.reduce(func, dim, axis, keep_attrs=keep_attrs, allow_lazy=True, **kwargs) @@ -748,7 +748,7 @@ def wrapped_func(self, dim=DEFAULT_DIMS, skipna=skipna, numeric_only=numeric_only, allow_lazy=True, **kwargs) else: - def wrapped_func(self, dim=DEFAULT_DIMS, + def wrapped_func(self, dim=DEFAULT_DIMS, # type: ignore **kwargs): return self.reduce(func, dim, numeric_only=numeric_only, allow_lazy=True, diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py new file mode 100644 index 00000000000..ffa483fc370 --- /dev/null +++ b/xarray/core/indexes.py @@ -0,0 +1,55 @@ +from __future__ import absolute_import, division, print_function +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping +from collections import OrderedDict + +from . import formatting + + +class Indexes(Mapping, formatting.ReprMixin): + """Immutable proxy for Dataset or DataArrary indexes.""" + def __init__(self, indexes): + """Not for public consumption. + + Parameters + ---------- + indexes : Dict[Any, pandas.Index] + Indexes held by this object. + """ + self._indexes = indexes + + def __iter__(self): + return iter(self._indexes) + + def __len__(self): + return len(self._indexes) + + def __contains__(self, key): + return key in self._indexes + + def __getitem__(self, key): + return self._indexes[key] + + def __unicode__(self): + return formatting.indexes_repr(self) + + +def default_indexes(coords, dims): + """Default indexes for a Dataset/DataArray. + + Parameters + ---------- + coords : Mapping[Any, xarray.Variable] + Coordinate variables from which to draw default indexes. + dims : iterable + Iterable of dimension names. + + Returns + ------- + Mapping[Any, pandas.Index] mapping indexing keys (levels/dimension names) + to indexes used for indexing along that dimension. 
+ """ + return OrderedDict((key, coords[key].to_index()) + for key in dims if key in coords) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 2a5e7acbb25..637a9cbda7f 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -1,11 +1,18 @@ from __future__ import absolute_import, division, print_function +from typing import ( + Any, Dict, List, Mapping, Optional, Set, Tuple, TYPE_CHECKING, Union, +) + import pandas as pd from .alignment import deep_align from .pycompat import OrderedDict, basestring from .utils import Frozen -from .variable import as_variable, assert_unique_multiindex_level_names +from .variable import ( + Variable, as_variable, assert_unique_multiindex_level_names) +if TYPE_CHECKING: + from .dataset import Dataset PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel) @@ -145,13 +152,13 @@ def merge_variables( # variables appear merged = OrderedDict() - for name, variables in lookup.items(): + for name, var_list in lookup.items(): if name in priority_vars: # one of these arguments (e.g., the first for in-place arithmetic # or the second for Dataset.update) takes priority merged[name] = priority_vars[name] else: - dim_variables = [var for var in variables if (name,) == var.dims] + dim_variables = [var for var in var_list if (name,) == var.dims] if dim_variables: # if there are dimension coordinates, these must be equal (or # identical), and they take priority over non-dimension @@ -159,7 +166,7 @@ def merge_variables( merged[name] = unique_variable(name, dim_variables, dim_compat) else: try: - merged[name] = unique_variable(name, variables, compat) + merged[name] = unique_variable(name, var_list, compat) except MergeError: if compat != 'minimal': # we need more than "minimal" compatibility (for which @@ -236,8 +243,8 @@ def determine_coords(list_of_variable_dicts): from .dataarray import DataArray from .dataset import Dataset - coord_names = set() - noncoord_names = set() + coord_names = set() # type: set + noncoord_names = set() # type: set for variables in list_of_variable_dicts: if isinstance(variables, Dataset): diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 3f4e0fc3ac9..ff0e63801bc 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -3,11 +3,12 @@ import warnings from collections import Iterable from functools import partial +from typing import Any, Dict import numpy as np import pandas as pd -from . import rolling +from . import utils from .common import _contains_datetime_like_objects from .computation import apply_ufunc from .duck_array_ops import dask_array_type @@ -18,8 +19,8 @@ class BaseInterpolator(object): '''gerneric interpolator class for normalizing interpolation methods''' - cons_kwargs = {} - call_kwargs = {} + cons_kwargs = {} # type: Dict[str, Any] + call_kwargs = {} # type: Dict[str, Any] f = None method = None @@ -370,7 +371,7 @@ def _get_valid_fill_mask(arr, dim, limit): None''' kw = {dim: limit + 1} # we explicitly use construct method to avoid copy. - new_dim = rolling._get_new_dimname(arr.dims, '_window') + new_dim = utils.get_temp_dimname(arr.dims, '_window') return (arr.isnull().rolling(min_periods=1, **kw) .construct(new_dim, fill_value=False) .sum(new_dim, skipna=False)) <= limit diff --git a/xarray/core/ops.py b/xarray/core/ops.py index a0dd2212a8f..272a4eaf2f1 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -122,6 +122,20 @@ New {da_or_ds} object with `{name}` applied along its rolling dimnension. 
""" +_COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\ +Coarsen this object by applying `{name}` along its dimensions. + +Parameters +---------- +**kwargs : dict + Additional keyword arguments passed on to `{name}`. + +Returns +------- +reduced : DataArray or Dataset + New object with `{name}` applied along its coasen dimnensions. +""" + def fillna(data, other, join="left", dataset_join="left"): """Fill missing values in this object with data from the other object. @@ -378,3 +392,15 @@ def inject_datasetrolling_methods(cls): func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format( name=func.__name__, da_or_ds='Dataset') setattr(cls, 'count', func) + + +def inject_coarsen_methods(cls): + # standard numpy reduce methods + methods = [(name, getattr(duck_array_ops, name)) + for name in NAN_REDUCE_METHODS] + for name, f in methods: + func = cls._reduce_method(f) + func.__name__ = name + func.__doc__ = _COARSEN_REDUCE_DOCSTRING_TEMPLATE.format( + name=func.__name__) + setattr(cls, name, func) diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index b980bc279b0..67921b5d145 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -128,22 +128,12 @@ def __exit__(self, exctype, excinst, exctb): # Inspired by discussions on http://bugs.python.org/issue13585 class ExitStack(object): """Context manager for dynamic management of a stack of exit callbacks - - For example: - - with ExitStack() as stack: - files = [stack.enter_context(open(fname)) for fname in filenames] - # All opened files will automatically be closed at the end of - # the with statement, even if attempts to open files later - # in the list raise an exception - """ def __init__(self): self._exit_callbacks = deque() def pop_all(self): - """Preserve the context stack by transferring it to a new instance""" new_stack = type(self)() new_stack._exit_callbacks = self._exit_callbacks self._exit_callbacks = deque() diff --git a/xarray/core/resample.py b/xarray/core/resample.py index 49351efc70f..886303db345 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -2,7 +2,6 @@ from . import ops from .groupby import DEFAULT_DIMS, DataArrayGroupBy, DatasetGroupBy -from .pycompat import OrderedDict, dask_array_type RESAMPLE_DIM = '__resample_dim__' @@ -110,7 +109,16 @@ def interpolate(self, kind='linear'): return self._interpolate(kind=kind) def _interpolate(self, kind='linear'): - raise NotImplementedError + """Apply scipy.interpolate.interp1d along resampling dimension.""" + # drop any existing non-dimension coordinates along the resampling + # dimension + dummy = self._obj.copy() + for k, v in self._obj.coords.items(): + if k != self._dim and self._dim in v.dims: + dummy = dummy.drop(k) + return dummy.interp(assume_sorted=True, method=kind, + kwargs={'bounds_error': False}, + **{self._dim: self._full_index}) class DataArrayResample(DataArrayGroupBy, Resample): @@ -182,46 +190,6 @@ def apply(self, func, shortcut=False, args=(), **kwargs): return combined - def _interpolate(self, kind='linear'): - """Apply scipy.interpolate.interp1d along resampling dimension.""" - from .dataarray import DataArray - from scipy.interpolate import interp1d - - if isinstance(self._obj.data, dask_array_type): - raise TypeError( - "Up-sampling via interpolation was attempted on the the " - "variable '{}', but it is a dask array; dask arrays are not " - "yet supported in resample.interpolate(). Load into " - "memory with Dataset.load() before resampling." 
- .format(self._obj.data.name) - ) - - x = self._obj[self._dim].astype('float') - y = self._obj.data - - axis = self._obj.get_axis_num(self._dim) - - f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True, - assume_sorted=True) - new_x = self._full_index.values.astype('float') - - # construct new up-sampled DataArray - dummy = self._obj.copy() - dims = dummy.dims - - # drop any existing non-dimension coordinates along the resampling - # dimension - coords = OrderedDict() - for k, v in dummy.coords.items(): - # is the resampling dimension - if k == self._dim: - coords[self._dim] = self._full_index - # else, check if resampling dim is in coordinate dimensions - elif self._dim not in v.dims: - coords[k] = v - return DataArray(f(new_x), coords, dims, name=dummy.name, - attrs=dummy.attrs) - ops.inject_reduce_methods(DataArrayResample) ops.inject_binary_ops(DataArrayResample) @@ -308,50 +276,6 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): return super(DatasetResample, self).reduce( func, dim, keep_attrs, **kwargs) - def _interpolate(self, kind='linear'): - """Apply scipy.interpolate.interp1d along resampling dimension.""" - from .dataset import Dataset - from .variable import Variable - from scipy.interpolate import interp1d - - old_times = self._obj[self._dim].astype(float) - new_times = self._full_index.values.astype(float) - - data_vars = OrderedDict() - coords = OrderedDict() - - # Apply the interpolation to each DataArray in our original Dataset - for name, variable in self._obj.variables.items(): - if name in self._obj.coords: - if name == self._dim: - coords[self._dim] = self._full_index - elif self._dim not in variable.dims: - coords[name] = variable - else: - if isinstance(variable.data, dask_array_type): - raise TypeError( - "Up-sampling via interpolation was attempted on the " - "variable '{}', but it is a dask array; dask arrays " - "are not yet supprted in resample.interpolate(). Load " - "into memory with Dataset.load() before resampling." - .format(name) - ) - - axis = variable.get_axis_num(self._dim) - - # We've previously checked for monotonicity along the - # re-sampling dimension (in __init__ via the GroupBy - # constructor), so we can avoid sorting the data again by - # passing 'assume_sorted=True' - f = interp1d(old_times, variable.data, kind=kind, - axis=axis, bounds_error=True, - assume_sorted=True) - interpolated = Variable(variable.dims, f(new_times)) - - data_vars[name] = interpolated - - return Dataset(data_vars, coords) - ops.inject_reduce_methods(DatasetResample) ops.inject_binary_ops(DatasetResample) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 09b632e47a6..57463ef5987 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -5,32 +5,14 @@ import numpy as np -from . import dtypes +from . import dtypes, duck_array_ops, utils from .dask_array_ops import dask_rolling_wrapper from .ops import ( - bn, has_bottleneck, inject_bottleneck_rolling_methods, - inject_datasetrolling_methods) + bn, has_bottleneck, inject_coarsen_methods, + inject_bottleneck_rolling_methods, inject_datasetrolling_methods) from .pycompat import OrderedDict, dask_array_type, zip -def _get_new_dimname(dims, new_dim): - """ Get an new dimension name based on new_dim, that is not used in dims. - If the same name exists, we add an underscore(s) in the head. 
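With _interpolate consolidated on the base Resample class, up-sampling goes through DataArray.interp / Dataset.interp instead of the hand-rolled scipy.interpolate.interp1d code removed from the two subclasses here. A small usage sketch, assuming scipy is available; the data is made up:

    import numpy as np
    import pandas as pd
    import xarray as xr

    times = pd.date_range('2000-01-01', freq='6H', periods=5)
    da = xr.DataArray(np.arange(5) ** 2, coords={'time': times}, dims='time')

    # Up-sample from 6-hourly to hourly values. Non-dimension coordinates
    # along 'time' are dropped before interpolating, as in _interpolate above.
    hourly = da.resample(time='1H').interpolate('linear')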
- - Example1: - dims: ['a', 'b', 'c'] - new_dim: ['_rolling'] - -> ['_rolling'] - Example2: - dims: ['a', 'b', 'c', '_rolling'] - new_dim: ['_rolling'] - -> ['__rolling'] - """ - while new_dim in dims: - new_dim = '_' + new_dim - return new_dim - - class Rolling(object): """A object that implements the moving window pattern. @@ -231,7 +213,7 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ - rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim') + rolling_dim = utils.get_temp_dimname(self.obj.dims, '_rolling_dim') windows = self.construct(rolling_dim) result = windows.reduce(func, dim=rolling_dim, **kwargs) @@ -242,7 +224,7 @@ def reduce(self, func, **kwargs): def _counts(self): """ Number of non-nan entries in each rolling window. """ - rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim') + rolling_dim = utils.get_temp_dimname(self.obj.dims, '_rolling_dim') # We use False as the fill_value instead of np.nan, since boolean # array is faster to be reduced than object array. # The use of skipna==False is also faster since it does not need to @@ -454,5 +436,121 @@ def construct(self, window_dim, stride=1, fill_value=dtypes.NA): **{self.dim: slice(None, None, stride)}) +class Coarsen(object): + """A object that implements the coarsen. + + See Also + -------- + Dataset.coarsen + DataArray.coarsen + """ + + _attributes = ['windows', 'side', 'trim_excess'] + + def __init__(self, obj, windows, boundary, side, coord_func): + """ + Moving window object. + + Parameters + ---------- + obj : Dataset or DataArray + Object to window. + windows : A mapping from a dimension name to window size + dim : str + Name of the dimension to create the rolling iterator + along (e.g., `time`). + window : int + Size of the moving window. + boundary : 'exact' | 'trim' | 'pad' + If 'exact', a ValueError will be raised if dimension size is not a + multiple of window size. If 'trim', the excess indexes are trimed. + If 'pad', NA will be padded. + side : 'left' or 'right' or mapping from dimension to 'left' or 'right' + coord_func: mapping from coordinate name to func. + + Returns + ------- + coarsen + """ + self.obj = obj + self.windows = windows + self.side = side + self.boundary = boundary + + if not utils.is_dict_like(coord_func): + coord_func = {d: coord_func for d in self.obj.dims} + for c in self.obj.coords: + if c not in coord_func: + coord_func[c] = duck_array_ops.mean + self.coord_func = coord_func + + def __repr__(self): + """provide a nice str repr of our coarsen object""" + + attrs = ["{k}->{v}".format(k=k, v=getattr(self, k)) + for k in self._attributes + if getattr(self, k, None) is not None] + return "{klass} [{attrs}]".format(klass=self.__class__.__name__, + attrs=','.join(attrs)) + + +class DataArrayCoarsen(Coarsen): + @classmethod + def _reduce_method(cls, func): + """ + Return a wrapped function for injecting numpy methods. 
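For readers new to the injection pattern referenced in this docstring: ops.inject_coarsen_methods walks the numpy-style reduction names and attaches one wrapped method per name to the class, so none of the reductions are written out by hand. A stripped-down, generic sketch of that mechanism; Summarizer and inject_methods are illustrative names, not part of the patch:

    import numpy as np

    class Summarizer(object):
        def __init__(self, values):
            self.values = np.asarray(values)

        @classmethod
        def _reduce_method(cls, func):
            # Wrap a numpy reduction so it can be attached as a method.
            def wrapped(self, **kwargs):
                return func(self.values, **kwargs)
            return wrapped

    def inject_methods(cls, names=('mean', 'max', 'sum')):
        for name in names:
            func = cls._reduce_method(getattr(np, name))
            func.__name__ = name
            setattr(cls, name, func)

    inject_methods(Summarizer)
    assert Summarizer([1, 2, 3]).mean() == 2.0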
+ see ops.inject_coarsen_methods + """ + def wrapped_func(self, **kwargs): + from .dataarray import DataArray + + reduced = self.obj.variable.coarsen( + self.windows, func, self.boundary, self.side) + coords = {} + for c, v in self.obj.coords.items(): + if c == self.obj.name: + coords[c] = reduced + else: + if any(d in self.windows for d in v.dims): + coords[c] = v.variable.coarsen( + self.windows, self.coord_func[c], + self.boundary, self.side) + else: + coords[c] = v + return DataArray(reduced, dims=self.obj.dims, coords=coords) + + return wrapped_func + + +class DatasetCoarsen(Coarsen): + @classmethod + def _reduce_method(cls, func): + """ + Return a wrapped function for injecting numpy methods. + see ops.inject_coarsen_methods + """ + def wrapped_func(self, **kwargs): + from .dataset import Dataset + + reduced = OrderedDict() + for key, da in self.obj.data_vars.items(): + reduced[key] = da.variable.coarsen( + self.windows, func, self.boundary, self.side) + + coords = {} + for c, v in self.obj.coords.items(): + if any(d in self.windows for d in v.dims): + coords[c] = v.variable.coarsen( + self.windows, self.coord_func[c], + self.boundary, self.side) + else: + coords[c] = v.variable + return Dataset(reduced, coords=coords) + + return wrapped_func + + inject_bottleneck_rolling_methods(DataArrayRolling) inject_datasetrolling_methods(DatasetRolling) +inject_coarsen_methods(DataArrayCoarsen) +inject_coarsen_methods(DatasetCoarsen) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index fbda658c23f..085eaaa5ed1 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -488,7 +488,7 @@ def __repr__(self): class ReprObject(object): """Object that prints as the given value, for use with sentinel values.""" - def __init__(self, value): # type: str + def __init__(self, value: str): self._value = value def __repr__(self): @@ -622,10 +622,36 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): ------- array """ + from . import duck_array_ops + if offset is None: offset = array.min() array = array - offset if datetime_unit: - return (array / np.timedelta64(1, datetime_unit)).astype(dtype) - return array.astype(dtype) + array = array / np.timedelta64(1, datetime_unit) + # convert np.NaT to np.nan + if array.dtype.kind in 'mM': + if hasattr(array, 'isnull'): + return np.where(array.isnull(), np.nan, array.astype(dtype)) + return np.where(duck_array_ops.isnull(array), np.nan, + array.astype(dtype)) + return array + + +def get_temp_dimname(dims, new_dim): + """ Get an new dimension name based on new_dim, that is not used in dims. + If the same name exists, we add an underscore(s) in the head. 
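The datetime_to_numeric change above is about NaT handling: after the values are divided by the requested unit, null positions are mapped to np.nan rather than being cast to arbitrary integers. A rough standalone illustration of that idea; to_numeric_days is a made-up helper, not the patched function:

    import numpy as np

    def to_numeric_days(array, offset):
        # Subtract an offset, convert to float days, and turn NaT into nan.
        numeric = (array - offset) / np.timedelta64(1, 'D')
        return np.where(np.isnat(array), np.nan, numeric)

    times = np.array(['2000-01-01', 'NaT', '2000-01-03'],
                     dtype='datetime64[ns]')
    print(to_numeric_days(times, times[0]))  # -> [0., nan, 2.]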
+ + Example1: + dims: ['a', 'b', 'c'] + new_dim: ['_rolling'] + -> ['_rolling'] + Example2: + dims: ['a', 'b', 'c', '_rolling'] + new_dim: ['_rolling'] + -> ['__rolling'] + """ + while new_dim in dims: + new_dim = '_' + new_dim + return new_dim diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 243487db034..8bd7225efc3 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -4,6 +4,7 @@ import itertools from collections import defaultdict from datetime import timedelta +from typing import Tuple, Type import numpy as np import pandas as pd @@ -28,7 +29,8 @@ NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( indexing.ExplicitlyIndexed, pd.Index) + dask_array_type -BASIC_INDEXING_TYPES = integer_types + (slice,) +# https://github.com/python/mypy/issues/224 +BASIC_INDEXING_TYPES = integer_types + (slice,) # type: ignore class MissingDimensionsError(ValueError): @@ -414,6 +416,10 @@ def dims(self): """ return self._dims + @dims.setter + def dims(self, value): + self._dims = self._parse_dimensions(value) + def _parse_dimensions(self, dims): if isinstance(dims, basestring): dims = (dims,) @@ -424,10 +430,6 @@ def _parse_dimensions(self, dims): % (dims, self.ndim)) return dims - @dims.setter - def dims(self, value): - self._dims = self._parse_dimensions(value) - def _item_key_to_tuple(self, key): if utils.is_dict_like(key): return tuple(key.get(dim, slice(None)) for dim in self.dims) @@ -816,7 +818,8 @@ def __deepcopy__(self, memo=None): return self.copy(deep=True) # mutable objects should not be hashable - __hash__ = None + # https://github.com/python/mypy/issues/4266 + __hash__ = None # type: ignore @property def chunks(self): @@ -1019,7 +1022,7 @@ def pad_with_fill_value(self, pad_widths=None, fill_value=dtypes.NA, pad_widths = either_dict_or_kwargs(pad_widths, pad_widths_kwargs, 'pad') - if fill_value is dtypes.NA: # np.nan is passed + if fill_value is dtypes.NA: dtype, fill_value = dtypes.maybe_promote(self.dtype) else: dtype = self.dtype @@ -1641,6 +1644,85 @@ def rolling_window(self, dim, window, window_dim, center=False, array, axis=self.get_axis_num(dim), window=window, center=center, fill_value=fill_value)) + def coarsen(self, windows, func, boundary='exact', side='left'): + """ + Apply + """ + windows = {k: v for k, v in windows.items() if k in self.dims} + if not windows: + return self.copy() + + reshaped, axes = self._coarsen_reshape(windows, boundary, side) + if isinstance(func, basestring): + name = func + func = getattr(duck_array_ops, name, None) + if func is None: + raise NameError('{} is not a valid method.'.format(name)) + return type(self)(self.dims, func(reshaped, axis=axes), self._attrs) + + def _coarsen_reshape(self, windows, boundary, side): + """ + Construct a reshaped-array for corsen + """ + if not utils.is_dict_like(boundary): + boundary = {d: boundary for d in windows.keys()} + + if not utils.is_dict_like(side): + side = {d: side for d in windows.keys()} + + # remove unrelated dimensions + boundary = {k: v for k, v in boundary.items() if k in windows} + side = {k: v for k, v in side.items() if k in windows} + + for d, window in windows.items(): + if window <= 0: + raise ValueError('window must be > 0. 
Given {}'.format(window)) + + variable = self + for d, window in windows.items(): + # trim or pad the object + size = variable.shape[self._get_axis_num(d)] + n = int(size / window) + if boundary[d] == 'exact': + if n * window != size: + raise ValueError( + 'Could not coarsen a dimension of size {} with ' + 'window {}'.format(size, window)) + elif boundary[d] == 'trim': + if side[d] == 'left': + variable = variable.isel({d: slice(0, window * n)}) + else: + excess = size - window * n + variable = variable.isel({d: slice(excess, None)}) + elif boundary[d] == 'pad': # pad + pad = window * n - size + if pad < 0: + pad += window + if side[d] == 'left': + pad_widths = {d: (0, pad)} + else: + pad_widths = {d: (pad, 0)} + variable = variable.pad_with_fill_value(pad_widths) + else: + raise TypeError( + "{} is invalid for boundary. Valid option is 'exact', " + "'trim' and 'pad'".format(boundary[d])) + + shape = [] + axes = [] + axis_count = 0 + for i, d in enumerate(variable.dims): + if d in windows: + size = variable.shape[i] + shape.append(int(size / windows[d])) + shape.append(windows[d]) + axis_count += 1 + axes.append(i + axis_count) + else: + shape.append(variable.shape[i]) + + return variable.data.reshape(shape), tuple(axes) + @property def real(self): return type(self)(self.dims, self.data.real, self._attrs) @@ -1722,7 +1804,8 @@ def load(self): # data is already loaded into memory for IndexVariable return self - @Variable.data.setter + # https://github.com/python/mypy/issues/1465 + @Variable.data.setter # type: ignore def data(self, data): Variable.data.fset(self, data) if not isinstance(self._data, PandasIndexAdapter): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 52345396ffa..58f76596822 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -6,6 +6,7 @@ from distutils import version import re import importlib +from unittest import mock import numpy as np from numpy.testing import assert_array_equal # noqa: F401 @@ -25,12 +26,6 @@ # old location, for pandas < 0.20 from pandas.util.testing import assert_frame_equal # noqa: F401 - -try: - from unittest import mock -except ImportError: - import mock # noqa: F401 - # import mpl and change the backend before other mpl imports try: import matplotlib as mpl diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 48c2f64c8db..d3c8599b21b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4,10 +4,12 @@ import itertools import math import os.path +from pathlib import Path import pickle import shutil import sys import tempfile +from typing import Optional import warnings from io import BytesIO @@ -47,14 +49,6 @@ except ImportError: pass -try: - from pathlib import Path -except ImportError: - try: - from pathlib2 import Path - except ImportError: - pass - ON_WINDOWS = sys.platform == 'win32' @@ -172,8 +166,8 @@ class NetCDF3Only(object): class DatasetIOBase(object): - engine = None - file_format = None + engine = None # type: Optional[str] + file_format = None # type: Optional[str] def create_store(self): raise NotImplementedError @@ -2367,6 +2361,29 @@ def test_open_single_dataset(self): with open_mfdataset([tmp], concat_dim=dim) as actual: assert_identical(expected, actual) + def test_open_multi_dataset(self): + # Test for issue GH #1988 and #2647. This makes sure that the + # concat_dim is utilized when specified in open_mfdataset(). 
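The _coarsen_reshape helper above is the familiar reshape-then-reduce trick: every coarsened dimension of length n * window is split into an (n, window) pair and the reduction runs over the inserted window axis. A plain-numpy sketch of the one-dimensional case, with made-up data:

    import numpy as np

    data = np.arange(12.0)   # length 12, window 3 -> 4 blocks
    window = 3
    reshaped = data.reshape(data.size // window, window)

    # Reducing over the trailing window axis yields the coarsened values,
    # i.e. what Variable.coarsen({'x': 3}, func='mean') computes here.
    block_means = reshaped.mean(axis=1)  # [1., 4., 7., 10.]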
+ # The additional wrinkle is to ensure that a length greater + # than one is tested as well due to numpy's implicit casting + # of 1-length arrays to booleans in tests, which allowed + # #2647 to still pass the test_open_single_dataset(), + # which is itself still needed as-is because the original + # bug caused one-length arrays to not be used correctly + # in concatenation. + rnddata = np.random.randn(10) + original = Dataset({'foo': ('x', rnddata)}) + dim = DataArray([100, 150], name='baz', dims='baz') + expected = Dataset({'foo': (('baz', 'x'), + np.tile(rnddata[np.newaxis, :], (2, 1)))}, + {'baz': [100, 150]}) + with create_tmp_file() as tmp1, \ + create_tmp_file() as tmp2: + original.to_netcdf(tmp1) + original.to_netcdf(tmp2) + with open_mfdataset([tmp1, tmp2], concat_dim=dim) as actual: + assert_identical(expected, actual) + def test_dask_roundtrip(self): with create_tmp_file() as tmp: data = create_test_data() @@ -2463,7 +2480,7 @@ def test_cmp_local_file(self): assert actual.attrs.keys() == expected.attrs.keys() with self.create_datasets() as (actual, expected): - assert_equal(actual.isel(l=2), expected.isel(l=2)) # noqa + assert_equal(actual[{'l': 2}], expected[{'l': 2}]) with self.create_datasets() as (actual, expected): assert_equal(actual.isel(i=0, j=-1), diff --git a/xarray/tests/test_backends_file_manager.py b/xarray/tests/test_backends_file_manager.py index 9c4c1cf815c..4405454e216 100644 --- a/xarray/tests/test_backends_file_manager.py +++ b/xarray/tests/test_backends_file_manager.py @@ -1,6 +1,7 @@ import gc import pickle import threading +from unittest import mock import pytest @@ -8,11 +9,6 @@ from xarray.backends.lru_cache import LRUCache from xarray.core.options import set_options -try: - from unittest import mock -except ImportError: - import mock # noqa: F401 - @pytest.fixture(params=[1, 2, 3, None]) def file_cache(request): diff --git a/xarray/tests/test_backends_lru_cache.py b/xarray/tests/test_backends_lru_cache.py index 03eb6dcf208..d64d718f2f7 100644 --- a/xarray/tests/test_backends_lru_cache.py +++ b/xarray/tests/test_backends_lru_cache.py @@ -1,8 +1,4 @@ -try: - from unittest import mock -except ImportError: - import mock # noqa: F401 - +from unittest import mock import pytest from xarray.backends.lru_cache import LRUCache diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index dfb46df21e3..b9d2cf520a8 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -141,7 +141,7 @@ def test_to_offset_sub_annual(freq, expected): @pytest.mark.parametrize(('month_int', 'month_label'), - list(_MONTH_ABBREVIATIONS.items()) + [('', '')]) + list(_MONTH_ABBREVIATIONS.items()) + [(0, '')]) @pytest.mark.parametrize('multiple', [None, 2]) @pytest.mark.parametrize('offset_str', ['AS', 'A']) def test_to_offset_annual(month_label, month_int, multiple, offset_str): diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 271cacb5ca0..3c4fc67c5eb 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -384,7 +384,7 @@ def test_resample_error(da): SEL_STRING_OR_LIST_TESTS = { 'string': '0001', - 'string-slice': slice('0001-01-01', '0001-12-30'), + 'string-slice': slice('0001-01-01', '0001-12-30'), # type: ignore 'bool-list': [True, True, False, False] } diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index d9a40c23add..756d51e7997 100644 --- a/xarray/tests/test_coding_times.py +++ 
b/xarray/tests/test_coding_times.py @@ -12,6 +12,7 @@ _import_cftime, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime) from xarray.conventions import _update_bounds_attributes from xarray.core.common import contains_cftime_datetimes +from xarray.testing import assert_equal from . import ( assert_array_equal, has_cftime, has_cftime_or_netCDF4, has_dask, @@ -750,3 +751,30 @@ def test_encode_cf_datetime_pandas_min(): np.testing.assert_array_equal(num, expected_num) assert units == expected_units assert calendar == expected_calendar + + +@pytest.mark.skipif(not has_cftime_or_netCDF4, reason='cftime not installed') +def test_time_units_with_timezone_roundtrip(calendar): + # Regression test for GH 2649 + expected_units = 'days since 2000-01-01T00:00:00-05:00' + expected_num_dates = np.array([1, 2, 3]) + dates = decode_cf_datetime(expected_num_dates, expected_units, calendar) + + # Check that dates were decoded to UTC; here the hours should all + # equal 5. + result_hours = DataArray(dates).dt.hour + expected_hours = DataArray([5, 5, 5]) + assert_equal(result_hours, expected_hours) + + # Check that the encoded values are accurately roundtripped. + result_num_dates, result_units, result_calendar = encode_cf_datetime( + dates, expected_units, calendar) + + if calendar in _STANDARD_CALENDARS: + np.testing.assert_array_equal(result_num_dates, expected_num_dates) + else: + # cftime datetime arithmetic is not quite exact. + np.testing.assert_allclose(result_num_dates, expected_num_dates) + + assert result_units == expected_units + assert result_calendar == calendar diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 9b4f8523178..aa02e802fc5 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2524,6 +2524,16 @@ def test_upsample_interpolate(self): # done here due to floating point arithmetic assert_allclose(expected, actual, rtol=1e-16) + @requires_scipy + def test_upsample_interpolate_bug_2197(self): + dates = pd.date_range('2007-02-01', '2007-03-01', freq='D') + da = xr.DataArray(np.arange(len(dates)), [('time', dates)]) + result = da.resample(time='M').interpolate('linear') + expected_times = np.array([np.datetime64('2007-02-28'), + np.datetime64('2007-03-31')]) + expected = xr.DataArray([27., np.nan], [('time', expected_times)]) + assert_equal(result, expected) + @requires_scipy def test_upsample_interpolate_regression_1605(self): dates = pd.date_range('2016-01-01', '2016-03-31', freq='1D') @@ -2536,21 +2546,42 @@ def test_upsample_interpolate_regression_1605(self): @requires_dask @requires_scipy def test_upsample_interpolate_dask(self): - import dask.array as da - - times = pd.date_range('2000-01-01', freq='6H', periods=5) + from scipy.interpolate import interp1d xs = np.arange(6) ys = np.arange(3) + times = pd.date_range('2000-01-01', freq='6H', periods=5) z = np.arange(5)**2 - data = da.from_array(np.tile(z, (6, 3, 1)), (1, 3, 1)) + data = np.tile(z, (6, 3, 1)) array = DataArray(data, {'time': times, 'x': xs, 'y': ys}, ('x', 'y', 'time')) + chunks = {'x': 2, 'y': 1} + + expected_times = times.to_series().resample('1H').asfreq().index + # Split the times into equal sub-intervals to simulate the 6 hour + # to 1 hour up-sampling + new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5) + for kind in ['linear', 'nearest', 'zero', 'slinear', 'quadratic', + 'cubic']: + actual = array.chunk(chunks).resample(time='1H').interpolate(kind) + actual = actual.compute() + f = interp1d(np.arange(len(times)), data, kind=kind, 
axis=-1, + bounds_error=True, assume_sorted=True) + expected_data = f(new_times_idx) + expected = DataArray(expected_data, + {'time': expected_times, 'x': xs, 'y': ys}, + ('x', 'y', 'time')) + # Use AllClose because there are some small differences in how + # we upsample timeseries versus the integer indexing as I've + # done here due to floating point arithmetic + assert_allclose(expected, actual, rtol=1e-16) - with raises_regex(TypeError, - "dask arrays are not yet supported"): - array.resample(time='1H').interpolate('linear') + # Check that an error is raised if an attempt is made to interpolate + # over a chunked dimension + with raises_regex(NotImplementedError, + 'Chunking along the dimension to be interpolated'): + array.chunk({'time': 1}).resample(time='1H').interpolate('linear') def test_align(self): array = DataArray(np.random.random((6, 8)), diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 6f6287efcac..e55caf1bf13 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5,6 +5,7 @@ import warnings from copy import copy, deepcopy from io import StringIO +import pickle from textwrap import dedent import numpy as np @@ -26,10 +27,6 @@ raises_regex, requires_bottleneck, requires_dask, requires_scipy, source_ndarray) -try: - import cPickle as pickle -except ImportError: - import pickle try: import dask.array as da except ImportError: @@ -4432,6 +4429,48 @@ def ds(request): 'y': range(2)}) +@pytest.mark.parametrize('dask', [True, False]) +@pytest.mark.parametrize(('boundary', 'side'), [ + ('trim', 'left'), ('pad', 'right')]) +def test_coarsen(ds, dask, boundary, side): + if dask and has_dask: + ds = ds.chunk({'x': 4}) + + actual = ds.coarsen(time=2, x=3, boundary=boundary, side=side).max() + assert_equal( + actual['z1'], + ds['z1'].coarsen(time=2, x=3, boundary=boundary, side=side).max()) + # coordinate should be mean by default + assert_equal(actual['time'], ds['time'].coarsen( + time=2, x=3, boundary=boundary, side=side).mean()) + + +@pytest.mark.parametrize('dask', [True, False]) +def test_coarsen_coords(ds, dask): + if dask and has_dask: + ds = ds.chunk({'x': 4}) + + # check if coord_func works + actual = ds.coarsen(time=2, x=3, boundary='trim', + coord_func={'time': 'max'}).max() + assert_equal(actual['z1'], + ds['z1'].coarsen(time=2, x=3, boundary='trim').max()) + assert_equal(actual['time'], + ds['time'].coarsen(time=2, x=3, boundary='trim').max()) + + # raise if exact + with pytest.raises(ValueError): + ds.coarsen(x=3).mean() + # should be no error + ds.isel(x=slice(0, 3 * (len(ds['x']) // 3))).coarsen(x=3).mean() + + # working test with pd.time + da = xr.DataArray( + np.linspace(0, 365, num=364), dims='time', + coords={'time': pd.date_range('15/12/1999', periods=364)}) + actual = da.coarsen(time=2).mean() + + def test_rolling_properties(ds): # catching invalid args with pytest.raises(ValueError) as exception: diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 5ea5b3d2a42..2a6a957e10f 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -251,6 +251,26 @@ def assert_dask_array(da, dask): assert isinstance(da.data, dask_array_type) +@pytest.mark.parametrize('dask', [False, True]) +def test_datetime_reduce(dask): + time = np.array(pd.date_range('15/12/1999', periods=11)) + time[8: 11] = np.nan + da = DataArray( + np.linspace(0, 365, num=11), dims='time', coords={'time': time}) + + if dask and has_dask: + chunks = {'time': 5} + da = 
da.chunk(chunks) + + actual = da['time'].mean() + assert not pd.isnull(actual) + actual = da['time'].mean(skipna=False) + assert pd.isnull(actual) + + # test for a 0d array + assert da['time'][0].mean() == da['time'][:1].mean() + + @pytest.mark.parametrize('dim_num', [1, 2]) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) diff --git a/xarray/tests/test_extensions.py b/xarray/tests/test_extensions.py index ffefa78aa34..608ec798ca1 100644 --- a/xarray/tests/test_extensions.py +++ b/xarray/tests/test_extensions.py @@ -1,16 +1,13 @@ from __future__ import absolute_import, division, print_function +import pickle + import pytest import xarray as xr from . import raises_regex -try: - import cPickle as pickle -except ImportError: - import pickle - @xr.register_dataset_accessor('example_accessor') @xr.register_dataarray_accessor('example_accessor') diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 08cab4b3541..6dd50e11fd3 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1684,6 +1684,58 @@ def assert_assigned_2d(array, key_x, key_y, values): expected = Variable(['x', 'y'], [[2, 3], [3, 4], [4, 5]]) assert_identical(v, expected) + def test_coarsen(self): + v = self.cls(['x'], [0, 1, 2, 3, 4]) + actual = v.coarsen({'x': 2}, boundary='pad', func='mean') + expected = self.cls(['x'], [0.5, 2.5, 4]) + assert_identical(actual, expected) + + actual = v.coarsen({'x': 2}, func='mean', boundary='pad', + side='right') + expected = self.cls(['x'], [0, 1.5, 3.5]) + assert_identical(actual, expected) + + actual = v.coarsen({'x': 2}, func=np.mean, side='right', + boundary='trim') + expected = self.cls(['x'], [1.5, 3.5]) + assert_identical(actual, expected) + + # working test + v = self.cls(['x', 'y', 'z'], + np.arange(40 * 30 * 2).reshape(40, 30, 2)) + for windows, func, side, boundary in [ + ({'x': 2}, np.mean, 'left', 'trim'), + ({'x': 2}, np.median, {'x': 'left'}, 'pad'), + ({'x': 2, 'y': 3}, np.max, 'left', {'x': 'pad', 'y': 'trim'})]: + v.coarsen(windows, func, boundary, side) + + def test_coarsen_2d(self): + # 2d-mean should be the same with the successive 1d-mean + v = self.cls(['x', 'y'], np.arange(6 * 12).reshape(6, 12)) + actual = v.coarsen({'x': 3, 'y': 4}, func='mean') + expected = v.coarsen({'x': 3}, func='mean').coarsen( + {'y': 4}, func='mean') + assert_equal(actual, expected) + + v = self.cls(['x', 'y'], np.arange(7 * 12).reshape(7, 12)) + actual = v.coarsen({'x': 3, 'y': 4}, func='mean', boundary='trim') + expected = v.coarsen({'x': 3}, func='mean', boundary='trim').coarsen( + {'y': 4}, func='mean', boundary='trim') + assert_equal(actual, expected) + + # if there is nan, the two should be different + v = self.cls(['x', 'y'], 1.0 * np.arange(6 * 12).reshape(6, 12)) + v[2, 4] = np.nan + v[3, 5] = np.nan + actual = v.coarsen({'x': 3, 'y': 4}, func='mean', boundary='trim') + expected = v.coarsen({'x': 3}, func='sum', boundary='trim').coarsen( + {'y': 4}, func='sum', boundary='trim') / 12 + assert not actual.equals(expected) + # adjusting the nan count + expected[0, 1] *= 12 / 11 + expected[1, 1] *= 12 / 11 + assert_allclose(actual, expected) + @requires_dask class TestVariableWithDask(VariableSubclassobjects): @@ -1838,6 +1890,10 @@ def test_pad(self): def test_rolling_window(self): super(TestIndexVariable, self).test_rolling_window() + @pytest.mark.xfail + def test_coarsen_2d(self): + super(TestIndexVariable, self).test_coarsen_2d() + class 
TestAsCompatibleData(object): def test_unchanged_types(self):
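Taken together with the tests above, the intended end-user surface of the new coarsen functionality looks roughly like the following sketch, which mirrors test_coarsen and test_coarsen_coords on made-up data:

    import numpy as np
    import pandas as pd
    import xarray as xr

    da = xr.DataArray(
        np.arange(364.0), dims='time',
        coords={'time': pd.date_range('1999-12-15', periods=364)})

    # Block-average pairs of days; the 'time' coordinate is averaged as
    # well, since coordinates default to 'mean' unless coord_func says
    # otherwise.
    two_day_means = da.coarsen(time=2).mean()

    # 364 is not divisible by 3, so a boundary policy is required:
    # 'trim' drops the excess samples, 'pad' fills with NaN first.
    three_day_max = da.coarsen(time=3, boundary='trim').max()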