diff --git a/doc/api.rst b/doc/api.rst
index 433aa93c9de..95325c13bbd 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -14,6 +14,7 @@ Top-level functions
 .. autosummary::
    :toctree: generated/
 
+   apply_ufunc
    align
    broadcast
    concat
diff --git a/doc/computation.rst b/doc/computation.rst
index 6185ca401f4..fa7f8da3512 100644
--- a/doc/computation.rst
+++ b/doc/computation.rst
@@ -303,15 +303,21 @@ Datasets support most of the same methods found on data arrays:
     ds.mean(dim='x')
     abs(ds)
 
-Unfortunately, a limitation of the current version of numpy means that we
-cannot override ufuncs for datasets, because datasets cannot be written as
-a single array [1]_. :py:meth:`~xarray.Dataset.apply` works around this
+Unfortunately, we currently do not support NumPy ufuncs for datasets [1]_.
+:py:meth:`~xarray.Dataset.apply` works around this
 limitation, by applying the given function to each variable in the dataset:
 
 .. ipython:: python
 
     ds.apply(np.sin)
 
+You can also use the wrapped functions in the ``xarray.ufuncs`` module:
+
+.. ipython:: python
+
+    import xarray.ufuncs as xu
+    xu.sin(ds)
+
 Datasets also use looping over variables for *broadcasting* in binary
 arithmetic. You can do arithmetic between any ``DataArray`` and a dataset:
 
@@ -329,5 +335,93 @@ Arithmetic between two datasets matches data variables of the same name:
 Similarly to index based alignment, the result has the intersection of all
 matching data variables.
 
-.. [1] In some future version of NumPy, we should be able to override ufuncs for
-   datasets by making use of ``__numpy_ufunc__``.
+.. [1] This was previously due to a limitation of NumPy, but with NumPy 1.13
+   we should be able to support this by leveraging ``__array_ufunc__``
+   (:issue:`1617`).
+
+.. _comput.wrapping-custom:
+
+Wrapping custom computation
+===========================
+
+It doesn't always make sense to do computation directly with xarray objects:
+
+  - In the inner loop of performance-limited code, using xarray can add
+    considerable overhead compared to using NumPy or native Python types.
+    This is particularly true when working with scalars or small arrays
+    (less than ~1e6 elements). Keeping track of labels and ensuring their
+    consistency adds overhead, and xarray's core itself is not especially
+    fast, because it's written in Python rather than a compiled language
+    like C. Also, xarray's high-level label-based APIs remove low-level
+    control over how operations are implemented.
+  - Even if speed doesn't matter, it can be important to wrap existing
+    code, or to support alternative interfaces that don't use xarray
+    objects.
+
+For these reasons, it is often well-advised to write low-level routines that
+work with NumPy arrays, and to wrap these routines to work with xarray
+objects. However, adding support for labels on both
+:py:class:`~xarray.Dataset` and :py:class:`~xarray.DataArray` can be a bit
+of a chore.
+
+To make this easier, xarray supplies the :py:func:`~xarray.apply_ufunc`
+helper function, designed for wrapping functions that support broadcasting
+and vectorization on unlabeled arrays in the style of a NumPy
+`universal function `_ ("ufunc" for short). ``apply_ufunc`` takes care of
+everything needed for an idiomatic xarray wrapper, including alignment,
+broadcasting, looping over ``Dataset`` variables (if needed), and merging of
+coordinates. In fact, many internal xarray functions/methods are written
+using ``apply_ufunc``.
+
+Simple functions that act independently on each value should work without
+any additional arguments:
+
+.. ipython:: python
+
+    squared_error = lambda x, y: (x - y) ** 2
+    arr1 = xr.DataArray([0, 1, 2, 3], dims='x')
+    xr.apply_ufunc(squared_error, arr1, 1)
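+
+Because ``apply_ufunc`` handles broadcasting for us, the same wrapper also
+works on inputs with different dimensions. A small illustrative sketch
+(``arr2`` here is a made-up second input, reusing ``squared_error`` from
+above):
+
+.. code-block:: python
+
+    arr2 = xr.DataArray([1, 2], dims='y')
+    # arr1 (dims: 'x') and arr2 (dims: 'y') are broadcast against each
+    # other, so the result has dimensions ('x', 'y')
+    xr.apply_ufunc(squared_error, arr1, arr2)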
+
+For more complex operations that consider some array values collectively,
+it's important to understand the idea of "core dimensions" from NumPy's
+`generalized ufuncs `_. Core dimensions are defined as dimensions that
+should *not* be broadcast over. Usually, they correspond to the fundamental
+dimensions over which an operation is defined, e.g., the summed axis in
+``np.sum``. A good clue that core dimensions are needed is the presence of
+an ``axis`` argument on the corresponding NumPy function.
+
+With ``apply_ufunc``, core dimensions are recognized by name, and then moved
+to the last dimension of any input arguments before applying the given
+function. This means that for functions that accept an ``axis`` argument,
+you usually need to set ``axis=-1``. As an example, here is how we would
+wrap :py:func:`numpy.linalg.norm` to calculate the vector norm:
+
+.. code-block:: python
+
+    def vector_norm(x, dim, ord=None):
+        return xr.apply_ufunc(np.linalg.norm, x,
+                              input_core_dims=[[dim]],
+                              kwargs={'ord': ord, 'axis': -1})
+
+.. ipython:: python
+    :suppress:
+
+    def vector_norm(x, dim, ord=None):
+        return xr.apply_ufunc(np.linalg.norm, x,
+                              input_core_dims=[[dim]],
+                              kwargs={'ord': ord, 'axis': -1})
+
+.. ipython:: python
+
+    vector_norm(arr1, dim='x')
+
+Because ``apply_ufunc`` follows a standard convention for ufuncs, it plays
+nicely with tools for building vectorized functions, like
+:func:`numpy.broadcast_arrays` and :func:`numpy.vectorize`. For high
+performance needs, consider using Numba's `vectorize and guvectorize `_.
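+
+For example, a function written for scalars can be given broadcasting
+behavior with :func:`numpy.vectorize` before handing it to ``apply_ufunc``.
+A minimal sketch (``scalar_clip`` is a made-up example function, not part of
+any library):
+
+.. code-block:: python
+
+    def scalar_clip(x):
+        # toy scalar-only function: clamp one value to the interval [0, 1]
+        return min(max(x, 0.0), 1.0)
+
+    # np.vectorize supplies the elementwise looping and broadcasting,
+    # while apply_ufunc keeps track of the labels
+    xr.apply_ufunc(np.vectorize(scalar_clip), arr1)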
+
+In addition to wrapping functions, ``apply_ufunc`` can automatically
+parallelize many functions when using dask by setting
+``dask='parallelized'``. See :ref:`dask.automatic-parallelization` for
+details.
+
+:py:func:`~xarray.apply_ufunc` also supports some advanced options for
+controlling alignment of variables and the form of the result. See the
+docstring for full details and more examples.
diff --git a/doc/dask.rst b/doc/dask.rst
index 2968c567094..2b3c671835e 100644
--- a/doc/dask.rst
+++ b/doc/dask.rst
@@ -1,10 +1,10 @@
 .. _dask:
 
-Out of core computation with dask
-=================================
+Parallel computing with dask
+============================
 
-xarray integrates with `dask `__ to support streaming
-computation on datasets that don't fit into memory.
+xarray integrates with `dask `__ to support parallel
+computations and streaming computation on datasets that don't fit into memory.
 
 Currently, dask is an entirely optional feature for xarray. However, the
 benefits of using dask are sufficiently strong that dask may become a required
@@ -33,7 +33,7 @@ to your screen or write to disk). At that point, data is loaded into memory
 and computation proceeds in a streaming fashion, block-by-block.
 
 The actual computation is controlled by a multi-processing or thread pool,
-which allows dask to take full advantage of multiple processers available on
+which allows dask to take full advantage of multiple processors available on
 most modern computers.
 
 For more details on dask, read `its documentation `__.
@@ -213,6 +213,115 @@ loaded into dask or not:
 
 In the future, we may extend ``.data`` to support other "computable" array
 backends beyond dask and numpy (e.g., to support sparse arrays).
 
+.. _dask.automatic-parallelization:
+
+Automatic parallelization
+-------------------------
+
+Almost all of xarray's built-in operations work on dask arrays. If you want
+to use a function that isn't wrapped by xarray, one option is to extract
+dask arrays from xarray objects (``.data``) and use dask directly, as
+sketched below.
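+
+A minimal sketch of that first option (``dask_cumsum`` is our own
+hypothetical helper, not an xarray function): pull out the dask array, apply
+a ``dask.array`` function, and rewrap the result with the original labels:
+
+.. code-block:: python
+
+    import dask.array as da
+
+    def dask_cumsum(obj, dim):
+        # operate directly on the underlying dask array...
+        data = da.cumsum(obj.data, axis=obj.get_axis_num(dim))
+        # ...then rewrap it, reusing the original dimensions and coordinates
+        return xr.DataArray(data, dims=obj.dims, coords=obj.coords)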
+
+Another option is to use xarray's :py:func:`~xarray.apply_ufunc`, which can
+automate `embarrassingly parallel `__ "map" type operations where a function
+written for processing NumPy arrays is repeatedly applied to xarray objects
+containing dask arrays. It works similarly to
+:py:func:`dask.array.map_blocks` and :py:func:`dask.array.atop`, but without
+requiring an intermediate layer of abstraction.
+
+For the best performance when using dask's multi-threaded scheduler, wrap a
+function that already releases the global interpreter lock; fortunately,
+this includes most NumPy and SciPy functions. Here we show an example using
+NumPy operations and a fast function from
+`bottleneck `__, which we use to calculate
+`Spearman's rank-correlation coefficient `__:
+
+.. code-block:: python
+
+    import numpy as np
+    import xarray as xr
+    import bottleneck
+
+    def covariance_gufunc(x, y):
+        return ((x - x.mean(axis=-1, keepdims=True))
+                * (y - y.mean(axis=-1, keepdims=True))).mean(axis=-1)
+
+    def pearson_correlation_gufunc(x, y):
+        return covariance_gufunc(x, y) / (x.std(axis=-1) * y.std(axis=-1))
+
+    def spearman_correlation_gufunc(x, y):
+        x_ranks = bottleneck.rankdata(x, axis=-1)
+        y_ranks = bottleneck.rankdata(y, axis=-1)
+        return pearson_correlation_gufunc(x_ranks, y_ranks)
+
+    def spearman_correlation(x, y, dim):
+        return xr.apply_ufunc(
+            spearman_correlation_gufunc, x, y,
+            input_core_dims=[[dim], [dim]],
+            dask='parallelized',
+            output_dtypes=[float])
+
+The only aspect of this example that is different from standard usage of
+``apply_ufunc()`` is that we needed to supply the ``output_dtypes`` argument.
+(Read up on :ref:`comput.wrapping-custom` for an explanation of the
+"core dimensions" listed in ``input_core_dims``.)
+
+Our new ``spearman_correlation()`` function achieves near-linear speedup
+when run on large arrays across the four cores on my laptop. It would also
+work as a streaming operation, when run on arrays loaded from disk:
+
+.. ipython::
+    :verbatim:
+
+    In [56]: rs = np.random.RandomState(0)
+
+    In [57]: array1 = xr.DataArray(rs.randn(1000, 100000), dims=['place', 'time'])  # 800MB
+
+    In [58]: array2 = array1 + 0.5 * rs.randn(1000, 100000)
+
+    # using one core, on numpy arrays
+    In [61]: %time _ = spearman_correlation(array1, array2, 'time')
+    CPU times: user 21.6 s, sys: 2.84 s, total: 24.5 s
+    Wall time: 24.9 s
+
+    In [8]: chunked1 = array1.chunk({'place': 10})
+
+    In [9]: chunked2 = array2.chunk({'place': 10})
+
+    # using all my laptop's cores, with dask
+    In [63]: r = spearman_correlation(chunked1, chunked2, 'time')
+
+    In [64]: %time _ = r.compute()
+    CPU times: user 30.9 s, sys: 1.74 s, total: 32.6 s
+    Wall time: 4.59 s
+
+One limitation of ``apply_ufunc()`` with ``dask='parallelized'`` is that it
+cannot be applied to arrays with multiple chunks along a core dimension:
+
+.. ipython::
+    :verbatim:
+
+    In [63]: spearman_correlation(chunked1, chunked2, 'place')
+    ValueError: dimension 'place' on 0th function argument to apply_ufunc with
+    dask='parallelized' consists of multiple chunks, but is also a core
+    dimension. To fix, rechunk into a single dask array chunk along this
+    dimension, i.e., ``.rechunk({'place': -1})``, but beware that this may
+    significantly increase memory usage.
+
+This reflects the nature of core dimensions, in contrast to broadcast
+(non-core) dimensions that allow operations to be split into arbitrary
+chunks for application.
+
+.. tip::
+
+    For the majority of NumPy functions that are already wrapped by dask,
+    it's usually a better idea to use the pre-existing ``dask.array``
+    function, by using either pre-existing xarray methods or
+    :py:func:`~xarray.apply_ufunc` with ``dask='allowed'``. Dask can often
+    have a more efficient implementation that makes use of the specialized
+    structure of a problem, unlike the generic speedups offered by
+    ``dask='parallelized'``. A sketch of this pattern follows.
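+
+As an illustration of the ``dask='allowed'`` pattern from the tip above
+(``dask_mean`` is a hypothetical wrapper; in practice you would simply call
+the existing ``.mean()`` method), the chunked data is passed straight
+through to a function that already understands dask arrays:
+
+.. code-block:: python
+
+    import dask.array as da
+
+    def dask_mean(x, dim):
+        # da.mean natively handles chunked arrays, so no rechunking and no
+        # block-wise application is needed, even if the core dimension
+        # spans multiple chunks
+        return xr.apply_ufunc(da.mean, x,
+                              input_core_dims=[[dim]],
+                              kwargs={'axis': -1},
+                              dask='allowed')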
+
 Chunking and performance
 ------------------------
@@ -237,7 +346,6 @@ larger chunksizes.
 
     import os
     os.remove('example-data.nc')
-
 Optimization Tips
 -----------------
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 2cb626cfd8c..56a33bb6004 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -75,6 +75,13 @@ Backward Incompatible Changes
 Enhancements
 ~~~~~~~~~~~~
 
+- New helper function :py:func:`~xarray.apply_ufunc` for wrapping functions
+  written to work on NumPy arrays to support labels on xarray objects
+  (:issue:`770`). ``apply_ufunc`` also supports automatic parallelization
+  for many functions with dask. See :ref:`comput.wrapping-custom` and
+  :ref:`dask.automatic-parallelization` for details.
+  By `Stephan Hoyer `_.
+
 - Support for `pathlib.Path` objects added to
   :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
   :py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset`
@@ -232,7 +239,7 @@ Bug fixes
   The previous behavior unintentionally causing additional tests to be skipped
   (:issue:`1531`). By `Joe Hamman `_.
-- Fix pynio backend for upcoming release of pynio with python3 support
+- Fix pynio backend for upcoming release of pynio with python3 support
   (:issue:`1611`). By `Ben Hillman `_.
 
 .. _whats-new.0.9.6:
diff --git a/xarray/__init__.py b/xarray/__init__.py
index 654ed77b28a..d2ce7d076f7 100644
--- a/xarray/__init__.py
+++ b/xarray/__init__.py
@@ -6,7 +6,7 @@
 from .core.alignment import align, broadcast, broadcast_arrays
 from .core.common import full_like, zeros_like, ones_like
 from .core.combine import concat, auto_combine
-from .core.computation import where
+from .core.computation import apply_ufunc, where
 from .core.extensions import (register_dataarray_accessor,
                               register_dataset_accessor)
 from .core.variable import as_variable, Variable, IndexVariable, Coordinate
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index 0adc797cc4c..f419e04def3 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -676,6 +676,7 @@ def apply_ufunc(func, *args, **kwargs):
         Method for joining the indexes of the passed objects along each
         dimension, and the variables of Dataset objects with mismatched
         data variables:
+
         - 'outer': use the union of object indexes
         - 'inner': use the intersection of object indexes
         - 'left': use indexes from the first object with each dimension
@@ -685,6 +686,7 @@ def apply_ufunc(func, *args, **kwargs):
     dataset_join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
         Method for joining variables of Dataset objects with mismatched
         data variables.
+
         - 'outer': take variables from both Dataset objects
         - 'inner': take only overlapped variables
         - 'left': take only variables from the first object
@@ -701,6 +703,7 @@ def apply_ufunc(func, *args, **kwargs):
     dask: 'forbidden', 'allowed' or 'parallelized', optional
         How to handle applying to objects containing lazy data in the form of
         dask arrays:
+
         - 'forbidden' (default): raise an error if a dask array is encountered.
         - 'allowed': pass dask arrays directly on to ``func``.
         - 'parallelized': automatically parallelize ``func`` if any of the
@@ -724,7 +727,7 @@ def apply_ufunc(func, *args, **kwargs):
     ``apply_ufunc`` to write functions to (very nearly) replicate existing
     xarray functionality:
 
-    Calculate the vector magnitude of two arguments:
+    Calculate the vector magnitude of two arguments::
 
         def magnitude(a, b):
             func = lambda x, y: np.sqrt(x ** 2 + y ** 2)