xarray.backends refactor #2261

Merged: 45 commits, Oct 9, 2018
Changes from 26 commits

Commits (45)
4faaf3a
WIP: xarray.backends.file_manager for managing file objects.
shoyer Jul 1, 2018
c82a38c
Switch rasterio to use FileManager
shoyer Jul 1, 2018
7a55a30
lint fixes
shoyer Jul 4, 2018
51463dd
WIP: rewrite FileManager to always use an LRUCache
shoyer Jul 9, 2018
23e132f
Test coverage
shoyer Jul 10, 2018
8fc8183
Don't use move_to_end
shoyer Jul 10, 2018
422944b
minor clarification
shoyer Jul 10, 2018
aea0a1a
Switch FileManager.acquire() to a method
shoyer Jul 11, 2018
4366c0b
Python 2 compat
shoyer Jul 11, 2018
f35b7e7
Update xarray.set_options() to add file_cache_maxsize and validation
shoyer Jul 11, 2018
057cad2
Add assert for FILE_CACHE.maxsize
shoyer Jul 11, 2018
0f3e656
More docstring for FileManager
shoyer Jul 11, 2018
1a0cc10
Add accidentally omitted tests for LRUCache
shoyer Jul 11, 2018
8784e6b
Merge branch 'master' into file-manager
shoyer Jul 28, 2018
83d9b10
Adapt scipy backend to use FileManager
shoyer Jul 28, 2018
a0074ff
Stickler fix
shoyer Jul 28, 2018
062ba96
Fix failure on Python 2.7
shoyer Jul 29, 2018
2d41b29
Finish adjusting backends to use FileManager
shoyer Jul 29, 2018
2adf486
Fix bad import
shoyer Jul 30, 2018
76f151c
WIP on distributed
shoyer Aug 1, 2018
769f079
More WIP
shoyer Aug 6, 2018
3e97264
Merge branch 'master' into file-manager
shoyer Aug 17, 2018
5e67efe
Fix distributed write tests
shoyer Aug 19, 2018
8dc77c4
Merge branch 'master' into file-manager
shoyer Aug 19, 2018
1d38335
Fixes
shoyer Aug 19, 2018
6350ca6
Minor fixup
shoyer Aug 20, 2018
4aa0df7
whats new
shoyer Aug 30, 2018
67377c7
More refactoring: remove state from backends entirely
shoyer Aug 31, 2018
8c00f44
Merge branch 'master' into file-manager
shoyer Sep 6, 2018
2a5d1f0
Cleanup
shoyer Sep 6, 2018
a6c170b
Fix failing in-memory datastore tests
shoyer Sep 6, 2018
009e30d
Fix inaccessible datastore
shoyer Sep 6, 2018
14118ea
fix autoclose warnings
shoyer Sep 6, 2018
c778488
Fix PyNIO failures
shoyer Sep 6, 2018
fe14ebf
No longer disable HDF5 file locking
shoyer Sep 7, 2018
f1026ce
whats new and default file cache size
shoyer Sep 7, 2018
e13406b
Whats new tweak
shoyer Sep 7, 2018
465dfae
Refactor default lock logic to backend classes
shoyer Sep 10, 2018
55d35c8
Rename get_resource_lock -> get_write_lock
shoyer Sep 10, 2018
c8fbadc
Don't acquire unnecessary locks in __getitem__
shoyer Sep 10, 2018
ede8ef0
Merge branch 'master' into file-manager
shoyer Sep 26, 2018
220c302
Merge branch 'master' into file-manager
shoyer Oct 8, 2018
36f1156
Fix bad merge
shoyer Oct 9, 2018
c6f43dd
Fix import
shoyer Oct 9, 2018
8916bc7
Remove unreachable code
shoyer Oct 9, 2018
16 changes: 14 additions & 2 deletions doc/whats-new.rst
@@ -25,11 +25,23 @@ What's New
- `Python 3 Statement <http://www.python3statement.org/>`__
- `Tips on porting to Python 3 <https://docs.python.org/3/howto/pyporting.html>`__

.. _whats-new.0.10.9:
.. _whats-new.0.11.0:

v0.10.9 (unreleased)
v0.11.0 (unreleased)
--------------------

Breaking changes
~~~~~~~~~~~~~~~~

- Xarray's storage backends now automatically open and close files when
necessary, rather than requiring opening a file with ``autoclose=True``. A
global least-recently-used cache is used to store open files; the default
limit of 512 open files should suffice in most cases, but can be adjusted if
necessary with
``xarray.set_options(file_cache_maxsize=...)``.

TODO: Add some note about performance benefits.
Review comment (Member): just a reminder to synthesize your most recent post on this PR.


Documentation
~~~~~~~~~~~~~

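A brief illustration of the option named in the breaking-changes entry above; the file name is a placeholder and the value 1024 is arbitrary:

    import xarray as xr

    # Raise the limit on simultaneously open files held in the global
    # least-recently-used cache (the default noted above is 512).
    xr.set_options(file_cache_maxsize=1024)

    # Files are opened and closed automatically as needed; autoclose=True is
    # no longer required.
    ds = xr.open_dataset('example.nc')
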
4 changes: 4 additions & 0 deletions xarray/backends/__init__.py
@@ -4,6 +4,7 @@
formats. They should not be used directly, but rather through Dataset objects.
"""
from .common import AbstractDataStore
from .file_manager import FileManager, CachingFileManager, DummyFileManager
from .memory import InMemoryDataStore
from .netCDF4_ import NetCDF4DataStore
from .pydap_ import PydapDataStore
@@ -15,6 +16,9 @@

__all__ = [
'AbstractDataStore',
'FileManager',
'CachingFileManager',
'DummyFileManager',
'InMemoryDataStore',
'NetCDF4DataStore',
'PydapDataStore',
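As context for the new exports, a minimal sketch of how they might be used; acquire() and close() appear in this PR, but the constructor signatures shown here are assumptions rather than a documented public API:

    import netCDF4
    from xarray.backends import CachingFileManager, DummyFileManager

    # CachingFileManager defers opening the file until acquire() and keeps the
    # handle in the global LRU cache, reopening it if it has been evicted.
    manager = CachingFileManager(netCDF4.Dataset, 'example.nc', mode='r')
    nc = manager.acquire()      # open the file, or reuse the cached handle
    print(list(nc.variables))
    manager.close()             # explicitly release the underlying handle

    # DummyFileManager wraps an already-open object and performs no caching.
    f = netCDF4.Dataset('example.nc', mode='r')
    wrapped = DummyFileManager(f)
    assert wrapped.acquire() is f
    wrapped.close()
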
120 changes: 55 additions & 65 deletions xarray/backends/api.py
@@ -4,6 +4,7 @@
from glob import glob
from io import BytesIO
from numbers import Number
import warnings

import numpy as np

@@ -13,7 +14,7 @@
from ..core.pycompat import basestring, path_type
from ..core.utils import close_on_error, is_remote_uri
from .common import (
HDF5_LOCK, ArrayWriter, CombinedLock, _get_scheduler, _get_scheduler_lock)
HDF5_LOCK, ArrayWriter, combine_locks, _get_scheduler, _get_scheduler_lock)

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
@@ -52,27 +53,42 @@ def _normalize_path(path):
return os.path.abspath(os.path.expanduser(path))


def _default_lock(filename, engine):
def _default_read_lock(filename, engine):
if filename.endswith('.gz'):
lock = False
lock = None
else:
if engine is None:
engine = _get_default_engine(filename, allow_remote=True)

if engine == 'netcdf4':
if is_remote_uri(filename):
lock = False
lock = None
else:
# TODO: identify netcdf3 files and don't use the global lock
# for them
lock = HDF5_LOCK
elif engine in {'h5netcdf', 'pynio'}:
lock = HDF5_LOCK
else:
lock = False
lock = None
return lock


def _get_write_lock(engine, scheduler, format, path_or_file):
""" Get the lock(s) that apply to a particular scheduler/engine/format"""

locks = []

if (engine == 'h5netcdf' or engine == 'netcdf4' and
(format is None or format.startswith('NETCDF4'))):
locks.append(HDF5_LOCK)

locks.append(_get_scheduler_lock(scheduler, path_or_file))

return combine_locks(locks)



def _validate_dataset_names(dataset):
"""DataArray.name and Dataset keys must be a string or None"""
def check_name(name):
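
As a rough illustration of the helper above (the engine, scheduler, and path values are hypothetical):

    # An HDF5-backed write under dask's distributed scheduler combines
    # HDF5_LOCK with the scheduler-specific lock via combine_locks():
    lock = _get_write_lock(engine='netcdf4', scheduler='distributed',
                           format='NETCDF4', path_or_file='out.nc')

    # A scipy/netCDF3 write without dask gets only the lock returned by
    # _get_scheduler_lock():
    lock = _get_write_lock(engine='scipy', scheduler=None,
                           format='NETCDF3_64BIT', path_or_file='out.nc')
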
@@ -130,20 +146,6 @@ def _protect_dataset_variables_inplace(dataset, cache):
variable.data = data


def _get_lock(engine, scheduler, format, path_or_file):
""" Get the lock(s) that apply to a particular scheduler/engine/format"""

locks = []
if format in ['NETCDF4', None] and engine in ['h5netcdf', 'netcdf4']:
locks.append(HDF5_LOCK)
locks.append(_get_scheduler_lock(scheduler, path_or_file))

# When we have more than one lock, use the CombinedLock wrapper class
lock = CombinedLock(locks) if len(locks) > 1 else locks[0]

return lock


def _finalize_store(write, store):
""" Finalize this store by explicitly syncing and closing"""
del write # ensure writing is done first
@@ -152,7 +154,7 @@ def _finalize_store(write, store):


def open_dataset(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=None, decode_times=True, autoclose=False,
mask_and_scale=None, decode_times=True, autoclose=None,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, cache=None, drop_variables=None,
backend_kwargs=None):
@@ -235,6 +237,14 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
--------
open_mfdataset
"""
if autoclose is not None:
warnings.warn(
'The autoclose argument is no longer used by '
'xarray.open_dataset() and is now ignored; it will be removed in '
'xarray v0.12. If necessary, you can control the maximum number '
'of simultaneous open files with '
'xarray.set_options(file_cache_maxsize=...).',
FutureWarning, stacklevel=2)

if mask_and_scale is None:
mask_and_scale = not engine == 'pseudonetcdf'
@@ -272,18 +282,11 @@ def maybe_decode_store(store, lock=False):
mask_and_scale, decode_times, concat_characters,
decode_coords, engine, chunks, drop_variables)
name_prefix = 'open_dataset-%s' % token
ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token,
lock=lock)
ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
ds2._file_obj = ds._file_obj
else:
ds2 = ds

# protect so that dataset store isn't necessarily closed, e.g.,
# streams like BytesIO can't be reopened
# datastore backend is responsible for determining this capability
if store._autoclose:
store.close()

return ds2

if isinstance(filename_or_obj, path_type):
@@ -310,40 +313,34 @@ def maybe_decode_store(store, lock=False):
else:
engine = 'scipy'

if lock is None:
lock = _default_read_lock(filename_or_obj, engine)

if engine is None:
engine = _get_default_engine(filename_or_obj,
allow_remote=True)
if engine == 'netcdf4':
store = backends.NetCDF4DataStore.open(filename_or_obj,
group=group,
autoclose=autoclose,
**backend_kwargs)
store = backends.NetCDF4DataStore.open(
filename_or_obj, group=group, lock=lock, **backend_kwargs)
elif engine == 'scipy':
store = backends.ScipyDataStore(filename_or_obj,
autoclose=autoclose,
**backend_kwargs)
store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
elif engine == 'pydap':
store = backends.PydapDataStore.open(filename_or_obj,
**backend_kwargs)
store = backends.PydapDataStore.open(
filename_or_obj, **backend_kwargs)
elif engine == 'h5netcdf':
store = backends.H5NetCDFStore(filename_or_obj, group=group,
autoclose=autoclose,
**backend_kwargs)
store = backends.H5NetCDFStore(
filename_or_obj, group=group, lock=lock, **backend_kwargs)
elif engine == 'pynio':
store = backends.NioDataStore(filename_or_obj,
autoclose=autoclose,
**backend_kwargs)
store = backends.NioDataStore(filename_or_obj, **backend_kwargs)
elif engine == 'pseudonetcdf':
store = backends.PseudoNetCDFDataStore.open(
filename_or_obj, autoclose=autoclose, **backend_kwargs)
filename_or_obj, **backend_kwargs)
else:
raise ValueError('unrecognized engine for open_dataset: %r'
% engine)

if lock is None:
lock = _default_lock(filename_or_obj, engine)
with close_on_error(store):
return maybe_decode_store(store, lock)
return maybe_decode_store(store)
else:
if engine is not None and engine != 'scipy':
raise ValueError('can only read file-like objects with '
@@ -355,7 +352,7 @@ def maybe_decode_store(store, lock=False):


def open_dataarray(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=None, decode_times=True, autoclose=False,
mask_and_scale=None, decode_times=True, autoclose=None,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, cache=None, drop_variables=None,
backend_kwargs=None):
@@ -390,10 +387,6 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
decode_times : bool, optional
If True, decode times encoded in the standard NetCDF datetime format
into datetime objects. Otherwise, leave them encoded as numbers.
autoclose : bool, optional
If True, automatically close files to avoid OS Error of too many files
being open. However, this option doesn't work with streams, e.g.,
BytesIO.
concat_characters : bool, optional
If True, concatenate along the last dimension of character arrays to
form string arrays. Dimensions will only be concatenated over (and
@@ -490,7 +483,7 @@ def close(self):
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
autoclose=False, parallel=False, **kwargs):
autoclose=None, parallel=False, **kwargs):
"""Open multiple files as a single dataset.

Requires dask to be installed. See documentation for details on dask [1].
@@ -537,10 +530,6 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
'netcdf4'.
autoclose : bool, optional
If True, automatically close files to avoid OS Error of too many files
being open. However, this option doesn't work with streams, e.g.,
BytesIO.
lock : False, True or threading.Lock, optional
This argument is passed on to :py:func:`dask.array.from_array`. By
default, a per-variable lock is used when reading data from netCDF
@@ -605,7 +594,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
raise IOError('no files to open')

if lock is None:
lock = _default_lock(paths[0], engine)
lock = _default_read_lock(paths[0], engine)

open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
autoclose=autoclose, **kwargs)
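
For reference, a typical call that exercises this path; the file names and chunk sizes are placeholders, and dask must be installed:

    import xarray as xr

    # The default per-file read lock is picked from the first path and the
    # chosen engine; autoclose is still accepted here but open_dataset() now
    # ignores it with a FutureWarning.
    ds = xr.open_mfdataset(['a.nc', 'b.nc'], engine='netcdf4',
                           chunks={'time': 100})
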
@@ -701,18 +690,18 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
# handle scheduler specific logic
scheduler = _get_scheduler()
have_chunks = any(v.chunks for v in dataset.variables.values())
if (have_chunks and scheduler in ['distributed', 'multiprocessing'] and
engine != 'netcdf4'):

autoclose = have_chunks and scheduler in ['distributed', 'multiprocessing']
if autoclose and engine == 'scipy':
raise NotImplementedError("Writing netCDF files with the %s backend "
"is not currently supported with dask's %s "
"scheduler" % (engine, scheduler))
lock = _get_lock(engine, scheduler, format, path_or_file)
autoclose = (have_chunks and
scheduler in ['distributed', 'multiprocessing'])
lock = _get_write_lock(engine, scheduler, format, path_or_file)

target = path_or_file if path_or_file is not None else BytesIO()
store = store_open(target, mode, format, group, writer,
autoclose=autoclose, lock=lock)
kwargs = dict(autoclose=True) if autoclose else {}
store = store_open(
target, mode, format, group, writer, lock=lock, **kwargs)

if unlimited_dims is None:
unlimited_dims = dataset.encoding.get('unlimited_dims', None)
@@ -735,6 +724,7 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
if not sync:
return store


def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
engine=None, compute=True):
"""Write multiple datasets to disk as netCDF files simultaneously.