Skip to content

Commit

Permalink
Merge pull request #557 from drdavella/inline-threshold
Browse files Browse the repository at this point in the history
Automatically store small numeric arrays inline
  • Loading branch information
drdavella authored Oct 15, 2018
2 parents 8af0595 + 2d06d13 commit fc51e24
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 18 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
2.2.0 (unreleased)
------------------

- Small numeric arrays are now automatically stored inline. This behavior can
be overridden using the new ``inline_threshold`` argument to the ``AsdfFile``
constructor. It can also be controlled with the existing
``set_array_storage`` method of ``AsdfFile`` and the ``all_array_storage``
argument to ``AsdfFile.write_to``. [#557]

2.1.1 (unreleased)
------------------
Expand Down
10 changes: 7 additions & 3 deletions asdf/asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class AsdfFile(versioning.VersionedMixin):
def __init__(self, tree=None, uri=None, extensions=None, version=None,
ignore_version_mismatch=True, ignore_unrecognized_tag=False,
ignore_implicit_conversion=False, copy_arrays=False,
custom_schema=None):
custom_schema=None, inline_threshold=None):
"""
Parameters
----------
Expand Down Expand Up @@ -98,7 +98,10 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None,
files follow custom conventions beyond those enforced by the
standard.
"""
inline_threshold : int, optional
Optional threshold size below which arrays will automatically be
stored inline. Defaults to {0}.
""".format(block._DEFAULT_INLINE_THRESHOLD_SIZE)

if custom_schema is not None:
self._custom_schema = schema.load_custom_schema(custom_schema)
Expand All @@ -119,7 +122,8 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None,
self._fd = None
self._closed = False
self._external_asdf_by_uri = {}
self._blocks = block.BlockManager(self, copy_arrays=copy_arrays)
self._blocks = block.BlockManager(self, copy_arrays=copy_arrays,
inline_threshold=inline_threshold)
self._uri = None
if tree is None:
self.tree = {}
Expand Down
32 changes: 29 additions & 3 deletions asdf/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from urllib import parse as urlparse

import numpy as np
from numpy.ma.core import masked_array

import yaml

Expand All @@ -25,11 +26,14 @@
from . import yamlutil


_DEFAULT_INLINE_THRESHOLD_SIZE = 50


class BlockManager(object):
"""
Manages the `Block`s associated with a ASDF file.
"""
def __init__(self, asdffile, copy_arrays=False):
def __init__(self, asdffile, copy_arrays=False, inline_threshold=None):
self._asdffile = weakref.ref(asdffile)

self._internal_blocks = []
Expand All @@ -44,6 +48,11 @@ def __init__(self, asdffile, copy_arrays=False):
'streamed': self._streamed_blocks
}

if inline_threshold is not None:
self._inline_threshold_size = inline_threshold
else:
self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE

self._data_to_block_mapping = {}
self._validate_checksums = False
self._memmap = not copy_arrays
Expand Down Expand Up @@ -687,6 +696,20 @@ def get_source(self, block):

raise ValueError("block not found.")

def _should_inline(self, array):

if not np.issubdtype(array.dtype, np.number):
return False

if isinstance(array, masked_array):
return False

# Make sure none of the values are too large to store as literals
if (array > 2**52).any():
return False

return array.size <= self._inline_threshold_size

def find_or_create_block_for_array(self, arr, ctx):
"""
For a given array, looks for an existing block containing its
Expand All @@ -702,8 +725,7 @@ def find_or_create_block_for_array(self, arr, ctx):
block : Block
"""
from .tags.core import ndarray
if (isinstance(arr, ndarray.NDArrayType) and
arr.block is not None):
if (isinstance(arr, ndarray.NDArrayType) and arr.block is not None):
if arr.block in self.blocks:
return arr.block
else:
Expand All @@ -714,6 +736,10 @@ def find_or_create_block_for_array(self, arr, ctx):
if block is not None:
return block
block = Block(base)

if self._should_inline(arr):
block._array_storage = 'inline'

self.add(block)
self._handle_global_block_settings(ctx, block)
return block
Expand Down
5 changes: 4 additions & 1 deletion asdf/commands/tests/test_exploded.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ def test_explode_then_implode(tmpdir):

path = os.path.join(str(tmpdir), 'original.asdf')
ff = AsdfFile(tree)
ff.write_to(path)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(path, all_array_storage='internal')
assert len(ff.blocks) == 2

result = main.main_from_args(['explode', path])
Expand Down
3 changes: 3 additions & 0 deletions asdf/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ def assert_roundtrip_tree(tree, tmpdir, *, asdf_check_func=None,
"""
fname = str(tmpdir.join('test.asdf'))

# Most tests assume that all blocks will be stored internally
init_options.setdefault('inline_threshold', 0)

# First, test writing/reading a BytesIO buffer
buff = io.BytesIO()
AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options)
Expand Down
6 changes: 6 additions & 0 deletions asdf/tests/test_generic_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ def tree(request):

def _roundtrip(tree, get_write_fd, get_read_fd,
write_options={}, read_options={}):

# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
write_options.setdefault('all_array_storage', 'internal')

with get_write_fd() as fd:
asdf.AsdfFile(tree).write_to(fd, **write_options)
# Work around the fact that generic_io's get_file doesn't have a way of
Expand Down
99 changes: 95 additions & 4 deletions asdf/tests/test_low_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ def test_invalid_source(small_tree):
buff = io.BytesIO()

ff = asdf.AsdfFile(small_tree)
ff.write_to(buff)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -802,7 +805,10 @@ def test_deferred_block_loading(small_tree):
buff = io.BytesIO()

ff = asdf.AsdfFile(small_tree)
ff.write_to(buff, include_block_index=False)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, include_block_index=False, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -869,7 +875,10 @@ def test_large_block_index():
}

ff = asdf.AsdfFile(tree)
ff.write_to(buff)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -927,7 +936,10 @@ def test_short_file_find_block_index():
buff = io.BytesIO()

ff = asdf.AsdfFile({'arr': np.ndarray([1]), 'arr2': np.ndarray([2])})
ff.write_to(buff, include_block_index=False)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, include_block_index=False, all_array_storage='internal')

buff.write(b'#ASDF BLOCK INDEX\n')
buff.write(b'0' * (io.DEFAULT_BUFFER_SIZE * 4))
Expand Down Expand Up @@ -1201,3 +1213,82 @@ def test_context_handler_resolve_and_inline(tmpdir):

with pytest.raises(OSError):
newf.tree['random'][0]


def test_inline_threshold(tmpdir):
    """Check how ``inline_threshold`` controls inline vs. internal storage."""

    tree = {
        'small': np.ones(10),
        'large': np.ones(100)
    }

    # Each case: (inline_threshold kwarg or None for the default,
    #             expected inline block count, expected internal block count)
    cases = [
        (None, 1, 1),   # default threshold: only the small array is inlined
        (10, 1, 1),     # threshold equal to the small array's size
        (5, 0, 2),      # threshold below both arrays: nothing inlined
        (100, 2, 0),    # threshold covering both arrays: everything inlined
    ]

    for threshold, expected_inline, expected_internal in cases:
        kwargs = {} if threshold is None else {'inline_threshold': threshold}
        with asdf.AsdfFile(tree, **kwargs) as af:
            assert len(list(af.blocks.inline_blocks)) == expected_inline
            assert len(list(af.blocks.internal_blocks)) == expected_internal


def test_inline_threshold_masked(tmpdir):
    """Masked arrays must never be inlined, regardless of their size."""

    mask = np.random.randint(0, 1 + 1, 20)
    marr = np.ma.masked_array(np.ones(20), mask=mask)

    # A lone masked array: data and mask each get an internal block,
    # and nothing is stored inline even though the array is small.
    with asdf.AsdfFile({'masked': marr}) as af:
        assert len(list(af.blocks.inline_blocks)) == 0
        assert len(list(af.blocks.internal_blocks)) == 2

    # Mixing in an ordinary small array: only that one is inlined,
    # while the masked array still occupies two internal blocks.
    tree = {
        'masked': marr,
        'normal': np.random.random(20)
    }
    with asdf.AsdfFile(tree) as af:
        assert len(list(af.blocks.inline_blocks)) == 1
        assert len(list(af.blocks.internal_blocks)) == 2


def test_inline_threshold_override(tmpdir):
    """Per-array and per-write storage settings win over the threshold."""

    tmpfile = str(tmpdir.join('inline.asdf'))

    tree = {
        'small': np.ones(10),
        'large': np.ones(100)
    }

    def counts(af):
        # (inline, internal) block counts for an open AsdfFile
        return (len(list(af.blocks.inline_blocks)),
                len(list(af.blocks.internal_blocks)))

    # set_array_storage forces the small array out of inline storage
    with asdf.AsdfFile(tree) as af:
        af.set_array_storage(tree['small'], 'internal')
        assert counts(af) == (0, 2)

    # ...and can likewise force the large array into inline storage
    with asdf.AsdfFile(tree) as af:
        af.set_array_storage(tree['large'], 'inline')
        assert counts(af) == (2, 0)

    # all_array_storage on write_to overrides the automatic choice too
    with asdf.AsdfFile(tree) as af:
        af.write_to(tmpfile, all_array_storage='internal')
        assert counts(af) == (0, 2)

    with asdf.AsdfFile(tree) as af:
        af.write_to(tmpfile, all_array_storage='inline')
        assert counts(af) == (2, 0)
7 changes: 5 additions & 2 deletions asdf/tests/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,14 @@ def test_external_reference(tmpdir):
}
external_path = os.path.join(str(tmpdir), 'external.asdf')
ext = asdf.AsdfFile(exttree)
ext.write_to(external_path)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ext.write_to(external_path, all_array_storage='internal')

external_path = os.path.join(str(tmpdir), 'external2.asdf')
ff = asdf.AsdfFile(exttree)
ff.write_to(external_path)
ff.write_to(external_path, all_array_storage='internal')

tree = {
# The special name "data" here must be an array. This is
Expand Down
7 changes: 7 additions & 0 deletions asdf/tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def test_stream_with_nonstream():
}

ff = asdf.AsdfFile(tree)
# Since we're testing with small arrays, force this array to be stored in
# an internal block rather than letting it be automatically put inline.
ff.set_array_storage(ff['nonstream'], 'internal')
ff.write_to(buff)
for i in range(100):
buff.write(np.array([i] * 12, np.float64).tostring())
Expand All @@ -112,6 +115,10 @@ def test_stream_real_file(tmpdir):

with open(path, 'wb') as fd:
ff = asdf.AsdfFile(tree)
# Since we're testing with small arrays, force this array to be stored
# in an internal block rather than letting it be automatically put
# inline.
ff.set_array_storage(ff['nonstream'], 'internal')
ff.write_to(fd)
for i in range(100):
fd.write(np.array([i] * 12, np.float64).tostring())
Expand Down
36 changes: 31 additions & 5 deletions docs/asdf/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,37 @@ data being saved.
Saving inline arrays
--------------------

For small arrays, you may not care about the efficiency of a binary
representation and just want to save the array contents directly in the YAML
tree. The `~asdf.AsdfFile.set_array_storage` method can be used to set the
storage type of the associated data. The allowed values are ``internal``,
``external``, and ``inline``.
As of `asdf-2.2.0`, small numerical arrays are automatically stored inline. The
default threshold size for inline versus internal arrays can be found with the
following:

.. code::

    >>> from asdf.block import _DEFAULT_INLINE_THRESHOLD_SIZE
    >>> print(_DEFAULT_INLINE_THRESHOLD_SIZE)
    50
The default threshold can be overridden by passing the `inline_threshold`
argument to the `asdf.AsdfFile` constructor. Setting `inline_threshold=0`
causes all arrays, regardless of size, to be stored in internal blocks:

.. runcode::

from asdf import AsdfFile
import numpy as np

# Ordinarily an array this size would be automatically inlined
my_array = np.ones(10)
tree = {'my_array': my_array}
# Set the inline threshold to 0 to force internal storage
with AsdfFile(tree, inline_threshold=0) as ff:
ff.write_to("test.asdf")

.. asdf:: test.asdf

The `~asdf.AsdfFile.set_array_storage` method can be used to set or override
the default storage type of a particular data array. The allowed values are
``internal``, ``external``, and ``inline``.

- ``internal``: The default. The array data will be
stored in a binary block in the same ASDF file.
Expand Down

0 comments on commit fc51e24

Please sign in to comment.