diff --git a/CHANGES.rst b/CHANGES.rst index f12f020cf..1a8555a88 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ 2.2.0 (unreleased) ------------------ +- Small numeric arrays are now automatically stored inline. This behavior can + be overridden using the new ``inline_threshold`` argument to the ``AsdfFile`` + constructor. It can also be controlled with the existing + ``set_array_storage`` method of ``AsdfFile`` and the ``all_array_storage`` + argument to ``AsdfFile.write_to``. [#557] 2.1.1 (unreleased) ------------------ diff --git a/asdf/asdf.py b/asdf/asdf.py index ce54abf61..44b482726 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -51,7 +51,7 @@ class AsdfFile(versioning.VersionedMixin): def __init__(self, tree=None, uri=None, extensions=None, version=None, ignore_version_mismatch=True, ignore_unrecognized_tag=False, ignore_implicit_conversion=False, copy_arrays=False, - custom_schema=None): + custom_schema=None, inline_threshold=None): """ Parameters ---------- @@ -98,7 +98,10 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None, files follow custom conventions beyond those enforced by the standard. - """ + inline_threshold : int, optional + Optional threshold size below which arrays will automatically be + stored inline. Defaults to {0}. + """.format(block._DEFAULT_INLINE_THRESHOLD_SIZE) if custom_schema is not None: self._custom_schema = schema.load_custom_schema(custom_schema) @@ -119,7 +122,8 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None, self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = block.BlockManager(self, copy_arrays=copy_arrays) + self._blocks = block.BlockManager(self, copy_arrays=copy_arrays, + inline_threshold=inline_threshold) self._uri = None if tree is None: self.tree = {} diff --git a/asdf/block.py b/asdf/block.py index d75a6af3f..36fc2e797 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -12,6 +12,7 @@ from urllib import parse as urlparse import numpy as np +from numpy.ma.core import masked_array import yaml @@ -25,11 +26,14 @@ from . import yamlutil +_DEFAULT_INLINE_THRESHOLD_SIZE = 50 + + class BlockManager(object): """ Manages the `Block`s associated with a ASDF file. """ - def __init__(self, asdffile, copy_arrays=False): + def __init__(self, asdffile, copy_arrays=False, inline_threshold=None): self._asdffile = weakref.ref(asdffile) self._internal_blocks = [] @@ -44,6 +48,11 @@ def __init__(self, asdffile, copy_arrays=False): 'streamed': self._streamed_blocks } + if inline_threshold is not None: + self._inline_threshold_size = inline_threshold + else: + self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE + self._data_to_block_mapping = {} self._validate_checksums = False self._memmap = not copy_arrays @@ -687,6 +696,20 @@ def get_source(self, block): raise ValueError("block not found.") + def _should_inline(self, array): + + if not np.issubdtype(array.dtype, np.number): + return False + + if isinstance(array, masked_array): + return False + + # Make sure none of the values are too large to store as literals + if (array > 2**52).any(): + return False + + return array.size <= self._inline_threshold_size + def find_or_create_block_for_array(self, arr, ctx): """ For a given array, looks for an existing block containing its @@ -702,8 +725,7 @@ def find_or_create_block_for_array(self, arr, ctx): block : Block """ from .tags.core import ndarray - if (isinstance(arr, ndarray.NDArrayType) and - arr.block is not None): + if (isinstance(arr, ndarray.NDArrayType) and arr.block is not None): if arr.block in self.blocks: return arr.block else: @@ -714,6 +736,10 @@ def find_or_create_block_for_array(self, arr, ctx): if block is not None: return block block = Block(base) + + if self._should_inline(arr): + block._array_storage = 'inline' + self.add(block) self._handle_global_block_settings(ctx, block) return block diff --git a/asdf/commands/tests/test_exploded.py b/asdf/commands/tests/test_exploded.py index e3b2855a6..47629f939 100644 --- a/asdf/commands/tests/test_exploded.py +++ b/asdf/commands/tests/test_exploded.py @@ -23,7 +23,10 @@ def test_explode_then_implode(tmpdir): path = os.path.join(str(tmpdir), 'original.asdf') ff = AsdfFile(tree) - ff.write_to(path) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(path, all_array_storage='internal') assert len(ff.blocks) == 2 result = main.main_from_args(['explode', path]) diff --git a/asdf/tests/helpers.py b/asdf/tests/helpers.py index 665c4148d..a1ed31356 100644 --- a/asdf/tests/helpers.py +++ b/asdf/tests/helpers.py @@ -170,6 +170,9 @@ def assert_roundtrip_tree(tree, tmpdir, *, asdf_check_func=None, """ fname = str(tmpdir.join('test.asdf')) + # Most tests assume that all blocks will be stored internally + init_options.setdefault('inline_threshold', 0) + # First, test writing/reading a BytesIO buffer buff = io.BytesIO() AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options) diff --git a/asdf/tests/test_generic_io.py b/asdf/tests/test_generic_io.py index c78b14ec1..342b8db52 100644 --- a/asdf/tests/test_generic_io.py +++ b/asdf/tests/test_generic_io.py @@ -28,6 +28,12 @@ def tree(request): def _roundtrip(tree, get_write_fd, get_read_fd, write_options={}, read_options={}): + + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + write_options.setdefault('all_array_storage', 'internal') + with get_write_fd() as fd: asdf.AsdfFile(tree).write_to(fd, **write_options) # Work around the fact that generic_io's get_file doesn't have a way of diff --git a/asdf/tests/test_low_level.py b/asdf/tests/test_low_level.py index 8590e607c..7d54a1018 100644 --- a/asdf/tests/test_low_level.py +++ b/asdf/tests/test_low_level.py @@ -128,7 +128,10 @@ def test_invalid_source(small_tree): buff = io.BytesIO() ff = asdf.AsdfFile(small_tree) - ff.write_to(buff) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -802,7 +805,10 @@ def test_deferred_block_loading(small_tree): buff = io.BytesIO() ff = asdf.AsdfFile(small_tree) - ff.write_to(buff, include_block_index=False) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, include_block_index=False, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -869,7 +875,10 @@ def test_large_block_index(): } ff = asdf.AsdfFile(tree) - ff.write_to(buff) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -927,7 +936,10 @@ def test_short_file_find_block_index(): buff = io.BytesIO() ff = asdf.AsdfFile({'arr': np.ndarray([1]), 'arr2': np.ndarray([2])}) - ff.write_to(buff, include_block_index=False) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, include_block_index=False, all_array_storage='internal') buff.write(b'#ASDF BLOCK INDEX\n') buff.write(b'0' * (io.DEFAULT_BUFFER_SIZE * 4)) @@ -1201,3 +1213,82 @@ def test_context_handler_resolve_and_inline(tmpdir): with pytest.raises(OSError): newf.tree['random'][0] + + +def test_inline_threshold(tmpdir): + + tree = { + 'small': np.ones(10), + 'large': np.ones(100) + } + + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 1 + + with asdf.AsdfFile(tree, inline_threshold=10) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 1 + + with asdf.AsdfFile(tree, inline_threshold=5) as af: + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree, inline_threshold=100) as af: + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 + + +def test_inline_threshold_masked(tmpdir): + + mask = np.random.randint(0, 1+1, 20) + masked_array = np.ma.masked_array(np.ones(20), mask=mask) + + tree = { + 'masked': masked_array + } + + # Make sure that masked arrays aren't automatically inlined, even if they + # are small enough + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + tree = { + 'masked': masked_array, + 'normal': np.random.random(20) + } + + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 2 + + +def test_inline_threshold_override(tmpdir): + + tmpfile = str(tmpdir.join('inline.asdf')) + + tree = { + 'small': np.ones(10), + 'large': np.ones(100) + } + + with asdf.AsdfFile(tree) as af: + af.set_array_storage(tree['small'], 'internal') + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree) as af: + af.set_array_storage(tree['large'], 'inline') + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 + + with asdf.AsdfFile(tree) as af: + af.write_to(tmpfile, all_array_storage='internal') + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree) as af: + af.write_to(tmpfile, all_array_storage='inline') + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 diff --git a/asdf/tests/test_reference.py b/asdf/tests/test_reference.py index 87edf142b..74f70657f 100644 --- a/asdf/tests/test_reference.py +++ b/asdf/tests/test_reference.py @@ -32,11 +32,14 @@ def test_external_reference(tmpdir): } external_path = os.path.join(str(tmpdir), 'external.asdf') ext = asdf.AsdfFile(exttree) - ext.write_to(external_path) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ext.write_to(external_path, all_array_storage='internal') external_path = os.path.join(str(tmpdir), 'external2.asdf') ff = asdf.AsdfFile(exttree) - ff.write_to(external_path) + ff.write_to(external_path, all_array_storage='internal') tree = { # The special name "data" here must be an array. This is diff --git a/asdf/tests/test_stream.py b/asdf/tests/test_stream.py index 9b2b1f29d..f3cdf82b4 100644 --- a/asdf/tests/test_stream.py +++ b/asdf/tests/test_stream.py @@ -87,6 +87,9 @@ def test_stream_with_nonstream(): } ff = asdf.AsdfFile(tree) + # Since we're testing with small arrays, force this array to be stored in + # an internal block rather than letting it be automatically put inline. + ff.set_array_storage(ff['nonstream'], 'internal') ff.write_to(buff) for i in range(100): buff.write(np.array([i] * 12, np.float64).tostring()) @@ -112,6 +115,10 @@ def test_stream_real_file(tmpdir): with open(path, 'wb') as fd: ff = asdf.AsdfFile(tree) + # Since we're testing with small arrays, force this array to be stored + # in an internal block rather than letting it be automatically put + # inline. + ff.set_array_storage(ff['nonstream'], 'internal') ff.write_to(fd) for i in range(100): fd.write(np.array([i] * 12, np.float64).tostring()) diff --git a/docs/asdf/arrays.rst b/docs/asdf/arrays.rst index 5e588c120..371dc9163 100644 --- a/docs/asdf/arrays.rst +++ b/docs/asdf/arrays.rst @@ -56,11 +56,37 @@ data being saved. Saving inline arrays -------------------- -For small arrays, you may not care about the efficiency of a binary -representation and just want to save the array contents directly in the YAML -tree. The `~asdf.AsdfFile.set_array_storage` method can be used to set the -storage type of the associated data. The allowed values are ``internal``, -``external``, and ``inline``. +As of `asdf-2.2.0`, small numerical arrays are automatically stored inline. The +default threshold size for inline versus internal arrays can be found with the +following: + +.. code:: + + >>> from asdf.block import _DEFAULT_INLINE_THRESHOLD_SIZE + >>> print(_DEFAULT_INLINE_THRESHOLD_SIZE) + 50 + +The default threshold can be overridden passing the `inline_threshold` argument +to the `asdf.AsdfFile` constructor. Setting `inline_threshold=0` has the effect +of making all small arrays be stored in internal blocks: + +.. runcode:: + + from asdf import AsdfFile + import numpy as np + + # Ordinarilly an array this size would be automatically inlined + my_array = np.ones(10) + tree = {'my_array': my_array} + # Set the inline threshold to 0 to force internal storage + with AsdfFile(tree, inline_threshold=0) as ff: + ff.write_to("test.asdf") + +.. asdf:: test.asdf + +The `~asdf.AsdfFile.set_array_storage` method can be used to set or override +the default storage type of a particular data array. The allowed values are +``internal``, ``external``, and ``inline``. - ``internal``: The default. The array data will be stored in a binary block in the same ASDF file.