From 5ceed65e1a827868aa4861b31a4d4cceac758d8c Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 14:31:10 -0400 Subject: [PATCH 01/10] Implement basic threshold for inlining array data --- asdf/block.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/asdf/block.py b/asdf/block.py index d75a6af3f..513c02d31 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -25,6 +25,9 @@ from . import yamlutil +_DEFAULT_INLINE_THRESHOLD_SIZE = 100 + + class BlockManager(object): """ Manages the `Block`s associated with a ASDF file. @@ -44,6 +47,8 @@ def __init__(self, asdffile, copy_arrays=False): 'streamed': self._streamed_blocks } + self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE + self._data_to_block_mapping = {} self._validate_checksums = False self._memmap = not copy_arrays @@ -702,8 +707,7 @@ def find_or_create_block_for_array(self, arr, ctx): block : Block """ from .tags.core import ndarray - if (isinstance(arr, ndarray.NDArrayType) and - arr.block is not None): + if (isinstance(arr, ndarray.NDArrayType) and arr.block is not None): if arr.block in self.blocks: return arr.block else: @@ -714,6 +718,10 @@ def find_or_create_block_for_array(self, arr, ctx): if block is not None: return block block = Block(base) + + if arr.size <= self._inline_threshold_size: + block._array_storage = 'inline' + self.add(block) self._handle_global_block_settings(ctx, block) return block From b4b5fe2204ec0f7d951977f89d3bd5585883fb77 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 14:52:19 -0400 Subject: [PATCH 02/10] Add top-level option for controlling inline threshold size --- asdf/asdf.py | 10 +++++++--- asdf/block.py | 7 +++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/asdf/asdf.py b/asdf/asdf.py index ce54abf61..44b482726 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -51,7 +51,7 @@ class AsdfFile(versioning.VersionedMixin): def __init__(self, tree=None, uri=None, extensions=None, version=None, ignore_version_mismatch=True, ignore_unrecognized_tag=False, ignore_implicit_conversion=False, copy_arrays=False, - custom_schema=None): + custom_schema=None, inline_threshold=None): """ Parameters ---------- @@ -98,7 +98,10 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None, files follow custom conventions beyond those enforced by the standard. - """ + inline_threshold : int, optional + Optional threshold size below which arrays will automatically be + stored inline. Defaults to {0}. + """.format(block._DEFAULT_INLINE_THRESHOLD_SIZE) if custom_schema is not None: self._custom_schema = schema.load_custom_schema(custom_schema) @@ -119,7 +122,8 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None, self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = block.BlockManager(self, copy_arrays=copy_arrays) + self._blocks = block.BlockManager(self, copy_arrays=copy_arrays, + inline_threshold=inline_threshold) self._uri = None if tree is None: self.tree = {} diff --git a/asdf/block.py b/asdf/block.py index 513c02d31..8d591c7af 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -32,7 +32,7 @@ class BlockManager(object): """ Manages the `Block`s associated with a ASDF file. """ - def __init__(self, asdffile, copy_arrays=False): + def __init__(self, asdffile, copy_arrays=False, inline_threshold=None): self._asdffile = weakref.ref(asdffile) self._internal_blocks = [] @@ -47,7 +47,10 @@ def __init__(self, asdffile, copy_arrays=False): 'streamed': self._streamed_blocks } - self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE + if inline_threshold is not None: + self._inline_threshold_size = inline_threshold + else: + self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE self._data_to_block_mapping = {} self._validate_checksums = False From a25bb0b50d4ec5bffe8376044281a3b935a3e685 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 14:59:11 -0400 Subject: [PATCH 03/10] Set default inline threshold level to 0 for test helpers... Since many tests assume that arrays will be stored internally --- asdf/tests/helpers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asdf/tests/helpers.py b/asdf/tests/helpers.py index 665c4148d..a1ed31356 100644 --- a/asdf/tests/helpers.py +++ b/asdf/tests/helpers.py @@ -170,6 +170,9 @@ def assert_roundtrip_tree(tree, tmpdir, *, asdf_check_func=None, """ fname = str(tmpdir.join('test.asdf')) + # Most tests assume that all blocks will be stored internally + init_options.setdefault('inline_threshold', 0) + # First, test writing/reading a BytesIO buffer buff = io.BytesIO() AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options) From df3a7b52fb5177d8fc2a10640fc8fe8c4cae8950 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 15:05:26 -0400 Subject: [PATCH 04/10] Do not automatically inline masked arrays --- asdf/block.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asdf/block.py b/asdf/block.py index 8d591c7af..aba5fbde0 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -12,6 +12,7 @@ from urllib import parse as urlparse import numpy as np +from numpy.ma.core import masked_array import yaml @@ -723,7 +724,8 @@ def find_or_create_block_for_array(self, arr, ctx): block = Block(base) if arr.size <= self._inline_threshold_size: - block._array_storage = 'inline' + if not isinstance(arr, masked_array): + block._array_storage = 'inline' self.add(block) self._handle_global_block_settings(ctx, block) From 1516a493f3f6bc91b7c69b4fbf85dd70a1834555 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 15:21:23 -0400 Subject: [PATCH 05/10] Adjust inline threshold size down to 50... This is pretty arbitrary but it makes a lot more tests pass without modification. --- asdf/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/block.py b/asdf/block.py index aba5fbde0..a101acbcf 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -26,7 +26,7 @@ from . import yamlutil -_DEFAULT_INLINE_THRESHOLD_SIZE = 100 +_DEFAULT_INLINE_THRESHOLD_SIZE = 50 class BlockManager(object): From 091943b7a028ee6949c1202d3c817010fa5ace12 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 16:26:12 -0400 Subject: [PATCH 06/10] More sophisticated check for automatic inlining of arrays --- asdf/block.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/asdf/block.py b/asdf/block.py index a101acbcf..36fc2e797 100644 --- a/asdf/block.py +++ b/asdf/block.py @@ -696,6 +696,20 @@ def get_source(self, block): raise ValueError("block not found.") + def _should_inline(self, array): + + if not np.issubdtype(array.dtype, np.number): + return False + + if isinstance(array, masked_array): + return False + + # Make sure none of the values are too large to store as literals + if (array > 2**52).any(): + return False + + return array.size <= self._inline_threshold_size + def find_or_create_block_for_array(self, arr, ctx): """ For a given array, looks for an existing block containing its @@ -723,9 +737,8 @@ def find_or_create_block_for_array(self, arr, ctx): return block block = Block(base) - if arr.size <= self._inline_threshold_size: - if not isinstance(arr, masked_array): - block._array_storage = 'inline' + if self._should_inline(arr): + block._array_storage = 'inline' self.add(block) self._handle_global_block_settings(ctx, block) From 85eaae410cca583961a780e0ccd413f8b7c6a73b Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 16:26:57 -0400 Subject: [PATCH 07/10] Update tests to account for automatic inlining of arrays --- asdf/commands/tests/test_exploded.py | 5 ++++- asdf/tests/test_generic_io.py | 6 ++++++ asdf/tests/test_low_level.py | 20 ++++++++++++++++---- asdf/tests/test_reference.py | 7 +++++-- asdf/tests/test_stream.py | 7 +++++++ 5 files changed, 38 insertions(+), 7 deletions(-) diff --git a/asdf/commands/tests/test_exploded.py b/asdf/commands/tests/test_exploded.py index e3b2855a6..47629f939 100644 --- a/asdf/commands/tests/test_exploded.py +++ b/asdf/commands/tests/test_exploded.py @@ -23,7 +23,10 @@ def test_explode_then_implode(tmpdir): path = os.path.join(str(tmpdir), 'original.asdf') ff = AsdfFile(tree) - ff.write_to(path) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(path, all_array_storage='internal') assert len(ff.blocks) == 2 result = main.main_from_args(['explode', path]) diff --git a/asdf/tests/test_generic_io.py b/asdf/tests/test_generic_io.py index c78b14ec1..342b8db52 100644 --- a/asdf/tests/test_generic_io.py +++ b/asdf/tests/test_generic_io.py @@ -28,6 +28,12 @@ def tree(request): def _roundtrip(tree, get_write_fd, get_read_fd, write_options={}, read_options={}): + + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + write_options.setdefault('all_array_storage', 'internal') + with get_write_fd() as fd: asdf.AsdfFile(tree).write_to(fd, **write_options) # Work around the fact that generic_io's get_file doesn't have a way of diff --git a/asdf/tests/test_low_level.py b/asdf/tests/test_low_level.py index 8590e607c..14d27c979 100644 --- a/asdf/tests/test_low_level.py +++ b/asdf/tests/test_low_level.py @@ -128,7 +128,10 @@ def test_invalid_source(small_tree): buff = io.BytesIO() ff = asdf.AsdfFile(small_tree) - ff.write_to(buff) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -802,7 +805,10 @@ def test_deferred_block_loading(small_tree): buff = io.BytesIO() ff = asdf.AsdfFile(small_tree) - ff.write_to(buff, include_block_index=False) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, include_block_index=False, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -869,7 +875,10 @@ def test_large_block_index(): } ff = asdf.AsdfFile(tree) - ff.write_to(buff) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, all_array_storage='internal') buff.seek(0) with asdf.AsdfFile.open(buff) as ff2: @@ -927,7 +936,10 @@ def test_short_file_find_block_index(): buff = io.BytesIO() ff = asdf.AsdfFile({'arr': np.ndarray([1]), 'arr2': np.ndarray([2])}) - ff.write_to(buff, include_block_index=False) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ff.write_to(buff, include_block_index=False, all_array_storage='internal') buff.write(b'#ASDF BLOCK INDEX\n') buff.write(b'0' * (io.DEFAULT_BUFFER_SIZE * 4)) diff --git a/asdf/tests/test_reference.py b/asdf/tests/test_reference.py index 87edf142b..74f70657f 100644 --- a/asdf/tests/test_reference.py +++ b/asdf/tests/test_reference.py @@ -32,11 +32,14 @@ def test_external_reference(tmpdir): } external_path = os.path.join(str(tmpdir), 'external.asdf') ext = asdf.AsdfFile(exttree) - ext.write_to(external_path) + # Since we're testing with small arrays, force all arrays to be stored + # in internal blocks rather than letting some of them be automatically put + # inline. + ext.write_to(external_path, all_array_storage='internal') external_path = os.path.join(str(tmpdir), 'external2.asdf') ff = asdf.AsdfFile(exttree) - ff.write_to(external_path) + ff.write_to(external_path, all_array_storage='internal') tree = { # The special name "data" here must be an array. This is diff --git a/asdf/tests/test_stream.py b/asdf/tests/test_stream.py index 9b2b1f29d..f3cdf82b4 100644 --- a/asdf/tests/test_stream.py +++ b/asdf/tests/test_stream.py @@ -87,6 +87,9 @@ def test_stream_with_nonstream(): } ff = asdf.AsdfFile(tree) + # Since we're testing with small arrays, force this array to be stored in + # an internal block rather than letting it be automatically put inline. + ff.set_array_storage(ff['nonstream'], 'internal') ff.write_to(buff) for i in range(100): buff.write(np.array([i] * 12, np.float64).tostring()) @@ -112,6 +115,10 @@ def test_stream_real_file(tmpdir): with open(path, 'wb') as fd: ff = asdf.AsdfFile(tree) + # Since we're testing with small arrays, force this array to be stored + # in an internal block rather than letting it be automatically put + # inline. + ff.set_array_storage(ff['nonstream'], 'internal') ff.write_to(fd) for i in range(100): fd.write(np.array([i] * 12, np.float64).tostring()) From deac0cd527fe639c6566f48a5c3b8b8620b24fbb Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 9 Oct 2018 16:31:02 -0400 Subject: [PATCH 08/10] Update change log --- CHANGES.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 7ecaa9d15..1d2b71d3b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ 2.2.0 (unreleased) ------------------ +- Small numeric arrays are now automatically stored inline. This behavior can + be overridden using the new ``inline_threshold`` argument to the ``AsdfFile`` + constructor. It can also be controlled with the existing + ``set_array_storage`` method of ``AsdfFile`` and the ``all_array_storage`` + argument to ``AsdfFile.write_to``. [#557] 2.1.1 (unreleased) ------------------ From ab534ac0144bf29a19022ff1b12761a8b7228f13 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Thu, 11 Oct 2018 16:46:19 -0400 Subject: [PATCH 09/10] Add test cases for inline array threshold size --- asdf/tests/test_low_level.py | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/asdf/tests/test_low_level.py b/asdf/tests/test_low_level.py index 14d27c979..7d54a1018 100644 --- a/asdf/tests/test_low_level.py +++ b/asdf/tests/test_low_level.py @@ -1213,3 +1213,82 @@ def test_context_handler_resolve_and_inline(tmpdir): with pytest.raises(OSError): newf.tree['random'][0] + + +def test_inline_threshold(tmpdir): + + tree = { + 'small': np.ones(10), + 'large': np.ones(100) + } + + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 1 + + with asdf.AsdfFile(tree, inline_threshold=10) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 1 + + with asdf.AsdfFile(tree, inline_threshold=5) as af: + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree, inline_threshold=100) as af: + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 + + +def test_inline_threshold_masked(tmpdir): + + mask = np.random.randint(0, 1+1, 20) + masked_array = np.ma.masked_array(np.ones(20), mask=mask) + + tree = { + 'masked': masked_array + } + + # Make sure that masked arrays aren't automatically inlined, even if they + # are small enough + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + tree = { + 'masked': masked_array, + 'normal': np.random.random(20) + } + + with asdf.AsdfFile(tree) as af: + assert len(list(af.blocks.inline_blocks)) == 1 + assert len(list(af.blocks.internal_blocks)) == 2 + + +def test_inline_threshold_override(tmpdir): + + tmpfile = str(tmpdir.join('inline.asdf')) + + tree = { + 'small': np.ones(10), + 'large': np.ones(100) + } + + with asdf.AsdfFile(tree) as af: + af.set_array_storage(tree['small'], 'internal') + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree) as af: + af.set_array_storage(tree['large'], 'inline') + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 + + with asdf.AsdfFile(tree) as af: + af.write_to(tmpfile, all_array_storage='internal') + assert len(list(af.blocks.inline_blocks)) == 0 + assert len(list(af.blocks.internal_blocks)) == 2 + + with asdf.AsdfFile(tree) as af: + af.write_to(tmpfile, all_array_storage='inline') + assert len(list(af.blocks.inline_blocks)) == 2 + assert len(list(af.blocks.internal_blocks)) == 0 From 2d06d133303da315f8e417b773aa252fafd4ab00 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Fri, 12 Oct 2018 13:33:34 -0400 Subject: [PATCH 10/10] Update documentation --- docs/asdf/arrays.rst | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/docs/asdf/arrays.rst b/docs/asdf/arrays.rst index 5e588c120..371dc9163 100644 --- a/docs/asdf/arrays.rst +++ b/docs/asdf/arrays.rst @@ -56,11 +56,37 @@ data being saved. Saving inline arrays -------------------- -For small arrays, you may not care about the efficiency of a binary -representation and just want to save the array contents directly in the YAML -tree. The `~asdf.AsdfFile.set_array_storage` method can be used to set the -storage type of the associated data. The allowed values are ``internal``, -``external``, and ``inline``. +As of `asdf-2.2.0`, small numerical arrays are automatically stored inline. The +default threshold size for inline versus internal arrays can be found with the +following: + +.. code:: + + >>> from asdf.block import _DEFAULT_INLINE_THRESHOLD_SIZE + >>> print(_DEFAULT_INLINE_THRESHOLD_SIZE) + 50 + +The default threshold can be overridden passing the `inline_threshold` argument +to the `asdf.AsdfFile` constructor. Setting `inline_threshold=0` has the effect +of making all small arrays be stored in internal blocks: + +.. runcode:: + + from asdf import AsdfFile + import numpy as np + + # Ordinarilly an array this size would be automatically inlined + my_array = np.ones(10) + tree = {'my_array': my_array} + # Set the inline threshold to 0 to force internal storage + with AsdfFile(tree, inline_threshold=0) as ff: + ff.write_to("test.asdf") + +.. asdf:: test.asdf + +The `~asdf.AsdfFile.set_array_storage` method can be used to set or override +the default storage type of a particular data array. The allowed values are +``internal``, ``external``, and ``inline``. - ``internal``: The default. The array data will be stored in a binary block in the same ASDF file.