Skip to content

Commit

Permalink
Merge pull request #557 from drdavella/inline-threshold
Browse files Browse the repository at this point in the history
Automatically store small numeric arrays inline
  • Loading branch information
drdavella authored Oct 15, 2018
2 parents 8af0595 + 2d06d13 commit fc51e24
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 18 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
2.2.0 (unreleased)
------------------

- Small numeric arrays are now automatically stored inline. This behavior can
be overridden using the new ``inline_threshold`` argument to the ``AsdfFile``
constructor. It can also be controlled with the existing
``set_array_storage`` method of ``AsdfFile`` and the ``all_array_storage``
argument to ``AsdfFile.write_to``. [#557]

2.1.1 (unreleased)
------------------
Expand Down
10 changes: 7 additions & 3 deletions asdf/asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class AsdfFile(versioning.VersionedMixin):
def __init__(self, tree=None, uri=None, extensions=None, version=None,
ignore_version_mismatch=True, ignore_unrecognized_tag=False,
ignore_implicit_conversion=False, copy_arrays=False,
custom_schema=None):
custom_schema=None, inline_threshold=None):
"""
Parameters
----------
Expand Down Expand Up @@ -98,7 +98,10 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None,
files follow custom conventions beyond those enforced by the
standard.
"""
inline_threshold : int, optional
Optional threshold size below which arrays will automatically be
stored inline. Defaults to {0}.
""".format(block._DEFAULT_INLINE_THRESHOLD_SIZE)

if custom_schema is not None:
self._custom_schema = schema.load_custom_schema(custom_schema)
Expand All @@ -119,7 +122,8 @@ def __init__(self, tree=None, uri=None, extensions=None, version=None,
self._fd = None
self._closed = False
self._external_asdf_by_uri = {}
self._blocks = block.BlockManager(self, copy_arrays=copy_arrays)
self._blocks = block.BlockManager(self, copy_arrays=copy_arrays,
inline_threshold=inline_threshold)
self._uri = None
if tree is None:
self.tree = {}
Expand Down
32 changes: 29 additions & 3 deletions asdf/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from urllib import parse as urlparse

import numpy as np
from numpy.ma.core import masked_array

import yaml

Expand All @@ -25,11 +26,14 @@
from . import yamlutil


_DEFAULT_INLINE_THRESHOLD_SIZE = 50


class BlockManager(object):
"""
Manages the `Block`s associated with a ASDF file.
"""
def __init__(self, asdffile, copy_arrays=False):
def __init__(self, asdffile, copy_arrays=False, inline_threshold=None):
self._asdffile = weakref.ref(asdffile)

self._internal_blocks = []
Expand All @@ -44,6 +48,11 @@ def __init__(self, asdffile, copy_arrays=False):
'streamed': self._streamed_blocks
}

if inline_threshold is not None:
self._inline_threshold_size = inline_threshold
else:
self._inline_threshold_size = _DEFAULT_INLINE_THRESHOLD_SIZE

self._data_to_block_mapping = {}
self._validate_checksums = False
self._memmap = not copy_arrays
Expand Down Expand Up @@ -687,6 +696,20 @@ def get_source(self, block):

raise ValueError("block not found.")

def _should_inline(self, array):

if not np.issubdtype(array.dtype, np.number):
return False

if isinstance(array, masked_array):
return False

# Make sure none of the values are too large to store as literals
if (array > 2**52).any():
return False

return array.size <= self._inline_threshold_size

def find_or_create_block_for_array(self, arr, ctx):
"""
For a given array, looks for an existing block containing its
Expand All @@ -702,8 +725,7 @@ def find_or_create_block_for_array(self, arr, ctx):
block : Block
"""
from .tags.core import ndarray
if (isinstance(arr, ndarray.NDArrayType) and
arr.block is not None):
if (isinstance(arr, ndarray.NDArrayType) and arr.block is not None):
if arr.block in self.blocks:
return arr.block
else:
Expand All @@ -714,6 +736,10 @@ def find_or_create_block_for_array(self, arr, ctx):
if block is not None:
return block
block = Block(base)

if self._should_inline(arr):
block._array_storage = 'inline'

self.add(block)
self._handle_global_block_settings(ctx, block)
return block
Expand Down
5 changes: 4 additions & 1 deletion asdf/commands/tests/test_exploded.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ def test_explode_then_implode(tmpdir):

path = os.path.join(str(tmpdir), 'original.asdf')
ff = AsdfFile(tree)
ff.write_to(path)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(path, all_array_storage='internal')
assert len(ff.blocks) == 2

result = main.main_from_args(['explode', path])
Expand Down
3 changes: 3 additions & 0 deletions asdf/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ def assert_roundtrip_tree(tree, tmpdir, *, asdf_check_func=None,
"""
fname = str(tmpdir.join('test.asdf'))

# Most tests assume that all blocks will be stored internally
init_options.setdefault('inline_threshold', 0)

# First, test writing/reading a BytesIO buffer
buff = io.BytesIO()
AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options)
Expand Down
6 changes: 6 additions & 0 deletions asdf/tests/test_generic_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ def tree(request):

def _roundtrip(tree, get_write_fd, get_read_fd,
write_options={}, read_options={}):

# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
write_options.setdefault('all_array_storage', 'internal')

with get_write_fd() as fd:
asdf.AsdfFile(tree).write_to(fd, **write_options)
# Work around the fact that generic_io's get_file doesn't have a way of
Expand Down
99 changes: 95 additions & 4 deletions asdf/tests/test_low_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ def test_invalid_source(small_tree):
buff = io.BytesIO()

ff = asdf.AsdfFile(small_tree)
ff.write_to(buff)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -802,7 +805,10 @@ def test_deferred_block_loading(small_tree):
buff = io.BytesIO()

ff = asdf.AsdfFile(small_tree)
ff.write_to(buff, include_block_index=False)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, include_block_index=False, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -869,7 +875,10 @@ def test_large_block_index():
}

ff = asdf.AsdfFile(tree)
ff.write_to(buff)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, all_array_storage='internal')

buff.seek(0)
with asdf.AsdfFile.open(buff) as ff2:
Expand Down Expand Up @@ -927,7 +936,10 @@ def test_short_file_find_block_index():
buff = io.BytesIO()

ff = asdf.AsdfFile({'arr': np.ndarray([1]), 'arr2': np.ndarray([2])})
ff.write_to(buff, include_block_index=False)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ff.write_to(buff, include_block_index=False, all_array_storage='internal')

buff.write(b'#ASDF BLOCK INDEX\n')
buff.write(b'0' * (io.DEFAULT_BUFFER_SIZE * 4))
Expand Down Expand Up @@ -1201,3 +1213,82 @@ def test_context_handler_resolve_and_inline(tmpdir):

with pytest.raises(OSError):
newf.tree['random'][0]


def test_inline_threshold(tmpdir):
    """Check how ``inline_threshold`` controls inline vs. internal storage."""

    tree = {
        'small': np.ones(10),
        'large': np.ones(100)
    }

    # Each case: (inline_threshold kwarg or None for the default,
    #             expected inline block count, expected internal block count)
    cases = [
        (None, 1, 1),   # default threshold: only the small array is inlined
        (10, 1, 1),     # threshold equal to the small array's size
        (5, 0, 2),      # threshold below both arrays: nothing inlined
        (100, 2, 0),    # threshold covering both arrays: everything inlined
    ]

    for threshold, expected_inline, expected_internal in cases:
        kwargs = {} if threshold is None else {'inline_threshold': threshold}
        with asdf.AsdfFile(tree, **kwargs) as af:
            assert len(list(af.blocks.inline_blocks)) == expected_inline
            assert len(list(af.blocks.internal_blocks)) == expected_internal


def test_inline_threshold_masked(tmpdir):
    """Masked arrays must never be inlined, regardless of their size."""

    mask = np.random.randint(0, 1 + 1, 20)
    marr = np.ma.masked_array(np.ones(20), mask=mask)

    # A lone masked array: data and mask each get an internal block,
    # and nothing is stored inline even though the array is small.
    with asdf.AsdfFile({'masked': marr}) as af:
        assert len(list(af.blocks.inline_blocks)) == 0
        assert len(list(af.blocks.internal_blocks)) == 2

    # Mixing in an ordinary small array: only that one is inlined,
    # while the masked array still occupies two internal blocks.
    tree = {
        'masked': marr,
        'normal': np.random.random(20)
    }
    with asdf.AsdfFile(tree) as af:
        assert len(list(af.blocks.inline_blocks)) == 1
        assert len(list(af.blocks.internal_blocks)) == 2


def test_inline_threshold_override(tmpdir):
    """Per-array and per-write storage settings win over the threshold."""

    tmpfile = str(tmpdir.join('inline.asdf'))

    tree = {
        'small': np.ones(10),
        'large': np.ones(100)
    }

    def counts(af):
        # (inline, internal) block counts for an open AsdfFile
        return (len(list(af.blocks.inline_blocks)),
                len(list(af.blocks.internal_blocks)))

    # set_array_storage forces the small array out of inline storage
    with asdf.AsdfFile(tree) as af:
        af.set_array_storage(tree['small'], 'internal')
        assert counts(af) == (0, 2)

    # ...and can likewise force the large array into inline storage
    with asdf.AsdfFile(tree) as af:
        af.set_array_storage(tree['large'], 'inline')
        assert counts(af) == (2, 0)

    # all_array_storage on write_to overrides the automatic choice too
    with asdf.AsdfFile(tree) as af:
        af.write_to(tmpfile, all_array_storage='internal')
        assert counts(af) == (0, 2)

    with asdf.AsdfFile(tree) as af:
        af.write_to(tmpfile, all_array_storage='inline')
        assert counts(af) == (2, 0)
7 changes: 5 additions & 2 deletions asdf/tests/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,14 @@ def test_external_reference(tmpdir):
}
external_path = os.path.join(str(tmpdir), 'external.asdf')
ext = asdf.AsdfFile(exttree)
ext.write_to(external_path)
# Since we're testing with small arrays, force all arrays to be stored
# in internal blocks rather than letting some of them be automatically put
# inline.
ext.write_to(external_path, all_array_storage='internal')

external_path = os.path.join(str(tmpdir), 'external2.asdf')
ff = asdf.AsdfFile(exttree)
ff.write_to(external_path)
ff.write_to(external_path, all_array_storage='internal')

tree = {
# The special name "data" here must be an array. This is
Expand Down
7 changes: 7 additions & 0 deletions asdf/tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def test_stream_with_nonstream():
}

ff = asdf.AsdfFile(tree)
# Since we're testing with small arrays, force this array to be stored in
# an internal block rather than letting it be automatically put inline.
ff.set_array_storage(ff['nonstream'], 'internal')
ff.write_to(buff)
for i in range(100):
buff.write(np.array([i] * 12, np.float64).tostring())
Expand All @@ -112,6 +115,10 @@ def test_stream_real_file(tmpdir):

with open(path, 'wb') as fd:
ff = asdf.AsdfFile(tree)
# Since we're testing with small arrays, force this array to be stored
# in an internal block rather than letting it be automatically put
# inline.
ff.set_array_storage(ff['nonstream'], 'internal')
ff.write_to(fd)
for i in range(100):
fd.write(np.array([i] * 12, np.float64).tostring())
Expand Down
36 changes: 31 additions & 5 deletions docs/asdf/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,37 @@ data being saved.
Saving inline arrays
--------------------

For small arrays, you may not care about the efficiency of a binary
representation and just want to save the array contents directly in the YAML
tree. The `~asdf.AsdfFile.set_array_storage` method can be used to set the
storage type of the associated data. The allowed values are ``internal``,
``external``, and ``inline``.
As of `asdf-2.2.0`, small numerical arrays are automatically stored inline. The
default threshold size for inline versus internal arrays can be found with the
following:

.. code::

    >>> from asdf.block import _DEFAULT_INLINE_THRESHOLD_SIZE
    >>> print(_DEFAULT_INLINE_THRESHOLD_SIZE)
    50
The default threshold can be overridden by passing the `inline_threshold`
argument to the `asdf.AsdfFile` constructor. Setting `inline_threshold=0`
causes all arrays, regardless of size, to be stored in internal blocks:

.. runcode::

from asdf import AsdfFile
import numpy as np

# Ordinarily an array this size would be automatically inlined
my_array = np.ones(10)
tree = {'my_array': my_array}
# Set the inline threshold to 0 to force internal storage
with AsdfFile(tree, inline_threshold=0) as ff:
ff.write_to("test.asdf")

.. asdf:: test.asdf

The `~asdf.AsdfFile.set_array_storage` method can be used to set or override
the default storage type of a particular data array. The allowed values are
``internal``, ``external``, and ``inline``.

- ``internal``: The default. The array data will be
stored in a binary block in the same ASDF file.
Expand Down

0 comments on commit fc51e24

Please sign in to comment.