diff --git a/CHANGES.rst b/CHANGES.rst index 7870758b7..fb0f051c8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,12 @@ is still effective and the active memory maps may still require the file to stay open in case ``copy_arrays`` is ``False``. [#573] +- Storage of arbitrary precision integers is now provided by + ``asdf.IntegerType``. Reading a file with integer literals that are too + large now causes only a warning instead of a validation error. This is to + provide backwards compatibility for files that were created with a buggy + version of ASDF (see #553 below). [#566] + 2.1.1 (unreleased) ------------------ diff --git a/asdf-standard b/asdf-standard index 0bc7aa7c6..dfada9046 160000 --- a/asdf-standard +++ b/asdf-standard @@ -1 +1 @@ -Subproject commit 0bc7aa7c6b7b2099174f6ac40e2590f366f8b092 +Subproject commit dfada904677a6b7f5e7473977fe9a14c4ecb8277 diff --git a/asdf/__init__.py b/asdf/__init__.py index 224028fba..4586236be 100644 --- a/asdf/__init__.py +++ b/asdf/__init__.py @@ -15,7 +15,7 @@ __all__ = [ 'AsdfFile', 'CustomType', 'AsdfExtension', 'Stream', 'open', 'test', - 'commands', 'ExternalArrayReference' + 'commands', 'IntegerType', 'ExternalArrayReference' ] try: @@ -38,6 +38,7 @@ from .extension import AsdfExtension from .stream import Stream from . 
def validate_large_literals(instance, reading=False):
    """
    Check every integer found in the tree against the accepted literal range.

    Parameters
    ----------
    instance : object
        The tree to inspect. It is traversed with ``treeutil.iter_tree``.

    reading : bool, optional
        When `False` (the default) the first out-of-range integer raises
        `ValidationError`. When `True` each out-of-range integer only
        triggers a warning and the traversal continues.
    """
    # Accepted literals lie in the closed range [-(2**51 - 2), 2**51 - 1]
    largest = (1 << 51) - 1
    smallest = -((1 << 51) - 2)

    for node in treeutil.iter_tree(instance):
        if not isinstance(node, Integral):
            continue

        if node > largest or node < smallest:
            if reading:
                warnings.warn(
                    "Invalid integer literal value {0} detected while reading file. "
                    "The value has been read safely, but the file should be "
                    "fixed.".format(node)
                )
            else:
                raise ValidationError(
                    "Integer value {0} is too large to safely represent as a "
                    "literal in ASDF".format(node))
class IntegerType(AsdfType):
    """
    Enables the storage of arbitrarily large integer values

    The ASDF Standard mandates that integer literals in the tree can be no
    larger than 52 bits. Use of this class enables the storage of arbitrarily
    large integer values.

    When reading files that contain arbitrarily large integers, the values that
    are restored in the tree will be `IntegerType` instances. They compare
    equal to the corresponding Python `int` and can be converted with `int`
    or `float`.

    Parameters
    ----------

    value: `numbers.Integral`
        A Python integral value (e.g. `int` or `numpy.integer`)

    storage_type: `str`, optional
        Optionally overrides the storage type of the array used to represent
        the integer value. Valid values are "internal" (the default) and
        "inline"

    Examples
    --------

    >>> import asdf
    >>> import random
    >>> # Create a large integer value
    >>> largeval = random.getrandbits(100)
    >>> # Store the large integer value to the tree using asdf.IntegerType
    >>> tree = dict(largeval=asdf.IntegerType(largeval))
    >>> with asdf.AsdfFile(tree) as af:
    ...     af.write_to('largeval.asdf')
    >>> with asdf.open('largeval.asdf') as aa:
    ...     assert aa['largeval'] == largeval
    """

    name = 'core/integer'
    version = '1.0.0'

    # Maps each serialization context to a dict of
    # {absolute value: uint32 word array} so that equal values written through
    # the same context can share a single array.
    # NOTE(review): nothing in this class removes entries, so every ctx used
    # as a key stays referenced by this class-level dict -- confirm whether
    # cleanup (or weak references) is needed.
    _value_cache = dict()

    def __init__(self, value, storage_type='internal'):
        assert storage_type in ['internal', 'inline'], "Invalid storage type given"
        self._value = value
        self._sign = '-' if value < 0 else '+'
        self._storage = storage_type

    @classmethod
    def to_tree(cls, node, ctx):
        """
        Serialize ``node`` as a dict with ``words``, ``sign`` and ``string``.

        ``words`` holds the magnitude split into 32-bit words, least
        significant word first; ``sign`` is ``'+'`` or ``'-'``; ``string`` is
        the decimal representation of the signed value.
        """
        context_cache = cls._value_cache.setdefault(ctx, dict())

        # Convert to a Python int *before* taking the absolute value: numpy
        # fixed-width integers cannot represent the magnitude of their most
        # negative value, so np.abs() on e.g. np.int64(-2**63) stays negative
        # and the packing loop below would emit no words at all.
        abs_value = abs(int(node._value))

        # Only arrays destined for internal storage are shared. Reusing a
        # cached array for an 'inline' node would hand the shared array to
        # ctx.set_array_storage with a different storage type, which
        # presumably also affects the nodes that already reference it.
        shareable = node._storage == 'internal'
        array = context_cache.get(abs_value) if shareable else None

        if array is None:
            # pack integer value into 32-bit words
            words = []
            remaining = abs_value
            while remaining > 0:
                words.append(remaining & 0xffffffff)
                remaining >>= 32

            array = np.array(words, dtype=np.uint32)
            if shareable:
                context_cache[abs_value] = array

        tree = dict()
        ctx.set_array_storage(array, node._storage)
        tree['words'] = custom_tree_to_tagged_tree(array, ctx)
        tree['sign'] = node._sign
        tree['string'] = str(int(node._value))

        return tree

    @classmethod
    def from_tree(cls, tree, ctx):
        """
        Rebuild an instance from the ``words`` and ``sign`` entries of ``tree``.
        """
        value = 0
        # Words are stored least significant first, so fold them in reverse.
        for word in tree['words'][::-1]:
            value <<= 32
            value |= int(word)

        if tree['sign'] == '-':
            value = -value

        return cls(value)

    def __int__(self):
        return int(self._value)

    def __float__(self):
        return float(self._value)

    def __eq__(self, other):
        if isinstance(other, Integral):
            return self._value == other
        if isinstance(other, IntegerType):
            return self._value == other._value
        # Let Python try the reflected comparison and then fall back to
        # identity, so comparing against an unrelated object yields False
        # instead of raising (which would break ``in``, ``list.index`` etc.).
        return NotImplemented

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable. Hash like
        # the plain int this instance compares equal to.
        return hash(int(self._value))

    def __repr__(self):
        return "IntegerType({})".format(self._value)
def test_integer_conversion():
    """
    An IntegerType must compare equal to, and convert like, the int it wraps.
    """
    random.seed(0)
    raw = random.getrandbits(1000)
    wrapped = asdf.IntegerType(raw)

    assert wrapped == raw
    # Same checks as int(...) and float(...) applied to both sides, in order
    for convert in (int, float):
        assert convert(wrapped) == convert(raw)
+ assert len(w) == 2 + assert str(w[0].message).startswith('Invalid integer literal value') + assert str(w[1].message).startswith('Invalid integer literal value') + + def test_nested_array(): s = { 'type': 'object', diff --git a/asdf/versioning.py b/asdf/versioning.py index 00a92997e..f87c637ce 100644 --- a/asdf/versioning.py +++ b/asdf/versioning.py @@ -132,15 +132,15 @@ def __hash__(self): return super(AsdfSpec, self).__hash__() -default_version = AsdfVersion('1.2.0') - - supported_versions = [ AsdfVersion('1.0.0'), AsdfVersion('1.1.0'), - AsdfVersion('1.2.0') + AsdfVersion('1.2.0'), + AsdfVersion('1.3.0') ] +default_version = supported_versions[-1] + class VersionedMixin(object): _version = default_version diff --git a/docs/asdf/features.rst b/docs/asdf/features.rst index 3b8d537a8..77f916375 100644 --- a/docs/asdf/features.rst +++ b/docs/asdf/features.rst @@ -18,6 +18,19 @@ respectively. The top-level tree object behaves like a Python dictionary and supports arbitrary nesting of data structures. For simple examples of creating and reading trees, see :ref:`overview`. +.. note:: + + The ASDF Standard imposes a maximum size of 52 bits for integer literals in + the tree (see the ASDF Standard documentation + for details and justification). Attempting to store a larger value will + result in a validation error. + + Integers and floats of up to 64 bits can be stored inside of :mod:`numpy` + arrays (see below). + + For arbitrary precision integer support, see `IntegerType`. + + One of the key features of ASDF is its ability to serialize :mod:`numpy` arrays. This is discussed in detail in :ref:`array-data`.