Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for arbitrary precision integers #566

Merged
merged 10 commits into from
Oct 24, 2018
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
is still effective and the active memory maps may still require the file
to stay open in case ``copy_arrays`` is ``False``. [#573]

- Storage of arbitrary precision integers is now provided by
``asdf.IntegerType``. Reading a file with integer literals that are too
large now causes only a warning instead of a validation error. This is to
provide backwards compatibility for files that were created with a buggy
version of ASDF (see #553 below). [#566]

2.1.1 (unreleased)
------------------

Expand Down
2 changes: 1 addition & 1 deletion asdf-standard
3 changes: 2 additions & 1 deletion asdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

__all__ = [
'AsdfFile', 'CustomType', 'AsdfExtension', 'Stream', 'open', 'test',
'commands', 'ExternalArrayReference'
'commands', 'IntegerType', 'ExternalArrayReference'
]

try:
Expand All @@ -38,6 +38,7 @@
from .extension import AsdfExtension
from .stream import Stream
from . import commands
from .tags.core import IntegerType
from .tags.core.external_reference import ExternalArrayReference

from jsonschema import ValidationError
Expand Down
11 changes: 6 additions & 5 deletions asdf/asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,13 +414,14 @@ def comments(self):
"""
return self._comments

def _validate(self, tree, custom=True):
def _validate(self, tree, custom=True, reading=False):
tagged_tree = yamlutil.custom_tree_to_tagged_tree(
tree, self)
schema.validate(tagged_tree, self)
schema.validate(tagged_tree, self, reading=reading)
# Perform secondary validation pass if requested
if custom and self._custom_schema:
schema.validate(tagged_tree, self, self._custom_schema)
schema.validate(tagged_tree, self, self._custom_schema,
reading=reading)

def validate(self):
"""
Expand Down Expand Up @@ -651,10 +652,10 @@ def _open_asdf(cls, self, fd, uri=None, mode='r',

tree = reference.find_references(tree, self)
if not do_not_fill_defaults:
schema.fill_defaults(tree, self)
schema.fill_defaults(tree, self, reading=True)

try:
self._validate(tree)
self._validate(tree, reading=True)
except ValidationError:
self.close()
raise
Expand Down
37 changes: 29 additions & 8 deletions asdf/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,21 +458,33 @@ def get_validator(schema={}, ctx=None, validators=None, url_mapping=None,
return validator


def validate_large_literals(instance):
def validate_large_literals(instance, reading=False):
"""
Validate that the tree has no large numeric literals.
"""
# We can count on 52 bits of precision
for instance in treeutil.iter_tree(instance):
if (isinstance(instance, (Integral)) and (
instance > ((1 << 51) - 1) or
instance < -((1 << 51) - 2))):

if not isinstance(instance, Integral):
continue

if instance <= ((1 << 51) - 1) and instance >= -((1 << 51) - 2):
continue

if not reading:
raise ValidationError(
"Integer value {0} is too large to safely represent as a "
"literal in ASDF".format(instance))

warnings.warn(
"Invalid integer literal value {0} detected while reading file. "
"The value has been read safely, but the file should be "
"fixed.".format(instance)
)

def validate(instance, ctx=None, schema={}, validators=None, *args, **kwargs):

def validate(instance, ctx=None, schema={}, validators=None, reading=False,
*args, **kwargs):
"""
Validate the given instance (which must be a tagged tree) against
the appropriate schema. The schema itself is located using the
Expand All @@ -495,6 +507,11 @@ def validate(instance, ctx=None, schema={}, validators=None, *args, **kwargs):
validators : dict, optional
A dictionary mapping properties to validators to use (instead
of the built-in ones and ones provided by extension types).

reading: bool, optional
Indicates whether validation is being performed when the file is being
read. This is useful to allow for different validation behavior when
reading vs writing files.
"""
if ctx is None:
from .asdf import AsdfFile
Expand All @@ -504,10 +521,10 @@ def validate(instance, ctx=None, schema={}, validators=None, *args, **kwargs):
*args, **kwargs)
validator.validate(instance, _schema=(schema or None))

validate_large_literals(instance)
validate_large_literals(instance, reading=reading)


def fill_defaults(instance, ctx):
def fill_defaults(instance, ctx, reading=False):
"""
For any default values in the schema, add them to the tree if they
don't exist.
Expand All @@ -518,8 +535,12 @@ def fill_defaults(instance, ctx):

ctx : AsdfFile context
Used to resolve tags and urls

reading: bool, optional
Indicates whether the ASDF file is being read (in contrast to being
written).
"""
validate(instance, ctx, validators=FILL_DEFAULTS)
validate(instance, ctx, validators=FILL_DEFAULTS, reading=reading)


def remove_defaults(instance, ctx):
Expand Down
1 change: 1 addition & 0 deletions asdf/tags/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ def to_tree(cls, node, ctx):
from .constant import ConstantType
from .ndarray import NDArrayType
from .complex import ComplexType
from .integer import IntegerType
from .external_reference import ExternalArrayReference
121 changes: 121 additions & 0 deletions asdf/tags/core/integer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Licensed under a 3-clause BSD style license - see LICENSE.rst
# -*- coding: utf-8 -*-

from numbers import Integral

import numpy as np

from ...asdftypes import AsdfType
from ...yamlutil import custom_tree_to_tagged_tree


class IntegerType(AsdfType):
    """
    Enables the storage of arbitrarily large integer values

    The ASDF Standard mandates that integer literals in the tree can be no
    larger than 52 bits. Use of this class enables the storage of arbitrarily
    large integer values.

    The magnitude of the value is serialized as an array of unsigned 32-bit
    words (least-significant word first), together with its sign and a
    decimal string representation.

    When reading files that contain arbitrarily large integers, the values
    that are restored in the tree are `IntegerType` instances. They compare
    equal to ordinary integers holding the same value and can be converted
    with `int` or `float`.

    Parameters
    ----------

    value: `numbers.Integral`
        A Python integral value (e.g. `int` or `numpy.integer`)

    storage_type: `str`, optional
        Optionally overrides the storage type of the array used to represent
        the integer value. Valid values are "internal" (the default) and
        "inline"

    Raises
    ------

    ValueError
        If ``storage_type`` is not one of the valid values.

    Examples
    --------

    >>> import asdf
    >>> import random
    >>> # Create a large integer value
    >>> largeval = random.getrandbits(100)
    >>> # Store the large integer value to the tree using asdf.IntegerType
    >>> tree = dict(largeval=asdf.IntegerType(largeval))
    >>> with asdf.AsdfFile(tree) as af:
    ...     af.write_to('largeval.asdf')
    >>> with asdf.open('largeval.asdf') as aa:
    ...     assert aa['largeval'] == largeval
    """

    name = 'core/integer'
    version = '1.0.0'

    # Maps a context to a dict of {magnitude: word array} so that equal values
    # written through the same context can share a single array.
    # NOTE(review): entries are keyed by ctx and never evicted by this class;
    # confirm whether long-lived processes need the cache to be cleared.
    _value_cache = dict()

    def __init__(self, value, storage_type='internal'):
        # Validate with an explicit exception: an ``assert`` would be removed
        # when Python runs with -O, silently accepting bad storage types.
        if storage_type not in ('internal', 'inline'):
            raise ValueError(
                "Invalid storage type given: {!r}".format(storage_type))
        self._value = value
        self._sign = '-' if value < 0 else '+'
        self._storage = storage_type

    @classmethod
    def to_tree(cls, node, ctx):
        """
        Convert an `IntegerType` node into its tagged-tree representation.

        Returns a dict with the keys ``words`` (the tagged form of a
        ``uint32`` array holding the magnitude, least-significant word
        first), ``sign`` (``'+'`` or ``'-'``) and ``string`` (the decimal
        representation of the signed value).
        """
        context_cache = cls._value_cache.setdefault(ctx, dict())

        # Convert to a Python int *before* taking the magnitude. Taking the
        # absolute value of a fixed-width NumPy integer first overflows for
        # the most negative value (e.g. np.int64(-2**63)), which would yield
        # a negative "magnitude" and an empty word list.
        abs_value = abs(int(node._value))

        # If the same value has already been stored, reuse the array
        array = context_cache.get(abs_value)
        if array is None:
            # pack integer value into 32-bit words, least significant first
            words = []
            remaining = abs_value
            while remaining > 0:
                words.append(remaining & 0xffffffff)
                remaining >>= 32

            array = np.array(words, dtype=np.uint32)
            # Only arrays written with internal storage are shared
            if node._storage == 'internal':
                context_cache[abs_value] = array

        ctx.set_array_storage(array, node._storage)

        tree = dict()
        tree['words'] = custom_tree_to_tagged_tree(array, ctx)
        tree['sign'] = node._sign
        tree['string'] = str(int(node._value))

        return tree

    @classmethod
    def from_tree(cls, tree, ctx):
        """
        Rebuild an `IntegerType` from its tree representation.

        The ``words`` entry is consumed from the most significant word down
        to the least significant one, and the result is negated when the
        ``sign`` entry is ``'-'``.
        """
        value = 0
        for word in tree['words'][::-1]:
            value <<= 32
            value |= int(word)

        if tree['sign'] == '-':
            value = -value

        return cls(value)

    def __int__(self):
        return int(self._value)

    def __float__(self):
        return float(self._value)

    def __eq__(self, other):
        if isinstance(other, Integral):
            return self._value == other
        if isinstance(other, IntegerType):
            return self._value == other._value
        # Let Python fall back to the other operand (and ultimately to
        # ``False``) instead of raising, so that membership tests and
        # comparisons against unrelated objects such as ``None`` work.
        return NotImplemented

    def __hash__(self):
        # Defining __eq__ disables the inherited hash; restore one that agrees
        # with equality against plain integers.
        return hash(self._value)

    def __repr__(self):
        return "IntegerType({})".format(self._value)
93 changes: 93 additions & 0 deletions asdf/tags/core/tests/test_integer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Licensed under a 3-clause BSD style license - see LICENSE.rst
# -*- coding: utf-8 -*-

import random

import pytest

import asdf
from asdf import IntegerType
from asdf.tests import helpers


# Make sure tests are deterministic
random.seed(0)


@pytest.mark.parametrize('sign', ['+', '-'])
@pytest.mark.parametrize('value', [
    random.getrandbits(bit_count)
    for bit_count in (64, 65, 100, 128, 129, 200)
])
def test_integer_value(tmpdir, value, sign):
    """Round-trip positive and negative integers of several bit widths."""
    signed_value = -value if sign == '-' else value
    tree = {'integer': IntegerType(signed_value)}
    helpers.assert_roundtrip_tree(tree, tmpdir)


@pytest.mark.parametrize('inline', [False, True])
def test_integer_storage(tmpdir, inline):
    """Check where the word array lands for internal and inline storage."""
    path = str(tmpdir.join('integer.asdf'))

    # Only pass storage_type when exercising the non-default (inline) case
    options = {'storage_type': 'inline'} if inline else {}

    random.seed(0)
    value = random.getrandbits(1000)
    tree = {'integer': IntegerType(value, **options)}

    with asdf.AsdfFile(tree) as af:
        af.write_to(path)

    with asdf.open(path, _force_raw_types=True) as rf:
        if inline:
            assert 'source' not in rf.tree['integer']['words']
            assert 'data' in rf.tree['integer']['words']
        else:
            assert 'source' in rf.tree['integer']['words']
            assert 'data' not in rf.tree['integer']['words']

        # The decimal string is written regardless of the storage type
        assert 'string' in rf.tree['integer']
        assert rf.tree['integer']['string'] == str(value)


def test_integer_storage_duplication(tmpdir):
    """Two equal integers written to one file must share a single block."""
    path = str(tmpdir.join('integer.asdf'))
    keys = ('integer1', 'integer2')

    random.seed(0)
    value = random.getrandbits(1000)
    tree = {key: IntegerType(value) for key in keys}

    with asdf.AsdfFile(tree) as af:
        af.write_to(path)
        assert len(af.blocks) == 1

    # Both raw entries must point at the same (first) block
    with asdf.open(path, _force_raw_types=True) as rf:
        for key in keys:
            assert rf.tree[key]['words']['source'] == 0

    with asdf.open(path) as aa:
        for key in keys:
            assert aa.tree[key] == value


def test_integer_conversion():
    """IntegerType compares equal to, and converts like, the wrapped value."""
    random.seed(0)
    expected = random.getrandbits(1000)
    wrapped = asdf.IntegerType(expected)

    assert wrapped == expected
    assert int(wrapped) == int(expected)
    assert float(wrapped) == float(expected)
18 changes: 18 additions & 0 deletions asdf/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,24 @@ def test_large_literals(use_numpy):
print(buff.getvalue())


def test_read_large_literal():
    """Reading an oversized integer literal warns instead of failing."""
    value = 1 << 64
    content = "integer: {}".format(value)
    buff = helpers.yaml_to_asdf(content)

    with pytest.warns(UserWarning) as w:
        with asdf.open(buff) as af:
            assert af['integer'] == value

        # We get two warnings: one for validation time, and one when defaults
        # are filled. It seems like we could improve this architecture, though...
        assert len(w) == 2
        for index in (0, 1):
            message = str(w[index].message)
            assert message.startswith('Invalid integer literal value')


def test_nested_array():
s = {
'type': 'object',
Expand Down
Loading