Skip to content

Commit

Permalink
Add lz4 as option for compression (#258)
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev authored and drdavella committed Jul 10, 2017
1 parent 7a8e226 commit 9278462
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ env:
# to repeat them for all configurations.
- NUMPY_VERSION=1.11
- ASTROPY_VERSION=development
- CONDA_DEPENDENCIES='jsonschema pyyaml six'
- CONDA_DEPENDENCIES='jsonschema pyyaml six lz4'
- PIP_DEPENDENCIES_FLAGS='--no-deps --force'
- PIP_DEPENDENCIES='git+http://github.com/spacetelescope/gwcs.git#egg=gwcs'
- SETUP_CMD='test --remote-data'
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ environment:
MINICONDA_VERSION: "latest"
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci-helpers\\appveyor\\windows_sdk.cmd"
ASTROPY_VERSION: "development"
CONDA_DEPENDENCIES: "jsonschema pyyaml six"
CONDA_DEPENDENCIES: "jsonschema pyyaml six lz4"
PIP_DEPENDENCIES: "git+http://github.com/spacetelescope/gwcs.git#egg=gwcs"

matrix:
Expand Down
11 changes: 10 additions & 1 deletion asdf/asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,13 @@ def set_array_compression(self, arr, compression):
- ``bzp2``: Use bzip2 compression
- ``lz4``: Use lz4 compression
- ``''`` or `None`: no compression
- ``input``: Use the same compression as in the file read.
If there is no prior file, acts as None.
"""
self.blocks[arr].output_compression = compression

Expand Down Expand Up @@ -671,8 +676,10 @@ def update(self, all_array_storage=None, all_array_compression='input',
- ``bzp2``: Use bzip2 compression.
- ``lz4``: Use lz4 compression.
- ``input``: Use the same compression as in the file read.
If there is no prior file, acts as None.
If there is no prior file, acts as None
auto_inline : int, optional
When the number of elements in an array is less than this
Expand Down Expand Up @@ -808,6 +815,8 @@ def write_to(self, fd, all_array_storage=None, all_array_compression='input',
- ``bzp2``: Use bzip2 compression.
- ``lz4``: Use lz4 compression.
- ``input``: Use the same compression as in the file read.
If there is no prior file, acts as None.
Expand Down
4 changes: 2 additions & 2 deletions asdf/commands/defragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def setup_arguments(cls, subparsers):
the output file.""")
parser.add_argument(
"--compress", "-c", type=str, nargs="?",
choices=['zlib', 'bzp2'],
help="""Compress blocks using one of "zlib" or "bzp2".""")
choices=['zlib', 'bzp2', 'lz4'],
help="""Compress blocks using one of "zlib", "bzp2" or "lz4".""")

parser.set_defaults(func=cls.run)

Expand Down
24 changes: 19 additions & 5 deletions asdf/commands/tests/test_defragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,21 @@
import os

import numpy as np
import pytest

from ... import AsdfFile
from .. import main
from ...tests.helpers import get_file_sizes, assert_tree_match
from ...tests.helpers import get_file_sizes, assert_tree_match, has_package


def test_defragment(tmpdir):
x = np.arange(0, 10, dtype=np.float)
def _test_defragment(tmpdir, codec):
x = np.arange(0, 1000, dtype=np.float)

tree = {
'science_data': x,
'subset': x[3:-3],
'skipping': x[::2],
'not_shared': np.arange(10, 0, -1, dtype=np.uint8)
'not_shared': np.arange(100, 0, -1, dtype=np.uint8)
}

path = os.path.join(str(tmpdir), 'original.asdf')
Expand All @@ -29,7 +30,7 @@ def test_defragment(tmpdir):
assert len(ff.blocks) == 2

result = main.main_from_args(
['defragment', path, '-o', out_path, '-c', 'zlib'])
['defragment', path, '-o', out_path, '-c', codec])

assert result == 0

Expand All @@ -43,3 +44,16 @@ def test_defragment(tmpdir):
with AsdfFile.open(os.path.join(str(tmpdir), 'original.defragment.asdf')) as ff:
assert_tree_match(ff.tree, tree)
assert len(list(ff.blocks.internal_blocks)) == 2


def test_defragment_zlib(tmpdir):
    """Defragment an ASDF file with zlib block compression."""
    _test_defragment(tmpdir, 'zlib')


def test_defragment_bzp2(tmpdir):
    """Defragment an ASDF file with bzip2 block compression."""
    _test_defragment(tmpdir, 'bzp2')


# lz4 is an optional dependency, so this test only runs when it is available.
@pytest.mark.skipif(not has_package('lz4'), reason='lz4 is not installed')
def test_defragment_lz4(tmpdir):
    """Defragment an ASDF file with lz4 block compression."""
    _test_defragment(tmpdir, 'lz4')
90 changes: 81 additions & 9 deletions asdf/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, unicode_literals, print_function
import struct

import numpy as np

import six


DEFAULT_BLOCK_SIZE = 1 << 22 #: Decompressed block size in bytes, 4MiB


def validate(compression):
"""
Validate the compression string.
Parameters
----------
compression : str or None
compression : str, bytes or None
Returns
-------
Expand All @@ -31,13 +35,55 @@ def validate(compression):
if isinstance(compression, bytes):
compression = compression.decode('ascii')

if compression not in ('zlib', 'bzp2', 'input'):
compression = compression.strip('\0')
if compression not in ('zlib', 'bzp2', 'lz4', 'input'):
raise ValueError(
"Supported compression types are: 'zlib', 'bzp2' or 'input'")
"Supported compression types are: 'zlib', 'bzp2', 'lz4', or 'input'")

return compression


class Lz4Compressor(object):
    """Compress data with lz4, framing each compressed block with a
    4-byte big-endian length header holding its compressed size.
    """

    def __init__(self, block_api):
        # ``block_api`` is the ``lz4.block`` module, injected by the
        # caller so the lz4 import can stay lazy.
        self._api = block_api

    def compress(self, data):
        """Return the lz4-compressed ``data`` prefixed with its size."""
        compressed = self._api.compress(data, mode='high_compression')
        # Prefix the compressed length so the matching streaming
        # decompressor knows where this block ends.
        return struct.pack('!I', len(compressed)) + compressed


class Lz4Decompressor(object):
    """Streaming decompressor for the lz4 block framing written by
    ``Lz4Compressor``.

    The stream is a sequence of blocks, each preceded by a 4-byte
    big-endian unsigned length giving the compressed size of the block
    that follows.  ``decompress`` may be fed the stream in chunks of
    any size; partial headers and partial block bodies are buffered
    internally until complete.
    """

    def __init__(self, block_api):
        # ``block_api`` is the ``lz4.block`` module, injected by the
        # caller so the lz4 import can stay lazy.
        self._api = block_api
        self._size = 0      # compressed size of the block being read; 0 => reading header
        self._pos = 0       # bytes of the current block body received so far
        self._buffer = b''  # partial header (bytes) or block body (bytearray)

    def decompress(self, data):
        """Feed ``data`` into the stream; return any decompressed output."""
        if not self._size:
            # Reading the 4-byte length header, which may itself arrive
            # split across several calls.
            data = self._buffer + data
            if len(data) < 4:
                # ``data`` already contains the previously buffered
                # bytes.  The old ``self._buffer += data`` appended them
                # a second time whenever the header arrived in three or
                # more fragments, corrupting the parsed size.
                self._buffer = data
                return b''
            self._size = struct.unpack('!I', data[:4])[0]
            data = data[4:]
            self._buffer = bytearray(self._size)
        if self._pos + len(data) < self._size:
            # Block body still incomplete: stash this chunk and wait.
            self._buffer[self._pos:self._pos + len(data)] = data
            self._pos += len(data)
            return b''
        else:
            # The current block is complete (possibly with the start of
            # the next one trailing); decompress it and recurse on the
            # remainder.
            offset = self._size - self._pos
            self._buffer[self._pos:] = data[:offset]
            data = data[offset:]
            self._size = 0
            self._pos = 0
            output = self._api.decompress(self._buffer)
            self._buffer = b''
            return output + self.decompress(data)


def _get_decoder(compression):
if compression == 'zlib':
try:
Expand All @@ -57,6 +103,15 @@ def _get_decoder(compression):
"therefore the compressed block in this ASDF file "
"can not be decompressed.")
return bz2.BZ2Decompressor()
elif compression == 'lz4':
try:
import lz4.block
except ImportError:
raise ImportError(
"lz4 library in not installed in your Python environment, "
"therefore the compressed block in this ASDF file "
"can not be decompressed.")
return Lz4Decompressor(lz4.block)
else:
raise ValueError(
"Unknown compression type: '{0}'".format(compression))
Expand All @@ -81,6 +136,15 @@ def _get_encoder(compression):
"therefore the block in this ASDF file "
"can not be compressed.")
return bz2.BZ2Compressor()
elif compression == 'lz4':
try:
import lz4.block
except ImportError:
raise ImportError(
"lz4 library in not installed in your Python environment, "
"therefore the block in this ASDF file "
"can not be compressed.")
return Lz4Compressor(lz4.block)
else:
raise ValueError(
"Unknown compression type: '{0}'".format(compression))
Expand Down Expand Up @@ -147,7 +211,7 @@ def decompress(fd, used_size, data_size, compression):
return buffer


def compress(fd, data, compression, block_size=1 << 16):
def compress(fd, data, compression, block_size=DEFAULT_BLOCK_SIZE):
"""
Compress array data and write to a file.
Expand All @@ -157,23 +221,30 @@ def compress(fd, data, compression, block_size=1 << 16):
The file to write to.
data : buffer
The buffer of uncompressed data
The buffer of uncompressed data.
compression : str
The type of compression to use.
block_size : int, optional
Input data will be split into blocks of this size (in bytes) before the compression.
Input data will be split into blocks of this size (in bytes) before compression.
"""
compression = validate(compression)
encoder = _get_encoder(compression)

# We can have numpy arrays here. While compress() will work with them,
# it is impossible to split them into fixed size blocks without converting
# them to bytes.
if isinstance(data, np.ndarray):
data = data.tobytes()

for i in range(0, len(data), block_size):
fd.write(encoder.compress(data[i:i+block_size]))
fd.write(encoder.flush())
if hasattr(encoder, "flush"):
fd.write(encoder.flush())


def get_compressed_size(data, compression, block_size=1 << 16):
def get_compressed_size(data, compression, block_size=DEFAULT_BLOCK_SIZE):
"""
Returns the number of bytes required when the given data is
compressed.
Expand All @@ -198,6 +269,7 @@ def get_compressed_size(data, compression, block_size=1 << 16):
l = 0
for i in range(0, len(data), block_size):
l += len(encoder.compress(data[i:i+block_size]))
l += len(encoder.flush())
if hasattr(encoder, "flush"):
l += len(encoder.flush())

return l
7 changes: 7 additions & 0 deletions asdf/tests/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def test_bzp2(tmpdir):

_roundtrip(tmpdir, tree, 'bzp2')


def test_lz4(tmpdir):
    """Round-trip a large ASDF tree through lz4 block compression.

    Skipped when the optional lz4 package is not installed.
    """
    pytest.importorskip('lz4')
    _roundtrip(tmpdir, _get_large_tree(), 'lz4')


def test_recompression(tmpdir):
tree = _get_large_tree()
Expand Down
8 changes: 6 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ def _null_validate(self):
'asdftool = asdf.commands.main:main',
]

# Add the dependencies which are not strictly needed but enable otherwise skipped tests
extra_requires = []
if os.getenv('CI'):
extra_requires.extend(['lz4>=0.10'])

# Note that requires and provides should not be included in the call to
# ``setup``, since these are now deprecated. See this link for more details:
Expand All @@ -119,8 +123,8 @@ def _null_validate(self):
'jsonschema>=2.3.0',
'six>=1.9.0',
'pytest>=2.7.2',
'numpy>=1.8'
],
'numpy>=1.8',
] + extra_requires,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
license=LICENSE,
Expand Down

0 comments on commit 9278462

Please sign in to comment.