Skip to content

Commit

Permalink
[s3] Change gzip compression to a streaming implementation (jschneier#1061)
Browse files Browse the repository at this point in the history

The original version reads the entire content in one go, compresses and
places it into a buffer in memory. This limits both the possible size of
the file that can be saved and compressed, and also the performance of the
transfer.
  • Loading branch information
vainu-arto authored and mlazowik committed Mar 9, 2022
1 parent 1ad68e7 commit e8b867c
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 15 deletions.
18 changes: 3 additions & 15 deletions storages/backends/s3boto3.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import io
import mimetypes
import os
import posixpath
Expand All @@ -18,8 +17,8 @@

from storages.base import BaseStorage
from storages.utils import (
check_location, get_available_overwrite_name, lookup_env, safe_join,
setting, to_bytes,
GzipCompressionWrapper, check_location, get_available_overwrite_name,
lookup_env, safe_join, setting, to_bytes,
)

try:
Expand Down Expand Up @@ -435,18 +434,7 @@ def _normalize_name(self, name):

def _compress_content(self, content):
"""Gzip a given string content."""
zbuf = io.BytesIO()
# The GZIP header has a modification time attribute (see http://www.zlib.org/rfc-gzip.html)
# This means each time a file is compressed it changes even if the other contents don't change
# For S3 this defeats detection of changes using MD5 sums on gzipped files
# Fixing the mtime at 0.0 at compression time avoids this problem
with GzipFile(mode='wb', fileobj=zbuf, mtime=0.0) as zfile:
zfile.write(to_bytes(content.read()))
zbuf.seek(0)
# Boto 2 returned the InMemoryUploadedFile with the file pointer replaced,
# but Boto 3 seems to have issues with that. No need for fp.name in Boto3
# so just returning the BytesIO directly
return zbuf
return GzipCompressionWrapper(content)

def _open(self, name, mode='rb'):
name = self._normalize_name(self._clean_name(name))
Expand Down
35 changes: 35 additions & 0 deletions storages/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import io
import os
import posixpath
import zlib
from typing import Optional

from django.conf import settings
from django.core.exceptions import (
Expand Down Expand Up @@ -126,3 +129,35 @@ def get_available_overwrite_name(name, max_length):
'allows sufficient "max_length".' % name
)
return os.path.join(dir_name, "{}{}".format(file_root, file_ext))


class GzipCompressionWrapper(io.RawIOBase):
    """Wrapper for compressing file contents on the fly."""

    def __init__(self, raw, level=zlib.Z_BEST_COMPRESSION):
        super().__init__()
        # Underlying file-like object whose bytes are compressed lazily.
        self.raw = raw
        # wbits=31 selects the gzip container format for the deflate stream.
        self.compress = zlib.compressobj(level=level, wbits=31)
        # Compressed bytes produced but not yet handed to a reader.
        self.leftover = bytearray()

    @staticmethod
    def readable():
        return True

    def readinto(self, buf: bytearray) -> Optional[int]:
        requested = len(buf)
        # Keep feeding the compressor until enough compressed output is
        # buffered to satisfy the caller, or the source runs dry.
        while len(self.leftover) < requested:
            chunk = to_bytes(self.raw.read(requested))
            if not chunk:
                # Source exhausted: emit the gzip trailer exactly once,
                # then drop the compressor so later calls skip the flush.
                if self.compress:
                    self.leftover += self.compress.flush(zlib.Z_FINISH)
                    self.compress = None
                break
            self.leftover += self.compress.compress(chunk)
        if not self.leftover:
            return 0
        emitted = self.leftover[:requested]
        count = len(emitted)
        buf[:count] = emitted
        del self.leftover[:count]
        return count

0 comments on commit e8b867c

Please sign in to comment.