Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-95534: Improve gzip reading speed by 10% #97664

Merged
merged 33 commits into from
Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1e13a89
Add code from python-isal project
rhpvorderman Sep 28, 2022
6a5cdfd
Reorder code
rhpvorderman Sep 28, 2022
809ad5f
Add ZlibDecompressor
rhpvorderman Sep 28, 2022
03254b8
Add zlibdecompressor object
rhpvorderman Sep 28, 2022
669848a
Fix compile warnings
rhpvorderman Sep 28, 2022
69ff613
Do not use class input
rhpvorderman Sep 28, 2022
6fa43ae
Fix lock stuff
rhpvorderman Sep 28, 2022
cdc5972
Fix incorrect error handling
rhpvorderman Sep 28, 2022
7820627
Rework _GzipReader to be more efficient
rhpvorderman Sep 28, 2022
6f8b64a
Properly initialize zstate
rhpvorderman Sep 30, 2022
3e2a4f5
Add blurb for increased gzip read speed
rhpvorderman Sep 30, 2022
070df1c
Make sure self->initialised is set to 0. Reword some comments.
rhpvorderman Sep 30, 2022
70b7d4d
Add appropriate doctype in blurb
rhpvorderman Sep 30, 2022
22d3893
Merge branch 'main' into gh-95534
rhpvorderman Sep 30, 2022
18a7692
Add missing NULL member to ZlibDecompressor_Members
rhpvorderman Sep 30, 2022
d54c8b5
Merge branch 'gh-95534' of github.com:rhpvorderman/cpython into gh-95534
rhpvorderman Sep 30, 2022
c90096f
Remove double comment
rhpvorderman Sep 30, 2022
1c15839
Use READ_BUFFER_SIZE in python -m gzip command line application
rhpvorderman Sep 30, 2022
d0ff4f0
Fix error in news entry
rhpvorderman Sep 30, 2022
afd92ab
minor edit, use +=
gpshead Sep 30, 2022
922ac5c
Throw compile warning on zlib versions that are too old
rhpvorderman Oct 2, 2022
dc7de61
Use bool instead of int
rhpvorderman Oct 2, 2022
ca12c1f
Correct spelling of insufficient
rhpvorderman Oct 2, 2022
1ce342b
Put brackets around if statement
rhpvorderman Oct 2, 2022
0b7735e
Remove strange default case
rhpvorderman Oct 2, 2022
043a376
Remove unnecessary zero op
rhpvorderman Oct 2, 2022
2a653a9
Change RetVal to return_value
rhpvorderman Oct 2, 2022
475aef6
Change char to bool
rhpvorderman Oct 2, 2022
41ba076
Properly bracketify if-else clause
rhpvorderman Oct 2, 2022
5f1901d
Prefix underscore to _ZlibDecompressor name
rhpvorderman Oct 2, 2022
c5d6888
Copy explanation about zdict from python docs into function docstring
rhpvorderman Oct 2, 2022
9d60339
Merge branch 'gh-95534' of github.com:rhpvorderman/cpython into gh-95534
rhpvorderman Oct 2, 2022
e3da415
Add tests for _ZlibDecompressor
rhpvorderman Oct 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions Lib/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9

READ_BUFFER_SIZE = 128 * 1024


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
encoding=None, errors=None, newline=None):
Expand Down Expand Up @@ -446,7 +448,7 @@ def _read_gzip_header(fp):

class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
wbits=-zlib.MAX_WBITS)
# Set flag indicating start of a new member
self._new_member = True
Expand Down Expand Up @@ -494,12 +496,13 @@ def read(self, size=-1):
self._new_member = False

# Read a chunk of data from the file
buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
if self._decompressor.needs_input:
buf = self._fp.read(READ_BUFFER_SIZE)
uncompress = self._decompressor.decompress(buf, size)
else:
uncompress = self._decompressor.decompress(b"", size)

uncompress = self._decompressor.decompress(buf, size)
if self._decompressor.unconsumed_tail != b"":
self._fp.prepend(self._decompressor.unconsumed_tail)
elif self._decompressor.unused_data != b"":
if self._decompressor.unused_data != b"":
# Prepend the already read bytes to the fileobj so they can
# be seen by _read_eof() and _read_gzip_header()
self._fp.prepend(self._decompressor.unused_data)
Expand All @@ -510,14 +513,11 @@ def read(self, size=-1):
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")

self._add_read_data( uncompress )
self._crc = zlib.crc32(uncompress, self._crc)
self._stream_size += len(uncompress)
self._pos += len(uncompress)
return uncompress

def _add_read_data(self, data):
self._crc = zlib.crc32(data, self._crc)
self._stream_size = self._stream_size + len(data)

def _read_eof(self):
# We've read to the end of the file
# We check that the computed CRC and size of the
Expand Down Expand Up @@ -647,7 +647,7 @@ def main():
f = builtins.open(arg, "rb")
g = open(arg + ".gz", "wb")
while True:
chunk = f.read(io.DEFAULT_BUFFER_SIZE)
chunk = f.read(READ_BUFFER_SIZE)
if not chunk:
break
g.write(chunk)
Expand Down
167 changes: 167 additions & 0 deletions Lib/test/test_zlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):
"""


# NOTE(review): this class relies on `unittest`, `zlib`, `pickle`, `random`,
# `sys`, `support`, `bigmemtest`, `_4G` and `HAMLET_SCENE` being available at
# module level in test_zlib.py -- confirm against the file's import block.
class ZlibDecompressorTest(unittest.TestCase):
    """Tests for the private ``zlib._ZlibDecompressor`` type.

    The cases are adapted from test_bz2.py.  The class must derive from
    ``unittest.TestCase``: without that base the test runner never collects
    these methods and the ``self.assert*`` helpers do not exist.
    """

    TEXT = HAMLET_SCENE
    DATA = zlib.compress(HAMLET_SCENE)
    BAD_DATA = b"Not a valid deflate block"
    # Payload of more than 128 KiB used by testDecompressorChunksMaxsize,
    # which needs far more output than a couple of max_length-sized reads.
    BIG_TEXT = DATA * ((128 * 1024 // len(DATA)) + 1)
    BIG_DATA = zlib.compress(BIG_TEXT)

    def test_Constructor(self):
        # A non-integer argument must be rejected with TypeError.
        # NOTE(review): presumably the first positional parameter is wbits,
        # as for zlib.decompressobj(); an integer such as 42 (32 + 10) is a
        # legal wbits value and therefore cannot be used to provoke a
        # TypeError -- confirm against the C implementation.
        self.assertRaises(TypeError, zlib._ZlibDecompressor, "not an int")

    def testDecompress(self):
        zlibd = zlib._ZlibDecompressor()
        # decompress() requires a data argument.
        self.assertRaises(TypeError, zlibd.decompress)
        text = zlibd.decompress(self.DATA)
        self.assertEqual(text, self.TEXT)

    def testDecompressChunks10(self):
        # Feed the compressed stream in 10-byte slices.
        zlibd = zlib._ZlibDecompressor()
        pieces = []
        for start in range(0, len(self.DATA), 10):
            chunk = self.DATA[start:start + 10]
            pieces.append(zlibd.decompress(chunk))
        self.assertEqual(b''.join(pieces), self.TEXT)

    def testDecompressUnusedData(self):
        # Bytes following the end of the stream are exposed as unused_data.
        zlibd = zlib._ZlibDecompressor()
        unused_data = b"this is unused data"
        text = zlibd.decompress(self.DATA + unused_data)
        self.assertEqual(text, self.TEXT)
        self.assertEqual(zlibd.unused_data, unused_data)

    def testEOFError(self):
        # Once the end of the stream was reached, any further call fails.
        zlibd = zlib._ZlibDecompressor()
        zlibd.decompress(self.DATA)
        self.assertRaises(EOFError, zlibd.decompress, b"anything")
        self.assertRaises(EOFError, zlibd.decompress, b"")

    @support.skip_if_pgo_task
    @bigmemtest(size=_4G + 100, memuse=3.3)
    def testDecompress4G(self, size):
        # "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
        blocksize = 10 * 1024 * 1024
        block = random.randbytes(blocksize)
        try:
            data = block * (size // blocksize + 1)
            compressed = zlib.compress(data)
            zlibd = zlib._ZlibDecompressor()
            decompressed = zlibd.decompress(compressed)
            # Plain comparison instead of assertEqual: a failure must not
            # try to render a diff of multi-gigabyte operands.
            self.assertTrue(decompressed == data)
        finally:
            # Drop the huge buffers as early as possible.
            data = None
            compressed = None
            decompressed = None

    def testPickle(self):
        # Decompressor objects must refuse to be pickled.
        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
            with self.assertRaises(TypeError):
                pickle.dumps(zlib._ZlibDecompressor(), proto)

    def testDecompressorChunksMaxsize(self):
        zlibd = zlib._ZlibDecompressor()
        max_length = 100
        out = []

        # Feed some input
        len_ = len(self.BIG_DATA) - 64
        out.append(zlibd.decompress(self.BIG_DATA[:len_],
                                    max_length=max_length))
        self.assertFalse(zlibd.needs_input)
        self.assertEqual(len(out[-1]), max_length)

        # Retrieve more data without providing more input
        out.append(zlibd.decompress(b'', max_length=max_length))
        self.assertFalse(zlibd.needs_input)
        self.assertEqual(len(out[-1]), max_length)

        # Retrieve more data while providing more input
        out.append(zlibd.decompress(self.BIG_DATA[len_:],
                                    max_length=max_length))
        self.assertLessEqual(len(out[-1]), max_length)

        # Retrieve remaining uncompressed data
        while not zlibd.eof:
            out.append(zlibd.decompress(b'', max_length=max_length))
            self.assertLessEqual(len(out[-1]), max_length)

        out = b"".join(out)
        self.assertEqual(out, self.BIG_TEXT)
        self.assertEqual(zlibd.unused_data, b"")

    def test_decompressor_inputbuf_1(self):
        # Test reusing input buffer after moving existing
        # contents to beginning
        zlibd = zlib._ZlibDecompressor()
        out = []

        # Create input buffer and fill it
        self.assertEqual(zlibd.decompress(self.DATA[:100],
                                          max_length=0), b'')

        # Retrieve some results, freeing capacity at beginning
        # of input buffer
        out.append(zlibd.decompress(b'', 2))

        # Add more data that fits into input buffer after
        # moving existing data to beginning
        out.append(zlibd.decompress(self.DATA[100:105], 15))

        # Decompress rest of data
        out.append(zlibd.decompress(self.DATA[105:]))
        self.assertEqual(b''.join(out), self.TEXT)

    def test_decompressor_inputbuf_2(self):
        # Test reusing input buffer by appending data at the
        # end right away
        zlibd = zlib._ZlibDecompressor()
        out = []

        # Create input buffer and empty it
        self.assertEqual(zlibd.decompress(self.DATA[:200],
                                          max_length=0), b'')
        out.append(zlibd.decompress(b''))

        # Fill buffer with new data
        out.append(zlibd.decompress(self.DATA[200:280], 2))

        # Append some more data, not enough to require resize
        out.append(zlibd.decompress(self.DATA[280:300], 2))

        # Decompress rest of data
        out.append(zlibd.decompress(self.DATA[300:]))
        self.assertEqual(b''.join(out), self.TEXT)

    def test_decompressor_inputbuf_3(self):
        # Test reusing input buffer after extending it
        zlibd = zlib._ZlibDecompressor()
        out = []

        # Create almost full input buffer
        out.append(zlibd.decompress(self.DATA[:200], 5))

        # Add even more data to it, requiring resize
        out.append(zlibd.decompress(self.DATA[200:300], 5))

        # Decompress rest of data
        out.append(zlibd.decompress(self.DATA[300:]))
        self.assertEqual(b''.join(out), self.TEXT)

    def test_failure(self):
        zlibd = zlib._ZlibDecompressor()
        self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
        # Previously, a second call could crash due to internal inconsistency
        self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)

    @support.refcount_test
    def test_refleaks_in___init__(self):
        # Re-running __init__ on a live object must not leak references.
        gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount')
        zlibd = zlib._ZlibDecompressor()
        refs_before = gettotalrefcount()
        for i in range(100):
            zlibd.__init__()
        self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10)


class CustomInt:
def __index__(self):
return 100
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:meth:`gzip.GzipFile.read` reads 10% faster.
100 changes: 99 additions & 1 deletion Modules/clinic/zlibmodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading