From 943ca9c92c9a897d657ea19ea7ed1e7d2f49a289 Mon Sep 17 00:00:00 2001 From: Arjun Date: Sun, 22 Jan 2023 09:26:05 -0800 Subject: [PATCH 1/8] initial testing to see if it's faster --- Lib/gzip.py | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/Lib/gzip.py b/Lib/gzip.py index 75c6ddc3f2cffb..ee2e54fe3ab3eb 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -9,7 +9,9 @@ import zlib import builtins import io +import _pyio as pyio import _compression +import errno __all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"] @@ -445,6 +447,115 @@ def _read_gzip_header(fp): _read_exact(fp, 2) # Read & discard the 16-bit header CRC return last_mtime +class _Writable(): + def writable(self): + return True + +class GzipWriter(pyio.BufferedWriter): + def __init__(self, fp, compresslevel=_COMPRESS_LEVEL_BEST, buffer_size=pyio.DEFAULT_BUFFER_SIZE, mtime=None): + super().__init__(fp, buffer_size=buffer_size) + + self.fileobj = fp + self.crc = zlib.crc32(b"") + self.size = 0 + self.writebuf = [] + self.bufsize = 0 + self.offset = 0 # Current file offset for seek(), tell(), etc + self.compress = zlib.compressobj(compresslevel, + zlib.DEFLATED, + -zlib.MAX_WBITS, + zlib.DEF_MEM_LEVEL, + 0) + self._write_mtime = mtime + + self._write_gzip_header(compresslevel) + + def close(self): + fileobj = self.fileobj + self.flush() + fileobj.write(self.compress.flush()) + write32u(fileobj, self.crc) + # self.size may exceed 2 GiB, or even 4 GiB + write32u(fileobj, self.size & 0xffffffff) + self.fileobj.close() + + def write(self, data): + super().write(data) + + def _flush_unlocked(self): + if self.closed: + raise ValueError("flush on closed file") + while self._write_buf: + try: + #n = self.raw.write(self._write_buf) + n = self.compress_and_write(self._write_buf) + except BlockingIOError: + raise RuntimeError("self.raw should implement RawIOBase: it " + "should not raise BlockingIOError") + if n is None: + raise BlockingIOError( + errno.EAGAIN, + "write could not complete without blocking", 0) + if n > len(self._write_buf) or n < 0: + raise OSError("write() returned incorrect number of bytes") + del self._write_buf[:n] + + def compress_and_write(self,data): + self._check_not_closed() + if self.fileobj is None: + raise ValueError("write() on closed GzipFile object") + + if isinstance(data, (bytes, bytearray)): + length = len(data) + else: + # accept any data that supports the buffer protocol + data = memoryview(data) + length = data.nbytes + + if length > 0: + self.fileobj.write(self.compress.compress(data)) + self.size += length + self.crc = zlib.crc32(data, self.crc) + self.offset += length + + return length + + def _write_gzip_header(self, compresslevel): + self.fileobj.write(b'\037\213') # magic header + self.fileobj.write(b'\010') # compression method + try: + # RFC 1952 requires the FNAME field to be Latin-1. Do not + # include filenames that cannot be represented that way. + fname = os.path.basename(self.name) + if not isinstance(fname, bytes): + fname = fname.encode('latin-1') + if fname.endswith(b'.gz'): + fname = fname[:-3] + except UnicodeEncodeError: + fname = b'' + flags = 0 + if fname: + flags = FNAME + self.fileobj.write(chr(flags).encode('latin-1')) + mtime = self._write_mtime + if mtime is None: + mtime = time.time() + write32u(self.fileobj, int(mtime)) + if compresslevel == _COMPRESS_LEVEL_BEST: + xfl = b'\002' + elif compresslevel == _COMPRESS_LEVEL_FAST: + xfl = b'\004' + else: + xfl = b'\000' + self.fileobj.write(xfl) + self.fileobj.write(b'\377') + if fname: + self.fileobj.write(fname + b'\000') + + def _check_not_closed(self): + return self.fileobj.closed + + class _GzipReader(_compression.DecompressReader): def __init__(self, fp): From 8653faf0305db8464ee7cef66e23ebe72d88563a Mon Sep 17 00:00:00 2001 From: Arjun Date: Sun, 22 Jan 2023 14:21:51 -0800 Subject: [PATCH 2/8] use io.BufferedWriter to buffer gzip writes --- Lib/gzip.py | 143 ++++++++++------------------------------------------ 1 file changed, 28 insertions(+), 115 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index ee2e54fe3ab3eb..e9dc1d036a8e51 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -9,9 +9,7 @@ import zlib import builtins import io -import _pyio as pyio import _compression -import errno __all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"] @@ -122,6 +120,21 @@ class BadGzipFile(OSError): """Exception raised in some cases for invalid gzip files.""" +class _WriteBufferStream(io.RawIOBase): + """Minimal object to pass WriteBuffer flushes into GzipFile""" + def __init__(self, gzip_file): + self.gzip_file = gzip_file + + def write(self, data): + return self.gzip_file._write_raw(data) + + def seekable(self): + return False + + def writable(self): + return True + + class GzipFile(_compression.BaseStream): """The GzipFile class simulates most of the methods of a file object with the exception of the truncate() method. @@ -208,6 +221,7 @@ def __init__(self, filename=None, mode=None, zlib.DEF_MEM_LEVEL, 0) self._write_mtime = mtime + self._buffer = io.BufferedWriter(_WriteBufferStream(self)) else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -233,6 +247,11 @@ def _init_write(self, filename): self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc + def tell(self): + self._check_not_closed() + self._buffer.flush() + return super().tell() + def _write_gzip_header(self, compresslevel): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method @@ -274,6 +293,9 @@ def write(self,data): if self.fileobj is None: raise ValueError("write() on closed GzipFile object") + return self._buffer.write(data) + + def _write_raw(self, data): if isinstance(data, (bytes, bytearray)): length = len(data) else: @@ -324,9 +346,9 @@ def close(self): fileobj = self.fileobj if fileobj is None: return - self.fileobj = None try: if self.mode == WRITE: + self._buffer.flush() fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB @@ -334,6 +356,7 @@ def close(self): elif self.mode == READ: self._buffer.close() finally: + self.fileobj = None myfileobj = self.myfileobj if myfileobj: self.myfileobj = None @@ -343,7 +366,7 @@ def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed - self.fileobj.write(self.compress.flush(zlib_mode)) + self._buffer.flush() self.fileobj.flush() def fileno(self): @@ -381,8 +404,7 @@ def seek(self, offset, whence=io.SEEK_SET): raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 - for i in range(count // 1024): - self.write(chunk) + self.write(chunk * (count // 1024)) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() @@ -447,115 +469,6 @@ def _read_gzip_header(fp): _read_exact(fp, 2) # Read & discard the 16-bit header CRC return last_mtime -class _Writable(): - def writable(self): - return True - -class GzipWriter(pyio.BufferedWriter): - def __init__(self, fp, compresslevel=_COMPRESS_LEVEL_BEST, buffer_size=pyio.DEFAULT_BUFFER_SIZE, mtime=None): - super().__init__(fp, buffer_size=buffer_size) - - self.fileobj = fp - self.crc = zlib.crc32(b"") - self.size = 0 - self.writebuf = [] - self.bufsize = 0 - self.offset = 0 # Current file offset for seek(), tell(), etc - self.compress = zlib.compressobj(compresslevel, - zlib.DEFLATED, - -zlib.MAX_WBITS, - zlib.DEF_MEM_LEVEL, - 0) - self._write_mtime = mtime - - self._write_gzip_header(compresslevel) - - def close(self): - fileobj = self.fileobj - self.flush() - fileobj.write(self.compress.flush()) - write32u(fileobj, self.crc) - # self.size may exceed 2 GiB, or even 4 GiB - write32u(fileobj, self.size & 0xffffffff) - self.fileobj.close() - - def write(self, data): - super().write(data) - - def _flush_unlocked(self): - if self.closed: - raise ValueError("flush on closed file") - while self._write_buf: - try: - #n = self.raw.write(self._write_buf) - n = self.compress_and_write(self._write_buf) - except BlockingIOError: - raise RuntimeError("self.raw should implement RawIOBase: it " - "should not raise BlockingIOError") - if n is None: - raise BlockingIOError( - errno.EAGAIN, - "write could not complete without blocking", 0) - if n > len(self._write_buf) or n < 0: - raise OSError("write() returned incorrect number of bytes") - del self._write_buf[:n] - - def compress_and_write(self,data): - self._check_not_closed() - if self.fileobj is None: - raise ValueError("write() on closed GzipFile object") - - if isinstance(data, (bytes, bytearray)): - length = len(data) - else: - # accept any data that supports the buffer protocol - data = memoryview(data) - length = data.nbytes - - if length > 0: - self.fileobj.write(self.compress.compress(data)) - self.size += length - self.crc = zlib.crc32(data, self.crc) - self.offset += length - - return length - - def _write_gzip_header(self, compresslevel): - self.fileobj.write(b'\037\213') # magic header - self.fileobj.write(b'\010') # compression method - try: - # RFC 1952 requires the FNAME field to be Latin-1. Do not - # include filenames that cannot be represented that way. - fname = os.path.basename(self.name) - if not isinstance(fname, bytes): - fname = fname.encode('latin-1') - if fname.endswith(b'.gz'): - fname = fname[:-3] - except UnicodeEncodeError: - fname = b'' - flags = 0 - if fname: - flags = FNAME - self.fileobj.write(chr(flags).encode('latin-1')) - mtime = self._write_mtime - if mtime is None: - mtime = time.time() - write32u(self.fileobj, int(mtime)) - if compresslevel == _COMPRESS_LEVEL_BEST: - xfl = b'\002' - elif compresslevel == _COMPRESS_LEVEL_FAST: - xfl = b'\004' - else: - xfl = b'\000' - self.fileobj.write(xfl) - self.fileobj.write(b'\377') - if fname: - self.fileobj.write(fname + b'\000') - - def _check_not_closed(self): - return self.fileobj.closed - - class _GzipReader(_compression.DecompressReader): def __init__(self, fp): From bc4d929e18856b5a2dcc8bbaf002543c67b4c565 Mon Sep 17 00:00:00 2001 From: Arjun Date: Sun, 22 Jan 2023 14:54:00 -0800 Subject: [PATCH 3/8] news blurb --- .../next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst diff --git a/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst new file mode 100644 index 00000000000000..97b322e975eb55 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst @@ -0,0 +1 @@ +Decrease execution time of gzip file writes by 15% From b3aa1f71b6107e3eccf930e4598eb13d735cc7b2 Mon Sep 17 00:00:00 2001 From: Arjun Date: Fri, 17 Feb 2023 23:15:33 -0800 Subject: [PATCH 4/8] added buffer_size for file writes --- Lib/gzip.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index e9dc1d036a8e51..723b97111bad2f 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -25,7 +25,7 @@ def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, - encoding=None, errors=None, newline=None): + encoding=None, errors=None, newline=None, buffer_size=4*io.DEFAULT_BUFFER_SIZE): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or @@ -57,9 +57,9 @@ def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): - binary_file = GzipFile(filename, gz_mode, compresslevel) + binary_file = GzipFile(filename, gz_mode, compresslevel, buffer_size=buffer_size) elif hasattr(filename, "read") or hasattr(filename, "write"): - binary_file = GzipFile(None, gz_mode, compresslevel, filename) + binary_file = GzipFile(None, gz_mode, compresslevel, filename, buffer_size=buffer_size) else: raise TypeError("filename must be a str or bytes object, or a file") @@ -149,7 +149,8 @@ class GzipFile(_compression.BaseStream): myfileobj = None def __init__(self, filename=None, mode=None, - compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None): + compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None, + buffer_size=4*io.DEFAULT_BUFFER_SIZE): """Constructor for the GzipFile class. At least one of fileobj and filename must be given a @@ -199,6 +200,8 @@ def __init__(self, filename=None, mode=None, if mode is None: mode = getattr(fileobj, 'mode', 'rb') + self.buffer_size = buffer_size + if mode.startswith('r'): self.mode = READ raw = _GzipReader(fileobj) @@ -221,7 +224,8 @@ def __init__(self, filename=None, mode=None, zlib.DEF_MEM_LEVEL, 0) self._write_mtime = mtime - self._buffer = io.BufferedWriter(_WriteBufferStream(self)) + self._buffer = io.BufferedWriter(_WriteBufferStream(self), + buffer_size=buffer_size) else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -403,9 +407,10 @@ def seek(self, offset, whence=io.SEEK_SET): if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset - chunk = b'\0' * 1024 - self.write(chunk * (count // 1024)) - self.write(b'\0' * (count % 1024)) + chunk = b'\0' * self.buffer_size + for i in range(count // self.buffer_size): + self.write(chunk) + self.write(b'\0' * (count % self.buffer_size)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) From a929ac8deedeb61971260d894b18f76f035aa17e Mon Sep 17 00:00:00 2001 From: Arjun Date: Fri, 17 Feb 2023 23:20:35 -0800 Subject: [PATCH 5/8] comment for _write_raw --- Lib/gzip.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/gzip.py b/Lib/gzip.py index 723b97111bad2f..b1e109fb93c3f2 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -300,6 +300,7 @@ def write(self,data): return self._buffer.write(data) def _write_raw(self, data): + # Called by our self._buffer underlying WriteBufferStream. if isinstance(data, (bytes, bytearray)): length = len(data) else: From d1618f26050f3a5abcca8625a1aa85fa2beaed6e Mon Sep 17 00:00:00 2001 From: Arjun Date: Fri, 17 Feb 2023 23:23:46 -0800 Subject: [PATCH 6/8] common DEFAULT_WRITE_BUFFER_SIZE --- Lib/gzip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index b1e109fb93c3f2..7527f391d6ac46 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -22,10 +22,12 @@ _COMPRESS_LEVEL_BEST = 9 READ_BUFFER_SIZE = 128 * 1024 +DEFAULT_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, - encoding=None, errors=None, newline=None, buffer_size=4*io.DEFAULT_BUFFER_SIZE): + encoding=None, errors=None, newline=None, + buffer_size=DEFAULT_WRITE_BUFFER_SIZE): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or @@ -150,7 +152,7 @@ class GzipFile(_compression.BaseStream): def __init__(self, filename=None, mode=None, compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None, - buffer_size=4*io.DEFAULT_BUFFER_SIZE): + buffer_size=DEFAULT_WRITE_BUFFER_SIZE): """Constructor for the GzipFile class. At least one of fileobj and filename must be given a From fe31588314f11c40d5a9de54bd15ec746898bf6e Mon Sep 17 00:00:00 2001 From: CCLDArjun Date: Thu, 23 Feb 2023 14:54:27 -0800 Subject: [PATCH 7/8] use constant for write buffer size --- Lib/gzip.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 7527f391d6ac46..8796c8d9fd9a2d 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -22,12 +22,11 @@ _COMPRESS_LEVEL_BEST = 9 READ_BUFFER_SIZE = 128 * 1024 -DEFAULT_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE +_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, - encoding=None, errors=None, newline=None, - buffer_size=DEFAULT_WRITE_BUFFER_SIZE): + encoding=None, errors=None, newline=None): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or @@ -59,9 +58,9 @@ def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): - binary_file = GzipFile(filename, gz_mode, compresslevel, buffer_size=buffer_size) + binary_file = GzipFile(filename, gz_mode, compresslevel) elif hasattr(filename, "read") or hasattr(filename, "write"): - binary_file = GzipFile(None, gz_mode, compresslevel, filename, buffer_size=buffer_size) + binary_file = GzipFile(None, gz_mode, compresslevel, filename) else: raise TypeError("filename must be a str or bytes object, or a file") @@ -151,8 +150,7 @@ class GzipFile(_compression.BaseStream): myfileobj = None def __init__(self, filename=None, mode=None, - compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None, - buffer_size=DEFAULT_WRITE_BUFFER_SIZE): + compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None): """Constructor for the GzipFile class. At least one of fileobj and filename must be given a @@ -202,7 +200,6 @@ def __init__(self, filename=None, mode=None, if mode is None: mode = getattr(fileobj, 'mode', 'rb') - self.buffer_size = buffer_size if mode.startswith('r'): self.mode = READ @@ -226,8 +223,9 @@ def __init__(self, filename=None, mode=None, zlib.DEF_MEM_LEVEL, 0) self._write_mtime = mtime + self._buffer_size = _WRITE_BUFFER_SIZE self._buffer = io.BufferedWriter(_WriteBufferStream(self), - buffer_size=buffer_size) + buffer_size=self._buffer_size) else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -410,10 +408,10 @@ def seek(self, offset, whence=io.SEEK_SET): if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset - chunk = b'\0' * self.buffer_size - for i in range(count // self.buffer_size): + chunk = b'\0' * self._buffer_size + for i in range(count // self._buffer_size): self.write(chunk) - self.write(b'\0' * (count % self.buffer_size)) + self.write(b'\0' * (count % self._buffer_size)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) From 5ef4674d604ed5415467225c279b23a107657292 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 8 May 2023 10:27:52 -0700 Subject: [PATCH 8/8] improve news wording and ReSTify --- .../next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst index 97b322e975eb55..556db0eae00c0b 100644 --- a/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst +++ b/Misc/NEWS.d/next/Library/2023-01-22-14-53-12.gh-issue-89550.c1U23f.rst @@ -1 +1,2 @@ -Decrease execution time of gzip file writes by 15% +Decrease execution time of some :mod:`gzip` file writes by 15% by +adding more appropriate buffering.