From d17848da98b630b39c7afb304054e81df0be9806 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 17 Nov 2023 11:54:46 +0100
Subject: [PATCH] gh-103477: Write gzip trailer with zlib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RHEL, SLES and Ubuntu for IBM zSystems (aka s390x) ship with a zlib
optimization [1] that significantly improves deflate performance by
using a specialized CPU instruction.

This instruction not only compresses the data, but also computes a
checksum. At the moment Python's gzip support performs compression and
checksum calculation separately, which creates unnecessary overhead.
The reason is that Python needs to write specific values into gzip
header, so it uses a raw stream instead of a gzip stream, and zlib
does not compute a checksum for raw streams.

The challenge with using gzip streams instead of raw streams is
dealing with the zlib-generated gzip header, which we would rather
generate manually. Implement the method proposed by @rhpvorderman: use
Z_BLOCK on the first deflate() call in order to stop before the first
deflate block is emitted. The data that is emitted up until this point
is the zlib-generated gzip header, which should be discarded.

Expose this new functionality by adding a boolean gzip_trailer argument
to zlib.compress() and zlib.compressobj(). Make use of it in
gzip.compress(), GzipFile and TarFile. The performance improvement
varies depending on data being compressed, but it's in the ballpark of
40%.

An alternative approach is to use the deflateSetHeader() function,
introduced in zlib v1.2.2.1 (2011). This also works, but the change
was deemed too intrusive [2].

📜🤖 Added by blurb_it.

[1] https://github.com/madler/zlib/pull/410
[2] https://github.com/python/cpython/pull/103478
---
 Lib/gzip.py                                   | 19 ++---
 Lib/tarfile.py                                |  9 +-
 ...-11-17-12-26-47.gh-issue-103477._7cTsK.rst |  1 +
 Modules/clinic/zlibmodule.c.h                 | 71 +++++++++++-----
 Modules/zlibmodule.c                          | 84 +++++++++++++++++--
 5 files changed, 135 insertions(+), 49 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2023-11-17-12-26-47.gh-issue-103477._7cTsK.rst

diff --git a/Lib/gzip.py b/Lib/gzip.py
index 177f9080dc5af8b..39a46ed0d573c69 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -221,7 +221,8 @@ def __init__(self, filename=None, mode=None,
                                              zlib.DEFLATED,
                                              -zlib.MAX_WBITS,
                                              zlib.DEF_MEM_LEVEL,
-                                             0)
+                                             0,
+                                             gzip_trailer=True)
             self._write_mtime = mtime
             self._buffer_size = _WRITE_BUFFER_SIZE
             self._buffer = io.BufferedWriter(_WriteBufferStream(self),
@@ -245,8 +246,6 @@ def __repr__(self):
 
     def _init_write(self, filename):
         self.name = filename
-        self.crc = zlib.crc32(b"")
-        self.size = 0
         self.writebuf = []
         self.bufsize = 0
         self.offset = 0  # Current file offset for seek(), tell(), etc
@@ -310,8 +309,6 @@ def _write_raw(self, data):
 
         if length > 0:
             self.fileobj.write(self.compress.compress(data))
-            self.size += length
-            self.crc = zlib.crc32(data, self.crc)
             self.offset += length
 
         return length
@@ -355,9 +352,6 @@ def close(self):
             if self.mode == WRITE:
                 self._buffer.flush()
                 fileobj.write(self.compress.flush())
-                write32u(fileobj, self.crc)
-                # self.size may exceed 2 GiB, or even 4 GiB
-                write32u(fileobj, self.size & 0xffffffff)
             elif self.mode == READ:
                 self._buffer.close()
         finally:
@@ -611,10 +605,11 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
         # This is faster and with less overhead.
         return zlib.compress(data, level=compresslevel, wbits=31)
     header = _create_simple_gzip_header(compresslevel, mtime)
-    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
-    # Wbits=-15 creates a raw deflate block.
-    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
-            trailer)
+    # Wbits=-15 creates a raw deflate block. Gzip_trailer=True computes CRC32
+    # and writes gzip trailer with zlib, which on some platforms is faster
+    # than doing it manually.
+    return (header + zlib.compress(data, level=compresslevel, wbits=-15,
+                                   gzip_trailer=True))
 
 
 def decompress(data):
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index ec32f9ba49b03f6..c889b7ab067521a 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -369,7 +369,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("zlib module is not available") from None
                 self.zlib = zlib
-                self.crc = zlib.crc32(b"")
                 if mode == "r":
                     self.exception = zlib.error
                     self._init_read_gz()
@@ -420,7 +419,8 @@ def _init_write_gz(self, compresslevel):
                                          self.zlib.DEFLATED,
                                          -self.zlib.MAX_WBITS,
                                          self.zlib.DEF_MEM_LEVEL,
-                                         0)
+                                         0,
+                                         gzip_trailer=True)
         timestamp = struct.pack("<L", int(time.time()))
         self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
         if self.name.endswith(".gz"):
@@ -433,8 +433,6 @@ def _init_write_gz(self, compresslevel):
     def write(self, s):
         """Write string s to the stream.
         """
-        if self.comptype == "gz":
-            self.crc = self.zlib.crc32(s, self.crc)
         self.pos += len(s)
         if self.comptype != "tar":
             s = self.cmp.compress(s)
@@ -464,9 +462,6 @@ def close(self):
             if self.mode == "w" and self.buf:
                 self.fileobj.write(self.buf)
                 self.buf = b""
-                if self.comptype == "gz":
-                    self.fileobj.write(struct.pack("<L", self.crc))
-                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
         finally:
             if not self._extfileobj:
                 self.fileobj.close()
diff --git a/Misc/NEWS.d/next/Library/2023-11-17-12-26-47.gh-issue-103477._7cTsK.rst b/Misc/NEWS.d/next/Library/2023-11-17-12-26-47.gh-issue-103477._7cTsK.rst
new file mode 100644
index 000000000000000..f272c09324af54d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-11-17-12-26-47.gh-issue-103477._7cTsK.rst
@@ -0,0 +1 @@
+Write gzip trailer with zlib, improving gzip compression performance on s390x by roughly 40%.
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index 6b09abe309bf486..b7acf39bba19b15 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -10,7 +10,8 @@ preserve
 #include "pycore_modsupport.h"    // _PyArg_UnpackKeywords()
 
 PyDoc_STRVAR(zlib_compress__doc__,
-"compress($module, data, /, level=Z_DEFAULT_COMPRESSION, wbits=MAX_WBITS)\n"
+"compress($module, data, /, level=Z_DEFAULT_COMPRESSION,\n"
+"         wbits=MAX_WBITS, gzip_trailer=False)\n"
 "--\n"
 "\n"
 "Returns a bytes object containing compressed data.\n"
@@ -20,13 +21,16 @@ PyDoc_STRVAR(zlib_compress__doc__,
 "  level\n"
 "    Compression level, in 0-9 or -1.\n"
 "  wbits\n"
-"    The window buffer size and container format.");
+"    The window buffer size and container format.\n"
+"  gzip_trailer\n"
+"    Whether to append a gzip trailer to a raw stream.");
 
 #define ZLIB_COMPRESS_METHODDEF    \
     {"compress", _PyCFunction_CAST(zlib_compress), METH_FASTCALL|METH_KEYWORDS, zlib_compress__doc__},
 
 static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits);
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits,
+                   int gzip_trailer);
 
 static PyObject *
 zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
@@ -34,14 +38,14 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
     PyObject *return_value = NULL;
     #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
 
-    #define NUM_KEYWORDS 2
+    #define NUM_KEYWORDS 3
     static struct {
         PyGC_Head _this_is_not_used;
         PyObject_VAR_HEAD
         PyObject *ob_item[NUM_KEYWORDS];
     } _kwtuple = {
         .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
-        .ob_item = { &_Py_ID(level), &_Py_ID(wbits), },
+        .ob_item = { &_Py_ID(level), &_Py_ID(wbits), &_Py_ID(gzip_trailer), },
     };
     #undef NUM_KEYWORDS
     #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -50,20 +54,21 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
     #  define KWTUPLE NULL
     #endif  // !Py_BUILD_CORE
 
-    static const char * const _keywords[] = {"", "level", "wbits", NULL};
+    static const char * const _keywords[] = {"", "level", "wbits", "gzip_trailer", NULL};
     static _PyArg_Parser _parser = {
         .keywords = _keywords,
         .fname = "compress",
         .kwtuple = KWTUPLE,
     };
     #undef KWTUPLE
-    PyObject *argsbuf[3];
+    PyObject *argsbuf[4];
     Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
     Py_buffer data = {NULL, NULL};
     int level = Z_DEFAULT_COMPRESSION;
     int wbits = MAX_WBITS;
+    int gzip_trailer = 0;
 
-    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 3, 0, argsbuf);
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 4, 0, argsbuf);
     if (!args) {
         goto exit;
     }
@@ -82,12 +87,21 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
             goto skip_optional_pos;
         }
     }
-    wbits = PyLong_AsInt(args[2]);
-    if (wbits == -1 && PyErr_Occurred()) {
+    if (args[2]) {
+        wbits = PyLong_AsInt(args[2]);
+        if (wbits == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    gzip_trailer = PyObject_IsTrue(args[3]);
+    if (gzip_trailer < 0) {
         goto exit;
     }
 skip_optional_pos:
-    return_value = zlib_compress_impl(module, &data, level, wbits);
+    return_value = zlib_compress_impl(module, &data, level, wbits, gzip_trailer);
 
 exit:
     /* Cleanup for data */
@@ -199,7 +213,7 @@ zlib_decompress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj
 PyDoc_STRVAR(zlib_compressobj__doc__,
 "compressobj($module, /, level=Z_DEFAULT_COMPRESSION, method=DEFLATED,\n"
 "            wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL,\n"
-"            strategy=Z_DEFAULT_STRATEGY, zdict=None)\n"
+"            strategy=Z_DEFAULT_STRATEGY, zdict=None, gzip_trailer=False)\n"
 "--\n"
 "\n"
 "Return a compressor object.\n"
@@ -224,14 +238,17 @@ PyDoc_STRVAR(zlib_compressobj__doc__,
 "    Z_DEFAULT_STRATEGY, Z_FILTERED, and Z_HUFFMAN_ONLY.\n"
 "  zdict\n"
 "    The predefined compression dictionary - a sequence of bytes\n"
-"    containing subsequences that are likely to occur in the input data.");
+"    containing subsequences that are likely to occur in the input data.\n"
+"  gzip_trailer\n"
+"    Whether to append a gzip trailer to a raw stream.");
 
 #define ZLIB_COMPRESSOBJ_METHODDEF    \
     {"compressobj", _PyCFunction_CAST(zlib_compressobj), METH_FASTCALL|METH_KEYWORDS, zlib_compressobj__doc__},
 
 static PyObject *
 zlib_compressobj_impl(PyObject *module, int level, int method, int wbits,
-                      int memLevel, int strategy, Py_buffer *zdict);
+                      int memLevel, int strategy, Py_buffer *zdict,
+                      int gzip_trailer);
 
 static PyObject *
 zlib_compressobj(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
@@ -239,14 +256,14 @@ zlib_compressobj(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb
     PyObject *return_value = NULL;
     #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
 
-    #define NUM_KEYWORDS 6
+    #define NUM_KEYWORDS 7
     static struct {
         PyGC_Head _this_is_not_used;
         PyObject_VAR_HEAD
         PyObject *ob_item[NUM_KEYWORDS];
     } _kwtuple = {
         .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
-        .ob_item = { &_Py_ID(level), &_Py_ID(method), &_Py_ID(wbits), &_Py_ID(memLevel), &_Py_ID(strategy), &_Py_ID(zdict), },
+        .ob_item = { &_Py_ID(level), &_Py_ID(method), &_Py_ID(wbits), &_Py_ID(memLevel), &_Py_ID(strategy), &_Py_ID(zdict), &_Py_ID(gzip_trailer), },
     };
     #undef NUM_KEYWORDS
     #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -255,14 +272,14 @@ zlib_compressobj(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb
     #  define KWTUPLE NULL
     #endif  // !Py_BUILD_CORE
 
-    static const char * const _keywords[] = {"level", "method", "wbits", "memLevel", "strategy", "zdict", NULL};
+    static const char * const _keywords[] = {"level", "method", "wbits", "memLevel", "strategy", "zdict", "gzip_trailer", NULL};
     static _PyArg_Parser _parser = {
         .keywords = _keywords,
         .fname = "compressobj",
         .kwtuple = KWTUPLE,
     };
     #undef KWTUPLE
-    PyObject *argsbuf[6];
+    PyObject *argsbuf[7];
     Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
     int level = Z_DEFAULT_COMPRESSION;
     int method = DEFLATED;
@@ -270,8 +287,9 @@ zlib_compressobj(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb
     int memLevel = DEF_MEM_LEVEL;
     int strategy = Z_DEFAULT_STRATEGY;
     Py_buffer zdict = {NULL, NULL};
+    int gzip_trailer = 0;
 
-    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 6, 0, argsbuf);
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 7, 0, argsbuf);
     if (!args) {
         goto exit;
     }
@@ -323,11 +341,20 @@ zlib_compressobj(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb
             goto skip_optional_pos;
         }
     }
-    if (PyObject_GetBuffer(args[5], &zdict, PyBUF_SIMPLE) != 0) {
+    if (args[5]) {
+        if (PyObject_GetBuffer(args[5], &zdict, PyBUF_SIMPLE) != 0) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    gzip_trailer = PyObject_IsTrue(args[6]);
+    if (gzip_trailer < 0) {
         goto exit;
     }
 skip_optional_pos:
-    return_value = zlib_compressobj_impl(module, level, method, wbits, memLevel, strategy, &zdict);
+    return_value = zlib_compressobj_impl(module, level, method, wbits, memLevel, strategy, &zdict, gzip_trailer);
 
 exit:
     /* Cleanup for zdict */
@@ -1098,4 +1125,4 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
 #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
     #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=6dd97dc851c39031 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=78ccfdb13639c155 input=a9049054013a1b77]*/
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index 9b76afa0e56f76e..0448e04b3218546 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -302,6 +302,55 @@ PyZlib_Free(voidpf ctx, void *ptr)
     PyMem_RawFree(ptr);
 }
 
+static void
+arrange_gzip_trailer(int *gzip_trailer, int *wbits)
+{
+    if (*gzip_trailer && *wbits >= -15 && *wbits <= -9) {
+        /* Ask zlib to emit gzip header and gzip trailer. We need only the
+           trailer, but it's not possible to request that, so we will have to
+           skip the header manually. */
+        *wbits = 16 - *wbits;
+    } else {
+        /* Ignore gzip_trailer. */
+        *gzip_trailer = 0;
+    }
+}
+
+static int
+skip_gzip_header(z_stream *zst)
+{
+    /* Emit gzip header into a throw-away buffer by compressing an empty
+       buffer with Z_BLOCK. The header should fully fit into the buffer, so
+       one deflate() call should be enough, but use a loop anyway just in
+       case. */
+    uInt saved_avail_in = zst->avail_in, saved_avail_out = zst->avail_out;
+    Bytef *saved_next_in = zst->next_in, *saved_next_out = zst->next_out;
+    int flush = Z_BLOCK;
+    Bytef tmp[32];
+    int err;
+
+    while (true) {
+        zst->next_in = NULL;
+        zst->avail_in = 0;
+        zst->next_out = tmp;
+        zst->avail_out = sizeof(tmp);
+        err = deflate(zst, flush);
+        if (err != Z_OK) {
+            return err;
+        }
+        if (zst->avail_out != 0) {
+            break;
+        }
+        flush = Z_NO_FLUSH;
+    }
+    zst->next_in = saved_next_in;
+    zst->avail_in = saved_avail_in;
+    zst->next_out = saved_next_out;
+    zst->avail_out = saved_avail_out;
+
+    return Z_OK;
+}
+
 static void
 arrange_input_buffer(z_stream *zst, Py_ssize_t *remains)
 {
@@ -319,13 +368,16 @@ zlib.compress
         Compression level, in 0-9 or -1.
     wbits: int(c_default="MAX_WBITS") = MAX_WBITS
         The window buffer size and container format.
+    gzip_trailer: bool = False
+        Whether to append a gzip trailer to a raw stream.
 
 Returns a bytes object containing compressed data.
 [clinic start generated code]*/
 
 static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
-/*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits,
+                   int gzip_trailer)
+/*[clinic end generated code: output=feb20f80fe7e4848 input=c17ae8b22942f857]*/
 {
     PyObject *return_value;
     int flush;
@@ -341,6 +393,8 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
         goto error;
     }
 
+    arrange_gzip_trailer(&gzip_trailer, &wbits);
+
     zst.opaque = NULL;
     zst.zalloc = PyZlib_Malloc;
     zst.zfree = PyZlib_Free;
@@ -364,6 +418,12 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
         goto error;
     }
 
+    if (gzip_trailer && (err = skip_gzip_header(&zst)) != Z_OK) {
+        deflateEnd(&zst);
+        zlib_error(state, zst, err, "while skipping gzip header");
+        goto error;
+    }
+
     do {
         arrange_input_buffer(&zst, &ibuflen);
         flush = ibuflen == 0 ? Z_FINISH : Z_NO_FLUSH;
@@ -555,14 +615,17 @@ zlib.compressobj
     zdict: Py_buffer = None
         The predefined compression dictionary - a sequence of bytes
         containing subsequences that are likely to occur in the input data.
+    gzip_trailer: bool = False
+        Whether to append a gzip trailer to a raw stream.
 
 Return a compressor object.
 [clinic start generated code]*/
 
 static PyObject *
 zlib_compressobj_impl(PyObject *module, int level, int method, int wbits,
-                      int memLevel, int strategy, Py_buffer *zdict)
-/*[clinic end generated code: output=8b5bed9c8fc3814d input=2fa3d026f90ab8d5]*/
+                      int memLevel, int strategy, Py_buffer *zdict,
+                      int gzip_trailer)
+/*[clinic end generated code: output=fb4c37ba07d34e28 input=8de44294b8fe50f4]*/
 {
     zlibstate *state = get_zlib_state(module);
     if (zdict->buf != NULL && (size_t)zdict->len > UINT_MAX) {
@@ -571,6 +634,8 @@ zlib_compressobj_impl(PyObject *module, int level, int method, int wbits,
         return NULL;
     }
 
+    arrange_gzip_trailer(&gzip_trailer, &wbits);
+
     compobject *self = newcompobject(state->Comptype);
     if (self == NULL)
         goto error;
@@ -583,14 +648,12 @@ zlib_compressobj_impl(PyObject *module, int level, int method, int wbits,
     switch (err) {
     case Z_OK:
         self->is_initialised = 1;
-        if (zdict->buf == NULL) {
-            goto success;
-        } else {
+        if (zdict->buf != NULL) {
             err = deflateSetDictionary(&self->zst,
                                        zdict->buf, (unsigned int)zdict->len);
             switch (err) {
             case Z_OK:
-                goto success;
+                break;
             case Z_STREAM_ERROR:
                 PyErr_SetString(PyExc_ValueError, "Invalid dictionary");
                 goto error;
@@ -599,6 +662,11 @@ zlib_compressobj_impl(PyObject *module, int level, int method, int wbits,
                 goto error;
             }
        }
+       if (gzip_trailer && (err = skip_gzip_header(&self->zst)) != Z_OK) {
+           zlib_error(state, self->zst, err, "while skipping gzip header");
+           goto error;
+       }
+       goto success;
     case Z_MEM_ERROR:
         PyErr_SetString(PyExc_MemoryError,
                         "Can't allocate memory for compression object");