From 87b44a8da884665b2e2f2a76f662a93d8318fe5d Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz
Date: Mon, 2 Oct 2023 19:59:47 +0000
Subject: [PATCH] Unfilter in place instead of copying data out of
 `ZlibStream`'s buffers.

Before this commit the decompressed image data would flow as follows:

1. `fdeflate` writes the raw data into `ZlibStream::out_buffer`.
2. Each byte of (raw) image data is copied (i.e. `append`-ed) into
   `Reader::current`. See
   [`append` here](https://github.com/image-rs/image-png/blob/f10238a1e886b228e7da5301e5c0f5011316f2d6/src/decoder/zlib.rs#L168) and
   [`extend` here](https://github.com/image-rs/image-png/blob/f10238a1e886b228e7da5301e5c0f5011316f2d6/src/decoder/zlib.rs#L208).
   A. Before this happens, `Reader::current` is compacted, which requires
      some additional copying of data. See
      [`drain` here](https://github.com/image-rs/image-png/blob/f10238a1e886b228e7da5301e5c0f5011316f2d6/src/decoder/mod.rs#L733-L737).
   B. After this happens, `ZlibStream::out_buffer` is **always** compacted -
      the consumed data is removed and the remaining data is copied within
      the buffer (i.e. shifted left, to position 0). See
      [`drain` here](https://github.com/image-rs/image-png/blob/f10238a1e886b228e7da5301e5c0f5011316f2d6/src/decoder/zlib.rs#L208).
3. The data is `unfilter`-ed in place, inside `Reader::current`.
4. Each byte of (unfiltered) image data is copied into `Reader::prev`. See
   [`copy_from_slice` here](https://github.com/image-rs/image-png/blob/f10238a1e886b228e7da5301e5c0f5011316f2d6/src/decoder/mod.rs#L769).
5. The unfiltered data is used as input for the `expand_...` calls.

After this commit we avoid the copies in steps 2 and 4 and instead keep the
data in `ZlibStream::out_buffer` a bit longer. In particular, unfiltering is
done by keeping the previous and the current row within
`ZlibStream::out_buffer` and mutating a portion of `ZlibStream::out_buffer`
in place.

Additionally, after this commit the compaction of `ZlibStream::out_buffer`
(step 2B above) is rate-limited, so that at most 1 byte goes through `memcpy`
(or `memset`) per 1 byte of decompressed output. And the compaction of
`Reader::current` (step 2A above) doesn't happen at all, since this buffer
has been removed entirely.

The changes in this commit should have minimal or no impact on cache/memory
pressure. On one hand, this commit *removes* some memory pressure by removing
the `Reader::current` and `Reader::prev` buffers. On the other hand, it
*increases* the amount of data stored in `ZlibStream::out_buffer` (by more or
less the same amount). Therefore the overall amount of memory used during
decoding should stay the same (ignoring changes stemming from the tweaks to
the frequency of compaction of `ZlibStream::out_buffer`).

This commit includes a breaking API change and therefore bumps the crate
version in `Cargo.toml`. In particular, after this commit the
`png::StreamingDecoder::update` API no longer takes `&mut Vec<u8>`; the
stream of decompressed data instead has to be accessed and/or manipulated
via the following new methods: `decompressed_image_data`,
`decompressed_image_data_mut`, and `discard_image_data`.
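To illustrate the new calling convention, here is a minimal sketch of a
client loop (the `drain_rows` and `process_row` helpers below are
hypothetical and only assume the new methods introduced by this commit):

```rust
use png::{Decoded, DecodingError, StreamingDecoder};

/// Feeds `input` into `decoder`, consuming whole `rowlen`-sized rows
/// directly out of the decoder's internal buffer.
fn drain_rows(
    decoder: &mut StreamingDecoder,
    mut input: &[u8],
    rowlen: usize,
) -> Result<(), DecodingError> {
    while !input.is_empty() {
        // `update` no longer takes `&mut Vec<u8>`; decompressed bytes now
        // accumulate inside the decoder instead of being copied out.
        let (consumed, _event): (usize, Decoded) = decoder.update(input)?;
        input = &input[consumed..];

        // Read (or mutate) the decompressed bytes in place...
        while decoder.decompressed_image_data().len() >= rowlen {
            process_row(&decoder.decompressed_image_data()[..rowlen]);
            // ...and then mark the consumed prefix as discardable, so that
            // the decoder may eventually compact its buffer.
            decoder.discard_image_data(rowlen);
        }
    }
    Ok(())
}

fn process_row(_row: &[u8]) {
    // Application-specific (e.g. unfiltering + `expand_...`).
}
```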
---
 Cargo.toml            |   2 +-
 examples/pngcheck.rs  |   2 +-
 src/decoder/mod.rs    | 150 +++++++++++++++++++++++++++---------------
 src/decoder/stream.rs |  53 ++++++++-------
 src/decoder/zlib.rs   | 122 ++++++++++++++++++++++++++--------
 5 files changed, 227 insertions(+), 102 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index fcc45e18..2c2ad018 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "png"
-version = "0.17.10"
+version = "0.18.0"
 license = "MIT OR Apache-2.0"
 description = "PNG decoding and encoding library in pure Rust"
diff --git a/examples/pngcheck.rs b/examples/pngcheck.rs
index 69e95e3c..63f51fac 100644
--- a/examples/pngcheck.rs
+++ b/examples/pngcheck.rs
@@ -174,7 +174,7 @@ fn check_image<P: AsRef<Path>>(c: Config, fname: P) -> io::Result<()> {
             }
             buf = &data[..n];
         }
-        match decoder.update(buf, &mut Vec::new()) {
+        match decoder.update(buf) {
             Ok((_, ImageEnd)) => {
                 if !have_idat {
                     // This isn't beautiful. But it works.
diff --git a/src/decoder/mod.rs b/src/decoder/mod.rs
index 29e2fd07..36c5e70a 100644
--- a/src/decoder/mod.rs
+++ b/src/decoder/mod.rs
@@ -6,6 +6,7 @@ use self::stream::{FormatErrorInner, CHUNCK_BUFFER_SIZE};
 
 use std::io::{BufRead, BufReader, Read};
 use std::mem;
+use std::num::NonZeroUsize;
 use std::ops::Range;
 
 use crate::chunk;
@@ -188,15 +189,14 @@ impl<R: Read> Decoder<R> {
     /// Most image metadata will not be read until `read_info` is called, so those fields will be
     /// None or empty.
     pub fn read_header_info(&mut self) -> Result<&Info, DecodingError> {
-        let mut buf = Vec::new();
         while self.read_decoder.info().is_none() {
-            buf.clear();
-            if self.read_decoder.decode_next(&mut buf)?.is_none() {
+            if self.read_decoder.decode_next()?.is_none() {
                 return Err(DecodingError::Format(
                     FormatErrorInner::UnexpectedEof.into(),
                 ));
             }
         }
+        debug_assert!(self.read_decoder.decompressed_image_data().is_empty());
         Ok(self.read_decoder.info().unwrap())
     }
@@ -210,9 +210,7 @@ impl<R: Read> Decoder<R> {
             subframe: SubframeInfo::not_yet_init(),
             fctl_read: 0,
             next_frame: SubframeIdx::Initial,
-            prev: Vec::new(),
-            current: Vec::new(),
-            scan_start: 0,
+            prev_row_len: None,
             transform: self.transform,
             scratch_buffer: Vec::new(),
             limits: self.limits,
@@ -281,9 +279,25 @@ struct ReadDecoder<R: Read> {
 }
 
 impl<R: Read> ReadDecoder<R> {
-    /// Returns the next decoded chunk. If the chunk is an ImageData chunk, its contents are written
-    /// into image_data.
-    fn decode_next(&mut self, image_data: &mut Vec<u8>) -> Result<Option<Decoded>, DecodingError> {
+    fn decompressed_image_data(&self) -> &[u8] {
+        self.decoder.decompressed_image_data()
+    }
+
+    fn decompressed_image_data_mut(&mut self) -> &mut [u8] {
+        self.decoder.decompressed_image_data_mut()
+    }
+
+    fn discard_image_data(&mut self, number_of_bytes_to_discard: usize) {
+        self.decoder.discard_image_data(number_of_bytes_to_discard);
+    }
+
+    fn reset_image_data_inflater(&mut self) {
+        self.decoder.reset_image_data_inflater();
+    }
+
+    /// Returns the next decoded chunk. If the chunk is an ImageData chunk, its contents can be
+    /// found at the end of `decompressed_image_data_mut()`.
+    fn decode_next(&mut self) -> Result<Option<Decoded>, DecodingError> {
         while !self.at_eof {
             let (consumed, result) = {
                 let buf = self.reader.fill_buf()?;
                 if buf.is_empty() {
                     return Err(DecodingError::Format(
                         FormatErrorInner::UnexpectedEof.into(),
                     ));
                 }
-                self.decoder.update(buf, image_data)?
+                self.decoder.update(buf)?
             };
             self.reader.consume(consumed);
             match result {
@@ ... @@
                     FormatErrorInner::UnexpectedEof.into(),
                 ));
             }
-            let (consumed, event) = self.decoder.update(buf, &mut vec![])?;
+            let (consumed, event) = self.decoder.update(buf)?;
             self.reader.consume(consumed);
             match event {
                 Decoded::Nothing => (),
                 Decoded::ImageEnd => self.at_eof = true,
                 // ignore more data
                 Decoded::ChunkComplete(_, _) | Decoded::ChunkBegin(_, _) | Decoded::ImageData => {}
-                Decoded::ImageDataFlushed => return Ok(()),
+                Decoded::ImageDataFlushed => {
+                    self.decoder.reset_image_data_inflater();
+                    return Ok(());
+                }
                 Decoded::PartialChunk(_) => {}
                 new => unreachable!("{:?}", new),
             }
@@ -347,12 +364,9 @@ pub struct Reader<R: Read> {
     /// control chunk. The IDAT image _may_ have such a chunk applying to it.
     fctl_read: u32,
     next_frame: SubframeIdx,
-    /// Previous raw line
-    prev: Vec<u8>,
-    /// Current raw line
-    current: Vec<u8>,
-    /// Start index of the current scan line.
-    scan_start: usize,
+    /// The length of the previous row (at the beginning of
+    /// `self.decoder.decompressed_image_data_mut()`) or `None` if there is no previous row yet.
+    prev_row_len: Option<NonZeroUsize>,
     /// Output transformations
     transform: Transformations,
     /// This buffer is only used so that `next_row` and `next_interlaced_row` can return reference
@@ -401,12 +415,8 @@ impl<R: Read> Reader<R> {
     /// Requires IHDR before the IDAT and fcTL before fdAT.
     fn read_until_image_data(&mut self) -> Result<(), DecodingError> {
         loop {
-            // This is somewhat ugly. The API requires us to pass a buffer to decode_next but we
-            // know that we will stop before reading any image data from the stream. Thus pass an
-            // empty buffer and assert that remains empty.
-            let mut buf = Vec::new();
-            let state = self.decoder.decode_next(&mut buf)?;
-            assert!(buf.is_empty());
+            let state = self.decoder.decode_next()?;
+            assert!(self.decoder.decompressed_image_data().is_empty());
 
             match state {
                 Some(Decoded::ChunkBegin(_, chunk::IDAT))
@@ -444,12 +454,18 @@ impl<R: Read> Reader<R> {
             return Err(DecodingError::LimitsExceeded);
         }
 
-        self.prev.clear();
-        self.prev.resize(self.subframe.rowlen, 0);
+        self.reset_prev_row();
 
         Ok(())
     }
 
+    fn reset_prev_row(&mut self) {
+        if let Some(prev_row_len) = self.prev_row_len.as_ref() {
+            self.decoder.discard_image_data(prev_row_len.get());
+            self.prev_row_len = None;
+        }
+    }
+
     /// Get information on the image.
     ///
     /// The structure will change as new frames of an animated image are decoded.
@@ -471,6 +487,9 @@ impl<R: Read> Reader<R> {
     /// Output lines will be written in row-major, packed matrix with width and height of the read
     /// frame (or subframe), all samples are in big endian byte order where this matters.
     pub fn next_frame(&mut self, buf: &mut [u8]) -> Result<OutputInfo, DecodingError> {
+        self.reset_prev_row();
+        self.decoder.reset_image_data_inflater();
+
         let subframe_idx = match self.decoder.info().unwrap().frame_control() {
             None => SubframeIdx::Initial,
             Some(_) => SubframeIdx::Some(self.fctl_read - 1),
@@ -504,8 +523,6 @@ impl<R: Read> Reader<R> {
             line_size: self.output_line_size(self.subframe.width),
         };
 
-        self.current.clear();
-        self.scan_start = 0;
         let width = self.info().width;
         if self.info().interlaced {
             while let Some(InterlacedRow {
@@ -597,7 +614,6 @@ impl<R: Read> Reader<R> {
         output_buffer: &mut [u8],
     ) -> Result<(), DecodingError> {
         self.next_raw_interlaced_row(rowlen)?;
-        let row = &self.prev[1..rowlen];
 
         // Apply transformations and write resulting data to buffer.
         let (color_type, bit_depth, trns) = {
@@ -617,6 +633,7 @@ impl<R: Read> Reader<R> {
         } else {
             None
         };
+        let row = &self.decoder.decompressed_image_data()[1..rowlen];
         match (color_type, trns) {
             (ColorType::Indexed, _) if expand => {
                 expand_paletted(row, output_buffer, info, trns)?;
@@ -706,8 +723,7 @@ impl<R: Read> Reader<R> {
                 let (pass, line, width) = adam7.next()?;
                 let rowlen = self.info().raw_row_length_from_width(width);
                 if last_pass != pass {
-                    self.prev.clear();
-                    self.prev.resize(rowlen, 0u8);
+                    self.reset_prev_row();
                 }
                 Some((rowlen, InterlaceInfo::Adam7 { pass, line, width }))
             }
@@ -718,32 +734,33 @@ impl<R: Read> Reader<R> {
         }
     }
 
-    /// Write the next raw interlaced row into `self.prev`.
+    /// Write the next raw interlaced row into `self.decoder.decompressed_image_data_mut()`.
     ///
     /// The scanline is filtered against the previous scanline according to the specification.
     fn next_raw_interlaced_row(&mut self, rowlen: usize) -> Result<(), DecodingError> {
         // Read image data until we have at least one full row (but possibly more than one).
-        while self.current.len() - self.scan_start < rowlen {
+        let required_len = match self.prev_row_len {
+            None => rowlen,
+            Some(prev_row_len) => {
+                debug_assert_eq!(prev_row_len.get(), rowlen);
+                rowlen * 2
+            }
+        };
+        while self.decoder.decompressed_image_data().len() < required_len {
             if self.subframe.consumed_and_flushed {
                 return Err(DecodingError::Format(
                     FormatErrorInner::NoMoreImageData.into(),
                 ));
             }
 
-            // Clear the current buffer before appending more data.
-            if self.scan_start > 0 {
-                self.current.drain(..self.scan_start).for_each(drop);
-                self.scan_start = 0;
-            }
-
-            match self.decoder.decode_next(&mut self.current)? {
+            match self.decoder.decode_next()? {
                 Some(Decoded::ImageData) => {}
                 Some(Decoded::ImageDataFlushed) => {
                     self.subframe.consumed_and_flushed = true;
                 }
                 None => {
                     return Err(DecodingError::Format(
-                        if self.current.is_empty() {
+                        if self.decoder.decompressed_image_data().is_empty() {
                             FormatErrorInner::NoMoreImageData
                         } else {
                             FormatErrorInner::UnexpectedEndOfChunk
@@ -755,18 +772,47 @@ impl<R: Read> Reader<R> {
             }
         }
 
-        // Get a reference to the current row and point scan_start to the next one.
-        let row = &mut self.current[self.scan_start..];
-        self.scan_start += rowlen;
-
-        // Unfilter the row.
-        let filter = FilterType::from_u8(row[0]).ok_or(DecodingError::Format(
-            FormatErrorInner::UnknownFilterMethod(row[0]).into(),
-        ))?;
-        unfilter(filter, self.bpp, &self.prev[1..rowlen], &mut row[1..rowlen]);
+        {
+            // Calculate a reference to the previous row (or to a buffer of zeros if there is no
+            // previous row) and a mutable reference to the current row.
+            let mut zeros = Vec::new();
+            let (prev, row) = match self.prev_row_len {
+                None => {
+                    zeros.resize(rowlen, 0);
+                    (
+                        zeros.as_slice(),
+                        &mut self.decoder.decompressed_image_data_mut()[..rowlen],
+                    )
+                }
+                Some(prev_row_len) => {
+                    debug_assert_eq!(prev_row_len.get(), rowlen);
+                    let (prev, row) = self
+                        .decoder
+                        .decompressed_image_data_mut()
+                        .split_at_mut(rowlen);
+                    (&prev[..], &mut row[..rowlen])
+                }
+            };
+            debug_assert_eq!(prev.len(), rowlen);
+            debug_assert_eq!(row.len(), rowlen);
+
+            // Unfilter the row.
+            let filter = FilterType::from_u8(row[0]).ok_or(DecodingError::Format(
+                FormatErrorInner::UnknownFilterMethod(row[0]).into(),
+            ))?;
+            unfilter(filter, self.bpp, &prev[1..], &mut row[1..]);
+        }
 
-        // Save the current row for the next pass.
-        self.prev[..rowlen].copy_from_slice(&row[..rowlen]);
+        match self.prev_row_len {
+            None => {
+                debug_assert_ne!(0, rowlen);
+                self.prev_row_len = NonZeroUsize::new(rowlen);
+            }
+            Some(prev_row_len) => {
+                debug_assert_eq!(prev_row_len.get(), rowlen);
+                self.decoder.discard_image_data(rowlen);
+            }
+        }
 
         Ok(())
     }
diff --git a/src/decoder/stream.rs b/src/decoder/stream.rs
index 95fcb8e9..3bfbe878 100644
--- a/src/decoder/stream.rs
+++ b/src/decoder/stream.rs
@@ -481,6 +481,10 @@ impl StreamingDecoder {
         self.have_idat = false;
     }
 
+    pub fn reset_image_data_inflater(&mut self) {
+        self.inflater.reset();
+    }
+
     /// Provides access to the inner `info` field
     pub fn info(&self) -> Option<&Info<'static>> {
         self.info.as_ref()
@@ -518,15 +522,11 @@ impl StreamingDecoder {
     ///
     /// Allows to stream partial data to the encoder. Returns a tuple containing the bytes that have
     /// been consumed from the input buffer and the current decoding result. If the decoded chunk
-    /// was an image data chunk, it also appends the read data to `image_data`.
-    pub fn update(
-        &mut self,
-        mut buf: &[u8],
-        image_data: &mut Vec<u8>,
-    ) -> Result<(usize, Decoded), DecodingError> {
+    /// was an image data chunk, then it will appear at the end of `decompressed_image_data_mut`.
+    pub fn update(&mut self, mut buf: &[u8]) -> Result<(usize, Decoded), DecodingError> {
         let len = buf.len();
         while !buf.is_empty() && self.state.is_some() {
-            match self.next_state(buf, image_data) {
+            match self.next_state(buf) {
                 Ok((bytes, Decoded::Nothing)) => buf = &buf[bytes..],
                 Ok((bytes, result)) => {
                     buf = &buf[bytes..];
@@ -538,11 +538,20 @@ impl StreamingDecoder {
         Ok((len - buf.len(), Decoded::Nothing))
     }
 
-    fn next_state<'a>(
-        &'a mut self,
-        buf: &[u8],
-        image_data: &mut Vec<u8>,
-    ) -> Result<(usize, Decoded), DecodingError> {
+    pub fn decompressed_image_data(&self) -> &[u8] {
+        self.inflater.decompressed_data()
+    }
+
+    pub fn decompressed_image_data_mut(&mut self) -> &mut [u8] {
+        self.inflater.decompressed_data_mut()
+    }
+
+    pub fn discard_image_data(&mut self, number_of_bytes_to_discard: usize) {
+        self.inflater
+            .discard_decompressed_data(number_of_bytes_to_discard);
+    }
+
+    fn next_state<'a>(&'a mut self, buf: &[u8]) -> Result<(usize, Decoded), DecodingError> {
         use self::State::*;
 
         let current_byte = buf[0];
@@ -585,8 +594,7 @@ impl StreamingDecoder {
                         || self.current_chunk.type_ == chunk::fdAT)
                 {
                     self.current_chunk.type_ = type_str;
-                    self.inflater.finish_compressed_chunks(image_data)?;
-                    self.inflater.reset();
+                    self.inflater.finish_compressed_chunks()?;
                     self.state = Some(U32Byte3(Type(length), val & !0xff));
                     return Ok((0, Decoded::ImageDataFlushed));
                 }
@@ -735,7 +743,7 @@ impl StreamingDecoder {
             DecodeData(type_str, mut n) => {
                 let chunk_len = self.current_chunk.raw_bytes.len();
                 let chunk_data = &self.current_chunk.raw_bytes[n..];
-                let c = self.inflater.decompress(chunk_data, image_data)?;
+                let c = self.inflater.decompress(chunk_data)?;
                 n += c;
                 if n == chunk_len && c == 0 {
                     self.current_chunk.raw_bytes.clear();
@@ -1122,17 +1130,18 @@
         }
     }
 
-    let mut profile = Vec::new();
     let mut inflater = ZlibStream::new();
     while !buf.is_empty() {
-        let consumed_bytes = inflater.decompress(buf, &mut profile)?;
-        if profile.len() > 8000000 {
-            // TODO: this should use Limits.bytes
-            return Err(DecodingError::LimitsExceeded);
-        }
+        let consumed_bytes = inflater.decompress(buf)?;
         buf = &buf[consumed_bytes..];
     }
-    inflater.finish_compressed_chunks(&mut profile)?;
+    inflater.finish_compressed_chunks()?;
+
+    let profile = inflater.into_decompressed_data();
+    if profile.len() > 8000000 {
+        // TODO: this should use Limits.bytes
+        return Err(DecodingError::LimitsExceeded);
+    }
 
     info.icc_profile = Some(Cow::Owned(profile));
     Ok(Decoded::Nothing)
diff --git a/src/decoder/zlib.rs b/src/decoder/zlib.rs
index 2953c951..34ab7595 100644
--- a/src/decoder/zlib.rs
+++ b/src/decoder/zlib.rs
@@ -23,11 +23,25 @@ pub(super) struct ZlibStream {
     in_buffer: Vec<u8>,
     /// The logical start of the `in_buffer`.
     in_pos: usize,
-    /// Remaining buffered decoded bytes.
-    /// The decoder sometimes wants inspect some already finished bytes for further decoding. So we
-    /// keep a total of 32KB of decoded data available as long as more data may be appended.
+    /// Remaining buffered decoded bytes. The buffer is split into 4 parts - from left to right:
+    ///
+    /// * `0..self.reader_pos` - decompressed bytes that our client has marked as already consumed
+    ///   (by calling `discard_decompressed_data()`) and that we are free to discard (according to
+    ///   our internal heuristics).
+    /// * `self.reader_pos..self.safe_pos` - decompressed bytes that our client can work with
+    ///   (mutating them as needed). The client can get this slice by calling
+    ///   `decompressed_data_mut()`.
+    /// * `self.safe_pos..self.out_pos` - decompressed bytes that may still need to be
+    ///   referenced by future decompression (see section "3.2.5. Compressed blocks (length and
+    ///   distance codes)" in RFC 1951).
+    /// * `self.out_pos..` - zeroed-out bytes that will be used for output from future calls
+    ///   to `fdeflate::Decompressor::read`.
     out_buffer: Vec<u8>,
-    /// The cursor position in the output stream as a buffer index.
+    /// Index into `out_buffer`. See the doc comment of `out_buffer` for more details.
+    reader_pos: usize,
+    /// Index into `out_buffer`. See the doc comment of `out_buffer` for more details.
+    safe_pos: usize,
+    /// Index into `out_buffer`. See the doc comment of `out_buffer` for more details.
     out_pos: usize,
     /// Ignore and do not calculate the Adler-32 checksum. Defaults to `true`.
     ///
@@ -45,6 +59,8 @@ impl ZlibStream {
             in_buffer: Vec::with_capacity(CHUNCK_BUFFER_SIZE),
             in_pos: 0,
             out_buffer: vec![0; 2 * CHUNCK_BUFFER_SIZE],
+            reader_pos: 0,
+            safe_pos: 0,
             out_pos: 0,
             ignore_adler32: true,
         }
@@ -55,6 +71,8 @@ impl ZlibStream {
         self.in_buffer.clear();
         self.in_pos = 0;
         self.out_buffer.clear();
+        self.reader_pos = 0;
+        self.safe_pos = 0;
         self.out_pos = 0;
         *self.state = Decompressor::new();
     }
@@ -82,11 +100,7 @@ impl ZlibStream {
 
     /// Fill the decoded buffer as far as possible from `data`.
     /// On success returns the number of consumed input bytes.
-    pub(crate) fn decompress(
-        &mut self,
-        data: &[u8],
-        image_data: &mut Vec<u8>,
-    ) -> Result<usize, DecodingError> {
+    pub(crate) fn decompress(&mut self, data: &[u8]) -> Result<usize, DecodingError> {
         self.prepare_vec_for_appending();
 
         if !self.started && self.ignore_adler32 {
@@ -123,7 +137,7 @@ impl ZlibStream {
 
         self.started = true;
         self.out_pos += out_consumed;
-        self.transfer_finished_data(image_data);
+        self.transfer_finished_data();
 
         Ok(in_consumed)
     }
@@ -133,10 +147,7 @@ impl ZlibStream {
     /// The compressed stream can be split on arbitrary byte boundaries. This enables some cleanup
     /// within the decompressor and flushing additional data which may have been kept back in case
     /// more data were passed to it.
-    pub(crate) fn finish_compressed_chunks(
-        &mut self,
-        image_data: &mut Vec<u8>,
-    ) -> Result<(), DecodingError> {
+    pub(crate) fn finish_compressed_chunks(&mut self) -> Result<(), DecodingError> {
         if !self.started {
             return Ok(());
         }
@@ -165,14 +176,10 @@ impl ZlibStream {
 
             if self.state.is_done() {
                 self.out_buffer.truncate(self.out_pos);
-                image_data.append(&mut self.out_buffer);
+                self.safe_pos = self.out_pos;
                 return Ok(());
             } else {
-                let transferred = self.transfer_finished_data(image_data);
-                assert!(
-                    transferred > 0 || in_consumed > 0 || out_consumed > 0,
-                    "No more forward progress made in stream decoding."
-                );
+                self.transfer_finished_data();
             }
         }
     }
@@ -202,11 +209,74 @@ impl ZlibStream {
             .min(isize::max_value() as usize)
     }
 
-    fn transfer_finished_data(&mut self, image_data: &mut Vec<u8>) -> usize {
-        let safe = self.out_pos.saturating_sub(CHUNCK_BUFFER_SIZE);
-        // TODO: allocation limits.
-        image_data.extend(self.out_buffer.drain(..safe));
-        self.out_pos -= safe;
-        safe
+    /// Returns the part of the decompressed data that can be mutated.
+    pub(crate) fn decompressed_data_mut(&mut self) -> &mut [u8] {
+        &mut self.out_buffer[self.reader_pos..self.safe_pos]
+    }
+
+    /// Returns the same part of the decompressed data as `decompressed_data_mut`, but takes an
+    /// immutable `&self` reference, so the `&self` borrow can be reused.
+    pub(crate) fn decompressed_data(&self) -> &[u8] {
+        &self.out_buffer[self.reader_pos..self.safe_pos]
+    }
+
+    /// Returns all of the decompressed data.
+    pub(crate) fn into_decompressed_data(mut self) -> Vec<u8> {
+        if self.reader_pos != 0 {
+            self.out_buffer
+                .copy_within(self.reader_pos..self.out_pos, 0);
+        }
+        self.out_buffer.truncate(self.out_pos - self.reader_pos);
+        self.out_buffer
+    }
+
+    /// Discards a prefix of the decompressed data.
+    pub(crate) fn discard_decompressed_data(&mut self, number_of_bytes_to_discard: usize) {
+        self.reader_pos += number_of_bytes_to_discard;
+        debug_assert!(self.reader_pos <= self.safe_pos);
+        self.compact_out_buffer_if_needed();
+    }
+
+    // DO NOT SUBMIT: Should rename this before merging the PR (because after the changes the name
+    // of this method is rather inaccurate - there is no "transfer" happening here).
+    fn transfer_finished_data(&mut self) {
+        self.safe_pos = self.out_pos.saturating_sub(CHUNCK_BUFFER_SIZE);
+        self.compact_out_buffer_if_needed();
+    }
+
+    fn compact_out_buffer_if_needed(&mut self) {
+        // The cost of compaction comes mostly from:
+        // - Copying the `self.reader_pos..self.out_pos` bytes (below).
+        // - Re-zeroing some number of the `self.out_pos..` bytes (later - in
+        //   `prepare_vec_for_appending`).
+        let cost_of_compaction = self.out_buffer.len() - self.reader_pos;
+
+        // The cost of compaction is offset by the credit from past work - the number of bytes
+        // that have been consumed since the last compaction.
+        let credit_from_past_work = self.reader_pos;
+
+        // Avoid compacting if it would mean paying more than 1 byte (copying or re-zeroing the
+        // byte) per 1 byte that has been decompressed.
+        //
+        // TODO: The decision to keep the compaction:output ratio near 1:1 is a bit ad-hoc and
+        // arbitrary. It's possible that another ratio would be better. Notes:
+        // - Compacting more often means that more copies will be done below.
+        // - Compacting less often means that `self.out_buffer` may grow more.
+        //   - Bigger `self.out_buffer` *may* mean bigger memory/cache pressure
+        //     (only "may", because in practice only the suffix is hot).
+        //   - Growing `self.out_buffer` means copying bytes around (after `Vec` allocates
+        //     a bigger-capacity buffer, it needs to copy the bytes into the new buffer).
+        if cost_of_compaction > credit_from_past_work {
+            return;
+        }
+
+        // Compact `out_buffer` by shifting all the non-discardable bytes to the left (i.e. to
+        // position 0).
+        self.out_buffer
+            .copy_within(self.reader_pos..self.out_pos, 0);
+        self.out_pos -= self.reader_pos;
+        self.safe_pos -= self.reader_pos;
+        self.reader_pos = 0;
+        self.out_buffer.truncate(self.out_pos);
+    }
 }
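As an appendix for reviewers: a self-contained toy model of the rate-limiting
heuristic in `compact_out_buffer_if_needed` (the `OutBuffer` struct and
`main` below are an illustrative sketch, not code from this patch):

```rust
/// Mirrors the `out_buffer` bookkeeping described above (simplified:
/// `safe_pos` is omitted).
struct OutBuffer {
    data: Vec<u8>,
    reader_pos: usize, // start of the not-yet-discarded bytes
    out_pos: usize,    // end of the decompressed bytes
}

impl OutBuffer {
    fn compact_if_cheap(&mut self) {
        // Compaction copies `reader_pos..out_pos` and later re-zeroes the
        // tail, i.e. it touches `data.len() - reader_pos` bytes in total.
        let cost = self.data.len() - self.reader_pos;
        // The bytes discarded since the last compaction act as a "credit".
        let credit = self.reader_pos;
        // Skip compaction while it would cost more than ~1 copied/zeroed
        // byte per 1 byte of consumed output.
        if cost > credit {
            return;
        }
        self.data.copy_within(self.reader_pos..self.out_pos, 0);
        self.out_pos -= self.reader_pos;
        self.reader_pos = 0;
        self.data.truncate(self.out_pos);
    }
}

fn main() {
    let mut buf = OutBuffer { data: vec![0; 64], reader_pos: 48, out_pos: 56 };
    buf.compact_if_cheap(); // credit (48) covers cost (64 - 48 = 16), so this compacts
    assert_eq!((buf.reader_pos, buf.out_pos, buf.data.len()), (0, 8, 8));
}
```

Under this rule each compaction touches at most as many bytes as were
consumed since the previous one, which is what bounds the amortized cost at
roughly 1 `memcpy`/`memset` byte per byte of decompressed output.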