From 9ec71c6aa0e00db7799ccfa2a0e0b0656beab11f Mon Sep 17 00:00:00 2001
From: ac <survtur@yandex.ru>
Date: Thu, 2 Mar 2023 23:22:21 +0300
Subject: [PATCH 1/6] Stopped keeping images in memory in order to consume less
 RAM.

Stopped using BytesIO when writing the PDF to a file or stream, which also reduces RAM consumption.
---
 weasyprint/__init__.py   |  8 +++-
 weasyprint/document.py   | 34 +++++++++-----
 weasyprint/images.py     | 95 ++++++++++++++++++++++++++++++++--------
 weasyprint/pdf/stream.py |  8 ++--
 weasyprint/rotate_fn.py  | 24 ++++++++++
 5 files changed, 135 insertions(+), 34 deletions(-)
 create mode 100644 weasyprint/rotate_fn.py

diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py
index 00061795f..b68055ce2 100644
--- a/weasyprint/__init__.py
+++ b/weasyprint/__init__.py
@@ -133,7 +133,9 @@ def render(self, stylesheets=None, presentational_hints=False,
         :param bool presentational_hints:
             Whether HTML presentational hints are followed.
         :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain "images" and "fonts".
+            Optimize size of generated PDF. Can contain: "fonts";
+            "images" applies `optimize=True` parameter to image compression;
+            "not_jpegs" tries to prevent keep jpeg data same as in file.
         :type font_config: :class:`text.fonts.FontConfiguration`
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
@@ -181,7 +183,9 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1,
         :param bool presentational_hints: Whether HTML presentational hints are
             followed.
         :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain "images" and "fonts".
+            Optimize size of generated PDF. Can contain: "fonts";
+            "images" applies `optimize=True` parameter to image compression;
+            "not_jpegs" tries to prevent keep jpeg data same as in file.
         :type font_config: :class:`text.fonts.FontConfiguration`
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
diff --git a/weasyprint/document.py b/weasyprint/document.py
index c722497eb..933a44c81 100644
--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@@ -2,7 +2,10 @@
 
 import functools
 import io
+import os
 import shutil
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 
 from . import CSS
 from .anchors import gather_anchors, make_page_bookmark_tree
@@ -364,15 +367,26 @@ def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None,
         if finisher:
             finisher(self, pdf)
 
-        output = io.BytesIO()
-        pdf.write(output, version=pdf.version, identifier=identifier)
+        if hasattr(target, 'write'):
+            pdf.write(target, version=pdf.version, identifier=identifier)
+            return
 
         if target is None:
-            return output.getvalue()
-        else:
-            output.seek(0)
-            if hasattr(target, 'write'):
-                shutil.copyfileobj(output, target)
-            else:
-                with open(target, 'wb') as fd:
-                    shutil.copyfileobj(output, fd)
+            # TODO: Should we make None value for target parameter deprecated in write_pdf()?
+            # Returning bytes.
+            # You should avoid target=None value if you may run out of RAM.
+            # Consumes a double amount of memory. It creates document in BinaryIO and returns bytes copy from it.
+            # Just for a moment two copies of PDF document will be in memory.
+            # Also pydyf.PDF object is in a memory.
+            bytes_io = io.BytesIO()
+            pdf.write(bytes_io, version=pdf.version, identifier=identifier)
+            return bytes_io.getvalue()
+
+        temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~")
+        try:
+            pdf.write(temp_file, version=pdf.version, identifier=identifier)
+            temp_file.close()
+            shutil.move(temp_file.name, target)
+        finally:
+            if os.path.exists(temp_file.name):
+                os.remove(temp_file.name)
diff --git a/weasyprint/images.py b/weasyprint/images.py
index e10d230b6..2097f35ec 100644
--- a/weasyprint/images.py
+++ b/weasyprint/images.py
@@ -1,16 +1,22 @@
 """Fetch and decode images in various formats."""
 
+import io
 import math
 from hashlib import md5
 from io import BytesIO
 from itertools import cycle
 from math import inf
+from pathlib import Path
+from typing import Optional, Union, Collection
+from urllib.parse import unquote
 from xml.etree import ElementTree
 
-from PIL import Image, ImageFile, ImageOps
+from PIL import Image, ImageFile
 
+from pydyf import DelayedBytes
 from .layout.percent import percentage
 from .logger import LOGGER
+from .rotate_fn import rotate_pillow_image
 from .svg import SVG
 from .urls import URLFetchingError, fetch
 
@@ -32,10 +38,31 @@ def from_exception(cls, exception):
         return cls(f'{name}: {value}' if value else name)
 
 
+
+RAM_SAVE = True
+"""
+This is temporary constant that defines RAM Saving when work with images.
+When it is True, local images will not be kept im memory.
+PROS: Lowers memory consumption.
+CONS: Had to read images twice from disk. Might affect speed a little.
+"""
+# TODO: Should RAM_SAVE be introduced as a parameter into render() method?
+#       It was my temp quick-fix ugly solution to put it as module constant.
+
+
 class RasterImage:
-    def __init__(self, pillow_image, image_id, optimize_size):
+    def __init__(self, pillow_image, image_id, optimize_size,
+                 url: Optional[str] = None, orientation: Optional[str] = None):
+
         pillow_image.id = image_id
-        self._pillow_image = pillow_image
+
+        if RAM_SAVE and url and url.startswith("file://"):
+            quoted_path = url[7:]
+            path = unquote(quoted_path)
+            self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size)
+        else:
+            self._pillow_image = pillow_image
+
         self._optimize_size = optimize_size
         self._intrinsic_width = pillow_image.width
         self._intrinsic_height = pillow_image.height
@@ -136,22 +163,8 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
                     # Store image id to enable cache in Stream.add_image
                     image_id = md5(url.encode()).hexdigest()
                     # Keep image format as it is discarded by transposition
-                    image_format = pillow_image.format
-                    if orientation == 'from-image':
-                        if 'exif' in pillow_image.info:
-                            pillow_image = ImageOps.exif_transpose(
-                                pillow_image)
-                    elif orientation != 'none':
-                        angle, flip = orientation
-                        if angle > 0:
-                            rotation = getattr(
-                                Image.Transpose, f'ROTATE_{angle}')
-                            pillow_image = pillow_image.transpose(rotation)
-                        if flip:
-                            pillow_image = pillow_image.transpose(
-                                Image.Transpose.FLIP_LEFT_RIGHT)
-                    pillow_image.format = image_format
-                    image = RasterImage(pillow_image, image_id, optimize_size)
+                    pillow_image = rotate_pillow_image(pillow_image, orientation)
+                    image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation)
 
     except (URLFetchingError, ImageLoadingError) as exception:
         LOGGER.error('Failed to load image at %r: %s', url, exception)
@@ -665,3 +678,47 @@ def _handle_degenerate(self, size_x, size_y):
             size_x = 1e7
             size_y = 1e-7
         return size_x, size_y
+
+
+class DelayedPillowImage(DelayedBytes):
+    def __init__(self, pillow_image,
+                 path: Union[str, Path], orientation, optimize_size: Collection[str]):
+        """
+        Memory efficient replacer of PIL Image.
+        Does not keep image in memory. Retreives image bytes on demand.
+        """
+
+        # Those are paramerers to recreate image bytes from file
+        self.path = path
+        self.orientation = orientation
+        self.optimize_size = optimize_size
+
+        # These parameters of original Image object that used somewhere else.
+        self.id = pillow_image.id
+        self.info = pillow_image.info
+        self.mode = pillow_image.mode
+        self.width = pillow_image.width
+        self.height = pillow_image.height
+        self.format = pillow_image.format
+
+    def __repr__(self):
+        return f"<Picture {self.path}>"
+
+    def get_bytes(self):
+
+        original_pillow_image = Image.open(self.path)
+        rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation)
+
+
+        if rotated_pillow_image is original_pillow_image:
+            if original_pillow_image.format == 'JPEG' and ('not_jpegs' in self.optimize_size):
+                return Path(self.path).read_bytes()
+
+        optimize = 'images' in self.optimize_size
+        return get_jpeg_bytes(rotated_pillow_image, optimize)
+
+
+def get_jpeg_bytes(pillow_image, optimize: bool):
+    image_file = io.BytesIO()
+    pillow_image.save(image_file, format='JPEG', optimize=optimize)
+    return image_file.getvalue()
diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py
index ea03ac533..8fc929ae4 100644
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@@ -10,6 +10,7 @@
 from fontTools.ttLib import TTFont, TTLibError, ttFont
 from fontTools.varLib.mutator import instantiateVariableFont
 
+from ..images import DelayedPillowImage, get_jpeg_bytes
 from ..logger import LOGGER
 from ..matrix import Matrix
 from ..text.ffi import ffi, harfbuzz, pango, units_to_double
@@ -425,9 +426,10 @@ def add_image(self, pillow_image, image_rendering, optimize_size):
         optimize = 'images' in optimize_size
         if pillow_image.format in ('JPEG', 'MPO'):
             extra['Filter'] = '/DCTDecode'
-            image_file = io.BytesIO()
-            pillow_image.save(image_file, format='JPEG', optimize=optimize)
-            stream = [image_file.getvalue()]
+            if isinstance(pillow_image, DelayedPillowImage):
+                stream = [pillow_image]
+            else:
+                stream = [get_jpeg_bytes(pillow_image, optimize)]
         else:
             extra['Filter'] = '/FlateDecode'
             extra['DecodeParms'] = pydyf.Dictionary({
diff --git a/weasyprint/rotate_fn.py b/weasyprint/rotate_fn.py
new file mode 100644
index 000000000..a4cde3622
--- /dev/null
+++ b/weasyprint/rotate_fn.py
@@ -0,0 +1,24 @@
+from PIL import ImageOps, Image
+
+
+def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image:
+    """
+    Returns either absolute same image if orientation was not changed.
+    or its copy with modified orientation.
+    """
+    image_format = pillow_image.format
+    if orientation == 'from-image':
+        if 'exif' in pillow_image.info:
+            pillow_image = ImageOps.exif_transpose(
+                pillow_image)
+    elif orientation != 'none':
+        angle, flip = orientation
+        if angle > 0:
+            rotation = getattr(
+                Image.Transpose, f'ROTATE_{angle}')
+            pillow_image = pillow_image.transpose(rotation)
+        if flip:
+            pillow_image = pillow_image.transpose(
+                Image.Transpose.FLIP_LEFT_RIGHT)
+    pillow_image.format = image_format
+    return pillow_image

From a2231ba147cc92320cf2f8652238cd1dbab9ad07 Mon Sep 17 00:00:00 2001
From: Alex Ch <survtur@yandex.ru>
Date: Fri, 3 Mar 2023 23:11:52 +0300
Subject: [PATCH 2/6] Now it works with original pydyf module

---
 weasyprint/images.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/weasyprint/images.py b/weasyprint/images.py
index 2097f35ec..9cb8aa749 100644
--- a/weasyprint/images.py
+++ b/weasyprint/images.py
@@ -13,7 +13,7 @@
 
 from PIL import Image, ImageFile
 
-from pydyf import DelayedBytes
+import pydyf
 from .layout.percent import percentage
 from .logger import LOGGER
 from .rotate_fn import rotate_pillow_image
@@ -680,7 +680,7 @@ def _handle_degenerate(self, size_x, size_y):
         return size_x, size_y
 
 
-class DelayedPillowImage(DelayedBytes):
+class DelayedPillowImage(pydyf.Object):
     def __init__(self, pillow_image,
                  path: Union[str, Path], orientation, optimize_size: Collection[str]):
         """
@@ -703,8 +703,9 @@ def __init__(self, pillow_image,
 
     def __repr__(self):
         return f"<Picture {self.path}>"
-
-    def get_bytes(self):
+    
+    @property
+    def data(self) -> bytes:
 
         original_pillow_image = Image.open(self.path)
         rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation)

From 07e43dc4c29ccf63b324d437235081705dd650b6 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Sun, 5 Mar 2023 14:40:26 +0100
Subject: [PATCH 3/6] Clean lazy image loading

---
 weasyprint/__init__.py   |   8 +-
 weasyprint/document.py   |  33 ++---
 weasyprint/images.py     | 287 +++++++++++++++++++++++----------------
 weasyprint/pdf/stream.py | 106 ++-------------
 weasyprint/rotate_fn.py  |  24 ----
 5 files changed, 189 insertions(+), 269 deletions(-)
 delete mode 100644 weasyprint/rotate_fn.py

diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py
index b68055ce2..00061795f 100644
--- a/weasyprint/__init__.py
+++ b/weasyprint/__init__.py
@@ -133,9 +133,7 @@ def render(self, stylesheets=None, presentational_hints=False,
         :param bool presentational_hints:
             Whether HTML presentational hints are followed.
         :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain: "fonts";
-            "images" applies `optimize=True` parameter to image compression;
-            "not_jpegs" tries to prevent keep jpeg data same as in file.
+            Optimize size of generated PDF. Can contain "images" and "fonts".
         :type font_config: :class:`text.fonts.FontConfiguration`
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
@@ -183,9 +181,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1,
         :param bool presentational_hints: Whether HTML presentational hints are
             followed.
         :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain: "fonts";
-            "images" applies `optimize=True` parameter to image compression;
-            "not_jpegs" tries to prevent keep jpeg data same as in file.
+            Optimize size of generated PDF. Can contain "images" and "fonts".
         :type font_config: :class:`text.fonts.FontConfiguration`
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
diff --git a/weasyprint/document.py b/weasyprint/document.py
index 933a44c81..6e1afcee2 100644
--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@@ -2,10 +2,6 @@
 
 import functools
 import io
-import os
-import shutil
-from pathlib import Path
-from tempfile import NamedTemporaryFile
 
 from . import CSS
 from .anchors import gather_anchors, make_page_bookmark_tree
@@ -367,26 +363,13 @@ def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None,
         if finisher:
             finisher(self, pdf)
 
+        if target is None:
+            output = io.BytesIO()
+            pdf.write(output, version=pdf.version, identifier=identifier)
+            return output.getvalue()
+
         if hasattr(target, 'write'):
             pdf.write(target, version=pdf.version, identifier=identifier)
-            return
-
-        if target is None:
-            # TODO: Should we make None value for target parameter deprecated in write_pdf()?
-            # Returning bytes.
-            # You should avoid target=None value if you may run out of RAM.
-            # Consumes a double amount of memory. It creates document in BinaryIO and returns bytes copy from it.
-            # Just for a moment two copies of PDF document will be in memory.
-            # Also pydyf.PDF object is in a memory.
-            bytes_io = io.BytesIO()
-            pdf.write(bytes_io, version=pdf.version, identifier=identifier)
-            return bytes_io.getvalue()
-
-        temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~")
-        try:
-            pdf.write(temp_file, version=pdf.version, identifier=identifier)
-            temp_file.close()
-            shutil.move(temp_file.name, target)
-        finally:
-            if os.path.exists(temp_file.name):
-                os.remove(temp_file.name)
+        else:
+            with open(target, 'wb') as fd:
+                pdf.write(fd, version=pdf.version, identifier=identifier)
diff --git a/weasyprint/images.py b/weasyprint/images.py
index 9cb8aa749..24ab93e74 100644
--- a/weasyprint/images.py
+++ b/weasyprint/images.py
@@ -2,21 +2,18 @@
 
 import io
 import math
+import struct
 from hashlib import md5
 from io import BytesIO
 from itertools import cycle
 from math import inf
-from pathlib import Path
-from typing import Optional, Union, Collection
-from urllib.parse import unquote
 from xml.etree import ElementTree
 
-from PIL import Image, ImageFile
-
 import pydyf
+from PIL import Image, ImageFile, ImageOps
+
 from .layout.percent import percentage
 from .logger import LOGGER
-from .rotate_fn import rotate_pillow_image
 from .svg import SVG
 from .urls import URLFetchingError, fetch
 
@@ -38,54 +35,137 @@ def from_exception(cls, exception):
         return cls(f'{name}: {value}' if value else name)
 
 
-
-RAM_SAVE = True
-"""
-This is temporary constant that defines RAM Saving when work with images.
-When it is True, local images will not be kept im memory.
-PROS: Lowers memory consumption.
-CONS: Had to read images twice from disk. Might affect speed a little.
-"""
-# TODO: Should RAM_SAVE be introduced as a parameter into render() method?
-#       It was my temp quick-fix ugly solution to put it as module constant.
-
-
 class RasterImage:
-    def __init__(self, pillow_image, image_id, optimize_size,
-                 url: Optional[str] = None, orientation: Optional[str] = None):
+    def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
+        self.id = image_id
+        self._cache_path = cache_path
 
-        pillow_image.id = image_id
+        if 'transparency' in pillow_image.info:
+            pillow_image = pillow_image.convert('RGBA')
+        elif pillow_image.mode in ('1', 'P', 'I'):
+            pillow_image = pillow_image.convert('RGB')
 
-        if RAM_SAVE and url and url.startswith("file://"):
-            quoted_path = url[7:]
-            path = unquote(quoted_path)
-            self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size)
+        self.width = pillow_image.width
+        self.height = pillow_image.height
+        self.ratio = (self.width / self.height) if self.height != 0 else inf
+
+        if pillow_image.mode in ('RGB', 'RGBA'):
+            color_space = '/DeviceRGB'
+        elif pillow_image.mode in ('L', 'LA'):
+            color_space = '/DeviceGray'
+        elif pillow_image.mode == 'CMYK':
+            color_space = '/DeviceCMYK'
         else:
-            self._pillow_image = pillow_image
-
-        self._optimize_size = optimize_size
-        self._intrinsic_width = pillow_image.width
-        self._intrinsic_height = pillow_image.height
-        self._intrinsic_ratio = (
-            self._intrinsic_width / self._intrinsic_height
-            if self._intrinsic_height != 0 else inf)
-
-    def get_intrinsic_size(self, image_resolution, font_size):
-        return (
-            self._intrinsic_width / image_resolution,
-            self._intrinsic_height / image_resolution,
-            self._intrinsic_ratio)
+            LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
+            color_space = '/DeviceRGB'
+
+        self.extra = pydyf.Dictionary({
+            'Type': '/XObject',
+            'Subtype': '/Image',
+            'Width': self.width,
+            'Height': self.height,
+            'ColorSpace': color_space,
+            'BitsPerComponent': 8,
+        })
+        optimize = 'images' in optimize_size
+        if pillow_image.format in ('JPEG', 'MPO'):
+            self.extra['Filter'] = '/DCTDecode'
+            image_file = io.BytesIO()
+            pillow_image.save(image_file, format='JPEG', optimize=optimize)
+            self.stream = self.get_stream(image_file.getvalue())
+        else:
+            self.extra['Filter'] = '/FlateDecode'
+            self.extra['DecodeParms'] = pydyf.Dictionary({
+                # Predictor 15 specifies that we're providing PNG data,
+                # ostensibly using an "optimum predictor", but doesn't actually
+                # matter as long as the predictor value is 10+ according to the
+                # spec. (Other PNG predictor values assert that we're using
+                # specific predictors that we don't want to commit to, but
+                # "optimum" can vary.)
+                'Predictor': 15,
+                'Columns': self.width,
+            })
+            if pillow_image.mode in ('RGB', 'RGBA'):
+                # Defaults to 1.
+                self.extra['DecodeParms']['Colors'] = 3
+            if pillow_image.mode in ('RGBA', 'LA'):
+                alpha = pillow_image.getchannel('A')
+                pillow_image = pillow_image.convert(pillow_image.mode[:-1])
+                alpha_data = self._get_png_data(alpha, optimize)
+                stream = self.get_stream(alpha_data)
+                self.extra['SMask'] = pydyf.Stream(stream, extra={
+                    'Filter': '/FlateDecode',
+                    'Type': '/XObject',
+                    'Subtype': '/Image',
+                    'DecodeParms': pydyf.Dictionary({
+                        'Predictor': 15,
+                        'Columns': pillow_image.width,
+                    }),
+                    'Width': pillow_image.width,
+                    'Height': pillow_image.height,
+                    'ColorSpace': '/DeviceGray',
+                    'BitsPerComponent': 8,
+                })
+
+            png_data = self._get_png_data(pillow_image, optimize)
+            self.stream = self.get_stream(png_data)
+
+    def get_intrinsic_size(self, resolution, font_size):
+        return self.width / resolution, self.height / resolution, self.ratio
 
     def draw(self, stream, concrete_width, concrete_height, image_rendering):
-        if self._intrinsic_width <= 0 or self._intrinsic_height <= 0:
+        if self.width <= 0 or self.height <= 0:
             return
 
-        image_name = stream.add_image(
-            self._pillow_image, image_rendering, self._optimize_size)
+        image_name = stream.add_image(self, image_rendering)
         stream.transform(
             concrete_width, 0, 0, -concrete_height, 0, concrete_height)
         stream.draw_x_object(image_name)
 
+    @staticmethod
+    def _get_png_data(pillow_image, optimize):
+        image_file = io.BytesIO()
+        pillow_image.save(image_file, format='PNG', optimize=optimize)
+
+        # Read the PNG header, then discard it because we know it's a PNG. If
+        # this weren't just output from Pillow, we should actually check it.
+        image_file.seek(8)
+
+        png_data = []
+        raw_chunk_length = image_file.read(4)
+        # PNG files consist of a series of chunks.
+        while raw_chunk_length:
+            # Each chunk begins with its data length (four bytes, may be zero),
+            # then its type (four ASCII characters), then the data, then four
+            # bytes of a CRC.
+            chunk_len, = struct.unpack('!I', raw_chunk_length)
+            chunk_type = image_file.read(4)
+            if chunk_type == b'IDAT':
+                png_data.append(image_file.read(chunk_len))
+            else:
+                image_file.seek(chunk_len, io.SEEK_CUR)
+            # We aren't checking the CRC, we assume this is a valid PNG.
+            image_file.seek(4, io.SEEK_CUR)
+            raw_chunk_length = image_file.read(4)
+
+        return b''.join(png_data)
+
+    def get_stream(self, data, alpha=False):
+        if self._cache_path:
+            path = self._cache_path / f'{self.id}{int(alpha)}'
+            path.write_bytes(data)
+            return [LazyImage(path)]
+        else:
+            return [data]
+
+
+class LazyImage:
+    def __init__(self, path):
+        self._path = path
+
+    def __bytes__(self):
+        self._path.read_bytes()
+
 
 class SVGImage:
     def __init__(self, tree, base_url, url_fetcher, context):
@@ -133,38 +213,36 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
                 string = result['file_obj'].read()
             mime_type = forced_mime_type or result['mime_type']
 
-            image = None
-            svg_exceptions = []
-            # Try to rely on given mimetype for SVG
-            if mime_type == 'image/svg+xml':
+        image = None
+        svg_exceptions = []
+        # Try to rely on given mimetype for SVG
+        if mime_type == 'image/svg+xml':
+            try:
+                tree = ElementTree.fromstring(string)
+                image = SVGImage(tree, url, url_fetcher, context)
+            except Exception as svg_exception:
+                svg_exceptions.append(svg_exception)
+        # Try pillow for raster images, or for failing SVG
+        if image is None:
+            try:
+                pillow_image = Image.open(BytesIO(string))
+            except Exception as raster_exception:
+                if mime_type == 'image/svg+xml':
+                    # Tried SVGImage then Pillow for a SVG, abort
+                    raise ImageLoadingError.from_exception(svg_exceptions[0])
                 try:
+                    # Last chance, try SVG
                     tree = ElementTree.fromstring(string)
                     image = SVGImage(tree, url, url_fetcher, context)
-                except Exception as svg_exception:
-                    svg_exceptions.append(svg_exception)
-            # Try pillow for raster images, or for failing SVG
-            if image is None:
-                try:
-                    pillow_image = Image.open(BytesIO(string))
-                except Exception as raster_exception:
-                    if mime_type == 'image/svg+xml':
-                        # Tried SVGImage then Pillow for a SVG, abort
-                        raise ImageLoadingError.from_exception(
-                            svg_exceptions[0])
-                    try:
-                        # Last chance, try SVG
-                        tree = ElementTree.fromstring(string)
-                        image = SVGImage(tree, url, url_fetcher, context)
-                    except Exception:
-                        # Tried Pillow then SVGImage for a raster, abort
-                        raise ImageLoadingError.from_exception(
-                            raster_exception)
-                else:
-                    # Store image id to enable cache in Stream.add_image
-                    image_id = md5(url.encode()).hexdigest()
-                    # Keep image format as it is discarded by transposition
-                    pillow_image = rotate_pillow_image(pillow_image, orientation)
-                    image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation)
+                except Exception:
+                    # Tried Pillow then SVGImage for a raster, abort
+                    raise ImageLoadingError.from_exception(raster_exception)
+            else:
+                # Store image id to enable cache in Stream.add_image
+                image_id = md5(url.encode()).hexdigest()
+                # Keep image format as it is discarded by transposition
+                pillow_image = rotate_pillow_image(pillow_image, orientation)
+                image = RasterImage(pillow_image, image_id, optimize_size)
 
     except (URLFetchingError, ImageLoadingError) as exception:
         LOGGER.error('Failed to load image at %r: %s', url, exception)
@@ -173,6 +251,28 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
     return image
 
 
+def rotate_pillow_image(pillow_image, orientation):
+    """Return a copy of a Pillow image with modified orientation.
+
+    If orientation is not changed, return the same image.
+
+    """
+    image_format = pillow_image.format
+    if orientation == 'from-image':
+        if 'exif' in pillow_image.info:
+            pillow_image = ImageOps.exif_transpose(pillow_image)
+    elif orientation != 'none':
+        angle, flip = orientation
+        if angle > 0:
+            rotation = getattr(Image.Transpose, f'ROTATE_{angle}')
+            pillow_image = pillow_image.transpose(rotation)
+        if flip:
+            pillow_image = pillow_image.transpose(
+                Image.Transpose.FLIP_LEFT_RIGHT)
+    pillow_image.format = image_format
+    return pillow_image
+
+
 def process_color_stops(vector_length, positions):
     """Give color stops positions on the gradient vector.
 
@@ -678,48 +778,3 @@ def _handle_degenerate(self, size_x, size_y):
             size_x = 1e7
             size_y = 1e-7
         return size_x, size_y
-
-
-class DelayedPillowImage(pydyf.Object):
-    def __init__(self, pillow_image,
-                 path: Union[str, Path], orientation, optimize_size: Collection[str]):
-        """
-        Memory efficient replacer of PIL Image.
-        Does not keep image in memory. Retreives image bytes on demand.
-        """
-
-        # Those are paramerers to recreate image bytes from file
-        self.path = path
-        self.orientation = orientation
-        self.optimize_size = optimize_size
-
-        # These parameters of original Image object that used somewhere else.
-        self.id = pillow_image.id
-        self.info = pillow_image.info
-        self.mode = pillow_image.mode
-        self.width = pillow_image.width
-        self.height = pillow_image.height
-        self.format = pillow_image.format
-
-    def __repr__(self):
-        return f"<Picture {self.path}>"
-    
-    @property
-    def data(self) -> bytes:
-
-        original_pillow_image = Image.open(self.path)
-        rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation)
-
-
-        if rotated_pillow_image is original_pillow_image:
-            if original_pillow_image.format == 'JPEG' and ('not_jpegs' in self.optimize_size):
-                return Path(self.path).read_bytes()
-
-        optimize = 'images' in self.optimize_size
-        return get_jpeg_bytes(rotated_pillow_image, optimize)
-
-
-def get_jpeg_bytes(pillow_image, optimize: bool):
-    image_file = io.BytesIO()
-    pillow_image.save(image_file, format='JPEG', optimize=optimize)
-    return image_file.getvalue()
diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py
index 8fc929ae4..d08bd974b 100644
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@@ -1,7 +1,7 @@
 """PDF stream."""
 
 import io
-import struct
+from copy import deepcopy
 from functools import lru_cache
 from hashlib import md5
 
@@ -10,7 +10,6 @@
 from fontTools.ttLib import TTFont, TTLibError, ttFont
 from fontTools.varLib.mutator import instantiateVariableFont
 
-from ..images import DelayedPillowImage, get_jpeg_bytes
 from ..logger import LOGGER
 from ..matrix import Matrix
 from ..text.ffi import ffi, harfbuzz, pango, units_to_double
@@ -363,109 +362,20 @@ def add_group(self, x, y, width, height):
         self._x_objects[group.id] = group
         return group
 
-    def _get_png_data(self, pillow_image, optimize):
-        image_file = io.BytesIO()
-        pillow_image.save(image_file, format='PNG', optimize=optimize)
-
-        # Read the PNG header, then discard it because we know it's a PNG. If
-        # this weren't just output from Pillow, we should actually check it.
-        image_file.seek(8)
-
-        png_data = b''
-        raw_chunk_length = image_file.read(4)
-        # PNG files consist of a series of chunks.
-        while len(raw_chunk_length) > 0:
-            # Each chunk begins with its data length (four bytes, may be zero),
-            # then its type (four ASCII characters), then the data, then four
-            # bytes of a CRC.
-            chunk_len, = struct.unpack('!I', raw_chunk_length)
-            chunk_type = image_file.read(4)
-            if chunk_type == b'IDAT':
-                png_data += image_file.read(chunk_len)
-            else:
-                image_file.seek(chunk_len, io.SEEK_CUR)
-            # We aren't checking the CRC, we assume this is a valid PNG.
-            image_file.seek(4, io.SEEK_CUR)
-            raw_chunk_length = image_file.read(4)
-
-        return png_data
-
-    def add_image(self, pillow_image, image_rendering, optimize_size):
-        image_name = f'i{pillow_image.id}'
+    def add_image(self, image, image_rendering):
+        image_name = f'i{image.id}{image_rendering}'
         self._x_objects[image_name] = None  # Set by write_pdf
         if image_name in self._images:
             # Reuse image already stored in document
             return image_name
 
-        if 'transparency' in pillow_image.info:
-            pillow_image = pillow_image.convert('RGBA')
-        elif pillow_image.mode in ('1', 'P', 'I'):
-            pillow_image = pillow_image.convert('RGB')
-
-        if pillow_image.mode in ('RGB', 'RGBA'):
-            color_space = '/DeviceRGB'
-        elif pillow_image.mode in ('L', 'LA'):
-            color_space = '/DeviceGray'
-        elif pillow_image.mode == 'CMYK':
-            color_space = '/DeviceCMYK'
-        else:
-            LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
-            color_space = '/DeviceRGB'
-
         interpolate = 'true' if image_rendering == 'auto' else 'false'
-        extra = pydyf.Dictionary({
-            'Type': '/XObject',
-            'Subtype': '/Image',
-            'Width': pillow_image.width,
-            'Height': pillow_image.height,
-            'ColorSpace': color_space,
-            'BitsPerComponent': 8,
-            'Interpolate': interpolate,
-        })
+        extra = deepcopy(image.extra)
+        extra['Interpolate'] = interpolate
+        if 'SMask' in extra:
+            extra['SMask'].extra['Interpolate'] = interpolate
 
-        optimize = 'images' in optimize_size
-        if pillow_image.format in ('JPEG', 'MPO'):
-            extra['Filter'] = '/DCTDecode'
-            if isinstance(pillow_image, DelayedPillowImage):
-                stream = [pillow_image]
-            else:
-                stream = [get_jpeg_bytes(pillow_image, optimize)]
-        else:
-            extra['Filter'] = '/FlateDecode'
-            extra['DecodeParms'] = pydyf.Dictionary({
-                # Predictor 15 specifies that we're providing PNG data,
-                # ostensibly using an "optimum predictor", but doesn't actually
-                # matter as long as the predictor value is 10+ according to the
-                # spec. (Other PNG predictor values assert that we're using
-                # specific predictors that we don't want to commit to, but
-                # "optimum" can vary.)
-                'Predictor': 15,
-                'Columns': pillow_image.width,
-            })
-            if pillow_image.mode in ('RGB', 'RGBA'):
-                # Defaults to 1.
-                extra['DecodeParms']['Colors'] = 3
-            if pillow_image.mode in ('RGBA', 'LA'):
-                alpha = pillow_image.getchannel('A')
-                pillow_image = pillow_image.convert(pillow_image.mode[:-1])
-                alpha_data = self._get_png_data(alpha, optimize)
-                extra['SMask'] = pydyf.Stream([alpha_data], extra={
-                    'Filter': '/FlateDecode',
-                    'Type': '/XObject',
-                    'Subtype': '/Image',
-                    'DecodeParms': pydyf.Dictionary({
-                        'Predictor': 15,
-                        'Columns': pillow_image.width,
-                    }),
-                    'Width': pillow_image.width,
-                    'Height': pillow_image.height,
-                    'ColorSpace': '/DeviceGray',
-                    'BitsPerComponent': 8,
-                    'Interpolate': interpolate,
-                    })
-            stream = [self._get_png_data(pillow_image, optimize)]
-
-        xobject = pydyf.Stream(stream, extra=extra)
+        xobject = pydyf.Stream(image.stream, extra=extra)
         self._images[image_name] = xobject
         return image_name
 
diff --git a/weasyprint/rotate_fn.py b/weasyprint/rotate_fn.py
deleted file mode 100644
index a4cde3622..000000000
--- a/weasyprint/rotate_fn.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from PIL import ImageOps, Image
-
-
-def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image:
-    """
-    Returns either absolute same image if orientation was not changed.
-    or its copy with modified orientation.
-    """
-    image_format = pillow_image.format
-    if orientation == 'from-image':
-        if 'exif' in pillow_image.info:
-            pillow_image = ImageOps.exif_transpose(
-                pillow_image)
-    elif orientation != 'none':
-        angle, flip = orientation
-        if angle > 0:
-            rotation = getattr(
-                Image.Transpose, f'ROTATE_{angle}')
-            pillow_image = pillow_image.transpose(rotation)
-        if flip:
-            pillow_image = pillow_image.transpose(
-                Image.Transpose.FLIP_LEFT_RIGHT)
-    pillow_image.format = image_format
-    return pillow_image

From 44001c53832c6d5df120786ffc68894afdc76e76 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Sun, 5 Mar 2023 22:56:41 +0100
Subject: [PATCH 4/6] Add --cache-folder option to temporarily store images on
 disk

---
 weasyprint/__init__.py | 12 ++++++++--
 weasyprint/__main__.py | 10 ++++++++
 weasyprint/document.py | 53 +++++++++++++++++++++++++++++++++++++++++-
 weasyprint/images.py   | 35 +++++++++++++++-------------
 4 files changed, 91 insertions(+), 19 deletions(-)

diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py
index 00061795f..4a6822946 100644
--- a/weasyprint/__init__.py
+++ b/weasyprint/__init__.py
@@ -138,7 +138,11 @@ def render(self, stylesheets=None, presentational_hints=False,
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
         :param counter_style: A dictionary storing ``@counter-style`` rules.
-        :param dict image_cache: A dictionary used to cache images.
+        :param image_cache:
+            A dictionary used to cache images, or a folder path where images
+            are temporarily stored.
+        :type image_cache:
+            :obj:`dict`, :obj:`str` or :class:`document.DiskCache`
         :param bool forms: Whether PDF forms have to be included.
         :returns: A :class:`document.Document` object.
 
@@ -186,7 +190,11 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1,
         :param font_config: A font configuration handling ``@font-face`` rules.
         :type counter_style: :class:`css.counters.CounterStyle`
         :param counter_style: A dictionary storing ``@counter-style`` rules.
-        :param dict image_cache: A dictionary used to cache images.
+        :param image_cache:
+            A dictionary used to cache images, or a folder path where images
+            are temporarily stored.
+        :type image_cache:
+            :obj:`dict`, :obj:`str` or :class:`document.DiskCache`
         :param bytes identifier: A bytestring used as PDF file identifier.
         :param str variant: A PDF variant name.
         :param str version: A PDF version number.
diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py
index dfe4b38fa..9ddabee52 100644
--- a/weasyprint/__main__.py
+++ b/weasyprint/__main__.py
@@ -94,6 +94,11 @@ def main(argv=None, stdout=None, stdin=None):
         multiple times, ``all`` adds all allowed values, ``none`` removes all
         previously set values.
 
+    .. option:: -c <folder>, --cache-folder <folder>
+
+        Store cache on disk instead of memory. The ``folder`` is created if
+        needed and cleaned after the PDF is generated.
+
     .. option:: -v, --verbose
 
         Show warnings and information messages.
@@ -156,6 +161,10 @@ def main(argv=None, stdout=None, stdin=None):
         '-O', '--optimize-size', action='append',
         help='optimize output size for specified features',
         choices=('images', 'fonts', 'all', 'none'), default=['fonts'])
+    parser.add_argument(
+        '-c', '--cache-folder',
+        help='Store cache on disk instead of memory. The ``folder`` is '
+        'created if needed and cleaned after the PDF is generated.')
     parser.add_argument(
         '-v', '--verbose', action='store_true',
         help='show warnings and information messages')
@@ -203,6 +212,7 @@ def main(argv=None, stdout=None, stdin=None):
         'version': args.pdf_version,
         'forms': args.pdf_forms,
         'custom_metadata': args.custom_metadata,
+        'image_cache': args.cache_folder,
     }
 
     # Default to logging to stderr.
diff --git a/weasyprint/document.py b/weasyprint/document.py
index 6e1afcee2..909fafdbd 100644
--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@@ -2,6 +2,8 @@
 
 import functools
 import io
+from hashlib import md5
+from pathlib import Path
 
 from . import CSS
 from .anchors import gather_anchors, make_page_bookmark_tree
@@ -158,6 +160,52 @@ def __init__(self, title=None, authors=None, description=None,
         self.custom = custom or {}
 
 
+class DiskCache:
+    """Dict-like storing images content on disk.
+
+    Bytestring values are stored on disk. Other Python objects (e.g.
+    RasterImage instances) are still stored in memory, but are much more
+    lightweight.
+
+    """
+    def __init__(self, folder):
+        self._path = Path(folder)
+        self._path.mkdir(parents=True, exist_ok=True)
+        self._memory_cache = {}
+        self._disk_paths = set()
+
+    def _path_from_key(self, key):
+        return self._path / md5(key.encode()).hexdigest()
+
+    def __getitem__(self, key):
+        if key in self._memory_cache:
+            return self._memory_cache[key]
+        else:
+            return self._path_from_key(key).read_bytes()
+
+    def __setitem__(self, key, value):
+        if isinstance(value, bytes):
+            path = self._path_from_key(key)
+            self._disk_paths.add(path)
+            path.write_bytes(value)
+        else:
+            self._memory_cache[key] = value
+
+    def __contains__(self, key):
+        return (
+            key in self._memory_cache or
+            self._path_from_key(key).exists())
+
+    def __del__(self):
+        try:
+            for path in self._disk_paths:
+                path.unlink(missing_ok=True)
+            self._path.rmdir()
+        except Exception:
+            # Silently ignore errors while clearing cache
+            pass
+
+
 class Document:
     """A rendered document ready to be painted in a pydyf stream.
 
@@ -180,7 +228,10 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints,
         target_collector = TargetCollector()
         page_rules = []
         user_stylesheets = []
-        image_cache = {} if image_cache is None else image_cache
+        if image_cache is None:
+            image_cache = {}
+        elif not isinstance(image_cache, (dict, DiskCache)):
+            image_cache = DiskCache(image_cache)  # folder path given
         for css in stylesheets or []:
             if not hasattr(css, 'matcher'):
                 css = CSS(
diff --git a/weasyprint/images.py b/weasyprint/images.py
index 24ab93e74..9e731ea18 100644
--- a/weasyprint/images.py
+++ b/weasyprint/images.py
@@ -36,9 +36,9 @@ def from_exception(cls, exception):
 
 
 class RasterImage:
-    def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
+    def __init__(self, pillow_image, image_id, optimize_size, cache):
         self.id = image_id
-        self._cache_path = cache_path
+        self._cache = cache
 
         if 'transparency' in pillow_image.info:
             pillow_image = pillow_image.convert('RGBA')
@@ -92,7 +92,7 @@ def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
                 alpha = pillow_image.getchannel('A')
                 pillow_image = pillow_image.convert(pillow_image.mode[:-1])
                 alpha_data = self._get_png_data(alpha, optimize)
-                stream = self.get_stream(alpha_data)
+                stream = self.get_stream(alpha_data, alpha=True)
                 self.extra['SMask'] = pydyf.Stream(stream, extra={
                     'Filter': '/FlateDecode',
                     'Type': '/XObject',
@@ -151,20 +151,20 @@ def _get_png_data(pillow_image, optimize):
         return b''.join(png_data)
 
     def get_stream(self, data, alpha=False):
-        if self._cache_path:
-            path = self._cache_path / f'{self.id}{int(alpha)}'
-            path.write_bytes(data)
-            return [LazyImage(path)]
-        else:
-            return [data]
+        key = f'{self.id}{int(alpha)}'
+        return [LazyImage(self._cache, key, data)]
 
 
-class LazyImage:
-    def __init__(self, path):
-        self._path = path
+class LazyImage(pydyf.Object):
+    def __init__(self, cache, key, data):
+        super().__init__()
+        self._key = key
+        self._cache = cache
+        cache[key] = data
 
-    def __bytes__(self):
-        self._path.read_bytes()
+    @property
+    def data(self):
+        return self._cache[self._key]
 
 
 class SVGImage:
@@ -240,13 +240,14 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
             else:
                 # Store image id to enable cache in Stream.add_image
                 image_id = md5(url.encode()).hexdigest()
-                # Keep image format as it is discarded by transposition
                 pillow_image = rotate_pillow_image(pillow_image, orientation)
-                image = RasterImage(pillow_image, image_id, optimize_size)
+                image = RasterImage(
+                    pillow_image, image_id, optimize_size, cache)
 
     except (URLFetchingError, ImageLoadingError) as exception:
         LOGGER.error('Failed to load image at %r: %s', url, exception)
         image = None
+
     cache[url] = image
     return image
 
@@ -269,6 +270,8 @@ def rotate_pillow_image(pillow_image, orientation):
         if flip:
             pillow_image = pillow_image.transpose(
                 Image.Transpose.FLIP_LEFT_RIGHT)
+
+    # Keep image format as it is discarded by transposition
     pillow_image.format = image_format
     return pillow_image
 

From ab6daa2a035882e6c738c92e3465c17935669556 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Mon, 6 Mar 2023 16:35:03 +0100
Subject: [PATCH 5/6] Fix emojis

---
 weasyprint/draw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/weasyprint/draw.py b/weasyprint/draw.py
index 66b427b9c..d7e2961a1 100644
--- a/weasyprint/draw.py
+++ b/weasyprint/draw.py
@@ -1199,7 +1199,7 @@ def draw_first_line(stream, textbox, text_overflow, block_ellipsis, x, y,
                     pillow_image = Image.open(BytesIO(png_data))
                     image_id = f'{font.hash}{glyph}'
                     image = RasterImage(
-                        pillow_image, image_id, optimize_size=())
+                        pillow_image, image_id, optimize_size=(), cache={})
                     d = font.widths[glyph] / 1000
                     a = pillow_image.width / pillow_image.height * d
                     pango.pango_font_get_glyph_extents(

From c7087d3d86b75e16a2dc96f9978d6886ffc2390c Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Mon, 13 Mar 2023 17:04:15 +0100
Subject: [PATCH 6/6] =?UTF-8?q?Don=E2=80=99t=20deepcopy=20streams=20dicts?=
 =?UTF-8?q?=20when=20duplicating=20for=20interpolation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unsurprisingly, deepcopy doesn’t work well on SMask streams :).
---
 weasyprint/pdf/stream.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py
index d08bd974b..a9950836e 100644
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@@ -1,7 +1,6 @@
 """PDF stream."""
 
 import io
-from copy import deepcopy
 from functools import lru_cache
 from hashlib import md5
 
@@ -370,9 +369,12 @@ def add_image(self, image, image_rendering):
             return image_name
 
         interpolate = 'true' if image_rendering == 'auto' else 'false'
-        extra = deepcopy(image.extra)
+        extra = image.extra.copy()
         extra['Interpolate'] = interpolate
         if 'SMask' in extra:
+            extra['SMask'] = pydyf.Stream(
+                extra['SMask'].stream.copy(), extra['SMask'].extra.copy(),
+                extra['SMask'].compress)
             extra['SMask'].extra['Interpolate'] = interpolate
 
         xobject = pydyf.Stream(image.stream, extra=extra)