From 9ec71c6aa0e00db7799ccfa2a0e0b0656beab11f Mon Sep 17 00:00:00 2001 From: ac Date: Thu, 2 Mar 2023 23:22:21 +0300 Subject: [PATCH 1/6] Stopped keeping images in memory in order to consume less RAM. Stopped used BytesIO when writing PDF to file or streams. Also to reduce RAM consumption. --- weasyprint/__init__.py | 8 +++- weasyprint/document.py | 34 +++++++++----- weasyprint/images.py | 95 ++++++++++++++++++++++++++++++++-------- weasyprint/pdf/stream.py | 8 ++-- weasyprint/rotate_fn.py | 24 ++++++++++ 5 files changed, 135 insertions(+), 34 deletions(-) create mode 100644 weasyprint/rotate_fn.py diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 00061795f..b68055ce2 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -133,7 +133,9 @@ def render(self, stylesheets=None, presentational_hints=False, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain "images" and "fonts". + Optimize size of generated PDF. Can contain: "fonts"; + "images" applies `optimize=True` parameter to image compression; + "not_jpegs" tries to prevent keep jpeg data same as in file. :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -181,7 +183,9 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain "images" and "fonts". + Optimize size of generated PDF. Can contain: "fonts"; + "images" applies `optimize=True` parameter to image compression; + "not_jpegs" tries to prevent keep jpeg data same as in file. :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. 
:type counter_style: :class:`css.counters.CounterStyle` diff --git a/weasyprint/document.py b/weasyprint/document.py index c722497eb..933a44c81 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -2,7 +2,10 @@ import functools import io +import os import shutil +from pathlib import Path +from tempfile import NamedTemporaryFile from . import CSS from .anchors import gather_anchors, make_page_bookmark_tree @@ -364,15 +367,26 @@ def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None, if finisher: finisher(self, pdf) - output = io.BytesIO() - pdf.write(output, version=pdf.version, identifier=identifier) + if hasattr(target, 'write'): + pdf.write(target, version=pdf.version, identifier=identifier) + return if target is None: - return output.getvalue() - else: - output.seek(0) - if hasattr(target, 'write'): - shutil.copyfileobj(output, target) - else: - with open(target, 'wb') as fd: - shutil.copyfileobj(output, fd) + # TODO: Should we make None value for target parameter deprecated in write_pdf()? + # Returning bytes. + # You should avoid target=None value if you may run out of RAM. + # Consumes a double amount of memory. It creates document in BinaryIO and returns bytes copy from it. + # Just for a moment two copies of PDF document will be in memory. + # Also pydyf.PDF object is in a memory. 
+ bytes_io = io.BytesIO() + pdf.write(bytes_io, version=pdf.version, identifier=identifier) + return bytes_io.getvalue() + + temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~") + try: + pdf.write(temp_file, version=pdf.version, identifier=identifier) + temp_file.close() + shutil.move(temp_file.name, target) + finally: + if os.path.exists(temp_file.name): + os.remove(temp_file.name) diff --git a/weasyprint/images.py b/weasyprint/images.py index e10d230b6..2097f35ec 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -1,16 +1,22 @@ """Fetch and decode images in various formats.""" +import io import math from hashlib import md5 from io import BytesIO from itertools import cycle from math import inf +from pathlib import Path +from typing import Optional, Union, Collection +from urllib.parse import unquote from xml.etree import ElementTree -from PIL import Image, ImageFile, ImageOps +from PIL import Image, ImageFile +from pydyf import DelayedBytes from .layout.percent import percentage from .logger import LOGGER +from .rotate_fn import rotate_pillow_image from .svg import SVG from .urls import URLFetchingError, fetch @@ -32,10 +38,31 @@ def from_exception(cls, exception): return cls(f'{name}: {value}' if value else name) + +RAM_SAVE = True +""" +This is temporary constant that defines RAM Saving when work with images. +When it is True, local images will not be kept im memory. +PROS: Lowers memory consumption. +CONS: Had to read images twice from disk. Might affect speed a little. +""" +# TODO: Should RAM_SAVE be introduced as a parameter into render() method? +# It was my temp quick-fix ugly solution to put it as module constant. 
+ + class RasterImage: - def __init__(self, pillow_image, image_id, optimize_size): + def __init__(self, pillow_image, image_id, optimize_size, + url: Optional[str] = None, orientation: Optional[str] = None): + pillow_image.id = image_id - self._pillow_image = pillow_image + + if RAM_SAVE and url and url.startswith("file://"): + quoted_path = url[7:] + path = unquote(quoted_path) + self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size) + else: + self._pillow_image = pillow_image + self._optimize_size = optimize_size self._intrinsic_width = pillow_image.width self._intrinsic_height = pillow_image.height @@ -136,22 +163,8 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url, # Store image id to enable cache in Stream.add_image image_id = md5(url.encode()).hexdigest() # Keep image format as it is discarded by transposition - image_format = pillow_image.format - if orientation == 'from-image': - if 'exif' in pillow_image.info: - pillow_image = ImageOps.exif_transpose( - pillow_image) - elif orientation != 'none': - angle, flip = orientation - if angle > 0: - rotation = getattr( - Image.Transpose, f'ROTATE_{angle}') - pillow_image = pillow_image.transpose(rotation) - if flip: - pillow_image = pillow_image.transpose( - Image.Transpose.FLIP_LEFT_RIGHT) - pillow_image.format = image_format - image = RasterImage(pillow_image, image_id, optimize_size) + pillow_image = rotate_pillow_image(pillow_image, orientation) + image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) @@ -665,3 +678,47 @@ def _handle_degenerate(self, size_x, size_y): size_x = 1e7 size_y = 1e-7 return size_x, size_y + + +class DelayedPillowImage(DelayedBytes): + def __init__(self, pillow_image, + path: Union[str, Path], orientation, optimize_size: Collection[str]): + """ + Memory efficient replacer of PIL 
Image. + Does not keep image in memory. Retreives image bytes on demand. + """ + + # Those are paramerers to recreate image bytes from file + self.path = path + self.orientation = orientation + self.optimize_size = optimize_size + + # These parameters of original Image object that used somewhere else. + self.id = pillow_image.id + self.info = pillow_image.info + self.mode = pillow_image.mode + self.width = pillow_image.width + self.height = pillow_image.height + self.format = pillow_image.format + + def __repr__(self): + return f"" + + def get_bytes(self): + + original_pillow_image = Image.open(self.path) + rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation) + + + if rotated_pillow_image is original_pillow_image: + if original_pillow_image.format == 'JPEG' and ('not_jpegs' in self.optimize_size): + return Path(self.path).read_bytes() + + optimize = 'images' in self.optimize_size + return get_jpeg_bytes(rotated_pillow_image, optimize) + + +def get_jpeg_bytes(pillow_image, optimize: bool): + image_file = io.BytesIO() + pillow_image.save(image_file, format='JPEG', optimize=optimize) + return image_file.getvalue() diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index ea03ac533..8fc929ae4 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -10,6 +10,7 @@ from fontTools.ttLib import TTFont, TTLibError, ttFont from fontTools.varLib.mutator import instantiateVariableFont +from ..images import DelayedPillowImage, get_jpeg_bytes from ..logger import LOGGER from ..matrix import Matrix from ..text.ffi import ffi, harfbuzz, pango, units_to_double @@ -425,9 +426,10 @@ def add_image(self, pillow_image, image_rendering, optimize_size): optimize = 'images' in optimize_size if pillow_image.format in ('JPEG', 'MPO'): extra['Filter'] = '/DCTDecode' - image_file = io.BytesIO() - pillow_image.save(image_file, format='JPEG', optimize=optimize) - stream = [image_file.getvalue()] + if isinstance(pillow_image, 
DelayedPillowImage): + stream = [pillow_image] + else: + stream = [get_jpeg_bytes(pillow_image, optimize)] else: extra['Filter'] = '/FlateDecode' extra['DecodeParms'] = pydyf.Dictionary({ diff --git a/weasyprint/rotate_fn.py b/weasyprint/rotate_fn.py new file mode 100644 index 000000000..a4cde3622 --- /dev/null +++ b/weasyprint/rotate_fn.py @@ -0,0 +1,24 @@ +from PIL import ImageOps, Image + + +def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image: + """ + Returns either absolute same image if orientation was not changed. + or its copy with modified orientation. + """ + image_format = pillow_image.format + if orientation == 'from-image': + if 'exif' in pillow_image.info: + pillow_image = ImageOps.exif_transpose( + pillow_image) + elif orientation != 'none': + angle, flip = orientation + if angle > 0: + rotation = getattr( + Image.Transpose, f'ROTATE_{angle}') + pillow_image = pillow_image.transpose(rotation) + if flip: + pillow_image = pillow_image.transpose( + Image.Transpose.FLIP_LEFT_RIGHT) + pillow_image.format = image_format + return pillow_image From a2231ba147cc92320cf2f8652238cd1dbab9ad07 Mon Sep 17 00:00:00 2001 From: Alex Ch Date: Fri, 3 Mar 2023 23:11:52 +0300 Subject: [PATCH 2/6] Now it works with original pydyf module --- weasyprint/images.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/weasyprint/images.py b/weasyprint/images.py index 2097f35ec..9cb8aa749 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -13,7 +13,7 @@ from PIL import Image, ImageFile -from pydyf import DelayedBytes +import pydyf from .layout.percent import percentage from .logger import LOGGER from .rotate_fn import rotate_pillow_image @@ -680,7 +680,7 @@ def _handle_degenerate(self, size_x, size_y): return size_x, size_y -class DelayedPillowImage(DelayedBytes): +class DelayedPillowImage(pydyf.Object): def __init__(self, pillow_image, path: Union[str, Path], orientation, optimize_size: Collection[str]): """ @@ 
-703,8 +703,9 @@ def __init__(self, pillow_image, def __repr__(self): return f"" - - def get_bytes(self): + + @property + def data(self) -> bytes: original_pillow_image = Image.open(self.path) rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation) From 07e43dc4c29ccf63b324d437235081705dd650b6 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 5 Mar 2023 14:40:26 +0100 Subject: [PATCH 3/6] Clean lazy image loading --- weasyprint/__init__.py | 8 +- weasyprint/document.py | 33 ++--- weasyprint/images.py | 287 +++++++++++++++++++++++---------------- weasyprint/pdf/stream.py | 106 ++------------- weasyprint/rotate_fn.py | 24 ---- 5 files changed, 189 insertions(+), 269 deletions(-) delete mode 100644 weasyprint/rotate_fn.py diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index b68055ce2..00061795f 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -133,9 +133,7 @@ def render(self, stylesheets=None, presentational_hints=False, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain: "fonts"; - "images" applies `optimize=True` parameter to image compression; - "not_jpegs" tries to prevent keep jpeg data same as in file. + Optimize size of generated PDF. Can contain "images" and "fonts". :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -183,9 +181,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain: "fonts"; - "images" applies `optimize=True` parameter to image compression; - "not_jpegs" tries to prevent keep jpeg data same as in file. + Optimize size of generated PDF. Can contain "images" and "fonts". 
:type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` diff --git a/weasyprint/document.py b/weasyprint/document.py index 933a44c81..6e1afcee2 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -2,10 +2,6 @@ import functools import io -import os -import shutil -from pathlib import Path -from tempfile import NamedTemporaryFile from . import CSS from .anchors import gather_anchors, make_page_bookmark_tree @@ -367,26 +363,13 @@ def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None, if finisher: finisher(self, pdf) + if target is None: + output = io.BytesIO() + pdf.write(output, version=pdf.version, identifier=identifier) + return output.getvalue() + if hasattr(target, 'write'): pdf.write(target, version=pdf.version, identifier=identifier) - return - - if target is None: - # TODO: Should we make None value for target parameter deprecated in write_pdf()? - # Returning bytes. - # You should avoid target=None value if you may run out of RAM. - # Consumes a double amount of memory. It creates document in BinaryIO and returns bytes copy from it. - # Just for a moment two copies of PDF document will be in memory. - # Also pydyf.PDF object is in a memory. 
- bytes_io = io.BytesIO() - pdf.write(bytes_io, version=pdf.version, identifier=identifier) - return bytes_io.getvalue() - - temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~") - try: - pdf.write(temp_file, version=pdf.version, identifier=identifier) - temp_file.close() - shutil.move(temp_file.name, target) - finally: - if os.path.exists(temp_file.name): - os.remove(temp_file.name) + else: + with open(target, 'wb') as fd: + pdf.write(fd, version=pdf.version, identifier=identifier) diff --git a/weasyprint/images.py b/weasyprint/images.py index 9cb8aa749..24ab93e74 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -2,21 +2,18 @@ import io import math +import struct from hashlib import md5 from io import BytesIO from itertools import cycle from math import inf -from pathlib import Path -from typing import Optional, Union, Collection -from urllib.parse import unquote from xml.etree import ElementTree -from PIL import Image, ImageFile - import pydyf +from PIL import Image, ImageFile, ImageOps + from .layout.percent import percentage from .logger import LOGGER -from .rotate_fn import rotate_pillow_image from .svg import SVG from .urls import URLFetchingError, fetch @@ -38,54 +35,137 @@ def from_exception(cls, exception): return cls(f'{name}: {value}' if value else name) - -RAM_SAVE = True -""" -This is temporary constant that defines RAM Saving when work with images. -When it is True, local images will not be kept im memory. -PROS: Lowers memory consumption. -CONS: Had to read images twice from disk. Might affect speed a little. -""" -# TODO: Should RAM_SAVE be introduced as a parameter into render() method? -# It was my temp quick-fix ugly solution to put it as module constant. 
- - class RasterImage: - def __init__(self, pillow_image, image_id, optimize_size, - url: Optional[str] = None, orientation: Optional[str] = None): + def __init__(self, pillow_image, image_id, optimize_size, cache_path=None): + self.id = image_id + self._cache_path = cache_path - pillow_image.id = image_id + if 'transparency' in pillow_image.info: + pillow_image = pillow_image.convert('RGBA') + elif pillow_image.mode in ('1', 'P', 'I'): + pillow_image = pillow_image.convert('RGB') - if RAM_SAVE and url and url.startswith("file://"): - quoted_path = url[7:] - path = unquote(quoted_path) - self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size) + self.width = pillow_image.width + self.height = pillow_image.height + self.ratio = (self.width / self.height) if self.height != 0 else inf + + if pillow_image.mode in ('RGB', 'RGBA'): + color_space = '/DeviceRGB' + elif pillow_image.mode in ('L', 'LA'): + color_space = '/DeviceGray' + elif pillow_image.mode == 'CMYK': + color_space = '/DeviceCMYK' else: - self._pillow_image = pillow_image - - self._optimize_size = optimize_size - self._intrinsic_width = pillow_image.width - self._intrinsic_height = pillow_image.height - self._intrinsic_ratio = ( - self._intrinsic_width / self._intrinsic_height - if self._intrinsic_height != 0 else inf) - - def get_intrinsic_size(self, image_resolution, font_size): - return ( - self._intrinsic_width / image_resolution, - self._intrinsic_height / image_resolution, - self._intrinsic_ratio) + LOGGER.warning('Unknown image mode: %s', pillow_image.mode) + color_space = '/DeviceRGB' + + self.extra = pydyf.Dictionary({ + 'Type': '/XObject', + 'Subtype': '/Image', + 'Width': self.width, + 'Height': self.height, + 'ColorSpace': color_space, + 'BitsPerComponent': 8, + }) + optimize = 'images' in optimize_size + if pillow_image.format in ('JPEG', 'MPO'): + self.extra['Filter'] = '/DCTDecode' + image_file = io.BytesIO() + pillow_image.save(image_file, format='JPEG', 
optimize=optimize) + self.stream = self.get_stream(image_file.getvalue()) + else: + self.extra['Filter'] = '/FlateDecode' + self.extra['DecodeParms'] = pydyf.Dictionary({ + # Predictor 15 specifies that we're providing PNG data, + # ostensibly using an "optimum predictor", but doesn't actually + # matter as long as the predictor value is 10+ according to the + # spec. (Other PNG predictor values assert that we're using + # specific predictors that we don't want to commit to, but + # "optimum" can vary.) + 'Predictor': 15, + 'Columns': self.width, + }) + if pillow_image.mode in ('RGB', 'RGBA'): + # Defaults to 1. + self.extra['DecodeParms']['Colors'] = 3 + if pillow_image.mode in ('RGBA', 'LA'): + alpha = pillow_image.getchannel('A') + pillow_image = pillow_image.convert(pillow_image.mode[:-1]) + alpha_data = self._get_png_data(alpha, optimize) + stream = self.get_stream(alpha_data) + self.extra['SMask'] = pydyf.Stream(stream, extra={ + 'Filter': '/FlateDecode', + 'Type': '/XObject', + 'Subtype': '/Image', + 'DecodeParms': pydyf.Dictionary({ + 'Predictor': 15, + 'Columns': pillow_image.width, + }), + 'Width': pillow_image.width, + 'Height': pillow_image.height, + 'ColorSpace': '/DeviceGray', + 'BitsPerComponent': 8, + }) + + png_data = self._get_png_data(pillow_image, optimize) + self.stream = self.get_stream(png_data) + + def get_intrinsic_size(self, resolution, font_size): + return self.width / resolution, self.height / resolution, self.ratio def draw(self, stream, concrete_width, concrete_height, image_rendering): - if self._intrinsic_width <= 0 or self._intrinsic_height <= 0: + if self.width <= 0 or self.height <= 0: return - image_name = stream.add_image( - self._pillow_image, image_rendering, self._optimize_size) + image_name = stream.add_image(self, image_rendering) stream.transform( concrete_width, 0, 0, -concrete_height, 0, concrete_height) stream.draw_x_object(image_name) + @staticmethod + def _get_png_data(pillow_image, optimize): + image_file = 
io.BytesIO() + pillow_image.save(image_file, format='PNG', optimize=optimize) + + # Read the PNG header, then discard it because we know it's a PNG. If + # this weren't just output from Pillow, we should actually check it. + image_file.seek(8) + + png_data = [] + raw_chunk_length = image_file.read(4) + # PNG files consist of a series of chunks. + while raw_chunk_length: + # Each chunk begins with its data length (four bytes, may be zero), + # then its type (four ASCII characters), then the data, then four + # bytes of a CRC. + chunk_len, = struct.unpack('!I', raw_chunk_length) + chunk_type = image_file.read(4) + if chunk_type == b'IDAT': + png_data.append(image_file.read(chunk_len)) + else: + image_file.seek(chunk_len, io.SEEK_CUR) + # We aren't checking the CRC, we assume this is a valid PNG. + image_file.seek(4, io.SEEK_CUR) + raw_chunk_length = image_file.read(4) + + return b''.join(png_data) + + def get_stream(self, data, alpha=False): + if self._cache_path: + path = self._cache_path / f'{self.id}{int(alpha)}' + path.write_bytes(data) + return [LazyImage(path)] + else: + return [data] + + +class LazyImage: + def __init__(self, path): + self._path = path + + def __bytes__(self): + self._path.read_bytes() + class SVGImage: def __init__(self, tree, base_url, url_fetcher, context): @@ -133,38 +213,36 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url, string = result['file_obj'].read() mime_type = forced_mime_type or result['mime_type'] - image = None - svg_exceptions = [] - # Try to rely on given mimetype for SVG - if mime_type == 'image/svg+xml': + image = None + svg_exceptions = [] + # Try to rely on given mimetype for SVG + if mime_type == 'image/svg+xml': + try: + tree = ElementTree.fromstring(string) + image = SVGImage(tree, url, url_fetcher, context) + except Exception as svg_exception: + svg_exceptions.append(svg_exception) + # Try pillow for raster images, or for failing SVG + if image is None: + try: + pillow_image = 
Image.open(BytesIO(string)) + except Exception as raster_exception: + if mime_type == 'image/svg+xml': + # Tried SVGImage then Pillow for a SVG, abort + raise ImageLoadingError.from_exception(svg_exceptions[0]) try: + # Last chance, try SVG tree = ElementTree.fromstring(string) image = SVGImage(tree, url, url_fetcher, context) - except Exception as svg_exception: - svg_exceptions.append(svg_exception) - # Try pillow for raster images, or for failing SVG - if image is None: - try: - pillow_image = Image.open(BytesIO(string)) - except Exception as raster_exception: - if mime_type == 'image/svg+xml': - # Tried SVGImage then Pillow for a SVG, abort - raise ImageLoadingError.from_exception( - svg_exceptions[0]) - try: - # Last chance, try SVG - tree = ElementTree.fromstring(string) - image = SVGImage(tree, url, url_fetcher, context) - except Exception: - # Tried Pillow then SVGImage for a raster, abort - raise ImageLoadingError.from_exception( - raster_exception) - else: - # Store image id to enable cache in Stream.add_image - image_id = md5(url.encode()).hexdigest() - # Keep image format as it is discarded by transposition - pillow_image = rotate_pillow_image(pillow_image, orientation) - image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation) + except Exception: + # Tried Pillow then SVGImage for a raster, abort + raise ImageLoadingError.from_exception(raster_exception) + else: + # Store image id to enable cache in Stream.add_image + image_id = md5(url.encode()).hexdigest() + # Keep image format as it is discarded by transposition + pillow_image = rotate_pillow_image(pillow_image, orientation) + image = RasterImage(pillow_image, image_id, optimize_size) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) @@ -173,6 +251,28 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url, return image +def rotate_pillow_image(pillow_image, orientation): + """Return 
a copy of a Pillow image with modified orientation. + + If orientation is not changed, return the same image. + + """ + image_format = pillow_image.format + if orientation == 'from-image': + if 'exif' in pillow_image.info: + pillow_image = ImageOps.exif_transpose(pillow_image) + elif orientation != 'none': + angle, flip = orientation + if angle > 0: + rotation = getattr(Image.Transpose, f'ROTATE_{angle}') + pillow_image = pillow_image.transpose(rotation) + if flip: + pillow_image = pillow_image.transpose( + Image.Transpose.FLIP_LEFT_RIGHT) + pillow_image.format = image_format + return pillow_image + + def process_color_stops(vector_length, positions): """Give color stops positions on the gradient vector. @@ -678,48 +778,3 @@ def _handle_degenerate(self, size_x, size_y): size_x = 1e7 size_y = 1e-7 return size_x, size_y - - -class DelayedPillowImage(pydyf.Object): - def __init__(self, pillow_image, - path: Union[str, Path], orientation, optimize_size: Collection[str]): - """ - Memory efficient replacer of PIL Image. - Does not keep image in memory. Retreives image bytes on demand. - """ - - # Those are paramerers to recreate image bytes from file - self.path = path - self.orientation = orientation - self.optimize_size = optimize_size - - # These parameters of original Image object that used somewhere else. 
- self.id = pillow_image.id - self.info = pillow_image.info - self.mode = pillow_image.mode - self.width = pillow_image.width - self.height = pillow_image.height - self.format = pillow_image.format - - def __repr__(self): - return f"" - - @property - def data(self) -> bytes: - - original_pillow_image = Image.open(self.path) - rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation) - - - if rotated_pillow_image is original_pillow_image: - if original_pillow_image.format == 'JPEG' and ('not_jpegs' in self.optimize_size): - return Path(self.path).read_bytes() - - optimize = 'images' in self.optimize_size - return get_jpeg_bytes(rotated_pillow_image, optimize) - - -def get_jpeg_bytes(pillow_image, optimize: bool): - image_file = io.BytesIO() - pillow_image.save(image_file, format='JPEG', optimize=optimize) - return image_file.getvalue() diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index 8fc929ae4..d08bd974b 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -1,7 +1,7 @@ """PDF stream.""" import io -import struct +from copy import deepcopy from functools import lru_cache from hashlib import md5 @@ -10,7 +10,6 @@ from fontTools.ttLib import TTFont, TTLibError, ttFont from fontTools.varLib.mutator import instantiateVariableFont -from ..images import DelayedPillowImage, get_jpeg_bytes from ..logger import LOGGER from ..matrix import Matrix from ..text.ffi import ffi, harfbuzz, pango, units_to_double @@ -363,109 +362,20 @@ def add_group(self, x, y, width, height): self._x_objects[group.id] = group return group - def _get_png_data(self, pillow_image, optimize): - image_file = io.BytesIO() - pillow_image.save(image_file, format='PNG', optimize=optimize) - - # Read the PNG header, then discard it because we know it's a PNG. If - # this weren't just output from Pillow, we should actually check it. 
- image_file.seek(8) - - png_data = b'' - raw_chunk_length = image_file.read(4) - # PNG files consist of a series of chunks. - while len(raw_chunk_length) > 0: - # Each chunk begins with its data length (four bytes, may be zero), - # then its type (four ASCII characters), then the data, then four - # bytes of a CRC. - chunk_len, = struct.unpack('!I', raw_chunk_length) - chunk_type = image_file.read(4) - if chunk_type == b'IDAT': - png_data += image_file.read(chunk_len) - else: - image_file.seek(chunk_len, io.SEEK_CUR) - # We aren't checking the CRC, we assume this is a valid PNG. - image_file.seek(4, io.SEEK_CUR) - raw_chunk_length = image_file.read(4) - - return png_data - - def add_image(self, pillow_image, image_rendering, optimize_size): - image_name = f'i{pillow_image.id}' + def add_image(self, image, image_rendering): + image_name = f'i{image.id}{image_rendering}' self._x_objects[image_name] = None # Set by write_pdf if image_name in self._images: # Reuse image already stored in document return image_name - if 'transparency' in pillow_image.info: - pillow_image = pillow_image.convert('RGBA') - elif pillow_image.mode in ('1', 'P', 'I'): - pillow_image = pillow_image.convert('RGB') - - if pillow_image.mode in ('RGB', 'RGBA'): - color_space = '/DeviceRGB' - elif pillow_image.mode in ('L', 'LA'): - color_space = '/DeviceGray' - elif pillow_image.mode == 'CMYK': - color_space = '/DeviceCMYK' - else: - LOGGER.warning('Unknown image mode: %s', pillow_image.mode) - color_space = '/DeviceRGB' - interpolate = 'true' if image_rendering == 'auto' else 'false' - extra = pydyf.Dictionary({ - 'Type': '/XObject', - 'Subtype': '/Image', - 'Width': pillow_image.width, - 'Height': pillow_image.height, - 'ColorSpace': color_space, - 'BitsPerComponent': 8, - 'Interpolate': interpolate, - }) + extra = deepcopy(image.extra) + extra['Interpolate'] = interpolate + if 'SMask' in extra: + extra['SMask'].extra['Interpolate'] = interpolate - optimize = 'images' in optimize_size - if 
pillow_image.format in ('JPEG', 'MPO'): - extra['Filter'] = '/DCTDecode' - if isinstance(pillow_image, DelayedPillowImage): - stream = [pillow_image] - else: - stream = [get_jpeg_bytes(pillow_image, optimize)] - else: - extra['Filter'] = '/FlateDecode' - extra['DecodeParms'] = pydyf.Dictionary({ - # Predictor 15 specifies that we're providing PNG data, - # ostensibly using an "optimum predictor", but doesn't actually - # matter as long as the predictor value is 10+ according to the - # spec. (Other PNG predictor values assert that we're using - # specific predictors that we don't want to commit to, but - # "optimum" can vary.) - 'Predictor': 15, - 'Columns': pillow_image.width, - }) - if pillow_image.mode in ('RGB', 'RGBA'): - # Defaults to 1. - extra['DecodeParms']['Colors'] = 3 - if pillow_image.mode in ('RGBA', 'LA'): - alpha = pillow_image.getchannel('A') - pillow_image = pillow_image.convert(pillow_image.mode[:-1]) - alpha_data = self._get_png_data(alpha, optimize) - extra['SMask'] = pydyf.Stream([alpha_data], extra={ - 'Filter': '/FlateDecode', - 'Type': '/XObject', - 'Subtype': '/Image', - 'DecodeParms': pydyf.Dictionary({ - 'Predictor': 15, - 'Columns': pillow_image.width, - }), - 'Width': pillow_image.width, - 'Height': pillow_image.height, - 'ColorSpace': '/DeviceGray', - 'BitsPerComponent': 8, - 'Interpolate': interpolate, - }) - stream = [self._get_png_data(pillow_image, optimize)] - - xobject = pydyf.Stream(stream, extra=extra) + xobject = pydyf.Stream(image.stream, extra=extra) self._images[image_name] = xobject return image_name diff --git a/weasyprint/rotate_fn.py b/weasyprint/rotate_fn.py deleted file mode 100644 index a4cde3622..000000000 --- a/weasyprint/rotate_fn.py +++ /dev/null @@ -1,24 +0,0 @@ -from PIL import ImageOps, Image - - -def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image: - """ - Returns either absolute same image if orientation was not changed. - or its copy with modified orientation. 
- """ - image_format = pillow_image.format - if orientation == 'from-image': - if 'exif' in pillow_image.info: - pillow_image = ImageOps.exif_transpose( - pillow_image) - elif orientation != 'none': - angle, flip = orientation - if angle > 0: - rotation = getattr( - Image.Transpose, f'ROTATE_{angle}') - pillow_image = pillow_image.transpose(rotation) - if flip: - pillow_image = pillow_image.transpose( - Image.Transpose.FLIP_LEFT_RIGHT) - pillow_image.format = image_format - return pillow_image From 44001c53832c6d5df120786ffc68894afdc76e76 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 5 Mar 2023 22:56:41 +0100 Subject: [PATCH 4/6] Add --cache-folder option to temporarily store images on disk --- weasyprint/__init__.py | 12 ++++++++-- weasyprint/__main__.py | 10 ++++++++ weasyprint/document.py | 53 +++++++++++++++++++++++++++++++++++++++++- weasyprint/images.py | 35 +++++++++++++++------------- 4 files changed, 91 insertions(+), 19 deletions(-) diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 00061795f..4a6822946 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -138,7 +138,11 @@ def render(self, stylesheets=None, presentational_hints=False, :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` :param counter_style: A dictionary storing ``@counter-style`` rules. - :param dict image_cache: A dictionary used to cache images. + :param image_cache: + A dictionary used to cache images, or a folder path where images + are temporarily stored. + :type image_cache: + :obj:`dict`, :obj:`str` or :class:`document.DiskCache` :param bool forms: Whether PDF forms have to be included. :returns: A :class:`document.Document` object. @@ -186,7 +190,11 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param font_config: A font configuration handling ``@font-face`` rules. 
:type counter_style: :class:`css.counters.CounterStyle` :param counter_style: A dictionary storing ``@counter-style`` rules. - :param dict image_cache: A dictionary used to cache images. + :param image_cache: + A dictionary used to cache images, or a folder path where images + are temporarily stored. + :type image_cache: + :obj:`dict`, :obj:`str` or :class:`document.DiskCache` :param bytes identifier: A bytestring used as PDF file identifier. :param str variant: A PDF variant name. :param str version: A PDF version number. diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index dfe4b38fa..9ddabee52 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -94,6 +94,11 @@ def main(argv=None, stdout=None, stdin=None): multiple times, ``all`` adds all allowed values, ``none`` removes all previously set values. + .. option:: -c <folder>, --cache-folder <folder> + + Store cache on disk instead of memory. The ``folder`` is created if + needed and cleaned after the PDF is generated. + .. option:: -v, --verbose Show warnings and information messages. @@ -156,6 +161,10 @@ def main(argv=None, stdout=None, stdin=None): '-O', '--optimize-size', action='append', help='optimize output size for specified features', choices=('images', 'fonts', 'all', 'none'), default=['fonts']) + parser.add_argument( + '-c', '--cache-folder', + help='Store cache on disk instead of memory. The ``folder`` is ' + 'created if needed and cleaned after the PDF is generated.') parser.add_argument( '-v', '--verbose', action='store_true', help='show warnings and information messages') @@ -203,6 +212,7 @@ def main(argv=None, stdout=None, stdin=None): 'version': args.pdf_version, 'forms': args.pdf_forms, 'custom_metadata': args.custom_metadata, + 'image_cache': args.cache_folder, } # Default to logging to stderr. 
diff --git a/weasyprint/document.py b/weasyprint/document.py index 6e1afcee2..909fafdbd 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -2,6 +2,8 @@ import functools import io +from hashlib import md5 +from pathlib import Path from . import CSS from .anchors import gather_anchors, make_page_bookmark_tree @@ -158,6 +160,52 @@ def __init__(self, title=None, authors=None, description=None, self.custom = custom or {} +class DiskCache: + """Dict-like object storing image content on disk. + + Bytestring values are stored on disk. Other Python objects (i.e. + RasterImage instances) are still stored in memory, but are much more + lightweight. + + """ + def __init__(self, folder): + self._path = Path(folder) + self._path.mkdir(parents=True, exist_ok=True) + self._memory_cache = {} + self._disk_paths = set() + + def _path_from_key(self, key): + return self._path / md5(key.encode()).hexdigest() + + def __getitem__(self, key): + if key in self._memory_cache: + return self._memory_cache[key] + else: + return self._path_from_key(key).read_bytes() + + def __setitem__(self, key, value): + if isinstance(value, bytes): + path = self._path_from_key(key) + self._disk_paths.add(path) + path.write_bytes(value) + else: + self._memory_cache[key] = value + + def __contains__(self, key): + return ( + key in self._memory_cache or + self._path_from_key(key).exists()) + + def __del__(self): + try: + for path in self._disk_paths: + path.unlink(missing_ok=True) + self._path.rmdir() + except Exception: + # Silently ignore errors while clearing cache + pass + + class Document: """A rendered document ready to be painted in a pydyf stream. 
@@ -180,7 +228,10 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, target_collector = TargetCollector() page_rules = [] user_stylesheets = [] - image_cache = {} if image_cache is None else image_cache + if image_cache is None: + image_cache = {} + elif not isinstance(image_cache, DiskCache): + image_cache = DiskCache(image_cache) for css in stylesheets or []: if not hasattr(css, 'matcher'): css = CSS( diff --git a/weasyprint/images.py b/weasyprint/images.py index 24ab93e74..9e731ea18 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -36,9 +36,9 @@ def from_exception(cls, exception): class RasterImage: - def __init__(self, pillow_image, image_id, optimize_size, cache_path=None): + def __init__(self, pillow_image, image_id, optimize_size, cache): self.id = image_id - self._cache_path = cache_path + self._cache = cache if 'transparency' in pillow_image.info: pillow_image = pillow_image.convert('RGBA') @@ -92,7 +92,7 @@ def __init__(self, pillow_image, image_id, optimize_size, cache_path=None): alpha = pillow_image.getchannel('A') pillow_image = pillow_image.convert(pillow_image.mode[:-1]) alpha_data = self._get_png_data(alpha, optimize) - stream = self.get_stream(alpha_data) + stream = self.get_stream(alpha_data, alpha=True) self.extra['SMask'] = pydyf.Stream(stream, extra={ 'Filter': '/FlateDecode', 'Type': '/XObject', @@ -151,20 +151,20 @@ def _get_png_data(pillow_image, optimize): return b''.join(png_data) def get_stream(self, data, alpha=False): - if self._cache_path: - path = self._cache_path / f'{self.id}{int(alpha)}' - path.write_bytes(data) - return [LazyImage(path)] - else: - return [data] + key = f'{self.id}{int(alpha)}' + return [LazyImage(self._cache, key, data)] -class LazyImage: - def __init__(self, path): - self._path = path +class LazyImage(pydyf.Object): + def __init__(self, cache, key, data): + super().__init__() + self._key = key + self._cache = cache + cache[key] = data - def __bytes__(self): - 
self._path.read_bytes() + @property + def data(self): + return self._cache[self._key] class SVGImage: @@ -240,13 +240,14 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url, else: # Store image id to enable cache in Stream.add_image image_id = md5(url.encode()).hexdigest() - # Keep image format as it is discarded by transposition pillow_image = rotate_pillow_image(pillow_image, orientation) - image = RasterImage(pillow_image, image_id, optimize_size) + image = RasterImage( + pillow_image, image_id, optimize_size, cache) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) image = None + cache[url] = image return image @@ -269,6 +270,8 @@ def rotate_pillow_image(pillow_image, orientation): if flip: pillow_image = pillow_image.transpose( Image.Transpose.FLIP_LEFT_RIGHT) + + # Keep image format as it is discarded by transposition pillow_image.format = image_format return pillow_image From ab6daa2a035882e6c738c92e3465c17935669556 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Mon, 6 Mar 2023 16:35:03 +0100 Subject: [PATCH 5/6] Fix emojis --- weasyprint/draw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weasyprint/draw.py b/weasyprint/draw.py index 66b427b9c..d7e2961a1 100644 --- a/weasyprint/draw.py +++ b/weasyprint/draw.py @@ -1199,7 +1199,7 @@ def draw_first_line(stream, textbox, text_overflow, block_ellipsis, x, y, pillow_image = Image.open(BytesIO(png_data)) image_id = f'{font.hash}{glyph}' image = RasterImage( - pillow_image, image_id, optimize_size=()) + pillow_image, image_id, optimize_size=(), cache={}) d = font.widths[glyph] / 1000 a = pillow_image.width / pillow_image.height * d pango.pango_font_get_glyph_extents( From c7087d3d86b75e16a2dc96f9978d6886ffc2390c Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Mon, 13 Mar 2023 17:04:15 +0100 Subject: [PATCH 6/6] =?UTF-8?q?Don=E2=80=99t=20deepcopy=20streams=20dicts?= 
=?UTF-8?q?=20when=20duplicating=20for=20interpolation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unsurprisingly, deepcopy doesn’t work well on SMask streams :). --- weasyprint/pdf/stream.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index d08bd974b..a9950836e 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -1,7 +1,6 @@ """PDF stream.""" import io -from copy import deepcopy from functools import lru_cache from hashlib import md5 @@ -370,9 +369,12 @@ def add_image(self, image, image_rendering): return image_name interpolate = 'true' if image_rendering == 'auto' else 'false' - extra = deepcopy(image.extra) + extra = image.extra.copy() extra['Interpolate'] = interpolate if 'SMask' in extra: + extra['SMask'] = pydyf.Stream( + extra['SMask'].stream.copy(), extra['SMask'].extra.copy(), + extra['SMask'].compress) extra['SMask'].extra['Interpolate'] = interpolate xobject = pydyf.Stream(image.stream, extra=extra)