diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 00061795f..4a6822946 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -138,7 +138,11 @@ def render(self, stylesheets=None, presentational_hints=False, :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` :param counter_style: A dictionary storing ``@counter-style`` rules. - :param dict image_cache: A dictionary used to cache images. + :param image_cache: + A dictionary used to cache images, or a folder path where images + are temporarily stored. + :type image_cache: + :obj:`dict`, :obj:`str` or :class:`document.DiskCache` :param bool forms: Whether PDF forms have to be included. :returns: A :class:`document.Document` object. @@ -186,7 +190,11 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` :param counter_style: A dictionary storing ``@counter-style`` rules. - :param dict image_cache: A dictionary used to cache images. + :param image_cache: + A dictionary used to cache images, or a folder path where images + are temporarily stored. + :type image_cache: + :obj:`dict`, :obj:`str` or :class:`document.DiskCache` :param bytes identifier: A bytestring used as PDF file identifier. :param str variant: A PDF variant name. :param str version: A PDF version number. diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index dfe4b38fa..9ddabee52 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -94,6 +94,11 @@ def main(argv=None, stdout=None, stdin=None): multiple times, ``all`` adds all allowed values, ``none`` removes all previously set values. + .. option:: -c <folder>, --cache-folder <folder> + + Store cache on disk instead of memory. The ``folder`` is created if + needed and cleaned after the PDF is generated. + ..
class DiskCache:
    """Dict-like cache storing image content on disk.

    Bytestring values are written to files in the cache folder, each file
    being named after the MD5 hex digest of its key. Other Python objects
    (i.e. RasterImage instances) are kept in memory, as they are much more
    lightweight.

    Only item assignment, item access and the ``in`` operator are supported.
    Keys must be strings, as they are encoded to build file names.

    :param folder:
        Path of the folder where bytestrings are stored. It is created, with
        its missing parents, if it doesn't exist. The files written by this
        instance, then the folder itself if it is empty, are removed when the
        instance is deleted.

    """
    def __init__(self, folder):
        self._path = Path(folder)
        self._path.mkdir(parents=True, exist_ok=True)
        # Values that are not bytestrings, indexed by key.
        self._memory_cache = {}
        # Paths of the files written by this instance, removed in __del__.
        self._disk_paths = set()

    def _path_from_key(self, key):
        """Return the path of the file storing the value of ``key``."""
        return self._path / md5(key.encode()).hexdigest()

    def __getitem__(self, key):
        """Return the value stored for ``key``.

        :raises KeyError: If ``key`` is neither in memory nor on disk.

        """
        try:
            return self._memory_cache[key]
        except KeyError:
            pass
        try:
            return self._path_from_key(key).read_bytes()
        except FileNotFoundError:
            # Behave like a dict for missing keys instead of leaking the
            # storage-level error.
            raise KeyError(key) from None

    def __setitem__(self, key, value):
        """Store ``value`` on disk if it's a bytestring, else in memory."""
        if isinstance(value, bytes):
            # Memory is looked up first by __getitem__: drop the object that
            # may have been stored for this key, or it would hide the new
            # value.
            self._memory_cache.pop(key, None)
            path = self._path_from_key(key)
            # Register the path before writing, so that a partially written
            # file is cleaned too.
            self._disk_paths.add(path)
            path.write_bytes(value)
        else:
            self._memory_cache[key] = value

    def __contains__(self, key):
        """Return whether a value is stored for ``key``."""
        return (
            key in self._memory_cache or
            self._path_from_key(key).exists())

    def __del__(self):
        """Remove the files written by this instance, then the folder.

        Cleaning is done on a best-effort basis: errors are silently ignored,
        and the folder is kept if it still contains other files.

        """
        # Attributes are missing if __init__ failed before setting them.
        for path in getattr(self, '_disk_paths', ()):
            try:
                path.unlink(missing_ok=True)
            except Exception:
                # A file that can't be removed must not prevent the other
                # ones from being cleaned.
                continue
        folder = getattr(self, '_path', None)
        if folder is not None:
            try:
                folder.rmdir()
            except Exception:
                # Silently ignore errors while clearing cache
                pass
class RasterImage:
    """Raster image whose PDF data is encoded when the image is created.

    The encoded bytestrings are handed to ``cache`` through
    :class:`LazyImage` objects, the instance only keeps the image size and
    the PDF dictionary describing the image.

    :param pillow_image: The Pillow image to encode.
    :param image_id: Identifier used to build cache keys and image names.
    :param optimize_size:
        Container of optimization names, images are optimized if it includes
        ``'images'``.
    :param cache: Dict-like object where encoded data is stored.

    """
    def __init__(self, pillow_image, image_id, optimize_size, cache):
        self.id = image_id
        self._cache = cache

        # Get rid of palettes, bitmaps and integer pixels before encoding.
        if 'transparency' in pillow_image.info:
            pillow_image = pillow_image.convert('RGBA')
        elif pillow_image.mode in ('1', 'P', 'I'):
            pillow_image = pillow_image.convert('RGB')

        self.width, self.height = pillow_image.width, pillow_image.height
        self.ratio = inf if self.height == 0 else self.width / self.height

        mode = pillow_image.mode
        color_spaces = {
            'RGB': '/DeviceRGB',
            'RGBA': '/DeviceRGB',
            'L': '/DeviceGray',
            'LA': '/DeviceGray',
            'CMYK': '/DeviceCMYK',
        }
        color_space = color_spaces.get(mode)
        if color_space is None:
            LOGGER.warning('Unknown image mode: %s', mode)
            color_space = '/DeviceRGB'

        self.extra = pydyf.Dictionary({
            'Type': '/XObject',
            'Subtype': '/Image',
            'Width': self.width,
            'Height': self.height,
            'ColorSpace': color_space,
            'BitsPerComponent': 8,
        })

        optimize = 'images' in optimize_size

        # The format is read after the conversions above, as they return new
        # image objects.
        if pillow_image.format in ('JPEG', 'MPO'):
            self.extra['Filter'] = '/DCTDecode'
            jpeg_file = io.BytesIO()
            pillow_image.save(jpeg_file, format='JPEG', optimize=optimize)
            self.stream = self.get_stream(jpeg_file.getvalue())
            return

        # Everything else is stored as PNG-predicted deflated data.
        self.extra['Filter'] = '/FlateDecode'
        decode_parms = pydyf.Dictionary({
            # Predictor 15 specifies that we're providing PNG data,
            # ostensibly using an "optimum predictor", but doesn't actually
            # matter as long as the predictor value is 10+ according to the
            # spec. (Other PNG predictor values assert that we're using
            # specific predictors that we don't want to commit to, but
            # "optimum" can vary.)
            'Predictor': 15,
            'Columns': self.width,
        })
        self.extra['DecodeParms'] = decode_parms
        if mode in ('RGB', 'RGBA'):
            # Defaults to 1.
            decode_parms['Colors'] = 3

        if mode in ('RGBA', 'LA'):
            # Store the alpha channel as a separate gray soft mask, and
            # remove it from the image itself.
            alpha = pillow_image.getchannel('A')
            pillow_image = pillow_image.convert(mode[:-1])
            alpha_data = self._get_png_data(alpha, optimize)
            alpha_stream = self.get_stream(alpha_data, alpha=True)
            self.extra['SMask'] = pydyf.Stream(alpha_stream, extra={
                'Filter': '/FlateDecode',
                'Type': '/XObject',
                'Subtype': '/Image',
                'DecodeParms': pydyf.Dictionary({
                    'Predictor': 15,
                    'Columns': pillow_image.width,
                }),
                'Width': pillow_image.width,
                'Height': pillow_image.height,
                'ColorSpace': '/DeviceGray',
                'BitsPerComponent': 8,
            })

        self.stream = self.get_stream(
            self._get_png_data(pillow_image, optimize))

    def get_intrinsic_size(self, resolution, font_size):
        """Return width and height divided by resolution, and the ratio."""
        intrinsic_width = self.width / resolution
        intrinsic_height = self.height / resolution
        return intrinsic_width, intrinsic_height, self.ratio

    def draw(self, stream, concrete_width, concrete_height, image_rendering):
        """Draw the image on ``stream``, unless one of its sides is empty."""
        if min(self.width, self.height) <= 0:
            return

        image_name = stream.add_image(self, image_rendering)
        stream.transform(
            concrete_width, 0, 0, -concrete_height, 0, concrete_height)
        stream.draw_x_object(image_name)

    @staticmethod
    def _get_png_data(pillow_image, optimize):
        """Return the concatenated IDAT chunks of the image saved as PNG."""
        image_file = io.BytesIO()
        pillow_image.save(image_file, format='PNG', optimize=optimize)

        # Skip the 8-byte PNG header. It's not checked, as the file has just
        # been written by Pillow.
        image_file.seek(8)

        # A PNG file is a series of chunks, each made of its data length
        # (four bytes, may be zero), its type (four ASCII characters), its
        # data and four bytes of CRC. Reading stops when no length is left.
        idat_chunks = []
        for raw_length in iter(lambda: image_file.read(4), b''):
            length, = struct.unpack('!I', raw_length)
            if image_file.read(4) == b'IDAT':
                idat_chunks.append(image_file.read(length))
            else:
                image_file.seek(length, io.SEEK_CUR)
            # The CRC is not checked, the PNG is assumed to be valid.
            image_file.seek(4, io.SEEK_CUR)

        return b''.join(idat_chunks)

    def get_stream(self, data, alpha=False):
        """Store ``data`` in cache and return a stream list reading it."""
        lazy_image = LazyImage(self._cache, f'{self.id}{int(alpha)}', data)
        return [lazy_image]


class LazyImage(pydyf.Object):
    """PDF object whose data is stored in a cache and read on access.

    :param cache: Dict-like object where ``data`` is stored.
    :param key: Key used to store ``data`` in ``cache``.
    :param data: The data to store.

    """
    def __init__(self, cache, key, data):
        super().__init__()
        self._cache = cache
        self._key = key
        # Only the cache holds the data, it is fetched again by the property.
        self._cache[self._key] = data

    @property
    def data(self):
        """Data read from the cache."""
        return self._cache[self._key]
def rotate_pillow_image(pillow_image, orientation):
    """Return a Pillow image rotated and flipped according to orientation.

    ``orientation`` is either ``'from-image'`` to follow the EXIF metadata of
    the image when it has some, ``'none'`` to keep the image as it is, or an
    ``(angle, flip)`` pair giving the rotation to apply and whether the image
    has to be mirrored horizontally afterwards.

    The same image object is returned when no transposition is applied.

    """
    original_format = pillow_image.format

    if orientation == 'from-image':
        if 'exif' in pillow_image.info:
            pillow_image = ImageOps.exif_transpose(pillow_image)
    elif orientation != 'none':
        angle, flip = orientation
        transpositions = []
        if angle > 0:
            transpositions.append(getattr(Image.Transpose, f'ROTATE_{angle}'))
        if flip:
            transpositions.append(Image.Transpose.FLIP_LEFT_RIGHT)
        for transposition in transpositions:
            pillow_image = pillow_image.transpose(transposition)

    # Transposed images lose their format, set it back.
    pillow_image.format = original_format
    return pillow_image
def add_image(self, image, image_rendering):
    """Register an image in the stream and return its X object name.

    The name is built from the image id and the ``image_rendering`` value.
    The image is always registered in the stream X objects, but the PDF
    stream is only created the first time a name is requested.

    :param image:
        Object providing ``id``, ``extra`` and ``stream`` attributes.
    :param image_rendering:
        Rendering value, interpolation is enabled only for ``'auto'``.
    :returns: The name of the image X object.

    """
    image_name = f'i{image.id}{image_rendering}'
    self._x_objects[image_name] = None  # Set by write_pdf

    if image_name not in self._images:
        interpolate = 'true' if image_rendering == 'auto' else 'false'

        # Work on copies, as the image dictionary is shared by all the
        # renderings of the same image.
        extra = image.extra.copy()
        extra['Interpolate'] = interpolate
        if 'SMask' in extra:
            shared_mask = extra['SMask']
            mask = pydyf.Stream(
                shared_mask.stream.copy(), shared_mask.extra.copy(),
                shared_mask.compress)
            mask.extra['Interpolate'] = interpolate
            extra['SMask'] = mask

        self._images[image_name] = pydyf.Stream(image.stream, extra=extra)

    # Either just created or already stored in document
    return image_name