From 6b82aec1031198b1acd23c11bbc5c3c9f82efd8a Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 21 Mar 2023 14:58:09 +0100 Subject: [PATCH 01/20] =?UTF-8?q?Don=E2=80=99t=20draw=20invisible=20charac?= =?UTF-8?q?ters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even spaces seem to be ignored by PDF readers, that probably rely more on text layout than on actual word separators. --- weasyprint/draw.py | 4 ++++ weasyprint/svg/text.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/weasyprint/draw.py b/weasyprint/draw.py index ac107e1ec..0eafb776d 100644 --- a/weasyprint/draw.py +++ b/weasyprint/draw.py @@ -1052,6 +1052,10 @@ def draw_emojis(stream, font_size, x, y, emojis): def draw_first_line(stream, textbox, text_overflow, block_ellipsis, x, y, angle=0): """Draw the given ``textbox`` line to the document ``stream``.""" + # Don’t draw lines with only invisible characters + if not textbox.text.strip(): + return [] + font_size = textbox.style['font_size'] if font_size < 1e-6: # Default float precision used by pydyf return [] diff --git a/weasyprint/svg/text.py b/weasyprint/svg/text.py index 992900cae..48be5e7a7 100644 --- a/weasyprint/svg/text.py +++ b/weasyprint/svg/text.py @@ -12,6 +12,10 @@ def __init__(self, pango_layout, style): self.pango_layout = pango_layout self.style = style + @property + def text(self): + return self.pango_layout.text + def text(svg, node, font_size): """Draw text node.""" From e49d9555090ecec4f76b31f7beba18975f471211 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 21 Mar 2023 14:58:47 +0100 Subject: [PATCH 02/20] Make tests pass with pydyf 0.6.0 --- docs/first_steps.rst | 2 +- pyproject.toml | 2 +- tests/test_pdf.py | 40 ++++++++++++++++++++-------------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/first_steps.rst b/docs/first_steps.rst index 85c3ec857..e2aacefe8 100644 --- a/docs/first_steps.rst +++ b/docs/first_steps.rst @@ -11,7 +11,7 @@ 
WeasyPrint |version| depends on: * Python_ ≥ 3.7.0 * Pango_ ≥ 1.44.0 -* pydyf_ ≥ 0.5.0 +* pydyf_ ≥ 0.6.0 * CFFI_ ≥ 0.6 * html5lib_ ≥ 1.1 * tinycss2_ ≥ 1.0.0 diff --git a/pyproject.toml b/pyproject.toml index 1c1495473..570aef8af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ requires-python = '>=3.7' readme = {file = 'README.rst', content-type = 'text/x-rst'} license = {file = 'LICENSE'} dependencies = [ - 'pydyf >=0.5.0', + 'pydyf >=0.6.0', 'cffi >=0.6', 'html5lib >=1.1', 'tinycss2 >=1.0.0', diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 3bcc360ac..3d4dcdd0a 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -26,7 +26,7 @@ def test_page_size_zoom(zoom): pdf = FakeHTML(string=' test ''' % style).write_pdf() - assert '/MediaBox [ {} {} {} {} ]'.format(*media).encode() in pdf - assert '/BleedBox [ {} {} {} {} ]'.format(*bleed).encode() in pdf - assert '/TrimBox [ {} {} {} {} ]'.format(*trim).encode() in pdf + assert '/MediaBox [{} {} {} {}]'.format(*media).encode() in pdf + assert '/BleedBox [{} {} {} {}]'.format(*bleed).encode() in pdf + assert '/TrimBox [{} {} {} {}]'.format(*trim).encode() in pdf From fdbdfc150c6115defe8cb061e088bed9d9c09503 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 26 Mar 2023 13:08:19 +0200 Subject: [PATCH 03/20] Add API to compress generated PDF files This feature compresses PDF streams (as it was already the case) and asks pydyf to use a compact PDF structure with compressed object stream and cross-reference object (for PDF version >=1.5). 
--- docs/first_steps.rst | 5 +++-- tests/conftest.py | 2 +- tests/test_api.py | 45 ++++++++++++++++++++++++++------------ tests/testing_utils.py | 12 ++++++++++ weasyprint/__init__.py | 12 +++++----- weasyprint/__main__.py | 12 +++++----- weasyprint/document.py | 12 +++++----- weasyprint/pdf/__init__.py | 11 ++++++---- weasyprint/pdf/anchors.py | 14 ++++++------ weasyprint/pdf/fonts.py | 11 ++++++---- weasyprint/pdf/metadata.py | 4 ++-- weasyprint/pdf/pdfa.py | 6 ++--- weasyprint/pdf/pdfua.py | 4 ++-- weasyprint/pdf/stream.py | 7 +++--- 14 files changed, 100 insertions(+), 57 deletions(-) diff --git a/docs/first_steps.rst b/docs/first_steps.rst index e2aacefe8..2d2c1a485 100644 --- a/docs/first_steps.rst +++ b/docs/first_steps.rst @@ -513,7 +513,8 @@ WeasyPrint provides two options to deal with images: ``optimize_size`` and ``optimize_size`` can enable size optimization for images, but also for fonts. When enabled, the generated PDF will include smaller images and fonts, but the -rendering time may be slightly increased. +rendering time may be slightly increased. The whole structure of the PDF can be +compressed too. .. code-block:: python @@ -523,7 +524,7 @@ rendering time may be slightly increased. # Full size optimization, slower, but generated PDF is smaller HTML('https://example.org/').write_pdf( - 'example.pdf', optimize_size=('fonts', 'images')) + 'example.pdf', optimize_size=('fonts', 'images', 'pdf')) ``image_cache`` gives the possibility to use a cache for images, avoiding to download, parse and optimize them each time they are used. 
diff --git a/tests/conftest.py b/tests/conftest.py index 383d0e137..3fa957b55 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,7 +74,7 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1, def html_write_png(self, target=None, stylesheets=None, resolution=96, - presentational_hints=False, optimize_size=('fonts',), + presentational_hints=False, optimize_size=('fonts', 'pdf'), font_config=None, counter_style=None, image_cache=None): return self.render( stylesheets, presentational_hints=presentational_hints, diff --git a/tests/test_api.py b/tests/test_api.py index 27dea4c99..b8f23d206 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -303,11 +303,12 @@ def test_command_line_render(tmpdir): tmpdir.join(name).write_binary(pattern_bytes) # Reference + optimize_size = ('fonts', 'pdf') html_obj = FakeHTML(string=combined, base_url='dummy.html') - pdf_bytes = html_obj.write_pdf() + pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size) rotated_pdf_bytes = FakeHTML( string=combined, base_url='dummy.html', - media_type='screen').write_pdf() + media_type='screen').write_pdf(optimize_size=optimize_size) tmpdir.join('no_css.html').write_binary(html) tmpdir.join('combined.html').write_binary(combined) @@ -386,9 +387,9 @@ def test_command_line_render(tmpdir): assert stdout.count(b'attachment') == 0 stdout = _run('combined.html -') assert stdout.count(b'attachment') == 0 - stdout = _run('-a pattern.png combined.html -') + stdout = _run('-O none -a pattern.png combined.html -') assert stdout.count(b'attachment') == 1 - stdout = _run('-a style.css -a pattern.png combined.html -') + stdout = _run('-O none -a style.css -a pattern.png combined.html -') assert stdout.count(b'attachment') == 2 os.mkdir('subdirectory') @@ -423,42 +424,58 @@ def test_command_line_render(tmpdir): (4, '2.0'), )) def test_pdfa(version, pdf_version): - stdout = _run(f'--pdf-variant=pdf/a-{version}b - -', b'test') + stdout = _run(f'--pdf-variant=pdf/a-{version}b -O 
none - -', b'test') assert f'PDF-{pdf_version}'.encode() in stdout assert f'part="{version}"'.encode() in stdout +@pytest.mark.parametrize('version, pdf_version', ( + (1, '1.4'), + (2, '1.7'), + (3, '1.7'), + (4, '2.0'), +)) +def test_pdfa_compressed(version, pdf_version): + _run(f'--pdf-variant=pdf/a-{version}b - -', b'test') + + def test_pdfua(): - stdout = _run('--pdf-variant=pdf/ua-1 - -', b'test') + stdout = _run('--pdf-variant=pdf/ua-1 -O none - -', b'test') assert b'part="1"' in stdout +def test_pdfua_compressed(): + _run('--pdf-variant=pdf/ua-1 - -', b'test') + + def test_pdf_identifier(): - stdout = _run('--pdf-identifier=abc - -', b'test') + stdout = _run('--pdf-identifier=abc -O none - -', b'test') assert b'abc' in stdout def test_pdf_version(): - stdout = _run('--pdf-version=1.4 - -', b'test') + stdout = _run('--pdf-version=1.4 -O none - -', b'test') assert b'PDF-1.4' in stdout def test_pdf_custom_metadata(): - stdout = _run('--custom-metadata - -', b'') + stdout = _run( + '--custom-metadata -O none - -', + b'') assert b'/key' in stdout assert b'value' in stdout def test_bad_pdf_custom_metadata(): stdout = _run( - '--custom-metadata - -', + '--custom-metadata -O none - -', ''.encode('latin1')) assert b'value' not in stdout def test_partial_pdf_custom_metadata(): stdout = _run( - '--custom-metadata - -', + '--custom-metadata -O none - -', ''.encode('latin1')) assert b'/abcd0' in stdout assert b'value' in stdout @@ -470,7 +487,7 @@ def test_partial_pdf_custom_metadata(): (b'', b'/Tx'), )) def test_pdf_inputs(html, field): - stdout = _run('--pdf-forms - -', html) + stdout = _run('--pdf-forms -O none - -', html) assert b'AcroForm' in stdout assert field in stdout stdout = _run('- -', html) @@ -484,8 +501,8 @@ def test_pdf_inputs(html, field): )) def test_appearance(css, with_forms, without_forms): html = f''.encode() - assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms - assert (b'AcroForm' in _run('- -', html)) is without_forms + assert 
(b'AcroForm' in _run('--pdf-forms -O none - -', html)) is with_forms + assert (b'AcroForm' in _run(' -O none - -', html)) is without_forms def test_reproducible(): diff --git a/tests/testing_utils.py b/tests/testing_utils.py index a0212b405..20f143e26 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -53,6 +53,18 @@ def _ua_stylesheets(self, forms=False): TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET else stylesheet for stylesheet in super()._ua_stylesheets(forms)] + def write_pdf(self, target=None, stylesheets=None, zoom=1, + attachments=None, finisher=None, presentational_hints=False, + optimize_size=('fonts',), font_config=None, + counter_style=None, image_cache=None, identifier=None, + variant=None, version=None, forms=False, + custom_metadata=False): + # Override function to set PDF size optimization to False by default + return super().write_pdf( + target, stylesheets, zoom, attachments, finisher, + presentational_hints, optimize_size, font_config, counter_style, + image_cache, identifier, variant, version, forms, custom_metadata) + def resource_filename(basename): """Return the absolute path of the resource called ``basename``.""" diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index e3914f7d2..915d58533 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -118,8 +118,8 @@ def _ph_stylesheets(self): return [HTML5_PH_STYLESHEET] def render(self, stylesheets=None, presentational_hints=False, - optimize_size=('fonts',), font_config=None, counter_style=None, - image_cache=None, forms=False): + optimize_size=('fonts', 'pdf'), font_config=None, + counter_style=None, image_cache=None, forms=False): """Lay out and paginate the document, but do not (yet) export it. This returns a :class:`document.Document` object which provides @@ -133,7 +133,8 @@ def render(self, stylesheets=None, presentational_hints=False, :param bool presentational_hints: Whether HTML presentational hints are followed. 
:param tuple optimize_size: - Optimize size of generated PDF. Can contain "images" and "fonts". + Optimize size of generated PDF. Can contain "images", "fonts" and + "pdf". :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -153,7 +154,7 @@ def render(self, stylesheets=None, presentational_hints=False, def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts',), font_config=None, + optimize_size=('fonts', 'pdf'), font_config=None, counter_style=None, image_cache=None, identifier=None, variant=None, version=None, forms=False, custom_metadata=False): @@ -185,7 +186,8 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain "images" and "fonts". + Optimize size of generated PDF. Can contain "images", "fonts" and + "pdf". :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index 9ddabee52..e5bbf69f8 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -90,9 +90,10 @@ def main(argv=None, stdout=None, stdin=None): .. option:: -O , --optimize-size Optimize the size of generated documents. Supported types are - ``images``, ``fonts``, ``all`` and ``none``. This option can be used - multiple times, ``all`` adds all allowed values, ``none`` removes all - previously set values. + ``images``, ``fonts``, ``pdf``, ``all`` and ``none``. 
This option can + be used multiple times, ``all`` adds all allowed values, ``none`` + removes all previously set values (including the default ones, + ``fonts`` and ``pdf``). .. option:: -c , --cache-folder @@ -160,7 +161,8 @@ def main(argv=None, stdout=None, stdin=None): parser.add_argument( '-O', '--optimize-size', action='append', help='optimize output size for specified features', - choices=('images', 'fonts', 'all', 'none'), default=['fonts']) + choices=('images', 'fonts', 'pdf', 'all', 'none'), + default=['fonts', 'pdf']) parser.add_argument( '-c', '--cache-folder', help='Store cache on disk instead of memory. The ``folder`` is ' @@ -198,7 +200,7 @@ def main(argv=None, stdout=None, stdin=None): if arg == 'none': optimize_size.clear() elif arg == 'all': - optimize_size |= {'images', 'fonts'} + optimize_size |= {'images', 'fonts', 'pdf'} else: optimize_size.add(arg) diff --git a/weasyprint/document.py b/weasyprint/document.py index 909fafdbd..b213c0c0e 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -296,8 +296,8 @@ def __init__(self, pages, metadata, url_fetcher, font_config, # rendering is destroyed. This is needed as font_config.__del__ removes # fonts that may be used when rendering self.font_config = font_config - # Set of flags for PDF size optimization. Can contain "images" and - # "fonts". + # Set of flags for PDF size optimization. Can contain "images", "fonts" + # and "pdf". 
self._optimize_size = optimize_size def build_element_structure(self, structure, etree_element=None): @@ -414,13 +414,15 @@ def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None, if finisher: finisher(self, pdf) + compress = 'pdf' in self._optimize_size + if target is None: output = io.BytesIO() - pdf.write(output, version=pdf.version, identifier=identifier) + pdf.write(output, pdf.version, identifier, compress) return output.getvalue() if hasattr(target, 'write'): - pdf.write(target, version=pdf.version, identifier=identifier) + pdf.write(target, pdf.version, identifier, compress) else: with open(target, 'wb') as fd: - pdf.write(fd, version=pdf.version, identifier=identifier) + pdf.write(fd, pdf.version, identifier, compress) diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index dd344a32a..a4ddb5452 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -153,9 +153,10 @@ def generate_pdf(document, target, zoom, attachments, optimize_size, page_rectangle = ( left / scale, top / scale, (right - left) / scale, (bottom - top) / scale) + compress = 'pdf' in optimize_size stream = Stream( document.fonts, page_rectangle, states, x_objects, patterns, - shadings, images, mark) + shadings, images, mark, compress=compress) stream.transform(d=-1, f=(page.height * scale)) pdf.add_object(stream) page_streams.append(stream) @@ -175,10 +176,11 @@ def generate_pdf(document, target, zoom, attachments, optimize_size, add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark) add_annotations( - links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files) + links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files, + compress) add_inputs( page.inputs, matrix, pdf, pdf_page, resources, stream, - document.font_config.font_map) + document.font_config.font_map, compress) page.paint(stream, scale=scale) # Bleed @@ -281,6 +283,7 @@ def generate_pdf(document, target, zoom, attachments, optimize_size, # 
Apply PDF variants functions if variant: - variant_function(pdf, metadata, document, page_streams) + compress = 'pdf' in optimize_size + variant_function(pdf, metadata, document, page_streams, compress) return pdf diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index 33a6ac0ce..f907475a0 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -91,7 +91,8 @@ def add_outlines(pdf, bookmarks, parent=None): return outlines, count -def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map): +def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map, + compress): """Include form inputs in PDF.""" if not inputs: return @@ -118,7 +119,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map): input_name = pydyf.String(element.attrib.get('name', default_name)) # TODO: where does this 0.75 scale come from? font_size = style['font_size'] * 0.75 - field_stream = pydyf.Stream() + field_stream = pydyf.Stream(compress=compress) field_stream.set_color_rgb(*style['color'][:3]) if input_type == 'checkbox': # Checkboxes @@ -129,7 +130,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map): 'Type': '/XObject', 'Subtype': '/Form', 'BBox': pydyf.Array((0, 0, width, height)), - }) + }, compress=compress) checked_stream.push_state() checked_stream.begin_text() checked_stream.set_color_rgb(*style['color'][:3]) @@ -194,7 +195,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map): pdf.catalog['AcroForm']['Fields'].append(field.reference) -def add_annotations(links, matrix, document, pdf, page, annot_files): +def add_annotations(links, matrix, document, pdf, page, annot_files, compress): """Include annotations in PDF.""" # TODO: splitting a link into multiple independent rectangular # annotations works well for pure links, but rather mediocre for @@ -225,8 +226,7 @@ def add_annotations(links, matrix, document, pdf, page, annot_files): 'Type': '/XObject', 'Subtype': 
'/Form', 'BBox': pydyf.Array(rectangle), - 'Length': 0, - }) + }, compress) pdf.add_object(stream) annot = pydyf.Dictionary({ 'Type': '/Annot', @@ -278,7 +278,7 @@ def write_pdf_attachment(pdf, attachment, url_fetcher): 'Size': uncompressed_length, }) }) - file_stream = pydyf.Stream([stream], file_extra) + file_stream = pydyf.Stream([stream], file_extra, compress) pdf.add_object(file_stream) except URLFetchingError as exception: diff --git a/weasyprint/pdf/fonts.py b/weasyprint/pdf/fonts.py index c4461e2bb..9027f0e64 100644 --- a/weasyprint/pdf/fonts.py +++ b/weasyprint/pdf/fonts.py @@ -8,6 +8,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): + compress = 'pdf' in optimize_size pdf_fonts = pydyf.Dictionary() fonts_by_file_hash = {} for font in fonts.values(): @@ -32,7 +33,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): else: font_extra = pydyf.Dictionary({'Length1': len(font.file_content)}) font_stream = pydyf.Stream( - [font.file_content], font_extra, compress=True) + [font.file_content], font_extra, compress=compress) pdf.add_object(font_stream) font_references_by_file_hash[file_hash] = font_stream.reference @@ -80,7 +81,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): b'1 begincodespacerange', b'<0000> ', b'endcodespacerange', - f'{len(cmap)} beginbfchar'.encode()]) + f'{len(cmap)} beginbfchar'.encode()], compress=compress) for glyph, text in cmap.items(): unicode_codepoints = ''.join( f'{letter.encode("utf-16-be").hex()}' for letter in text) @@ -125,7 +126,8 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): for cid in cids: bits[cid] = '1' stream = pydyf.Stream( - (int(''.join(bits), 2).to_bytes(padded_width, 'big'),)) + (int(''.join(bits), 2).to_bytes(padded_width, 'big'),), + compress=compress) pdf.add_object(stream) font_descriptor['CIDSet'] = stream.reference if font.type == 'otf': @@ -156,6 +158,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): def _build_bitmap_font_dictionary(font_dictionary, pdf, font, 
widths, optimize_size): + compress = 'pdf' in optimize_size # https://docs.microsoft.com/typography/opentype/spec/ebdt font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1]) font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0]) @@ -308,7 +311,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths, b'/BPC 1', b'/D [1 0]', b'ID', bitmap, b'EI' - ]) + ], compress=compress) pdf.add_object(bitmap_stream) char_procs[glyph_id] = bitmap_stream.reference diff --git a/weasyprint/pdf/metadata.py b/weasyprint/pdf/metadata.py index 559bbc521..7e3d7ee3d 100644 --- a/weasyprint/pdf/metadata.py +++ b/weasyprint/pdf/metadata.py @@ -20,7 +20,7 @@ register_namespace(key, value) -def add_metadata(pdf, metadata, variant, version, conformance): +def add_metadata(pdf, metadata, variant, version, conformance, compress): """Add PDF stream of metadata. Described in ISO-32000-1:2008, 14.3.2. @@ -88,6 +88,6 @@ def add_metadata(pdf, metadata, variant, version, conformance): footer = b'' stream_content = b'\n'.join((header, xml, footer)) extra = {'Type': '/Metadata', 'Subtype': '/XML'} - metadata = pydyf.Stream([stream_content], extra=extra) + metadata = pydyf.Stream([stream_content], extra, compress) pdf.add_object(metadata) pdf.catalog['Metadata'] = metadata.reference diff --git a/weasyprint/pdf/pdfa.py b/weasyprint/pdf/pdfa.py index 67ee012de..738f7ed57 100644 --- a/weasyprint/pdf/pdfa.py +++ b/weasyprint/pdf/pdfa.py @@ -18,7 +18,7 @@ def read_binary(package, resource): from .metadata import add_metadata -def pdfa(pdf, metadata, document, page_streams, version): +def pdfa(pdf, metadata, document, page_streams, compress, version): """Set metadata for PDF/A documents.""" LOGGER.warning( 'PDF/A support is experimental, ' @@ -29,7 +29,7 @@ def pdfa(pdf, metadata, document, page_streams, version): profile = pydyf.Stream( [read_binary(__package__, 'sRGB2014.icc')], pydyf.Dictionary({'N': 3, 'Alternate': '/DeviceRGB'}), - compress=True) + compress=compress) 
pdf.add_object(profile) pdf.catalog['OutputIntents'] = pydyf.Array([ pydyf.Dictionary({ @@ -46,7 +46,7 @@ def pdfa(pdf, metadata, document, page_streams, version): pdf_object['F'] = 2 ** (3 - 1) # Common PDF metadata stream - add_metadata(pdf, metadata, 'a', version, 'B') + add_metadata(pdf, metadata, 'a', version, 'B', compress) VARIANTS = { diff --git a/weasyprint/pdf/pdfua.py b/weasyprint/pdf/pdfua.py index ba4c71cb4..a40ec85d9 100644 --- a/weasyprint/pdf/pdfua.py +++ b/weasyprint/pdf/pdfua.py @@ -6,7 +6,7 @@ from .metadata import add_metadata -def pdfua(pdf, metadata, document, page_streams): +def pdfua(pdf, metadata, document, page_streams, compress): """Set metadata for PDF/UA documents.""" LOGGER.warning( 'PDF/UA support is experimental, ' @@ -117,7 +117,7 @@ def pdfua(pdf, metadata, document, page_streams): annotation['F'] = 2 ** (2 - 1) # Common PDF metadata stream - add_metadata(pdf, metadata, 'ua', version=1, conformance=None) + add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress) # PDF document extra metadata if 'Lang' not in pdf.catalog: diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index a9950836e..a70764376 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -195,7 +195,6 @@ class Stream(pydyf.Stream): def __init__(self, fonts, page_rectangle, states, x_objects, patterns, shadings, images, mark, *args, **kwargs): super().__init__(*args, **kwargs) - self.compress = True self.page_rectangle = page_rectangle self.marked = [] self._fonts = fonts @@ -356,7 +355,8 @@ def add_group(self, x, y, width, height): }) group = Stream( self._fonts, self.page_rectangle, states, x_objects, patterns, - shadings, self._images, self._mark, extra=extra) + shadings, self._images, self._mark, extra=extra, + compress=self.compress) group.id = f'x{len(self._x_objects)}' self._x_objects[group.id] = group return group @@ -407,7 +407,8 @@ def add_pattern(self, x, y, width, height, repeat_width, repeat_height, }) 
pattern = Stream( self._fonts, self.page_rectangle, states, x_objects, patterns, - shadings, self._images, self._mark, extra=extra) + shadings, self._images, self._mark, extra=extra, + compress=self.compress) pattern.id = f'p{len(self._patterns)}' self._patterns[pattern.id] = pattern return pattern From c5585c62c5e1d3495eeafcc91e5392a3b14df61b Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 21 Mar 2023 16:01:33 +0100 Subject: [PATCH 04/20] Use new pydyf operators for text --- weasyprint/pdf/anchors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index f907475a0..86e1aca2e 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -138,9 +138,8 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map, # Center (let’s assume that Dingbat’s check has a 0.8em size) x = (width - font_size * 0.8) / 2 y = (height - font_size * 0.8) / 2 - # TODO: we should have these operators in pydyf - checked_stream.stream.append(f'{x} {y} Td') - checked_stream.stream.append('(4) Tj') + checked_stream.move_text_to(x, y) + checked_stream.show_text_string('4') checked_stream.end_text() checked_stream.pop_state() pdf.add_object(checked_stream) From 5cdd751af101d812cc4a024f20d6df5f7335a7d4 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 21 Mar 2023 16:10:45 +0100 Subject: [PATCH 05/20] Test new default compression values --- tests/test_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_api.py b/tests/test_api.py index b8f23d206..9d385eb76 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -368,6 +368,8 @@ def test_command_line_render(tmpdir): _run('not_optimized.html out20.pdf -O none') _run('not_optimized.html out21.pdf -O none -O all') _run('not_optimized.html out22.pdf -O all -O none') + _run('not_optimized.html out23.pdf -O pdf') + _run('not_optimized.html out24.pdf -O none -O fonts -O pdf') assert ( 
len(tmpdir.join('out16.pdf').read_binary()) < len(tmpdir.join('out15.pdf').read_binary()) < @@ -377,7 +379,7 @@ def test_command_line_render(tmpdir): for i in (16, 18, 19, 21)}) == 1 assert len({ tmpdir.join(f'out{i}.pdf').read_binary() - for i in (15, 17)}) == 1 + for i in (15, 17, 23, 24)}) == 1 assert len({ tmpdir.join(f'out{i}.pdf').read_binary() for i in (20, 22)}) == 1 From 51971f329303b568d2bc93bceecc4f9596c5f376 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 21 Mar 2023 16:17:47 +0100 Subject: [PATCH 06/20] Fix tests for attachments --- tests/test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 9d385eb76..4aafa00c6 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -385,9 +385,9 @@ def test_command_line_render(tmpdir): for i in (20, 22)}) == 1 os.environ.pop('SOURCE_DATE_EPOCH') - stdout = _run('combined.html -') + stdout = _run('-O none combined.html -') assert stdout.count(b'attachment') == 0 - stdout = _run('combined.html -') + stdout = _run('-O none combined.html -') assert stdout.count(b'attachment') == 0 stdout = _run('-O none -a pattern.png combined.html -') assert stdout.count(b'attachment') == 1 From eb6491f89576f6d731bfa8572a4c006a2de9574d Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 26 Mar 2023 13:37:15 +0200 Subject: [PATCH 07/20] Add API controlling JPEG quality --- tests/test_api.py | 2 ++ tests/testing_utils.py | 13 +++++++------ weasyprint/__init__.py | 19 +++++++++++-------- weasyprint/__main__.py | 8 ++++++++ weasyprint/document.py | 11 ++++++----- weasyprint/draw.py | 3 +-- weasyprint/images.py | 28 +++++++++++++++++++--------- weasyprint/pdf/stream.py | 2 +- 8 files changed, 55 insertions(+), 31 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 4aafa00c6..c7dac77cf 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -370,7 +370,9 @@ def test_command_line_render(tmpdir): _run('not_optimized.html out22.pdf -O 
all -O none') _run('not_optimized.html out23.pdf -O pdf') _run('not_optimized.html out24.pdf -O none -O fonts -O pdf') + _run('not_optimized.html out25.pdf -O all -j 10') assert ( + len(tmpdir.join('out25.pdf').read_binary()) < len(tmpdir.join('out16.pdf').read_binary()) < len(tmpdir.join('out15.pdf').read_binary()) < len(tmpdir.join('out20.pdf').read_binary())) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 20f143e26..60050ef19 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -55,15 +55,16 @@ def _ua_stylesheets(self, forms=False): def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts',), font_config=None, - counter_style=None, image_cache=None, identifier=None, - variant=None, version=None, forms=False, + optimize_size=('fonts',), jpeg_quality=None, + font_config=None, counter_style=None, image_cache=None, + identifier=None, variant=None, version=None, forms=False, custom_metadata=False): # Override function to set PDF size optimization to False by default return super().write_pdf( target, stylesheets, zoom, attachments, finisher, - presentational_hints, optimize_size, font_config, counter_style, - image_cache, identifier, variant, version, forms, custom_metadata) + presentational_hints, optimize_size, jpeg_quality, font_config, + counter_style, image_cache, identifier, variant, version, forms, + custom_metadata) def resource_filename(basename): @@ -194,7 +195,7 @@ def _parse_base(html_content, base_url=BASE_URL): style_for = get_all_computed_styles(document, counter_style=counter_style) get_image_from_uri = functools.partial( images.get_image_from_uri, cache={}, url_fetcher=document.url_fetcher, - optimize_size=()) + optimize_size=(), jpeg_quality=None) target_collector = TargetCollector() footnotes = [] return ( diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 915d58533..8125e78fb 100644 --- a/weasyprint/__init__.py 
+++ b/weasyprint/__init__.py @@ -118,8 +118,9 @@ def _ph_stylesheets(self): return [HTML5_PH_STYLESHEET] def render(self, stylesheets=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), font_config=None, - counter_style=None, image_cache=None, forms=False): + optimize_size=('fonts', 'pdf'), jpeg_quality=None, + font_config=None, counter_style=None, image_cache=None, + forms=False): """Lay out and paginate the document, but do not (yet) export it. This returns a :class:`document.Document` object which provides @@ -135,6 +136,7 @@ def render(self, stylesheets=None, presentational_hints=False, :param tuple optimize_size: Optimize size of generated PDF. Can contain "images", "fonts" and "pdf". + :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -150,13 +152,13 @@ def render(self, stylesheets=None, presentational_hints=False, """ return Document._render( self, stylesheets, presentational_hints, optimize_size, - font_config, counter_style, image_cache, forms) + jpeg_quality, font_config, counter_style, image_cache, forms) def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), font_config=None, - counter_style=None, image_cache=None, identifier=None, - variant=None, version=None, forms=False, + optimize_size=('fonts', 'pdf'), jpeg_quality=None, + font_config=None, counter_style=None, image_cache=None, + identifier=None, variant=None, version=None, forms=False, custom_metadata=False): """Render the document to a PDF file. @@ -188,6 +190,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param tuple optimize_size: Optimize size of generated PDF. Can contain "images", "fonts" and "pdf". 
+ :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -211,8 +214,8 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, """ return ( self.render( - stylesheets, presentational_hints, optimize_size, font_config, - counter_style, image_cache, forms) + stylesheets, presentational_hints, optimize_size, jpeg_quality, + font_config, counter_style, image_cache, forms) .write_pdf( target, zoom, attachments, finisher, identifier, variant, version, custom_metadata)) diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index e5bbf69f8..5e4aaf514 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -100,6 +100,10 @@ def main(argv=None, stdout=None, stdin=None): Store cache on disk instead of memory. The ``folder`` is created if needed and cleaned after the PDF is generated. + .. option:: -j , --jpeg-quality + + JPEG quality between 0 (worst) to 95 (best). + .. option:: -v, --verbose Show warnings and information messages. @@ -167,6 +171,9 @@ def main(argv=None, stdout=None, stdin=None): '-c', '--cache-folder', help='Store cache on disk instead of memory. 
The ``folder`` is ' 'created if needed and cleaned after the PDF is generated.') + parser.add_argument( + '-j', '--jpeg-quality', type=int, + help='JPEG quality between 0 (worst) to 95 (best)') parser.add_argument( '-v', '--verbose', action='store_true', help='show warnings and information messages') @@ -208,6 +215,7 @@ def main(argv=None, stdout=None, stdin=None): 'stylesheets': args.stylesheet, 'presentational_hints': args.presentational_hints, 'optimize_size': tuple(optimize_size), + 'jpeg_quality': args.jpeg_quality, 'attachments': args.attachment, 'identifier': args.pdf_identifier, 'variant': args.pdf_variant, diff --git a/weasyprint/document.py b/weasyprint/document.py index b213c0c0e..55589b7c3 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -219,8 +219,8 @@ class Document: @classmethod def _build_layout_context(cls, html, stylesheets, presentational_hints, - optimize_size, font_config, counter_style, - image_cache, forms): + optimize_size, jpeg_quality, font_config, + counter_style, image_cache, forms): if font_config is None: font_config = FontConfiguration() if counter_style is None: @@ -243,7 +243,8 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, counter_style, page_rules, target_collector, forms) get_image_from_uri = functools.partial( original_get_image_from_uri, cache=image_cache, - url_fetcher=html.url_fetcher, optimize_size=optimize_size) + url_fetcher=html.url_fetcher, optimize_size=optimize_size, + jpeg_quality=jpeg_quality) PROGRESS_LOGGER.info('Step 4 - Creating formatting structure') context = LayoutContext( style_for, get_image_from_uri, font_config, counter_style, @@ -252,7 +253,7 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, @classmethod def _render(cls, html, stylesheets, presentational_hints, optimize_size, - font_config, counter_style, image_cache, forms): + jpeg_quality, font_config, counter_style, image_cache, forms): if font_config is None: font_config = 
FontConfiguration() @@ -261,7 +262,7 @@ def _render(cls, html, stylesheets, presentational_hints, optimize_size, context = cls._build_layout_context( html, stylesheets, presentational_hints, optimize_size, - font_config, counter_style, image_cache, forms) + jpeg_quality, font_config, counter_style, image_cache, forms) root_box = build_formatting_structure( html.etree_element, context.style_for, context.get_image_from_uri, diff --git a/weasyprint/draw.py b/weasyprint/draw.py index 0eafb776d..b25185041 100644 --- a/weasyprint/draw.py +++ b/weasyprint/draw.py @@ -1202,8 +1202,7 @@ def draw_first_line(stream, textbox, text_overflow, block_ellipsis, x, y, png_data = ffi.unpack(hb_data, int(stream.length[0])) pillow_image = Image.open(BytesIO(png_data)) image_id = f'{font.hash}{glyph}' - image = RasterImage( - pillow_image, image_id, optimize_size=(), cache={}) + image = RasterImage(pillow_image, image_id) d = font.widths[glyph] / 1000 a = pillow_image.width / pillow_image.height * d pango.pango_font_get_glyph_extents( diff --git a/weasyprint/images.py b/weasyprint/images.py index 9e731ea18..d64f68fe2 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -36,9 +36,17 @@ def from_exception(cls, exception): class RasterImage: - def __init__(self, pillow_image, image_id, optimize_size, cache): + def __init__(self, pillow_image, image_id, cache=None, optimize_size=(), + jpeg_quality=None): self.id = image_id - self._cache = cache + self._cache = {} if cache is None else cache + self._optimize_size = optimize_size + self._jpeg_quality = jpeg_quality + self._intrinsic_width = pillow_image.width + self._intrinsic_height = pillow_image.height + self._intrinsic_ratio = ( + self._intrinsic_width / self._intrinsic_height + if self._intrinsic_height != 0 else inf) if 'transparency' in pillow_image.info: pillow_image = pillow_image.convert('RGBA') @@ -71,7 +79,10 @@ def __init__(self, pillow_image, image_id, optimize_size, cache): if pillow_image.format in ('JPEG', 'MPO'): 
self.extra['Filter'] = '/DCTDecode' image_file = io.BytesIO() - pillow_image.save(image_file, format='JPEG', optimize=optimize) + options = {'format': 'JPEG', 'optimize': optimize} + if jpeg_quality is not None: + options['quality'] = jpeg_quality + pillow_image.save(image_file, **options) self.stream = self.get_stream(image_file.getvalue()) else: self.extra['Filter'] = '/FlateDecode' @@ -116,7 +127,6 @@ def get_intrinsic_size(self, resolution, font_size): def draw(self, stream, concrete_width, concrete_height, image_rendering): if self.width <= 0 or self.height <= 0: return - image_name = stream.add_image(self, image_rendering) stream.transform( concrete_width, 0, 0, -concrete_height, 0, concrete_height) @@ -138,12 +148,12 @@ def _get_png_data(pillow_image, optimize): # Each chunk begins with its data length (four bytes, may be zero), # then its type (four ASCII characters), then the data, then four # bytes of a CRC. - chunk_len, = struct.unpack('!I', raw_chunk_length) + chunk_length, = struct.unpack('!I', raw_chunk_length) chunk_type = image_file.read(4) if chunk_type == b'IDAT': - png_data.append(image_file.read(chunk_len)) + png_data.append(image_file.read(chunk_length)) else: - image_file.seek(chunk_len, io.SEEK_CUR) + image_file.seek(chunk_length, io.SEEK_CUR) # We aren't checking the CRC, we assume this is a valid PNG. 
image_file.seek(4, io.SEEK_CUR) raw_chunk_length = image_file.read(4) @@ -198,7 +208,7 @@ def draw(self, stream, concrete_width, concrete_height, image_rendering): self._url_fetcher, self._context) -def get_image_from_uri(cache, url_fetcher, optimize_size, url, +def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, forced_mime_type=None, context=None, orientation='from-image'): """Get an Image instance from an image URI.""" @@ -242,7 +252,7 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url, image_id = md5(url.encode()).hexdigest() pillow_image = rotate_pillow_image(pillow_image, orientation) image = RasterImage( - pillow_image, image_id, optimize_size, cache) + pillow_image, image_id, cache, optimize_size, jpeg_quality) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index a70764376..2373b6e8d 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -377,7 +377,7 @@ def add_image(self, image, image_rendering): extra['SMask'].compress) extra['SMask'].extra['Interpolate'] = interpolate - xobject = pydyf.Stream(image.stream, extra=extra) + xobject = pydyf.Stream(image.stream, extra) self._images[image_name] = xobject return image_name From 343c51b30a98678936835f65a7501a3f4e7f901f Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 28 Mar 2023 18:40:37 +0200 Subject: [PATCH 08/20] Keep original image when no modification is required --- weasyprint/draw.py | 2 +- weasyprint/images.py | 46 ++++++++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/weasyprint/draw.py b/weasyprint/draw.py index b25185041..128dd7f15 100644 --- a/weasyprint/draw.py +++ b/weasyprint/draw.py @@ -1202,7 +1202,7 @@ def draw_first_line(stream, textbox, text_overflow, block_ellipsis, x, y, png_data = ffi.unpack(hb_data, int(stream.length[0])) 
pillow_image = Image.open(BytesIO(png_data)) image_id = f'{font.hash}{glyph}' - image = RasterImage(pillow_image, image_id) + image = RasterImage(pillow_image, image_id, png_data) d = font.widths[glyph] / 1000 a = pillow_image.width / pillow_image.height * d pango.pango_font_get_glyph_extents( diff --git a/weasyprint/images.py b/weasyprint/images.py index d64f68fe2..005dc65b7 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -36,8 +36,17 @@ def from_exception(cls, exception): class RasterImage: - def __init__(self, pillow_image, image_id, cache=None, optimize_size=(), - jpeg_quality=None): + def __init__(self, pillow_image, image_id, image_data, cache=None, + optimize_size=(), jpeg_quality=None, orientation='none'): + # Transpose image + original_pillow_image = pillow_image + pillow_image = rotate_pillow_image(pillow_image, orientation) + if original_pillow_image is not pillow_image: + # Keep image format as it is discarded by transposition + pillow_image.format = original_pillow_image.format + # Discard original data, as the image has been transformed + image_data = None + self.id = image_id self._cache = {} if cache is None else cache self._optimize_size = optimize_size @@ -78,12 +87,14 @@ def __init__(self, pillow_image, image_id, cache=None, optimize_size=(), optimize = 'images' in optimize_size if pillow_image.format in ('JPEG', 'MPO'): self.extra['Filter'] = '/DCTDecode' - image_file = io.BytesIO() - options = {'format': 'JPEG', 'optimize': optimize} - if jpeg_quality is not None: - options['quality'] = jpeg_quality - pillow_image.save(image_file, **options) - self.stream = self.get_stream(image_file.getvalue()) + if image_data is None or optimize or jpeg_quality is not None: + image_file = io.BytesIO() + options = {'format': 'JPEG', 'optimize': optimize} + if jpeg_quality is not None: + options['quality'] = jpeg_quality + pillow_image.save(image_file, **options) + image_data = image_file.getvalue() + self.stream = self.get_stream(image_data) 
else: self.extra['Filter'] = '/FlateDecode' self.extra['DecodeParms'] = pydyf.Dictionary({ @@ -100,8 +111,11 @@ def __init__(self, pillow_image, image_id, cache=None, optimize_size=(), # Defaults to 1. self.extra['DecodeParms']['Colors'] = 3 if pillow_image.mode in ('RGBA', 'LA'): + # Remove alpha channel from image and discard original data alpha = pillow_image.getchannel('A') pillow_image = pillow_image.convert(pillow_image.mode[:-1]) + image_data = None + # Save alpha channel as mask alpha_data = self._get_png_data(alpha, optimize) stream = self.get_stream(alpha_data, alpha=True) self.extra['SMask'] = pydyf.Stream(stream, extra={ @@ -118,7 +132,7 @@ def __init__(self, pillow_image, image_id, cache=None, optimize_size=(), 'BitsPerComponent': 8, }) - png_data = self._get_png_data(pillow_image, optimize) + png_data = self._get_png_data(pillow_image, optimize, image_data) self.stream = self.get_stream(png_data) def get_intrinsic_size(self, resolution, font_size): @@ -133,9 +147,13 @@ def draw(self, stream, concrete_width, concrete_height, image_rendering): stream.draw_x_object(image_name) @staticmethod - def _get_png_data(pillow_image, optimize): - image_file = io.BytesIO() - pillow_image.save(image_file, format='PNG', optimize=optimize) + def _get_png_data(pillow_image, optimize, image_data=None): + format = pillow_image.format + if image_data is not None and format == 'PNG' and not optimize: + image_file = io.BytesIO(image_data) + else: + image_file = io.BytesIO() + pillow_image.save(image_file, format='PNG', optimize=optimize) # Read the PNG header, then discard it because we know it's a PNG. If # this weren't just output from Pillow, we should actually check it. 
@@ -250,9 +268,9 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, else: # Store image id to enable cache in Stream.add_image image_id = md5(url.encode()).hexdigest() - pillow_image = rotate_pillow_image(pillow_image, orientation) image = RasterImage( - pillow_image, image_id, cache, optimize_size, jpeg_quality) + pillow_image, image_id, string, cache, optimize_size, + jpeg_quality, orientation) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) From 82c077b457ceade370395c6d5e29e2fcadbeffd8 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 29 Mar 2023 13:53:26 +0200 Subject: [PATCH 09/20] =?UTF-8?q?Don=E2=80=99t=20copy=20unmodified=20JPEG?= =?UTF-8?q?=20images?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weasyprint/images.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/weasyprint/images.py b/weasyprint/images.py index 005dc65b7..307090e3d 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -7,6 +7,8 @@ from io import BytesIO from itertools import cycle from math import inf +from pathlib import Path +from urllib.parse import urlparse from xml.etree import ElementTree import pydyf @@ -36,8 +38,9 @@ def from_exception(cls, exception): class RasterImage: - def __init__(self, pillow_image, image_id, image_data, cache=None, - optimize_size=(), jpeg_quality=None, orientation='none'): + def __init__(self, pillow_image, image_id, image_data, filename=None, + cache=None, optimize_size=(), jpeg_quality=None, + orientation='none'): # Transpose image original_pillow_image = pillow_image pillow_image = rotate_pillow_image(pillow_image, orientation) @@ -45,7 +48,7 @@ def __init__(self, pillow_image, image_id, image_data, cache=None, # Keep image format as it is discarded by transposition pillow_image.format = original_pillow_image.format # Discard 
original data, as the image has been transformed - image_data = None + image_data = filename = None self.id = image_id self._cache = {} if cache is None else cache @@ -94,7 +97,8 @@ def __init__(self, pillow_image, image_id, image_data, cache=None, options['quality'] = jpeg_quality pillow_image.save(image_file, **options) image_data = image_file.getvalue() - self.stream = self.get_stream(image_data) + filename = None + self.stream = self.get_stream(image_data, filename) else: self.extra['Filter'] = '/FlateDecode' self.extra['DecodeParms'] = pydyf.Dictionary({ @@ -178,9 +182,12 @@ def _get_png_data(pillow_image, optimize, image_data=None): return b''.join(png_data) - def get_stream(self, data, alpha=False): - key = f'{self.id}{int(alpha)}' - return [LazyImage(self._cache, key, data)] + def get_stream(self, data, filename=None, alpha=False): + if filename: + return [LazyLocalImage(filename)] + else: + key = f'{self.id}{int(alpha)}' + return [LazyImage(self._cache, key, data)] class LazyImage(pydyf.Object): @@ -195,6 +202,16 @@ def data(self): return self._cache[self._key] +class LazyLocalImage(pydyf.Object): + def __init__(self, filename): + super().__init__() + self._filename = filename + + @property + def data(self): + return Path(self._filename).read_bytes() + + class SVGImage: def __init__(self, tree, base_url, url_fetcher, context): self._svg = SVG(tree, base_url) @@ -235,6 +252,8 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, try: with fetch(url_fetcher, url) as result: + parsed_url = urlparse(result.get('redirected_url')) + filename = parsed_url.path if parsed_url.scheme == 'file' else None if 'string' in result: string = result['string'] else: @@ -269,8 +288,8 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, # Store image id to enable cache in Stream.add_image image_id = md5(url.encode()).hexdigest() image = RasterImage( - pillow_image, image_id, string, cache, optimize_size, - jpeg_quality, 
orientation) + pillow_image, image_id, string, filename, cache, + optimize_size, jpeg_quality, orientation) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) From 019922584fc81b0b76754d4807089a4df28a7139 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 29 Mar 2023 21:06:17 +0200 Subject: [PATCH 10/20] Allow users to keep using a dictionary to store cache in memory --- weasyprint/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index 55589b7c3..8901c7b52 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -230,7 +230,7 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, user_stylesheets = [] if image_cache is None: image_cache = {} - elif not isinstance(image_cache, DiskCache): + elif not isinstance(image_cache, (dict, DiskCache)): image_cache = DiskCache(image_cache) for css in stylesheets or []: if not hasattr(css, 'matcher'): From 75c0e44e2c5806c8a1688be8b7bf2c4cadccb4d2 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 29 Mar 2023 21:06:50 +0200 Subject: [PATCH 11/20] Add documentation for cache on disk --- docs/first_steps.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/first_steps.rst b/docs/first_steps.rst index 2d2c1a485..9b3bbcc63 100644 --- a/docs/first_steps.rst +++ b/docs/first_steps.rst @@ -540,6 +540,11 @@ time when you render a lot of documents that use the same images. HTML(f'https://example.org/?id={i}').write_pdf( f'example-{i}.pdf', image_cache=cache) +It’s also possible to cache images on disk instead of keeping them in memory. +The ``--cache-folder`` CLI option can be used to define the folder used to +store temporary images. You can also provide this folder path as a string for +``image_cache``. 
+ Logging ~~~~~~~ From 3a745c5be9f8e43954334b79bea34142ea9d95cf Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 2 Apr 2023 09:57:11 +0200 Subject: [PATCH 12/20] Add the --dpi option --- tests/testing_utils.py | 10 +- weasyprint/__init__.py | 10 +- weasyprint/__main__.py | 4 + weasyprint/document.py | 9 +- weasyprint/images.py | 204 +++++++++++++++++++++++---------------- weasyprint/pdf/stream.py | 15 +-- 6 files changed, 143 insertions(+), 109 deletions(-) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 60050ef19..ba618cb9d 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -55,16 +55,16 @@ def _ua_stylesheets(self, forms=False): def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts',), jpeg_quality=None, + optimize_size=('fonts',), jpeg_quality=None, dpi=None, font_config=None, counter_style=None, image_cache=None, identifier=None, variant=None, version=None, forms=False, custom_metadata=False): # Override function to set PDF size optimization to False by default return super().write_pdf( target, stylesheets, zoom, attachments, finisher, - presentational_hints, optimize_size, jpeg_quality, font_config, - counter_style, image_cache, identifier, variant, version, forms, - custom_metadata) + presentational_hints, optimize_size, jpeg_quality, dpi, + font_config, counter_style, image_cache, identifier, variant, + version, forms, custom_metadata) def resource_filename(basename): @@ -195,7 +195,7 @@ def _parse_base(html_content, base_url=BASE_URL): style_for = get_all_computed_styles(document, counter_style=counter_style) get_image_from_uri = functools.partial( images.get_image_from_uri, cache={}, url_fetcher=document.url_fetcher, - optimize_size=(), jpeg_quality=None) + optimize_size=(), jpeg_quality=None, dpi=None) target_collector = TargetCollector() footnotes = [] return ( diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py 
index 8125e78fb..efbccf214 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -118,7 +118,7 @@ def _ph_stylesheets(self): return [HTML5_PH_STYLESHEET] def render(self, stylesheets=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), jpeg_quality=None, + optimize_size=('fonts', 'pdf'), jpeg_quality=None, dpi=None, font_config=None, counter_style=None, image_cache=None, forms=False): """Lay out and paginate the document, but do not (yet) export it. @@ -137,6 +137,7 @@ def render(self, stylesheets=None, presentational_hints=False, Optimize size of generated PDF. Can contain "images", "fonts" and "pdf". :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). + :param int dpi: Maximum resolution of images embedded in the PDF. :type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -152,11 +153,11 @@ def render(self, stylesheets=None, presentational_hints=False, """ return Document._render( self, stylesheets, presentational_hints, optimize_size, - jpeg_quality, font_config, counter_style, image_cache, forms) + jpeg_quality, dpi, font_config, counter_style, image_cache, forms) def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), jpeg_quality=None, + optimize_size=('fonts', 'pdf'), jpeg_quality=None, dpi=None, font_config=None, counter_style=None, image_cache=None, identifier=None, variant=None, version=None, forms=False, custom_metadata=False): @@ -191,6 +192,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, Optimize size of generated PDF. Can contain "images", "fonts" and "pdf". :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). + :param int dpi: Maximum resolution of images embedded in the PDF. 
:type font_config: :class:`text.fonts.FontConfiguration` :param font_config: A font configuration handling ``@font-face`` rules. :type counter_style: :class:`css.counters.CounterStyle` @@ -215,7 +217,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, return ( self.render( stylesheets, presentational_hints, optimize_size, jpeg_quality, - font_config, counter_style, image_cache, forms) + dpi, font_config, counter_style, image_cache, forms) .write_pdf( target, zoom, attachments, finisher, identifier, variant, version, custom_metadata)) diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index 5e4aaf514..a1cb392a9 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -174,6 +174,9 @@ def main(argv=None, stdout=None, stdin=None): parser.add_argument( '-j', '--jpeg-quality', type=int, help='JPEG quality between 0 (worst) to 95 (best)') + parser.add_argument( + '-D', '--dpi', type=int, + help='Maximum resolution of images embedded in the PDF') parser.add_argument( '-v', '--verbose', action='store_true', help='show warnings and information messages') @@ -216,6 +219,7 @@ def main(argv=None, stdout=None, stdin=None): 'presentational_hints': args.presentational_hints, 'optimize_size': tuple(optimize_size), 'jpeg_quality': args.jpeg_quality, + 'dpi': args.dpi, 'attachments': args.attachment, 'identifier': args.pdf_identifier, 'variant': args.pdf_variant, diff --git a/weasyprint/document.py b/weasyprint/document.py index 8901c7b52..74f78c74a 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -219,7 +219,7 @@ class Document: @classmethod def _build_layout_context(cls, html, stylesheets, presentational_hints, - optimize_size, jpeg_quality, font_config, + optimize_size, jpeg_quality, dpi, font_config, counter_style, image_cache, forms): if font_config is None: font_config = FontConfiguration() @@ -244,7 +244,7 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, get_image_from_uri = functools.partial( 
original_get_image_from_uri, cache=image_cache, url_fetcher=html.url_fetcher, optimize_size=optimize_size, - jpeg_quality=jpeg_quality) + jpeg_quality=jpeg_quality, dpi=dpi) PROGRESS_LOGGER.info('Step 4 - Creating formatting structure') context = LayoutContext( style_for, get_image_from_uri, font_config, counter_style, @@ -253,7 +253,8 @@ def _build_layout_context(cls, html, stylesheets, presentational_hints, @classmethod def _render(cls, html, stylesheets, presentational_hints, optimize_size, - jpeg_quality, font_config, counter_style, image_cache, forms): + jpeg_quality, dpi, font_config, counter_style, image_cache, + forms): if font_config is None: font_config = FontConfiguration() @@ -262,7 +263,7 @@ def _render(cls, html, stylesheets, presentational_hints, optimize_size, context = cls._build_layout_context( html, stylesheets, presentational_hints, optimize_size, - jpeg_quality, font_config, counter_style, image_cache, forms) + jpeg_quality, dpi, font_config, counter_style, image_cache, forms) root_box = build_formatting_structure( html.etree_element, context.style_for, context.get_image_from_uri, diff --git a/weasyprint/images.py b/weasyprint/images.py index 307090e3d..d95faf2fa 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -8,7 +8,7 @@ from itertools import cycle from math import inf from pathlib import Path -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from xml.etree import ElementTree import pydyf @@ -39,7 +39,7 @@ def from_exception(cls, exception): class RasterImage: def __init__(self, pillow_image, image_id, image_data, filename=None, - cache=None, optimize_size=(), jpeg_quality=None, + cache=None, optimize_size=(), jpeg_quality=None, dpi=None, orientation='none'): # Transpose image original_pillow_image = pillow_image @@ -54,42 +54,21 @@ def __init__(self, pillow_image, image_id, image_data, filename=None, self._cache = {} if cache is None else cache self._optimize_size = optimize_size 
self._jpeg_quality = jpeg_quality - self._intrinsic_width = pillow_image.width - self._intrinsic_height = pillow_image.height - self._intrinsic_ratio = ( - self._intrinsic_width / self._intrinsic_height - if self._intrinsic_height != 0 else inf) + self._dpi = dpi if 'transparency' in pillow_image.info: pillow_image = pillow_image.convert('RGBA') elif pillow_image.mode in ('1', 'P', 'I'): pillow_image = pillow_image.convert('RGB') + self.mode = pillow_image.mode self.width = pillow_image.width self.height = pillow_image.height self.ratio = (self.width / self.height) if self.height != 0 else inf - if pillow_image.mode in ('RGB', 'RGBA'): - color_space = '/DeviceRGB' - elif pillow_image.mode in ('L', 'LA'): - color_space = '/DeviceGray' - elif pillow_image.mode == 'CMYK': - color_space = '/DeviceCMYK' - else: - LOGGER.warning('Unknown image mode: %s', pillow_image.mode) - color_space = '/DeviceRGB' - - self.extra = pydyf.Dictionary({ - 'Type': '/XObject', - 'Subtype': '/Image', - 'Width': self.width, - 'Height': self.height, - 'ColorSpace': color_space, - 'BitsPerComponent': 8, - }) optimize = 'images' in optimize_size if pillow_image.format in ('JPEG', 'MPO'): - self.extra['Filter'] = '/DCTDecode' + self.format = 'JPEG' if image_data is None or optimize or jpeg_quality is not None: image_file = io.BytesIO() options = {'format': 'JPEG', 'optimize': optimize} @@ -98,46 +77,14 @@ def __init__(self, pillow_image, image_id, image_data, filename=None, pillow_image.save(image_file, **options) image_data = image_file.getvalue() filename = None - self.stream = self.get_stream(image_data, filename) else: - self.extra['Filter'] = '/FlateDecode' - self.extra['DecodeParms'] = pydyf.Dictionary({ - # Predictor 15 specifies that we're providing PNG data, - # ostensibly using an "optimum predictor", but doesn't actually - # matter as long as the predictor value is 10+ according to the - # spec. 
(Other PNG predictor values assert that we're using - # specific predictors that we don't want to commit to, but - # "optimum" can vary.) - 'Predictor': 15, - 'Columns': self.width, - }) - if pillow_image.mode in ('RGB', 'RGBA'): - # Defaults to 1. - self.extra['DecodeParms']['Colors'] = 3 - if pillow_image.mode in ('RGBA', 'LA'): - # Remove alpha channel from image and discard original data - alpha = pillow_image.getchannel('A') - pillow_image = pillow_image.convert(pillow_image.mode[:-1]) - image_data = None - # Save alpha channel as mask - alpha_data = self._get_png_data(alpha, optimize) - stream = self.get_stream(alpha_data, alpha=True) - self.extra['SMask'] = pydyf.Stream(stream, extra={ - 'Filter': '/FlateDecode', - 'Type': '/XObject', - 'Subtype': '/Image', - 'DecodeParms': pydyf.Dictionary({ - 'Predictor': 15, - 'Columns': pillow_image.width, - }), - 'Width': pillow_image.width, - 'Height': pillow_image.height, - 'ColorSpace': '/DeviceGray', - 'BitsPerComponent': 8, - }) - - png_data = self._get_png_data(pillow_image, optimize, image_data) - self.stream = self.get_stream(png_data) + self.format = 'PNG' + if image_data is None or optimize or pillow_image.format != 'PNG': + image_file = io.BytesIO() + pillow_image.save(image_file, format='PNG', optimize=optimize) + image_data = image_file.getvalue() + filename = None + self.image_data = self.cache_image_data(image_data, filename) def get_intrinsic_size(self, resolution, font_size): return self.width / resolution, self.height / resolution, self.ratio @@ -145,19 +92,112 @@ def get_intrinsic_size(self, resolution, font_size): def draw(self, stream, concrete_width, concrete_height, image_rendering): if self.width <= 0 or self.height <= 0: return - image_name = stream.add_image(self, image_rendering) + + width, height = self.width, self.height + if self._dpi: + pt_to_in = 4 / 3 / 96 + width_inches = abs(concrete_width * stream.ctm[0][0] * pt_to_in) + height_inches = abs(concrete_height * stream.ctm[1][1] * 
pt_to_in) + dpi = max(self.width / width_inches, self.height / height_inches) + if dpi > self._dpi: + ratio = self._dpi / dpi + image = Image.open(io.BytesIO(self.image_data.data)) + width = int(round(self.width * ratio)) + height = int(round(self.height * ratio)) + image.thumbnail((width, height)) + image_file = io.BytesIO() + image.save(image_file, format=image.format) + width, height = image.width, image.height + self.image_data = self.cache_image_data(image_file.getvalue()) + else: + dpi = None + + interpolate = 'true' if image_rendering == 'auto' else 'false' + + image_name = stream.add_image(self, width, height, interpolate) stream.transform( concrete_width, 0, 0, -concrete_height, 0, concrete_height) stream.draw_x_object(image_name) - @staticmethod - def _get_png_data(pillow_image, optimize, image_data=None): - format = pillow_image.format - if image_data is not None and format == 'PNG' and not optimize: - image_file = io.BytesIO(image_data) + def cache_image_data(self, data, filename=None, alpha=False): + if filename: + return LazyLocalImage(filename) + else: + key = f'{self.id}{int(alpha)}{self._dpi or ""}' + return LazyImage(self._cache, key, data) + + def get_xobject(self, width, height, interpolate): + if self.mode in ('RGB', 'RGBA'): + color_space = '/DeviceRGB' + elif self.mode in ('L', 'LA'): + color_space = '/DeviceGray' + elif self.mode == 'CMYK': + color_space = '/DeviceCMYK' else: - image_file = io.BytesIO() - pillow_image.save(image_file, format='PNG', optimize=optimize) + LOGGER.warning('Unknown image mode: %s', self.mode) + color_space = '/DeviceRGB' + + extra = pydyf.Dictionary({ + 'Type': '/XObject', + 'Subtype': '/Image', + 'Width': width, + 'Height': height, + 'ColorSpace': color_space, + 'BitsPerComponent': 8, + 'Interpolate': interpolate, + }) + + if self.format == 'JPEG': + extra['Filter'] = '/DCTDecode' + return pydyf.Stream([self.image_data], extra) + + extra['Filter'] = '/FlateDecode' + extra['DecodeParms'] = pydyf.Dictionary({ + # 
Predictor 15 specifies that we're providing PNG data, + # ostensibly using an "optimum predictor", but doesn't actually + # matter as long as the predictor value is 10+ according to the + # spec. (Other PNG predictor values assert that we're using + # specific predictors that we don't want to commit to, but + # "optimum" can vary.) + 'Predictor': 15, + 'Columns': width, + }) + if self.mode in ('RGB', 'RGBA'): + # Defaults to 1. + extra['DecodeParms']['Colors'] = 3 + if self.mode in ('RGBA', 'LA'): + # Remove alpha channel from image + pillow_image = Image.open(io.BytesIO(self.image_data.data)) + alpha = pillow_image.getchannel('A') + pillow_image = pillow_image.convert(self.mode[:-1]) + png_data = self._get_png_data(pillow_image) + # Save alpha channel as mask + alpha_data = self._get_png_data(alpha) + stream = self.cache_image_data(alpha_data, alpha=True) + extra['SMask'] = pydyf.Stream([stream], extra={ + 'Filter': '/FlateDecode', + 'Type': '/XObject', + 'Subtype': '/Image', + 'DecodeParms': pydyf.Dictionary({ + 'Predictor': 15, + 'Columns': width, + }), + 'Width': width, + 'Height': height, + 'ColorSpace': '/DeviceGray', + 'BitsPerComponent': 8, + 'Interpolate': interpolate, + }) + else: + png_data = self._get_png_data( + Image.open(io.BytesIO(self.image_data.data))) + + return pydyf.Stream([self.cache_image_data(png_data)], extra) + + @staticmethod + def _get_png_data(pillow_image): + image_file = BytesIO() + pillow_image.save(image_file, format='PNG') # Read the PNG header, then discard it because we know it's a PNG. If # this weren't just output from Pillow, we should actually check it. 
@@ -182,13 +222,6 @@ def _get_png_data(pillow_image, optimize, image_data=None): return b''.join(png_data) - def get_stream(self, data, filename=None, alpha=False): - if filename: - return [LazyLocalImage(filename)] - else: - key = f'{self.id}{int(alpha)}' - return [LazyImage(self._cache, key, data)] - class LazyImage(pydyf.Object): def __init__(self, cache, key, data): @@ -243,8 +276,8 @@ def draw(self, stream, concrete_width, concrete_height, image_rendering): self._url_fetcher, self._context) -def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, - forced_mime_type=None, context=None, +def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, dpi, + url, forced_mime_type=None, context=None, orientation='from-image'): """Get an Image instance from an image URI.""" if url in cache: @@ -253,7 +286,10 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, try: with fetch(url_fetcher, url) as result: parsed_url = urlparse(result.get('redirected_url')) - filename = parsed_url.path if parsed_url.scheme == 'file' else None + if parsed_url.scheme == 'file': + filename = unquote(parsed_url.path) + else: + filename = None if 'string' in result: string = result['string'] else: @@ -289,7 +325,7 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, url, image_id = md5(url.encode()).hexdigest() image = RasterImage( pillow_image, image_id, string, filename, cache, - optimize_size, jpeg_quality, orientation) + optimize_size, jpeg_quality, dpi, orientation) except (URLFetchingError, ImageLoadingError) as exception: LOGGER.error('Failed to load image at %r: %s', url, exception) diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index 2373b6e8d..6dd5db8b0 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -361,23 +361,14 @@ def add_group(self, x, y, width, height): self._x_objects[group.id] = group return group - def add_image(self, image, image_rendering): - 
image_name = f'i{image.id}{image_rendering}' + def add_image(self, image, width, height, interpolate): + image_name = f'i{image.id}{width}{height}{interpolate}' self._x_objects[image_name] = None # Set by write_pdf if image_name in self._images: # Reuse image already stored in document return image_name - interpolate = 'true' if image_rendering == 'auto' else 'false' - extra = image.extra.copy() - extra['Interpolate'] = interpolate - if 'SMask' in extra: - extra['SMask'] = pydyf.Stream( - extra['SMask'].stream.copy(), extra['SMask'].extra.copy(), - extra['SMask'].compress) - extra['SMask'].extra['Interpolate'] = interpolate - - xobject = pydyf.Stream(image.stream, extra) + xobject = image.get_xobject(width, height, interpolate) self._images[image_name] = xobject return image_name From e693d721bc344a4988e17191abb9b8241b8af229 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Mon, 3 Apr 2023 16:51:06 +0200 Subject: [PATCH 13/20] Fix image paths on Windows --- weasyprint/images.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/weasyprint/images.py b/weasyprint/images.py index d95faf2fa..88518c4d6 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -8,7 +8,8 @@ from itertools import cycle from math import inf from pathlib import Path -from urllib.parse import unquote, urlparse +from urllib.parse import urlparse +from urllib.request import url2pathname from xml.etree import ElementTree import pydyf @@ -287,7 +288,7 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, jpeg_quality, dpi, with fetch(url_fetcher, url) as result: parsed_url = urlparse(result.get('redirected_url')) if parsed_url.scheme == 'file': - filename = unquote(parsed_url.path) + filename = url2pathname(parsed_url.path) else: filename = None if 'string' in result: From f34174e8be3848beec8d69a1e8020fb0febba7f7 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 4 Apr 2023 16:51:13 +0200 Subject: [PATCH 14/20] Only test extra Python versions on Linux 
--- .github/workflows/tests.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bf732fec1..13c163c91 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,11 +8,12 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', 'pypy-3.8'] - exclude: - # Wheels missing for this configuration - - os: macos-latest - python-version: pypy-3.8 + python-version: ['3.11'] + include: + - os: ubuntu-latest + python-version: '3.7' + - os: ubuntu-latest + python-version: 'pypy-3.8' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 From 6191d609510769ee106785c9cd24ed9e8642c145 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 5 Apr 2023 14:30:25 +0200 Subject: [PATCH 15/20] Test DPI option --- tests/test_api.py | 2 ++ weasyprint/images.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index c7dac77cf..18d7fcb76 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -371,7 +371,9 @@ def test_command_line_render(tmpdir): _run('not_optimized.html out23.pdf -O pdf') _run('not_optimized.html out24.pdf -O none -O fonts -O pdf') _run('not_optimized.html out25.pdf -O all -j 10') + _run('not_optimized.html out26.pdf -O all -j 10 -D 1') assert ( + len(tmpdir.join('out26.pdf').read_binary()) < len(tmpdir.join('out25.pdf').read_binary()) < len(tmpdir.join('out16.pdf').read_binary()) < len(tmpdir.join('out15.pdf').read_binary()) < diff --git a/weasyprint/images.py b/weasyprint/images.py index 88518c4d6..f261e25db 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -53,7 +53,6 @@ def __init__(self, pillow_image, image_id, image_data, filename=None, self.id = image_id self._cache = {} if cache is None else cache - self._optimize_size = optimize_size self._jpeg_quality = jpeg_quality self._dpi = dpi @@ 
-66,8 +65,8 @@ def __init__(self, pillow_image, image_id, image_data, filename=None, self.width = pillow_image.width self.height = pillow_image.height self.ratio = (self.width / self.height) if self.height != 0 else inf + self.optimize = optimize = 'images' in optimize_size - optimize = 'images' in optimize_size if pillow_image.format in ('JPEG', 'MPO'): self.format = 'JPEG' if image_data is None or optimize or jpeg_quality is not None: @@ -105,9 +104,10 @@ def draw(self, stream, concrete_width, concrete_height, image_rendering): image = Image.open(io.BytesIO(self.image_data.data)) width = int(round(self.width * ratio)) height = int(round(self.height * ratio)) - image.thumbnail((width, height)) + image.thumbnail((max(1, width), max(1, height))) image_file = io.BytesIO() - image.save(image_file, format=image.format) + image.save( + image_file, format=image.format, optimize=self.optimize) width, height = image.width, image.height self.image_data = self.cache_image_data(image_file.getvalue()) else: From 190a576ccebb4fec32391e503b64647292116aad Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 5 Apr 2023 14:45:33 +0200 Subject: [PATCH 16/20] Test cache folder option --- tests/test_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_api.py b/tests/test_api.py index 18d7fcb76..5d944d88a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -372,6 +372,7 @@ def test_command_line_render(tmpdir): _run('not_optimized.html out24.pdf -O none -O fonts -O pdf') _run('not_optimized.html out25.pdf -O all -j 10') _run('not_optimized.html out26.pdf -O all -j 10 -D 1') + _run(f'not_optimized.html out27.pdf -c {tmpdir}') assert ( len(tmpdir.join('out26.pdf').read_binary()) < len(tmpdir.join('out25.pdf').read_binary()) < @@ -383,7 +384,7 @@ def test_command_line_render(tmpdir): for i in (16, 18, 19, 21)}) == 1 assert len({ tmpdir.join(f'out{i}.pdf').read_binary() - for i in (15, 17, 23, 24)}) == 1 + for i in (15, 17, 23, 24, 27)}) == 1 
assert len({ tmpdir.join(f'out{i}.pdf').read_binary() for i in (20, 22)}) == 1 From e2ae74e7efe7655a935b5fcd5e671a524f185711 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 5 Apr 2023 14:52:47 +0200 Subject: [PATCH 17/20] =?UTF-8?q?Don=E2=80=99t=20reference=20DiskCache=20i?= =?UTF-8?q?n=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weasyprint/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index efbccf214..d237420e4 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -145,8 +145,7 @@ def render(self, stylesheets=None, presentational_hints=False, :param image_cache: A dictionary used to cache images, or a folder path where images are temporarily stored. - :type image_cache: - :obj:`dict`, :obj:`str` or :class:`document.DiskCache` + :type image_cache: :obj:`dict` or :obj:`str` :param bool forms: Whether PDF forms have to be included. :returns: A :class:`document.Document` object. @@ -200,8 +199,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param image_cache: A dictionary used to cache images, or a folder path where images are temporarily stored. - :type image_cache: - :obj:`dict`, :obj:`str` or :class:`document.DiskCache` + :type image_cache: :obj:`dict` or :obj:`str` :param bytes identifier: A bytestring used as PDF file identifier. :param str variant: A PDF variant name. :param str version: A PDF version number. 
From 9b21d0f10057e39b6393ad440d67356e8da40c74 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Thu, 6 Apr 2023 15:56:52 +0200 Subject: [PATCH 18/20] Add the --dpi option in main() docstring --- weasyprint/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index a1cb392a9..82fbd1713 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -104,6 +104,10 @@ def main(argv=None, stdout=None, stdin=None): JPEG quality between 0 (worst) to 95 (best). + .. option:: -D , --dpi + + Maximum resolution of images embedded in the PDF. + .. option:: -v, --verbose Show warnings and information messages. From 2358a01bc431f6975596550c7a82115fa6c6250f Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 9 Apr 2023 19:45:08 +0200 Subject: [PATCH 19/20] Add an option to disable hinting Hinting is now enabled by default. Fix #1858. --- tests/test_api.py | 6 +++++- weasyprint/__init__.py | 8 ++++---- weasyprint/__main__.py | 12 ++++++------ weasyprint/pdf/fonts.py | 3 ++- weasyprint/pdf/stream.py | 4 ++-- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 5d944d88a..81929c1fd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -376,12 +376,16 @@ def test_command_line_render(tmpdir): assert ( len(tmpdir.join('out26.pdf').read_binary()) < len(tmpdir.join('out25.pdf').read_binary()) < + len(tmpdir.join('out19.pdf').read_binary()) < len(tmpdir.join('out16.pdf').read_binary()) < len(tmpdir.join('out15.pdf').read_binary()) < len(tmpdir.join('out20.pdf').read_binary())) assert len({ tmpdir.join(f'out{i}.pdf').read_binary() - for i in (16, 18, 19, 21)}) == 1 + for i in (16, 18)}) == 1 + assert len({ + tmpdir.join(f'out{i}.pdf').read_binary() + for i in (19, 21)}) == 1 assert len({ tmpdir.join(f'out{i}.pdf').read_binary() for i in (15, 17, 23, 24, 27)}) == 1 diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index d237420e4..eac6b85e7 100644 
--- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -134,8 +134,8 @@ def render(self, stylesheets=None, presentational_hints=False, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain "images", "fonts" and - "pdf". + Optimize size of generated PDF. Can contain "images", "fonts", + "hinting" and "pdf". :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). :param int dpi: Maximum resolution of images embedded in the PDF. :type font_config: :class:`text.fonts.FontConfiguration` @@ -188,8 +188,8 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1, :param bool presentational_hints: Whether HTML presentational hints are followed. :param tuple optimize_size: - Optimize size of generated PDF. Can contain "images", "fonts" and - "pdf". + Optimize size of generated PDF. Can contain "images", "fonts", + "hinting" and "pdf". :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best). :param int dpi: Maximum resolution of images embedded in the PDF. :type font_config: :class:`text.fonts.FontConfiguration` diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index 82fbd1713..633b9ed44 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -90,10 +90,10 @@ def main(argv=None, stdout=None, stdin=None): .. option:: -O , --optimize-size Optimize the size of generated documents. Supported types are - ``images``, ``fonts``, ``pdf``, ``all`` and ``none``. This option can - be used multiple times, ``all`` adds all allowed values, ``none`` - removes all previously set values (including the default ones, - ``fonts`` and ``pdf``). + ``images``, ``fonts``, ``hinting``, ``pdf``, ``all`` and ``none``. + This option can be used multiple times, ``all`` adds all allowed + values, ``none`` removes all previously set values (including the + default ones, ``fonts`` and ``pdf``). .. 
option:: -c , --cache-folder @@ -169,7 +169,7 @@ def main(argv=None, stdout=None, stdin=None): parser.add_argument( '-O', '--optimize-size', action='append', help='optimize output size for specified features', - choices=('images', 'fonts', 'pdf', 'all', 'none'), + choices=('images', 'fonts', 'hinting', 'pdf', 'all', 'none'), default=['fonts', 'pdf']) parser.add_argument( '-c', '--cache-folder', @@ -214,7 +214,7 @@ def main(argv=None, stdout=None, stdin=None): if arg == 'none': optimize_size.clear() elif arg == 'all': - optimize_size |= {'images', 'fonts', 'pdf'} + optimize_size |= {'images', 'fonts', 'hinting', 'pdf'} else: optimize_size.add(arg) diff --git a/weasyprint/pdf/fonts.py b/weasyprint/pdf/fonts.py index 9027f0e64..3ddffd0ae 100644 --- a/weasyprint/pdf/fonts.py +++ b/weasyprint/pdf/fonts.py @@ -25,7 +25,8 @@ def build_fonts_dictionary(pdf, fonts, optimize_size): if 'fonts' in optimize_size and not font.used_in_forms: for file_font in file_fonts: cmap = {**cmap, **file_font.cmap} - font.clean(cmap) + hinting = 'hinting' not in optimize_size + font.clean(cmap, hinting) # Include font if font.type == 'otf': diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py index 6dd5db8b0..f15eb29df 100644 --- a/weasyprint/pdf/stream.py +++ b/weasyprint/pdf/stream.py @@ -97,7 +97,7 @@ def __init__(self, pango_font): if len(widths) > 1 and len(set(widths)) == 1: self.flags += 2 ** (1 - 1) # FixedPitch - def clean(self, cmap): + def clean(self, cmap, hinting): if self.ttfont is None: return @@ -106,7 +106,7 @@ def clean(self, cmap): optimized_font = io.BytesIO() options = subset.Options( retain_gids=True, passthrough_tables=True, - ignore_missing_glyphs=True, hinting=False, + ignore_missing_glyphs=True, hinting=hinting, desubroutinize=True) options.drop_tables += ['GSUB', 'GPOS', 'SVG'] subsetter = subset.Subsetter(options) From 6ff1b97f4c1b69c35d4699aa526f04bd4691a128 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Wed, 12 Apr 2023 10:26:11 +0200 Subject: 
[PATCH 20/20] Remove font hinting information by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if it can be an important feature for some users, the fact that nobody ever complained means that it’s not useful for the majority of users. The option is available but disabled by default. --- docs/first_steps.rst | 2 +- tests/conftest.py | 5 +++-- tests/test_api.py | 10 +++------- weasyprint/__init__.py | 14 +++++++------- weasyprint/__main__.py | 2 +- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/docs/first_steps.rst b/docs/first_steps.rst index 9b3bbcc63..538f29e31 100644 --- a/docs/first_steps.rst +++ b/docs/first_steps.rst @@ -524,7 +524,7 @@ compressed too. # Full size optimization, slower, but generated PDF is smaller HTML('https://example.org/').write_pdf( - 'example.pdf', optimize_size=('fonts', 'images', 'pdf')) + 'example.pdf', optimize_size=('fonts', 'images', 'hinting', 'pdf')) ``image_cache`` gives the possibility to use a cache for images, avoiding to download, parse and optimize them each time they are used. 
diff --git a/tests/conftest.py b/tests/conftest.py index 3fa957b55..4daf47455 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,8 +74,9 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1, def html_write_png(self, target=None, stylesheets=None, resolution=96, - presentational_hints=False, optimize_size=('fonts', 'pdf'), - font_config=None, counter_style=None, image_cache=None): + presentational_hints=False, + optimize_size=('fonts', 'hinting', 'pdf'), font_config=None, + counter_style=None, image_cache=None): return self.render( stylesheets, presentational_hints=presentational_hints, optimize_size=optimize_size, font_config=font_config, diff --git a/tests/test_api.py b/tests/test_api.py index 81929c1fd..6049af31e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -303,7 +303,7 @@ def test_command_line_render(tmpdir): tmpdir.join(name).write_binary(pattern_bytes) # Reference - optimize_size = ('fonts', 'pdf') + optimize_size = ('fonts', 'hinting', 'pdf') html_obj = FakeHTML(string=combined, base_url='dummy.html') pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size) rotated_pdf_bytes = FakeHTML( @@ -369,23 +369,19 @@ def test_command_line_render(tmpdir): _run('not_optimized.html out21.pdf -O none -O all') _run('not_optimized.html out22.pdf -O all -O none') _run('not_optimized.html out23.pdf -O pdf') - _run('not_optimized.html out24.pdf -O none -O fonts -O pdf') + _run('not_optimized.html out24.pdf -O none -O fonts -O pdf -O hinting') _run('not_optimized.html out25.pdf -O all -j 10') _run('not_optimized.html out26.pdf -O all -j 10 -D 1') _run(f'not_optimized.html out27.pdf -c {tmpdir}') assert ( len(tmpdir.join('out26.pdf').read_binary()) < len(tmpdir.join('out25.pdf').read_binary()) < - len(tmpdir.join('out19.pdf').read_binary()) < len(tmpdir.join('out16.pdf').read_binary()) < len(tmpdir.join('out15.pdf').read_binary()) < len(tmpdir.join('out20.pdf').read_binary())) assert len({ tmpdir.join(f'out{i}.pdf').read_binary() 
- for i in (16, 18)}) == 1 - assert len({ - tmpdir.join(f'out{i}.pdf').read_binary() - for i in (19, 21)}) == 1 + for i in (16, 18, 19, 21)}) == 1 assert len({ tmpdir.join(f'out{i}.pdf').read_binary() for i in (15, 17, 23, 24, 27)}) == 1 diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index eac6b85e7..ce6351faf 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -118,9 +118,9 @@ def _ph_stylesheets(self): return [HTML5_PH_STYLESHEET] def render(self, stylesheets=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), jpeg_quality=None, dpi=None, - font_config=None, counter_style=None, image_cache=None, - forms=False): + optimize_size=('fonts', 'hinting', 'pdf'), jpeg_quality=None, + dpi=None, font_config=None, counter_style=None, + image_cache=None, forms=False): """Lay out and paginate the document, but do not (yet) export it. This returns a :class:`document.Document` object which provides @@ -156,10 +156,10 @@ def render(self, stylesheets=None, presentational_hints=False, def write_pdf(self, target=None, stylesheets=None, zoom=1, attachments=None, finisher=None, presentational_hints=False, - optimize_size=('fonts', 'pdf'), jpeg_quality=None, dpi=None, - font_config=None, counter_style=None, image_cache=None, - identifier=None, variant=None, version=None, forms=False, - custom_metadata=False): + optimize_size=('fonts', 'hinting', 'pdf'), jpeg_quality=None, + dpi=None, font_config=None, counter_style=None, + image_cache=None, identifier=None, variant=None, + version=None, forms=False, custom_metadata=False): """Render the document to a PDF file. 
This is a shortcut for calling :meth:`render`, then diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py index 633b9ed44..aef713d88 100644 --- a/weasyprint/__main__.py +++ b/weasyprint/__main__.py @@ -170,7 +170,7 @@ def main(argv=None, stdout=None, stdin=None): '-O', '--optimize-size', action='append', help='optimize output size for specified features', choices=('images', 'fonts', 'hinting', 'pdf', 'all', 'none'), - default=['fonts', 'pdf']) + default=['fonts', 'hinting', 'pdf']) parser.add_argument( '-c', '--cache-folder', help='Store cache on disk instead of memory. The ``folder`` is '