Merge pull request #1853 from Kozea/size

Reduce PDF size
Kozea · Apr 12, 2023 · d797e75 · d797e75
2 parents 4eb7fa3 + 6ff1b97
commit d797e75
Show file tree

Hide file tree

Showing 20 changed files with 360 additions and 199 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -8,11 +8,12 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', 'pypy-3.8']
-        exclude:
-          # Wheels missing for this configuration
-          - os: macos-latest
-            python-version: pypy-3.8
+        python-version: ['3.11']
+        include:
+          - os: ubuntu-latest
+            python-version: '3.7'
+          - os: ubuntu-latest
+            python-version: 'pypy-3.8'
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4

diff --git a/docs/first_steps.rst b/docs/first_steps.rst
@@ -11,7 +11,7 @@ WeasyPrint |version| depends on:
 
 * Python_ ≥ 3.7.0
 * Pango_ ≥ 1.44.0
-* pydyf_ ≥ 0.5.0
+* pydyf_ ≥ 0.6.0
 * CFFI_ ≥ 0.6
 * html5lib_ ≥ 1.1
 * tinycss2_ ≥ 1.0.0
@@ -513,7 +513,8 @@ WeasyPrint provides two options to deal with images: ``optimize_size`` and
 
 ``optimize_size`` can enable size optimization for images, but also for fonts.
 When enabled, the generated PDF will include smaller images and fonts, but the
-rendering time may be slightly increased.
+rendering time may be slightly increased. The whole structure of the PDF can be
+compressed too.
 
 .. code-block:: python
 
@@ -523,7 +524,7 @@ rendering time may be slightly increased.
 
     # Full size optimization, slower, but generated PDF is smaller
     HTML('https://example.org/').write_pdf(
-        'example.pdf', optimize_size=('fonts', 'images'))
+        'example.pdf', optimize_size=('fonts', 'images', 'hinting', 'pdf'))
 
 ``image_cache`` gives the possibility to use a cache for images, avoiding to
 download, parse and optimize them each time they are used.
@@ -539,6 +540,11 @@ time when you render a lot of documents that use the same images.
         HTML(f'https://example.org/?id={i}').write_pdf(
             f'example-{i}.pdf', image_cache=cache)
 
+It’s also possible to cache images on disk instead of keeping them in memory.
+The ``--cache-folder`` CLI option can be used to define the folder used to
+store temporary images. You can also provide this folder path as a string for
+``image_cache``.
+
 
 Logging
 ~~~~~~~

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,7 +12,7 @@ requires-python = '>=3.7'
 readme = {file = 'README.rst', content-type = 'text/x-rst'}
 license = {file = 'LICENSE'}
 dependencies = [
-  'pydyf >=0.5.0',
+  'pydyf >=0.6.0',
   'cffi >=0.6',
   'html5lib >=1.1',
   'tinycss2 >=1.0.0',

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -74,8 +74,9 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1,
 
 
 def html_write_png(self, target=None, stylesheets=None, resolution=96,
-                   presentational_hints=False, optimize_size=('fonts',),
-                   font_config=None, counter_style=None, image_cache=None):
+                   presentational_hints=False,
+                   optimize_size=('fonts', 'hinting', 'pdf'), font_config=None,
+                   counter_style=None, image_cache=None):
     return self.render(
         stylesheets, presentational_hints=presentational_hints,
         optimize_size=optimize_size, font_config=font_config,

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -303,11 +303,12 @@ def test_command_line_render(tmpdir):
         tmpdir.join(name).write_binary(pattern_bytes)
 
     # Reference
+    optimize_size = ('fonts', 'hinting', 'pdf')
     html_obj = FakeHTML(string=combined, base_url='dummy.html')
-    pdf_bytes = html_obj.write_pdf()
+    pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size)
     rotated_pdf_bytes = FakeHTML(
         string=combined, base_url='dummy.html',
-        media_type='screen').write_pdf()
+        media_type='screen').write_pdf(optimize_size=optimize_size)
 
     tmpdir.join('no_css.html').write_binary(html)
     tmpdir.join('combined.html').write_binary(combined)
@@ -367,7 +368,14 @@ def test_command_line_render(tmpdir):
     _run('not_optimized.html out20.pdf -O none')
     _run('not_optimized.html out21.pdf -O none -O all')
     _run('not_optimized.html out22.pdf -O all -O none')
+    _run('not_optimized.html out23.pdf -O pdf')
+    _run('not_optimized.html out24.pdf -O none -O fonts -O pdf -O hinting')
+    _run('not_optimized.html out25.pdf -O all -j 10')
+    _run('not_optimized.html out26.pdf -O all -j 10 -D 1')
+    _run(f'not_optimized.html out27.pdf -c {tmpdir}')
     assert (
+        len(tmpdir.join('out26.pdf').read_binary()) <
+        len(tmpdir.join('out25.pdf').read_binary()) <
         len(tmpdir.join('out16.pdf').read_binary()) <
         len(tmpdir.join('out15.pdf').read_binary()) <
         len(tmpdir.join('out20.pdf').read_binary()))
@@ -376,19 +384,19 @@ def test_command_line_render(tmpdir):
         for i in (16, 18, 19, 21)}) == 1
     assert len({
         tmpdir.join(f'out{i}.pdf').read_binary()
-        for i in (15, 17)}) == 1
+        for i in (15, 17, 23, 24, 27)}) == 1
     assert len({
         tmpdir.join(f'out{i}.pdf').read_binary()
         for i in (20, 22)}) == 1
     os.environ.pop('SOURCE_DATE_EPOCH')
 
-    stdout = _run('combined.html -')
+    stdout = _run('-O none combined.html -')
     assert stdout.count(b'attachment') == 0
-    stdout = _run('combined.html -')
+    stdout = _run('-O none combined.html -')
     assert stdout.count(b'attachment') == 0
-    stdout = _run('-a pattern.png combined.html -')
+    stdout = _run('-O none -a pattern.png combined.html -')
     assert stdout.count(b'attachment') == 1
-    stdout = _run('-a style.css -a pattern.png combined.html -')
+    stdout = _run('-O none -a style.css -a pattern.png combined.html -')
     assert stdout.count(b'attachment') == 2
 
     os.mkdir('subdirectory')
@@ -423,42 +431,58 @@ def test_command_line_render(tmpdir):
     (4, '2.0'),
 ))
 def test_pdfa(version, pdf_version):
-    stdout = _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
+    stdout = _run(f'--pdf-variant=pdf/a-{version}b -O none - -', b'test')
     assert f'PDF-{pdf_version}'.encode() in stdout
     assert f'part="{version}"'.encode() in stdout
 
 
+@pytest.mark.parametrize('version, pdf_version', (
+    (1, '1.4'),
+    (2, '1.7'),
+    (3, '1.7'),
+    (4, '2.0'),
+))
+def test_pdfa_compressed(version, pdf_version):
+    _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
+
+
 def test_pdfua():
-    stdout = _run('--pdf-variant=pdf/ua-1 - -', b'test')
+    stdout = _run('--pdf-variant=pdf/ua-1 -O none - -', b'test')
     assert b'part="1"' in stdout
 
 
+def test_pdfua_compressed():
+    _run('--pdf-variant=pdf/ua-1 - -', b'test')
+
+
 def test_pdf_identifier():
-    stdout = _run('--pdf-identifier=abc - -', b'test')
+    stdout = _run('--pdf-identifier=abc -O none - -', b'test')
     assert b'abc' in stdout
 
 
 def test_pdf_version():
-    stdout = _run('--pdf-version=1.4 - -', b'test')
+    stdout = _run('--pdf-version=1.4 -O none - -', b'test')
     assert b'PDF-1.4' in stdout
 
 
 def test_pdf_custom_metadata():
-    stdout = _run('--custom-metadata - -', b'<meta name=key content=value />')
+    stdout = _run(
+        '--custom-metadata -O none - -',
+        b'<meta name=key content=value />')
     assert b'/key' in stdout
     assert b'value' in stdout
 
 
 def test_bad_pdf_custom_metadata():
     stdout = _run(
-        '--custom-metadata - -',
+        '--custom-metadata -O none - -',
         '<meta name=é content=value />'.encode('latin1'))
     assert b'value' not in stdout
 
 
 def test_partial_pdf_custom_metadata():
     stdout = _run(
-        '--custom-metadata - -',
+        '--custom-metadata -O none - -',
         '<meta name=a.b/céd0 content=value />'.encode('latin1'))
     assert b'/abcd0' in stdout
     assert b'value' in stdout
@@ -470,7 +494,7 @@ def test_partial_pdf_custom_metadata():
     (b'<textarea></textarea>', b'/Tx'),
 ))
 def test_pdf_inputs(html, field):
-    stdout = _run('--pdf-forms - -', html)
+    stdout = _run('--pdf-forms -O none - -', html)
     assert b'AcroForm' in stdout
     assert field in stdout
     stdout = _run('- -', html)
@@ -484,8 +508,8 @@ def test_pdf_inputs(html, field):
 ))
 def test_appearance(css, with_forms, without_forms):
     html = f'<input style="{css}">'.encode()
-    assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms
-    assert (b'AcroForm' in _run('- -', html)) is without_forms
+    assert (b'AcroForm' in _run('--pdf-forms -O none - -', html)) is with_forms
+    assert (b'AcroForm' in _run(' -O none - -', html)) is without_forms
 
 
 def test_reproducible():

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -26,7 +26,7 @@
 def test_page_size_zoom(zoom):
     pdf = FakeHTML(string='<style>@page{size:3in 4in').write_pdf(zoom=zoom)
     width, height = int(216 * zoom), int(288 * zoom)
-    assert f'/MediaBox [ 0 0 {width} {height} ]'.encode() in pdf
+    assert f'/MediaBox [0 0 {width} {height}]'.encode() in pdf
 
 
 @assert_no_logs
@@ -57,7 +57,7 @@ def test_bookmarks_2():
 @assert_no_logs
 def test_bookmarks_3():
     pdf = FakeHTML(string='<h1>a nbsp…</h1>').write_pdf()
-    assert re.findall(b'/Title <(.*)>', pdf) == [
+    assert re.findall(b'/Title <(\\w*)>', pdf) == [
         b'feff006100a0006e0062007300702026']
 
 
@@ -327,11 +327,11 @@ def test_links():
     ''', base_url=resource_filename('<inline HTML>')).write_pdf()
 
     uris = re.findall(b'/URI \\((.*)\\)', pdf)
-    types = re.findall(b'/S (.*)', pdf)
-    subtypes = re.findall(b'/Subtype (.*)', pdf)
+    types = re.findall(b'/S (/\\w*)', pdf)
+    subtypes = re.findall(b'/Subtype (/\\w*)', pdf)
     rects = [
         [float(number) for number in match.split()] for match in re.findall(
-            b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]', pdf)]
+            b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]', pdf)]
 
     # 30pt wide (like the image), 20pt high (like line-height)
     assert uris.pop(0) == b'https://weasyprint.org'
@@ -349,7 +349,7 @@ def test_links():
     assert subtypes.pop(0) == b'/Link'
     assert b'/Dest (lipsum)' in pdf
     link = re.search(
-        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
+        b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
         pdf).group(1)
     assert [float(number) for number in link.split()] == [0, TOP, 0]
     assert rects.pop(0) == [10, TOP - 100, 10 + 32, TOP - 100 - 20]
@@ -362,7 +362,7 @@ def test_links():
     assert subtypes.pop(0) == b'/Link'
     assert b'/Dest (hello)' in pdf
     link = re.search(
-        b'\\(hello\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
+        b'\\(hello\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
         pdf).group(1)
     assert [float(number) for number in link.split()] == [0, TOP - 200, 0]
     assert rects.pop(0) == [0, TOP, RIGHT, TOP - 30]
@@ -387,7 +387,7 @@ def test_relative_links_no_height():
         string='<a href="../lipsum" style="display: block"></a>a',
         base_url='https://weasyprint.org/foo/bar/').write_pdf()
     assert b'/S /URI\n/URI (https://weasyprint.org/foo/lipsum)'
-    assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
+    assert f'/Rect [0 {TOP} {RIGHT} {TOP}]'.encode() in pdf
 
 
 @assert_no_logs
@@ -397,7 +397,7 @@ def test_relative_links_missing_base():
         string='<a href="../lipsum" style="display: block"></a>a',
         base_url=None).write_pdf()
     assert b'/S /URI\n/URI (../lipsum)'
-    assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
+    assert f'/Rect [0 {TOP} {RIGHT} {TOP}]'.encode() in pdf
 
 
 @assert_no_logs
@@ -421,11 +421,11 @@ def test_relative_links_internal():
         base_url=None).write_pdf()
     assert b'/Dest (lipsum)' in pdf
     link = re.search(
-        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
+        b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
         pdf).group(1)
     assert [float(number) for number in link.split()] == [0, TOP, 0]
     rect = re.search(
-        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
+        b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
         pdf).group(1)
     assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]
 
@@ -437,11 +437,11 @@ def test_relative_links_anchors():
         base_url=None).write_pdf()
     assert b'/Dest (lipsum)' in pdf
     link = re.search(
-        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
+        b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
         pdf).group(1)
     assert [float(number) for number in link.split()] == [0, TOP, 0]
     rect = re.search(
-        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
+        b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
         pdf).group(1)
     assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]
 
@@ -474,11 +474,11 @@ def test_missing_links():
     assert b'/Dest (lipsum)' in pdf
     assert len(logs) == 1
     link = re.search(
-        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
+        b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
         pdf).group(1)
     assert [float(number) for number in link.split()] == [0, TOP - 15, 0]
     rect = re.search(
-        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
+        b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
         pdf).group(1)
     assert [float(number) for number in rect.split()] == [
         0, TOP, RIGHT, TOP - 15]
@@ -495,8 +495,8 @@ def test_anchor_multiple_pages():
         <a href="#lipsum"></a>
       </div>
     ''', base_url=None).write_pdf()
-    first_page, = re.findall(b'/Kids \\[ (\\d+) 0 R', pdf)
-    assert b'/Names [ (lipsum) [ ' + first_page in pdf
+    first_page, = re.findall(b'/Kids \\[(\\d+) 0 R', pdf)
+    assert b'/Names [(lipsum) [' + first_page in pdf
 
 
 @assert_no_logs
@@ -717,6 +717,6 @@ def test_bleed(style, media, bleed, trim):
       <style>@page { %s }</style>
       <body>test
     ''' % style).write_pdf()
-    assert '/MediaBox [ {} {} {} {} ]'.format(*media).encode() in pdf
-    assert '/BleedBox [ {} {} {} {} ]'.format(*bleed).encode() in pdf
-    assert '/TrimBox [ {} {} {} {} ]'.format(*trim).encode() in pdf
+    assert '/MediaBox [{} {} {} {}]'.format(*media).encode() in pdf
+    assert '/BleedBox [{} {} {} {}]'.format(*bleed).encode() in pdf
+    assert '/TrimBox [{} {} {} {}]'.format(*trim).encode() in pdf
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -53,6 +53,19 @@ def _ua_stylesheets(self, forms=False):
             TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET
             else stylesheet for stylesheet in super()._ua_stylesheets(forms)]
 
+    def write_pdf(self, target=None, stylesheets=None, zoom=1,
+                  attachments=None, finisher=None, presentational_hints=False,
+                  optimize_size=('fonts',), jpeg_quality=None, dpi=None,
+                  font_config=None, counter_style=None, image_cache=None,
+                  identifier=None, variant=None, version=None, forms=False,
+                  custom_metadata=False):
+        # Override to keep the 'pdf' and 'hinting' optimizations off by default
+        return super().write_pdf(
+            target, stylesheets, zoom, attachments, finisher,
+            presentational_hints, optimize_size, jpeg_quality, dpi,
+            font_config, counter_style, image_cache, identifier, variant,
+            version, forms, custom_metadata)
+
 
 def resource_filename(basename):
     """Return the absolute path of the resource called ``basename``."""
@@ -182,7 +195,7 @@ def _parse_base(html_content, base_url=BASE_URL):
     style_for = get_all_computed_styles(document, counter_style=counter_style)
     get_image_from_uri = functools.partial(
         images.get_image_from_uri, cache={}, url_fetcher=document.url_fetcher,
-        optimize_size=())
+        optimize_size=(), jpeg_quality=None, dpi=None)
     target_collector = TargetCollector()
     footnotes = []
     return (