Skip to content

Commit

Permalink
Merge pull request #1853 from Kozea/size
Browse files Browse the repository at this point in the history
Reduce PDF size
  • Loading branch information
liZe authored Apr 12, 2023
2 parents 4eb7fa3 + 6ff1b97 commit d797e75
Show file tree
Hide file tree
Showing 20 changed files with 360 additions and 199 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', 'pypy-3.8']
exclude:
# Wheels missing for this configuration
- os: macos-latest
python-version: pypy-3.8
python-version: ['3.11']
include:
- os: ubuntu-latest
python-version: '3.7'
- os: ubuntu-latest
python-version: 'pypy-3.8'
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
Expand Down
12 changes: 9 additions & 3 deletions docs/first_steps.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ WeasyPrint |version| depends on:

* Python_ ≥ 3.7.0
* Pango_ ≥ 1.44.0
* pydyf_ ≥ 0.5.0
* pydyf_ ≥ 0.6.0
* CFFI_ ≥ 0.6
* html5lib_ ≥ 1.1
* tinycss2_ ≥ 1.0.0
Expand Down Expand Up @@ -513,7 +513,8 @@ WeasyPrint provides two options to deal with images: ``optimize_size`` and

``optimize_size`` can enable size optimization for images, but also for fonts.
When enabled, the generated PDF will include smaller images and fonts, but the
rendering time may be slightly increased.
rendering time may be slightly increased. The whole structure of the PDF can be
compressed too.

.. code-block:: python
Expand All @@ -523,7 +524,7 @@ rendering time may be slightly increased.
# Full size optimization, slower, but generated PDF is smaller
HTML('https://example.org/').write_pdf(
'example.pdf', optimize_size=('fonts', 'images'))
'example.pdf', optimize_size=('fonts', 'images', 'hinting', 'pdf'))
``image_cache`` makes it possible to use a cache for images, which avoids
downloading, parsing and optimizing them each time they are used.
Expand All @@ -539,6 +540,11 @@ time when you render a lot of documents that use the same images.
HTML(f'https://example.org/?id={i}').write_pdf(
f'example-{i}.pdf', image_cache=cache)
It’s also possible to cache images on disk instead of keeping them in memory.
The ``--cache-folder`` CLI option can be used to define the folder used to
store temporary images. You can also provide this folder path as a string for
``image_cache``.


Logging
~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ requires-python = '>=3.7'
readme = {file = 'README.rst', content-type = 'text/x-rst'}
license = {file = 'LICENSE'}
dependencies = [
'pydyf >=0.5.0',
'pydyf >=0.6.0',
'cffi >=0.6',
'html5lib >=1.1',
'tinycss2 >=1.0.0',
Expand Down
5 changes: 3 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1,


def html_write_png(self, target=None, stylesheets=None, resolution=96,
presentational_hints=False, optimize_size=('fonts',),
font_config=None, counter_style=None, image_cache=None):
presentational_hints=False,
optimize_size=('fonts', 'hinting', 'pdf'), font_config=None,
counter_style=None, image_cache=None):
return self.render(
stylesheets, presentational_hints=presentational_hints,
optimize_size=optimize_size, font_config=font_config,
Expand Down
58 changes: 41 additions & 17 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,12 @@ def test_command_line_render(tmpdir):
tmpdir.join(name).write_binary(pattern_bytes)

# Reference
optimize_size = ('fonts', 'hinting', 'pdf')
html_obj = FakeHTML(string=combined, base_url='dummy.html')
pdf_bytes = html_obj.write_pdf()
pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size)
rotated_pdf_bytes = FakeHTML(
string=combined, base_url='dummy.html',
media_type='screen').write_pdf()
media_type='screen').write_pdf(optimize_size=optimize_size)

tmpdir.join('no_css.html').write_binary(html)
tmpdir.join('combined.html').write_binary(combined)
Expand Down Expand Up @@ -367,7 +368,14 @@ def test_command_line_render(tmpdir):
_run('not_optimized.html out20.pdf -O none')
_run('not_optimized.html out21.pdf -O none -O all')
_run('not_optimized.html out22.pdf -O all -O none')
_run('not_optimized.html out23.pdf -O pdf')
_run('not_optimized.html out24.pdf -O none -O fonts -O pdf -O hinting')
_run('not_optimized.html out25.pdf -O all -j 10')
_run('not_optimized.html out26.pdf -O all -j 10 -D 1')
_run(f'not_optimized.html out27.pdf -c {tmpdir}')
assert (
len(tmpdir.join('out26.pdf').read_binary()) <
len(tmpdir.join('out25.pdf').read_binary()) <
len(tmpdir.join('out16.pdf').read_binary()) <
len(tmpdir.join('out15.pdf').read_binary()) <
len(tmpdir.join('out20.pdf').read_binary()))
Expand All @@ -376,19 +384,19 @@ def test_command_line_render(tmpdir):
for i in (16, 18, 19, 21)}) == 1
assert len({
tmpdir.join(f'out{i}.pdf').read_binary()
for i in (15, 17)}) == 1
for i in (15, 17, 23, 24, 27)}) == 1
assert len({
tmpdir.join(f'out{i}.pdf').read_binary()
for i in (20, 22)}) == 1
os.environ.pop('SOURCE_DATE_EPOCH')

stdout = _run('combined.html -')
stdout = _run('-O none combined.html -')
assert stdout.count(b'attachment') == 0
stdout = _run('combined.html -')
stdout = _run('-O none combined.html -')
assert stdout.count(b'attachment') == 0
stdout = _run('-a pattern.png combined.html -')
stdout = _run('-O none -a pattern.png combined.html -')
assert stdout.count(b'attachment') == 1
stdout = _run('-a style.css -a pattern.png combined.html -')
stdout = _run('-O none -a style.css -a pattern.png combined.html -')
assert stdout.count(b'attachment') == 2

os.mkdir('subdirectory')
Expand Down Expand Up @@ -423,42 +431,58 @@ def test_command_line_render(tmpdir):
(4, '2.0'),
))
def test_pdfa(version, pdf_version):
stdout = _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
stdout = _run(f'--pdf-variant=pdf/a-{version}b -O none - -', b'test')
assert f'PDF-{pdf_version}'.encode() in stdout
assert f'part="{version}"'.encode() in stdout


@pytest.mark.parametrize('version, pdf_version', (
(1, '1.4'),
(2, '1.7'),
(3, '1.7'),
(4, '2.0'),
))
def test_pdfa_compressed(version, pdf_version):
_run(f'--pdf-variant=pdf/a-{version}b - -', b'test')


def test_pdfua():
stdout = _run('--pdf-variant=pdf/ua-1 - -', b'test')
stdout = _run('--pdf-variant=pdf/ua-1 -O none - -', b'test')
assert b'part="1"' in stdout


def test_pdfua_compressed():
_run('--pdf-variant=pdf/ua-1 - -', b'test')


def test_pdf_identifier():
stdout = _run('--pdf-identifier=abc - -', b'test')
stdout = _run('--pdf-identifier=abc -O none - -', b'test')
assert b'abc' in stdout


def test_pdf_version():
stdout = _run('--pdf-version=1.4 - -', b'test')
stdout = _run('--pdf-version=1.4 -O none - -', b'test')
assert b'PDF-1.4' in stdout


def test_pdf_custom_metadata():
stdout = _run('--custom-metadata - -', b'<meta name=key content=value />')
stdout = _run(
'--custom-metadata -O none - -',
b'<meta name=key content=value />')
assert b'/key' in stdout
assert b'value' in stdout


def test_bad_pdf_custom_metadata():
stdout = _run(
'--custom-metadata - -',
'--custom-metadata -O none - -',
'<meta name=é content=value />'.encode('latin1'))
assert b'value' not in stdout


def test_partial_pdf_custom_metadata():
stdout = _run(
'--custom-metadata - -',
'--custom-metadata -O none - -',
'<meta name=a.b/céd0 content=value />'.encode('latin1'))
assert b'/abcd0' in stdout
assert b'value' in stdout
Expand All @@ -470,7 +494,7 @@ def test_partial_pdf_custom_metadata():
(b'<textarea></textarea>', b'/Tx'),
))
def test_pdf_inputs(html, field):
stdout = _run('--pdf-forms - -', html)
stdout = _run('--pdf-forms -O none - -', html)
assert b'AcroForm' in stdout
assert field in stdout
stdout = _run('- -', html)
Expand All @@ -484,8 +508,8 @@ def test_pdf_inputs(html, field):
))
def test_appearance(css, with_forms, without_forms):
html = f'<input style="{css}">'.encode()
assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms
assert (b'AcroForm' in _run('- -', html)) is without_forms
assert (b'AcroForm' in _run('--pdf-forms -O none - -', html)) is with_forms
assert (b'AcroForm' in _run(' -O none - -', html)) is without_forms


def test_reproducible():
Expand Down
40 changes: 20 additions & 20 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
def test_page_size_zoom(zoom):
pdf = FakeHTML(string='<style>@page{size:3in 4in').write_pdf(zoom=zoom)
width, height = int(216 * zoom), int(288 * zoom)
assert f'/MediaBox [ 0 0 {width} {height} ]'.encode() in pdf
assert f'/MediaBox [0 0 {width} {height}]'.encode() in pdf


@assert_no_logs
Expand Down Expand Up @@ -57,7 +57,7 @@ def test_bookmarks_2():
@assert_no_logs
def test_bookmarks_3():
pdf = FakeHTML(string='<h1>a nbsp…</h1>').write_pdf()
assert re.findall(b'/Title <(.*)>', pdf) == [
assert re.findall(b'/Title <(\\w*)>', pdf) == [
b'feff006100a0006e0062007300702026']


Expand Down Expand Up @@ -327,11 +327,11 @@ def test_links():
''', base_url=resource_filename('<inline HTML>')).write_pdf()

uris = re.findall(b'/URI \\((.*)\\)', pdf)
types = re.findall(b'/S (.*)', pdf)
subtypes = re.findall(b'/Subtype (.*)', pdf)
types = re.findall(b'/S (/\\w*)', pdf)
subtypes = re.findall(b'/Subtype (/\\w*)', pdf)
rects = [
[float(number) for number in match.split()] for match in re.findall(
b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]', pdf)]
b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]', pdf)]

# 30pt wide (like the image), 20pt high (like line-height)
assert uris.pop(0) == b'https://weasyprint.org'
Expand All @@ -349,7 +349,7 @@ def test_links():
assert subtypes.pop(0) == b'/Link'
assert b'/Dest (lipsum)' in pdf
link = re.search(
b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
pdf).group(1)
assert [float(number) for number in link.split()] == [0, TOP, 0]
assert rects.pop(0) == [10, TOP - 100, 10 + 32, TOP - 100 - 20]
Expand All @@ -362,7 +362,7 @@ def test_links():
assert subtypes.pop(0) == b'/Link'
assert b'/Dest (hello)' in pdf
link = re.search(
b'\\(hello\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
b'\\(hello\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
pdf).group(1)
assert [float(number) for number in link.split()] == [0, TOP - 200, 0]
assert rects.pop(0) == [0, TOP, RIGHT, TOP - 30]
Expand All @@ -387,7 +387,7 @@ def test_relative_links_no_height():
string='<a href="../lipsum" style="display: block"></a>a',
base_url='https://weasyprint.org/foo/bar/').write_pdf()
assert b'/S /URI\n/URI (https://weasyprint.org/foo/lipsum)'
assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
assert f'/Rect [0 {TOP} {RIGHT} {TOP}]'.encode() in pdf


@assert_no_logs
Expand All @@ -397,7 +397,7 @@ def test_relative_links_missing_base():
string='<a href="../lipsum" style="display: block"></a>a',
base_url=None).write_pdf()
assert b'/S /URI\n/URI (../lipsum)'
assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
assert f'/Rect [0 {TOP} {RIGHT} {TOP}]'.encode() in pdf


@assert_no_logs
Expand All @@ -421,11 +421,11 @@ def test_relative_links_internal():
base_url=None).write_pdf()
assert b'/Dest (lipsum)' in pdf
link = re.search(
b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
pdf).group(1)
assert [float(number) for number in link.split()] == [0, TOP, 0]
rect = re.search(
b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
pdf).group(1)
assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]

Expand All @@ -437,11 +437,11 @@ def test_relative_links_anchors():
base_url=None).write_pdf()
assert b'/Dest (lipsum)' in pdf
link = re.search(
b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
pdf).group(1)
assert [float(number) for number in link.split()] == [0, TOP, 0]
rect = re.search(
b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
pdf).group(1)
assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]

Expand Down Expand Up @@ -474,11 +474,11 @@ def test_missing_links():
assert b'/Dest (lipsum)' in pdf
assert len(logs) == 1
link = re.search(
b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
b'\\(lipsum\\) \\[\\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+)]',
pdf).group(1)
assert [float(number) for number in link.split()] == [0, TOP - 15, 0]
rect = re.search(
b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
b'/Rect \\[([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+)\\]',
pdf).group(1)
assert [float(number) for number in rect.split()] == [
0, TOP, RIGHT, TOP - 15]
Expand All @@ -495,8 +495,8 @@ def test_anchor_multiple_pages():
<a href="#lipsum"></a>
</div>
''', base_url=None).write_pdf()
first_page, = re.findall(b'/Kids \\[ (\\d+) 0 R', pdf)
assert b'/Names [ (lipsum) [ ' + first_page in pdf
first_page, = re.findall(b'/Kids \\[(\\d+) 0 R', pdf)
assert b'/Names [(lipsum) [' + first_page in pdf


@assert_no_logs
Expand Down Expand Up @@ -717,6 +717,6 @@ def test_bleed(style, media, bleed, trim):
<style>@page { %s }</style>
<body>test
''' % style).write_pdf()
assert '/MediaBox [ {} {} {} {} ]'.format(*media).encode() in pdf
assert '/BleedBox [ {} {} {} {} ]'.format(*bleed).encode() in pdf
assert '/TrimBox [ {} {} {} {} ]'.format(*trim).encode() in pdf
assert '/MediaBox [{} {} {} {}]'.format(*media).encode() in pdf
assert '/BleedBox [{} {} {} {}]'.format(*bleed).encode() in pdf
assert '/TrimBox [{} {} {} {}]'.format(*trim).encode() in pdf
15 changes: 14 additions & 1 deletion tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@ def _ua_stylesheets(self, forms=False):
TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET
else stylesheet for stylesheet in super()._ua_stylesheets(forms)]

def write_pdf(self, target=None, stylesheets=None, zoom=1,
attachments=None, finisher=None, presentational_hints=False,
optimize_size=('fonts',), jpeg_quality=None, dpi=None,
font_config=None, counter_style=None, image_cache=None,
identifier=None, variant=None, version=None, forms=False,
custom_metadata=False):
# Override function so that only fonts are optimized by default: the
# 'hinting' and 'pdf' size optimizations must be requested explicitly
return super().write_pdf(
target, stylesheets, zoom, attachments, finisher,
presentational_hints, optimize_size, jpeg_quality, dpi,
font_config, counter_style, image_cache, identifier, variant,
version, forms, custom_metadata)


def resource_filename(basename):
"""Return the absolute path of the resource called ``basename``."""
Expand Down Expand Up @@ -182,7 +195,7 @@ def _parse_base(html_content, base_url=BASE_URL):
style_for = get_all_computed_styles(document, counter_style=counter_style)
get_image_from_uri = functools.partial(
images.get_image_from_uri, cache={}, url_fetcher=document.url_fetcher,
optimize_size=())
optimize_size=(), jpeg_quality=None, dpi=None)
target_collector = TargetCollector()
footnotes = []
return (
Expand Down
Loading

0 comments on commit d797e75

Please sign in to comment.