From ce84073f1ac63a44c6a1ef5760b2055f9083a4e7 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sun, 28 Jan 2018 16:21:48 +0100 Subject: [PATCH] Use Latin-1-decoded strings instead of bytestrings in pdfrw streams Fix #558. --- weasyprint/pdf.py | 8 +++++--- weasyprint/tests/test_pdf.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/weasyprint/pdf.py b/weasyprint/pdf.py index 16e7d5e6c..88d3a511c 100644 --- a/weasyprint/pdf.py +++ b/weasyprint/pdf.py @@ -85,13 +85,15 @@ def _create_compressed_file_object(source): pdf_file_object = PdfDict( Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode')) - pdf_file_object.stream = b'' + + # pdfrw needs Latin-1-decoded unicode strings in object.stream + pdf_file_object.stream = '' size = 0 for data in iter(lambda: source.read(4096), b''): size += len(data) md5.update(data) - pdf_file_object.stream += compress.compress(data) - pdf_file_object.stream += compress.flush(zlib.Z_FINISH) + pdf_file_object.stream += compress.compress(data).decode('latin-1') + pdf_file_object.stream += compress.flush(zlib.Z_FINISH).decode('latin-1') pdf_file_object.Params = PdfDict( CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size) return pdf_file_object diff --git a/weasyprint/tests/test_pdf.py b/weasyprint/tests/test_pdf.py index c28794a58..c9fc0e6cd 100644 --- a/weasyprint/tests/test_pdf.py +++ b/weasyprint/tests/test_pdf.py @@ -12,6 +12,7 @@ import hashlib import io import os +import zlib import cairocffi import pytest @@ -424,27 +425,39 @@ def test_embedded_files(): pdf = PdfReader(fdata=pdf_bytes) embedded = pdf.Root.Names.EmbeddedFiles.Names + assert zlib.decompress( + embedded[1].EF.F.stream.encode('latin-1')) == b'hi there' assert embedded[1].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(b'hi there').hexdigest())) assert embedded[1].F.decode() == '' assert embedded[1].UF.decode() == 'attachment.bin' assert embedded[1].Desc.decode() == 'some file attachment äöü' + assert zlib.decompress( + embedded[3].EF.F.stream.encode('latin-1')) == b'12345678' assert embedded[3].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(adata).hexdigest())) assert embedded[3].UF.decode() == os.path.basename(absolute_tmp_file) + assert zlib.decompress( + embedded[5].EF.F.stream.encode('latin-1')) == b'abcdefgh' assert embedded[5].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(rdata).hexdigest())) assert embedded[5].UF.decode() == os.path.basename(relative_tmp_file) + assert zlib.decompress( + embedded[7].EF.F.stream.encode('latin-1')) == b'oob attachment' assert embedded[7].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(b'oob attachment').hexdigest())) assert embedded[7].Desc.decode() == 'Hello' + assert zlib.decompress( + embedded[9].EF.F.stream.encode('latin-1')) == b'raw URL' assert embedded[9].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(b'raw URL').hexdigest())) + assert zlib.decompress( + embedded[11].EF.F.stream.encode('latin-1')) == b'file like obj' assert embedded[11].EF.F.Params.CheckSum == ( '<{}>'.format(hashlib.md5(b'file like obj').hexdigest()))