From b6d403315a9ca7eee2e23d6791d16d01468b88e6 Mon Sep 17 00:00:00 2001 From: Ernesto Ruge Date: Tue, 31 Dec 2024 15:14:39 +0100 Subject: [PATCH 1/6] custom metadata --- tests/test_api.py | 1 + tests/test_pdf.py | 32 +++++++++++++++++++ weasyprint/document.py | 37 ++++++++++++++++++---- weasyprint/pdf/metadata.py | 63 ++++++++++++++++++++++++++++++-------- 4 files changed, 114 insertions(+), 19 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 48b3efe63..df227c646 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1140,6 +1140,7 @@ def assert_meta(html, **meta): meta.setdefault('attachments', []) meta.setdefault('lang', None) meta.setdefault('custom', {}) + meta.setdefault('rdf_metadata_generator', None) assert vars(FakeHTML(string=html).render().metadata) == meta diff --git a/tests/test_pdf.py b/tests/test_pdf.py index d657e6d9a..e547c7c34 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -716,3 +716,35 @@ def test_bleed(style, media, bleed, trim): assert f'/MediaBox {str(media).replace(",", "")}'.encode() in pdf assert f'/BleedBox {str(bleed).replace(",", "")}'.encode() in pdf assert f'/TrimBox {str(trim).replace(",", "")}'.encode() in pdf + + +@assert_no_logs +def test_default_rdf_metadata() -> None: + pdf_document = FakeHTML(string='<body>test</body>').render() + + pdf_document.metadata.title = None + + pdf_bytes = pdf_document.write_pdf( + pdf_variant='pdf/a-3b', + pdf_identifier=b'example-bytes', + uncompressed_pdf=True, + ) + assert b'<rdf:RDF xmlns:pdf="http://ns.adobe.com/pdf/1.3/"' in pdf_bytes + + +@assert_no_logs +def test_custom_rdf_metadata() -> None: + def rdf_metadata_generator(*args, **kwargs) -> bytes: + return b'TEST_METADATA' + + pdf_document = FakeHTML(string='<body>test</body>').render() + + pdf_document.metadata.title = None + pdf_document.metadata.rdf_metadata_generator = rdf_metadata_generator + + pdf_bytes = pdf_document.write_pdf( + pdf_variant='pdf/a-3b', + pdf_identifier=b'example-bytes', + uncompressed_pdf=True, + ) + assert b'TEST_METADATA' in pdf_bytes diff --git a/weasyprint/document.py b/weasyprint/document.py index f630fe6a5..07406a2af 
100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -4,8 +4,9 @@ import io from hashlib import md5 from pathlib import Path +from typing import Callable, Optional -from . import CSS, DEFAULT_OPTIONS +from . import CSS, DEFAULT_OPTIONS, Attachment from .anchors import gather_anchors, make_page_bookmark_tree from .css import get_all_computed_styles from .css.counters import CounterStyle @@ -105,12 +106,33 @@ class DocumentMetadata: """Meta-information belonging to a whole :class:`Document`. New attributes may be added in future versions of WeasyPrint. - """ - - def __init__(self, title=None, authors=None, description=None, - keywords=None, generator=None, created=None, modified=None, - attachments=None, lang=None, custom=None): + title: Optional[str] + authors: list[str] + description: Optional[str] + keywords: list[str] + generator: Optional[str] + created: Optional[str] + modified: Optional[str] + attachments: list[Attachment] + lang: Optional[str] + custom: dict + rdf_metadata_generator: Optional[Callable] = None + + def __init__( + self, + title: Optional[str] = None, + authors: Optional[list[str]] = None, + description: Optional[str] = None, + keywords: Optional[list[str]] = None, + generator: Optional[str] = None, + created: Optional[str] = None, + modified: Optional[str] = None, + attachments: Optional[list[Attachment]] = None, + lang: Optional[str] = None, + custom: Optional[dict] = None, + rdf_metadata_generator: Optional[Callable] = None, + ): #: The title of the document, as a string or :obj:`None`. #: Extracted from the ``<title>`` element in HTML #: and written to the ``/Title`` info field in PDF. @@ -156,6 +178,9 @@ def __init__(self, title=None, authors=None, description=None, #: Custom metadata, as a dict whose keys are the metadata names and #: values are the metadata values. self.custom = custom or {} + #: Custom RDF metadata generator, which will replace the default generator. + #: The function should return bytes containing an RDF XML. 
+ self.rdf_metadata_generator = rdf_metadata_generator class DiskCache: diff --git a/weasyprint/pdf/metadata.py b/weasyprint/pdf/metadata.py index 7cde641f2..dd94ecbc1 100644 --- a/weasyprint/pdf/metadata.py +++ b/weasyprint/pdf/metadata.py @@ -1,13 +1,20 @@ -"""PDF metadata stream generation.""" +""" +PDF metadata stream generation. +""" +from typing import TYPE_CHECKING from xml.etree.ElementTree import Element, SubElement, register_namespace, tostring import pydyf -from .. import __version__ +from weasyprint import __version__ + +if TYPE_CHECKING: + from weasyprint.document import DocumentMetadata + # XML namespaces used for metadata -NS = { +NS: dict[str, str] = { 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'dc': 'http://purl.org/dc/elements/1.1/', 'xmp': 'http://ns.adobe.com/xap/1.0/', @@ -19,13 +26,50 @@ register_namespace(key, value) -def add_metadata(pdf, metadata, variant, version, conformance, compress): +def add_metadata( + pdf: pydyf.PDF, + metadata: 'DocumentMetadata', + variant: str, + version: str, + conformance: str, + compress: bool, +) -> None: """Add PDF stream of metadata. Described in ISO-32000-1:2008, 14.3.2. """ - # Add metadata + # Add metadata. If `DocumentMetadata` has a generator, we will use it, + # otherwise we will use the default generator. 
+ if metadata.rdf_metadata_generator is None: + xml_data = generate_rdf_metadata(metadata, variant, version, conformance) + else: + xml_data = metadata.rdf_metadata_generator( + metadata=metadata, + variant=variant, + version=version, + conformance=conformance, + ) + + header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' + footer = b'<?xpacket end="r"?>' + stream_content = b'\n'.join((header, xml_data, footer)) + extra = {'Type': '/Metadata', 'Subtype': '/XML'} + metadata = pydyf.Stream([stream_content], extra, compress) + pdf.add_object(metadata) + pdf.catalog['Metadata'] = metadata.reference + + +def generate_rdf_metadata( + metadata: 'DocumentMetadata', + variant: str, + version: str, + conformance: str, +) -> bytes: + """Generates RDF metadata. Might be replaced by + DocumentMetadata.rdf_matadata_generator(). + + """ namespace = f'pdf{variant}id' rdf = Element(f'{{{NS["rdf"]}}}RDF') @@ -82,11 +126,4 @@ def add_metadata(pdf, metadata, variant, version, conformance, compress): element.attrib[f'{{{NS["rdf"]}}}about'] = '' element = SubElement(element, f'{{{NS["xmp"]}}}ModifyDate') element.text = metadata.modified - xml = tostring(rdf, encoding='utf-8') - header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' - footer = b'<?xpacket end="r"?>' - stream_content = b'\n'.join((header, xml, footer)) - extra = {'Type': '/Metadata', 'Subtype': '/XML'} - metadata = pydyf.Stream([stream_content], extra, compress) - pdf.add_object(metadata) - pdf.catalog['Metadata'] = metadata.reference + return tostring(rdf, encoding='utf-8') From 797eb2bd6191fca1ab683ee6d9dffeba6cb76da4 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub <guillaume@courtbouillon.org> Date: Thu, 2 Jan 2025 14:21:27 +0100 Subject: [PATCH 2/6] Remove types and set default function to generate metadata --- tests/test_api.py | 3 ++- tests/test_pdf.py | 16 ++++--------- weasyprint/document.py | 35 +++++----------------------- weasyprint/pdf/metadata.py | 47 ++++++++------------------------------ 4 
files changed, 22 insertions(+), 79 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index df227c646..fc8bbb564 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -18,6 +18,7 @@ from weasyprint import CSS, HTML, __main__, default_url_fetcher from weasyprint.pdf.anchors import resolve_links +from weasyprint.pdf.metadata import generate_rdf_metadata from weasyprint.urls import path2url from .draw import parse_pixels @@ -1140,7 +1141,7 @@ def assert_meta(html, **meta): meta.setdefault('attachments', []) meta.setdefault('lang', None) meta.setdefault('custom', {}) - meta.setdefault('rdf_metadata_generator', None) + meta.setdefault('rdf_metadata_generator', generate_rdf_metadata) assert vars(FakeHTML(string=html).render().metadata) == meta diff --git a/tests/test_pdf.py b/tests/test_pdf.py index e547c7c34..ec76929cc 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -719,22 +719,19 @@ def test_bleed(style, media, bleed, trim): @assert_no_logs -def test_default_rdf_metadata() -> None: +def test_default_rdf_metadata(): pdf_document = FakeHTML(string='<body>test</body>').render() pdf_document.metadata.title = None pdf_bytes = pdf_document.write_pdf( - pdf_variant='pdf/a-3b', - pdf_identifier=b'example-bytes', - uncompressed_pdf=True, - ) + pdf_variant='pdf/a-3b', pdf_identifier=b'example-bytes', uncompressed_pdf=True) assert b'<rdf:RDF xmlns:pdf="http://ns.adobe.com/pdf/1.3/"' in pdf_bytes @assert_no_logs -def test_custom_rdf_metadata() -> None: - def rdf_metadata_generator(*args, **kwargs) -> bytes: +def test_custom_rdf_metadata(): + def rdf_metadata_generator(*args, **kwargs): return b'TEST_METADATA' pdf_document = FakeHTML(string='<body>test</body>').render() @@ -743,8 +740,5 @@ def rdf_metadata_generator(*args, **kwargs) -> bytes: pdf_document.metadata.rdf_metadata_generator = rdf_metadata_generator pdf_bytes = pdf_document.write_pdf( - pdf_variant='pdf/a-3b', - pdf_identifier=b'example-bytes', - uncompressed_pdf=True, - ) + 
pdf_variant='pdf/a-3b', pdf_identifier=b'example-bytes', uncompressed_pdf=True) assert b'TEST_METADATA' in pdf_bytes diff --git a/weasyprint/document.py b/weasyprint/document.py index 07406a2af..9ad74e295 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -4,9 +4,8 @@ import io from hashlib import md5 from pathlib import Path -from typing import Callable, Optional -from . import CSS, DEFAULT_OPTIONS, Attachment +from . import CSS, DEFAULT_OPTIONS from .anchors import gather_anchors, make_page_bookmark_tree from .css import get_all_computed_styles from .css.counters import CounterStyle @@ -19,6 +18,7 @@ from .logger import PROGRESS_LOGGER from .matrix import Matrix from .pdf import VARIANTS, generate_pdf +from .pdf.metadata import generate_rdf_metadata from .text.fonts import FontConfiguration @@ -107,32 +107,9 @@ class DocumentMetadata: New attributes may be added in future versions of WeasyPrint. """ - title: Optional[str] - authors: list[str] - description: Optional[str] - keywords: list[str] - generator: Optional[str] - created: Optional[str] - modified: Optional[str] - attachments: list[Attachment] - lang: Optional[str] - custom: dict - rdf_metadata_generator: Optional[Callable] = None - - def __init__( - self, - title: Optional[str] = None, - authors: Optional[list[str]] = None, - description: Optional[str] = None, - keywords: Optional[list[str]] = None, - generator: Optional[str] = None, - created: Optional[str] = None, - modified: Optional[str] = None, - attachments: Optional[list[Attachment]] = None, - lang: Optional[str] = None, - custom: Optional[dict] = None, - rdf_metadata_generator: Optional[Callable] = None, - ): + def __init__(self, title=None, authors= None, description=None, keywords=None, + generator=None, created=None, modified=None, attachments=None, + lang=None, custom=None, rdf_metadata_generator=None): #: The title of the document, as a string or :obj:`None`. 
#: Extracted from the ``<title>`` element in HTML #: and written to the ``/Title`` info field in PDF. @@ -180,7 +157,7 @@ def __init__( self.custom = custom or {} #: Custom RDF metadata generator, which will replace the default generator. #: The function should return bytes containing an RDF XML. - self.rdf_metadata_generator = rdf_metadata_generator + self.rdf_metadata_generator = rdf_metadata_generator or generate_rdf_metadata class DiskCache: diff --git a/weasyprint/pdf/metadata.py b/weasyprint/pdf/metadata.py index dd94ecbc1..d9a3fef07 100644 --- a/weasyprint/pdf/metadata.py +++ b/weasyprint/pdf/metadata.py @@ -1,20 +1,13 @@ -""" -PDF metadata stream generation. -""" +"""PDF metadata stream generation.""" -from typing import TYPE_CHECKING from xml.etree.ElementTree import Element, SubElement, register_namespace, tostring import pydyf -from weasyprint import __version__ - -if TYPE_CHECKING: - from weasyprint.document import DocumentMetadata - +from .. import __version__ # XML namespaces used for metadata -NS: dict[str, str] = { +NS = { 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'dc': 'http://purl.org/dc/elements/1.1/', 'xmp': 'http://ns.adobe.com/xap/1.0/', @@ -26,33 +19,15 @@ register_namespace(key, value) -def add_metadata( - pdf: pydyf.PDF, - metadata: 'DocumentMetadata', - variant: str, - version: str, - conformance: str, - compress: bool, -) -> None: +def add_metadata(pdf, metadata, variant, version, conformance, compress): """Add PDF stream of metadata. Described in ISO-32000-1:2008, 14.3.2. """ - # Add metadata. If `DocumentMetadata` has a generator, we will use it, - # otherwise we will use the default generator. 
- if metadata.rdf_metadata_generator is None: - xml_data = generate_rdf_metadata(metadata, variant, version, conformance) - else: - xml_data = metadata.rdf_metadata_generator( - metadata=metadata, - variant=variant, - version=version, - conformance=conformance, - ) - header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' footer = b'<?xpacket end="r"?>' + xml_data = metadata.rdf_metadata_generator(metadata, variant, version, conformance) stream_content = b'\n'.join((header, xml_data, footer)) extra = {'Type': '/Metadata', 'Subtype': '/XML'} metadata = pydyf.Stream([stream_content], extra, compress) @@ -60,14 +35,10 @@ def add_metadata( pdf.catalog['Metadata'] = metadata.reference -def generate_rdf_metadata( - metadata: 'DocumentMetadata', - variant: str, - version: str, - conformance: str, -) -> bytes: - """Generates RDF metadata. Might be replaced by - DocumentMetadata.rdf_matadata_generator(). +def generate_rdf_metadata(metadata, variant, version, conformance): + """Generate RDF metadata. + + Might be replaced by DocumentMetadata.rdf_matadata_generator(). """ namespace = f'pdf{variant}id' From 7e8c018ccd253799cc66ac9e15f49841e81dbdd5 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub <guillaume@courtbouillon.org> Date: Tue, 14 Jan 2025 05:27:51 +0100 Subject: [PATCH 3/6] Allow PDF attachment name to be given explicitly --- weasyprint/__init__.py | 6 +++++- weasyprint/pdf/anchors.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 40635bf0a..a3acac430 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -318,6 +318,9 @@ class Attachment: HTML specific arguments (``encoding`` and ``media_type``) are not supported. + :param str name: + The name of the attachment to be included in the PDF document. + May be :obj:`None`. :param str description: A description of the attachment to be included in the PDF document. May be :obj:`None`. 
@@ -335,11 +338,12 @@ class Attachment: """ def __init__(self, guess=None, filename=None, url=None, file_obj=None, string=None, base_url=None, url_fetcher=default_url_fetcher, - description=None, created=None, modified=None, + name=None, description=None, created=None, modified=None, relationship='Unspecified'): self.source = _select_source( guess, filename, url, file_obj, string, base_url=base_url, url_fetcher=url_fetcher) + self.name = name self.description = description self.relationship = relationship self.md5 = None diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index ebf489cdc..b47b39b28 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -351,7 +351,9 @@ def write_pdf_attachment(pdf, attachment, compress): # TODO: Use the result object from a URL fetch operation to provide more # details on the possible filename and MIME type. - if url and urlsplit(url).path: + if attachment.name: + filename = attachment.name + elif url and urlsplit(url).path: filename = basename(unquote(urlsplit(url).path)) else: filename = 'attachment.bin' From 410ce5456075c1ac0ea6545bf9c5a6068d3afd5d Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub <guillaume@courtbouillon.org> Date: Sun, 19 Jan 2025 12:08:11 +0100 Subject: [PATCH 4/6] Store attachment name in list of attachments --- tests/test_api.py | 8 ++++---- tests/test_pdf.py | 2 +- weasyprint/pdf/__init__.py | 2 +- weasyprint/pdf/anchors.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index fc8bbb564..2d5e29694 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -415,14 +415,14 @@ def test_command_line_render(tmp_path): os.environ.pop('SOURCE_DATE_EPOCH') stdout = _run('combined.html --uncompressed-pdf -') - assert stdout.count(b'attachment') == 0 + assert stdout.count(b'Filespec') == 0 stdout = _run('combined.html --uncompressed-pdf -') - assert stdout.count(b'attachment') == 0 + assert stdout.count(b'Filespec') == 0 stdout 
= _run('-a pattern.png --uncompressed-pdf combined.html -') - assert stdout.count(b'attachment') == 1 + assert stdout.count(b'Filespec') == 1 stdout = _run( '-a style.css -a pattern.png --uncompressed-pdf combined.html -') - assert stdout.count(b'attachment') == 2 + assert stdout.count(b'Filespec') == 2 _run('combined.html out23.pdf --timeout 30') assert (tmp_path / 'out23.pdf').read_bytes() == pdf_bytes diff --git a/tests/test_pdf.py b/tests/test_pdf.py index ec76929cc..aad87b300 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -598,7 +598,7 @@ def test_embedded_files_attachments(tmp_path): ] ) assert f'<{hashlib.md5(b"hi there").hexdigest()}>'.encode() in pdf - assert b'/F ()' in pdf + assert b'/F (attachment.bin)' in pdf assert b'/UF (attachment.bin)' in pdf name = BOM_UTF16_BE + 'some file attachment äöü'.encode('utf-16-be') assert b'/Desc <' + name.hex().encode() + b'>' in pdf diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 27dce58ce..65caf72de 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -271,7 +271,7 @@ def generate_pdf(document, target, zoom, **options): if pdf_attachments: content = pydyf.Dictionary({'Names': pydyf.Array()}) for i, pdf_attachment in enumerate(pdf_attachments): - content['Names'].append(pydyf.String(f'attachment{i}')) + content['Names'].append(pdf_attachment['F']) content['Names'].append(pdf_attachment.reference) pdf.add_object(content) if 'Names' not in pdf.catalog: diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index b47b39b28..2c3858ac7 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -378,7 +378,7 @@ def write_pdf_attachment(pdf, attachment, compress): pdf_attachment = pydyf.Dictionary({ 'Type': '/Filespec', - 'F': pydyf.String(), + 'F': pydyf.String(filename.encode(errors='ignore')), 'UF': pydyf.String(filename), 'EF': pydyf.Dictionary({'F': file_stream.reference}), 'Desc': pydyf.String(attachment.description or ''), From 
2f70ca3ce18360c8f778d76135fbc8545c22d7a7 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub <guillaume@courtbouillon.org> Date: Sun, 19 Jan 2025 17:29:17 +0100 Subject: [PATCH 5/6] Call RDF generation function generate_rdf_metadata --- tests/test_api.py | 2 +- tests/test_pdf.py | 4 ++-- weasyprint/document.py | 4 ++-- weasyprint/pdf/metadata.py | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 2d5e29694..5ee0644b0 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1141,7 +1141,7 @@ def assert_meta(html, **meta): meta.setdefault('attachments', []) meta.setdefault('lang', None) meta.setdefault('custom', {}) - meta.setdefault('rdf_metadata_generator', generate_rdf_metadata) + meta.setdefault('generate_rdf_metadata', generate_rdf_metadata) assert vars(FakeHTML(string=html).render().metadata) == meta diff --git a/tests/test_pdf.py b/tests/test_pdf.py index aad87b300..ee24535da 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -731,13 +731,13 @@ def test_default_rdf_metadata(): @assert_no_logs def test_custom_rdf_metadata(): - def rdf_metadata_generator(*args, **kwargs): + def generate_rdf_metadata(*args, **kwargs): return b'TEST_METADATA' pdf_document = FakeHTML(string='<body>test</body>').render() pdf_document.metadata.title = None - pdf_document.metadata.rdf_metadata_generator = rdf_metadata_generator + pdf_document.metadata.generate_rdf_metadata = generate_rdf_metadata pdf_bytes = pdf_document.write_pdf( pdf_variant='pdf/a-3b', pdf_identifier=b'example-bytes', uncompressed_pdf=True) diff --git a/weasyprint/document.py b/weasyprint/document.py index 9ad74e295..235cf2c18 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -109,7 +109,7 @@ class DocumentMetadata: """ def __init__(self, title=None, authors= None, description=None, keywords=None, generator=None, created=None, modified=None, attachments=None, - lang=None, custom=None, rdf_metadata_generator=None): + lang=None, 
custom=None, generate_rdf_metadata=generate_rdf_metadata): #: The title of the document, as a string or :obj:`None`. #: Extracted from the ``<title>`` element in HTML #: and written to the ``/Title`` info field in PDF. @@ -157,7 +157,7 @@ def __init__(self, title=None, authors= None, description=None, keywords=None, self.custom = custom or {} #: Custom RDF metadata generator, which will replace the default generator. #: The function should return bytes containing an RDF XML. - self.rdf_metadata_generator = rdf_metadata_generator or generate_rdf_metadata + self.generate_rdf_metadata = generate_rdf_metadata class DiskCache: diff --git a/weasyprint/pdf/metadata.py b/weasyprint/pdf/metadata.py index d9a3fef07..12ce45282 100644 --- a/weasyprint/pdf/metadata.py +++ b/weasyprint/pdf/metadata.py @@ -27,7 +27,7 @@ def add_metadata(pdf, metadata, variant, version, conformance, compress): """ header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' footer = b'<?xpacket end="r"?>' - xml_data = metadata.rdf_metadata_generator(metadata, variant, version, conformance) + xml_data = metadata.generate_rdf_metadata(metadata, variant, version, conformance) stream_content = b'\n'.join((header, xml_data, footer)) extra = {'Type': '/Metadata', 'Subtype': '/XML'} metadata = pydyf.Stream([stream_content], extra, compress) @@ -36,9 +36,9 @@ def add_metadata(pdf, metadata, variant, version, conformance, compress): def generate_rdf_metadata(metadata, variant, version, conformance): - """Generate RDF metadata. + """Generate RDF metadata as a bytestring. - Might be replaced by DocumentMetadata.rdf_matadata_generator(). + Might be replaced by DocumentMetadata.generate_rdf_metadata(). 
""" namespace = f'pdf{variant}id' From 6f8a2f52f237b9a9dc277c4ce6b7349be73ec929 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub <guillaume@courtbouillon.org> Date: Sun, 19 Jan 2025 18:37:10 +0100 Subject: [PATCH 6/6] Add documentation for Factur-X / ZUGFeRD --- docs/common_use_cases.rst | 185 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 179 insertions(+), 6 deletions(-) diff --git a/docs/common_use_cases.rst b/docs/common_use_cases.rst index 8d506826f..59962b932 100644 --- a/docs/common_use_cases.rst +++ b/docs/common_use_cases.rst @@ -102,8 +102,8 @@ such as page numbers, headers, etc. Read more about the page_ at-rule. .. _page: https://developer.mozilla.org/en-US/docs/Web/CSS/@page -Generate PDFs Specialized for Accessibility (PDF/UA) and Archiving (PDF/A) --------------------------------------------------------------------------- +Generate Specialized PDFs +------------------------- WeasyPrint can generate different PDF variants, including PDF/UA and PDF/A. The feature is available by using the ``--pdf-variant`` CLI option, or the @@ -125,8 +125,8 @@ Even if WeasyPrint tries to generate valid documents, the result is not guaranteed: the HTML, CSS and PDF features chosen by the user must follow the limitations defined by the different specifications. -PDF/A -..... +PDF/A (Archiving) +................. PDF/A documents are specialized for archiving purposes. They are a simple subset of PDF, with a lot of limitations: no audio, video or JavaScript, @@ -145,8 +145,8 @@ valid PDF identifier, but you can provide your own with the If your document includes images, you must set the ``image-rendering: crisp-edges`` property to avoid anti-aliasing, that is forbidden by PDF/A. -PDF/UA -...... +PDF/UA (Universal Accessibility) +................................ PDF/UA documents are specialized for accessibility purposes. They include extra metadata that define document information and content structure. 
@@ -158,6 +158,179 @@ also used to define the order of the PDF content. Some information is required in your HTML file, including a ``<title>`` tag, and a ``lang`` attribute set on the ``<html>`` tag. +Factur-X / ZUGFeRD (Electronic Invoices) +........................................ + +Factur-X / ZUGFeRD is a Franco-German standard for hybrid e-invoices, the first +implementation of the European Semantic Standard EN 16931. It enables users to +include normalized metadata in PDF invoices, such as company information or +invoice amounts, so that compatible software can automatically read this +information. This standard is based on PDF/A-3b. + +WeasyPrint can generate Factur-X / ZUGFeRD documents. Invoice metadata must be +generated by the user and included in the PDF document when rendered. Two +different metadata files are required: + +- the first one is RDF metadata, containing document metadata and PDF/A + extension information; +- the second one is Factur-X / ZUGFeRD metadata, containing invoice amounts, + plus seller and buyer information. + +Here is an example of Factur-X document generation. + +``rdf.xml``: + +.. 
code-block:: xml + + <x:xmpmeta + xmlns:x="adobe:ns:meta/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:pdf="http://ns.adobe.com/pdf/1.3/" + xmlns:fx="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#" + xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/" + xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#" + xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#"> + <!-- placeholder --> + <rdf:RDF> + <rdf:Description rdf:about=""> + <fx:ConformanceLevel>MINIMUM</fx:ConformanceLevel> + <fx:DocumentFileName>factur-x.xml</fx:DocumentFileName> + <fx:DocumentType>INVOICE</fx:DocumentType> + <fx:Version>1.0</fx:Version> + </rdf:Description> + <rdf:Description rdf:about=""> + <pdfaExtension:schemas> + <rdf:Bag> + <rdf:li rdf:parseType="Resource"> + <pdfaSchema:schema>Factur-X PDFA Extension Schema</pdfaSchema:schema> + <pdfaSchema:namespaceURI>urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#</pdfaSchema:namespaceURI> + <pdfaSchema:prefix>fx</pdfaSchema:prefix> + <pdfaSchema:property> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>DocumentFileName</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>name of the embedded XML invoice file</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>DocumentType</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>INVOICE</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>Version</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>The actual version of the Factur-X XML schema</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + 
<pdfaProperty:name>ConformanceLevel</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>The conformance level of the embedded Factur-X data</pdfaProperty:description> + </rdf:li> + </rdf:Seq> + </pdfaSchema:property> + </rdf:li> + </rdf:Bag> + </pdfaExtension:schemas> + </rdf:Description> + </rdf:RDF> + </x:xmpmeta> + +``factur-x.xml``: + +.. code-block:: xml + + <rsm:CrossIndustryInvoice + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:qdt="urn:un:unece:uncefact:data:standard:QualifiedDataType:100" + xmlns:udt="urn:un:unece:uncefact:data:standard:UnqualifiedDataType:100" + xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100" + xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100"> + <rsm:ExchangedDocumentContext> + <ram:BusinessProcessSpecifiedDocumentContextParameter> + <ram:ID>A1</ram:ID> + </ram:BusinessProcessSpecifiedDocumentContextParameter> + <ram:GuidelineSpecifiedDocumentContextParameter> + <ram:ID>urn:factur-x.eu:1p0:minimum</ram:ID> + </ram:GuidelineSpecifiedDocumentContextParameter> + </rsm:ExchangedDocumentContext> + <rsm:ExchangedDocument> + <ram:ID>123</ram:ID> + <ram:TypeCode>380</ram:TypeCode> + <ram:IssueDateTime> + <udt:DateTimeString format="102">20200131</udt:DateTimeString> + </ram:IssueDateTime> + </rsm:ExchangedDocument> + <rsm:SupplyChainTradeTransaction> + <ram:ApplicableHeaderTradeAgreement> + <ram:BuyerReference>Buyer</ram:BuyerReference> + <ram:SellerTradeParty> + <ram:Name>Supplier Corp</ram:Name> + <ram:SpecifiedLegalOrganization> + <ram:ID schemeID="0002">123456782</ram:ID> + </ram:SpecifiedLegalOrganization> + <ram:PostalTradeAddress> + <ram:CountryID>FR</ram:CountryID> + </ram:PostalTradeAddress> + <ram:SpecifiedTaxRegistration> + <ram:ID schemeID="VA">FR11123456782</ram:ID> + </ram:SpecifiedTaxRegistration> + </ram:SellerTradeParty> + 
<ram:BuyerTradeParty> + <ram:Name>Buyer Corp</ram:Name> + <ram:SpecifiedLegalOrganization> + <ram:ID schemeID="0002">987654324</ram:ID> + </ram:SpecifiedLegalOrganization> + </ram:BuyerTradeParty> + <ram:BuyerOrderReferencedDocument > + <ram:IssuerAssignedID>456</ram:IssuerAssignedID> + </ram:BuyerOrderReferencedDocument> + </ram:ApplicableHeaderTradeAgreement> + <ram:ApplicableHeaderTradeDelivery/> + <ram:ApplicableHeaderTradeSettlement> + <ram:InvoiceCurrencyCode>EUR</ram:InvoiceCurrencyCode> + <ram:SpecifiedTradeSettlementHeaderMonetarySummation> + <ram:TaxBasisTotalAmount>100.00</ram:TaxBasisTotalAmount> + <ram:TaxTotalAmount currencyID="EUR">20.00</ram:TaxTotalAmount> + <ram:GrandTotalAmount>120.00</ram:GrandTotalAmount> + <ram:DuePayableAmount>120.00</ram:DuePayableAmount> + </ram:SpecifiedTradeSettlementHeaderMonetarySummation> + </ram:ApplicableHeaderTradeSettlement> + </rsm:SupplyChainTradeTransaction> + </rsm:CrossIndustryInvoice> + +``invoice.py``: + +.. code-block:: python + + from pathlib import Path + from weasyprint import Attachment, HTML + + def generate_rdf_metadata(metadata, variant, version, conformance): + original_rdf = generate_original_rdf_metadata(metadata, variant, version, conformance) + return Path("rdf.xml").read_bytes().replace(b"<!-- placeholder -->", original_rdf) + + document = HTML(string="<h1>Invoice</h1>").render() + generate_original_rdf_metadata = document.metadata.generate_rdf_metadata + + factur_x_xml = Path("factur-x.xml").read_text() + attachment = Attachment(string=factur_x_xml, name="factur-x.xml", relationship="Data") + document.metadata.attachments = [attachment] + + document.metadata.generate_rdf_metadata = generate_rdf_metadata + document.write_pdf("invoice.pdf", pdf_variant="pdf/a-3b") + +Of course, the content of these files has to be adapted to the content of real +invoices. Using XML generators instead of plain text manipulation is also +highly recommended. 
+ +A more detailed blog article is available on `Binary Butterfly’s website +<https://binary-butterfly.de/artikel/factur-x-zugferd-e-invoices-with-python/>`_. + Include PDF Forms -----------------