diff --git a/docs/common_use_cases.rst b/docs/common_use_cases.rst index 8d506826f..59962b932 100644 --- a/docs/common_use_cases.rst +++ b/docs/common_use_cases.rst @@ -102,8 +102,8 @@ such as page numbers, headers, etc. Read more about the page_ at-rule. .. _page: https://developer.mozilla.org/en-US/docs/Web/CSS/@page -Generate PDFs Specialized for Accessibility (PDF/UA) and Archiving (PDF/A) --------------------------------------------------------------------------- +Generate Specialized PDFs +------------------------- WeasyPrint can generate different PDF variants, including PDF/UA and PDF/A. The feature is available by using the ``--pdf-variant`` CLI option, or the @@ -125,8 +125,8 @@ Even if WeasyPrint tries to generate valid documents, the result is not guaranteed: the HTML, CSS and PDF features chosen by the user must follow the limitations defined by the different specifications. -PDF/A -..... +PDF/A (Archiving) +................. PDF/A documents are specialized for archiving purposes. They are a simple subset of PDF, with a lot of limitations: no audio, video or JavaScript, @@ -145,8 +145,8 @@ valid PDF identifier, but you can provide your own with the If your document includes images, you must set the ``image-rendering: crisp-edges`` property to avoid anti-aliasing, that is forbidden by PDF/A. -PDF/UA -...... +PDF/UA (Universal Accessibility) +................................ PDF/UA documents are specialized for accessibility purposes. They include extra metadata that define document information and content structure. @@ -158,6 +158,179 @@ also used to define the order of the PDF content. Some information is required in your HTML file, including a ``<title>`` tag, and a ``lang`` attribute set on the ``<html>`` tag. +Factur-X / ZUGFeRD (Electronic Invoices) +........................................ + +Factur-X / ZUGFeRD is a Franco-German standard for hybrid e-invoices, the first +implementation of the European Semantic Standard EN 16931.
It enables users to +include normalized metadata in PDF invoices, such as company information or +invoice amounts, so that compatible software can automatically read this +information. This standard is based on PDF/A-3b. + +WeasyPrint can generate Factur-X / ZUGFeRD documents. Invoice metadata must be +generated by the user and included in the PDF document when rendered. Two +different metadata files are required: + +- the first one is RDF metadata, containing document metadata and PDF/A + extension information; +- the second one is Factur-X / ZUGFeRD metadata, containing invoice amounts, + plus seller and buyer information. + +Here is an example of Factur-X document generation. + +``rdf.xml``: + +.. code-block:: xml + + <x:xmpmeta + xmlns:x="adobe:ns:meta/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:pdf="http://ns.adobe.com/pdf/1.3/" + xmlns:fx="urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#" + xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/" + xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#" + xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#"> + <!-- placeholder --> + <rdf:RDF> + <rdf:Description rdf:about=""> + <fx:ConformanceLevel>MINIMUM</fx:ConformanceLevel> + <fx:DocumentFileName>factur-x.xml</fx:DocumentFileName> + <fx:DocumentType>INVOICE</fx:DocumentType> + <fx:Version>1.0</fx:Version> + </rdf:Description> + <rdf:Description rdf:about=""> + <pdfaExtension:schemas> + <rdf:Bag> + <rdf:li rdf:parseType="Resource"> + <pdfaSchema:schema>Factur-X PDFA Extension Schema</pdfaSchema:schema> + <pdfaSchema:namespaceURI>urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#</pdfaSchema:namespaceURI> + <pdfaSchema:prefix>fx</pdfaSchema:prefix> + <pdfaSchema:property> + <rdf:Seq> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>DocumentFileName</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> +
<pdfaProperty:description>name of the embedded XML invoice file</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>DocumentType</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>INVOICE</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>Version</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>The actual version of the Factur-X XML schema</pdfaProperty:description> + </rdf:li> + <rdf:li rdf:parseType="Resource"> + <pdfaProperty:name>ConformanceLevel</pdfaProperty:name> + <pdfaProperty:valueType>Text</pdfaProperty:valueType> + <pdfaProperty:category>external</pdfaProperty:category> + <pdfaProperty:description>The conformance level of the embedded Factur-X data</pdfaProperty:description> + </rdf:li> + </rdf:Seq> + </pdfaSchema:property> + </rdf:li> + </rdf:Bag> + </pdfaExtension:schemas> + </rdf:Description> + </rdf:RDF> + </x:xmpmeta> + +``factur-x.xml``: + +.. 
code-block:: xml + + <rsm:CrossIndustryInvoice + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:qdt="urn:un:unece:uncefact:data:standard:QualifiedDataType:100" + xmlns:udt="urn:un:unece:uncefact:data:standard:UnqualifiedDataType:100" + xmlns:rsm="urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100" + xmlns:ram="urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100"> + <rsm:ExchangedDocumentContext> + <ram:BusinessProcessSpecifiedDocumentContextParameter> + <ram:ID>A1</ram:ID> + </ram:BusinessProcessSpecifiedDocumentContextParameter> + <ram:GuidelineSpecifiedDocumentContextParameter> + <ram:ID>urn:factur-x.eu:1p0:minimum</ram:ID> + </ram:GuidelineSpecifiedDocumentContextParameter> + </rsm:ExchangedDocumentContext> + <rsm:ExchangedDocument> + <ram:ID>123</ram:ID> + <ram:TypeCode>380</ram:TypeCode> + <ram:IssueDateTime> + <udt:DateTimeString format="102">20200131</udt:DateTimeString> + </ram:IssueDateTime> + </rsm:ExchangedDocument> + <rsm:SupplyChainTradeTransaction> + <ram:ApplicableHeaderTradeAgreement> + <ram:BuyerReference>Buyer</ram:BuyerReference> + <ram:SellerTradeParty> + <ram:Name>Supplyer Corp</ram:Name> + <ram:SpecifiedLegalOrganization> + <ram:ID schemeID="0002">123456782</ram:ID> + </ram:SpecifiedLegalOrganization> + <ram:PostalTradeAddress> + <ram:CountryID>FR</ram:CountryID> + </ram:PostalTradeAddress> + <ram:SpecifiedTaxRegistration> + <ram:ID schemeID="VA">FR11123456782</ram:ID> + </ram:SpecifiedTaxRegistration> + </ram:SellerTradeParty> + <ram:BuyerTradeParty> + <ram:Name>Buyer Corp</ram:Name> + <ram:SpecifiedLegalOrganization> + <ram:ID schemeID="0002">987654324</ram:ID> + </ram:SpecifiedLegalOrganization> + </ram:BuyerTradeParty> + <ram:BuyerOrderReferencedDocument > + <ram:IssuerAssignedID>456</ram:IssuerAssignedID> + </ram:BuyerOrderReferencedDocument> + </ram:ApplicableHeaderTradeAgreement> + <ram:ApplicableHeaderTradeDelivery/> + <ram:ApplicableHeaderTradeSettlement> + 
<ram:InvoiceCurrencyCode>EUR</ram:InvoiceCurrencyCode> + <ram:SpecifiedTradeSettlementHeaderMonetarySummation> + <ram:TaxBasisTotalAmount>100.00</ram:TaxBasisTotalAmount> + <ram:TaxTotalAmount currencyID="EUR">20.00</ram:TaxTotalAmount> + <ram:GrandTotalAmount>120.00</ram:GrandTotalAmount> + <ram:DuePayableAmount>120.00</ram:DuePayableAmount> + </ram:SpecifiedTradeSettlementHeaderMonetarySummation> + </ram:ApplicableHeaderTradeSettlement> + </rsm:SupplyChainTradeTransaction> + </rsm:CrossIndustryInvoice> + +``invoice.py``: + +.. code-block:: python + + from pathlib import Path + from weasyprint import Attachment, HTML + + def generate_rdf_metadata(metadata, variant, version, conformance): + original_rdf = generate_original_rdf_metadata(metadata, variant, version, conformance) + return Path("rdf.xml").read_bytes().replace(b"<!-- placeholder -->", original_rdf) + + document = HTML(string="<h1>Invoice</h1>").render() + generate_original_rdf_metadata = document.metadata.generate_rdf_metadata + + factur_x_xml = Path("factur-x.xml").read_text() + attachment = Attachment(string=factur_x_xml, name="factur-x.xml", relationship="Data") + document.metadata.attachments = [attachment] + + document.metadata.generate_rdf_metadata = generate_rdf_metadata + document.write_pdf("invoice.pdf", pdf_variant="pdf/a-3b") + +Of course, the content of these files has to be adapted to the content of real +invoices. Using XML generators instead of plain text manipulation is also +highly recommended. + +A more detailed blog article is available on `Binary Butterfly’s website +<https://binary-butterfly.de/artikel/factur-x-zugferd-e-invoices-with-python/>`_. 
+ Include PDF Forms ----------------- diff --git a/tests/test_api.py b/tests/test_api.py index 48b3efe63..5ee0644b0 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -18,6 +18,7 @@ from weasyprint import CSS, HTML, __main__, default_url_fetcher from weasyprint.pdf.anchors import resolve_links +from weasyprint.pdf.metadata import generate_rdf_metadata from weasyprint.urls import path2url from .draw import parse_pixels @@ -414,14 +415,14 @@ def test_command_line_render(tmp_path): os.environ.pop('SOURCE_DATE_EPOCH') stdout = _run('combined.html --uncompressed-pdf -') - assert stdout.count(b'attachment') == 0 + assert stdout.count(b'Filespec') == 0 stdout = _run('combined.html --uncompressed-pdf -') - assert stdout.count(b'attachment') == 0 + assert stdout.count(b'Filespec') == 0 stdout = _run('-a pattern.png --uncompressed-pdf combined.html -') - assert stdout.count(b'attachment') == 1 + assert stdout.count(b'Filespec') == 1 stdout = _run( '-a style.css -a pattern.png --uncompressed-pdf combined.html -') - assert stdout.count(b'attachment') == 2 + assert stdout.count(b'Filespec') == 2 _run('combined.html out23.pdf --timeout 30') assert (tmp_path / 'out23.pdf').read_bytes() == pdf_bytes @@ -1140,6 +1141,7 @@ def assert_meta(html, **meta): meta.setdefault('attachments', []) meta.setdefault('lang', None) meta.setdefault('custom', {}) + meta.setdefault('generate_rdf_metadata', generate_rdf_metadata) assert vars(FakeHTML(string=html).render().metadata) == meta diff --git a/tests/test_pdf.py b/tests/test_pdf.py index d657e6d9a..ee24535da 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -598,7 +598,7 @@ def test_embedded_files_attachments(tmp_path): ] ) assert f'<{hashlib.md5(b"hi there").hexdigest()}>'.encode() in pdf - assert b'/F ()' in pdf + assert b'/F (attachment.bin)' in pdf assert b'/UF (attachment.bin)' in pdf name = BOM_UTF16_BE + 'some file attachment äöü'.encode('utf-16-be') assert b'/Desc <' + name.hex().encode() + b'>' in pdf @@ -716,3 +716,29 
@@ def test_bleed(style, media, bleed, trim): assert f'/MediaBox {str(media).replace(",", "")}'.encode() in pdf assert f'/BleedBox {str(bleed).replace(",", "")}'.encode() in pdf assert f'/TrimBox {str(trim).replace(",", "")}'.encode() in pdf + + +@assert_no_logs +def test_default_rdf_metadata(): + pdf_document = FakeHTML(string='<body>test</body>').render() + + pdf_document.metadata.title = None + + pdf_bytes = pdf_document.write_pdf( + pdf_variant='pdf/a-3b', pdf_identifier=b'example-bytes', uncompressed_pdf=True) + assert b'<rdf:RDF xmlns:pdf="http://ns.adobe.com/pdf/1.3/"' in pdf_bytes + + +@assert_no_logs +def test_custom_rdf_metadata(): + def generate_rdf_metadata(*args, **kwargs): + return b'TEST_METADATA' + + pdf_document = FakeHTML(string='<body>test</body>').render() + + pdf_document.metadata.title = None + pdf_document.metadata.generate_rdf_metadata = generate_rdf_metadata + + pdf_bytes = pdf_document.write_pdf( + pdf_variant='pdf/a-3b', pdf_identifier=b'example-bytes', uncompressed_pdf=True) + assert b'TEST_METADATA' in pdf_bytes diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 40635bf0a..a3acac430 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -318,6 +318,9 @@ class Attachment: HTML specific arguments (``encoding`` and ``media_type``) are not supported. + :param str name: + The name of the attachment to be included in the PDF document. + May be :obj:`None`. :param str description: A description of the attachment to be included in the PDF document. May be :obj:`None`. 
@@ -335,11 +338,12 @@ class Attachment: """ def __init__(self, guess=None, filename=None, url=None, file_obj=None, string=None, base_url=None, url_fetcher=default_url_fetcher, - description=None, created=None, modified=None, + name=None, description=None, created=None, modified=None, relationship='Unspecified'): self.source = _select_source( guess, filename, url, file_obj, string, base_url=base_url, url_fetcher=url_fetcher) + self.name = name self.description = description self.relationship = relationship self.md5 = None diff --git a/weasyprint/document.py b/weasyprint/document.py index f630fe6a5..235cf2c18 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -18,6 +18,7 @@ from .logger import PROGRESS_LOGGER from .matrix import Matrix from .pdf import VARIANTS, generate_pdf +from .pdf.metadata import generate_rdf_metadata from .text.fonts import FontConfiguration @@ -105,12 +106,10 @@ class DocumentMetadata: """Meta-information belonging to a whole :class:`Document`. New attributes may be added in future versions of WeasyPrint. - """ - - def __init__(self, title=None, authors=None, description=None, - keywords=None, generator=None, created=None, modified=None, - attachments=None, lang=None, custom=None): + def __init__(self, title=None, authors= None, description=None, keywords=None, + generator=None, created=None, modified=None, attachments=None, + lang=None, custom=None, generate_rdf_metadata=generate_rdf_metadata): #: The title of the document, as a string or :obj:`None`. #: Extracted from the ``<title>`` element in HTML #: and written to the ``/Title`` info field in PDF. @@ -156,6 +155,9 @@ def __init__(self, title=None, authors=None, description=None, #: Custom metadata, as a dict whose keys are the metadata names and #: values are the metadata values. self.custom = custom or {} + #: Custom RDF metadata generator, which will replace the default generator. + #: The function should return bytes containing an RDF XML. 
+ self.generate_rdf_metadata = generate_rdf_metadata class DiskCache: diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 27dce58ce..65caf72de 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -271,7 +271,7 @@ def generate_pdf(document, target, zoom, **options): if pdf_attachments: content = pydyf.Dictionary({'Names': pydyf.Array()}) for i, pdf_attachment in enumerate(pdf_attachments): - content['Names'].append(pydyf.String(f'attachment{i}')) + content['Names'].append(pdf_attachment['F']) content['Names'].append(pdf_attachment.reference) pdf.add_object(content) if 'Names' not in pdf.catalog: diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index ebf489cdc..2c3858ac7 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -351,7 +351,9 @@ def write_pdf_attachment(pdf, attachment, compress): # TODO: Use the result object from a URL fetch operation to provide more # details on the possible filename and MIME type. - if url and urlsplit(url).path: + if attachment.name: + filename = attachment.name + elif url and urlsplit(url).path: filename = basename(unquote(urlsplit(url).path)) else: filename = 'attachment.bin' @@ -376,7 +378,7 @@ def write_pdf_attachment(pdf, attachment, compress): pdf_attachment = pydyf.Dictionary({ 'Type': '/Filespec', - 'F': pydyf.String(), + 'F': pydyf.String(filename.encode(errors='ignore')), 'UF': pydyf.String(filename), 'EF': pydyf.Dictionary({'F': file_stream.reference}), 'Desc': pydyf.String(attachment.description or ''), diff --git a/weasyprint/pdf/metadata.py b/weasyprint/pdf/metadata.py index 7cde641f2..12ce45282 100644 --- a/weasyprint/pdf/metadata.py +++ b/weasyprint/pdf/metadata.py @@ -25,7 +25,22 @@ def add_metadata(pdf, metadata, variant, version, conformance, compress): Described in ISO-32000-1:2008, 14.3.2. 
""" - # Add metadata + header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' + footer = b'<?xpacket end="r"?>' + xml_data = metadata.generate_rdf_metadata(metadata, variant, version, conformance) + stream_content = b'\n'.join((header, xml_data, footer)) + extra = {'Type': '/Metadata', 'Subtype': '/XML'} + metadata = pydyf.Stream([stream_content], extra, compress) + pdf.add_object(metadata) + pdf.catalog['Metadata'] = metadata.reference + + +def generate_rdf_metadata(metadata, variant, version, conformance): + """Generate RDF metadata as a bytestring. + + Might be replaced by DocumentMetadata.generate_rdf_metadata(). + + """ namespace = f'pdf{variant}id' rdf = Element(f'{{{NS["rdf"]}}}RDF') @@ -82,11 +97,4 @@ def add_metadata(pdf, metadata, variant, version, conformance, compress): element.attrib[f'{{{NS["rdf"]}}}about'] = '' element = SubElement(element, f'{{{NS["xmp"]}}}ModifyDate') element.text = metadata.modified - xml = tostring(rdf, encoding='utf-8') - header = b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>' - footer = b'<?xpacket end="r"?>' - stream_content = b'\n'.join((header, xml, footer)) - extra = {'Type': '/Metadata', 'Subtype': '/XML'} - metadata = pydyf.Stream([stream_content], extra, compress) - pdf.add_object(metadata) - pdf.catalog['Metadata'] = metadata.reference + return tostring(rdf, encoding='utf-8')