From 3ac5eb832fdbaf871fa89753fb900b865d25e6e6 Mon Sep 17 00:00:00 2001 From: Yann Trividic Date: Mon, 24 Jun 2024 16:01:35 +0200 Subject: [PATCH 1/5] WeasyPrint now produces a LinkAnnotation for each HTMLElement with an id attribute. --- weasyprint/anchors.py | 16 +++++++-- weasyprint/document.py | 5 ++- weasyprint/pdf/__init__.py | 12 +++++-- weasyprint/pdf/debug.py | 69 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 6 deletions(-) create mode 100644 weasyprint/pdf/debug.py diff --git a/weasyprint/anchors.py b/weasyprint/anchors.py index ab18e53f1..611547b37 100644 --- a/weasyprint/anchors.py +++ b/weasyprint/anchors.py @@ -27,11 +27,11 @@ def rectangle_aabb(matrix, pos_x, pos_y, width, height): return box_x1, box_y1, box_x2, box_y2 -def gather_anchors(box, anchors, links, bookmarks, forms, parent_matrix=None, +def gather_anchors(box, anchors, links, bookmarks, forms, debug, parent_matrix=None, parent_form=None): """Gather anchors and other data related to specific positions in PDF. - Currently finds anchors, links, bookmarks and forms. + Currently finds anchors, links, bookmarks, forms and debug ids. """ # Get box transformation matrix. @@ -121,8 +121,18 @@ def gather_anchors(box, anchors, links, bookmarks, forms, parent_matrix=None, if has_anchor: anchors[anchor_name] = pos_x, pos_y + # And this is what got added for debugging, + # everything that's not covered by the previous categories + else: + # Not sure why, but all elements are here twice? + if(box.element is not None and box.element.get("id") is not None) : + # print(box.element.tag, box.element.get("id")) + pos_x, pos_y, width, height = box.hit_area() + rectangle = rectangle_aabb(matrix, pos_x, pos_y, width, height) + debug.append((box.element, box.style, rectangle, box)) + for child in box.all_children(): - gather_anchors(child, anchors, links, bookmarks, forms, matrix, parent_form) + gather_anchors(child, anchors, links, bookmarks, forms, debug, matrix, parent_form) def make_page_bookmark_tree(page, skipped_levels, last_by_depth, diff --git a/weasyprint/document.py b/weasyprint/document.py index f630fe6a5..a7868e8f6 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -76,7 +76,10 @@ def __init__(self, page_box): #: The key ``None`` will contain inputs that are not part of a form. self.forms = {None: []} - gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms) + #: Appended for the debug PDF + self.debug = [] + + gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms, self.debug) self._page_box = page_box def paint(self, stream, scale=1): diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 15d12e811..7ef26e699 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -16,6 +16,8 @@ add_annotations, add_forms, add_links, add_outlines, resolve_links, write_pdf_attachment) +from .debug import (add_debug, resolve_debug) + VARIANTS = { name: data for variants in (pdfa.VARIANTS, pdfua.VARIANTS) for (name, data) in variants.items()} @@ -145,11 +147,14 @@ def generate_pdf(document, target, zoom, **options): # Links and anchors page_links_and_anchors = list(resolve_links(document.pages)) + # Debug links and anchors + page_debug = list(resolve_debug(document.pages)) + annot_files = {} pdf_pages, page_streams = [], [] compress = not options['uncompressed_pdf'] - for page_number, (page, links_and_anchors) in enumerate( - zip(document.pages, page_links_and_anchors)): + for page_number, (page, links_and_anchors, debug) in enumerate( + zip(document.pages, page_links_and_anchors, page_debug)): # Draw from the top-left corner matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale) @@ -192,6 +197,9 @@ def generate_pdf(document, target, zoom, **options): add_forms( page.forms, matrix, pdf, pdf_page, resources, stream, document.font_config.font_map, compress) + add_debug(debug[0], matrix, pdf, pdf_page, pdf_names, mark) + page.paint(stream, scale) + page.paint(stream, scale) # Bleed diff --git a/weasyprint/pdf/debug.py b/weasyprint/pdf/debug.py new file mode 100644 index 000000000..5590dfe2e --- /dev/null +++ b/weasyprint/pdf/debug.py @@ -0,0 +1,69 @@ + +import pydyf + +def add_debug(debug, matrix, pdf, page, names, mark): + """Include anchors for each element with an ID in a given PDF page.""" + if not debug: + return + + if 'Annots' not in page: + page['Annots'] = pydyf.Array() + + ids = {} + + for i, (element, style, rectangle, box) in enumerate(debug): # style is usused for now? + id = element.get("id") + if id.startswith("auto-id"): + id = "-".join(id.split("-")[:4]) + if id in ids: + ids[id] += 1 + else: + ids[id] = 0 + final_id = id + "-" + str(ids[id]) + element.set("id", final_id) + id = final_id + # print("add_debug", element.get("id")) + x1, y1 = matrix.transform_point(*rectangle[:2]) + x2, y2 = matrix.transform_point(*rectangle[2:]) + box.annotation = pydyf.Dictionary({ + 'Type': '/Annot', + 'Subtype': '/Link', + # 'Subtype': '/Square', + 'Rect': pydyf.Array([x1, y1, x2, y2]), + 'P': page.reference, + # 'BS': pydyf.Dictionary({'W': 1}), # border style + 'T': pydyf.String(id), # the title element gets added as metadata + }) + + # Internal links are deactivated when in local + # See: https://github.com/mozilla/pdf.js/issues/12415 + # box.annotation['A'] = pydyf.Dictionary({ + # 'Type': '/Action', + # 'S': '/URI', + # 'URI': pydyf.String("#" + id) + # }) + + # Internal links - works better with a local version PDFjs... But why? + box.annotation['Dest'] = pydyf.String(id) + + # In order to preserve page references + names.append([id, pydyf.Array([page.reference, '/XYZ', x1, y1, 0])]) + + # Actually adding the PDF object + pdf.add_object(box.annotation) + page['Annots'].append(box.annotation.reference) + +def resolve_debug(pages): + '''Resolve the added debug IDs. Inspired from resolve_links. + ''' + debug = list() + paged_debug = [] + for i, page in enumerate(pages): + paged_debug.append([]) + # for (element, style, rectangle, box) in page.debug: + # debug.append(element.get('id')) + for page in pages: + page_debug = [] + for m in page.debug: + page_debug.append(m) + yield page_debug, paged_debug.pop(0) From b4bb02a53ac7e7dffdddc1bf5e3791e003c9e9c3 Mon Sep 17 00:00:00 2001 From: Yann Trividic Date: Mon, 24 Jun 2024 16:09:17 +0200 Subject: [PATCH 2/5] ruff checks passed! --- weasyprint/anchors.py | 3 ++- weasyprint/document.py | 3 ++- weasyprint/pdf/__init__.py | 2 +- weasyprint/pdf/debug.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/weasyprint/anchors.py b/weasyprint/anchors.py index 611547b37..886c3e264 100644 --- a/weasyprint/anchors.py +++ b/weasyprint/anchors.py @@ -132,7 +132,8 @@ def gather_anchors(box, anchors, links, bookmarks, forms, debug, parent_matrix=N debug.append((box.element, box.style, rectangle, box)) for child in box.all_children(): - gather_anchors(child, anchors, links, bookmarks, forms, debug, matrix, parent_form) + gather_anchors(child, anchors, links, bookmarks, forms, debug, matrix, + parent_form) def make_page_bookmark_tree(page, skipped_levels, last_by_depth, diff --git a/weasyprint/document.py b/weasyprint/document.py index a7868e8f6..0b92790dc 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -79,7 +79,8 @@ def __init__(self, page_box): #: Appended for the debug PDF self.debug = [] - gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms, self.debug) + gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms, + self.debug) self._page_box = page_box def paint(self, stream, scale=1): diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 7ef26e699..95283b6e9 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -16,7 +16,7 @@ add_annotations, add_forms, add_links, add_outlines, resolve_links, write_pdf_attachment) -from .debug import (add_debug, resolve_debug) +from .debug import add_debug, resolve_debug VARIANTS = { name: data for variants in (pdfa.VARIANTS, pdfua.VARIANTS) diff --git a/weasyprint/pdf/debug.py b/weasyprint/pdf/debug.py index 5590dfe2e..03c343edc 100644 --- a/weasyprint/pdf/debug.py +++ b/weasyprint/pdf/debug.py @@ -1,6 +1,7 @@ import pydyf + def add_debug(debug, matrix, pdf, page, names, mark): """Include anchors for each element with an ID in a given PDF page.""" if not debug: @@ -11,7 +12,7 @@ def add_debug(debug, matrix, pdf, page, names, mark): ids = {} - for i, (element, style, rectangle, box) in enumerate(debug): # style is usused for now? + for i, (element, style, rectangle, box) in enumerate(debug): id = element.get("id") if id.startswith("auto-id"): id = "-".join(id.split("-")[:4]) @@ -56,7 +57,6 @@ def add_debug(debug, matrix, pdf, page, names, mark): def resolve_debug(pages): '''Resolve the added debug IDs. Inspired from resolve_links. ''' - debug = list() paged_debug = [] for i, page in enumerate(pages): paged_debug.append([]) From b33b9177813d7ebba9da6675643e022b7f174719 Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sat, 28 Sep 2024 17:25:20 +0200 Subject: [PATCH 3/5] @liZe's code has been integrated into the PR --- weasyprint/anchors.py | 27 +++++------- weasyprint/document.py | 6 +-- weasyprint/pdf/__init__.py | 16 ++----- weasyprint/pdf/anchors.py | 2 +- weasyprint/pdf/debug.py | 86 +++++++++++++------------------------- 5 files changed, 44 insertions(+), 93 deletions(-) diff --git a/weasyprint/anchors.py b/weasyprint/anchors.py index 886c3e264..499d85bfe 100644 --- a/weasyprint/anchors.py +++ b/weasyprint/anchors.py @@ -27,11 +27,11 @@ def rectangle_aabb(matrix, pos_x, pos_y, width, height): return box_x1, box_y1, box_x2, box_y2 -def gather_anchors(box, anchors, links, bookmarks, forms, debug, parent_matrix=None, +def gather_anchors(box, anchors, links, bookmarks, forms, parent_matrix=None, parent_form=None): """Gather anchors and other data related to specific positions in PDF. - Currently finds anchors, links, bookmarks, forms and debug ids. + Currently finds anchors, links, bookmarks and forms. """ # Get box transformation matrix. @@ -113,27 +113,20 @@ def gather_anchors(box, anchors, links, bookmarks, forms, debug, parent_matrix=N links.append((link_type, target, rectangle, box)) if is_input: forms[parent_form].append((box.element, box.style, rectangle)) - if matrix and (has_bookmark or has_anchor): - pos_x, pos_y = matrix.transform_point(pos_x, pos_y) if has_bookmark: + if matrix: + pos_x, pos_y = matrix.transform_point(pos_x, pos_y) bookmark = (bookmark_level, bookmark_label, (pos_x, pos_y), state) bookmarks.append(bookmark) if has_anchor: - anchors[anchor_name] = pos_x, pos_y - - # And this is what got added for debugging, - # everything that's not covered by the previous categories - else: - # Not sure why, but all elements are here twice? - if(box.element is not None and box.element.get("id") is not None) : - # print(box.element.tag, box.element.get("id")) - pos_x, pos_y, width, height = box.hit_area() - rectangle = rectangle_aabb(matrix, pos_x, pos_y, width, height) - debug.append((box.element, box.style, rectangle, box)) + pos_x1, pos_y1, pos_x2, pos_y2 = pos_x, pos_y, pos_x + width, pos_y + height + if matrix: + pos_x1, pos_y1 = matrix.transform_point(pos_x1, pos_y1) + pos_x2, pos_y2 = matrix.transform_point(pos_x2, pos_y2) + anchors[anchor_name] = (pos_x1, pos_y1, pos_x2, pos_y2) for child in box.all_children(): - gather_anchors(child, anchors, links, bookmarks, forms, debug, matrix, - parent_form) + gather_anchors(child, anchors, links, bookmarks, forms, matrix, parent_form) def make_page_bookmark_tree(page, skipped_levels, last_by_depth, diff --git a/weasyprint/document.py b/weasyprint/document.py index 0b92790dc..f630fe6a5 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -76,11 +76,7 @@ def __init__(self, page_box): #: The key ``None`` will contain inputs that are not part of a form. self.forms = {None: []} - #: Appended for the debug PDF - self.debug = [] - - gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms, - self.debug) + gather_anchors(page_box, self.anchors, self.links, self.bookmarks, self.forms) self._page_box = page_box def paint(self, stream, scale=1): diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 95283b6e9..66f38ae41 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -8,7 +8,7 @@ from ..html import W3C_DATE_RE from ..logger import LOGGER, PROGRESS_LOGGER from ..matrix import Matrix -from . import pdfa, pdfua +from . import debug, pdfa, pdfua from .fonts import build_fonts_dictionary from .stream import Stream @@ -16,10 +16,8 @@ add_annotations, add_forms, add_links, add_outlines, resolve_links, write_pdf_attachment) -from .debug import add_debug, resolve_debug - VARIANTS = { - name: data for variants in (pdfa.VARIANTS, pdfua.VARIANTS) + name: data for variants in (pdfa.VARIANTS, pdfua.VARIANTS, debug.VARIANTS) for (name, data) in variants.items()} @@ -147,14 +145,11 @@ def generate_pdf(document, target, zoom, **options): # Links and anchors page_links_and_anchors = list(resolve_links(document.pages)) - # Debug links and anchors - page_debug = list(resolve_debug(document.pages)) - annot_files = {} pdf_pages, page_streams = [], [] compress = not options['uncompressed_pdf'] - for page_number, (page, links_and_anchors, debug) in enumerate( - zip(document.pages, page_links_and_anchors, page_debug)): + for page_number, (page, links_and_anchors) in enumerate( + zip(document.pages, page_links_and_anchors)): # Draw from the top-left corner matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale) @@ -197,9 +192,6 @@ def generate_pdf(document, target, zoom, **options): add_forms( page.forms, matrix, pdf, pdf_page, resources, stream, document.font_config.font_map, compress) - add_debug(debug[0], matrix, pdf, pdf_page, pdf_names, mark) - page.paint(stream, scale) - page.paint(stream, scale) # Bleed diff --git a/weasyprint/pdf/anchors.py b/weasyprint/pdf/anchors.py index 9cbd0f3ee..c2b60bd01 100644 --- a/weasyprint/pdf/anchors.py +++ b/weasyprint/pdf/anchors.py @@ -406,7 +406,7 @@ def resolve_links(pages): paged_anchors = [] for i, page in enumerate(pages): paged_anchors.append([]) - for anchor_name, (point_x, point_y) in page.anchors.items(): + for anchor_name, (point_x, point_y, _, _) in page.anchors.items(): if anchor_name not in anchors: paged_anchors[-1].append((anchor_name, point_x, point_y)) anchors.add(anchor_name) diff --git a/weasyprint/pdf/debug.py b/weasyprint/pdf/debug.py index 03c343edc..a945f1eae 100644 --- a/weasyprint/pdf/debug.py +++ b/weasyprint/pdf/debug.py @@ -1,69 +1,39 @@ +"""PDF generation with debug information.""" import pydyf +from ..matrix import Matrix -def add_debug(debug, matrix, pdf, page, names, mark): - """Include anchors for each element with an ID in a given PDF page.""" - if not debug: - return - if 'Annots' not in page: - page['Annots'] = pydyf.Array() +def debug(pdf, metadata, document, page_streams, attachments, compress): + """Set debug PDF metadata.""" - ids = {} + # Add links on ids. + pages = zip(pdf.pages['Kids'][::3], document.pages, page_streams) + for pdf_page_number, document_page, stream in pages: + if not document_page.anchors: + continue - for i, (element, style, rectangle, box) in enumerate(debug): - id = element.get("id") - if id.startswith("auto-id"): - id = "-".join(id.split("-")[:4]) - if id in ids: - ids[id] += 1 - else: - ids[id] = 0 - final_id = id + "-" + str(ids[id]) - element.set("id", final_id) - id = final_id - # print("add_debug", element.get("id")) - x1, y1 = matrix.transform_point(*rectangle[:2]) - x2, y2 = matrix.transform_point(*rectangle[2:]) - box.annotation = pydyf.Dictionary({ - 'Type': '/Annot', - 'Subtype': '/Link', - # 'Subtype': '/Square', - 'Rect': pydyf.Array([x1, y1, x2, y2]), - 'P': page.reference, - # 'BS': pydyf.Dictionary({'W': 1}), # border style - 'T': pydyf.String(id), # the title element gets added as metadata - }) + page = pdf.objects[pdf_page_number] + if 'Annots' not in page: + page['Annots'] = pydyf.Array() - # Internal links are deactivated when in local - # See: https://github.com/mozilla/pdf.js/issues/12415 - # box.annotation['A'] = pydyf.Dictionary({ - # 'Type': '/Action', - # 'S': '/URI', - # 'URI': pydyf.String("#" + id) - # }) + for id, (x1, y1, x2, y2) in document_page.anchors.items(): + # TODO: handle zoom correctly. + matrix = Matrix(0.75, 0, 0, 0.75) @ stream.ctm + x1, y1 = matrix.transform_point(x1, y1) + x2, y2 = matrix.transform_point(x2, y2) + annotation = pydyf.Dictionary({ + 'Type': '/Annot', + 'Subtype': '/Link', + 'Rect': pydyf.Array([x1, y1, x2, y2]), + 'BS': pydyf.Dictionary({'W': 0}), + 'P': page.reference, + 'T': pydyf.String(id), # id added as metadata + }) - # Internal links - works better with a local version PDFjs... But why? - box.annotation['Dest'] = pydyf.String(id) + pdf.add_object(annotation) + page['Annots'].append(annotation.reference) - # In order to preserve page references - names.append([id, pydyf.Array([page.reference, '/XYZ', x1, y1, 0])]) - # Actually adding the PDF object - pdf.add_object(box.annotation) - page['Annots'].append(box.annotation.reference) - -def resolve_debug(pages): - '''Resolve the added debug IDs. Inspired from resolve_links. - ''' - paged_debug = [] - for i, page in enumerate(pages): - paged_debug.append([]) - # for (element, style, rectangle, box) in page.debug: - # debug.append(element.get('id')) - for page in pages: - page_debug = [] - for m in page.debug: - page_debug.append(m) - yield page_debug, paged_debug.pop(0) +VARIANTS = {'debug': (debug, {})} From 7276b5f2822b003dc2cc09b0171d908a4ddc67a4 Mon Sep 17 00:00:00 2001 From: yanntrividic Date: Fri, 30 Aug 2024 15:04:38 +0200 Subject: [PATCH 4/5] Dest key added to the debug annotations --- weasyprint/pdf/debug.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/weasyprint/pdf/debug.py b/weasyprint/pdf/debug.py index a945f1eae..9059e0478 100644 --- a/weasyprint/pdf/debug.py +++ b/weasyprint/pdf/debug.py @@ -32,6 +32,10 @@ def debug(pdf, metadata, document, page_streams, attachments, compress): 'T': pydyf.String(id), # id added as metadata }) + # The next line makes all of this relevent to use + # with PDFjs + annotation['Dest'] = pydyf.String(id) + pdf.add_object(annotation) page['Annots'].append(annotation.reference) From 8da63da982e27369662cb2750b88c56cf20a77ed Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Sat, 28 Sep 2024 17:40:58 +0200 Subject: [PATCH 5/5] Fix tests --- tests/test_api.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 63ee8caaf..48b3efe63 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -127,8 +127,10 @@ def _round_meta(pages): """Eliminate errors of floating point arithmetic for metadata.""" for page in pages: anchors = page.anchors - for anchor_name, (pos_x, pos_y) in anchors.items(): - anchors[anchor_name] = round(pos_x, 6), round(pos_y, 6) + for anchor_name, (x1, y1, x2, y2) in anchors.items(): + anchors[anchor_name] = ( + round(x1, 6), round(y1, 6), + round(x2, 6), round(y2, 6)) links = page.links for i, link in enumerate(links): link_type, target, rectangle, box = link @@ -884,8 +886,8 @@ def test_links_1(): ], [('internal', 'hello', (0, 0, 200, 30))], ], [ - {'hello': (0, 200)}, - {'lipsum': (0, 0)} + {'hello': (0, 200, 200, 290)}, + {'lipsum': (0, 0, 200, 90)} ], [ ( [ @@ -966,7 +968,7 @@ def test_links_6(): ''', [[ ('internal', 'lipsum', (5, 10, 195, 10)), ('external', 'https://weasyprint.org/', (0, 10, 200, 10))]], - [{'lipsum': (5, 10)}], + [{'lipsum': (5, 10, 195, 10)}], [([('internal', 'lipsum', (5, 10, 195, 10)), ('external', 'https://weasyprint.org/', (0, 10, 200, 10))], [('lipsum', 5, 10)])], @@ -982,7 +984,7 @@ def test_links_7(): margin: 10px 5px" id="lipsum"> ''', [[('internal', 'lipsum', (5, 10, 195, 10))]], - [{'lipsum': (5, 10)}], + [{'lipsum': (5, 10, 195, 10)}], [([('internal', 'lipsum', (5, 10, 195, 10))], [('lipsum', 5, 10)])], base_url=None) @@ -998,7 +1000,7 @@ def test_links_8(): ''', [[('internal', 'lipsum', (0, 0, 200, 15)), ('internal', 'missing', (0, 15, 200, 30))]], - [{'lipsum': (0, 15)}], + [{'lipsum': (0, 15, 200, 30)}], [([('internal', 'lipsum', (0, 0, 200, 15))], [('lipsum', 0, 15)])], base_url=None, warnings=[ @@ -1014,7 +1016,7 @@ def test_links_9(): transform: rotate(90deg) scale(2)"> ''', [[('internal', 'lipsum', (30, 10, 70, 210))]], - [{'lipsum': (70, 10)}], + [{'lipsum': (70, 10, 30, 210)}], [([('internal', 'lipsum', (30, 10, 70, 210))], [('lipsum', 70, 10)])], round=True)