From 95d2c37527c348d4640cc8ebd9d04f416191ff18 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Tue, 30 Apr 2024 13:47:45 +0200
Subject: [PATCH 1/3] Try to subset fonts using HarfBuzz

Related to #2120.
---
 weasyprint/pdf/fonts.py  | 266 +++++++++++++++++++++++++++++++++++++--
 weasyprint/pdf/stream.py | 196 +----------------------------
 weasyprint/text/ffi.py   |  63 +++++++++-
 3 files changed, 320 insertions(+), 205 deletions(-)

diff --git a/weasyprint/pdf/fonts.py b/weasyprint/pdf/fonts.py
index b80dd3a23..85f5177f2 100644
--- a/weasyprint/pdf/fonts.py
+++ b/weasyprint/pdf/fonts.py
@@ -1,10 +1,258 @@
 """Fonts integration in PDF."""
 
+import io
+from hashlib import md5
 from math import ceil
 
 import pydyf
+from fontTools import subset
+from fontTools.ttLib import TTFont, TTLibError, ttFont
+from fontTools.varLib.mutator import instantiateVariableFont
 
 from ..logger import LOGGER
+from ..text.constants import PANGO_STRETCH_PERCENT
+from ..text.ffi import ffi, harfbuzz, harfbuzz_subset, pango, units_to_double
+from ..text.fonts import get_hb_object_data, get_pango_font_hb_face
+
+
+class Font:
+    def __init__(self, pango_font):
+        self.hb_font = pango.pango_font_get_hb_font(pango_font)
+        self.hb_face = get_pango_font_hb_face(pango_font)
+        self.file_content = get_hb_object_data(self.hb_face)
+        self.index = harfbuzz.hb_face_get_index(self.hb_face)
+
+        pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
+        self.description = description = ffi.gc(
+            pango.pango_font_describe(pango_font),
+            pango.pango_font_description_free)
+        self.font_size = pango.pango_font_description_get_size(description)
+        self.style = pango.pango_font_description_get_style(description)
+        self.family = ffi.string(
+            pango.pango_font_description_get_family(description))
+
+        self.variations = {}
+        variations = pango.pango_font_description_get_variations(self.description)
+        if variations != ffi.NULL:
+            self.variations = {
+                part.split('=')[0]: float(part.split('=')[1])
+                for part in ffi.string(variations).decode().split(',')}
+        if 'wght' in self.variations:
+            pango.pango_font_description_set_weight(
+                self.description, int(round(self.variations['wght'])))
+        if self.variations.get('ital'):
+            pango.pango_font_description_set_style(
+                self.description, pango.PANGO_STYLE_ITALIC)
+        elif self.variations.get('slnt'):
+            pango.pango_font_description_set_style(
+                self.description, pango.PANGO_STYLE_OBLIQUE)
+        if 'wdth' in self.variations:
+            stretch = min(
+                PANGO_STRETCH_PERCENT.items(),
+                key=lambda item: abs(item[0] - self.variations['wdth']))[1]
+            pango.pango_font_description_set_stretch(self.description, stretch)
+        description_string = ffi.string(
+            pango.pango_font_description_to_string(description))
+
+        # Never use the built-in hash function here: it’s not stable
+        self.hash = ''.join(
+            chr(65 + letter % 26) for letter
+            in md5(description_string, usedforsecurity=False).digest()[:6])
+
+        # Name
+        fields = description_string.split(b' ')
+        if fields and b'=' in fields[-1]:
+            fields.pop()  # Remove variations
+        if fields:
+            fields.pop()  # Remove font size
+        else:
+            fields = [b'Unknown']
+        self.name = b'/' + self.hash.encode() + b'+' + b'-'.join(fields)
+
+        # Ascent & descent
+        if self.font_size:
+            self.ascent = int(
+                pango.pango_font_metrics_get_ascent(pango_metrics) /
+                self.font_size * 1000)
+            self.descent = -int(
+                pango.pango_font_metrics_get_descent(pango_metrics) /
+                self.font_size * 1000)
+        else:
+            self.ascent = self.descent = 0
+
+        # Tables and metadata
+        table_count = ffi.new('unsigned int *', 100)
+        table_tags = ffi.new('hb_tag_t[100]')
+        table_name = ffi.new('char[4]')
+        harfbuzz.hb_face_get_table_tags(self.hb_face, 0, table_count, table_tags)
+        self.tables = []
+        for i in range(table_count[0]):
+            harfbuzz.hb_tag_to_string(table_tags[i], table_name)
+            self.tables.append(ffi.string(table_name).decode())
+        self.bitmap = 'EBDT' in self.tables and 'EBLC' in self.tables
+        self.italic_angle = 0  # TODO: this should be different
+        self.upem = harfbuzz.hb_face_get_upem(self.hb_face)
+        self.png = harfbuzz.hb_ot_color_has_png(self.hb_face)
+        self.svg = harfbuzz.hb_ot_color_has_svg(self.hb_face)
+        self.stemv = 80
+        self.stemh = 80
+        self.widths = {}
+        self.cmap = {}
+        self.used_in_forms = False
+
+        # Font flags
+        self.flags = 2 ** (3 - 1)  # Symbolic, custom character set
+        if self.style:
+            self.flags += 2 ** (7 - 1)  # Italic
+        if b'Serif' in fields:
+            self.flags += 2 ** (2 - 1)  # Serif
+
+    def clean(self, cmap, hinting):
+        # Subset font.
+        self.subset(cmap, hinting)
+
+        # Transform variable into static font
+        if 'fvar' in self.tables:
+            full_font = io.BytesIO(self.file_content)
+            ttfont = TTFont(full_font, fontNumber=self.index)
+            if 'wght' not in self.variations:
+                weight = pango.pango_font_description_get_weight(
+                    self.description)
+                self.variations['wght'] = weight
+            if 'opsz' not in self.variations:
+                self.variations['opsz'] = units_to_double(self.font_size)
+            if 'slnt' not in self.variations:
+                slnt = 0
+                if self.style == 1:
+                    for axe in ttfont['fvar'].axes:
+                        if axe.axisTag == 'slnt':
+                            if axe.maxValue == 0:
+                                slnt = axe.minValue
+                            else:
+                                slnt = axe.maxValue
+                            break
+                self.variations['slnt'] = slnt
+            if 'ital' not in self.variations:
+                self.variations['ital'] = int(self.style == 2)
+            partial_font = io.BytesIO()
+            try:
+                ttfont = instantiateVariableFont(ttfont, self.variations)
+                for key, (advance, bearing) in ttfont['hmtx'].metrics.items():
+                    if advance < 0:
+                        ttfont['hmtx'].metrics[key] = (0, bearing)
+                ttfont.save(partial_font)
+            except Exception:
+                LOGGER.warning('Unable to mutate variable font')
+            else:
+                self.file_content = partial_font.getvalue()
+
+        if not (self.png or self.svg):
+            return
+
+        full_font = io.BytesIO(self.file_content)
+        ttfont = TTFont(full_font, fontNumber=self.index)
+        try:
+            # Add empty glyphs instead of PNG or SVG emojis
+            if 'loca' not in self.tables or 'glyf' not in self.tables:
+                ttfont['loca'] = ttFont.getTableClass('loca')()
+                ttfont['glyf'] = ttFont.getTableClass('glyf')()
+                ttfont['glyf'].glyphOrder = ttfont.getGlyphOrder()
+                ttfont['glyf'].glyphs = {
+                    name: ttFont.getTableModule('glyf').Glyph()
+                    for name in ttfont['glyf'].glyphOrder}
+            else:
+                for glyph in ttfont['glyf'].glyphs:
+                    ttfont['glyf'][glyph] = (
+                        ttFont.getTableModule('glyf').Glyph())
+            for table_name in ('CBDT', 'CBLC', 'SVG '):
+                if table_name in ttfont:
+                    del ttfont[table_name]
+            output_font = io.BytesIO()
+            ttfont.save(output_font)
+            self.file_content = output_font.getvalue()
+        except TTLibError:
+            LOGGER.warning('Unable to save emoji font')
+
+    @property
+    def type(self):
+        return 'otf' if self.file_content[:4] == b'OTTO' else 'ttf'
+
+    def subset(self, cmap, hinting):
+        if not cmap:
+            return
+
+        if harfbuzz_subset:
+            hb_subset = harfbuzz_subset.hb_subset_input_create_or_fail()
+
+            # Only keep used glyphs.
+            gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
+            gid_array = ffi.new(f'hb_codepoint_t[{len(cmap)}]', sorted(cmap))
+            harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(cmap))
+
+            # Set flags.
+            flags = (
+                harfbuzz_subset.HB_SUBSET_FLAGS_RETAIN_GIDS |
+                harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
+                harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
+            if not hinting:
+                flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
+            harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
+
+            # Drop useless tables.
+            drop_set = harfbuzz_subset.hb_subset_input_set(
+                hb_subset, harfbuzz_subset.HB_SUBSET_SETS_DROP_TABLE_TAG)
+            drop_tables = tuple(harfbuzz.hb_tag_from_string(name, -1) for name in (
+                b'BASE', b'DSIG', b'EBDT', b'EBLC', b'EBSC', b'GPOS', b'GSUB', b'JSTF',
+                b'LTSH', b'PCLT', b'SVG '))
+            drop_tables_array = ffi.new(
+                f'hb_codepoint_t[{len(drop_tables)}]', drop_tables)
+            harfbuzz.hb_set_add_sorted_array(
+                drop_set, drop_tables_array, len(drop_tables))
+
+            # Subset font.
+            hb_face = harfbuzz_subset.hb_subset_or_fail(self.hb_face, hb_subset)
+
+            # Drop empty glyphs after last one used.
+            gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
+            keep = tuple(range(max(cmap) + 1))
+            gid_array = ffi.new(f'hb_codepoint_t[{len(keep)}]', keep)
+            harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(keep))
+
+            # Set flags.
+            flags = (
+                harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
+                harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
+            if not hinting:
+                flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
+            harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
+
+            # Subset font.
+            hb_face = harfbuzz_subset.hb_subset_or_fail(hb_face, hb_subset)
+
+            # Store new font.
+            if hb_face:
+                file_content = get_hb_object_data(hb_face)
+                if file_content:
+                    self.file_content = file_content
+                    return
+            LOGGER.warning('Unable to subset font with Harfbuzz')
+        else:
+           full_font = io.BytesIO(self.file_content)
+           optimized_font = io.BytesIO()
+           options = subset.Options(
+               retain_gids=True, passthrough_tables=True, ignore_missing_glyphs=True,
+               hinting=hinting, desubroutinize=True)
+           options.drop_tables += ['GSUB', 'GPOS', 'SVG']
+           subsetter = subset.Subsetter(options)
+           subsetter.populate(gids=cmap)
+           try:
+               ttfont = TTFont(full_font, fontNumber=self.index)
+               subsetter.subset(ttfont)
+           except TTLibError:
+               LOGGER.warning('Unable to subset font with fontTools')
+           else:
+               ttfont.save(optimized_font)
+               self.file_content = optimized_font.getvalue()
 
 
 def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
@@ -37,18 +285,20 @@ def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
         font_references_by_file_hash[file_hash] = font_stream.reference
 
     for font in fonts.values():
-        if not font.ttfont or (subset and not font.used_in_forms):
+        if subset and not font.used_in_forms:
             # Only store widths and map for used glyphs
             font_widths = font.widths
             cmap = font.cmap
         else:
             # Store width and Unicode map for all glyphs
+            full_font = io.BytesIO(font.file_content)
+            ttfont = TTFont(full_font, fontNumber=font.index)
             font_widths, cmap = {}, {}
-            for letter, key in font.ttfont.getBestCmap().items():
-                glyph = font.ttfont.getGlyphID(key)
+            for letter, key in ttfont.getBestCmap().items():
+                glyph = ttfont.getGlyphID(key)
                 if glyph not in cmap:
                     cmap[glyph] = chr(letter)
-                width = font.ttfont.getGlyphSet()[key].width
+                width = ttfont.getGlyphSet()[key].width
                 font_widths[glyph] = width * 1000 / font.upem
 
         max_x = max(font_widths.values()) if font_widths else 0
@@ -178,17 +428,19 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
         'Differences': pydyf.Array(differences),
     })
     char_procs = pydyf.Dictionary({})
-    font_glyphs = font.ttfont['EBDT'].strikeData[0]
+    full_font = io.BytesIO(font.file_content)
+    ttfont = TTFont(full_font, fontNumber=font.index)
+    font_glyphs = ttfont['EBDT'].strikeData[0]
     widths = [0] * (last - first + 1)
     glyphs_info = {}
     for key, glyph in font_glyphs.items():
         glyph_format = glyph.getFormat()
-        glyph_id = font.ttfont.getGlyphID(key)
+        glyph_id = ttfont.getGlyphID(key)
 
         # Get and store glyph metrics
         if glyph_format == 5:
             data = glyph.data
-            subtables = font.ttfont['EBLC'].strikes[0].indexSubTables
+            subtables = ttfont['EBLC'].strikes[0].indexSubTables
             for subtable in subtables:
                 first_index = subtable.firstGlyphIndex
                 last_index = subtable.lastGlyphIndex
diff --git a/weasyprint/pdf/stream.py b/weasyprint/pdf/stream.py
index a8f47c710..a1d7abb06 100644
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@@ -1,201 +1,11 @@
 """PDF stream."""
 
-import io
-from hashlib import md5
-
 import pydyf
-from fontTools import subset
-from fontTools.ttLib import TTFont, TTLibError, ttFont
-from fontTools.varLib.mutator import instantiateVariableFont
 
-from ..logger import LOGGER
 from ..matrix import Matrix
-from ..text.constants import PANGO_STRETCH_PERCENT
-from ..text.ffi import ffi, harfbuzz, pango, units_to_double
-from ..text.fonts import get_hb_object_data, get_pango_font_hb_face, get_pango_font_key
-
-
-class Font:
-    def __init__(self, pango_font):
-        self.hb_font = pango.pango_font_get_hb_font(pango_font)
-        self.hb_face = get_pango_font_hb_face(pango_font)
-        self.file_content = get_hb_object_data(self.hb_face)
-        self.index = harfbuzz.hb_face_get_index(self.hb_face)
-
-        pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
-        self.description = description = ffi.gc(
-            pango.pango_font_describe(pango_font),
-            pango.pango_font_description_free)
-        self.font_size = pango.pango_font_description_get_size(description)
-        self.style = pango.pango_font_description_get_style(description)
-        self.family = ffi.string(
-            pango.pango_font_description_get_family(description))
-
-        self.variations = {}
-        variations = pango.pango_font_description_get_variations(
-            self.description)
-        if variations != ffi.NULL:
-            self.variations = {
-                part.split('=')[0]: float(part.split('=')[1])
-                for part in ffi.string(variations).decode().split(',')}
-        if 'wght' in self.variations:
-            pango.pango_font_description_set_weight(
-                self.description, int(round(self.variations['wght'])))
-        if self.variations.get('ital'):
-            pango.pango_font_description_set_style(
-                self.description, pango.PANGO_STYLE_ITALIC)
-        elif self.variations.get('slnt'):
-            pango.pango_font_description_set_style(
-                self.description, pango.PANGO_STYLE_OBLIQUE)
-        if 'wdth' in self.variations:
-            stretch = min(
-                PANGO_STRETCH_PERCENT.items(),
-                key=lambda item: abs(item[0] - self.variations['wdth']))[1]
-            pango.pango_font_description_set_stretch(self.description, stretch)
-        description_string = ffi.string(
-            pango.pango_font_description_to_string(description))
-
-        # Never use the built-in hash function here: it’s not stable
-        self.hash = ''.join(
-            chr(65 + letter % 26) for letter
-            in md5(description_string, usedforsecurity=False).digest()[:6])
-
-        # Name
-        fields = description_string.split(b' ')
-        if fields and b'=' in fields[-1]:
-            fields.pop()  # Remove variations
-        if fields:
-            fields.pop()  # Remove font size
-        else:
-            fields = [b'Unknown']
-        self.name = b'/' + self.hash.encode() + b'+' + b'-'.join(fields)
-
-        # Ascent & descent
-        if self.font_size:
-            self.ascent = int(
-                pango.pango_font_metrics_get_ascent(pango_metrics) /
-                self.font_size * 1000)
-            self.descent = -int(
-                pango.pango_font_metrics_get_descent(pango_metrics) /
-                self.font_size * 1000)
-        else:
-            self.ascent = self.descent = 0
-
-        # Fonttools
-        full_font = io.BytesIO(self.file_content)
-        try:
-            self.ttfont = TTFont(full_font, fontNumber=self.index)
-        except Exception:
-            LOGGER.warning('Unable to read font')
-            self.ttfont = None
-            self.bitmap = False
-        else:
-            self.bitmap = (
-                'EBDT' in self.ttfont and 'EBLC' in self.ttfont and (
-                    'glyf' not in self.ttfont or not self.ttfont['glyf'].glyphs))
-
-        # Various properties
-        self.italic_angle = 0  # TODO: this should be different
-        self.upem = harfbuzz.hb_face_get_upem(self.hb_face)
-        self.png = harfbuzz.hb_ot_color_has_png(self.hb_face)
-        self.svg = harfbuzz.hb_ot_color_has_svg(self.hb_face)
-        self.stemv = 80
-        self.stemh = 80
-        self.widths = {}
-        self.cmap = {}
-        self.used_in_forms = False
-
-        # Font flags
-        self.flags = 2 ** (3 - 1)  # Symbolic, custom character set
-        if self.style:
-            self.flags += 2 ** (7 - 1)  # Italic
-        if b'Serif' in fields:
-            self.flags += 2 ** (2 - 1)  # Serif
-
-    def clean(self, cmap, hinting):
-        if self.ttfont is None:
-            return
-
-        # Subset font
-        if cmap:
-            optimized_font = io.BytesIO()
-            options = subset.Options(
-                retain_gids=True, passthrough_tables=True,
-                ignore_missing_glyphs=True, hinting=hinting,
-                desubroutinize=True)
-            options.drop_tables += ['GSUB', 'GPOS', 'SVG']
-            subsetter = subset.Subsetter(options)
-            subsetter.populate(gids=cmap)
-            try:
-                subsetter.subset(self.ttfont)
-            except TTLibError:
-                LOGGER.warning('Unable to optimize font')
-            else:
-                self.ttfont.save(optimized_font)
-                self.file_content = optimized_font.getvalue()
-
-        # Transform variable into static font
-        if 'fvar' in self.ttfont:
-            if 'wght' not in self.variations:
-                weight = pango.pango_font_description_get_weight(
-                    self.description)
-                self.variations['wght'] = weight
-            if 'opsz' not in self.variations:
-                self.variations['opsz'] = units_to_double(self.font_size)
-            if 'slnt' not in self.variations:
-                slnt = 0
-                if self.style == 1:
-                    for axe in self.ttfont['fvar'].axes:
-                        if axe.axisTag == 'slnt':
-                            if axe.maxValue == 0:
-                                slnt = axe.minValue
-                            else:
-                                slnt = axe.maxValue
-                            break
-                self.variations['slnt'] = slnt
-            if 'ital' not in self.variations:
-                self.variations['ital'] = int(self.style == 2)
-            partial_font = io.BytesIO()
-            try:
-                ttfont = instantiateVariableFont(self.ttfont, self.variations)
-                for key, (advance, bearing) in ttfont['hmtx'].metrics.items():
-                    if advance < 0:
-                        ttfont['hmtx'].metrics[key] = (0, bearing)
-                ttfont.save(partial_font)
-            except Exception:
-                LOGGER.warning('Unable to mutate variable font')
-            else:
-                self.ttfont = ttfont
-                self.file_content = partial_font.getvalue()
-
-        if not (self.png or self.svg):
-            return
-
-        try:
-            # Add empty glyphs instead of PNG or SVG emojis
-            if 'loca' not in self.ttfont or 'glyf' not in self.ttfont:
-                self.ttfont['loca'] = ttFont.getTableClass('loca')()
-                self.ttfont['glyf'] = ttFont.getTableClass('glyf')()
-                self.ttfont['glyf'].glyphOrder = self.ttfont.getGlyphOrder()
-                self.ttfont['glyf'].glyphs = {
-                    name: ttFont.getTableModule('glyf').Glyph()
-                    for name in self.ttfont['glyf'].glyphOrder}
-            else:
-                for glyph in self.ttfont['glyf'].glyphs:
-                    self.ttfont['glyf'][glyph] = (
-                        ttFont.getTableModule('glyf').Glyph())
-            for table_name in ('CBDT', 'CBLC', 'SVG '):
-                if table_name in self.ttfont:
-                    del self.ttfont[table_name]
-            output_font = io.BytesIO()
-            self.ttfont.save(output_font)
-            self.file_content = output_font.getvalue()
-        except TTLibError:
-            LOGGER.warning('Unable to save emoji font')
-
-    @property
-    def type(self):
-        return 'otf' if self.file_content[:4] == b'OTTO' else 'ttf'
+from ..text.ffi import ffi
+from ..text.fonts import get_pango_font_key
+from .fonts import Font
 
 
 class Stream(pydyf.Stream):
diff --git a/weasyprint/text/ffi.py b/weasyprint/text/ffi.py
index 198bb8ad5..3542828c7 100644
--- a/weasyprint/text/ffi.py
+++ b/weasyprint/text/ffi.py
@@ -12,20 +12,67 @@
     typedef ... hb_font_t;
     typedef ... hb_face_t;
     typedef ... hb_blob_t;
+    typedef uint32_t hb_tag_t;
     typedef uint32_t hb_codepoint_t;
+    hb_tag_t hb_tag_from_string (const char *str, int len);
+    void hb_tag_to_string (hb_tag_t tag, char *buf);
     hb_blob_t * hb_face_reference_blob (hb_face_t *face);
     unsigned int hb_face_get_index (const hb_face_t *face);
     unsigned int hb_face_get_upem (const hb_face_t *face);
     const char * hb_blob_get_data (hb_blob_t *blob, unsigned int *length);
     bool hb_ot_color_has_png (hb_face_t *face);
-    hb_blob_t * hb_ot_color_glyph_reference_png (
-        hb_font_t *font, hb_codepoint_t glyph);
+    hb_blob_t * hb_ot_color_glyph_reference_png (hb_font_t *font, hb_codepoint_t glyph);
     bool hb_ot_color_has_svg (hb_face_t *face);
-    hb_blob_t * hb_ot_color_glyph_reference_svg (
-        hb_face_t *face, hb_codepoint_t glyph);
+    hb_blob_t * hb_ot_color_glyph_reference_svg (hb_face_t *face, hb_codepoint_t glyph);
     void hb_blob_destroy (hb_blob_t *blob);
+    unsigned int hb_face_get_table_tags (
+        const hb_face_t *face, unsigned int start_offset, unsigned int *table_count,
+        hb_tag_t *table_tags);
+    unsigned int hb_face_get_glyph_count (const hb_face_t *face);
+    hb_blob_t * hb_face_reference_table (const hb_face_t *face, hb_tag_t tag);
 
 
+    // HarfBuzz Subset
+
+    typedef ... hb_subset_input_t;
+    typedef ... hb_set_t;
+
+    typedef enum {
+        HB_SUBSET_FLAGS_DEFAULT = 0x00000000u,
+        HB_SUBSET_FLAGS_NO_HINTING = 0x00000001u,
+        HB_SUBSET_FLAGS_RETAIN_GIDS = 0x00000002u,
+        HB_SUBSET_FLAGS_DESUBROUTINIZE = 0x00000004u,
+        HB_SUBSET_FLAGS_NAME_LEGACY = 0x00000008u,
+        HB_SUBSET_FLAGS_SET_OVERLAPS_FLAG = 0x00000010u,
+        HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED = 0x00000020u,
+        HB_SUBSET_FLAGS_NOTDEF_OUTLINE = 0x00000040u,
+        HB_SUBSET_FLAGS_GLYPH_NAMES = 0x00000080u,
+        HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES = 0x00000100u,
+        HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE = 0x00000200u,
+    } hb_subset_flags_t;
+
+    typedef enum {
+        HB_SUBSET_SETS_GLYPH_INDEX = 0,
+        HB_SUBSET_SETS_UNICODE,
+        HB_SUBSET_SETS_NO_SUBSET_TABLE_TAG,
+        HB_SUBSET_SETS_DROP_TABLE_TAG,
+        HB_SUBSET_SETS_NAME_ID,
+        HB_SUBSET_SETS_NAME_LANG_ID,
+        HB_SUBSET_SETS_LAYOUT_FEATURE_TAG,
+        HB_SUBSET_SETS_LAYOUT_SCRIPT_TAG,
+    } hb_subset_sets_t;
+
+    hb_subset_input_t * hb_subset_input_create_or_fail (void);
+    hb_set_t * hb_subset_input_glyph_set (hb_subset_input_t *input);
+    void hb_set_add (hb_set_t *set, hb_codepoint_t codepoint);
+    void hb_set_add_sorted_array (
+        hb_set_t *set, const hb_codepoint_t *sorted_codepoints,
+        unsigned int num_codepoints);
+    hb_face_t * hb_subset_or_fail (hb_face_t *source, const hb_subset_input_t *input);
+    void hb_subset_input_set_flags (hb_subset_input_t *input, unsigned  value);
+    hb_set_t * hb_subset_input_set (
+        hb_subset_input_t *input, hb_subset_sets_t set_type);
+
     // Pango
 
     typedef unsigned int guint;
@@ -402,11 +449,13 @@
 ''')
 
 
-def _dlopen(ffi, *names):
+def _dlopen(ffi, *names, allow_fail=False):
     """Try various names for the same library, for different platforms."""
     for name in names:
         with suppress(OSError):
             return ffi.dlopen(name)
+    if allow_fail:
+        return
     # Re-raise the exception.
     print(
         '\n-----\n\n'
@@ -439,6 +488,10 @@ def _dlopen(ffi, *names):
     ffi, 'harfbuzz', 'harfbuzz-0.0', 'libharfbuzz-0',
     'libharfbuzz.so.0', 'libharfbuzz.so.0', 'libharfbuzz.0.dylib',
     'libharfbuzz-0.dll')
+harfbuzz_subset = _dlopen(
+    ffi, 'harfbuzz-subset', 'harfbuzz-subset-0.0', 'libharfbuzz-subset-0',
+    'libharfbuzz-subset.so.0', 'libharfbuzz-subset.so.0', 'libharfbuzz-subset.0.dylib',
+    'libharfbuzz-subset-0.dll', allow_fail=True)
 fontconfig = _dlopen(
     ffi, 'fontconfig-1', 'fontconfig', 'libfontconfig', 'libfontconfig.so.1',
     'libfontconfig.1.dylib', 'libfontconfig-1.dll')

From 95c47ae9ce8203abd547b74aaedd51ed2421aa47 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Sat, 8 Jun 2024 16:41:57 +0200
Subject: [PATCH 2/3] Clean PDF font management

---
 weasyprint/pdf/fonts.py | 408 +++++++++++++++++++++-------------------
 1 file changed, 210 insertions(+), 198 deletions(-)

diff --git a/weasyprint/pdf/fonts.py b/weasyprint/pdf/fonts.py
index 85f5177f2..6271f8c0f 100644
--- a/weasyprint/pdf/fonts.py
+++ b/weasyprint/pdf/fonts.py
@@ -24,12 +24,10 @@ def __init__(self, pango_font):
 
         pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
         self.description = description = ffi.gc(
-            pango.pango_font_describe(pango_font),
-            pango.pango_font_description_free)
+            pango.pango_font_describe(pango_font), pango.pango_font_description_free)
         self.font_size = pango.pango_font_description_get_size(description)
         self.style = pango.pango_font_description_get_style(description)
-        self.family = ffi.string(
-            pango.pango_font_description_get_family(description))
+        self.family = ffi.string(pango.pango_font_description_get_family(description))
 
         self.variations = {}
         variations = pango.pango_font_description_get_variations(self.description)
@@ -37,29 +35,29 @@ def __init__(self, pango_font):
             self.variations = {
                 part.split('=')[0]: float(part.split('=')[1])
                 for part in ffi.string(variations).decode().split(',')}
-        if 'wght' in self.variations:
+        if weight := self.variations.get('wght'):
             pango.pango_font_description_set_weight(
-                self.description, int(round(self.variations['wght'])))
+                self.description, int(round(weight)))
         if self.variations.get('ital'):
             pango.pango_font_description_set_style(
                 self.description, pango.PANGO_STYLE_ITALIC)
         elif self.variations.get('slnt'):
             pango.pango_font_description_set_style(
                 self.description, pango.PANGO_STYLE_OBLIQUE)
-        if 'wdth' in self.variations:
+        if (width := self.variations.get('wdth')) is not None:
             stretch = min(
                 PANGO_STRETCH_PERCENT.items(),
-                key=lambda item: abs(item[0] - self.variations['wdth']))[1]
+                key=lambda item: abs(item[0] - width))[1]
             pango.pango_font_description_set_stretch(self.description, stretch)
         description_string = ffi.string(
             pango.pango_font_description_to_string(description))
 
-        # Never use the built-in hash function here: it’s not stable
+        # Never use the built-in hash function here: it’s not stable.
         self.hash = ''.join(
             chr(65 + letter % 26) for letter
             in md5(description_string, usedforsecurity=False).digest()[:6])
 
-        # Name
+        # Set font name.
         fields = description_string.split(b' ')
         if fields and b'=' in fields[-1]:
             fields.pop()  # Remove variations
@@ -69,7 +67,7 @@ def __init__(self, pango_font):
             fields = [b'Unknown']
         self.name = b'/' + self.hash.encode() + b'+' + b'-'.join(fields)
 
-        # Ascent & descent
+        # Set ascent and descent.
         if self.font_size:
             self.ascent = int(
                 pango.pango_font_metrics_get_ascent(pango_metrics) /
@@ -80,7 +78,7 @@ def __init__(self, pango_font):
         else:
             self.ascent = self.descent = 0
 
-        # Tables and metadata
+        # Get font tables and set metadata.
         table_count = ffi.new('unsigned int *', 100)
         table_tags = ffi.new('hb_tag_t[100]')
         table_name = ffi.new('char[4]')
@@ -100,7 +98,7 @@ def __init__(self, pango_font):
         self.cmap = {}
         self.used_in_forms = False
 
-        # Font flags
+        # Set font flags.
         self.flags = 2 ** (3 - 1)  # Symbolic, custom character set
         if self.style:
             self.flags += 2 ** (7 - 1)  # Italic
@@ -108,10 +106,12 @@ def __init__(self, pango_font):
             self.flags += 2 ** (2 - 1)  # Serif
 
     def clean(self, cmap, hinting):
+        """Remove useless data from font."""
+
         # Subset font.
         self.subset(cmap, hinting)
 
-        # Transform variable into static font
+        # Transform variable into static font.
         if 'fvar' in self.tables:
             full_font = io.BytesIO(self.file_content)
             ttfont = TTFont(full_font, fontNumber=self.index)
@@ -146,141 +146,151 @@ def clean(self, cmap, hinting):
             else:
                 self.file_content = partial_font.getvalue()
 
-        if not (self.png or self.svg):
-            return
-
-        full_font = io.BytesIO(self.file_content)
-        ttfont = TTFont(full_font, fontNumber=self.index)
-        try:
-            # Add empty glyphs instead of PNG or SVG emojis
-            if 'loca' not in self.tables or 'glyf' not in self.tables:
-                ttfont['loca'] = ttFont.getTableClass('loca')()
-                ttfont['glyf'] = ttFont.getTableClass('glyf')()
-                ttfont['glyf'].glyphOrder = ttfont.getGlyphOrder()
-                ttfont['glyf'].glyphs = {
-                    name: ttFont.getTableModule('glyf').Glyph()
-                    for name in ttfont['glyf'].glyphOrder}
-            else:
-                for glyph in ttfont['glyf'].glyphs:
-                    ttfont['glyf'][glyph] = (
-                        ttFont.getTableModule('glyf').Glyph())
-            for table_name in ('CBDT', 'CBLC', 'SVG '):
-                if table_name in ttfont:
-                    del ttfont[table_name]
-            output_font = io.BytesIO()
-            ttfont.save(output_font)
-            self.file_content = output_font.getvalue()
-        except TTLibError:
-            LOGGER.warning('Unable to save emoji font')
+        # Remove images.
+        if self.png or self.svg:
+            full_font = io.BytesIO(self.file_content)
+            ttfont = TTFont(full_font, fontNumber=self.index)
+            try:
+                # Add empty glyphs instead of PNG or SVG emojis.
+                if 'loca' not in self.tables or 'glyf' not in self.tables:
+                    ttfont['loca'] = ttFont.getTableClass('loca')()
+                    ttfont['glyf'] = ttFont.getTableClass('glyf')()
+                    ttfont['glyf'].glyphOrder = ttfont.getGlyphOrder()
+                    ttfont['glyf'].glyphs = {
+                        name: ttFont.getTableModule('glyf').Glyph()
+                        for name in ttfont['glyf'].glyphOrder}
+                else:
+                    for glyph in ttfont['glyf'].glyphs:
+                        ttfont['glyf'][glyph] = ttFont.getTableModule('glyf').Glyph()
+                for table_name in ('CBDT', 'CBLC', 'SVG '):
+                    if table_name in ttfont:
+                        del ttfont[table_name]
+                output_font = io.BytesIO()
+                ttfont.save(output_font)
+                self.file_content = output_font.getvalue()
+            except TTLibError:
+                LOGGER.warning('Unable to save emoji font')
 
     @property
     def type(self):
         return 'otf' if self.file_content[:4] == b'OTTO' else 'ttf'
 
     def subset(self, cmap, hinting):
+        """Remove unused glyphs and tables from font."""
         if not cmap:
             return
 
         if harfbuzz_subset:
-            hb_subset = harfbuzz_subset.hb_subset_input_create_or_fail()
-
-            # Only keep used glyphs.
-            gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
-            gid_array = ffi.new(f'hb_codepoint_t[{len(cmap)}]', sorted(cmap))
-            harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(cmap))
-
-            # Set flags.
-            flags = (
-                harfbuzz_subset.HB_SUBSET_FLAGS_RETAIN_GIDS |
-                harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
-                harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
-            if not hinting:
-                flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
-            harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
-
-            # Drop useless tables.
-            drop_set = harfbuzz_subset.hb_subset_input_set(
-                hb_subset, harfbuzz_subset.HB_SUBSET_SETS_DROP_TABLE_TAG)
-            drop_tables = tuple(harfbuzz.hb_tag_from_string(name, -1) for name in (
-                b'BASE', b'DSIG', b'EBDT', b'EBLC', b'EBSC', b'GPOS', b'GSUB', b'JSTF',
-                b'LTSH', b'PCLT', b'SVG '))
-            drop_tables_array = ffi.new(
-                f'hb_codepoint_t[{len(drop_tables)}]', drop_tables)
-            harfbuzz.hb_set_add_sorted_array(
-                drop_set, drop_tables_array, len(drop_tables))
-
-            # Subset font.
-            hb_face = harfbuzz_subset.hb_subset_or_fail(self.hb_face, hb_subset)
-
-            # Drop empty glyphs after last one used.
-            gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
-            keep = tuple(range(max(cmap) + 1))
-            gid_array = ffi.new(f'hb_codepoint_t[{len(keep)}]', keep)
-            harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(keep))
-
-            # Set flags.
-            flags = (
-                harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
-                harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
-            if not hinting:
-                flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
-            harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
-
-            # Subset font.
-            hb_face = harfbuzz_subset.hb_subset_or_fail(hb_face, hb_subset)
-
-            # Store new font.
-            if hb_face:
-                file_content = get_hb_object_data(hb_face)
-                if file_content:
-                    self.file_content = file_content
-                    return
-            LOGGER.warning('Unable to subset font with Harfbuzz')
+            self._harfbuzz_subset(cmap, hinting)
         else:
-           full_font = io.BytesIO(self.file_content)
-           optimized_font = io.BytesIO()
-           options = subset.Options(
-               retain_gids=True, passthrough_tables=True, ignore_missing_glyphs=True,
-               hinting=hinting, desubroutinize=True)
-           options.drop_tables += ['GSUB', 'GPOS', 'SVG']
-           subsetter = subset.Subsetter(options)
-           subsetter.populate(gids=cmap)
-           try:
-               ttfont = TTFont(full_font, fontNumber=self.index)
-               subsetter.subset(ttfont)
-           except TTLibError:
-               LOGGER.warning('Unable to subset font with fontTools')
-           else:
-               ttfont.save(optimized_font)
-               self.file_content = optimized_font.getvalue()
-
-
-def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
+            self._fonttools_subset(cmap, hinting)
+
+    def _harfbuzz_subset(self, cmap, hinting):
+        """Subset font with HarfBuzz in two passes, and warn on failure."""
+        hb_subset = harfbuzz_subset.hb_subset_input_create_or_fail()
+
+        # First pass: only keep used glyphs (the keys of cmap).
+        gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
+        gid_array = ffi.new(f'hb_codepoint_t[{len(cmap)}]', sorted(cmap))
+        harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(cmap))
+
+        # Set flags, including RETAIN_GIDS to keep glyph IDs unchanged.
+        flags = (
+            harfbuzz_subset.HB_SUBSET_FLAGS_RETAIN_GIDS |
+            harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
+            harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
+        if not hinting:
+            flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
+        harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
+
+        # Drop useless tables (listed in tag order for hb_set_add_sorted_array).
+        drop_set = harfbuzz_subset.hb_subset_input_set(
+            hb_subset, harfbuzz_subset.HB_SUBSET_SETS_DROP_TABLE_TAG)
+        drop_tables = tuple(harfbuzz.hb_tag_from_string(name, -1) for name in (
+            b'BASE', b'DSIG', b'EBDT', b'EBLC', b'EBSC', b'GPOS', b'GSUB', b'JSTF',
+            b'LTSH', b'PCLT', b'SVG '))
+        drop_tables_array = ffi.new(f'hb_codepoint_t[{len(drop_tables)}]', drop_tables)
+        harfbuzz.hb_set_add_sorted_array(drop_set, drop_tables_array, len(drop_tables))
+
+        # Subset font, keeping glyph IDs.
+        hb_face = harfbuzz_subset.hb_subset_or_fail(self.hb_face, hb_subset)
+
+        # Second pass: drop empty glyphs after last one used, keeping IDs 0..max.
+        gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
+        keep = tuple(range(max(cmap) + 1))
+        gid_array = ffi.new(f'hb_codepoint_t[{len(keep)}]', keep)
+        harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(keep))
+
+        # Set flags again, this time without RETAIN_GIDS.
+        flags = (
+            harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
+            harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
+        if not hinting:
+            flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
+        harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)
+
+        # Subset the result of the first pass.
+        hb_face = harfbuzz_subset.hb_subset_or_fail(hb_face, hb_subset)
+
+        # Store new font, or fall through to the warning below.
+        if hb_face:
+            file_content = get_hb_object_data(hb_face)
+            if file_content:
+                self.file_content = file_content
+                return
+
+        LOGGER.warning('Unable to subset font with Harfbuzz')
+
+    def _fonttools_subset(self, cmap, hinting):
+        """Subset font using fontTools, keep it unchanged and warn on failure."""
+        full_font = io.BytesIO(self.file_content)
+
+        # Set subset options, keeping glyph IDs and dropping more tables.
+        options = subset.Options(
+            retain_gids=True, passthrough_tables=True, ignore_missing_glyphs=True,
+            hinting=hinting, desubroutinize=True)
+        options.drop_tables += ['GSUB', 'GPOS', 'SVG']
+        subsetter = subset.Subsetter(options)
+        subsetter.populate(gids=cmap)
+
+        # Subset font, and only replace the file content on success.
+        try:
+            ttfont = TTFont(full_font, fontNumber=self.index)
+            subsetter.subset(ttfont)
+        except TTLibError:
+            LOGGER.warning('Unable to subset font with fontTools')
+        else:
+            optimized_font = io.BytesIO()
+            ttfont.save(optimized_font)
+            self.file_content = optimized_font.getvalue()
+
+
+def build_fonts_dictionary(pdf, fonts, compress, subset, options):
+    """Build PDF dictionary for fonts."""
     pdf_fonts = pydyf.Dictionary()
     fonts_by_file_hash = {}
     for font in fonts.values():
         fonts_by_file_hash.setdefault(font.hash, []).append(font)
     font_references_by_file_hash = {}
     for file_hash, file_fonts in fonts_by_file_hash.items():
-        # TODO: find why we can have multiple fonts for one font file
+        # TODO: Find why we can have multiple fonts for one font file.
         font = file_fonts[0]
         if font.bitmap:
             continue
 
-        # Clean font, optimize and handle emojis
+        # Clean font, optimize and handle emojis.
         cmap = {}
         if subset and not font.used_in_forms:
             for file_font in file_fonts:
                 cmap = {**cmap, **file_font.cmap}
         font.clean(cmap, options['hinting'])
 
-        # Include font
+        # Include font.
         if font.type == 'otf':
             font_extra = pydyf.Dictionary({'Subtype': '/OpenType'})
         else:
             font_extra = pydyf.Dictionary({'Length1': len(font.file_content)})
-        font_stream = pydyf.Stream(
-            [font.file_content], font_extra, compress=compress_pdf)
+        font_stream = pydyf.Stream([font.file_content], font_extra, compress=compress)
         pdf.add_object(font_stream)
         font_references_by_file_hash[file_hash] = font_stream.reference
 
@@ -301,18 +311,6 @@ def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
                 width = ttfont.getGlyphSet()[key].width
                 font_widths[glyph] = width * 1000 / font.upem
 
-        max_x = max(font_widths.values()) if font_widths else 0
-        bbox = (0, font.descent, max_x, font.ascent)
-
-        widths = pydyf.Array()
-        for i in sorted(font_widths):
-            if i - 1 not in font_widths:
-                widths.append(i)
-                current_widths = pydyf.Array()
-                widths.append(current_widths)
-            current_widths.append(font_widths[i])
-
-        font_file = f'FontFile{3 if font.type == "otf" else 2}'
         to_unicode = pydyf.Stream([
             b'/CIDInit /ProcSet findresource begin',
             b'12 dict begin',
@@ -327,7 +325,7 @@ def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
             b'1 begincodespacerange',
             b'<0000> <ffff>',
             b'endcodespacerange',
-            f'{len(cmap)} beginbfchar'.encode()], compress=compress_pdf)
+            f'{len(cmap)} beginbfchar'.encode()], compress=compress)
         for glyph, text in cmap.items():
             unicode_codepoints = ''.join(
                 f'{letter.encode("utf-16-be").hex()}' for letter in text)
@@ -349,64 +347,18 @@ def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
 
         if font.bitmap:
             _build_bitmap_font_dictionary(
-                font_dictionary, pdf, font, widths, compress_pdf, subset)
+                font_dictionary, pdf, font, font_widths, compress, subset)
         else:
-            flags = font.flags
-            if len(widths) > 1 and len(set(font.widths.values())) == 1:
-                flags += 2 ** (1 - 1)  # FixedPitch
-            font_descriptor = pydyf.Dictionary({
-                'Type': '/FontDescriptor',
-                'FontName': font.name,
-                'FontFamily': pydyf.String(font.family),
-                'Flags': flags,
-                'FontBBox': pydyf.Array(bbox),
-                'ItalicAngle': font.italic_angle,
-                'Ascent': font.ascent,
-                'Descent': font.descent,
-                'CapHeight': bbox[3],
-                'StemV': font.stemv,
-                'StemH': font.stemh,
-                font_file: font_references_by_file_hash[font.hash],
-            })
-            if str(options['pdf_version']) <= '1.4':  # Cast for bytes and None
-                cids = sorted(font.widths)
-                padded_width = int(ceil((cids[-1] + 1) / 8))
-                bits = ['0'] * padded_width * 8
-                for cid in cids:
-                    bits[cid] = '1'
-                stream = pydyf.Stream(
-                    (int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
-                    compress=compress_pdf)
-                pdf.add_object(stream)
-                font_descriptor['CIDSet'] = stream.reference
-            if font.type == 'otf':
-                font_descriptor['Subtype'] = '/OpenType'
-            pdf.add_object(font_descriptor)
-            subfont_dictionary = pydyf.Dictionary({
-                'Type': '/Font',
-                'Subtype': f'/CIDFontType{0 if font.type == "otf" else 2}',
-                'BaseFont': font.name,
-                'CIDSystemInfo': pydyf.Dictionary({
-                    'Registry': pydyf.String('Adobe'),
-                    'Ordering': pydyf.String('Identity'),
-                    'Supplement': 0,
-                }),
-                'CIDToGIDMap': '/Identity',
-                'W': widths,
-                'FontDescriptor': font_descriptor.reference,
-            })
-            pdf.add_object(subfont_dictionary)
-            font_dictionary['Encoding'] = '/Identity-H'
-            font_dictionary['DescendantFonts'] = pydyf.Array(
-                [subfont_dictionary.reference])
+            _build_vector_font_dictionary(
+                font_dictionary, pdf, font, font_widths, compress,
+                font_references_by_file_hash[font.hash], options['pdf_version'])
         pdf.add_object(font_dictionary)
         pdf_fonts[font.hash] = font_dictionary.reference
 
     return pdf_fonts
 
 
-def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
-                                  compress_pdf, subset):
+def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths, compress, subset):
     # https://docs.microsoft.com/typography/opentype/spec/ebdt
     font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
     font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
@@ -415,14 +367,13 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
     else:
         chars = tuple(range(256))
     first, last = chars[0], chars[-1]
+    differences = []
+    for glyph in sorted(widths):
+        if glyph - 1 not in widths:
+            differences.append(glyph)
+        differences.append(f'/{glyph}')
     font_dictionary['FirstChar'] = first
     font_dictionary['LastChar'] = last
-    differences = []
-    for index, index_widths in zip(widths[::2], widths[1::2]):
-        differences.append(index)
-        for i in range(len(index_widths)):
-            if i + index in chars:
-                differences.append(f'/{i + index}')
     font_dictionary['Encoding'] = pydyf.Dictionary({
         'Type': '/Encoding',
         'Differences': pydyf.Array(differences),
@@ -437,7 +388,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
         glyph_format = glyph.getFormat()
         glyph_id = ttfont.getGlyphID(key)
 
-        # Get and store glyph metrics
+        # Get and store glyph metrics.
         if glyph_format == 5:
             data = glyph.data
             subtables = ttfont['EBLC'].strikes[0].indexSubTables
@@ -474,7 +425,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
             'subglyphs': None,
         }
 
-        # Decode bitmaps
+        # Decode bitmaps.
         if 0 in (width, height) or not data:
             glyph_info['bitmap'] = b''
         elif glyph_format in (1, 6):
@@ -486,8 +437,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
             bitmap_bits = ''.join(
                 bits[i * width:(i + 1) * width] + padding * '0'
                 for i in range(height))
-            glyph_info['bitmap'] = int(bitmap_bits, 2).to_bytes(
-                height * stride, 'big')
+            glyph_info['bitmap'] = int(bitmap_bits, 2).to_bytes(height * stride, 'big')
         elif glyph_format in (8, 9):
             subglyphs = glyph_info['subglyphs'] = []
             i = 0 if glyph_format == 9 else 1
@@ -503,11 +453,11 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
             glyph_info['bitmap'] = bytes(height * stride)
 
     for glyph_id, glyph_info in glyphs_info.items():
-        # Don’t store glyph not in cmap
+        # Don’t store glyph not in cmap.
         if glyph_id not in chars:
             continue
 
-        # Draw glyph
+        # Draw glyph.
         stride = glyph_info['stride']
         width = glyph_info['width']
         height = glyph_info['height']
@@ -525,9 +475,8 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
                     continue
                 subglyph = glyphs_info[sub_id]
                 if subglyph['bitmap'] is None:
-                    # TODO: support subglyph in subglyph
-                    LOGGER.warning(
-                        f'Unsupported subglyph in subglyph: {sub_id}')
+                    # TODO: Support subglyph in subglyph.
+                    LOGGER.warning(f'Unsupported subglyph in subglyph: {sub_id}')
                     continue
                 for row_y in range(subglyph['height']):
                     row_slice = slice(
@@ -561,10 +510,73 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
             b'/BPC 1',
             b'/D [1 0]',
             b'ID', bitmap, b'EI'
-        ], compress=compress_pdf)
+        ], compress=compress)
         pdf.add_object(bitmap_stream)
         char_procs[glyph_id] = bitmap_stream.reference
 
     pdf.add_object(char_procs)
     font_dictionary['Widths'] = pydyf.Array(widths)
     font_dictionary['CharProcs'] = char_procs.reference
+
+
+def _build_vector_font_dictionary(font_dictionary, pdf, font, widths, compress,
+                                  reference, pdf_version):
+    font_file = f'FontFile{3 if font.type == "otf" else 2}'  # Descriptor key.
+    max_x = max(widths.values()) if widths else 0
+    bbox = (0, font.descent, max_x, font.ascent)  # Built from metrics only.
+    flags = font.flags
+    if len(widths) > 1 and len(set(font.widths.values())) == 1:
+        flags += 2 ** (1 - 1)  # FixedPitch
+    font_descriptor = pydyf.Dictionary({
+        'Type': '/FontDescriptor',
+        'FontName': font.name,
+        'FontFamily': pydyf.String(font.family),
+        'Flags': flags,
+        'FontBBox': pydyf.Array(bbox),
+        'ItalicAngle': font.italic_angle,
+        'Ascent': font.ascent,
+        'Descent': font.descent,
+        'CapHeight': bbox[3],
+        'StemV': font.stemv,
+        'StemH': font.stemh,
+        font_file: reference,
+    })
+    if str(pdf_version) <= '1.4':  # Cast for bytes and None
+        cids = sorted(font.widths)
+        padded_width = int(ceil((cids[-1] + 1) / 8))  # Size in bytes.
+        bits = ['0'] * padded_width * 8  # One bit per CID, set when used.
+        for cid in cids:
+            bits[cid] = '1'
+        stream = pydyf.Stream(
+            (int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
+            compress=compress)
+        pdf.add_object(stream)
+        font_descriptor['CIDSet'] = stream.reference
+    if font.type == 'otf':
+        font_descriptor['Subtype'] = '/OpenType'
+    pdf.add_object(font_descriptor)
+
+    pdf_widths = pydyf.Array()  # [first CID, [widths of consecutive CIDs], ...]
+    for i in sorted(widths):
+        if i - 1 not in widths:
+            pdf_widths.append(i)
+            current_widths = pydyf.Array()
+            pdf_widths.append(current_widths)
+        current_widths.append(widths[i])
+
+    subfont_dictionary = pydyf.Dictionary({
+        'Type': '/Font',
+        'Subtype': f'/CIDFontType{0 if font.type == "otf" else 2}',
+        'BaseFont': font.name,
+        'CIDSystemInfo': pydyf.Dictionary({
+            'Registry': pydyf.String('Adobe'),
+            'Ordering': pydyf.String('Identity'),
+            'Supplement': 0,
+        }),
+        'CIDToGIDMap': '/Identity',
+        'W': pdf_widths,
+        'FontDescriptor': font_descriptor.reference,
+    })
+    pdf.add_object(subfont_dictionary)
+    font_dictionary['Encoding'] = '/Identity-H'
+    font_dictionary['DescendantFonts'] = pydyf.Array([subfont_dictionary.reference])

From 5485d2edf67f391227a780a7523476aee0a36482 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Sat, 8 Jun 2024 16:58:58 +0200
Subject: [PATCH 3/3] Update documentation to advertise harfbuzz-subset

---
 docs/first_steps.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/first_steps.rst b/docs/first_steps.rst
index 3abee5ee8..ce5daf547 100644
--- a/docs/first_steps.rst
+++ b/docs/first_steps.rst
@@ -70,7 +70,7 @@ in a `virtual environment`_ using `pip`_::
 .. _pip: https://pip.pypa.io/
 
 
-Alpine ≥ 3.14
+Alpine ≥ 3.17
 +++++++++++++
 
 To install WeasyPrint using your distribution’s package::
@@ -80,12 +80,12 @@ To install WeasyPrint using your distribution’s package::
 To install WeasyPrint inside a virtualenv using wheels (if possible), you need
 the following packages::
 
-  apk add py3-pip gcc musl-dev python3-dev pango zlib-dev jpeg-dev openjpeg-dev g++ libffi-dev
+  apk add py3-pip gcc musl-dev python3-dev pango zlib-dev jpeg-dev openjpeg-dev g++ libffi-dev harfbuzz-subset
 
 To install WeasyPrint inside a virtualenv without using wheels, you need the
 following packages::
 
-  apk add py3-pip gcc musl-dev python3-dev pango zlib-dev jpeg-dev openjpeg-dev g++ libffi-dev
+  apk add py3-pip gcc musl-dev python3-dev pango zlib-dev jpeg-dev openjpeg-dev g++ libffi-dev harfbuzz-subset
 
 
 Archlinux
@@ -116,15 +116,15 @@ To install WeasyPrint using your distribution’s package::
 To install WeasyPrint inside a virtualenv using wheels (if possible), you need
 the following packages::
 
-  apt install python3-pip libpango-1.0-0 libpangoft2-1.0-0
+  apt install python3-pip libpango-1.0-0 libpangoft2-1.0-0 libharfbuzz-subset0
 
 To install WeasyPrint inside a virtualenv without using wheels, you need the
 following packages::
 
-  apt install python3-pip libpango-1.0-0 libpangoft2-1.0-0 libjpeg-dev libopenjp2-7-dev libffi-dev
+  apt install python3-pip libpango-1.0-0 libpangoft2-1.0-0 libharfbuzz-subset0 libjpeg-dev libopenjp2-7-dev libffi-dev
 
 
-Fedora ≥ 34
+Fedora ≥ 39
 +++++++++++
 
 To install WeasyPrint using your distribution’s package::
@@ -152,12 +152,12 @@ To install WeasyPrint using your distribution’s package::
 To install WeasyPrint inside a virtualenv using wheels (if possible), you need
 the following packages::
 
-  apt install python3-pip libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0
+  apt install python3-pip libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libharfbuzz-subset0
 
 To install WeasyPrint inside a virtualenv without using wheels, you need the
 following packages::
 
-  apt install python3-pip libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libffi-dev libjpeg-dev libopenjp2-7-dev
+  apt install python3-pip libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libharfbuzz-subset0 libffi-dev libjpeg-dev libopenjp2-7-dev
 
 
 macOS