From 8f6da705f4f4ad92f7db3ed0dac11b0559b9c096 Mon Sep 17 00:00:00 2001
From: Lasse Fister <commander@graphicore.de>
Date: Fri, 11 Sep 2015 03:09:41 +0200
Subject: [PATCH] Improved KernFeatureWriter for LTR/RTL kerning

This implements what was discussed here https://github.com/unified-font-object/ufo-spec/issues/16#issuecomment-120036174

The results have been used but are not yet approved by anyone :-)
This is the output file: https://github.com/Tarobish/Jomhuria/blob/master/sources/kerning.fea

@behdad wrote:

> * Generate both an LTR kern lookup and an RTL one; including all kerning pairs in both.
> * Reference the RTL lookup from script systems that are RTL, and the LTR one for others.
>
>That's the crudest heuristic, but it works. From there on, you can add more complexity to achieve smaller kern tables. But none of these really belongs into the UFO per se. For example, a huge improvement:
>
> * Associate each glyph to a Unicode character,
> * Exclude from RTL kern table all glyphs associated with Unicode characters that have Bidi_Type=L,
> * Exclude from LTR kern table all glyphs associated with Unicode characters that have Bidi_Type=R or Bidi_Type=AL.
---
 tools/getKernFeatureFromUFO.py | 523 ++++++++++++++++++++++++++++++++-
 1 file changed, 513 insertions(+), 10 deletions(-)

diff --git a/tools/getKernFeatureFromUFO.py b/tools/getKernFeatureFromUFO.py
index 58573b630..6b33ebd9a 100755
--- a/tools/getKernFeatureFromUFO.py
+++ b/tools/getKernFeatureFromUFO.py
@@ -1,14 +1,203 @@
 #! /usr/bin/env python
+from __future__ import print_function
 import sys
+import re
+import unicodedata2 as unicodedata
+from fontTools import agl
+
 from defcon import Font
-from ufo2fdk.kernFeatureWriter import KernFeatureWriter, inlineGroupInstance
+from ufo2fdk.kernFeatureWriter import KernFeatureWriter, inlineGroupInstance \
+                                    , side1FeaPrefix, side2FeaPrefix
+
+def info(*objs):
+    print('INFO: ', *objs, file=sys.stderr)
+
+def warning(*objs):
+    print('WARNING: ', *objs, file=sys.stderr)
+
+def scriptGetHorizontalDirection (script):
+    """
+    from: https://github.com/behdad/harfbuzz/blob/master/src/hb-common.cc#L446
+
+    see: http://goo.gl/x9ilM
+    see also: http://unicode.org/iso15924/iso15924-codes.html
+    """
+    if script.lower() in (
+                # Unicode-1.1 additions
+                  'arab' # ARABIC
+                , 'hebr' # HEBREW
+
+                # Unicode-3.0 additions
+                , 'syrc' # SYRIAC
+                , 'thaa' # THAANA
+
+                # Unicode-4.0 additions
+                , 'cprt' # CYPRIOT
+
+                # Unicode-4.1 additions
+                , 'khar' # KHAROSHTHI
+
+                # Unicode-5.0 additions
+                , 'phnx' # PHOENICIAN
+                , 'nkoo' # NKO
+
+                # Unicode-5.1 additions
+                , 'lydi' # LYDIAN
+
+                # Unicode-5.2 additions
+                , 'avst' # AVESTAN
+                , 'armi' # IMPERIAL_ARAMAIC
+                , 'phli' # INSCRIPTIONAL_PAHLAVI
+                , 'prti' # INSCRIPTIONAL_PARTHIAN
+                , 'sarb' # OLD_SOUTH_ARABIAN
+                , 'orkh' # OLD_TURKIC
+                , 'samr' # SAMARITAN
+
+                # Unicode-6.0 additions
+                , 'mand' # MANDAIC
+
+                # Unicode-6.1 additions
+                , 'merc' # MEROITIC_CURSIVE
+                , 'mero' # MEROITIC_HIEROGLYPHS
+
+                # Unicode-7.0 additions
+                , 'mani' # MANICHAEAN
+                , 'mend' # MENDE_KIKAKUI
+                , 'nbat' # NABATAEAN
+                , 'narb' # OLD_NORTH_ARABIAN
+                , 'palm' # PALMYRENE
+                , 'phlp' # PSALTER_PAHLAVI
+                # Unicode-8.0 additions
+                , 'hung' # OLD_HUNGARIAN
+                ):
+        return 'RTL'
+    return 'LTR'
+
+
+
+class KernFeatureWriterWithHorizontalDirections(KernFeatureWriter):
+    def __init__(self, font, scripts, groupNamePrefix='', **options):
+        """
+        font: a defcon font
+        scripts: non-empty iterable of four-letter unicode script codes, e.g. ['arab', 'latn'].
+
+        options:
+            simpleImplementation: bool, default False
+                True:
+                    Generate both an LTR kern lookup and an RTL one; including all kerning pairs in both.
+                    Reference the RTL lookup from script systems that are RTL, and the LTR one for others.
+                False (default value):
+                    The same as "True" but try to eliminate pairs from the RTL and LTR lookup to reduce
+                    kern table size.
+
+                see: https://github.com/unified-font-object/ufo-spec/issues/16#issuecomment-120036174
+
+            ignoreMissingUnicodeInGroupItems: bool, default True
+                True (default value):
+                    create "LTR" or "RTL" pure groups if the reason to be
+                    "MIX" would be only glyphs that don't yield a unicode/direction
+                False:
+                    create "MIX" rules if a group has members without
+                    unicode and with unicode, but is otherwise homogeneous
+                    "LTR" or "RTL"
+
+                Details:
+                For each side of a pair the method getPairItemDirection
+                returns one of: "LTR", "RTL", "MIX", False
+                False: discards the kerning pair completely.
+                MIX: creates two rules for the pair in both script directions
+                LTR and RTL: create just one lookup in the respective direction.
+
+                If the pair-item is a glyph-group each member is evaluated.
+                If the items of a group yield different values "MIX" is
+                returned. For the filesize however "LTR" or "RTL" is preferable.
+                A group member that yields `False` can change a homogeneous
+                group into a "MIX" group. In case of the Jomhuria input
+                kerning data this happens, because we can't determine a
+                unicode for the glyph/name. However, the glyphs in that font
+                that don't yield a useful unicode are not used in text layout
+                (they are not inserted by fea `sub` rules). They also
+                shouldn't be in the kerning data at all (but they are). Thus
+                it's better to ignore these cases and produce more rules
+                with homogeneous "RTL" or "LTR" value.
+
+                Hence the more aggressive option is the default here.
+                Glyphs that happen in the text layout that don't yield
+                unicode values (we use several approaches here) are not
+                desirable in general (e.g. it breaks pdf text extraction).
+
+            createPureLookups: bool, default True
+                True (default value):
+                    Keep the "mixed rules" and the "pure rules" of a direction
+                    separated, by creating up to two lookups per direction.
+                False:
+                    Merge the "mixed rules" and the "pure rules" of a direction
+                    into one lookup per direction.
+
+                Helps when analyzing the result of this script by preserving
+                the data of pure vs mixed pairs.
+
+            usePureLookupsInDFLT: bool, default False
+                True:
+                    Use the "pure lookups" for `script DFLT` not in the scripts
+                    defined by the `scripts` argument.
+                False (default value):
+                    Use the "pure lookups" in the scripts with the same direction
+                    as the lookup has, not as DFLT for all scripts.
 
+                    Default because of: https://github.com/unified-font-object/ufo-spec/issues/16#issuecomment-120036174
 
+                NOTE: Has only an effect if "createPureLookups" is true.
 
-class KernFeatureWriterRTL(KernFeatureWriter):
-    def getFeatureRulesForPairs(self, pairs):
+                This may be useful for research and also may help to find/fix
+                possible bugs.
+                The direction of the kerning pairs in the "pure lookups"
+                is in theory unambiguous. They should always be kerned by
+                the shaping engine in the direction that is foreseen by
+                this code. Putting them into DFLT may or may not be better
+                depending on how the fea "script" tags are processed and
+                if there are effects by the metadata of the text that is
+                processed, i.e. if the user tags the text as some script
+                that we don't define here.
+                AFAIK a glyph can be associated with many different scripts
+                in unicode, so it may be hard to determine the right kern
+                lookup sometimes. (The more data in DFLT, the better?)
         """
-        Write pair rules to a list of strings.
+        # TODO: There may be a way to find out the scripts to use by looking
+        # at the font data. Unicode should be capable of providing that
+        # information, but there seems to be no ready to use implementation
+        # for it.
+        self.scripts = _scripts = {}
+        if not scripts:
+            raise TypeError('Need at least one script in the "scripts" argument iterable.')
+
+        for script in scripts:
+            script = script.lower()
+            writingDir = scriptGetHorizontalDirection(script)
+            if writingDir not in _scripts:
+                _scripts[writingDir] = []
+            _scripts[writingDir].append(script)
+
+        self.scriptDirs = self.scripts.keys()
+
+        self.options = {}
+        for k, default in self._optionDefaults.iteritems():
+            self.options[k] = options.get(k, default)
+
+        super(KernFeatureWriterWithHorizontalDirections, self).__init__(font, groupNamePrefix)
+
+    _optionDefaults = {
+          'simpleImplementation': False
+        , 'ignoreMissingUnicodeInGroupItems':  True
+        , 'createPureLookups': True
+        , 'usePureLookupsInDFLT': False
+    }
+
+    getFeatureRulesForPairsLTR = KernFeatureWriter.getFeatureRulesForPairs
+
+    def getFeatureRulesForPairsRTL(self, pairs):
+        """
+        Write RTL pair rules to a list of strings.
 
         You should not call this method directly.
         """
@@ -17,17 +206,331 @@ def getFeatureRulesForPairs(self, pairs):
             if not side1 or not side2:
                 continue
             if isinstance(side1, inlineGroupInstance) or isinstance(side2, inlineGroupInstance):
-                line = "enum pos {0:s} {1:s} <{2:d} 0 {2:d} 0>;"
+                line = 'enum pos {0:s} {1:s} <{2:d} 0 {2:d} 0>;'
             else:
-                line = "pos {0:s} {1:s} <{2:d} 0 {2:d} 0>;"
+                line = 'pos {0:s} {1:s} <{2:d} 0 {2:d} 0>;'
             if isinstance(side1, inlineGroupInstance):
-                side1 = "[%s]" % " ".join(sorted(side1))
+                side1 = '[%s]' % ' '.join(sorted(side1))
             if isinstance(side2, inlineGroupInstance):
-                side2 = "[%s]" % " ".join(sorted(side2))
+                side2 = '[%s]' % ' '.join(sorted(side2))
             rules.append(line.format(side1, side2, value))
         return rules
 
+
+    def getWritingDirRules(self, writingDir, pairs):
+        """
+            Return a list of the kerning rules and some comments
+        """
+        glyphGlyph, glyphGroupDecomposed, groupGlyphDecomposed, glyphGroup, groupGlyph, groupGroup = self.getSeparatedPairs(pairs)
+        order = [
+            ('# glyph, glyph', glyphGlyph),
+            ('# glyph, group exceptions', glyphGroupDecomposed),
+            ('# group exceptions, glyph', groupGlyphDecomposed),
+            ('# glyph, group', glyphGroup),
+            ('# group, glyph', groupGlyph),
+            ('# group, group', groupGroup),
+        ]
+
+        getFeatureRulesForPairs = self.getFeatureRulesForPairsLTR \
+                            if writingDir == 'LTR' \
+                            else self.getFeatureRulesForPairsRTL
+
+        rules = []
+        for note, pairs in order:
+            if pairs:
+                rules.append(note)
+                rules += getFeatureRulesForPairs(pairs)
+                rules.append('')
+        # remove the last empty line
+        if rules and rules[-1] == '': rules.pop();
+        return rules
+
+    def getUnicodeFromGlyphName(self, name):
+        # Ask the font
+        glyph = self.font[name]
+        if glyph.unicode:
+            return unichr(glyph.unicode)
+
+        # Names can be constructed like uni1234_uni4567.old or f_i.swash
+        # since we need the unicode value only to determine a direction,
+        # the first glyph name should be enough.
+        firstNamePart = name.split('.', 1)[0].split('_', 1)[0]
+
+        # Ask AGLFN
+        if firstNamePart in agl.AGL2UV:
+            return unichr(agl.AGL2UV[firstNamePart])
+
+        # Try to parse the name into a unicode value
+        # matches things like u1EE29* or uni1234*
+        match = re.match('^(uni|u)([A-F0-9]{4,}).*', firstNamePart, re.IGNORECASE)
+        if match:
+            code = int(match.group(2), 16)
+            return unichr(code)
+
+        # no luck
+        return False
+
+    def getPairItemDirection(self, item):
+        """
+        Return one of: "LTR", "RTL", "MIX", False
+
+        Note that when mentioning "groups" that includes single glyphs.
+        A single item is treated as a group with just one member.
+
+        This was the initial description
+
+        * Associate each glyph to a Unicode character,
+        * Exclude from RTL kern table all glyphs associated
+                with Unicode characters that have Bidi_Type=L,
+        * Exclude from LTR kern table all glyphs associated
+               with Unicode characters that have Bidi_Type=R or Bidi_Type=AL.
+
+        However, the exclusion model got altered into an inclusion model
+        because groups could define mixed directions and thus would have
+        to stay in all kern tables.
+
+        An alternative way could be to break up the groups and reorder
+        them to create a couple of better defined groups.
+
+        FILTERING:
+        Only if all members of a group are `False` the whole group and
+        subsequently the pair is dismissed. Alternatively we could dismiss
+        the whole group when at least one member is `False`. But I believe
+        that would create more problems.
+        Rather: we should filter the group contents much earlier in
+        KernFeatureWriter.getPairs or KernFeatureWriter.getGroups
+
+        See also: the docs for options.ignoreMissingUnicodeInGroupItems
+                  in __init__
+        """
+        # Note: KernFeatureWriter.getPairs should already have filtered
+        # empty groups and nonexisting pairs, thus it is not checked here.
+        if item.startswith(side1FeaPrefix) or item.startswith(side2FeaPrefix):
+            groups = self.side1Groups if item.startswith(side1FeaPrefix) \
+                                      else self.side2Groups
+            names = groups[item]
+        else:
+            names = [item]
+
+        writingDirs = set()
+        for name in names:
+            unicodeChar = self.getUnicodeFromGlyphName(name)
+            # http://unicode.org/reports/tr9/#Table_Bidirectional_Character_Types
+            if unicodeChar == False:
+                if not self.options['ignoreMissingUnicodeInGroupItems']:
+                    # may result in a "MIX" pair if this is a bigger group
+                    writingDirs.add(False)
+                continue
+            bidiType = unicodedata.bidirectional(unicodeChar)
+
+            # L: Left-to-Right
+            if bidiType == 'L':
+                writingDirs.add('LTR')
+            # R: Right-to-Left
+            # AL: Right-to-Left Arabic
+            elif bidiType in ('R', 'AL'):
+                writingDirs.add('RTL')
+            # AN: Arabic Number
+            # FIXME: remove bidiType == 'AN' from kerning? This
+            # filtering should rather happen earlier, somewhere in
+            # KernFeatureWriter.getPairs or KernFeatureWriter.getGroups
+            # elif bidiType == 'AN':
+            #    writingDirs.add(False)
+            else:
+                writingDirs.add('MIX')
+
+        # if writingDirs.has(False)
+        #     return False
+        if len(writingDirs) > 1:
+            return 'MIX'
+        elif len(writingDirs) == 0:
+            return False
+        return writingDirs.pop() # "LTR", "RTL", "MIX" or False
+
+    def getPairData(self, pair):
+        """
+        Returns writingDir for the pair
+
+        writingDir may be "LTR", "RTL", "MIX" or False
+
+        "LTR" and "RTL": The pair goes only into the respective direction lookup.
+        "MIX": A pair with mixed entry types, goes into the lookups for both directions
+        False: The pair is removed from kerning
+        """
+
+        side1writingDir, side2writingDir = [self.getPairItemDirection(side)
+                                for side in pair]
+
+        if not side1writingDir or not side2writingDir:
+            # Filtered/Removed
+            writingDir = False
+        elif side1writingDir in ['LTR', 'RTL'] and side1writingDir == side2writingDir:
+            writingDir = side1writingDir
+        else:
+            writingDir = 'MIX'
+
+        return writingDir
+
+    def getPairsData(self, pairs):
+        """
+        Returns: (purePairs, mixedPairs)
+
+        purePairs = {'LTR': { *dict of pairdata* },'RTL': { *dict of pairdata* }}
+        mixedPairs = { *dict of pairdata* }
+        """
+
+        purePairs = {'LTR': {},'RTL': {}}
+        mixedPairs = {}
+
+        if self.options['simpleImplementation']:
+            mixedPairs.update(pairs)
+            return purePairs, mixedPairs
+
+        # try to reduce kern table size
+        for pair, value in pairs.iteritems():
+            writingDir = self.getPairData(pair)
+            if not writingDir:
+                continue
+            elif writingDir == 'MIX':
+                mixedPairs[pair] = value
+            else:
+                purePairs[writingDir][pair] = value
+        return purePairs, mixedPairs
+
+    def createSeparatedPureAndMixedLookups(self, purePairs, mixedPairs):
+        lookups = {}
+        for scriptDir, pairs in purePairs.iteritems():
+            if len(pairs):
+                label = 'kernPure{0}'.format(scriptDir)
+                lookups[label] = self.getWritingDirRules(scriptDir, pairs)
+        if len(mixedPairs):
+            for scriptDir in ['LTR', 'RTL']:
+                label = 'kernMixed{0}'.format(scriptDir)
+                lookups[label] = self.getWritingDirRules(scriptDir, mixedPairs)
+        return lookups
+
+    def createUnifiedPureAndMixedLookups(self, purePairs, mixedPairs):
+        lookups = {}
+        for scriptDir in ['LTR', 'RTL']:
+            unifiedPairs = {}
+            if scriptDir in purePairs:
+                unifiedPairs.update(purePairs[scriptDir])
+            unifiedPairs.update(mixedPairs)
+            if not len(unifiedPairs):
+                continue;
+            label = 'kernMixed{0}'.format(scriptDir)
+            lookups[label] = self.getWritingDirRules(scriptDir, unifiedPairs)
+        return lookups
+
+    def getLookupData(self, pairs):
+        """
+        Returns lookups, directions.
+
+        lookups = {lookupLabel: [rules]}
+        directions = { (lookupLabel, lookupLabel, ...): [scripts]}
+        """
+        purePairs, mixedPairs = self.getPairsData(pairs)
+
+        if self.options['createPureLookups']:
+            lookups = self.createSeparatedPureAndMixedLookups(purePairs, mixedPairs)
+        else:
+            lookups = self.createUnifiedPureAndMixedLookups(purePairs, mixedPairs)
+
+        directions = {}
+        for scriptDir, scripts in self.scripts.iteritems():
+            if len(scripts):
+                labels = []
+                if not self.options['usePureLookupsInDFLT']:
+                    # kernPure* lookups must also be present in lookups
+                    # to get used.
+                    labels.append('kernPure{0}'.format(scriptDir))
+                labels.append('kernMixed{0}'.format(scriptDir))
+                directions[tuple(labels)] = scripts
+
+        return lookups, directions
+
+    def compileKern(self, headerText, classes, lookups, directions):
+        # line indentation
+        lineFormat = '    {0}'
+        def makeLines (lines):
+            return [lineFormat.format(line) for line in lines]
+        # lookup definition
+        lookupOpenFormat = 'lookup {label} {{'
+        lookupCloseFormat = '}} {label};'
+        # lookup usage
+        lookupUsageFormat = 'lookup {label};'
+
+        # write the lookups
+        feature = []
+
+        # add kerning classes
+        if classes:
+            feature.append('# kerning classes')
+            feature += classes
+            feature.append('')
+
+        for label, rules in lookups.iteritems():
+            feature.append(lookupOpenFormat.format(label=label))
+            feature += makeLines(rules)
+            feature.append(lookupCloseFormat.format(label=label))
+            feature.append('')
+
+        # write the feature
+        feature.append('feature kern {')
+        if headerText:
+            for line in headerText.splitlines():
+                line = line.strip()
+                if not line.startswith('#'):
+                    line = '# ' + line
+                feature.append(lineFormat.format(line))
+
+        # the usage of the lookups
+        usage = []
+        if self.options['createPureLookups'] and self.options['usePureLookupsInDFLT']:
+            # put kernPure* into the DFLT script
+            pureLabels = ['kernPure{0}'.format(script) for script in self.scripts]
+            for label in pureLabels:
+                if label not in lookups:
+                    continue
+                usage.append(lookupUsageFormat.format(label=label))
+
+        # Use the lookups in their specific scripts
+        for labels, scripts in directions.iteritems():
+            lookupReferences = [lookupUsageFormat.format(label=label)
+                                for label in labels if label in lookups]
+            if not lookupReferences:
+                # no actual lookups present for the label
+                continue
+            for script in scripts:
+                usage.append('script {0};'.format(script))
+                usage += lookupReferences
+
+        feature += makeLines(usage)
+        feature.append('} kern;')
+        return '\n'.join(feature)
+
+    def write(self, headerText=None):
+        """
+        Write the feature text. If *headerText* is provided
+        it will be inserted after the ``feature kern {`` line.
+        """
+        if not self.pairs:
+            return ''
+
+        lineFormat = '    {0}'
+
+        # get the classes
+        groups = dict(self.side1Groups)
+        groups.update(self.side2Groups)
+        classes = self.getClassDefinitionsForGroups(groups)
+
+        # get the rules
+        lookups, directions = self.getLookupData(self.pairs)
+        return self.compileKern(headerText, classes, lookups, directions)
+
 if __name__ == '__main__':
     font = Font(path=sys.argv[1])
-    kfw = KernFeatureWriterRTL(font)
-    print kfw.write()
+    scripts = ['arab', 'latn']
+
+    kfw = KernFeatureWriterWithHorizontalDirections(font, scripts)
+    print(kfw.write())