write multiple Bidi_Class @missing lines (#267)

* refactor Bidi_Class defaults into new DefaultValues
* write multiple Bidi_Class `@missing` lines
* omit explicit lines for unassigned code points with default values
* props parser: handle multiple `@missing` lines
unicode-org · May 31, 2022 · ceb29e7 · ceb29e7
1 parent f74d661
commit ceb29e7
Show file tree

Hide file tree

Showing 7 changed files with 392 additions and 325 deletions.
diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt
diff --git a/unicodetools/src/main/java/org/unicode/props/DefaultValues.java b/unicodetools/src/main/java/org/unicode/props/DefaultValues.java
@@ -0,0 +1,128 @@
+package org.unicode.props;
+
+import org.unicode.props.UcdPropertyValues.Bidi_Class_Values;
+import org.unicode.props.UcdPropertyValues.Block_Values;
+
+import com.ibm.icu.dev.util.UnicodeMap;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.VersionInfo;
+
+/**
+ * Version-dependent default property values that apply to specific blocks or code point
+ * ranges rather than to all of Unicode. Currently covers Bidi_Class; see {@link BidiClass}.
+ */
+public final class DefaultValues {
+    public static final class BidiClass { // Bidi_Class defaults as a function of the Unicode version
+        private static final Bidi_Class_Values L = Bidi_Class_Values.Left_To_Right;
+        private static final Bidi_Class_Values R = Bidi_Class_Values.Right_To_Left;
+        private static final Bidi_Class_Values AL = Bidi_Class_Values.Arabic_Letter;
+        private static final Bidi_Class_Values BN = Bidi_Class_Values.Boundary_Neutral;
+        private static final Bidi_Class_Values ET = Bidi_Class_Values.European_Terminator;
+
+        public static enum Option { ALL, OMIT_BN }; // OMIT_BN: build() skips the noncharacter/default-ignorable BN step
+
+        private static final class Builder { // build() populates and returns the bidi field
+            int compositeVersion; // (major << 16) | (minor << 8) | milli, e.g. 0x40001 for 4.0.1
+            IndexUnicodeProperties props; // property data for the requested version
+            UnicodeMap<Block_Values> blocks; // Block property of that version; resolves block values to code points
+            UnicodeMap<Bidi_Class_Values> bidi = new UnicodeMap<>(); // result map, filled by build()
+
+            Builder(VersionInfo version) { // packs the version into one int and loads the Block data
+                compositeVersion =
+                        (version.getMajor() << 16) | (version.getMinor() << 8) | version.getMilli();
+                props = IndexUnicodeProperties.make(version);
+                blocks = props.loadEnum(UcdProperty.Block);
+            }
+
+            UnicodeMap<Bidi_Class_Values> build(Option option) { // applies every default whose minVersion <= compositeVersion
+                // Overall default
+                bidi.setMissing(L);
+
+                // Set defaults in ascending order of Unicode versions,
+                // at least if there are overlaps, so that a later change
+                // can override parts of an earlier, larger range.
+                // Adding a block before it existed in the given version is a no-op.
+                // If a block has had its default value since it was allocated,
+                // then we could simply use minVersion=0x30000
+                // (but it would be less obvious which block got its default when).
+
+                // Unicode 3.0 was the first version to publish UAX #9, effectively create
+                // the Bidi_Class property, and assign default Bidi_Class values.
+                addBlockValueIfAtLeast(Block_Values.Hebrew, 0x30000, R);
+                addBlockValueIfAtLeast(Block_Values.Arabic, 0x30000, AL);
+                addBlockValueIfAtLeast(Block_Values.Syriac, 0x30000, AL);
+                addRangeValueIfAtLeast(0x0750, 0x077F, 0x30000, AL);
+                addBlockValueIfAtLeast(Block_Values.Thaana, 0x30000, AL);
+                addRangeValueIfAtLeast(0xFB1D, 0xFB4F, 0x30000, R);
+                addBlockValueIfAtLeast(Block_Values.Arabic_Presentation_Forms_A, 0x30000, AL);
+                addBlockValueIfAtLeast(Block_Values.Arabic_Presentation_Forms_B, 0x30000, AL);
+
+                addRangeValueIfAtLeast(0x07C0, 0x8FF, 0x40000, R); // since Unicode 4.0
+                addRangeValueIfAtLeast(0x10800, 0x10FFF, 0x40000, R); // since Unicode 4.0
+
+                // In order to be precise, exclude U+FEFF ("BOM") for Unicode 4.0 & 4.0.1.
+                // See https://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html
+                // This had no real effect, since U+FEFF was already
+                // an *assigned* character with bc=BN.
+                if (0x40000 <= compositeVersion && compositeVersion < 0x40100) {
+                    bidi.put(0xFEFF, L);
+                }
+
+                // The noncharacter code points FDD0..FDEF were designated in Unicode 3.1, but the
+                // whole enclosing block FB50..FDFF Arabic Presentation Forms-A kept default bc=AL.
+                // Unicode 4.0 then excluded these noncharacters from bc=AL.
+                addRangeValueIfAtLeast(0xFDD0, 0xFDEF, 0x40000, L);
+
+                // Unicode 4.0.1 changed all noncharacter code points and
+                // default ignorables to default bc=BN.
+                // Since many of these ranges are not aligned with block boundaries,
+                // we may omit them when presenting defaults.
+                if (compositeVersion >= 0x40001 && option != Option.OMIT_BN) {
+                    UnicodeSet nonchars = props.loadBinary(UcdProperty.Noncharacter_Code_Point);
+                    bidi.putAll(nonchars, BN);
+                    UnicodeSet defaultIgnorable =
+                            props.loadBinary(UcdProperty.Default_Ignorable_Code_Point);
+                    bidi.putAll(defaultIgnorable, BN);
+                }
+
+                addBlockValueIfAtLeast(Block_Values.Arabic_Supplement, 0x40100, AL); // since Unicode 4.1
+                addRangeValueIfAtLeast(0x1E800, 0x1EFFF, 0x50200, R); // since Unicode 5.2
+                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_A, 0x60100, AL); // since Unicode 6.1
+                addBlockValueIfAtLeast(
+                        Block_Values.Arabic_Mathematical_Alphabetic_Symbols, 0x60100, AL); // since Unicode 6.1
+                addBlockValueIfAtLeast(
+                        Block_Values.Currency_Symbols, 0x60300, ET); // default ET since 6.3
+
+                addBlockValueIfAtLeast(Block_Values.Syriac_Supplement, 0xA0000, AL); // since Unicode 10.0
+                addBlockValueIfAtLeast(Block_Values.Hanifi_Rohingya, 0xB0000, AL); // since Unicode 11.0
+                addBlockValueIfAtLeast(Block_Values.Sogdian, 0xB0000, AL); // since Unicode 11.0
+                addBlockValueIfAtLeast(Block_Values.Indic_Siyaq_Numbers, 0xB0000, AL); // since Unicode 11.0
+                addBlockValueIfAtLeast(Block_Values.Ottoman_Siyaq_Numbers, 0xC0000, AL); // since Unicode 12.0
+                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_B, 0xE0000, AL); // since Unicode 14.0
+                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_C, 0xF0000, AL); // since Unicode 15.0
+
+                return bidi;
+            }
+
+            private void addRangeValueIfAtLeast(
+                    int start, int end, int minVersion, Bidi_Class_Values bidiValue) { // no-op for older versions
+                if (compositeVersion >= minVersion) {
+                    bidi.putAll(start, end, bidiValue); // start..end as used by callers, e.g. 0x0750..0x077F
+                }
+            }
+
+            private void addBlockValueIfAtLeast(
+                    Block_Values blockValue, int minVersion, Bidi_Class_Values bidiValue) { // no-op for older versions
+                if (compositeVersion >= minVersion) {
+                    UnicodeSet block = blocks.keySet(blockValue); // presumably the code points whose Block is blockValue
+                    bidi.putAll(block, bidiValue);
+                }
+            }
+        }
+
+        public static UnicodeMap<Bidi_Class_Values> forVersion(
+                VersionInfo version, Option option) { // each call uses a fresh Builder and returns its map
+            return new Builder(version).build(option);
+        }
+    }
+}
diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java
@@ -218,36 +218,46 @@ public boolean useOldFile(VersionInfo ucdVersionRequested) {
         return ucdVersionRequested.compareTo(maxOldVersion) <= 0;
     }
 
-    public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger) {
-        put(data, intRange, string, merger, false);
-    }
-
     public static final Normalizer2 NFD = Normalizer2.getNFDInstance();
     public static final Normalizer2 NFC = Normalizer2.getNFCInstance();
 
-    public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger, boolean hackHangul) {
-        if (string != null && string.isEmpty() && property != UcdProperty.NFKC_Casefold) {
-            string = null;
+    public void put(UnicodeMap<String> data, IntRange intRange, String string) {
+        put(data, intRange, string, null); // convenience overload: merger = null
+    }
+
+    public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger) {
+        put(data, null, intRange, string, merger, false); // missingSet = null, hackHangul = false
+    }
+
+    public void put(
+            UnicodeMap<String> data, UnicodeSet missingSet,
+            IntRange intRange, String value,
+            Merge<String> merger, boolean hackHangul) {
+        if (value != null && value.isEmpty() && property != UcdProperty.NFKC_Casefold) {
+            value = null;
         }
-        string = normalizeAndVerify(string);
+        value = normalizeAndVerify(value);
         if (intRange.string != null) {
-            PropertyUtilities.putNew(data, intRange.string, string, merger);
+            PropertyUtilities.putNew(data, intRange.string, value, merger);
         } else {
             for (int codepoint = intRange.start; codepoint <= intRange.end; ++codepoint) {
                 try {
                     if (hackHangul) {
-                        String fullDecomp = NFD.getDecomposition(codepoint); // use ICU for Hangul decomposition
+                        // Use ICU for Hangul decomposition.
+                        String fullDecomp = NFD.getDecomposition(codepoint);
                         if (fullDecomp.length() > 2) {
                             fullDecomp = NFC.normalize(fullDecomp.substring(0,2)) + fullDecomp.substring(2);
                         }
-                        PropertyUtilities.putNew(data, codepoint, fullDecomp, merger);
-                    } else if (string == CONSTRUCTED_NAME) {
-                        PropertyUtilities.putNew(data, codepoint, UCharacter.getName(codepoint), merger); // use ICU for Hangul Name construction, constant
+                        PropertyUtilities.putNew(data, missingSet, codepoint, fullDecomp, merger);
+                    } else if (value == CONSTRUCTED_NAME) {
+                        // Use ICU for Hangul Name construction, constant.
+                        PropertyUtilities.putNew(
+                                data, missingSet, codepoint, UCharacter.getName(codepoint), merger);
                     } else {
-                        PropertyUtilities.putNew(data, codepoint, string, merger);
+                        PropertyUtilities.putNew(data, missingSet, codepoint, value, merger);
                     }
                 } catch (final Exception e) {
-                    String msg = String.format("%s: %04X..%04X  %s", property, intRange.start, intRange.end, string);
+                    String msg = String.format("%s: %04X..%04X  %s", property, intRange.start, intRange.end, value);
                     throw new UnicodePropertyException(msg, e);
                 }
             }
@@ -368,9 +378,7 @@ public void checkRegex(String part) {
             }
         }
     }
-    public void put(UnicodeMap<String> data, IntRange intRange, String string) {
-        put(data, intRange, string, null);
-    }
+
     public String getDefaultValue() {
         return defaultValue;
     }
@@ -428,6 +436,10 @@ enum Contents { DATA, MISSING, EMPTY }
         private final boolean withRange;
         private final boolean withMissing;
         private final Iterator<String> rawLines;
+        /**
+         * Code points covered by @missing lines for less than all of Unicode.
+         */
+        private final UnicodeSet missingSet = new UnicodeSet();
         private State state = State.LOOK;
         String line;  // original line for logging and error messages
         String line2;  // modified line for parsing
@@ -465,7 +477,7 @@ public boolean hasNext() {
                         if (line2.contains("# EOF")) {
                             stats.containsEOF = true;
                         } else {
-                            if (line2.contains("@missing")) {  // quick test
+                            if (line2.contains("@missing:")) {  // quick test
                                 // # @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
                                 if (!withMissing) {
                                     throw new IllegalArgumentException(
@@ -502,9 +514,16 @@ public boolean hasNext() {
                     } catch (Exception e) {
                         throw new IllegalArgumentException("line: " + line, e);
                     }
-                    if (contents != Contents.DATA &&
-                            (intRange.start != 0 || intRange.end != 0x10FFFF)) {
-                        System.err.println("Unexpected range: " + line);
+                    if (contents != Contents.DATA) {
+                        if (intRange.start != 0 || intRange.end != 0x10FFFF) {
+                            if (contents == Contents.MISSING) {
+                                // @missing line for less than all of Unicode
+                                missingSet.add(intRange.start, intRange.end);
+                                contents = Contents.DATA;
+                            } else {
+                                System.err.println("Unexpected range: " + line);
+                            }
+                        }
                     }
                 }
                 state = State.HAVE_NEXT;
@@ -932,7 +951,7 @@ private static void parseFields(UcdLine line,
                 }
                 String value = propInfo.fieldNumber >= parts.length ? "" 
                         : parts[propInfo.fieldNumber];
-                propInfo.put(data, line.intRange, value, merger,
+                propInfo.put(data, line.missingSet, line.intRange, value, merger,
                         hackHangul && propInfo.property == UcdProperty.Decomposition_Mapping);
             }
         } else {
@@ -950,7 +969,7 @@ private static void parseSimpleFieldFile(UcdLineParser parser,
             PropertyParsingInfo propInfo, UnicodeMap<String> data) {
         for (UcdLine line : parser) {
             if (line.contents == UcdLine.Contents.DATA) {
-                propInfo.put(data, line.intRange, line.parts[1], null, false);
+                propInfo.put(data, line.missingSet, line.intRange, line.parts[1], null, false);
             } else {
                 setPropDefault(
                         propInfo.property, line.parts[1], line.line,

diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyUtilities.java b/unicodetools/src/main/java/org/unicode/props/PropertyUtilities.java
@@ -6,6 +6,7 @@
 import org.unicode.text.utility.Utility;
 
 import com.ibm.icu.dev.util.UnicodeMap;
+import com.ibm.icu.text.UnicodeSet;
 
 public class PropertyUtilities {
 
@@ -36,9 +37,11 @@ static final <K, V, M extends Map<K,V>> M putNew(M map, K key, V value) {
         return map;
     }
 
-    static final <V> UnicodeMap<V> putNew(UnicodeMap<V> map, int key, V value, Merge<V> merger) {
+    static final <V> UnicodeMap<V> putNew(
+            UnicodeMap<V> map, UnicodeSet missingSet,
+            int key, V value, Merge<V> merger) {
         final V oldValue = map.get(key);
-        if (oldValue != null) {
+        if (oldValue != null && (missingSet == null || !missingSet.contains(key))) {
             if (merger == null) {
                 throw new UnicodePropertyException("Key already present in UnicodeMap: " + Utility.hex(key) + ",\told: " + oldValue + ",\tnew: " + value);
             }