diff --git a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt index 772025d23d..e0ca708c41 100644 --- a/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt +++ b/unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt @@ -1,5 +1,5 @@ # DerivedBidiClass-15.0.0.txt -# Date: 2022-04-26, 23:14:28 GMT +# Date: 2022-05-28, 20:32:55 GMT # © 2022 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see https://www.unicode.org/terms_of_use.html @@ -11,26 +11,9 @@ # Bidi Class (listing UnicodeData.txt, field 4: see UAX #44: https://www.unicode.org/reports/tr44/) # Unlike other properties, unassigned code points in blocks -# reserved for right-to-left scripts are given either types R or AL. -# -# The unassigned code points that default to AL are in the ranges: -# [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF -# \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F -# \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] -# -# This includes code points in the Arabic, Syriac, and Thaana blocks, among others. -# -# The unassigned code points that default to R are in the ranges: -# [\u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F -# \U00010800-\U00010CFF \U00010D40-\U00010EBF \U00010F00-\U00010F2F \U00010F70-\U00010FFF -# \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF \U0001EF00-\U0001EFFF] -# -# This includes code points in the Hebrew, NKo, and Phoenician blocks, among others. -# -# The unassigned code points that default to ET are in the range: -# [\u20A0-\u20CF] -# -# This consists of code points in the Currency Symbols block. +# reserved for right-to-left scripts are given either values R or AL, +# and unassigned code points in the Currency Symbols block are given the value ET. +# For details see the @missing lines below. # # The unassigned code points that default to BN have one of the following properties: # Default_Ignorable_Code_Point @@ -43,6 +26,101 @@ # @missing: 0000..10FFFF; Left_To_Right +# 0590..05FF Hebrew +# @missing: 0590..05FF; Right_To_Left + +# 0600..06FF Arabic +# 0700..074F Syriac +# 0750..077F Arabic_Supplement +# 0780..07BF Thaana +# @missing: 0600..07BF; Arabic_Letter + +# 07C0..07FF NKo +# 0800..083F Samaritan +# 0840..085F Mandaic +# @missing: 07C0..085F; Right_To_Left + +# 0860..086F Syriac_Supplement +# 0870..089F Arabic_Extended_B +# 08A0..08FF Arabic_Extended_A +# @missing: 0860..08FF; Arabic_Letter + +# 20A0..20CF Currency_Symbols +# @missing: 20A0..20CF; European_Terminator + +# FB00..FB4F Alphabetic_Presentation_Forms (partial) +# @missing: FB1D..FB4F; Right_To_Left + +# FB50..FDFF Arabic_Presentation_Forms_A (partial) +# @missing: FB50..FDCF; Arabic_Letter + +# FB50..FDFF Arabic_Presentation_Forms_A (partial) +# @missing: FDF0..FDFF; Arabic_Letter + +# FE70..FEFF Arabic_Presentation_Forms_B +# @missing: FE70..FEFF; Arabic_Letter + +# 10800..1083F Cypriot_Syllabary +# 10840..1085F Imperial_Aramaic +# 10860..1087F Palmyrene +# 10880..108AF Nabataean +# 108E0..108FF Hatran +# 10900..1091F Phoenician +# 10920..1093F Lydian +# 10980..1099F Meroitic_Hieroglyphs +# 109A0..109FF Meroitic_Cursive +# 10A00..10A5F Kharoshthi +# 10A60..10A7F Old_South_Arabian +# 10A80..10A9F Old_North_Arabian +# 10AC0..10AFF Manichaean +# 10B00..10B3F Avestan +# 10B40..10B5F Inscriptional_Parthian +# 10B60..10B7F Inscriptional_Pahlavi +# 10B80..10BAF Psalter_Pahlavi +# 10C00..10C4F Old_Turkic +# 10C80..10CFF Old_Hungarian +# @missing: 10800..10CFF; Right_To_Left + +# 10D00..10D3F Hanifi_Rohingya +# @missing: 10D00..10D3F; Arabic_Letter + +# 10E60..10E7F Rumi_Numeral_Symbols +# 10E80..10EBF Yezidi +# @missing: 10D40..10EBF; Right_To_Left + +# 10EC0..10EFF Arabic_Extended_C +# @missing: 10EC0..10EFF; Arabic_Letter + +# 10F00..10F2F Old_Sogdian +# @missing: 10F00..10F2F; Right_To_Left + +# 10F30..10F6F Sogdian +# @missing: 10F30..10F6F; Arabic_Letter + +# 10F70..10FAF Old_Uyghur +# 10FB0..10FDF Chorasmian +# 10FE0..10FFF Elymaic +# @missing: 10F70..10FFF; Right_To_Left + +# 1E800..1E8DF Mende_Kikakui +# 1E900..1E95F Adlam +# @missing: 1E800..1EC6F; Right_To_Left + +# 1EC70..1ECBF Indic_Siyaq_Numbers +# @missing: 1EC70..1ECBF; Arabic_Letter + +# @missing: 1ECC0..1ECFF; Right_To_Left + +# 1ED00..1ED4F Ottoman_Siyaq_Numbers +# @missing: 1ED00..1ED4F; Arabic_Letter + +# @missing: 1ED50..1EDFF; Right_To_Left + +# 1EE00..1EEFF Arabic_Mathematical_Alphabetic_Symbols +# @missing: 1EE00..1EEFF; Arabic_Letter + +# @missing: 1EF00..1EFFF; Right_To_Left + # ================================================ # Bidi_Class=Left_To_Right diff --git a/unicodetools/src/main/java/org/unicode/props/DefaultValues.java b/unicodetools/src/main/java/org/unicode/props/DefaultValues.java index 59ff0fc0a7..af31e53eb4 100644 --- a/unicodetools/src/main/java/org/unicode/props/DefaultValues.java +++ b/unicodetools/src/main/java/org/unicode/props/DefaultValues.java @@ -19,6 +19,8 @@ public static final class BidiClass { private static final Bidi_Class_Values BN = Bidi_Class_Values.Boundary_Neutral; private static final Bidi_Class_Values ET = Bidi_Class_Values.European_Terminator; + public static enum Option { ALL, OMIT_BN }; + private static final class Builder { int compositeVersion; IndexUnicodeProperties props; @@ -32,7 +34,7 @@ private static final class Builder { blocks = props.loadEnum(UcdProperty.Block); } - UnicodeMap build() { + UnicodeMap build(Option option) { // Overall default bidi.setMissing(L); @@ -73,7 +75,9 @@ UnicodeMap build() { // Unicode 4.0.1 changed all noncharacter code points and // default ignorables to default bc=BN. - if (compositeVersion >= 0x40001) { + // Since many of these ranges are not aligned with block boundaries, + // we may omit them when presenting defaults. + if (compositeVersion >= 0x40001 && option != Option.OMIT_BN) { UnicodeSet nonchars = props.loadBinary(UcdProperty.Noncharacter_Code_Point); bidi.putAll(nonchars, BN); UnicodeSet defaultIgnorable = @@ -117,8 +121,8 @@ private void addBlockValueIfAtLeast( } public static UnicodeMap forVersion( - VersionInfo version) { - return new Builder(version).build(); + VersionInfo version, Option option) { + return new Builder(version).build(option); } } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 28f498969e..86b92c2724 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -24,8 +24,13 @@ import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Tabber; import org.unicode.props.BagFormatter; +import org.unicode.props.DefaultValues; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; import org.unicode.cldr.util.props.UnicodeLabel; import org.unicode.props.UnicodeProperty; +import org.unicode.props.UcdPropertyValues.Bidi_Class_Values; +import org.unicode.props.UcdPropertyValues.Block_Values; import org.unicode.text.UCD.MakeUnicodeFiles.Format.PrintStyle; import org.unicode.text.utility.ChainException; import org.unicode.text.utility.Settings; @@ -41,6 +46,7 @@ import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; public class MakeUnicodeFiles { static boolean DEBUG = false; @@ -398,11 +404,11 @@ public static void generateFile() throws IOException { final Iterator it = Format.theFormat.getFiles().iterator(); boolean gotOne = false; while (it.hasNext()) { - final String propname = it.next(); - if (!matcher.reset(propname).find()) { + final String filename = it.next(); + if (!matcher.reset(filename).find()) { continue; } - generateFile(propname); + generateFile(filename); gotOne = true; } if (!gotOne) { @@ -981,10 +987,9 @@ public static void generatePropertyFile(String filename) throws IOException { if (dir == null) { dir = ""; } + dir = "UCD/" + Default.ucdVersion() + '/' + dir; final UnicodeDataFile udf = - UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucdVersion() + '/' + dir, - filename). + UnicodeDataFile.openAndWriteHeader(dir, filename). setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter pwFile = udf.out; // bf2.openUTF8Writer(UCD_Types.GEN_DIR, "Test" + filename + ".txt"); @@ -1127,6 +1132,14 @@ private static void writeEnumeratedValues( // else if (propName.length() != 0) propName = propName + "; "; //pw.println("# @missing: 0000..10FFFF; " + propName + missing); printDefaultValueComment(pw, propName, prop, propName != null && propName.length() != 0, missing); + if (prop.getName().equals("Bidi_Class")) { + VersionInfo versionInfo = Default.ucdVersionInfo(); + Bidi_Class_Values overallDefault = Bidi_Class_Values.forName(missing); + UnicodeMap defaultBidiValues = + DefaultValues.BidiClass.forVersion( + versionInfo, DefaultValues.BidiClass.Option.OMIT_BN); + writeEnumeratedMissingValues(pw, overallDefault, defaultBidiValues); + } } for (final Iterator it = aliases.iterator(); it.hasNext();) { final String value = it.next(); @@ -1241,6 +1254,44 @@ private static void writeEnumeratedValues( } } + private static void writeEnumeratedMissingValues( + PrintWriter pw, T overallDefault, UnicodeMap defaultValues) { + VersionInfo versionInfo = Default.ucdVersionInfo(); + IndexUnicodeProperties props = IndexUnicodeProperties.make(versionInfo); + UnicodeMap blocks = props.loadEnum(UcdProperty.Block); + Iterator> blockIter = blocks.entryRanges().iterator(); + UnicodeMap.EntryRange blockRange = null; + + for (UnicodeMap.EntryRange range : defaultValues.entryRanges()) { + if (range.value == overallDefault) { + continue; + } + int start = range.codepoint; + int end = range.codepointEnd; + pw.println(); + // Skip blocks before this default-value range. + while ((blockRange == null || blockRange.codepointEnd < start) && blockIter.hasNext()) { + blockRange = blockIter.next(); + } + // Print blocks that overlap with this default-value range. + while (blockRange.codepoint <= end) { + if (blockRange.value != Block_Values.No_Block) { + String partial = + blockRange.codepoint < start || blockRange.codepointEnd > end + ? " (partial)" : ""; + pw.printf("# %04X..%04X %s%s\n", + blockRange.codepoint, blockRange.codepointEnd, + blockRange.value, partial); + } + if (blockRange.codepointEnd > end || !blockIter.hasNext()) { + break; + } + blockRange = blockIter.next(); + } + pw.printf("# @missing: %04X..%04X; %s\n", start, end, range.value); + } + } + //static NumberFormat nf = NumberFormat.getInstance(); static Comparator NUMERIC_STRING_COMPARATOR = new Comparator() { @Override diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UCD.java b/unicodetools/src/main/java/org/unicode/text/UCD/UCD.java index 493784a10c..a016801523 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UCD.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UCD.java @@ -458,7 +458,8 @@ public byte getBidiClass(int codePoint) { } if (defaultBidiValues == null) { - defaultBidiValues = DefaultValues.BidiClass.forVersion(versionInfo); + defaultBidiValues = DefaultValues.BidiClass.forVersion( + versionInfo, DefaultValues.BidiClass.Option.ALL); } Bidi_Class_Values bidi = defaultBidiValues.get(codePoint); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index ffe2de79fc..66456b5404 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -233,27 +233,10 @@ Value: V15_0 File: extracted/DerivedBidiClass Property: Bidi_Class # Bidi Class (listing UnicodeData.txt, field 4: see UAX #44: https://www.unicode.org/reports/tr44/) -# Unlike other properties, unassigned code points in blocks -# reserved for right-to-left scripts are given either types R or AL. -# -# The unassigned code points that default to AL are in the ranges: -# [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF -# \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F -# \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] -# -# This includes code points in the Arabic, Syriac, and Thaana blocks, among others. -# -# The unassigned code points that default to R are in the ranges: -# [\u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F -# \U00010800-\U00010CFF \U00010D40-\U00010EBF \U00010F00-\U00010F2F \U00010F70-\U00010FFF -# \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF \U0001EF00-\U0001EFFF] -# -# This includes code points in the Hebrew, NKo, and Phoenician blocks, among others. -# -# The unassigned code points that default to ET are in the range: -# [\u20A0-\u20CF] -# -# This consists of code points in the Currency Symbols block. +# Unlike other properties, unassigned code points in blocks +# reserved for right-to-left scripts are given either values R or AL, +# and unassigned code points in the Currency Symbols block are given the value ET. +# For details see the @missing lines below. # # The unassigned code points that default to BN have one of the following properties: # Default_Ignorable_Code_Point