Skip to content

Commit

Permalink
write multiple Bidi_Class @missing lines (#267)
Browse files Browse the repository at this point in the history
* refactor Bidi_Class defaults into new DefaultValues
* write multiple Bidi_Class `@missing` lines
* omit explicit lines for unassigned code points with default values
* props parser handle multiple `@missing` lines
  • Loading branch information
markusicu authored May 31, 2022
1 parent f74d661 commit ceb29e7
Show file tree
Hide file tree
Showing 7 changed files with 392 additions and 325 deletions.
243 changes: 102 additions & 141 deletions unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt

Large diffs are not rendered by default.

128 changes: 128 additions & 0 deletions unicodetools/src/main/java/org/unicode/props/DefaultValues.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package org.unicode.props;

import org.unicode.props.UcdPropertyValues.Bidi_Class_Values;
import org.unicode.props.UcdPropertyValues.Block_Values;

import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;

/**
 * Default property values for some properties and certain ranges
 * other than all of Unicode.
 *
 * <p>Currently the only property covered here is Bidi_Class; see {@link BidiClass}.
 */
public final class DefaultValues {
    /**
     * Default Bidi_Class values, as a function of the Unicode version.
     *
     * <p>Use {@link #forVersion(VersionInfo, Option)} to obtain the map of defaults.
     */
    public static final class BidiClass {
        private static final Bidi_Class_Values L = Bidi_Class_Values.Left_To_Right;
        private static final Bidi_Class_Values R = Bidi_Class_Values.Right_To_Left;
        private static final Bidi_Class_Values AL = Bidi_Class_Values.Arabic_Letter;
        private static final Bidi_Class_Values BN = Bidi_Class_Values.Boundary_Neutral;
        private static final Bidi_Class_Values ET = Bidi_Class_Values.European_Terminator;

        /** Selects which defaults are included in the map returned by {@code forVersion}. */
        public enum Option {
            /**
             * Include all defaults, including bc=BN for noncharacter code points
             * and default ignorables (Unicode 4.0.1 and later).
             */
            ALL,
            /**
             * Leave out the bc=BN defaults for noncharacter code points and default
             * ignorables; many of these ranges are not aligned with block boundaries.
             */
            OMIT_BN
        }

        /** Collects the default Bidi_Class values for a single Unicode version. */
        private static final class Builder {
            /**
             * The version packed as {@code (major << 16) | (minor << 8) | milli},
             * for example 0x40001 for Unicode 4.0.1, so that versions can be compared
             * against hex literals.
             */
            private final int compositeVersion;

            /** Property data for the requested version. */
            private final IndexUnicodeProperties props;

            /** The Block property of the requested version. */
            private final UnicodeMap<Block_Values> blocks;

            /** The map under construction; it is what {@link #build(Option)} returns. */
            private final UnicodeMap<Bidi_Class_Values> bidi = new UnicodeMap<>();

            /**
             * Loads the Block property data for the given version.
             *
             * @param version the Unicode version whose defaults are wanted
             */
            Builder(VersionInfo version) {
                compositeVersion =
                        (version.getMajor() << 16)
                                | (version.getMinor() << 8)
                                | version.getMilli();
                props = IndexUnicodeProperties.make(version);
                blocks = props.loadEnum(UcdProperty.Block);
            }

            /**
             * Fills in and returns the map of default Bidi_Class values.
             *
             * <p>The order of the statements matters: a later put overrides
             * any overlapping part of an earlier one.
             *
             * @param option {@link Option#OMIT_BN} skips the bc=BN defaults for
             *     noncharacters and default ignorables; any other value includes them
             * @return the builder's map, with L as its missing value
             */
            UnicodeMap<Bidi_Class_Values> build(Option option) {
                // Overall default
                bidi.setMissing(L);

                // Set defaults in ascending order of Unicode versions,
                // at least if there are overlaps, so that a later change
                // can override parts of an earlier, larger range.
                // Adding a block before it existed in the given version is a no-op.
                // If a block has had its default value since it was allocated,
                // then we could simply use minVersion=0x30000
                // (but it would be less obvious which block got its default when).

                // Unicode 3.0 was the first version to publish UAX #9, effectively create
                // the Bidi_Class property, and assign default Bidi_Class values.
                addBlockValueIfAtLeast(Block_Values.Hebrew, 0x30000, R);
                addBlockValueIfAtLeast(Block_Values.Arabic, 0x30000, AL);
                addBlockValueIfAtLeast(Block_Values.Syriac, 0x30000, AL);
                addRangeValueIfAtLeast(0x0750, 0x077F, 0x30000, AL);
                addBlockValueIfAtLeast(Block_Values.Thaana, 0x30000, AL);
                addRangeValueIfAtLeast(0xFB1D, 0xFB4F, 0x30000, R);
                addBlockValueIfAtLeast(Block_Values.Arabic_Presentation_Forms_A, 0x30000, AL);
                addBlockValueIfAtLeast(Block_Values.Arabic_Presentation_Forms_B, 0x30000, AL);

                addRangeValueIfAtLeast(0x07C0, 0x08FF, 0x40000, R);
                addRangeValueIfAtLeast(0x10800, 0x10FFF, 0x40000, R);

                // In order to be precise, exclude U+FEFF ("BOM") for Unicode 4.0 & 4.0.1.
                // See https://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html
                // This had no real effect, since U+FEFF was already
                // an *assigned* character with bc=BN.
                if (0x40000 <= compositeVersion && compositeVersion < 0x40100) {
                    bidi.put(0xFEFF, L);
                }

                // The noncharacter code points FDD0..FDEF were designated in Unicode 3.1,
                // but the whole enclosing block FB50..FDFF Arabic Presentation Forms-A
                // kept default bc=AL.
                // Unicode 4.0 then excluded these noncharacters from bc=AL.
                addRangeValueIfAtLeast(0xFDD0, 0xFDEF, 0x40000, L);

                // Unicode 4.0.1 changed all noncharacter code points and
                // default ignorables to default bc=BN.
                // Since many of these ranges are not aligned with block boundaries,
                // we may omit them when presenting defaults.
                if (compositeVersion >= 0x40001 && option != Option.OMIT_BN) {
                    UnicodeSet nonchars = props.loadBinary(UcdProperty.Noncharacter_Code_Point);
                    bidi.putAll(nonchars, BN);
                    UnicodeSet defaultIgnorable =
                            props.loadBinary(UcdProperty.Default_Ignorable_Code_Point);
                    bidi.putAll(defaultIgnorable, BN);
                }

                addBlockValueIfAtLeast(Block_Values.Arabic_Supplement, 0x40100, AL);
                addRangeValueIfAtLeast(0x1E800, 0x1EFFF, 0x50200, R);
                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_A, 0x60100, AL);
                addBlockValueIfAtLeast(
                        Block_Values.Arabic_Mathematical_Alphabetic_Symbols, 0x60100, AL);
                addBlockValueIfAtLeast(
                        Block_Values.Currency_Symbols, 0x60300, ET); // default ET since 6.3

                addBlockValueIfAtLeast(Block_Values.Syriac_Supplement, 0xA0000, AL);
                addBlockValueIfAtLeast(Block_Values.Hanifi_Rohingya, 0xB0000, AL);
                addBlockValueIfAtLeast(Block_Values.Sogdian, 0xB0000, AL);
                addBlockValueIfAtLeast(Block_Values.Indic_Siyaq_Numbers, 0xB0000, AL);
                addBlockValueIfAtLeast(Block_Values.Ottoman_Siyaq_Numbers, 0xC0000, AL);
                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_B, 0xE0000, AL);
                addBlockValueIfAtLeast(Block_Values.Arabic_Extended_C, 0xF0000, AL);

                return bidi;
            }

            /**
             * Sets {@code bidiValue} for the inclusive code point range
             * {@code start..end} if this builder's version is at least {@code minVersion}.
             *
             * @param start first code point of the range
             * @param end last code point of the range (inclusive)
             * @param minVersion composite version from which the default applies
             * @param bidiValue the default value to set
             */
            private void addRangeValueIfAtLeast(
                    int start, int end, int minVersion, Bidi_Class_Values bidiValue) {
                if (compositeVersion >= minVersion) {
                    bidi.putAll(start, end, bidiValue);
                }
            }

            /**
             * Sets {@code bidiValue} for all code points that the loaded Block data
             * maps to {@code blockValue}, if this builder's version is at least
             * {@code minVersion}.
             *
             * @param blockValue the block whose code points get the default
             * @param minVersion composite version from which the default applies
             * @param bidiValue the default value to set
             */
            private void addBlockValueIfAtLeast(
                    Block_Values blockValue, int minVersion, Bidi_Class_Values bidiValue) {
                if (compositeVersion >= minVersion) {
                    UnicodeSet block = blocks.keySet(blockValue);
                    bidi.putAll(block, bidiValue);
                }
            }
        }

        /**
         * Returns the default Bidi_Class values for the given Unicode version.
         *
         * @param version the Unicode version
         * @param option whether to include or omit the bc=BN defaults
         *     for noncharacters and default ignorables
         * @return a newly built map; code points without a more specific default
         *     fall back to the map's missing value L
         */
        public static UnicodeMap<Bidi_Class_Values> forVersion(
                VersionInfo version, Option option) {
            return new Builder(version).build(option);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -218,36 +218,46 @@ public boolean useOldFile(VersionInfo ucdVersionRequested) {
return ucdVersionRequested.compareTo(maxOldVersion) <= 0;
}

public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger) {
put(data, intRange, string, merger, false);
}

public static final Normalizer2 NFD = Normalizer2.getNFDInstance();
public static final Normalizer2 NFC = Normalizer2.getNFCInstance();

public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger, boolean hackHangul) {
if (string != null && string.isEmpty() && property != UcdProperty.NFKC_Casefold) {
string = null;
public void put(UnicodeMap<String> data, IntRange intRange, String string) {
put(data, intRange, string, null);
}

public void put(UnicodeMap<String> data, IntRange intRange, String string, Merge<String> merger) {
put(data, null, intRange, string, merger, false);
}

public void put(
UnicodeMap<String> data, UnicodeSet missingSet,
IntRange intRange, String value,
Merge<String> merger, boolean hackHangul) {
if (value != null && value.isEmpty() && property != UcdProperty.NFKC_Casefold) {
value = null;
}
string = normalizeAndVerify(string);
value = normalizeAndVerify(value);
if (intRange.string != null) {
PropertyUtilities.putNew(data, intRange.string, string, merger);
PropertyUtilities.putNew(data, intRange.string, value, merger);
} else {
for (int codepoint = intRange.start; codepoint <= intRange.end; ++codepoint) {
try {
if (hackHangul) {
String fullDecomp = NFD.getDecomposition(codepoint); // use ICU for Hangul decomposition
// Use ICU for Hangul decomposition.
String fullDecomp = NFD.getDecomposition(codepoint);
if (fullDecomp.length() > 2) {
fullDecomp = NFC.normalize(fullDecomp.substring(0,2)) + fullDecomp.substring(2);
}
PropertyUtilities.putNew(data, codepoint, fullDecomp, merger);
} else if (string == CONSTRUCTED_NAME) {
PropertyUtilities.putNew(data, codepoint, UCharacter.getName(codepoint), merger); // use ICU for Hangul Name construction, constant
PropertyUtilities.putNew(data, missingSet, codepoint, fullDecomp, merger);
} else if (value == CONSTRUCTED_NAME) {
// Use ICU for Hangul Name construction, constant.
PropertyUtilities.putNew(
data, missingSet, codepoint, UCharacter.getName(codepoint), merger);
} else {
PropertyUtilities.putNew(data, codepoint, string, merger);
PropertyUtilities.putNew(data, missingSet, codepoint, value, merger);
}
} catch (final Exception e) {
String msg = String.format("%s: %04X..%04X %s", property, intRange.start, intRange.end, string);
String msg = String.format("%s: %04X..%04X %s", property, intRange.start, intRange.end, value);
throw new UnicodePropertyException(msg, e);
}
}
Expand Down Expand Up @@ -368,9 +378,7 @@ public void checkRegex(String part) {
}
}
}
public void put(UnicodeMap<String> data, IntRange intRange, String string) {
put(data, intRange, string, null);
}

public String getDefaultValue() {
return defaultValue;
}
Expand Down Expand Up @@ -428,6 +436,10 @@ enum Contents { DATA, MISSING, EMPTY }
private final boolean withRange;
private final boolean withMissing;
private final Iterator<String> rawLines;
/**
* Code points covered by @missing lines for less than all of Unicode.
*/
private final UnicodeSet missingSet = new UnicodeSet();
private State state = State.LOOK;
String line; // original line for logging and error messages
String line2; // modified line for parsing
Expand Down Expand Up @@ -465,7 +477,7 @@ public boolean hasNext() {
if (line2.contains("# EOF")) {
stats.containsEOF = true;
} else {
if (line2.contains("@missing")) { // quick test
if (line2.contains("@missing:")) { // quick test
// # @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
if (!withMissing) {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -502,9 +514,16 @@ public boolean hasNext() {
} catch (Exception e) {
throw new IllegalArgumentException("line: " + line, e);
}
if (contents != Contents.DATA &&
(intRange.start != 0 || intRange.end != 0x10FFFF)) {
System.err.println("Unexpected range: " + line);
if (contents != Contents.DATA) {
if (intRange.start != 0 || intRange.end != 0x10FFFF) {
if (contents == Contents.MISSING) {
// @missing line for less than all of Unicode
missingSet.add(intRange.start, intRange.end);
contents = Contents.DATA;
} else {
System.err.println("Unexpected range: " + line);
}
}
}
}
state = State.HAVE_NEXT;
Expand Down Expand Up @@ -932,7 +951,7 @@ private static void parseFields(UcdLine line,
}
String value = propInfo.fieldNumber >= parts.length ? ""
: parts[propInfo.fieldNumber];
propInfo.put(data, line.intRange, value, merger,
propInfo.put(data, line.missingSet, line.intRange, value, merger,
hackHangul && propInfo.property == UcdProperty.Decomposition_Mapping);
}
} else {
Expand All @@ -950,7 +969,7 @@ private static void parseSimpleFieldFile(UcdLineParser parser,
PropertyParsingInfo propInfo, UnicodeMap<String> data) {
for (UcdLine line : parser) {
if (line.contents == UcdLine.Contents.DATA) {
propInfo.put(data, line.intRange, line.parts[1], null, false);
propInfo.put(data, line.missingSet, line.intRange, line.parts[1], null, false);
} else {
setPropDefault(
propInfo.property, line.parts[1], line.line,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.unicode.text.utility.Utility;

import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.UnicodeSet;

public class PropertyUtilities {

Expand Down Expand Up @@ -36,9 +37,11 @@ static final <K, V, M extends Map<K,V>> M putNew(M map, K key, V value) {
return map;
}

static final <V> UnicodeMap<V> putNew(UnicodeMap<V> map, int key, V value, Merge<V> merger) {
static final <V> UnicodeMap<V> putNew(
UnicodeMap<V> map, UnicodeSet missingSet,
int key, V value, Merge<V> merger) {
final V oldValue = map.get(key);
if (oldValue != null) {
if (oldValue != null && (missingSet == null || !missingSet.contains(key))) {
if (merger == null) {
throw new UnicodePropertyException("Key already present in UnicodeMap: " + Utility.hex(key) + ",\told: " + oldValue + ",\tnew: " + value);
}
Expand Down
Loading

0 comments on commit ceb29e7

Please sign in to comment.