diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java index e5c8268b9..b7232c295 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java @@ -1,5 +1,8 @@ package org.unicode.jsp; +import com.google.common.base.Joiner; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.lang.CharSequences; import com.ibm.icu.lang.UCharacter; @@ -12,13 +15,19 @@ import com.ibm.icu.text.Transform; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.LocaleData; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.VersionInfo; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Locale; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeSet; import org.unicode.idna.Idna.IdnaType; import org.unicode.idna.Idna2003; import org.unicode.idna.Idna2008; @@ -28,9 +37,13 @@ import org.unicode.props.UnicodeProperty.BaseProperty; import org.unicode.props.UnicodeProperty.Factory; import org.unicode.props.UnicodeProperty.SimpleProperty; +import org.unicode.text.utility.Utility; public class XPropertyFactory extends UnicodeProperty.Factory { + private static final Joiner JOIN_COMMAS = Joiner.on(","); + private static final boolean DEBUG_MULTI = false; + static final UnicodeSet ALL = new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze(); @@ -96,6 +109,7 @@ public final Factory add2(UnicodeProperty sp) { add( new CodepointTransformProperty( new Transform() { + @Override public String transform(Integer source) { return Normalizer.normalize(source, Normalizer.NFC); } @@ -105,6 +119,7 @@ public String transform(Integer source) { add( new CodepointTransformProperty( new Transform() { + @Override public String transform(Integer source) { return Normalizer.normalize(source, Normalizer.NFD); } @@ -114,6 +129,7 @@ public String transform(Integer source) { add( new CodepointTransformProperty( new Transform() { + @Override public String transform(Integer source) { return Normalizer.normalize(source, Normalizer.NFKC); } @@ -123,6 +139,7 @@ public String transform(Integer source) { add( new CodepointTransformProperty( new Transform() { + @Override public String transform(Integer source) { return Normalizer.normalize(source, Normalizer.NFKD); } @@ -133,6 +150,7 @@ public String transform(Integer source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { return UCharacter.foldCase(source, true); } @@ -142,6 +160,7 @@ public String transform(String source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { return UCharacter.toLowerCase(ULocale.ROOT, source); } @@ -151,6 +170,7 @@ public String transform(String source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { return UCharacter.toUpperCase(ULocale.ROOT, source); } @@ -160,6 +180,7 @@ public String transform(String source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { return UCharacter.toTitleCase(ULocale.ROOT, source, null); } @@ -170,6 +191,7 @@ public String transform(String source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { StringBuilder b = new StringBuilder(); for (int cp : CharSequences.codePoints(source)) { @@ -184,6 +206,7 @@ public String transform(String source) { add( new StringTransformProperty( new StringTransform() { + @Override public String transform(String source) { String result = NFM.nfm.get(source); return result == null ? source : result; @@ -201,6 +224,7 @@ public String transform(String source) { add( new CodepointTransformProperty( new Transform() { + @Override public String transform(Integer source) { return UnicodeUtilities.getSubheader().getSubheader(source); } @@ -239,6 +263,9 @@ public String transform(Integer source) { .setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0")); addCollationProperty(); + addExamplarProperty(LocaleData.ES_STANDARD, "exem", "exemplar"); + addExamplarProperty(LocaleData.ES_AUXILIARY, "exema", "exemplar_aux"); + addExamplarProperty(LocaleData.ES_PUNCTUATION, "exemp", "exemplar_punct"); // set up the special script property UnicodeProperty scriptProp = base.getProperty("sc"); @@ -251,7 +278,8 @@ public String transform(Integer source) { .setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1") .addValueAliases( ScriptTester.getScriptSpecialsAlternates(), - AliasAddAction.IGNORE_IF_MISSING)); + AliasAddAction.IGNORE_IF_MISSING) + .setMultivalued(true)); CachedProps cp = CachedProps.CACHED_PROPS; for (String prop : cp.getAvailable()) { @@ -289,6 +317,81 @@ public String transform(Integer source) { .setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0")); } + private void addExamplarProperty( + int exemplarType, String propertyAbbreviation, String propertyName) { + Multimap data = TreeMultimap.create(); + Set localeSet = new TreeSet<>(); + + for (ULocale ulocale : ULocale.getAvailableLocales()) { + if (!ulocale.getCountry().isEmpty() || !ulocale.getVariant().isEmpty()) { + continue; + // we want to skip cases where characters are in the parent locale, but there is no + // ULocale parentLocale = ulocale.getParent(); + } + UnicodeSet exemplarSet = LocaleData.getExemplarSet(ulocale, 0, exemplarType); + if (!ulocale.getScript().isEmpty()) { + // we can't find out the parent locale or defaultContent locale in ICU, so we hack + // it + String langLocale = ulocale.getLanguage(); + UnicodeSet langExemplarSet = + LocaleData.getExemplarSet(new ULocale(langLocale), 0, exemplarType); + if (langExemplarSet.equals(exemplarSet)) { + continue; + } + } + String locale = ulocale.toLanguageTag(); + localeSet.add(locale); + for (UnicodeSetIterator it = new UnicodeSetIterator(exemplarSet); it.nextRange(); ) { + if (it.codepoint == UnicodeSetIterator.IS_STRING) { + // flatten + int cp = 0; + for (int i = 0; i < it.string.length(); i += Character.charCount(cp)) { + cp = it.string.codePointAt(i); + data.put(cp, locale); + } + } else { + for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) { + data.put(cp, locale); + } + } + } + } + + // convert to UnicodeMap + UnicodeMap unicodeMap = new UnicodeMap<>(); + for (Entry> entry : data.asMap().entrySet()) { + String value = JOIN_COMMAS.join(entry.getValue()).intern(); + unicodeMap.put(entry.getKey(), value); + } + if (DEBUG_MULTI) { + System.out.println("\n" + propertyName); + for (UnicodeMap.EntryRange entry : unicodeMap.entryRanges()) { + System.out.println( + Utility.hex(entry.codepoint) + + (entry.codepoint == entry.codepointEnd + ? "" + : "-" + Utility.hex(entry.codepointEnd)) + + " ;\t" + + entry.value); + } + } + + // put locales into right format + String[] localeList = localeSet.toArray(new String[localeSet.size()]); + String[][] locales = new String[][] {localeList, localeList}; // abbreviations are the same + + add( + new UnicodeProperty.UnicodeMapProperty() + .set(unicodeMap) + .setMain( + propertyName, + propertyAbbreviation, + UnicodeProperty.ENUMERATED, + "1.1") + .addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS) + .setMultivalued(true)); + } + private void addCollationProperty() { RuleBasedCollator c = UnicodeSetUtilities.RAW_COLLATOR; // (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); @@ -652,6 +755,7 @@ public StringTransformProperty( setUniformUnassigned(hasUniformUnassigned); } + @Override protected String _getValue(int codepoint) { return transform.transform(UTF16.valueOf(codepoint)); } @@ -666,6 +770,7 @@ public CodepointTransformProperty( setUniformUnassigned(hasUniformUnassigned); } + @Override protected String _getValue(int codepoint) { return transform.transform(codepoint); } @@ -682,6 +787,7 @@ public static class EncodingProperty extends SimpleProperty { encoder = new CharEncoder(charset, false, false); } + @Override protected String _getValue(int codepoint) { int len = encoder.getValue(codepoint, temp, 0); if (len < 0) { @@ -697,6 +803,7 @@ protected String _getValue(int codepoint) { return result.toString(); } + @Override public boolean isDefault(int codepoint) { int len = encoder.getValue(codepoint, temp, 0); return len < 0; @@ -716,6 +823,7 @@ public static class EncodingPropertyBoolean extends SimpleProperty { encoder = new CharEncoder(charset, true, true); } + @Override protected String _getValue(int codepoint) { return (encoder.getValue(codepoint, null, 0) > 0) ? "Yes" : "No"; } @@ -731,6 +839,7 @@ public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) { return this; } + @Override protected UnicodeMap _getUnicodeMap() { UnicodeMap result = new UnicodeMap(); result.putAll(unicodeSet, "Yes"); @@ -743,10 +852,12 @@ public XPropertyFactory.UnicodeSetProperty set(String string) { return set(new UnicodeSet(string).freeze()); } + @Override protected String _getValue(int codepoint) { return YESNO_ARRAY[unicodeSet.contains(codepoint) ? 0 : 1]; } + @Override protected List _getAvailableValues(List result) { return YESNO; } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java new file mode 100644 index 000000000..8ed9706ef --- /dev/null +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestMultivalued.java @@ -0,0 +1,51 @@ +package org.unicode.jsptest; + +import com.ibm.icu.text.UnicodeSet; +import org.junit.jupiter.api.Test; +import org.unicode.jsp.UnicodeSetUtilities; +import org.unicode.unittest.TestFmwkMinusMinus; + +public class TestMultivalued extends TestFmwkMinusMinus { + @Test + public void TestScx1Script() { + String unicodeSetString = "\\p{scx=deva}"; + UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); + + UnicodeSet mustContain = new UnicodeSet("[ᳵ।]"); // one character B&D, other B&D&D&G&... + assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain)); + + UnicodeSet mustNotContain = new UnicodeSet("[ক]"); // one Bangla character + assertFalse( + unicodeSetString + " !contains " + mustNotContain, + parsed.containsAll(mustNotContain)); + } + + @Test + public void TestScxMulti() { + String unicodeSetString = "\\p{scx=beng,deva}"; + String exceptionMessage = null; + try { + UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); + } catch (Exception e) { + exceptionMessage = e.getMessage(); + } + assertEquals( + "Expected exception", + "Multivalued property values can't contain commas.", + exceptionMessage); + } + + @Test + public void TestExemplars() { + String unicodeSetString = "\\p{exem=da}"; + UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString); + + UnicodeSet mustContain = new UnicodeSet("[æ]"); + assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain)); + + UnicodeSet mustNotContain = new UnicodeSet("[ç]"); + assertFalse( + unicodeSetString + " !contains " + mustNotContain, + parsed.containsAll(mustNotContain)); + } +} diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index e05911654..d0b97a857 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -413,14 +413,6 @@ public void TestPerMill(final String name, final Charset charset) { } } - @Test - public void TestScriptSpecials() { - // UnicodeSet set = UnicodeSetUtilities.parseUnicodeSet("[:scs=Hant:]"); - // assertNotEquals("Hant", 0, set.size()); - UnicodeSet set2 = UnicodeSetUtilities.parseUnicodeSet("[:scx=Arab,Syrc:]"); - assertNotEquals("Arab Syrc", 0, set2.size()); - } - @Test public void TestGC() { Map> SPECIAL_GC = diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 773e78f4e..615986a7a 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -6,6 +6,7 @@ */ package org.unicode.props; +import com.google.common.base.Splitter; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.SymbolTable; @@ -32,6 +33,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { + private static final Splitter SPLIT_COMMAS = Splitter.on(","); public static final UnicodeSet NONCHARACTERS = new UnicodeSet("[:noncharactercodepoint:]").freeze(); public static final UnicodeSet PRIVATE_USE = new UnicodeSet("[:gc=privateuse:]").freeze(); @@ -151,6 +153,13 @@ public static synchronized void ResetCacheProperties() { private boolean hasUniformUnassigned = true; + private boolean isMultivalued = false; + + public UnicodeProperty setMultivalued(boolean value) { + isMultivalued = value; + return this; + } + /* * Name: Unicode_1_Name Name: ISO_Comment Name: Name Name: Unicode_1_Name * @@ -309,7 +318,7 @@ public final String getValue(int codepoint, boolean getShortest) { public final String getFirstNameAlias() { if (firstNameAlias == null) { - firstNameAlias = (String) getNameAliases().get(0); + firstNameAlias = getNameAliases().get(0); } return firstNameAlias; } @@ -378,10 +387,15 @@ public final UnicodeSet getSet(PatternMatcher matcher) { * the original contents. */ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { - return getSet( - new SimpleMatcher( - propertyValue, isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), - result); + if (isMultivalued && propertyValue.contains(",")) { + throw new IllegalArgumentException("Multivalued property values can't contain commas."); + } else { + return getSet( + new SimpleMatcher( + propertyValue, + isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), + result); + } } private UnicodeMap unicodeMap = null; @@ -407,13 +421,20 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { Iterator it = um.getAvailableValues(null).iterator(); main: while (it.hasNext()) { - String value = (String) it.next(); + String value = it.next(); temp.clear(); Iterator it2 = getValueAliases(value, temp).iterator(); while (it2.hasNext()) { - String value2 = (String) it2.next(); + String value2 = it2.next(); // System.out.println("Values:" + value2); - if (matcher.test(value2) || matcher.test(toSkeleton(value2))) { + if (isMultivalued && value2.contains(",")) { + for (String part : SPLIT_COMMAS.split(value2)) { + if (matcher.test(part) || matcher.test(toSkeleton(part))) { + um.keySet(value, result); + continue main; + } + } + } else if (matcher.test(value2) || matcher.test(toSkeleton(value2))) { um.keySet(value, result); continue main; } @@ -537,7 +558,7 @@ protected UnicodeMap _getUnicodeMap() { // if (DEBUG && i == 0x41) System.out.println(i + "\t" + // getValue(i)); String value = getValue(i); - String resultValue = (String) result.getValue(i); + String resultValue = result.getValue(i); if (!value.equals(resultValue)) { throw new RuntimeException("Value failure at: " + Utility.hex(i)); } @@ -760,13 +781,13 @@ public final Factory add(UnicodeProperty sp) { List c = sp.getNameAliases(new ArrayList<>(1)); Iterator it = c.iterator(); while (it.hasNext()) { - skeletonNames.put(toSkeleton((String) it.next()), sp); + skeletonNames.put(toSkeleton(it.next()), sp); } return this; } public UnicodeProperty getProperty(String propertyAlias) { - return (UnicodeProperty) skeletonNames.get(toSkeleton(propertyAlias)); + return skeletonNames.get(toSkeleton(propertyAlias)); } public final List getAvailableNames() { @@ -790,7 +811,7 @@ public final List getAvailableNames(int propertyTypeMask, List r if (result == null) result = new ArrayList<>(1); Iterator it = canonicalNames.keySet().iterator(); while (it.hasNext()) { - String item = (String) it.next(); + String item = it.next(); UnicodeProperty property = getProperty(item); if (DEBUG) System.out.println("Properties: " + item + "," + property.getType()); if (!property.isType(propertyTypeMask)) { @@ -1008,11 +1029,13 @@ public UnicodeProperty setFilter(StringFilter filter) { List temp = new ArrayList<>(1); + @Override public List _getAvailableValues(List result) { temp.clear(); return filter.addUnique(property.getAvailableValues(temp), result); } + @Override public List _getNameAliases(List result) { temp.clear(); return filter.addUnique(property.getNameAliases(temp), result); @@ -1023,13 +1046,14 @@ public String _getValue(int codepoint) { return filter.remap(property.getValue(codepoint)); } + @Override public List _getValueAliases(String valueAlias, List result) { if (backmap == null) { backmap = new HashMap<>(1); temp.clear(); Iterator it = property.getAvailableValues(temp).iterator(); while (it.hasNext()) { - String item = (String) it.next(); + String item = it.next(); String mappedItem = filter.remap(item); if (backmap.get(mappedItem) != null && !allowValueAliasCollisions) { throw new IllegalArgumentException( @@ -1038,7 +1062,7 @@ public List _getValueAliases(String valueAlias, List result) { backmap.put(mappedItem, item); } } - valueAlias = (String) backmap.get(valueAlias); + valueAlias = backmap.get(valueAlias); temp.clear(); return filter.addUnique(property.getValueAliases(valueAlias, temp), result); } @@ -1065,7 +1089,7 @@ public final List addUnique(Collection source, List resu if (result == null) result = new ArrayList<>(1); Iterator it = source.iterator(); while (it.hasNext()) { - UnicodeProperty.addUnique(remap((String) it.next()), result); + UnicodeProperty.addUnique(remap(it.next()), result); } return result; } @@ -1305,7 +1329,7 @@ public SimpleProperty setValues(String[] valueAliases, String[] alternateValueAl public SimpleProperty setValues(List valueAliases) { this.values = new LinkedHashSet<>(valueAliases); for (Iterator it = this.values.iterator(); it.hasNext(); ) { - _addToValues((String) it.next(), null); + _addToValues(it.next(), null); } return this; } @@ -1321,7 +1345,7 @@ protected void _fillValues() { List newvalues = getUnicodeMap_internal().getAvailableValues(new ArrayList()); for (Iterator it = newvalues.iterator(); it.hasNext(); ) { - _addToValues((String) it.next(), null); + _addToValues(it.next(), null); } } @@ -1380,7 +1404,7 @@ public UnicodeMapProperty set(UnicodeMap map) { @Override protected String _getValue(int codepoint) { - return (String) unicodeMap.getValue(codepoint); + return unicodeMap.getValue(codepoint); } /* protected List _getValueAliases(String valueAlias, List result) { @@ -1407,7 +1431,7 @@ public boolean isValidValue(String propertyValue) { if (isType(STRING_OR_MISC_MASK)) { return true; } - Collection values = (Collection) getAvailableValues(); + Collection values = getAvailableValues(); for (String valueAlias : values) { if (UnicodeProperty.compareNames(valueAlias, propertyValue) == 0) { return true; @@ -1426,7 +1450,7 @@ public List getValueAliases() { if (isType(STRING_OR_MISC_MASK)) { return result; } - Collection values = (Collection) getAvailableValues(); + Collection values = getAvailableValues(); for (String valueAlias : values) { UnicodeProperty.addAllUnique(getValueAliases(valueAlias), result); }