diff --git a/.gitignore b/.gitignore index 60e7ec63e..c6d5a34bd 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ perf-*.xml test-*.xml # Directories +.idea/ .settings/ .vs/ .vscode/ diff --git a/docs/ucdxml.md b/docs/ucdxml.md new file mode 100644 index 000000000..6711254f4 --- /dev/null +++ b/docs/ucdxml.md @@ -0,0 +1,69 @@ +# UCDXML + +There are three separate processes for generating and validating UCDXML files and their corresponding UAX42 report. + +1. Generate the UCDXML files. +2. (Optional) You can compare the generated UCDXML files against each other (e.g., Flat vs Grouped) or against + previous versions. +3. Generate UAX42. There are three steps involved: + + 1. Generate the property value fragments. The updated versions should live in + unicodetools/src/main/resources/org/unicode/uax42/fragments + 2. Generate the index.html and index.rnc files for UAX42. + 3. (Optional) Validate the UCDXML files using index.rnc. + +## Generate UCDXML files + +- You can generate flat or grouped versions of UCDXML. 
+- You can generate UCDXML files for: + - the full range of code points + - the Unihan code points + - code points that are not Unihan code points + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range ALL --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range UNIHAN --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range NOUNIHAN --output FLAT"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range ALL --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range UNIHAN --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.UCDXML"' '-Dexec.args="--range NOUNIHAN --output GROUPED"' -DCLDR_DIR=$(cd ../cldr; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +``` + +## Compare UCDXML files + +After generating UCDXML files, you can compare: + +- Different versions of the same type (range and output) of UCDXML file +- Grouped and flat versions of the same code point range + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.CompareUCDXML"' '-Dexec.args="-a {path to file} -b {path to file}"' +``` + +## Generating TR42 + +### Step 1 - Generate property value fragments + +``` +mvn compile exec:java '-Dexec.mainClass="org.unicode.xml.GeneratePropertyValues"' 
-DCLDR_DIR=$(cd ../cldr ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) +``` + +UAX42 fragments live in unicodetools/src/main/resources/org/unicode/uax42/fragments + +### Step 2 - Generate TR42 index.html and index.rnc + +``` +mvn xml:transform -f $(cd ./unicodetools/src/main/resources/org/unicode/uax42; pwd) -Doutputdir=$(cd ../Generated/uax42; pwd) +``` + +### Step 3 - Validate generated UCDXML files + +You'll need a [RELAX NG](https://relaxng.org/) schema validator. +We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example. + +1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang) +2. Run the following, passing the schema first and then the UCDXML file to validate: + ``` + java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\index.rnc {path to UCDXML file} + ``` + Note that the UCDXML file must be saved in NFD, because the Unihan syntax regular expressions expect NFD. diff --git a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java index 914168c90..ec3d513a3 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdProperty.java @@ -85,12 +85,16 @@ public enum UcdProperty { Emoji_SB(PropertyType.Miscellaneous, "ESB"), ISO_Comment(PropertyType.Miscellaneous, "isc"), Jamo_Short_Name(PropertyType.Miscellaneous, "JSN"), + NC_Corrected(PropertyType.Miscellaneous, "ncCorrected"), + NC_Original(PropertyType.Miscellaneous, "ncOriginal"), + NC_Version(PropertyType.Miscellaneous, "ncVersion"), Name(PropertyType.Miscellaneous, "na"), Name_Alias(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "Name_Alias"), Named_Sequences(PropertyType.Miscellaneous, "NS"), Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"), Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"), Unicode_1_Name(PropertyType.Miscellaneous, "na1"), +
emoji_variation_sequence(PropertyType.Miscellaneous, "EVS"), kAlternateHanYu(PropertyType.Miscellaneous, "cjkAlternateHanYu"), kAlternateJEF(PropertyType.Miscellaneous, "cjkAlternateJEF"), kAlternateKangXi(PropertyType.Miscellaneous, "cjkAlternateKangXi"), diff --git a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java index f8bb22f1a..0aac98c26 100644 --- a/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java +++ b/unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java @@ -766,6 +766,7 @@ public static East_Asian_Width_Values forName(String name) { // Emoji_DCM // Emoji_KDDI // Emoji_SB + // emoji_variation_sequence // Equivalent_Unified_Ideograph // FC_NFKC_Closure public enum General_Category_Values implements Named { @@ -1668,6 +1669,9 @@ public static Line_Break_Values forName(String name) { // Name_Alias // Named_Sequences // Named_Sequences_Prov + // NC_Corrected + // NC_Original + // NC_Version public enum NFC_Quick_Check_Values implements Named { Maybe("M"), No("N"), diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java b/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java index 7f9f3008b..c7912b690 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/LoadImage.java @@ -891,7 +891,8 @@ public static void doSb(String outputDir) throws IOException { // try { // copy(new URL(url), new File(outputDir + "/sb","sb_" + code + ".gif")); //// BufferedImage sourceImage = ImageIO.read(new URL(url)); - //// writeImage(sourceImage,outputDir + "/sb","sb_" + code, "gif"); + //// writeImage(sourceImage,outputDir + "/sb","sb_" + code, + // "gif"); // System.out.println(code); // } catch (Exception e) { // System.out.println("Skipping " + code); diff --git a/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java 
b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java new file mode 100644 index 000000000..2d268878e --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java @@ -0,0 +1,351 @@ +package org.unicode.xml; + +import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.util.VersionInfo; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.unicode.props.UnicodeProperty; + +/** + * Used by UCDXML to get string values of attributes for each code point from + * IndexUnicodeProperties. + */ +public class AttributeResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final UnicodeMap map_age; + private final UnicodeMap map_block; + private final UnicodeMap map_decomposition_type; + private final UnicodeMap map_general_category; + private final UnicodeMap map_script; + private final UnicodeMap map_script_extensions; + private final HashMap> map_NameAlias; + + // If there is a change in any of these properties between two adjacent characters, it will + // result in a new range. 
+ private final UCDPropertyDetail[] rangeDefiningPropertyDetails = { + UCDPropertyDetail.Age_Detail, + UCDPropertyDetail.Bidi_Class_Detail, + UCDPropertyDetail.Block_Detail, + UCDPropertyDetail.Decomposition_Mapping_Detail, + UCDPropertyDetail.Numeric_Type_Detail, + UCDPropertyDetail.Numeric_Value_Detail, + UCDPropertyDetail.Vertical_Orientation_Detail + }; + + /** + * Creates a resolver backed by {@code iup}, loading up front the property maps and the + * NameAliases data that the lookup methods read. + * + * @param iup property source that all lookups are resolved against + */ + public AttributeResolver(IndexUnicodeProperties iup) { + indexUnicodeProperties = iup; + map_age = indexUnicodeProperties.loadEnum(UcdProperty.Age); + map_block = indexUnicodeProperties.loadEnum(UcdProperty.Block); + map_decomposition_type = indexUnicodeProperties.loadEnum(UcdProperty.Decomposition_Type); + map_general_category = indexUnicodeProperties.loadEnum(UcdProperty.General_Category); + map_script = indexUnicodeProperties.loadEnum(UcdProperty.Script); + map_script_extensions = + indexUnicodeProperties.getProperty(UcdProperty.Script_Extensions).getUnicodeMap(); + + // UCD code is only set up to read a single Alias value from NameAliases.txt. + // Instead, we load both the alias and its type here in the constructor and keep + // them in memory for the lifetime of this resolver, since NameAliases.txt isn't + // too large.
+ map_NameAlias = loadNameAliases(); + } + + protected enum AliasType { + ABBREVIATION("abbreviation"), + ALTERNATE("alternate"), + CONTROL("control"), + CORRECTION("correction"), + FIGMENT("figment"), + NONE("none"); + + private final String aliasType; + + AliasType(String aliasType) { + this.aliasType = aliasType; + } + + public String toString() { + return aliasType; + } + } + + private static class NameAlias { + + private String alias; + private final AliasType type; + + private NameAlias(String alias, AliasType type) { + this.alias = alias; + this.type = type; + } + + public String getAlias() { + return alias; + } + + public AliasType getType() { + return type; + } + } + + private static class NameAliasComparator implements java.util.Comparator { + + @Override + public int compare(NameAlias o1, NameAlias o2) { + return o1.getAlias().compareTo(o2.getAlias()); + } + } + + private HashMap> loadNameAliases() { + HashMap> nameAliasesByCodePoint = new HashMap<>(); + final PropertyParsingInfo fileInfo = + PropertyParsingInfo.getPropertyInfo(UcdProperty.Name_Alias); + String fullFilename = fileInfo.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = new UcdLineParser(FileUtilities.in("", fullFilename)); + NameAliasComparator nameAliasComparator = new NameAliasComparator(); + + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + int codepoint = Integer.parseInt(parts[0], 16); + NameAlias nameAlias; + if (parts.length < 3) { + nameAlias = new NameAlias(parts[1], AliasType.NONE); + } else { + nameAlias = + new NameAlias( + parts[1], AliasType.valueOf(parts[2].toUpperCase(Locale.ROOT))); + } + + if (nameAliasesByCodePoint.containsKey(codepoint)) { + LinkedList nameAliases = + new LinkedList<>(nameAliasesByCodePoint.get(codepoint)); + nameAliases.add(nameAlias); + nameAliases.sort(nameAliasComparator); + nameAliasesByCodePoint.replace(codepoint, nameAliases); + } else { + nameAliasesByCodePoint.put(codepoint, new 
LinkedList<>(List.of(nameAlias))); + } + } + return nameAliasesByCodePoint; + } + + /** + * Returns the string form of {@code prop} for {@code codepoint}, dispatching first on the + * property's type and then on the property itself. + * + * @param prop the property to resolve + * @param codepoint the code point to look up + * @return the attribute value; may be {@code null} for some properties (for example a Unihan + * numeric property whose resolved value is "NaN" or absent) + * @throws RuntimeException if the property type, Catalog property or Binary value is not handled + */ + public String getAttributeValue(UcdProperty prop, int codepoint) { + String resolvedValue = indexUnicodeProperties.getResolvedValue(prop, codepoint); + switch (prop.getType()) { + case Numeric: + switch (prop) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + // Constant-first equals: a null resolved value yields null instead of + // throwing NullPointerException (the default branch below also expects + // that resolvedValue can be null). + return "NaN".equals(resolvedValue) ? null : resolvedValue; + default: + return Optional.ofNullable(resolvedValue).orElse("NaN"); + } + case String: + switch (prop) { + case Equivalent_Unified_Ideograph: + String EqUIdeo = getMappingValue(codepoint, resolvedValue, false, ""); + return (EqUIdeo.equals("#")) ? null : EqUIdeo; + case kCompatibilityVariant: + String kCompatibilityVariant = + getMappingValue(codepoint, resolvedValue, false, "U+"); + return (kCompatibilityVariant.equals("#")) ? "" : kCompatibilityVariant; + case kSimplifiedVariant: + case kTraditionalVariant: + String kVariant = + getMappingValue( + codepoint, + resolvedValue, + isUnihanAttributeRange(codepoint), + "U+"); + return (kVariant.equals("#")) ? "" : kVariant; + case Bidi_Mirroring_Glyph: + // Returning empty string for bmg to maintain compatibility with older + // generated files. + String bmg = getMappingValue(codepoint, resolvedValue, false, ""); + return (bmg.equals("#")) ?
"" : bmg; + default: + return getMappingValue(codepoint, resolvedValue, false, ""); + } + case Miscellaneous: + switch (prop) { + case Jamo_Short_Name: + // return map_jamo_short_name.get(codepoint).getShortName(); + return Optional.ofNullable(resolvedValue).orElse(""); + case Name: + if (resolvedValue != null + && resolvedValue.startsWith("CJK UNIFIED IDEOGRAPH-")) { + return "CJK UNIFIED IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) { + return "CJK COMPATIBILITY IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("TANGUT IDEOGRAPH-")) { + return "TANGUT IDEOGRAPH-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("KHITAN SMALL SCRIPT CHARACTER-")) { + return "KHITAN SMALL SCRIPT CHARACTER-#"; + } + if (resolvedValue != null && resolvedValue.startsWith("NUSHU CHARACTER-")) { + return "NUSHU CHARACTER-#"; + } + if (resolvedValue != null + && resolvedValue.startsWith("EGYPTIAN HIEROGLYPH-")) { + return "EGYPTIAN HIEROGLYPH-#"; + } + return Optional.ofNullable(resolvedValue).orElse(""); + case kDefinition: + return resolvedValue; + default: + if (resolvedValue != null) { + return resolvedValue.replaceAll("\\|", " "); + } + return ""; + } + case Catalog: + switch (prop) { + case Age: + String age = map_age.get(codepoint).getShortName(); + return (age.equals("NA")) ? 
"unassigned" : age; + case Block: + return map_block.get(codepoint).getShortName(); + case Script: + return map_script.get(codepoint).getShortName(); + case Script_Extensions: + StringBuilder extensionBuilder = new StringBuilder(); + String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0); + for (String extension : extensions) { + extensionBuilder.append( + UcdPropertyValues.Script_Values.valueOf(extension) + .getShortName()); + extensionBuilder.append(" "); + } + return extensionBuilder.toString().trim(); + default: + throw new RuntimeException("Missing Catalog case"); + } + case Enumerated: + switch (prop) { + case Decomposition_Type: + // Returning lower case to maintain compatibility with older generated + // files. + return map_decomposition_type + .get(codepoint) + .getShortName() + .toLowerCase(Locale.ROOT); + default: + final UnicodeProperty property = indexUnicodeProperties.getProperty(prop); + final List valueAliases = + property.getValueAliases(property.getValue(codepoint)); + return valueAliases.get(0); + } + case Binary: + { + switch (resolvedValue) { + // Seems overkill to get this from UcdPropertyValues.Binary + case "No": + return "N"; + case "Yes": + return "Y"; + default: + throw new RuntimeException("Unexpected Binary value"); + } + } + default: + throw new RuntimeException("Missing PropertyType case"); + } + } + + public boolean isUnassignedCodePoint(int codepoint) { + return UcdPropertyValues.General_Category_Values.Unassigned.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Private_Use.equals(getgc(codepoint)) + || UcdPropertyValues.General_Category_Values.Surrogate.equals(getgc(codepoint)); + } + + public UcdPropertyValues.General_Category_Values getgc(int codepoint) { + return map_general_category.get(codepoint); + } + + public String getNChar(int codepoint) { + return getAttributeValue(UcdProperty.Noncharacter_Code_Point, codepoint); + } + + public HashMap getNameAliases(int codepoint) { + HashMap 
nameAliases = new LinkedHashMap<>(); + LinkedList nameAliasList = map_NameAlias.get(codepoint); + if (null != nameAliasList && !nameAliasList.isEmpty()) { + for (NameAlias nameAlias : nameAliasList) { + nameAliases.put(nameAlias.getAlias(), nameAlias.getType().toString()); + } + return nameAliases; + } + return null; + } + + private String getMappingValue( + int codepoint, String resolvedValue, boolean ignoreUnihanRange, String prefix) { + if (null == resolvedValue) { + return "#"; + } + int[] resolvedValueInts = resolvedValue.codePoints().toArray(); + if (resolvedValueInts.length == 1 + && resolvedValueInts[0] == codepoint + && !ignoreUnihanRange) { + return "#"; + } + StringBuilder sb = new StringBuilder(); + for (int i : resolvedValueInts) { + sb.append(prefix).append(getCPString(i)).append(" "); + } + return sb.toString().trim(); + } + + public boolean isDifferentRange(VersionInfo ucdVersion, int codepointA, int codepointB) { + boolean isDifference = false; + for (UCDPropertyDetail propDetail : rangeDefiningPropertyDetails) { + UcdProperty prop = propDetail.getUcdProperty(); + if (ucdVersion.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || ucdVersion.compareTo(propDetail.getMaxVersion()) < 0)) { + isDifference = + isDifference + || !getAttributeValue(prop, codepointA) + .equals(getAttributeValue(prop, codepointB)); + } + } + return isDifference; + } + + private static String getCPString(int codepoint) { + return String.format("%4s", Integer.toHexString(codepoint)) + .replace(" ", "0") + .toUpperCase(Locale.ROOT); + } + + public String getHexString(int codepoint) { + return getCPString(codepoint); + } + + public boolean isUnihanAttributeRange(int codepoint) { + return getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + || !getAttributeValue(UcdProperty.kCompatibilityVariant, codepoint).isEmpty(); + } + + public boolean isUnifiedIdeograph(int codepoint) { + return 
getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y") + && getAttributeValue(UcdProperty.Name, codepoint).equals("CJK UNIFIED IDEOGRAPH-#"); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java new file mode 100644 index 000000000..f09f98e86 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/CompareUCDXML.java @@ -0,0 +1,203 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Objects; +import org.unicode.props.UcdProperty; + +/** + * Utility for comparing two UCDXML files. Originally intended to compare UCDXML files generated + * using https://github.com/eric-muller/ucdxml to UCDXML files generated using + * org.unicode.xml.UCDXML. + */ +public class CompareUCDXML { + + private static final String NEWLINE = System.getProperty("line.separator"); + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("fileA", 'a', UOption.REQUIRES_ARG), + UOption.create("fileB", 'b', UOption.REQUIRES_ARG) + }; + + private static final UcdProperty[] codepointSequenceProperties = + new UcdProperty[] { + UcdProperty.Named_Sequences, + UcdProperty.Named_Sequences_Prov, + UcdProperty.Standardized_Variant, + UcdProperty.Emoji_DCM, + UcdProperty.Emoji_KDDI, + UcdProperty.Emoji_SB, + UcdProperty.Do_Not_Emit_Preferred + }; + + private static final HashMap knownDifferences; + + static { + knownDifferences = new HashMap<>(); + + // https://github.com/unicode-org/properties/issues/296 + knownDifferences.put(0x31E4, new String[] {"Hani", "Zyyy"}); + knownDifferences.put(0x31E5, new String[] {"Hani", "Zyyy"}); + + // https://github.com/unicode-org/unicodetools/issues/325 + knownDifferences.put(0x109F7, new String[] {"1/6", "2/12"}); + knownDifferences.put(0x109F8, 
new String[] {"1/4", "3/12"}); + knownDifferences.put(0x109F9, new String[] {"1/3", "4/12"}); + knownDifferences.put(0x109FB, new String[] {"1/2", "6/12"}); + knownDifferences.put(0x109FD, new String[] {"2/3", "8/12"}); + knownDifferences.put(0x109FE, new String[] {"3/4", "9/12"}); + knownDifferences.put(0x109FF, new String[] {"5/6", "10/12"}); + + // https://github.com/unicode-org/properties/issues/172 + knownDifferences.put(0x5146, new String[] {"1000000", "1000000 1000000000000"}); + knownDifferences.put(0x79ED, new String[] {"1000000000", "1000000000 1000000000000"}); + } + + private static final int HELP = 0, FILE_A = 1, FILE_B = 2, LOGFILE = 3; + + public static void main(String[] args) throws Exception { + File fileA = null; + File fileB = null; + int errorCount = 0; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println("CompareUcdXML --fileA {file path} --fileB {file path}"); + System.exit(0); + } + + if (options[FILE_A].doesOccur) { + try { + fileA = new File(options[FILE_A].value); + if (!fileA.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_A].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileA (or -a)"); + } + + if (options[FILE_B].doesOccur) { + try { + fileB = new File(options[FILE_B].value); + if (!fileB.exists()) { + throw new IOException(); + } + } catch (Exception e) { + throw new IllegalArgumentException("Could not find " + options[FILE_B].value); + } + } else { + throw new IllegalArgumentException("Missing command line option: --fileB (or -b)"); + } + + System.out.println("Comparing " + fileA + " and " + fileB); + + final XMLProperties xmlPropsA = new XMLProperties(fileA); + final XMLProperties xmlPropsB = new XMLProperties(fileB); + + // First, iterate through the UcdProperties on each codepoint. 
+ for (final UcdProperty prop : UcdProperty.values()) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + if (!fileAMap.equals(fileBMap)) { + for (int i = 0; i <= 0x10ffff; ++i) { + try { + String xmlValA = fileAMap.get(i); + String xmlValB = fileBMap.get(i); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0) + && !isKnownDifference(i, xmlValA, xmlValB)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + String.format("0x%04X", i) + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + System.out.println("Exception thrown for " + String.format("0x%04X", i)); + System.out.println(e.getMessage()); + } + } + } + } + // Now handle anything that contains codepoint sequences. + for (UcdProperty prop : codepointSequenceProperties) { + UnicodeMap fileAMap = xmlPropsA.getMap(prop); + UnicodeMap fileBMap = xmlPropsB.getMap(prop); + UnicodeSet differences = fileAMap.keySet().addAll(fileBMap.keySet()); + for (String key : differences) { + try { + String xmlValA = fileAMap.get(key); + String xmlValB = fileBMap.get(key); + if (!Objects.equals(xmlValA, xmlValB)) { + // At least one string is != null and the strings are different, but we + // don't care if one + // is null and one is empty_string + // As far as we care, empty_string == null == "00000" + int lenA = + (xmlValA == null + ? 0 + : (xmlValA.equals("00000") ? 
0 : xmlValA.length())); + int lenB = + (xmlValB == null + ? 0 + : (xmlValB.equals("00000") ? 0 : xmlValB.length())); + if (!(lenA == 0 && lenB == 0)) { + errorCount++; + System.out.println( + "For UCDProperty " + + prop.name() + + " (" + + prop.getShortName() + + ") [" + + key + + "], "); + System.out.println("\t" + fileA + " = " + xmlValA); + System.out.println("\t" + fileB + " = " + xmlValB); + } + } + } catch (Exception e) { + // key is a String (a code point sequence), so it cannot be formatted with %X; + // doing so would throw IllegalFormatConversionException from inside this + // handler and hide the original exception. + System.out.println("Exception thrown for " + key); + System.out.println(e.getMessage()); + } + } + } + // Clamp the status: exit statuses are commonly truncated to 8 bits, so e.g. 256 + // differences would otherwise be reported as success (0). + System.exit(Math.min(errorCount, 255)); + } + + /** + * Reports whether the pair of values found for {@code codepoint} is one of the documented, + * expected differences, in either order. + */ + private static boolean isKnownDifference(int codepoint, String xmlValA, String xmlValB) { + if (knownDifferences.containsKey(codepoint)) { + String knownValue1 = knownDifferences.get(codepoint)[0]; + String knownValue2 = knownDifferences.get(codepoint)[1]; + return (knownValue1.equals(xmlValA) && knownValue2.equals(xmlValB)) + || (knownValue1.equals(xmlValB) && knownValue2.equals(xmlValA)); + } + return false; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java new file mode 100644 index 000000000..0d28734b0 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/GeneratePropertyValues.java @@ -0,0 +1,1772 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.unicode.props.PropertyParsingInfo; +import
org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues.*; +import org.unicode.text.utility.Settings; + +/** + * Utility for generating fragments that describe the property values in a format that can be + * displayed in UAX42. UAX42 fragments live in + * unicodetools/src/main/resources/org/unicode/uax42/fragments + */ +public class GeneratePropertyValues { + + private enum VALUESOUTPUTTYPE { + VALUE_PER_LINE, + ALPHABETICAL_GROUP, + NUMERICAL_GROUP, + MAX_LINE_LENGTH; + } + + private enum SCHEMA { + // Manual indicates a fragment file that is maintained manually rather than generated from + // this utility. + // Manual + NAMESPACE("namespace"), + // Manual + DATATYPES("datatypes"), + // Manual + START("start"), + BOOLEAN("boolean"), + // Manual + DESCRIPTION("description"), + // Manual + REPERTOIRE("repertoire"), + PROPERTIES("properties"), + TANGUT("tangut"), + NUSHU("nushu"), + EMOJI_DATA("emoji-data"), + // Manual + BLOCK("block"), + // Manual + NAMED_SEQUENCES("named-sequences"), + // Manual + NORMALIZATION_CORRECTIONS("normalization-corrections"), + // Manual + STANDARDIZED_VARIANTS("standardized-variants"), + // Manual + CJK_RADICALS("cjk-radicals"), + // Manual + EMOJI_SOURCES("emoji-sources"), + DO_NOT_EMIT("do-not-emit"); + + final String name; + + SCHEMA(String name) { + this.name = name; + } + + String getName() { + return this.name; + } + } + + private static final class TR38Details { + boolean isList; + String syntax; + + public TR38Details(boolean isList, String syntax) { + this.isList = isList; + this.syntax = syntax; + } + + public boolean isList() { + return isList; + } + + public String getSyntax() { + return syntax; + } + } + + private static final int MAX_LINE_LENGTH = 70; + private static final String NEWLINE = System.lineSeparator(); + private static final String DOUBLELINE = System.lineSeparator() + System.lineSeparator(); + private static final String TRIPLELINE = + System.lineSeparator() + System.lineSeparator() + 
System.lineSeparator(); + private static File destinationFolder = null; + + private static HashMap syntaxTR38; + private static final String NAMESPACE = "http://unicode.org/ns/2001/ucdxml"; + private static final String TR38URL = "https://www.unicode.org/reports/tr38"; + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.OPTIONAL_ARG), + UOption.create("outputfolder", 'f', UOption.REQUIRES_ARG) + }; + + private static final int HELP = 0, UCDVERSION = 1, OUTPUTFOLDER = 2; + + /** + * Command-line entry point: resolves the UCD version and the destination folder (creating + * the folder if needed) and then generates the fragment files. Exits with status 1 if the + * options cannot be resolved. + * + * @param args optional {@code --ucdversion} and {@code --outputfolder} options + */ + public static void main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "GeneratePropertyValues [--ucdversion {version number}] [--outputfolder {destination}]"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = VersionInfo.getInstance(options[UCDVERSION].value); + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[UCDVERSION].value + + " to a valid UCD version"); + } + } else { + ucdVersion = VersionInfo.getInstance(Settings.latestVersion); + } + if (options[OUTPUTFOLDER].doesOccur) { + try { + destinationFolder = new File(options[OUTPUTFOLDER].value); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + options[OUTPUTFOLDER].value); + } + } else { + try { + // Use '/' rather than '\\': java.io.File accepts '/' on every platform, + // whereas a backslash is an ordinary file-name character outside Windows + // and would create a single oddly named directory. + destinationFolder = new File(Settings.Output.GEN_DIR + "uax42/fragments/"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + + Settings.Output.GEN_DIR + + "uax42/fragments/"); + } + } + + } catch (Exception e) { + System.err.println(e.getMessage()); + System.exit(1); + }
+ + if (ucdVersion != null && destinationFolder.exists()) { + buildPropertyValues(ucdVersion); + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when generating uax42 fragment files."); + System.exit(1); + } + } + + private static void buildPropertyValues( + // It would be nice to be able to generate values by ucdVersion. Leaving this here for + // now... + VersionInfo ucdVersion) throws IOException, URISyntaxException { + syntaxTR38 = parseTR38(); + + createPropertyFragment( + SCHEMA.BOOLEAN, + getFormattedValues(SCHEMA.BOOLEAN, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Age, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Age, VALUESOUTPUTTYPE.NUMERICAL_GROUP)); + createPropertyFragment( + UcdProperty.Name, SCHEMA.PROPERTIES, getFormattedSyntax(UcdProperty.Name)); + createPropertyFragment( + UcdProperty.Unicode_1_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Unicode_1_Name)); + createPropertyFragment( + UcdProperty.Name_Alias.getShortName() + ".xml", + "name-alias element", + SCHEMA.PROPERTIES, + getFormattedElement(UcdProperty.Name_Alias)); + createPropertyFragment( + UcdProperty.Block, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Block, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.General_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.General_Category, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Canonical_Combining_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Canonical_Combining_Class, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Bidi_Class, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Bidi_Class, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.Bidi_Mirrored, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Mirrored)); + createPropertyFragment( + 
UcdProperty.Bidi_Mirroring_Glyph, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Mirroring_Glyph)); + createPropertyFragment( + UcdProperty.Bidi_Control, + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Bidi_Control)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Bidi_Paired_Bracket_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Bidi_Paired_Bracket, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Bidi_Paired_Bracket)); + createPropertyFragment( + "decomposition.xml", + "decomposition properties", + SCHEMA.PROPERTIES, + getFormattedDecompositionProperties()); + createPropertyFragment( + "composition.xml", + "composition properties", + SCHEMA.PROPERTIES, + getFormattedCompositionProperties()); + createPropertyFragment( + "quickcheck.xml", + "quick check properties", + SCHEMA.PROPERTIES, + getFormattedQuickCheckProperties()); + createPropertyFragment( + "numeric.xml", + "numeric properties", + SCHEMA.PROPERTIES, + getFormattedNumericProperties()); + createPropertyFragment( + "joining.xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedJoiningProperties()); + createPropertyFragment( + UcdProperty.Join_Control.getShortName() + ".xml", + "joining properties", + SCHEMA.PROPERTIES, + getFormattedBoolean(UcdProperty.Join_Control)); + createPropertyFragment( + UcdProperty.Line_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute(UcdProperty.Line_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP)); + createPropertyFragment( + UcdProperty.East_Asian_Width, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.East_Asian_Width, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + "casing.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCasingProperties()); + createPropertyFragment( + "simple_case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedSimpleCaseMappingProperties()); + 
createPropertyFragment( + "case_mapping.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseMappingProperties()); + createPropertyFragment( + "case_folding.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseFoldingProperties()); + createPropertyFragment( + "case_other.xml", + "casing properties", + SCHEMA.PROPERTIES, + getFormattedCaseOtherProperties()); + createPropertyFragment( + "script.xml", + "script properties", + SCHEMA.PROPERTIES, + getFormattedScriptProperties()); + createPropertyFragment( + UcdProperty.ISO_Comment, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.ISO_Comment)); + createPropertyFragment( + UcdProperty.Hangul_Syllable_Type, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Hangul_Syllable_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH)); + createPropertyFragment( + UcdProperty.Jamo_Short_Name, + SCHEMA.PROPERTIES, + getFormattedSyntax(UcdProperty.Jamo_Short_Name)); + createPropertyFragment( + UcdProperty.Indic_Syllabic_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Syllabic_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Positional_Category, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Positional_Category, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + UcdProperty.Indic_Conjunct_Break, + SCHEMA.PROPERTIES, + getFormattedAttribute( + UcdProperty.Indic_Conjunct_Break, VALUESOUTPUTTYPE.VALUE_PER_LINE)); + createPropertyFragment( + "identifier.xml", + "identifier properties", + SCHEMA.PROPERTIES, + getFormattedIdentifierProperties()); + createPropertyFragment( + "pattern.xml", + "pattern properties", + SCHEMA.PROPERTIES, + getFormattedPatternProperties()); + createPropertyFragment( + "function_graphic.xml", + "properties related to function and graphic characteristics", + SCHEMA.PROPERTIES, + getFormattedFunctionGraphicProperties()); + createPropertyFragment( + "boundaries.xml", + "properties 
related to boundaries", + SCHEMA.PROPERTIES, + getFormattedBoundaryProperties()); + createPropertyFragment( + "ideographs.xml", + "properties related to ideographs", + SCHEMA.PROPERTIES, + getFormattedIdeographProperties()); + createPropertyFragment( + "miscellaneous.xml", + "miscellaneous properties", + SCHEMA.PROPERTIES, + getFormattedMiscellaneousProperties()); + createPropertyFragment( + "Unihan.xml", + "Unihan properties", + SCHEMA.PROPERTIES, + getFormattedUnihanProperties()); + createPropertyFragment( + "Tangut.xml", "Tangut data", SCHEMA.TANGUT, getFormattedTangutProperties()); + createPropertyFragment( + "Nushu.xml", "Nushu data", SCHEMA.NUSHU, getFormattedNushuProperties()); + createPropertyFragment( + "Emoji.xml", "Emoji properties", SCHEMA.EMOJI_DATA, getFormattedEmojiProperties()); + createPropertyFragment( + "do-not-emit.xml", + "do-not-emit", + SCHEMA.DO_NOT_EMIT, + getFormattedDoNotEmit(VALUESOUTPUTTYPE.VALUE_PER_LINE)); + } + + private static void createPropertyFragment(SCHEMA schema, String formattedFragment) + throws IOException { + createPropertyFragment( + schema.getName() + ".xml", schema.getName(), schema, formattedFragment); + } + + private static void createPropertyFragment( + UcdProperty ucdProperty, SCHEMA schema, String formattedFragment) throws IOException { + createPropertyFragment( + ucdProperty.getShortName() + ".xml", + ucdProperty.getShortName() + " attribute", + schema, + formattedFragment); + } + + private static void createPropertyFragment( + String filename, String title, SCHEMA schema, String formattedFragment) + throws IOException { + BufferedWriter writer = getFragmentWriter(filename); + writer.write( + "" + + NEWLINE + + "" + + NEWLINE); + writer.write(formattedFragment); + writer.write(NEWLINE + ""); + writer.flush(); + writer.close(); + } + + private static BufferedWriter getFragmentWriter(String filename) throws IOException { + File fragmentFolder = new File(destinationFolder + File.separator); + if 
(!fragmentFolder.exists()) { + if (!fragmentFolder.mkdir()) { + throw new IOException(); + } + } + File outputFile = new File(fragmentFolder, filename); + FileOutputStream fileOutputStream = new FileOutputStream(outputFile); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + return new BufferedWriter(outputStreamWriter); + } + + private static String getFormattedAttribute( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + List values; + StringBuilder stringBuilder = new StringBuilder(); + + switch (ucdProperty) { + case Age: + values = getAgeValues(); + break; + case Block: + values = getBlockValues(); + break; + case General_Category: + values = getGeneralCategoryValues(); + break; + case Canonical_Combining_Class: + values = getCanonicalCombiningClassValues(); + break; + case Bidi_Class: + values = getBidirectionalValues(); + break; + case Bidi_Paired_Bracket_Type: + values = getBidiPairedBracketTypeValues(); + break; + case Decomposition_Type: + values = getDecompositionTypeValues(); + break; + case NFC_Quick_Check: + values = getNFCQuickCheckValues(); + break; + case NFD_Quick_Check: + values = getNFDQuickCheckValues(); + break; + case NFKC_Quick_Check: + values = getNFKCQuickCheckValues(); + break; + case NFKD_Quick_Check: + values = getNFKDQuickCheckValues(); + break; + case Numeric_Type: + values = getNumericTypeValues(); + break; + case Joining_Type: + values = getJoiningTypeValues(); + break; + case Joining_Group: + values = getJoiningGroupValues(); + break; + case Line_Break: + values = getLineBreakValues(); + break; + case East_Asian_Width: + values = getEastAsianWidthValues(); + break; + case Hangul_Syllable_Type: + values = getHangulSyllableTypeValues(); + break; + case Indic_Syllabic_Category: + values = getIndicSyllabicCategoryValues(); + break; + case Indic_Positional_Category: + values = 
getIndicPositionalCategoryValues(); + break; + case Indic_Conjunct_Break: + values = getIndicConjunctBreakValues(); + break; + case Vertical_Orientation: + values = getVerticalOrientationValues(); + break; + case Grapheme_Cluster_Break: + values = getGraphemeClusterBreakValues(); + break; + case Word_Break: + values = getWordBreakValues(); + break; + case Sentence_Break: + values = getSentenceBreakValues(); + break; + case Do_Not_Emit_Type: + values = getDoNotEmitTypeValues(); + break; + + default: + throw new IllegalStateException( + ucdProperty.getShortName() + + " is not handled by " + + "getFormattedAttribute."); + } + String formattedValues = formatValues(attributeString.length(), values, valuesoutputtype); + stringBuilder + .append(" code-point-attributes &=") + .append(NEWLINE) + .append(attributeString) + .append("{ "); + if (formattedValues.contains(NEWLINE)) { + stringBuilder.append(formattedValues).append(NEWLINE); + stringBuilder.append( + String.format("%" + (attributeString.length() + "}?".length()) + "s", "}?")); + } else { + stringBuilder.append(formattedValues).append(" }?"); + } + return stringBuilder.toString(); + } + + private static String getFormattedSyntax(UcdProperty ucdProperty) { + final PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + if (propInfo.getRegex() == null) { + throw new NullPointerException( + "Could not find syntax for " + ucdProperty.getShortName()); + } + + String attributeString = + ucdProperty.getShortName().startsWith("cjk") + ? 
" attribute " + ucdProperty.getShortName().substring(2) + " " + : " attribute " + ucdProperty.getShortName() + " "; + String formattedAttributeString; + switch (ucdProperty) { + // { text } + case ISO_Comment: + formattedAttributeString = attributeString + "{ text }?"; + break; + + // { single-code-point } + case Equivalent_Unified_Ideograph: + formattedAttributeString = attributeString + "{ single-code-point }?"; + break; + + // { "" | single-code-point } + case Bidi_Mirroring_Glyph: + formattedAttributeString = attributeString + "{ \"\" | single-code-point }?"; + break; + + // { "#" | single-code-point } + case Bidi_Paired_Bracket: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Simple_Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | single-code-point }?"; + break; + + // { "#" | zero-or-more-code-points } + case Decomposition_Mapping: + case NFKC_Casefold: + case NFKC_Simple_Casefold: + formattedAttributeString = + attributeString + "{ \"#\" | zero-or-more-code-points }?"; + break; + + // { "#" | one-or-more-code-points } + case FC_NFKC_Closure: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Case_Folding: + formattedAttributeString = attributeString + "{ \"#\" | one-or-more-code-points }?"; + break; + + // { "NaN" | RegEx } + case Numeric_Value: + formattedAttributeString = + attributeString + + "{ \"NaN\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + + // Special cases + case Name: + formattedAttributeString = + attributeString + + "{ \"\" |" + + NEWLINE + + " \"CJK UNIFIED IDEOGRAPH-#\" |" + + NEWLINE + + " \"CJK COMPATIBILITY IDEOGRAPH-#\" |" + + NEWLINE + + " \"EGYPTIAN HIEROGLYPH-#\" |" + + NEWLINE + + " \"TANGUT IDEOGRAPH-#\" |" + + NEWLINE + + " \"KHITAN SMALL SCRIPT CHARACTER-#\" |" + + NEWLINE + + " \"NUSHU CHARACTER-#\" |" + + NEWLINE + + " xsd:string { pattern=\"" + + 
cleanRegex(propInfo.getRegex().toString()) + + "\" }" + + NEWLINE + + " }?"; + break; + case Unicode_1_Name: + formattedAttributeString = + attributeString + + "{ \"\" | xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + break; + case Script: + formattedAttributeString = attributeString + "{ script }?"; + break; + case Script_Extensions: + formattedAttributeString = attributeString + "{ list { script + } }?"; + break; + case kTGT_MergedSrc: + // Ideally, should be obtained from a TR. + String kTGT_MergedSrc = + NEWLINE + + " { xsd:string {pattern=\"L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2006-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1997-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L1986-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"S1968-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"N1966-[0-9]{3}(-[0-9A-Z]{3,4})?\"}" + + NEWLINE + + " | xsd:string {pattern=\"H2004-[A-Z]-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"L2012-[0-9]{4}\"}" + + NEWLINE + + " | xsd:string {pattern=\"UTN42-[0-9]{3}\"}" + + NEWLINE + + " }?"; + formattedAttributeString = attributeString + kTGT_MergedSrc; + break; + case kReading: + // Ideally, should be obtained from a TR. + String kReading = "{ xsd:string }?"; + formattedAttributeString = attributeString + kReading; + break; + + default: + formattedAttributeString = + attributeString + + "{ xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?"; + } + return " code-point-attributes &=" + NEWLINE + formattedAttributeString; + } + + private static String getFormattedTR38Syntax(UcdProperty ucdProperty) { + // TODO: We should determine whether we still want to show empty values in the XML files. 
+ // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isShowIfEmpty = false; + for (UCDPropertyDetail propDetail : UCDPropertyDetail.cjkValues()) { + if (propDetail.getUcdProperty().equals(ucdProperty)) { + isShowIfEmpty = propDetail.isCJKShowIfEmpty(); + } + } + + String attributeString = " attribute " + ucdProperty.getShortName().substring(2); + TR38Details tr38Details = syntaxTR38.get(ucdProperty.name()); + if (tr38Details == null) { + throw new NullPointerException( + "Could not locate details for " + ucdProperty.name() + " in " + TR38URL); + } + String formattedSyntax = formatTR38Syntax(tr38Details, isShowIfEmpty); + + return " code-point-attributes &=" + attributeString + NEWLINE + formattedSyntax; + } + + private static String getFormattedElement(UcdProperty ucdProperty) { + // Currently scoped to UcdProperty.Name_Alias, but might need to handle different + // properties. + String nameAliasElement = "name-alias"; + List values = getNameAliasTypeValues(); + PropertyParsingInfo propInfo = PropertyParsingInfo.getPropertyInfo(ucdProperty); + + String elementString = " element " + nameAliasElement + " {" + NEWLINE; + String attributeAliasString = + " attribute alias { xsd:string { pattern=\"" + + cleanRegex(propInfo.getRegex().toString()) + + "\" } }?," + + NEWLINE; + String attributeTypeString = " attribute type "; + + String formattedValues = + formatValues( + attributeTypeString.length(), values, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + + return " code-point-attributes &=" + + NEWLINE + + elementString + + attributeAliasString + + attributeTypeString + + "{ " + + formattedValues + + NEWLINE + + String.format( + "%" + (attributeTypeString.length() + "}? } *".length()) + "s", "}? 
} *"); + } + + private static String getFormattedBoolean(UcdProperty ucdProperty) { + String attributeString = " attribute " + ucdProperty.getShortName() + " "; + + return " code-point-attributes &=" + NEWLINE + attributeString + "{ boolean }?"; + } + + private static String getFormattedValues(SCHEMA schema, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getBinaryValues(); + String formattedValues = formatValues(2, values, valuesoutputtype); + return " " + schema.getName() + " = " + formattedValues; + } + + private static String getFormattedPropertyValues( + UcdProperty ucdProperty, VALUESOUTPUTTYPE valuesoutputtype) { + List values = getScriptValues(); + String formattedValues = formatValues(11, values, valuesoutputtype); + return " " + ucdProperty.name().toLowerCase() + " = " + formattedValues; + } + + private static String getFormattedDoNotEmit(VALUESOUTPUTTYPE valuesoutputtype) { + List values = getDoNotEmitTypeValues(); + String formattedValues = formatValues(26, values, valuesoutputtype); + return " ucd.content &=\n" + + " element do-not-emit {\n" + + " element instead {\n" + + " attribute of { one-or-more-code-points },\n" + + " attribute use { one-or-more-code-points },\n" + + " attribute because { " + + formattedValues + + NEWLINE + + " } }+ }?"; + } + + private static String formatTR38Syntax(TR38Details tr38Details, boolean isShowIfEmpty) { + // TODO: We should determine whether we still want to show empty values in the XML files. + // TODO: See org.unicode.xml.UcdPropertyDetail.isCJKShowIfEmpty() + boolean isList = tr38Details.isList(); + String syntax = cleanRegex(tr38Details.getSyntax()); + // This is a kludge as it depends on only having single OR double quotes in the syntax. If + // we have both, we'll + // need to do more investigation on what RELAXNG Compact supports. + String QUOTMARK = syntax.contains("\"") ? 
"'" : "\""; + + boolean hasNewlines = syntax.contains("\n"); + if (hasNewlines) { + int indent; + String firstLinePrefix; + String ending = isList ? " )+}}?" : " }?"; + if (isShowIfEmpty) { + indent = (isList ? 15 : 8); + firstLinePrefix = isList ? " { \"\" | list { " : " { \"\" | "; + } else { + indent = (isList ? 12 : 4); + firstLinePrefix = isList ? " { list { ( " : " { "; + } + String padding = String.format("%" + indent + "s", ""); + StringBuilder formattedSyntaxBuilder = new StringBuilder(); + Pattern syntaxPattern = Pattern.compile("([^\r\n]+)"); + Matcher matcher = syntaxPattern.matcher(syntax); + while (matcher.find()) { + if (formattedSyntaxBuilder.length() == 0) { + // First line + formattedSyntaxBuilder + .append(firstLinePrefix) + .append("xsd:string { pattern=") + .append(QUOTMARK) + .append(matcher.group(1)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } else { + // Everything else + formattedSyntaxBuilder + .append(padding) + .append( + matcher.group(1) + .replaceAll( + "^[| ]*", + " | xsd:string { pattern=" + QUOTMARK)) + .append(QUOTMARK) + .append(" }") + .append(NEWLINE); + } + } + formattedSyntaxBuilder.append(ending); + return formattedSyntaxBuilder.toString(); + + } else { + if (isShowIfEmpty) { + if (isList) { + return " { \"\" | list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { \"\" | xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " } }?"; + } + } else { + if (isList) { + return " { list { xsd:string { pattern=" + + QUOTMARK + + syntax + + QUOTMARK + + " }+ } }?"; + } else { + return " { xsd:string { pattern=" + QUOTMARK + syntax + QUOTMARK + " } }?"; + } + } + } + } + + private static String formatValues( + int indent, List values, VALUESOUTPUTTYPE valuesoutputtype) { + StringBuilder valueBlock = new StringBuilder(); + StringBuilder currentLine = new StringBuilder(); + String padding = String.format("%" + indent + "s", ""); + String groupPrefix = ""; + 
for (String value : values) { + StringBuilder formattedValue = new StringBuilder(); + if (valueBlock.length() > 0 || currentLine.length() > 0) { + formattedValue.append("| "); + } + if (value.startsWith("xsd")) { + formattedValue.append(value); + } else { + formattedValue.append("\"").append(value).append("\""); + } + + switch (valuesoutputtype) { + case NUMERICAL_GROUP: + case ALPHABETICAL_GROUP: + String valuePrefix = getValuePrefix(value, valuesoutputtype); + if (groupPrefix.isEmpty()) { + currentLine.append(formattedValue); + groupPrefix = valuePrefix; + } else if (valuePrefix.equals(groupPrefix)) { + int testLength = + valueBlock.length() == 0 + ? padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + } else { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + groupPrefix = valuePrefix; + } + break; + + case MAX_LINE_LENGTH: + int testLength = + valueBlock.length() == 0 + ? 
padding.length() + currentLine.length() + " ".length() + : currentLine.length() + " ".length(); + if ((testLength + formattedValue.length()) > MAX_LINE_LENGTH) { + valueBlock.append(currentLine).append(NEWLINE); + currentLine.setLength(0); + currentLine.append(padding).append(formattedValue); + } else { + if (currentLine.length() > 0) { + currentLine.append(" "); + } + currentLine.append(formattedValue); + } + break; + + case VALUE_PER_LINE: + default: + if (valueBlock.length() > 0) { + valueBlock.append(NEWLINE).append(padding).append("| "); + } + if (value.startsWith("xsd")) { + valueBlock.append(value); + } else { + valueBlock.append("\"").append(value).append("\""); + } + } + } + valueBlock.append(currentLine); + return valueBlock.toString(); + } + + private static String getValuePrefix(String value, VALUESOUTPUTTYPE valuesoutputtype) { + if (valuesoutputtype == VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) { + return value.substring(0, 1); + } + if (valuesoutputtype == VALUESOUTPUTTYPE.NUMERICAL_GROUP) { + if (value.contains(".")) { + return value.substring(0, value.indexOf(".")); + } else { + // String value in list of numbers. See Age_Values for an example. 
+ return value; + } + } else { + throw new IllegalArgumentException(); + } + } + + private static String cleanRegex(String regex) { + return regex.replaceAll("\\[-", "[\\\\-").replaceAll("\\\\/", "/").replaceAll("\\\\'", "'"); + } + + // ********************* Combined properties ********************// + + private static String getFormattedDecompositionProperties() { + return getFormattedAttribute( + UcdProperty.Decomposition_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Decomposition_Mapping); + } + + private static String getFormattedCompositionProperties() { + return getFormattedBoolean(UcdProperty.Composition_Exclusion) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Full_Composition_Exclusion); + } + + private static String getFormattedQuickCheckProperties() { + return getFormattedAttribute(UcdProperty.NFC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKC_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.NFKD_Quick_Check, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + TRIPLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFD) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKC) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Expands_On_NFKD) + + TRIPLELINE + + getFormattedSyntax(UcdProperty.FC_NFKC_Closure); + } + + private static String getFormattedNumericProperties() { + return getFormattedAttribute(UcdProperty.Numeric_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Numeric_Value); + } + + private static String getFormattedJoiningProperties() { + return getFormattedAttribute(UcdProperty.Joining_Type, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedAttribute( + 
UcdProperty.Joining_Group, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedCasingProperties() { + return getFormattedBoolean(UcdProperty.Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Lowercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Uppercase) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Lowercase); + } + + private static String getFormattedSimpleCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Simple_Titlecase_Mapping); + } + + private static String getFormattedCaseMappingProperties() { + return getFormattedSyntax(UcdProperty.Uppercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Lowercase_Mapping) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Titlecase_Mapping); + } + + private static String getFormattedCaseFoldingProperties() { + return getFormattedSyntax(UcdProperty.Simple_Case_Folding) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Case_Folding); + } + + private static String getFormattedCaseOtherProperties() { + return getFormattedBoolean(UcdProperty.Case_Ignorable) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Cased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Casemapped) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Lowercased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_NFKC_Casefolded) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Titlecased) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Changes_When_Uppercased) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Casefold) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.NFKC_Simple_Casefold); + } + + private static String getFormattedScriptProperties() { + return 
getFormattedPropertyValues(UcdProperty.Script, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Script_Extensions); + } + + private static String getFormattedIdentifierProperties() { + return getFormattedBoolean(UcdProperty.ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_ID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.XID_Continue) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Start) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ID_Compat_Math_Continue); + } + + private static String getFormattedPatternProperties() { + return getFormattedBoolean(UcdProperty.Pattern_Syntax) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Pattern_White_Space); + } + + private static String getFormattedFunctionGraphicProperties() { + return getFormattedBoolean(UcdProperty.Dash) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hyphen) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Quotation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Terminal_Punctuation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Sentence_Terminal) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Diacritic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extender) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Soft_Dotted) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Alphabetic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Math) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Hex_Digit) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.ASCII_Hex_Digit) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Default_Ignorable_Code_Point) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Logical_Order_Exception) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Prepended_Concatenation_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Modifier_Combining_Mark) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.White_Space) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Vertical_Orientation, VALUESOUTPUTTYPE.MAX_LINE_LENGTH) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Regional_Indicator); + } + + private static String getFormattedBoundaryProperties() { + return getFormattedBoolean(UcdProperty.Grapheme_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Other_Grapheme_Extend) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Grapheme_Link) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Grapheme_Cluster_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute(UcdProperty.Word_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP) + + DOUBLELINE + + getFormattedAttribute( + UcdProperty.Sentence_Break, VALUESOUTPUTTYPE.ALPHABETICAL_GROUP); + } + + private static String getFormattedIdeographProperties() { + return getFormattedBoolean(UcdProperty.Ideographic) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Unified_Ideograph) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.Equivalent_Unified_Ideograph) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Binary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Trinary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.IDS_Unary_Operator) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Radical); + } + + private static String getFormattedMiscellaneousProperties() { + return getFormattedBoolean(UcdProperty.Deprecated) + + DOUBLELINE + + 
getFormattedBoolean(UcdProperty.Variation_Selector) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Noncharacter_Code_Point); + } + + private static String getFormattedUnihanProperties() { + return getFormattedTR38Syntax(UcdProperty.kAccountingNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kAlternateTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kBigFive) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCangjie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCantonese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCCCII) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauer) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCheungBauerIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCihaiT) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1986) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCNS1992) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCompatibilityVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kCowles) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kDefinition) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kEACC) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFanqie) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFenn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFennIndex) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kFourCornerCode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB3) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB5) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB7) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGB8) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kGradeLevel) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kGSR) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHangul) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanYu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinlu) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHanyuPinyin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHDZRadBreak) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kHKGlyph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIBMJapan) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIICore) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_GSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_HSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_JSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KPSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_KSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_MSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_SSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_TSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_UKSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_USource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRG_VSource) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGDaeJaweon) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGHanyuDaZidian) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kIRGKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJa) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapanese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseKun) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJapaneseOn) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJinmeiyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis0) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJis1) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kJIS0213) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kJoyoKanji) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKangXi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKarlgren) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKorean) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanEducationHanja) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kKoreanName) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kLau) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMainlandTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMandarin) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMatthews) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMeyerWempe) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMojiJoho) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kMorohashi) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kNelson) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kOtherNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPhonetic) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPrimaryNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kPseudoGB1) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSAdobe_Japan1_6) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kRSUnicode) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSBGY) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSimplifiedVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Index) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSMSZD2003Readings) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpecializedSemanticVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kSpoofingVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kStrange) + + DOUBLELINE + + 
getFormattedTR38Syntax(UcdProperty.kTaiwanTelegraph) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGH) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTGHZ2013) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTotalStrokes) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kTraditionalVariant) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kUnihanCore2020) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnamese) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kVietnameseNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXerox) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kXHC1983) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuang) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZhuangNumeric) + + DOUBLELINE + + getFormattedTR38Syntax(UcdProperty.kZVariant); + } + + private static String getFormattedTangutProperties() { + return getFormattedSyntax(UcdProperty.kRSTUnicode) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kTGT_MergedSrc); + } + + private static String getFormattedNushuProperties() { + return getFormattedSyntax(UcdProperty.kSrc_NushuDuben) + + DOUBLELINE + + getFormattedSyntax(UcdProperty.kReading); + } + + private static String getFormattedEmojiProperties() { + return getFormattedBoolean(UcdProperty.Emoji) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Presentation) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Modifier_Base) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Emoji_Component) + + DOUBLELINE + + getFormattedBoolean(UcdProperty.Extended_Pictographic); + } + + // ********************* Attribute values ********************// + + private static List getBinaryValues() { + List values = new ArrayList<>(); + for (Binary binaryValues : Binary.values()) { + values.add(binaryValues.getShortName()); + } + // 
Binary should display as Y | N. + values.sort(Collections.reverseOrder()); + return values; + } + + private static List getAgeValues() { + List values = new ArrayList<>(); + for (Age_Values ageValues : Age_Values.values()) { + String shortName = ageValues.getShortName(); + if (shortName.equals("NA")) { + values.add("unassigned"); + } else if (shortName.equals("13.1")) { + // https://github.com/unicode-org/unicodetools/issues/100 + } else { + values.add(shortName); + } + } + return values; + } + + private static List getNameAliasTypeValues() { + List values = new ArrayList<>(); + for (AttributeResolver.AliasType aliastypeValues : AttributeResolver.AliasType.values()) { + if (!aliastypeValues.equals(AttributeResolver.AliasType.NONE)) { + values.add(aliastypeValues.toString()); + } + } + return values; + } + + private static List getBlockValues() { + List values = new ArrayList<>(); + for (Block_Values blockValues : Block_Values.values()) { + values.add(blockValues.getShortName()); + } + return values; + } + + private static List getGeneralCategoryValues() { + List values = new ArrayList<>(); + for (General_Category_Values generalCategoryValues : General_Category_Values.values()) { + if (!generalCategoryValues + .getShortName() + .toUpperCase() + .equals(generalCategoryValues.getShortName())) { + // Some of the General_Category_Values (LC, L, M, N, P, S, Z, C) stand for grouping + // of related + // General_Category values. They won't occur on any individual code point, so can be + // ignored. + values.add(generalCategoryValues.getShortName()); + } + } + return values; + } + + private static List getCanonicalCombiningClassValues() { + List values = new ArrayList<>(); + values.add("xsd:integer { minInclusive=\"0\" maxInclusive=\"254\" }"); + // Because the set of values that this property has taken across the various versions of the + // UCD is rather + // large, our schema does not restrict the possible values to those actually used. 
// for (Canonical_Combining_Class_Values canonicalCombiningClassValues :
        // Canonical_Combining_Class_Values.values()) {
        // values.add(canonicalCombiningClassValues.getShortName());
        // }
        return values;
    }

    /** Returns the short names of all {@code Bidi_Class_Values}, in declaration order. */
    private static List<String> getBidirectionalValues() {
        List<String> values = new ArrayList<>();
        for (Bidi_Class_Values bidiClassValue : Bidi_Class_Values.values()) {
            values.add(bidiClassValue.getShortName());
        }
        return values;
    }

    /**
     * Returns the Bidi_Paired_Bracket_Type short names in the fixed order Open/Close/None.
     *
     * @throws IllegalArgumentException if the enum declares a value that is not in that list
     */
    private static List<String> getBidiPairedBracketTypeValues() {
        List<String> values = new ArrayList<>();
        // Order should be Open/Close/None
        values.add(Bidi_Paired_Bracket_Type_Values.Open.getShortName());
        values.add(Bidi_Paired_Bracket_Type_Values.Close.getShortName());
        values.add(Bidi_Paired_Bracket_Type_Values.None.getShortName());
        // Fail loudly if the enum has a value the fixed ordering above does not cover.
        for (Bidi_Paired_Bracket_Type_Values bracketType :
                Bidi_Paired_Bracket_Type_Values.values()) {
            String shortName = bracketType.getShortName();
            if (!values.contains(shortName)) {
                throw new IllegalArgumentException(
                        "Unexpected Bidi_Paired_Bracket_Type value: " + shortName);
            }
        }
        return values;
    }

    private static List getDecompositionTypeValues() {
        List values = new ArrayList<>();
        for (Decomposition_Type_Values decompositionTypeValues :
                Decomposition_Type_Values.values()) {
            // We want "none" to be last.
if (decompositionTypeValues != Decomposition_Type_Values.None) {
                values.add(decompositionTypeValues.getNames().getOtherNames().get(0));
            }
        }
        values.add(Decomposition_Type_Values.None.getNames().getOtherNames().get(0));
        return values;
    }

    /**
     * Returns the NFC_Quick_Check short names in the fixed order Yes/No/Maybe.
     *
     * @throws IllegalArgumentException if the enum declares a value that is not in that list
     */
    private static List<String> getNFCQuickCheckValues() {
        List<String> values = new ArrayList<>();
        // Order should be Yes/No/Maybe
        values.add(NFC_Quick_Check_Values.Yes.getShortName());
        values.add(NFC_Quick_Check_Values.No.getShortName());
        values.add(NFC_Quick_Check_Values.Maybe.getShortName());
        // Fail loudly if the enum has a value the fixed ordering above does not cover.
        for (NFC_Quick_Check_Values quickCheck : NFC_Quick_Check_Values.values()) {
            String shortName = quickCheck.getShortName();
            if (!values.contains(shortName)) {
                throw new IllegalArgumentException(
                        "Unexpected NFC_Quick_Check value: " + shortName);
            }
        }
        return values;
    }

    /**
     * Returns the NFD_Quick_Check short names in the fixed order Yes/No.
     *
     * @throws IllegalArgumentException if the enum declares a value that is not in that list
     */
    private static List<String> getNFDQuickCheckValues() {
        List<String> values = new ArrayList<>();
        // Order should be Yes/No
        values.add(NFD_Quick_Check_Values.Yes.getShortName());
        values.add(NFD_Quick_Check_Values.No.getShortName());
        // Fail loudly if the enum has a value the fixed ordering above does not cover.
        for (NFD_Quick_Check_Values quickCheck : NFD_Quick_Check_Values.values()) {
            String shortName = quickCheck.getShortName();
            if (!values.contains(shortName)) {
                throw new IllegalArgumentException(
                        "Unexpected NFD_Quick_Check value: " + shortName);
            }
        }
        return values;
    }

    /**
     * Returns the NFKC_Quick_Check short names in the fixed order Yes/No/Maybe.
     *
     * @throws IllegalArgumentException if the enum declares a value that is not in that list
     */
    private static List<String> getNFKCQuickCheckValues() {
        List<String> values = new ArrayList<>();
        // Order should be Yes/No/Maybe
        values.add(NFKC_Quick_Check_Values.Yes.getShortName());
        values.add(NFKC_Quick_Check_Values.No.getShortName());
        values.add(NFKC_Quick_Check_Values.Maybe.getShortName());
        // Fail loudly if the enum has a value the fixed ordering above does not cover.
        for (NFKC_Quick_Check_Values quickCheck : NFKC_Quick_Check_Values.values()) {
            String shortName = quickCheck.getShortName();
            if (!values.contains(shortName)) {
                throw new IllegalArgumentException(
                        "Unexpected NFKC_Quick_Check value: " + shortName);
            }
        }
        return values;
    }

    private static List getNFKDQuickCheckValues()
{
        List values = new ArrayList<>();
        // Order should be Yes/No
        values.add(NFKD_Quick_Check_Values.Yes.getShortName());
        values.add(NFKD_Quick_Check_Values.No.getShortName());
        // Now let's check to see if there is anything else that we didn't expect
        for (NFKD_Quick_Check_Values nfkdQuickCheckValues : NFKD_Quick_Check_Values.values()) {
            if (!values.contains(nfkdQuickCheckValues.getShortName())) {
                throw new IllegalArgumentException();
            }
        }
        return values;
    }

    /**
     * Returns the Numeric_Type short names in the fixed order Decimal/Digit/Numeric/None.
     *
     * @throws IllegalArgumentException if the enum declares a value that is not in that list
     */
    private static List<String> getNumericTypeValues() {
        List<String> values = new ArrayList<>();
        // Order should be Decimal/Digit/Numeric/None
        values.add(Numeric_Type_Values.Decimal.getShortName());
        values.add(Numeric_Type_Values.Digit.getShortName());
        values.add(Numeric_Type_Values.Numeric.getShortName());
        values.add(Numeric_Type_Values.None.getShortName());
        // Fail loudly if the enum has a value the fixed ordering above does not cover.
        for (Numeric_Type_Values numericType : Numeric_Type_Values.values()) {
            String shortName = numericType.getShortName();
            if (!values.contains(shortName)) {
                throw new IllegalArgumentException("Unexpected Numeric_Type value: " + shortName);
            }
        }
        return values;
    }

    /** Returns the short names of all {@code Joining_Type_Values}, in declaration order. */
    private static List<String> getJoiningTypeValues() {
        List<String> values = new ArrayList<>();
        for (Joining_Type_Values joiningType : Joining_Type_Values.values()) {
            values.add(joiningType.getShortName());
        }
        return values;
    }

    /** Returns the short names of all {@code Joining_Group_Values}, in declaration order. */
    private static List<String> getJoiningGroupValues() {
        List<String> values = new ArrayList<>();
        for (Joining_Group_Values joiningGroup : Joining_Group_Values.values()) {
            values.add(joiningGroup.getShortName());
        }
        return values;
    }

    /** Returns the short names of all {@code Line_Break_Values}, in declaration order. */
    private static List<String> getLineBreakValues() {
        List<String> values = new ArrayList<>();
        for (Line_Break_Values lineBreak : Line_Break_Values.values()) {
            values.add(lineBreak.getShortName());
        }
        return values;
    }

    private static List getEastAsianWidthValues() {
        List values = new ArrayList<>();
        for (East_Asian_Width_Values eastAsianWidthValues : East_Asian_Width_Values.values())
{ + values.add(eastAsianWidthValues.getShortName()); + } + return values; + } + + private static List getScriptValues() { + List excludedValues = + Arrays.asList( + Script_Values.Han_with_Bopomofo, + Script_Values.Japanese, + Script_Values.Korean, + Script_Values.Math_Symbols, + Script_Values.Emoji_Symbols, + Script_Values.Other_Symbols, + Script_Values.Unwritten); + List values = new ArrayList<>(); + for (Script_Values scriptValue : Script_Values.values()) { + if (!excludedValues.contains(scriptValue)) { + values.add(scriptValue.getShortName()); + } + // Include the following if you want to add other names + // if (!scriptValue.getNames().getOtherNames().isEmpty()) { + // values.add(scriptValue.getNames().getOtherNames().get(0)); + // } + } + Collections.sort(values); + return values; + } + + private static List getHangulSyllableTypeValues() { + List values = new ArrayList<>(); + for (Hangul_Syllable_Type_Values hangulSyllableTypeValues : + Hangul_Syllable_Type_Values.values()) { + values.add(hangulSyllableTypeValues.getShortName()); + } + return values; + } + + private static List getIndicSyllabicCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Syllabic_Category_Values indicSyllabicCategoryValues : + Indic_Syllabic_Category_Values.values()) { + values.add(indicSyllabicCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicPositionalCategoryValues() { + List values = new ArrayList<>(); + for (Indic_Positional_Category_Values indicPositionalCategoryValues : + Indic_Positional_Category_Values.values()) { + values.add(indicPositionalCategoryValues.getShortName()); + } + return values; + } + + private static List getIndicConjunctBreakValues() { + List values = new ArrayList<>(); + for (Indic_Conjunct_Break_Values indicConjunctBreakValues : + Indic_Conjunct_Break_Values.values()) { + values.add(indicConjunctBreakValues.getShortName()); + } + return values; + } + + private static List getVerticalOrientationValues() { + 
List values = new ArrayList<>(); + for (Vertical_Orientation_Values verticalOrientationValues : + Vertical_Orientation_Values.values()) { + values.add(verticalOrientationValues.getShortName()); + } + return values; + } + + private static List getGraphemeClusterBreakValues() { + List values = new ArrayList<>(); + for (Grapheme_Cluster_Break_Values graphemeClusterBreakValues : + Grapheme_Cluster_Break_Values.values()) { + values.add(graphemeClusterBreakValues.getShortName()); + } + return values; + } + + private static List getWordBreakValues() { + List values = new ArrayList<>(); + for (Word_Break_Values wordBreakValues : Word_Break_Values.values()) { + values.add(wordBreakValues.getShortName()); + } + return values; + } + + private static List getSentenceBreakValues() { + List values = new ArrayList<>(); + for (Sentence_Break_Values sentenceBreakValues : Sentence_Break_Values.values()) { + values.add(sentenceBreakValues.getShortName()); + } + return values; + } + + private static List getDoNotEmitTypeValues() { + List values = new ArrayList<>(); + for (Do_Not_Emit_Type_Values doNotEmitTypeValues : Do_Not_Emit_Type_Values.values()) { + values.add(doNotEmitTypeValues.getShortName()); + } + Collections.sort(values); + return values; + } + + // ********************* Utility methods ********************// + + private static HashMap parseTR38() throws IOException, URISyntaxException { + HashMap syntaxTR38 = new HashMap<>(); + URI uri = new URI(TR38URL); + StringBuilder stringBuilder = new StringBuilder(); + try (InputStream is = uri.toURL().openStream()) { + int ptr = 0; + while ((ptr = is.read()) != -1) { + stringBuilder.append((char) ptr); + } + } + Pattern syntaxPattern = + Pattern.compile( + ">Property.*?(.*?).*?>Delimiter.*?>(.*?).*?>Syntax.*?>(.*?)", + Pattern.DOTALL); + Matcher matcher = syntaxPattern.matcher(stringBuilder.toString()); + while (matcher.find()) { + String delimiter = matcher.group(2).trim(); + boolean isList = false; + switch (delimiter) { + case 
"N/A": + break; + case "space": + isList = true; + break; + default: + throw new IllegalArgumentException( + "Only \"space\" or \"N/A\" are supported values for Delimiter." + + " Found: " + + delimiter); + } + TR38Details tr38Details = + new TR38Details(isList, matcher.group(3).trim().replaceAll("
", "")); + syntaxTR38.put(matcher.group(1).trim(), tr38Details); + } + return syntaxTR38; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java new file mode 100644 index 000000000..d607a661f --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDDataResolver.java @@ -0,0 +1,214 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.PropertyParsingInfo; +import org.unicode.props.UcdLineParser; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** Helper class for building sections of UCDXML files based on IndexUnicodeProperties values. */ +public class UCDDataResolver { + + private final IndexUnicodeProperties indexUnicodeProperties; + private final String namespace; + private final UCDXMLWriter writer; + + public UCDDataResolver(IndexUnicodeProperties iup, String namespace, UCDXMLWriter writer) { + indexUnicodeProperties = iup; + this.namespace = namespace; + this.writer = writer; + } + + public void buildSection(UCDSectionDetail.UcdSection ucdSection) throws SAXException { + VersionInfo minVersion = ucdSection.getMinVersion(); + VersionInfo maxVersion = ucdSection.getMaxVersion(); + String tag = ucdSection.toString(); + String childTag = ucdSection.getChildTag(); + boolean parserWithRange = ucdSection.getParserWithRange(); + boolean parserWithMissing = ucdSection.getParserWithMissing(); + UCDSectionComponent[] ucdSectionComponents = + ucdSection.getUcdSectionDetail().getUcdSectionComponents(); + + if (isCompatibleVersion(minVersion, maxVersion)) { + writer.startElement(tag); + { + for (UCDSectionComponent ucdSectionComponent : ucdSectionComponents) { + if (isCompatibleVersion( + 
ucdSectionComponent.getMinVersion(), + ucdSectionComponent.getMaxVersion())) { + final PropertyParsingInfo fileInfoEVS = + PropertyParsingInfo.getPropertyInfo( + ucdSectionComponent.getUcdProperty()); + String fullFilename = + fileInfoEVS.getFullFileName(indexUnicodeProperties.getUcdVersion()); + UcdLineParser parser = + new UcdLineParser(FileUtilities.in("", fullFilename)); + parser.withRange(parserWithRange); + parser.withMissing(parserWithMissing); + switch (ucdSection) { + case BLOCKS: + for (UcdLineParser.UcdLine line : parser) { + if (!line.getOriginalLine().startsWith("#")) { + AttributesImpl attributes = + getBlockAttributes(namespace, line); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + } + break; + case NAMEDSEQUENCES: + HashMap namedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + namedSequences.put(parts[0], parts[1]); + } + List names = new ArrayList<>(namedSequences.keySet()); + Collections.sort(names); + for (String name : names) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, namedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + case PROVISIONALNAMEDSEQUENCES: + HashMap provisionalNamedSequences = new HashMap<>(); + for (UcdLineParser.UcdLine line : parser) { + String[] parts = line.getParts(); + provisionalNamedSequences.put(parts[0], parts[1]); + } + List psNames = + new ArrayList<>(provisionalNamedSequences.keySet()); + Collections.sort(psNames); + for (String name : psNames) { + AttributesImpl attributes = + getNamedSequenceAttributes( + namespace, name, provisionalNamedSequences); + writer.startElement(childTag, attributes); + { + writer.endElement(childTag); + } + } + break; + default: + for (UcdLineParser.UcdLine line : parser) { + AttributesImpl attributes = + getAttributes(ucdSection, namespace, line); + writer.startElement(childTag, 
attributes); + { + writer.endElement(childTag); + } + } + } + } + } + writer.endElement(tag); + } + } + } + + private AttributesImpl getAttributes( + UCDSectionDetail.UcdSection ucdSection, String namespace, UcdLineParser.UcdLine line) { + switch (ucdSection) { + case CJKRADICALS: + return getCJKRadicalAttributes(namespace, line); + case DONOTEMIT: + return getDoNotEmitAttributes(namespace, line); + case EMOJISOURCES: + return getEmojiSourceAttributes(namespace, line); + case NORMALIZATIONCORRECTIONS: + return getNCAttributes(namespace, line); + case STANDARDIZEDVARIANTS: + return getSVAttributes(namespace, line); + default: + throw new IllegalArgumentException( + "getAttributes failed on an unexpected UcdSection"); + } + } + + private static AttributesImpl getBlockAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + String[] range = parts[0].split("\\.\\."); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "first-cp", "first-cp", "CDATA", range[0]); + attributes.addAttribute(namespace, "last-cp", "last-cp", "CDATA", range[1]); + attributes.addAttribute(namespace, "name", "name", "CDATA", parts[1]); + return attributes; + } + + private static AttributesImpl getCJKRadicalAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "number", "number", "CDATA", parts[0]); + attributes.addAttribute(namespace, "radical", "radical", "CDATA", parts[1]); + attributes.addAttribute(namespace, "ideograph", "ideograph", "CDATA", parts[2]); + return attributes; + } + + private static AttributesImpl getDoNotEmitAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "of", "of", "CDATA", parts[0]); + attributes.addAttribute(namespace, "use", 
"use", "CDATA", parts[1]); + attributes.addAttribute(namespace, "because", "because", "CDATA", parts[2]); + return attributes; + } + + private static AttributesImpl getEmojiSourceAttributes( + String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "unicode", "unicode", "CDATA", parts[0]); + attributes.addAttribute(namespace, "docomo", "docomo", "CDATA", parts[1]); + attributes.addAttribute(namespace, "kddi", "kddi", "CDATA", parts[2]); + attributes.addAttribute(namespace, "softbank", "softbank", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getNamedSequenceAttributes( + String namespace, String name, HashMap namedSequences) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "name", "name", "CDATA", name); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", namedSequences.get(name)); + return attributes; + } + + private static AttributesImpl getNCAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cp", "cp", "CDATA", parts[0]); + attributes.addAttribute(namespace, "old", "old", "CDATA", parts[1]); + attributes.addAttribute(namespace, "new", "new", "CDATA", parts[2]); + attributes.addAttribute(namespace, "version", "version", "CDATA", parts[3]); + return attributes; + } + + private static AttributesImpl getSVAttributes(String namespace, UcdLineParser.UcdLine line) { + String[] parts = line.getParts(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(namespace, "cps", "cps", "CDATA", parts[0]); + attributes.addAttribute(namespace, "desc", "desc", "CDATA", parts[1]); + attributes.addAttribute( + namespace, "when", "when", "CDATA", parts[2] != null ? 
parts[2] : ""); + return attributes; + } + + private boolean isCompatibleVersion(VersionInfo minVersion, VersionInfo maxVersion) { + return (indexUnicodeProperties.getUcdVersion().compareTo(minVersion) >= 0 + && (maxVersion == null + || indexUnicodeProperties.getUcdVersion().compareTo(maxVersion) <= 0)); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java new file mode 100644 index 000000000..9dab8117b --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDPropertyDetail.java @@ -0,0 +1,2360 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import java.util.LinkedHashSet; +import java.util.Set; +import org.unicode.props.UcdProperty; + +/** + * Helper class for determining how and when UCD properties should be shown in UCDXML. Also includes + * information about when a UCDProperty was added to Unicode. + */ +public class UCDPropertyDetail { + + private static LinkedHashSet basePropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet cjkPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet ucdxmlPropertyDetails = + new LinkedHashSet(); + private static LinkedHashSet allPropertyDetails = + new LinkedHashSet(); + + public static UCDPropertyDetail Age_Detail = + new UCDPropertyDetail( + UcdProperty.Age, VersionInfo.getInstance(3, 2, 0), 1, true, false, false, true); + public static UCDPropertyDetail Name_Detail = + new UCDPropertyDetail( + UcdProperty.Name, + VersionInfo.getInstance(1, 1, 0), + 2, + true, + false, + false, + true); + public static UCDPropertyDetail Jamo_Short_Name_Detail = + new UCDPropertyDetail( + UcdProperty.Jamo_Short_Name, + VersionInfo.getInstance(5, 1, 0), + 3, + true, + false, + false, + true); + public static UCDPropertyDetail General_Category_Detail = + new UCDPropertyDetail( + UcdProperty.General_Category, + VersionInfo.getInstance(1, 1, 0), + 4, + true, + false, + false, + 
true); + public static UCDPropertyDetail Canonical_Combining_Class_Detail = + new UCDPropertyDetail( + UcdProperty.Canonical_Combining_Class, + VersionInfo.getInstance(1, 1, 0), + 5, + true, + false, + false, + true); + public static UCDPropertyDetail Decomposition_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Decomposition_Type, + VersionInfo.getInstance(1, 1, 0), + 6, + true, + false, + false, + true); + public static UCDPropertyDetail Decomposition_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Decomposition_Mapping, + VersionInfo.getInstance(1, 1, 0), + 7, + true, + false, + false, + true); + public static UCDPropertyDetail Numeric_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Numeric_Type, + VersionInfo.getInstance(1, 1, 0), + 8, + true, + false, + false, + true); + public static UCDPropertyDetail Numeric_Value_Detail = + new UCDPropertyDetail( + UcdProperty.Numeric_Value, + VersionInfo.getInstance(1, 1, 0), + 9, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Class_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Class, + VersionInfo.getInstance(1, 1, 0), + 10, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Paired_Bracket_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Paired_Bracket_Type, + VersionInfo.getInstance(6, 3, 0), + 11, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Paired_Bracket_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Paired_Bracket, + VersionInfo.getInstance(6, 3, 0), + 12, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Mirrored_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Mirrored, + VersionInfo.getInstance(1, 1, 0), + 13, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Mirroring_Glyph_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Mirroring_Glyph, + VersionInfo.getInstance(3, 0, 1), + 14, + true, + false, + false, + true); + public static 
UCDPropertyDetail Simple_Uppercase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Simple_Uppercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 15, + true, + false, + false, + true); + public static UCDPropertyDetail Simple_Lowercase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Simple_Lowercase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 16, + true, + false, + false, + true); + public static UCDPropertyDetail Simple_Titlecase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Simple_Titlecase_Mapping, + VersionInfo.getInstance(1, 1, 0), + 17, + true, + false, + false, + true); + public static UCDPropertyDetail Uppercase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Uppercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 18, + true, + false, + false, + true); + public static UCDPropertyDetail Lowercase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Lowercase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 19, + true, + false, + false, + true); + public static UCDPropertyDetail Titlecase_Mapping_Detail = + new UCDPropertyDetail( + UcdProperty.Titlecase_Mapping, + VersionInfo.getInstance(2, 1, 8), + 20, + true, + false, + false, + true); + // public static UcdPropertyDetail Special_Case_Condition_Detail = new UcdPropertyDetail + // ( + // UcdProperty.Special_Case_Condition, VersionInfo.getInstance(1,1,0), 21, + // true, false, false, true); + public static UCDPropertyDetail Simple_Case_Folding_Detail = + new UCDPropertyDetail( + UcdProperty.Simple_Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 22, + true, + false, + false, + true); + public static UCDPropertyDetail Case_Folding_Detail = + new UCDPropertyDetail( + UcdProperty.Case_Folding, + VersionInfo.getInstance(3, 0, 1), + 23, + true, + false, + false, + true); + public static UCDPropertyDetail Joining_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Joining_Type, + VersionInfo.getInstance(2, 0, 0), + 24, + true, + false, + false, + true); + public static 
UCDPropertyDetail Joining_Group_Detail = + new UCDPropertyDetail( + UcdProperty.Joining_Group, + VersionInfo.getInstance(2, 0, 0), + 25, + true, + false, + false, + true); + public static UCDPropertyDetail East_Asian_Width_Detail = + new UCDPropertyDetail( + UcdProperty.East_Asian_Width, + VersionInfo.getInstance(3, 0, 0), + 26, + true, + false, + false, + true); + public static UCDPropertyDetail Line_Break_Detail = + new UCDPropertyDetail( + UcdProperty.Line_Break, + VersionInfo.getInstance(3, 0, 0), + 27, + true, + false, + false, + true); + public static UCDPropertyDetail Script_Detail = + new UCDPropertyDetail( + UcdProperty.Script, + VersionInfo.getInstance(3, 1, 0), + 28, + true, + false, + false, + true); + public static UCDPropertyDetail Script_Extensions_Detail = + new UCDPropertyDetail( + UcdProperty.Script_Extensions, + VersionInfo.getInstance(6, 1, 0), + 29, + true, + false, + false, + true); + public static UCDPropertyDetail Dash_Detail = + new UCDPropertyDetail( + UcdProperty.Dash, + VersionInfo.getInstance(2, 0, 0), + 30, + true, + false, + false, + true); + public static UCDPropertyDetail White_Space_Detail = + new UCDPropertyDetail( + UcdProperty.White_Space, + VersionInfo.getInstance(2, 0, 0), + 31, + true, + false, + false, + true); + public static UCDPropertyDetail Hyphen_Detail = + new UCDPropertyDetail( + UcdProperty.Hyphen, + VersionInfo.getInstance(2, 0, 0), + 32, + true, + false, + false, + true); + public static UCDPropertyDetail Quotation_Mark_Detail = + new UCDPropertyDetail( + UcdProperty.Quotation_Mark, + VersionInfo.getInstance(2, 0, 0), + 33, + true, + false, + false, + true); + public static UCDPropertyDetail Radical_Detail = + new UCDPropertyDetail( + UcdProperty.Radical, + VersionInfo.getInstance(3, 2, 0), + 34, + true, + false, + false, + true); + public static UCDPropertyDetail Ideographic_Detail = + new UCDPropertyDetail( + UcdProperty.Ideographic, + VersionInfo.getInstance(2, 0, 0), + 35, + true, + false, + false, + true); + 
public static UCDPropertyDetail Unified_Ideograph_Detail = + new UCDPropertyDetail( + UcdProperty.Unified_Ideograph, + VersionInfo.getInstance(3, 2, 0), + 36, + true, + false, + false, + true); + public static UCDPropertyDetail IDS_Binary_Operator_Detail = + new UCDPropertyDetail( + UcdProperty.IDS_Binary_Operator, + VersionInfo.getInstance(3, 2, 0), + 37, + true, + false, + false, + true); + public static UCDPropertyDetail IDS_Trinary_Operator_Detail = + new UCDPropertyDetail( + UcdProperty.IDS_Trinary_Operator, + VersionInfo.getInstance(3, 2, 0), + 38, + true, + false, + false, + true); + public static UCDPropertyDetail Hangul_Syllable_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Hangul_Syllable_Type, + VersionInfo.getInstance(4, 0, 0), + 39, + true, + false, + false, + true); + public static UCDPropertyDetail Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( + UcdProperty.Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 40, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Default_Ignorable_Code_Point_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Default_Ignorable_Code_Point, + VersionInfo.getInstance(3, 2, 0), + 41, + true, + false, + false, + true); + public static UCDPropertyDetail Alphabetic_Detail = + new UCDPropertyDetail( + UcdProperty.Alphabetic, + VersionInfo.getInstance(1, 1, 0), + 42, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Alphabetic_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Alphabetic, + VersionInfo.getInstance(3, 1, 0), + 43, + true, + false, + false, + true); + public static UCDPropertyDetail Uppercase_Detail = + new UCDPropertyDetail( + UcdProperty.Uppercase, + VersionInfo.getInstance(3, 1, 0), + 44, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Uppercase_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Uppercase, + VersionInfo.getInstance(3, 1, 0), + 45, + true, + false, + false, + true); + 
/* The VersionInfo argument is built from three integers via VersionInfo.getInstance. NOTE(review): presumably it is the first Unicode version for which the property is emitted - confirm against the UCDPropertyDetail constructor. */ public static UCDPropertyDetail Lowercase_Detail = + new UCDPropertyDetail( + UcdProperty.Lowercase, + VersionInfo.getInstance(3, 1, 0), + 46, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Lowercase_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Lowercase, + VersionInfo.getInstance(3, 1, 0), + 47, + true, + false, + false, + true); + public static UCDPropertyDetail Math_Detail = + new UCDPropertyDetail( + UcdProperty.Math, + VersionInfo.getInstance(2, 0, 0), + 48, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Math_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Math, + VersionInfo.getInstance(3, 1, 0), + 49, + true, + false, + false, + true); + public static UCDPropertyDetail Hex_Digit_Detail = + new UCDPropertyDetail( + UcdProperty.Hex_Digit, + VersionInfo.getInstance(2, 0, 0), + 50, + true, + false, + false, + true); + public static UCDPropertyDetail ASCII_Hex_Digit_Detail = + new UCDPropertyDetail( + UcdProperty.ASCII_Hex_Digit, + VersionInfo.getInstance(3, 1, 1), + 51, + true, + false, + false, + true); + public static UCDPropertyDetail Noncharacter_Code_Point_Detail = + new UCDPropertyDetail( + UcdProperty.Noncharacter_Code_Point, + VersionInfo.getInstance(3, 0, 1), + 52, + true, + false, + false, + true); + public static UCDPropertyDetail Variation_Selector_Detail = + new UCDPropertyDetail( + UcdProperty.Variation_Selector, + VersionInfo.getInstance(4, 0, 1), + 53, + true, + false, + false, + true); + public static UCDPropertyDetail Bidi_Control_Detail = + new UCDPropertyDetail( + UcdProperty.Bidi_Control, + VersionInfo.getInstance(2, 0, 0), + 54, + true, + false, + false, + true); + public static UCDPropertyDetail Join_Control_Detail = + new UCDPropertyDetail( + UcdProperty.Join_Control, + VersionInfo.getInstance(2, 0, 0), + 55, + true, + false, + false, + true); + public static UCDPropertyDetail Grapheme_Base_Detail = + new UCDPropertyDetail( + UcdProperty.Grapheme_Base, + 
VersionInfo.getInstance(3, 2, 0), + 56, + true, + false, + false, + true); + public static UCDPropertyDetail Grapheme_Extend_Detail = + new UCDPropertyDetail( + UcdProperty.Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 57, + true, + false, + false, + true); + public static UCDPropertyDetail Other_Grapheme_Extend_Detail = + new UCDPropertyDetail( + UcdProperty.Other_Grapheme_Extend, + VersionInfo.getInstance(3, 2, 0), + 58, + true, + false, + false, + true); + public static UCDPropertyDetail Grapheme_Link_Detail = + new UCDPropertyDetail( + UcdProperty.Grapheme_Link, + VersionInfo.getInstance(3, 2, 0), + 59, + true, + false, + false, + true); + /* NOTE(review): Sentence_Terminal is registered with VersionInfo 9.0.0, whereas every other entry with an id from 56 to 66 uses a version between 2.0.0 and 4.0.0 - verify that 9.0.0 is intentional. */ public static UCDPropertyDetail Sentence_Terminal_Detail = + new UCDPropertyDetail( + UcdProperty.Sentence_Terminal, + VersionInfo.getInstance(9, 0, 0), + 60, + true, + false, + false, + true); + public static UCDPropertyDetail Extender_Detail = + new UCDPropertyDetail( + UcdProperty.Extender, + VersionInfo.getInstance(2, 0, 0), + 61, + true, + false, + false, + true); + public static UCDPropertyDetail Terminal_Punctuation_Detail = + new UCDPropertyDetail( + UcdProperty.Terminal_Punctuation, + VersionInfo.getInstance(2, 0, 0), + 62, + true, + false, + false, + true); + public static UCDPropertyDetail Diacritic_Detail = + new UCDPropertyDetail( + UcdProperty.Diacritic, + VersionInfo.getInstance(2, 0, 0), + 63, + true, + false, + false, + true); + public static UCDPropertyDetail Deprecated_Detail = + new UCDPropertyDetail( + UcdProperty.Deprecated, + VersionInfo.getInstance(3, 2, 0), + 64, + true, + false, + false, + true); + public static UCDPropertyDetail ID_Start_Detail = + new UCDPropertyDetail( + UcdProperty.ID_Start, + VersionInfo.getInstance(3, 1, 0), + 65, + true, + false, + false, + true); + public static UCDPropertyDetail Other_ID_Start_Detail = + new UCDPropertyDetail( + UcdProperty.Other_ID_Start, + VersionInfo.getInstance(4, 0, 0), + 66, + true, + false, + false, + true); + public static UCDPropertyDetail 
XID_Start_Detail = + new UCDPropertyDetail( + UcdProperty.XID_Start, + VersionInfo.getInstance(3, 1, 0), + 67, + true, + false, + false, + true); + public static UCDPropertyDetail ID_Continue_Detail = + new UCDPropertyDetail( + UcdProperty.ID_Continue, + VersionInfo.getInstance(3, 1, 0), + 68, + true, + false, + false, + true); + public static UCDPropertyDetail Other_ID_Continue_Detail = + new UCDPropertyDetail( + UcdProperty.Other_ID_Continue, + VersionInfo.getInstance(4, 1, 0), + 69, + true, + false, + false, + true); + public static UCDPropertyDetail XID_Continue_Detail = + new UCDPropertyDetail( + UcdProperty.XID_Continue, + VersionInfo.getInstance(3, 1, 0), + 70, + true, + false, + false, + true); + public static UCDPropertyDetail Soft_Dotted_Detail = + new UCDPropertyDetail( + UcdProperty.Soft_Dotted, + VersionInfo.getInstance(3, 2, 0), + 71, + true, + false, + false, + true); + public static UCDPropertyDetail Logical_Order_Exception_Detail = + new UCDPropertyDetail( + UcdProperty.Logical_Order_Exception, + VersionInfo.getInstance(3, 2, 0), + 72, + true, + false, + false, + true); + public static UCDPropertyDetail Pattern_White_Space_Detail = + new UCDPropertyDetail( + UcdProperty.Pattern_White_Space, + VersionInfo.getInstance(4, 1, 0), + 73, + true, + false, + false, + true); + public static UCDPropertyDetail Pattern_Syntax_Detail = + new UCDPropertyDetail( + UcdProperty.Pattern_Syntax, + VersionInfo.getInstance(4, 1, 0), + 74, + true, + false, + false, + true); + /* Grapheme_Cluster_Break, Word_Break and Sentence_Break (ids 75-77) share VersionInfo 4.1.0, the same value used by Pattern_White_Space and Pattern_Syntax just above. */ public static UCDPropertyDetail Grapheme_Cluster_Break_Detail = + new UCDPropertyDetail( + UcdProperty.Grapheme_Cluster_Break, + VersionInfo.getInstance(4, 1, 0), + 75, + true, + false, + false, + true); + public static UCDPropertyDetail Word_Break_Detail = + new UCDPropertyDetail( + UcdProperty.Word_Break, + VersionInfo.getInstance(4, 1, 0), + 76, + true, + false, + false, + true); + public static UCDPropertyDetail Sentence_Break_Detail = + new UCDPropertyDetail( + UcdProperty.Sentence_Break, + 
VersionInfo.getInstance(4, 1, 0), + 77, + true, + false, + false, + true); + public static UCDPropertyDetail Composition_Exclusion_Detail = + new UCDPropertyDetail( + UcdProperty.Composition_Exclusion, + VersionInfo.getInstance(3, 0, 0), + 78, + true, + false, + false, + true); + public static UCDPropertyDetail Full_Composition_Exclusion_Detail = + new UCDPropertyDetail( + UcdProperty.Full_Composition_Exclusion, + VersionInfo.getInstance(3, 1, 0), + 79, + true, + false, + false, + true); + public static UCDPropertyDetail NFC_Quick_Check_Detail = + new UCDPropertyDetail( + UcdProperty.NFC_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 80, + true, + false, + false, + true); + public static UCDPropertyDetail NFD_Quick_Check_Detail = + new UCDPropertyDetail( + UcdProperty.NFD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 81, + true, + false, + false, + true); + /* NOTE(review): NFKC_Quick_Check is registered with VersionInfo 5.2.0 while NFC_Quick_Check, NFD_Quick_Check and NFKD_Quick_Check all use 3.2.0 - verify that the difference is intentional. */ public static UCDPropertyDetail NFKC_Quick_Check_Detail = + new UCDPropertyDetail( + UcdProperty.NFKC_Quick_Check, + VersionInfo.getInstance(5, 2, 0), + 82, + true, + false, + false, + true); + public static UCDPropertyDetail NFKD_Quick_Check_Detail = + new UCDPropertyDetail( + UcdProperty.NFKD_Quick_Check, + VersionInfo.getInstance(3, 2, 0), + 83, + true, + false, + false, + true); + public static UCDPropertyDetail Expands_On_NFC_Detail = + new UCDPropertyDetail( + UcdProperty.Expands_On_NFC, + VersionInfo.getInstance(3, 2, 0), + 84, + true, + false, + false, + true); + public static UCDPropertyDetail Expands_On_NFD_Detail = + new UCDPropertyDetail( + UcdProperty.Expands_On_NFD, + VersionInfo.getInstance(3, 2, 0), + 85, + true, + false, + false, + true); + public static UCDPropertyDetail Expands_On_NFKC_Detail = + new UCDPropertyDetail( + UcdProperty.Expands_On_NFKC, + VersionInfo.getInstance(3, 2, 0), + 86, + true, + false, + false, + true); + public static UCDPropertyDetail Expands_On_NFKD_Detail = + new UCDPropertyDetail( + UcdProperty.Expands_On_NFKD, + VersionInfo.getInstance(3, 2, 0), + 87, + 
true, + false, + false, + true); + /* NOTE(review): this field is named FC_NFC_Closure_Detail but it wraps UcdProperty.FC_NFKC_Closure. The field name is public, so it is left as is here; treat it as the FC_NFKC_Closure entry. */ public static UCDPropertyDetail FC_NFC_Closure_Detail = + new UCDPropertyDetail( + UcdProperty.FC_NFKC_Closure, + VersionInfo.getInstance(3, 1, 0), + 88, + true, + false, + false, + true); + public static UCDPropertyDetail Case_Ignorable_Detail = + new UCDPropertyDetail( + UcdProperty.Case_Ignorable, + VersionInfo.getInstance(5, 2, 0), + 89, + true, + false, + false, + true); + public static UCDPropertyDetail Cased_Detail = + new UCDPropertyDetail( + UcdProperty.Cased, + VersionInfo.getInstance(5, 2, 0), + 90, + true, + false, + false, + true); + /* NOTE(review): the next two field names spell CaseFolded / CaseMapped with a capital F / M, while the UcdProperty constants they wrap are Changes_When_Casefolded / Changes_When_Casemapped. */ public static UCDPropertyDetail Changes_When_CaseFolded_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 91, + true, + false, + false, + true); + public static UCDPropertyDetail Changes_When_CaseMapped_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_Casemapped, + VersionInfo.getInstance(5, 2, 0), + 92, + true, + false, + false, + true); + public static UCDPropertyDetail Changes_When_NFKC_Casefolded_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_NFKC_Casefolded, + VersionInfo.getInstance(5, 2, 0), + 93, + true, + false, + false, + true); + public static UCDPropertyDetail Changes_When_Lowercased_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_Lowercased, + VersionInfo.getInstance(5, 2, 0), + 94, + true, + false, + false, + true); + public static UCDPropertyDetail Changes_When_Titlecased_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_Titlecased, + VersionInfo.getInstance(5, 2, 0), + 95, + true, + false, + false, + true); + public static UCDPropertyDetail Changes_When_Uppercased_Detail = + new UCDPropertyDetail( + UcdProperty.Changes_When_Uppercased, + VersionInfo.getInstance(5, 2, 0), + 96, + true, + false, + false, + true); + public static UCDPropertyDetail NFKC_Casefold_Detail = + new UCDPropertyDetail( + UcdProperty.NFKC_Casefold, + VersionInfo.getInstance(5, 2, 0), + 97, + 
true, + false, + false, + true); + public static UCDPropertyDetail Indic_Syllabic_Category_Detail = + new UCDPropertyDetail( + UcdProperty.Indic_Syllabic_Category, + VersionInfo.getInstance(6, 1, 0), + 98, + true, + false, + false, + true); + // public static UcdPropertyDetail Indic_Matra_Category_Detail = new UcdPropertyDetail ( + // UcdProperty.Indic_Matra_Category, VersionInfo.getInstance(6,1,0), + // VersionInfo.getInstance(7,0,0), 99, + // true, false, false, true); + /* Id 99 is skipped: it belongs to the commented-out Indic_Matra_Category entry directly above. */ public static UCDPropertyDetail Indic_Positional_Category_Detail = + new UCDPropertyDetail( + UcdProperty.Indic_Positional_Category, + VersionInfo.getInstance(8, 0, 0), + 100, + true, + false, + false, + true); + public static UCDPropertyDetail kJa_Detail = + new UCDPropertyDetail( + UcdProperty.kJa, + VersionInfo.getInstance(8, 0, 0), + 101, + false, + true, + false, + true); + public static UCDPropertyDetail Prepended_Concatenation_Mark_Detail = + new UCDPropertyDetail( + UcdProperty.Prepended_Concatenation_Mark, + VersionInfo.getInstance(9, 0, 0), + 102, + true, + false, + false, + true); + public static UCDPropertyDetail Vertical_Orientation_Detail = + new UCDPropertyDetail( + UcdProperty.Vertical_Orientation, + VersionInfo.getInstance(10, 0, 0), + 103, + true, + false, + false, + true); + public static UCDPropertyDetail Regional_Indicator_Detail = + new UCDPropertyDetail( + UcdProperty.Regional_Indicator, + VersionInfo.getInstance(10, 0, 0), + 104, + true, + false, + false, + true); + public static UCDPropertyDetail Block_Detail = + new UCDPropertyDetail( + UcdProperty.Block, + VersionInfo.getInstance(2, 0, 0), + 105, + true, + false, + false, + true); + public static UCDPropertyDetail Equivalent_Unified_Ideograph_Detail = + new UCDPropertyDetail( + UcdProperty.Equivalent_Unified_Ideograph, + VersionInfo.getInstance(11, 0, 0), + 106, + false, + true, + false, + true); + public static UCDPropertyDetail kCompatibilityVariant_Detail = + new UCDPropertyDetail( + 
UcdProperty.kCompatibilityVariant, + VersionInfo.getInstance(3, 2, 0), + 107, + false, + true, + true, + true); + public static UCDPropertyDetail kRSUnicode_Detail = + new UCDPropertyDetail( + UcdProperty.kRSUnicode, + VersionInfo.getInstance(2, 0, 0), + 108, + false, + true, + false, + true); + // public static UcdPropertyDetail kIRG_RSIndex_Detail = new UcdPropertyDetail ( + // UcdProperty.kIRG_RSIndex, VersionInfo.getInstance(11,0,0), 109, + // false, true, false, true); + /* Id 109 is skipped: it belongs to the commented-out kIRG_RSIndex entry directly above. The kIRG_*Source entries that follow pass (false, true, true, true), i.e. the third flag is set, unlike kRSUnicode above. NOTE(review): flag meaning is defined by the constructor, not visible here. */ public static UCDPropertyDetail kIRG_GSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_GSource, + VersionInfo.getInstance(3, 0, 0), + 110, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_TSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_TSource, + VersionInfo.getInstance(3, 0, 0), + 111, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_JSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_JSource, + VersionInfo.getInstance(3, 0, 0), + 112, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_KSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_KSource, + VersionInfo.getInstance(3, 0, 0), + 113, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_KPSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_KPSource, + VersionInfo.getInstance(3, 1, 1), + 114, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_VSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_VSource, + VersionInfo.getInstance(3, 0, 0), + 115, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_HSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_HSource, + VersionInfo.getInstance(3, 1, 0), + 116, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_USource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_USource, + VersionInfo.getInstance(4, 0, 1), + 117, + false, + true, + true, + true); + public static 
UCDPropertyDetail kIRG_MSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_MSource, + VersionInfo.getInstance(5, 2, 0), + 118, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_UKSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_UKSource, + VersionInfo.getInstance(13, 0, 0), + 119, + false, + true, + true, + true); + public static UCDPropertyDetail kIRG_SSource_Detail = + new UCDPropertyDetail( + UcdProperty.kIRG_SSource, + VersionInfo.getInstance(13, 0, 0), + 120, + false, + true, + true, + true); + /* From kIICore onward the third flag is false again: these entries pass (false, true, false, true) instead of the (false, true, true, true) used by the kIRG_*Source entries above. */ public static UCDPropertyDetail kIICore_Detail = + new UCDPropertyDetail( + UcdProperty.kIICore, + VersionInfo.getInstance(4, 1, 0), + 121, + false, + true, + false, + true); + public static UCDPropertyDetail kUnihanCore2020_Detail = + new UCDPropertyDetail( + UcdProperty.kUnihanCore2020, + VersionInfo.getInstance(13, 0, 0), + 122, + false, + true, + false, + true); + public static UCDPropertyDetail kGB0_Detail = + new UCDPropertyDetail( + UcdProperty.kGB0, + VersionInfo.getInstance(2, 0, 0), + 123, + false, + true, + false, + true); + public static UCDPropertyDetail kGB1_Detail = + new UCDPropertyDetail( + UcdProperty.kGB1, + VersionInfo.getInstance(2, 0, 0), + 124, + false, + true, + false, + true); + public static UCDPropertyDetail kGB3_Detail = + new UCDPropertyDetail( + UcdProperty.kGB3, + VersionInfo.getInstance(2, 0, 0), + 125, + false, + true, + false, + true); + public static UCDPropertyDetail kGB5_Detail = + new UCDPropertyDetail( + UcdProperty.kGB5, + VersionInfo.getInstance(2, 0, 0), + 126, + false, + true, + false, + true); + public static UCDPropertyDetail kGB7_Detail = + new UCDPropertyDetail( + UcdProperty.kGB7, + VersionInfo.getInstance(2, 0, 0), + 127, + false, + true, + false, + true); + public static UCDPropertyDetail kGB8_Detail = + new UCDPropertyDetail( + UcdProperty.kGB8, + VersionInfo.getInstance(2, 0, 0), + 128, + false, + true, + false, + true); + public static UCDPropertyDetail kCNS1986_Detail = + 
new UCDPropertyDetail( + UcdProperty.kCNS1986, + VersionInfo.getInstance(2, 0, 0), + 129, + false, + true, + false, + true); + public static UCDPropertyDetail kCNS1992_Detail = + new UCDPropertyDetail( + UcdProperty.kCNS1992, + VersionInfo.getInstance(2, 0, 0), + 130, + false, + true, + false, + true); + public static UCDPropertyDetail kJis0_Detail = + new UCDPropertyDetail( + UcdProperty.kJis0, + VersionInfo.getInstance(2, 0, 0), + 131, + false, + true, + false, + true); + public static UCDPropertyDetail kJis1_Detail = + new UCDPropertyDetail( + UcdProperty.kJis1, + VersionInfo.getInstance(2, 0, 0), + 132, + false, + true, + false, + true); + public static UCDPropertyDetail kJIS0213_Detail = + new UCDPropertyDetail( + UcdProperty.kJIS0213, + VersionInfo.getInstance(3, 1, 1), + 133, + false, + true, + false, + true); + /* kKSC0, kKSC1, kKPS0, kKPS1 and kHKSCS pass a second VersionInfo (15.1.0) before the id. NOTE(review): presumably an upper version bound, e.g. the version in which the property stopped being emitted - confirm against the matching constructor overload. */ public static UCDPropertyDetail kKSC0_Detail = + new UCDPropertyDetail( + UcdProperty.kKSC0, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 134, + false, + true, + false, + true); + public static UCDPropertyDetail kKSC1_Detail = + new UCDPropertyDetail( + UcdProperty.kKSC1, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 135, + false, + true, + false, + true); + public static UCDPropertyDetail kKPS0_Detail = + new UCDPropertyDetail( + UcdProperty.kKPS0, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 136, + false, + true, + false, + true); + public static UCDPropertyDetail kKPS1_Detail = + new UCDPropertyDetail( + UcdProperty.kKPS1, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 137, + false, + true, + false, + true); + public static UCDPropertyDetail kHKSCS_Detail = + new UCDPropertyDetail( + UcdProperty.kHKSCS, + VersionInfo.getInstance(3, 1, 1), + VersionInfo.getInstance(15, 1, 0), + 138, + false, + true, + false, + true); + public static UCDPropertyDetail kCantonese_Detail = + new UCDPropertyDetail( + UcdProperty.kCantonese, + 
VersionInfo.getInstance(2, 0, 0), + 139, + false, + true, + false, + true); + public static UCDPropertyDetail kHangul_Detail = + new UCDPropertyDetail( + UcdProperty.kHangul, + VersionInfo.getInstance(5, 0, 0), + 140, + false, + true, + false, + true); + public static UCDPropertyDetail kDefinition_Detail = + new UCDPropertyDetail( + UcdProperty.kDefinition, + VersionInfo.getInstance(2, 0, 0), + 141, + false, + true, + false, + true); + public static UCDPropertyDetail kHanYu_Detail = + new UCDPropertyDetail( + UcdProperty.kHanYu, + VersionInfo.getInstance(2, 0, 0), + 142, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateHanYu_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateHanYu, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(3,1,1), 143, + // false, true, false, true); + /* Id 143 is skipped: it belongs to the commented-out kAlternateHanYu entry directly above. */ public static UCDPropertyDetail kMandarin_Detail = + new UCDPropertyDetail( + UcdProperty.kMandarin, + VersionInfo.getInstance(2, 0, 0), + 144, + false, + true, + false, + true); + public static UCDPropertyDetail kCihaiT_Detail = + new UCDPropertyDetail( + UcdProperty.kCihaiT, + VersionInfo.getInstance(3, 2, 0), + 145, + false, + true, + false, + true); + public static UCDPropertyDetail kSBGY_Detail = + new UCDPropertyDetail( + UcdProperty.kSBGY, + VersionInfo.getInstance(3, 2, 0), + 146, + false, + true, + false, + true); + public static UCDPropertyDetail kNelson_Detail = + new UCDPropertyDetail( + UcdProperty.kNelson, + VersionInfo.getInstance(2, 0, 0), + 147, + false, + true, + false, + true); + public static UCDPropertyDetail kCowles_Detail = + new UCDPropertyDetail( + UcdProperty.kCowles, + VersionInfo.getInstance(3, 1, 1), + 148, + false, + true, + false, + true); + public static UCDPropertyDetail kMatthews_Detail = + new UCDPropertyDetail( + UcdProperty.kMatthews, + VersionInfo.getInstance(2, 0, 0), + 149, + false, + true, + false, + true); + public static UCDPropertyDetail kOtherNumeric_Detail = + new UCDPropertyDetail( + 
UcdProperty.kOtherNumeric, + VersionInfo.getInstance(3, 2, 0), + 150, + false, + true, + false, + true); + /* The k-prefixed entries from kOtherNumeric (150) through kSpecializedSemanticVariant (160) all pass the flag tuple (false, true, false, true). NOTE(review): presumably these are Unihan fields - confirm. */ public static UCDPropertyDetail kPhonetic_Detail = + new UCDPropertyDetail( + UcdProperty.kPhonetic, + VersionInfo.getInstance(3, 1, 0), + 151, + false, + true, + false, + true); + public static UCDPropertyDetail kGSR_Detail = + new UCDPropertyDetail( + UcdProperty.kGSR, + VersionInfo.getInstance(4, 0, 1), + 152, + false, + true, + false, + true); + public static UCDPropertyDetail kFenn_Detail = + new UCDPropertyDetail( + UcdProperty.kFenn, + VersionInfo.getInstance(3, 1, 1), + 153, + false, + true, + false, + true); + public static UCDPropertyDetail kFennIndex_Detail = + new UCDPropertyDetail( + UcdProperty.kFennIndex, + VersionInfo.getInstance(4, 1, 0), + 154, + false, + true, + false, + true); + public static UCDPropertyDetail kKarlgren_Detail = + new UCDPropertyDetail( + UcdProperty.kKarlgren, + VersionInfo.getInstance(3, 1, 1), + 155, + false, + true, + false, + true); + public static UCDPropertyDetail kCangjie_Detail = + new UCDPropertyDetail( + UcdProperty.kCangjie, + VersionInfo.getInstance(3, 1, 1), + 156, + false, + true, + false, + true); + public static UCDPropertyDetail kMeyerWempe_Detail = + new UCDPropertyDetail( + UcdProperty.kMeyerWempe, + VersionInfo.getInstance(3, 1, 0), + 157, + false, + true, + false, + true); + public static UCDPropertyDetail kSimplifiedVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kSimplifiedVariant, + VersionInfo.getInstance(2, 0, 0), + 158, + false, + true, + false, + true); + public static UCDPropertyDetail kTraditionalVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kTraditionalVariant, + VersionInfo.getInstance(2, 0, 0), + 159, + false, + true, + false, + true); + public static UCDPropertyDetail kSpecializedSemanticVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kSpecializedSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 160, + false, + true, + false, + true); + public static 
UCDPropertyDetail kSemanticVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kSemanticVariant, + VersionInfo.getInstance(2, 0, 0), + 161, + false, + true, + false, + true); + public static UCDPropertyDetail kVietnamese_Detail = + new UCDPropertyDetail( + UcdProperty.kVietnamese, + VersionInfo.getInstance(3, 1, 1), + 162, + false, + true, + false, + true); + public static UCDPropertyDetail kLau_Detail = + new UCDPropertyDetail( + UcdProperty.kLau, + VersionInfo.getInstance(3, 1, 1), + 163, + false, + true, + false, + true); + public static UCDPropertyDetail kTang_Detail = + new UCDPropertyDetail( + UcdProperty.kTang, + VersionInfo.getInstance(2, 0, 0), + 164, + false, + true, + false, + true); + public static UCDPropertyDetail kZVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kZVariant, + VersionInfo.getInstance(2, 0, 0), + 165, + false, + true, + false, + true); + public static UCDPropertyDetail kJapaneseKun_Detail = + new UCDPropertyDetail( + UcdProperty.kJapaneseKun, + VersionInfo.getInstance(2, 0, 0), + 166, + false, + true, + false, + true); + public static UCDPropertyDetail kJapaneseOn_Detail = + new UCDPropertyDetail( + UcdProperty.kJapaneseOn, + VersionInfo.getInstance(2, 0, 0), + 167, + false, + true, + false, + true); + public static UCDPropertyDetail kKangXi_Detail = + new UCDPropertyDetail( + UcdProperty.kKangXi, + VersionInfo.getInstance(2, 0, 0), + 168, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateKangXi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateKangXi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 169, + // false, true, false, true); + /* Id 169 is skipped: it belongs to the commented-out kAlternateKangXi entry directly above. */ public static UCDPropertyDetail kBigFive_Detail = + new UCDPropertyDetail( + UcdProperty.kBigFive, + VersionInfo.getInstance(2, 0, 0), + 170, + false, + true, + false, + true); + public static UCDPropertyDetail kCCCII_Detail = + new UCDPropertyDetail( + UcdProperty.kCCCII, + VersionInfo.getInstance(2, 0, 0), + 171, + false, + 
true, + false, + true); + public static UCDPropertyDetail kDaeJaweon_Detail = + new UCDPropertyDetail( + UcdProperty.kDaeJaweon, + VersionInfo.getInstance(2, 0, 0), + 172, + false, + true, + false, + true); + public static UCDPropertyDetail kEACC_Detail = + new UCDPropertyDetail( + UcdProperty.kEACC, + VersionInfo.getInstance(2, 0, 0), + 173, + false, + true, + false, + true); + /* kFrequency passes a second VersionInfo (16.0.0) before the id. NOTE(review): presumably an upper version bound, e.g. the version in which the property stopped being emitted - confirm against the matching constructor overload. */ public static UCDPropertyDetail kFrequency_Detail = + new UCDPropertyDetail( + UcdProperty.kFrequency, + VersionInfo.getInstance(3, 2, 0), + VersionInfo.getInstance(16, 0, 0), + 174, + false, + true, + false, + true); + public static UCDPropertyDetail kGradeLevel_Detail = + new UCDPropertyDetail( + UcdProperty.kGradeLevel, + VersionInfo.getInstance(3, 2, 0), + 175, + false, + true, + false, + true); + public static UCDPropertyDetail kHDZRadBreak_Detail = + new UCDPropertyDetail( + UcdProperty.kHDZRadBreak, + VersionInfo.getInstance(4, 1, 0), + 176, + false, + true, + false, + true); + public static UCDPropertyDetail kHKGlyph_Detail = + new UCDPropertyDetail( + UcdProperty.kHKGlyph, + VersionInfo.getInstance(3, 1, 1), + 177, + false, + true, + false, + true); + public static UCDPropertyDetail kHanyuPinlu_Detail = + new UCDPropertyDetail( + UcdProperty.kHanyuPinlu, + VersionInfo.getInstance(4, 0, 1), + 178, + false, + true, + false, + true); + public static UCDPropertyDetail kHanyuPinyin_Detail = + new UCDPropertyDetail( + UcdProperty.kHanyuPinyin, + VersionInfo.getInstance(5, 2, 0), + 179, + false, + true, + false, + true); + public static UCDPropertyDetail kIRGHanyuDaZidian_Detail = + new UCDPropertyDetail( + UcdProperty.kIRGHanyuDaZidian, + VersionInfo.getInstance(3, 0, 0), + 180, + false, + true, + false, + true); + public static UCDPropertyDetail kIRGKangXi_Detail = + new UCDPropertyDetail( + UcdProperty.kIRGKangXi, + VersionInfo.getInstance(3, 0, 0), + 181, + false, + true, + false, + true); + public static UCDPropertyDetail kIRGDaeJaweon_Detail = + new UCDPropertyDetail( + 
UcdProperty.kIRGDaeJaweon, + VersionInfo.getInstance(3, 0, 0), + 182, + false, + true, + false, + true); + /* kIRGDaiKanwaZiten passes a second VersionInfo (15.1.0) before the id. NOTE(review): presumably an upper version bound - confirm against the matching constructor overload. */ public static UCDPropertyDetail kIRGDaiKanwaZiten_Detail = + new UCDPropertyDetail( + UcdProperty.kIRGDaiKanwaZiten, + VersionInfo.getInstance(3, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 183, + false, + true, + false, + true); + public static UCDPropertyDetail kKorean_Detail = + new UCDPropertyDetail( + UcdProperty.kKorean, + VersionInfo.getInstance(2, 0, 0), + 184, + false, + true, + false, + true); + public static UCDPropertyDetail kMainlandTelegraph_Detail = + new UCDPropertyDetail( + UcdProperty.kMainlandTelegraph, + VersionInfo.getInstance(2, 0, 0), + 185, + false, + true, + false, + true); + public static UCDPropertyDetail kMorohashi_Detail = + new UCDPropertyDetail( + UcdProperty.kMorohashi, + VersionInfo.getInstance(2, 0, 0), + 186, + false, + true, + false, + true); + // public static UcdPropertyDetail kAlternateMorohashi_Detail = new UcdPropertyDetail ( + // UcdProperty.kAlternateMorohashi, VersionInfo.getInstance(2,0,0), + // VersionInfo.getInstance(4,0,1), 187, + // false, true, false, true); + /* Id 187 is skipped: it belongs to the commented-out kAlternateMorohashi entry directly above. */ public static UCDPropertyDetail kPrimaryNumeric_Detail = + new UCDPropertyDetail( + UcdProperty.kPrimaryNumeric, + VersionInfo.getInstance(3, 2, 0), + 188, + false, + true, + false, + true); + public static UCDPropertyDetail kTaiwanTelegraph_Detail = + new UCDPropertyDetail( + UcdProperty.kTaiwanTelegraph, + VersionInfo.getInstance(2, 0, 0), + 189, + false, + true, + false, + true); + public static UCDPropertyDetail kXerox_Detail = + new UCDPropertyDetail( + UcdProperty.kXerox, + VersionInfo.getInstance(2, 0, 0), + 190, + false, + true, + false, + true); + public static UCDPropertyDetail kPseudoGB1_Detail = + new UCDPropertyDetail( + UcdProperty.kPseudoGB1, + VersionInfo.getInstance(2, 0, 0), + 191, + false, + true, + false, + true); + public static UCDPropertyDetail kIBMJapan_Detail = + new UCDPropertyDetail( + UcdProperty.kIBMJapan, + 
VersionInfo.getInstance(2, 0, 0), + 192, + false, + true, + false, + true); + public static UCDPropertyDetail kAccountingNumeric_Detail = + new UCDPropertyDetail( + UcdProperty.kAccountingNumeric, + VersionInfo.getInstance(3, 2, 0), + 193, + false, + true, + false, + true); + public static UCDPropertyDetail kCheungBauer_Detail = + new UCDPropertyDetail( + UcdProperty.kCheungBauer, + VersionInfo.getInstance(5, 0, 0), + 194, + false, + true, + false, + true); + public static UCDPropertyDetail kCheungBauerIndex_Detail = + new UCDPropertyDetail( + UcdProperty.kCheungBauerIndex, + VersionInfo.getInstance(5, 0, 0), + 195, + false, + true, + false, + true); + public static UCDPropertyDetail kFourCornerCode_Detail = + new UCDPropertyDetail( + UcdProperty.kFourCornerCode, + VersionInfo.getInstance(5, 0, 0), + 196, + false, + true, + false, + true); + // public static UcdPropertyDetail kWubi_Detail = new UcdPropertyDetail ( + // UcdProperty.kWubi, VersionInfo.getInstance(11,0,0), 197, + // false, true, false, true); + /* Id 197 is skipped: it belongs to the commented-out kWubi entry directly above. */ public static UCDPropertyDetail kXHC1983_Detail = + new UCDPropertyDetail( + UcdProperty.kXHC1983, + VersionInfo.getInstance(5, 1, 0), + 198, + false, + true, + false, + true); + public static UCDPropertyDetail kJinmeiyoKanji_Detail = + new UCDPropertyDetail( + UcdProperty.kJinmeiyoKanji, + VersionInfo.getInstance(11, 0, 0), + 199, + false, + true, + false, + true); + public static UCDPropertyDetail kJoyoKanji_Detail = + new UCDPropertyDetail( + UcdProperty.kJoyoKanji, + VersionInfo.getInstance(11, 0, 0), + 200, + false, + true, + false, + true); + public static UCDPropertyDetail kKoreanEducationHanja_Detail = + new UCDPropertyDetail( + UcdProperty.kKoreanEducationHanja, + VersionInfo.getInstance(11, 0, 0), + 201, + false, + true, + false, + true); + public static UCDPropertyDetail kKoreanName_Detail = + new UCDPropertyDetail( + UcdProperty.kKoreanName, + VersionInfo.getInstance(11, 0, 0), + 202, + false, + true, + false, + true); + public static 
UCDPropertyDetail kTGH_Detail = + new UCDPropertyDetail( + UcdProperty.kTGH, + VersionInfo.getInstance(11, 0, 0), + 203, + false, + true, + false, + true); + public static UCDPropertyDetail kTGHZ2013_Detail = + new UCDPropertyDetail( + UcdProperty.kTGHZ2013, + VersionInfo.getInstance(13, 0, 0), + 204, + false, + true, + false, + true); + public static UCDPropertyDetail kSpoofingVariant_Detail = + new UCDPropertyDetail( + UcdProperty.kSpoofingVariant, + VersionInfo.getInstance(13, 0, 0), + 205, + false, + true, + false, + true); + public static UCDPropertyDetail kRSKanWa_Detail = + new UCDPropertyDetail( + UcdProperty.kRSKanWa, + VersionInfo.getInstance(2, 0, 0), + 206, + false, + true, + false, + true); + public static UCDPropertyDetail kRSJapanese_Detail = + new UCDPropertyDetail( + UcdProperty.kRSJapanese, + VersionInfo.getInstance(2, 0, 0), + 207, + false, + true, + false, + true); + public static UCDPropertyDetail kRSKorean_Detail = + new UCDPropertyDetail( + UcdProperty.kRSKorean, + VersionInfo.getInstance(2, 0, 0), + 208, + false, + true, + false, + true); + /* kRSKangXi passes a second VersionInfo (15.1.0) before the id, unlike the other kRS* entries around it. NOTE(review): presumably an upper version bound - confirm against the matching constructor overload. */ public static UCDPropertyDetail kRSKangXi_Detail = + new UCDPropertyDetail( + UcdProperty.kRSKangXi, + VersionInfo.getInstance(2, 0, 0), + VersionInfo.getInstance(15, 1, 0), + 209, + false, + true, + false, + true); + public static UCDPropertyDetail kRSAdobe_Japan1_6_Detail = + new UCDPropertyDetail( + UcdProperty.kRSAdobe_Japan1_6, + VersionInfo.getInstance(4, 1, 0), + 210, + false, + true, + false, + true); + public static UCDPropertyDetail kTotalStrokes_Detail = + new UCDPropertyDetail( + UcdProperty.kTotalStrokes, + VersionInfo.getInstance(3, 1, 0), + 211, + false, + true, + false, + true); + public static UCDPropertyDetail kRSTUnicode_Detail = + new UCDPropertyDetail( + UcdProperty.kRSTUnicode, + VersionInfo.getInstance(9, 0, 0), + 212, + false, + true, + false, + true); + public static UCDPropertyDetail kTGT_MergedSrc_Detail = + new UCDPropertyDetail( + UcdProperty.kTGT_MergedSrc, + 
VersionInfo.getInstance(9, 0, 0), + 213, + false, + true, + false, + true); + public static UCDPropertyDetail kSrc_NushuDuben_Detail = + new UCDPropertyDetail( + UcdProperty.kSrc_NushuDuben, + VersionInfo.getInstance(10, 0, 0), + 214, + false, + true, + false, + true); + public static UCDPropertyDetail kReading_Detail = + new UCDPropertyDetail( + UcdProperty.kReading, + VersionInfo.getInstance(10, 0, 0), + 215, + false, + true, + false, + true); + public static UCDPropertyDetail ISO_Comment_Detail = + new UCDPropertyDetail( + UcdProperty.ISO_Comment, + VersionInfo.getInstance(11, 0, 0), + 216, + true, + false, + false, + true); + public static UCDPropertyDetail Unicode_1_Name_Detail = + new UCDPropertyDetail( + UcdProperty.Unicode_1_Name, + VersionInfo.getInstance(2, 0, 0), + 217, + true, + false, + false, + true); + /* Name_Alias passes (false, false, false, true): the first two flags are both false, whereas the neighbouring entries set exactly one of them. NOTE(review): flag meaning is defined by the constructor, not visible here - confirm this combination is intended. */ public static UCDPropertyDetail Name_Alias_Detail = + new UCDPropertyDetail( + UcdProperty.Name_Alias, + VersionInfo.getInstance(5, 0, 0), + 218, + false, + false, + false, + true); + public static UCDPropertyDetail Emoji_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji, + VersionInfo.getInstance(13, 0, 0), + 219, + true, + false, + false, + true); + public static UCDPropertyDetail Emoji_Presentation_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_Presentation, + VersionInfo.getInstance(13, 0, 0), + 220, + true, + false, + false, + true); + public static UCDPropertyDetail Emoji_Modifier_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_Modifier, + VersionInfo.getInstance(13, 0, 0), + 221, + true, + false, + false, + true); + public static UCDPropertyDetail Emoji_Modifier_Base_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_Modifier_Base, + VersionInfo.getInstance(13, 0, 0), + 222, + true, + false, + false, + true); + public static UCDPropertyDetail Emoji_Component_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_Component, + VersionInfo.getInstance(13, 0, 0), + 223, + true, + false, + false, + true); + public static 
UCDPropertyDetail Extended_Pictographic_Detail = + new UCDPropertyDetail( + UcdProperty.Extended_Pictographic, + VersionInfo.getInstance(13, 0, 0), + 224, + true, + false, + false, + true); + public static UCDPropertyDetail kStrange_Detail = + new UCDPropertyDetail( + UcdProperty.kStrange, + VersionInfo.getInstance(14, 0, 0), + 225, + false, + true, + false, + true); + public static UCDPropertyDetail kAlternateTotalStrokes_Detail = + new UCDPropertyDetail( + UcdProperty.kAlternateTotalStrokes, + VersionInfo.getInstance(15, 0, 0), + 226, + false, + true, + false, + true); + public static UCDPropertyDetail NFKC_Simple_Casefold_Detail = + new UCDPropertyDetail( + UcdProperty.NFKC_Simple_Casefold, + VersionInfo.getInstance(15, 1, 0), + 227, + true, + false, + false, + true); + public static UCDPropertyDetail ID_Compat_Math_Start_Detail = + new UCDPropertyDetail( + UcdProperty.ID_Compat_Math_Start, + VersionInfo.getInstance(15, 1, 0), + 228, + true, + false, + false, + true); + public static UCDPropertyDetail ID_Compat_Math_Continue_Detail = + new UCDPropertyDetail( + UcdProperty.ID_Compat_Math_Continue, + VersionInfo.getInstance(15, 1, 0), + 229, + true, + false, + false, + true); + public static UCDPropertyDetail IDS_Unary_Operator_Detail = + new UCDPropertyDetail( + UcdProperty.IDS_Unary_Operator, + VersionInfo.getInstance(15, 1, 0), + 230, + true, + false, + false, + true); + public static UCDPropertyDetail kJapanese_Detail = + new UCDPropertyDetail( + UcdProperty.kJapanese, + VersionInfo.getInstance(15, 1, 0), + 231, + false, + true, + false, + true); + public static UCDPropertyDetail kMojiJoho_Detail = + new UCDPropertyDetail( + UcdProperty.kMojiJoho, + VersionInfo.getInstance(15, 1, 0), + 232, + false, + true, + false, + true); + public static UCDPropertyDetail kSMSZD2003Index_Detail = + new UCDPropertyDetail( + UcdProperty.kSMSZD2003Index, + VersionInfo.getInstance(15, 1, 0), + 233, + false, + true, + false, + true); + public static UCDPropertyDetail 
kSMSZD2003Readings_Detail = + new UCDPropertyDetail( + UcdProperty.kSMSZD2003Readings, + VersionInfo.getInstance(15, 1, 0), + 234, + false, + true, + false, + true); + public static UCDPropertyDetail kVietnameseNumeric_Detail = + new UCDPropertyDetail( + UcdProperty.kVietnameseNumeric, + VersionInfo.getInstance(15, 1, 0), + 235, + false, + true, + false, + true); + public static UCDPropertyDetail kZhuang_Detail = + new UCDPropertyDetail( + UcdProperty.kZhuang, + VersionInfo.getInstance(16, 0, 0), + 236, + false, + true, + false, + true); + public static UCDPropertyDetail kZhuangNumeric_Detail = + new UCDPropertyDetail( + UcdProperty.kZhuangNumeric, + VersionInfo.getInstance(15, 1, 0), + 237, + false, + true, + false, + true); + public static UCDPropertyDetail Indic_Conjunct_Break_Detail = + new UCDPropertyDetail( + UcdProperty.Indic_Conjunct_Break, + VersionInfo.getInstance(15, 1, 0), + 238, + true, + false, + false, + true); + public static UCDPropertyDetail Modifier_Combining_Mark_Detail = + new UCDPropertyDetail( + UcdProperty.Modifier_Combining_Mark, + VersionInfo.getInstance(16, 0, 0), + 239, + true, + false, + false, + true); + public static UCDPropertyDetail kFanqie_Detail = + new UCDPropertyDetail( + UcdProperty.kFanqie, + VersionInfo.getInstance(16, 0, 0), + 240, + false, + true, + false, + true); + public static UCDPropertyDetail Basic_Emoji_Detail = + new UCDPropertyDetail(UcdProperty.Basic_Emoji, -1, false, false, false, false); + public static UCDPropertyDetail CJK_Radical_Detail = + new UCDPropertyDetail(UcdProperty.CJK_Radical, -2, false, false, false, false); + public static UCDPropertyDetail Confusable_MA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_MA, -3, false, false, false, false); + public static UCDPropertyDetail Confusable_ML_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_ML, -4, false, false, false, false); + public static UCDPropertyDetail Confusable_SA_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SA, -5, 
false, false, false, false); + public static UCDPropertyDetail Confusable_SL_Detail = + new UCDPropertyDetail(UcdProperty.Confusable_SL, -6, false, false, false, false); + public static UCDPropertyDetail Do_Not_Emit_Preferred_Detail = + new UCDPropertyDetail( + UcdProperty.Do_Not_Emit_Preferred, -7, false, false, false, false); + public static UCDPropertyDetail Do_Not_Emit_Type_Detail = + new UCDPropertyDetail(UcdProperty.Do_Not_Emit_Type, -8, false, false, false, false); + public static UCDPropertyDetail Emoji_DCM_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_DCM, + VersionInfo.getInstance(6, 0, 0), + -9, + false, + false, + false, + false); + public static UCDPropertyDetail Emoji_KDDI_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_KDDI, + VersionInfo.getInstance(6, 0, 0), + -10, + false, + false, + false, + false); + public static UCDPropertyDetail Emoji_SB_Detail = + new UCDPropertyDetail( + UcdProperty.Emoji_SB, + VersionInfo.getInstance(6, 0, 0), + -11, + false, + false, + false, + false); + public static UCDPropertyDetail Identifier_Status_Detail = + new UCDPropertyDetail( + UcdProperty.Identifier_Status, + VersionInfo.getInstance(9, 0, 0), + -12, + false, + false, + false, + false); + public static UCDPropertyDetail Identifier_Type_Detail = + new UCDPropertyDetail( + UcdProperty.Identifier_Type, + VersionInfo.getInstance(9, 0, 0), + -13, + false, + false, + false, + false); + public static UCDPropertyDetail Idn_2008_Detail = + new UCDPropertyDetail(UcdProperty.Idn_2008, -14, false, false, false, false); + public static UCDPropertyDetail Idn_Mapping_Detail = + new UCDPropertyDetail(UcdProperty.Idn_Mapping, -15, false, false, false, false); + public static UCDPropertyDetail Idn_Status_Detail = + new UCDPropertyDetail(UcdProperty.Idn_Status, -16, false, false, false, false); + public static UCDPropertyDetail Named_Sequences_Detail = + new UCDPropertyDetail(UcdProperty.Named_Sequences, -17, false, false, false, false); + public static 
UCDPropertyDetail Named_Sequences_Prov_Detail = + new UCDPropertyDetail( + UcdProperty.Named_Sequences_Prov, -18, false, false, false, false); + public static UCDPropertyDetail Other_Joining_Type_Detail = + new UCDPropertyDetail(UcdProperty.Other_Joining_Type, -19, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Flag_Sequence_Detail = + new UCDPropertyDetail( + UcdProperty.RGI_Emoji_Flag_Sequence, -20, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Keycap_Sequence_Detail = + new UCDPropertyDetail( + UcdProperty.RGI_Emoji_Keycap_Sequence, -21, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Modifier_Sequence_Detail = + new UCDPropertyDetail( + UcdProperty.RGI_Emoji_Modifier_Sequence, -22, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Tag_Sequence_Detail = + new UCDPropertyDetail( + UcdProperty.RGI_Emoji_Tag_Sequence, -23, false, false, false, false); + public static UCDPropertyDetail RGI_Emoji_Zwj_Sequence_Detail = + new UCDPropertyDetail( + UcdProperty.RGI_Emoji_Zwj_Sequence, -24, false, false, false, false); + public static UCDPropertyDetail Standardized_Variant_Detail = + new UCDPropertyDetail( + UcdProperty.Standardized_Variant, -25, false, false, false, false); + + private UcdProperty ucdProperty; + private VersionInfo minVersion; + private VersionInfo maxVersion; + private int sortOrder; + private boolean isBaseAttribute; + private boolean isCJKAttribute; + private boolean isCJKShowIfEmpty; + private boolean isOrgUCDXMLAttribute; + + private UCDPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + minVersion, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + + private UCDPropertyDetail( + UcdProperty ucdProperty, + int sortOrder, + 
boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this( + ucdProperty, + null, + null, + sortOrder, + isBaseAttribute, + isCJKAttribute, + isCJKShowIfEmpty, + isOrgUCDXMLAttribute); + } + + private UCDPropertyDetail( + UcdProperty ucdProperty, + VersionInfo minVersion, + VersionInfo maxVersion, + int sortOrder, + boolean isBaseAttribute, + boolean isCJKAttribute, + boolean isCJKShowIfEmpty, + boolean isOrgUCDXMLAttribute) { + this.ucdProperty = ucdProperty; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.sortOrder = sortOrder; + this.isBaseAttribute = isBaseAttribute; + this.isCJKAttribute = isCJKAttribute; + this.isCJKShowIfEmpty = isCJKShowIfEmpty; + this.isOrgUCDXMLAttribute = isOrgUCDXMLAttribute; + + allPropertyDetails.add(this); + if (isBaseAttribute) { + basePropertyDetails.add(this); + ucdxmlPropertyDetails.add(this); + } + if (isCJKAttribute) { + cjkPropertyDetails.add(this); + ucdxmlPropertyDetails.add(this); + } + } + + public static Set values() { + return allPropertyDetails; + } + + public static Set baseValues() { + return basePropertyDetails; + } + + public static Set cjkValues() { + return cjkPropertyDetails; + } + + public static Set ucdxmlValues() { + return ucdxmlPropertyDetails; + } + + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } + + public VersionInfo getMinVersion() { + return this.minVersion; + } + + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public boolean isBaseAttribute() { + return this.isBaseAttribute; + } + + public boolean isCJKAttribute() { + return this.isCJKAttribute; + } + + public boolean isCJKShowIfEmpty() { + return this.isCJKShowIfEmpty; + } + + public boolean isOrgUCDXMLAttribute() { + return this.isOrgUCDXMLAttribute; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java new file mode 
100644 index 000000000..0cef1e345 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionComponent.java @@ -0,0 +1,29 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + +/** Helper class that defines an object that stores the version range of a given UcdProperty. */ +public class UCDSectionComponent { + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final UcdProperty ucdProperty; + + UCDSectionComponent(VersionInfo minVersion, VersionInfo maxVersion, UcdProperty ucdProperty) { + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdProperty = ucdProperty; + } + + public VersionInfo getMinVersion() { + return this.minVersion; + } + + public VersionInfo getMaxVersion() { + return this.maxVersion; + } + + public UcdProperty getUcdProperty() { + return this.ucdProperty; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java new file mode 100644 index 000000000..6db3cf82b --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDSectionDetail.java @@ -0,0 +1,229 @@ +package org.unicode.xml; + +import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdProperty; + +/** + * Helper class that defines an object that stores information about a section of the UCDXML file. + * Information includes the section name, the type of elements that the section contains, and the + * version range of the section. 
+ */ +public class UCDSectionDetail { + + public enum UcdSection { + BLOCKS( + "blocks", + "block", + VersionInfo.getInstance(1, 1, 0), + null, + Blocks_Detail, + true, + true), + CJKRADICALS( + "cjk-radicals", + "cjk-radical", + VersionInfo.getInstance(1, 1, 0), + null, + CJKRadicals_Detail, + false, + false), + DONOTEMIT( + "do-not-emit", + "instead", + VersionInfo.getInstance(16, 0, 0), + null, + DoNotEmit_Detail, + false, + false), + EMOJISOURCES( + "emoji-sources", + "emoji-source", + VersionInfo.getInstance(1, 1, 0), + null, + EmojiSources_Detail, + true, + false), + NAMEDSEQUENCES( + "named-sequences", + "named-sequence", + VersionInfo.getInstance(1, 1, 0), + null, + NamedSequences_Detail, + false, + false), + PROVISIONALNAMEDSEQUENCES( + "provisional-named-sequences", + "named-sequence", + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + ProvisionalNamedSequences_Detail, + false, + false), + NORMALIZATIONCORRECTIONS( + "normalization-corrections", + "normalization-correction", + VersionInfo.getInstance(1, 1, 0), + null, + NormalizationCorrections_Detail, + true, + false), + STANDARDIZEDVARIANTS( + "standardized-variants", + "standardized-variant", + VersionInfo.getInstance(1, 1, 0), + null, + StandardizedVariants_Detail, + true, + false); + private final String tag; + private final String childTag; + private final VersionInfo minVersion; + private final VersionInfo maxVersion; + private final UCDSectionDetail ucdSectionDetail; + private final boolean parserWithRange; + private final boolean parserWithMissing; + + UcdSection( + String tag, + String childTag, + VersionInfo minVersion, + VersionInfo maxVersion, + UCDSectionDetail ucdSectionDetail, + boolean parserWithRange, + boolean parserWithMissing) { + this.tag = tag; + this.childTag = childTag; + this.minVersion = minVersion; + this.maxVersion = maxVersion; + this.ucdSectionDetail = ucdSectionDetail; + this.parserWithRange = parserWithRange; + this.parserWithMissing = 
parserWithMissing; + } + + public String toString() { + return tag; + } + + public String getChildTag() { + return childTag; + } + + public VersionInfo getMinVersion() { + return minVersion; + } + + public VersionInfo getMaxVersion() { + return maxVersion; + } + + public UCDSectionDetail getUcdSectionDetail() { + return ucdSectionDetail; + } + + public boolean getParserWithRange() { + return parserWithRange; + } + + public boolean getParserWithMissing() { + return parserWithMissing; + } + } + + public static UCDSectionDetail Blocks_Detail = + new UCDSectionDetail( + UcdSection.BLOCKS, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Block) + }, + 0); + public static UCDSectionDetail NamedSequences_Detail = + new UCDSectionDetail( + UcdSection.NAMEDSEQUENCES, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Named_Sequences) + }, + 1); + public static UCDSectionDetail ProvisionalNamedSequences_Detail = + new UCDSectionDetail( + UcdSection.PROVISIONALNAMEDSEQUENCES, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(5, 0, 0), + VersionInfo.getInstance(13, 0, 0), + UcdProperty.Named_Sequences_Prov) + }, + 1); + public static UCDSectionDetail NormalizationCorrections_Detail = + new UCDSectionDetail( + UcdSection.NORMALIZATIONCORRECTIONS, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.NC_Original) + }, + 2); + public static UCDSectionDetail StandardizedVariants_Detail = + new UCDSectionDetail( + UcdSection.STANDARDIZEDVARIANTS, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), + null, + UcdProperty.Standardized_Variant), + new UCDSectionComponent( + VersionInfo.getInstance(13, 0, 0), + null, + UcdProperty.emoji_variation_sequence) + }, + 3); + public static UCDSectionDetail CJKRadicals_Detail = + new 
UCDSectionDetail( + UcdSection.CJKRADICALS, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.CJK_Radical) + }, + 4); + public static UCDSectionDetail EmojiSources_Detail = + new UCDSectionDetail( + UcdSection.EMOJISOURCES, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), null, UcdProperty.Emoji_DCM) + }, + 5); + public static UCDSectionDetail DoNotEmit_Detail = + new UCDSectionDetail( + UcdSection.DONOTEMIT, + new UCDSectionComponent[] { + new UCDSectionComponent( + VersionInfo.getInstance(1, 1, 0), + null, + UcdProperty.Do_Not_Emit_Type) + }, + 6); + + private final UcdSection ucdSection; + private final UCDSectionComponent[] ucdSectionComponents; + private final int sortOrder; + + private UCDSectionDetail( + UcdSection ucdSection, UCDSectionComponent[] ucdSectionComponents, int sortOrder) { + this.ucdSection = ucdSection; + this.ucdSectionComponents = ucdSectionComponents; + this.sortOrder = sortOrder; + } + + public UcdSection getSection() { + return this.ucdSection; + } + + public UCDSectionComponent[] getUcdSectionComponents() { + return this.ucdSectionComponents; + } + + public int getSortOrder() { + return this.sortOrder; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXML.java b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java new file mode 100644 index 000000000..d4c302e1d --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXML.java @@ -0,0 +1,838 @@ +package org.unicode.xml; + +import com.ibm.icu.dev.tool.UOption; +import com.ibm.icu.util.VersionInfo; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import 
java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.xml.transform.TransformerConfigurationException; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.unicode.text.utility.Settings; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Utility for generating UCDXML files. The utility can build flat or grouped versions of UCDXML for + * non-Unihan code points, Unihan code points, or the complete range of code points. + */ +public class UCDXML { + + private static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private enum UCDXMLOUTPUTRANGE { + ALL, + NOUNIHAN, + UNIHAN; + } + + private enum UCDXMLOUTPUTTYPE { + FLAT, + GROUPED; + } + + private enum Range { + RESERVED("reserved"), + SURROGATE("surrogate"), + NONCHARACTER("noncharacter"), + CHARACTER("char"), + CJKUNIFIEDIDEOGRAPH("char"), + NONRANGE("nonrange"); + + private final String tag; + + Range(String tag) { + this.tag = tag; + } + + public String toString() { + return tag; + } + } + + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("ucdversion", 'v', UOption.OPTIONAL_ARG), + UOption.create("range", 'r', UOption.REQUIRES_ARG), + UOption.create("output", 'o', UOption.REQUIRES_ARG), + UOption.create("outputfolder", 'f', UOption.OPTIONAL_ARG) + }; + private static final int HELP = 0, UCDVERSION = 1, RANGE = 2, OUTPUT = 3, OUTPUTFOLDER = 4; + + public static void main(String[] args) throws Exception { + + VersionInfo ucdVersion = null; + UCDXMLOUTPUTRANGE[] ucdxmloutputranges = + new UCDXMLOUTPUTRANGE[] { + UCDXMLOUTPUTRANGE.ALL, UCDXMLOUTPUTRANGE.NOUNIHAN, UCDXMLOUTPUTRANGE.UNIHAN + }; + UCDXMLOUTPUTTYPE[] ucdxmloutputtypes = + new UCDXMLOUTPUTTYPE[] {UCDXMLOUTPUTTYPE.FLAT, UCDXMLOUTPUTTYPE.GROUPED}; + 
File destinationFolder = null; + + UOption.parseArgs(args, options); + + if (options[HELP].doesOccur) { + System.out.println( + "UCDXML [--ucdversion {version number}] [--outputfolder {destination}] " + + "--range [ALL|NOUNIHAN|UNIHAN] --output [FLAT|GROUPED]"); + System.exit(0); + } + + try { + if (options[UCDVERSION].doesOccur) { + try { + ucdVersion = VersionInfo.getInstance(options[UCDVERSION].value); + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[UCDVERSION].value + + " to a valid UCD version"); + } + } else { + ucdVersion = VersionInfo.getInstance(Settings.latestVersion); + } + if (options[RANGE].doesOccur) { + try { + ucdxmloutputranges = + new UCDXMLOUTPUTRANGE[] { + UCDXMLOUTPUTRANGE.valueOf( + options[RANGE].value.toUpperCase(Locale.ROOT)) + }; + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[RANGE].value + + " to one of [ALL|NOUNIHAN|UNIHAN]"); + } + } + if (options[OUTPUT].doesOccur) { + try { + ucdxmloutputtypes = + new UCDXMLOUTPUTTYPE[] { + UCDXMLOUTPUTTYPE.valueOf( + options[OUTPUT].value.toUpperCase(Locale.ROOT)) + }; + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not convert " + + options[OUTPUT].value + + " to one of [FLAT|GROUPED]"); + } + } + if (options[OUTPUTFOLDER].doesOccur) { + try { + destinationFolder = + new File( + options[OUTPUTFOLDER].value + + ucdVersion.getVersionString(3, 3) + + "/"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new IllegalArgumentException( + "Could not find or create " + options[OUTPUTFOLDER].value); + } + } else { + try { + destinationFolder = + new File( + Settings.Output.GEN_DIR + + "ucdxml\\" + + ucdVersion.getVersionString(3, 3) + + "\\"); + if (!destinationFolder.exists()) { + if (!destinationFolder.mkdirs()) { + throw new IOException(); + } + } + } catch (Exception e) { + throw new 
IllegalArgumentException( + "Could not find or create " + + Settings.Output.GEN_DIR + + "ucdxml\\" + + ucdVersion.getVersionString(3, 3) + + "\\"); + } + } + + } catch (Exception e) { + System.err.println(e.getMessage()); + System.exit(1); + } + + if (ucdVersion != null && destinationFolder.exists()) { + for (UCDXMLOUTPUTRANGE ucdxmloutputrange : ucdxmloutputranges) { + for (UCDXMLOUTPUTTYPE ucdxmloutputtype : ucdxmloutputtypes) { + System.out.println( + "Building the " + + ucdxmloutputrange + + " " + + ucdxmloutputtype + + " UcdXML file for " + + ucdVersion); + buildUcdXMLFile( + ucdVersion, destinationFolder, ucdxmloutputrange, ucdxmloutputtype); + } + } + System.out.println("End"); + System.exit(0); + } else { + System.err.println("Unexpected error when building UcdXML file."); + System.exit(1); + } + } + + private static void buildUcdXMLFile( + VersionInfo ucdVersion, + File destinationFolder, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws IOException, TransformerConfigurationException, SAXException { + int lowCodePoint = 0x0; + int highCodePoint = 0x10FFFF; + // Tangut + // int lowCodePoint = 0x17000; + // int highCodePoint = 0x1B2FB; + // 0x10FFFF + + File tempFile = new File(destinationFolder, "temp.xml"); + String outputFilename = + "ucd." + + outputRange.toString().toLowerCase(Locale.ROOT) + + "." 
+ + outputType.toString().toLowerCase(Locale.ROOT) + + ".xml"; + File destinationFile = new File(destinationFolder, outputFilename); + + FileOutputStream fileOutputStream = new FileOutputStream(tempFile); + UCDXMLWriter writer = new UCDXMLWriter(fileOutputStream); + + IndexUnicodeProperties iup = IndexUnicodeProperties.make(ucdVersion); + AttributeResolver attributeResolver = new AttributeResolver(iup); + UCDDataResolver ucdDataResolver = new UCDDataResolver(iup, NAMESPACE, writer); + + writer.startFile(); + writer.startElement("ucd"); + { + writer.startElement("description"); + { + writer.addContent("Unicode " + ucdVersion.getVersionString(3, 3)); + writer.endElement("description"); + } + buildRepertoire( + writer, + attributeResolver, + ucdVersion, + lowCodePoint, + highCodePoint, + outputRange, + outputType); + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.BLOCKS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.PROVISIONALNAMEDSEQUENCES); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.NORMALIZATIONCORRECTIONS); + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.STANDARDIZEDVARIANTS); + if (ucdVersion.compareTo(VersionInfo.getInstance(5, 2, 0)) >= 0) { + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.CJKRADICALS); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(6, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.EMOJISOURCES); + } + if (ucdVersion.compareTo(VersionInfo.getInstance(16, 0, 0)) >= 0) { + ucdDataResolver.buildSection(UCDSectionDetail.UcdSection.DONOTEMIT); + } + } + writer.endElement("ucd"); + } + writer.endFile(); + fileOutputStream.close(); + cleanUcdXMLFile(tempFile, destinationFile); + if (!tempFile.delete()) { + throw new IOException("Could not delete temporary file " + tempFile); + } + } + + private static void cleanUcdXMLFile(File 
tempFile, File destinationFile) throws IOException { + // XALAN writes out characters outside the BMP as entities. + // Use this code to replace the entities with the correct characters. + // See: https://issues.apache.org/jira/browse/XALANJ-2595 + + FileInputStream fileInputStream = new FileInputStream(tempFile); + FileOutputStream fileOutputStream = new FileOutputStream(destinationFile); + + InputStreamReader inputStreamReader = + new InputStreamReader(fileInputStream, StandardCharsets.UTF_8); + OutputStreamWriter outputStreamWriter = + new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); + + BufferedReader bufferedReader = new BufferedReader(inputStreamReader); + BufferedWriter bufferedWriter = new BufferedWriter(outputStreamWriter); + + String line; + while ((line = bufferedReader.readLine()) != null) { + Matcher matcher = Pattern.compile("&#(\\d+);").matcher(line); + line = + matcher.replaceAll( + matchResult -> + new String( + Character.toChars(Integer.parseInt(matcher.group(1))))); + bufferedWriter.append(line); + bufferedWriter.newLine(); + } + bufferedWriter.flush(); + fileInputStream.close(); + fileOutputStream.close(); + } + + private static void buildRepertoire( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodePoint, + int highCodePoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + writer.startElement("repertoire"); + { + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + CodePoint = + buildGroup( + writer, + attributeResolver, + ucdVersion, + CodePoint, + highCodePoint, + outputRange, + outputType); + } else { + CodePoint = + buildChars( + writer, + attributeResolver, + ucdVersion, + CodePoint, + highCodePoint, + outputRange, + outputType, + null); + } + } + } + writer.endElement("repertoire"); + 
} + } + + private static int buildGroup( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodePoint, + int highCodePoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType) + throws SAXException { + + int lastCodePointInGroup = + getLastCodePointInGroup(attributeResolver, lowCodePoint, highCodePoint); + + AttributesImpl groupAttrs = + getGroupAttributes( + ucdVersion, + attributeResolver, + lowCodePoint, + lastCodePointInGroup, + outputRange); + + writer.startElement("group", groupAttrs); + { + buildChars( + writer, + attributeResolver, + ucdVersion, + lowCodePoint, + lastCodePointInGroup, + outputRange, + outputType, + groupAttrs); + writer.endElement("group"); + } + return lastCodePointInGroup; + } + + private static int buildChars( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int lowCodePoint, + int highCodePoint, + UCDXMLOUTPUTRANGE outputRange, + UCDXMLOUTPUTTYPE outputType, + AttributesImpl groupAttrs) + throws SAXException { + + ArrayList range = new ArrayList<>(); + Range rangeType = Range.NONRANGE; + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (attributeResolver.isUnassignedCodePoint(CodePoint) + || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && attributeResolver.isUnifiedIdeograph(CodePoint))) { + Range currentRangeType = getRangeType(attributeResolver, CodePoint); + if (!range.isEmpty()) { + if (!currentRangeType.equals(rangeType) + || attributeResolver.isDifferentRange( + ucdVersion, CodePoint, CodePoint - 1)) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + } + } + range.add(CodePoint); + rangeType = currentRangeType; + } else { + if 
(!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, + attributeResolver, + ucdVersion, + range, + rangeType, + groupAttrs); + } else { + buildUngroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType); + } + } + range.clear(); + rangeType = Range.NONRANGE; + } + if (isWritableCodePoint(CodePoint, outputRange, attributeResolver)) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedChar( + writer, + attributeResolver, + ucdVersion, + CodePoint, + outputRange, + groupAttrs); + } else { + buildUngroupedChar( + writer, attributeResolver, ucdVersion, CodePoint, outputRange); + } + } + } + } + // Handle any range before the end of the repertoire element. + if (!range.isEmpty()) { + if (outputRange != UCDXMLOUTPUTRANGE.UNIHAN) { + if (outputType == UCDXMLOUTPUTTYPE.GROUPED) { + buildGroupedRange( + writer, attributeResolver, ucdVersion, range, rangeType, groupAttrs); + } else { + buildUngroupedRange(writer, attributeResolver, ucdVersion, range, rangeType); + } + } + } + return highCodePoint; + } + + private static void buildUngroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int CodePoint, + UCDXMLOUTPUTRANGE outputRange) + throws SAXException { + + AttributesImpl charAttributes = + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); + buildChar(writer, attributeResolver, CodePoint, charAttributes); + } + + private static void buildGroupedChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + int CodePoint, + UCDXMLOUTPUTRANGE outputRange, + AttributesImpl groupAttrs) + throws SAXException { + + AttributesImpl orgCharAttributes = + getAttributes(ucdVersion, attributeResolver, CodePoint, outputRange); + AttributesImpl charAttributes = new AttributesImpl(); + charAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", 
attributeResolver.getHexString(CodePoint)); + + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgCharAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + charAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + buildChar(writer, attributeResolver, CodePoint, charAttributes); + } + + private static void buildChar( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + int CodePoint, + AttributesImpl charAttributes) + throws SAXException { + writer.startElement("char", charAttributes); + { + HashMap nameAliases = attributeResolver.getNameAliases(CodePoint); + if (null != nameAliases && !nameAliases.isEmpty()) { + for (String alias : nameAliases.keySet()) { + AttributesImpl nameAliasAt = new AttributesImpl(); + nameAliasAt.addAttribute(NAMESPACE, "alias", "alias", "CDATA", alias); + String type = nameAliases.get(alias); + if (!Objects.equals(type, "none")) { + nameAliasAt.addAttribute( + NAMESPACE, "type", "type", "CDATA", nameAliases.get(alias)); + } + writer.startElement("name-alias", nameAliasAt); + { + writer.endElement("name-alias"); + } + } + } + writer.endElement("char"); + } + } + + private static void buildGroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType, + AttributesImpl groupAttrs) + throws SAXException { + AttributesImpl orgRangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + AttributesImpl rangeAttributes = new AttributesImpl(); + if (range.size() == 1) { + rangeAttributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", 
attributeResolver.getHexString(range.get(0))); + } else { + rangeAttributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + rangeAttributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { + String qName = propDetail.getUcdProperty().getShortName(); + if (qName.startsWith("cjk")) { + qName = qName.substring(2); + } + String orgCharAttributesValue = orgRangeAttributes.getValue(qName); + String groupAttributeValue = groupAttrs.getValue(qName); + if (!Objects.equals(orgCharAttributesValue, groupAttributeValue)) { + rangeAttributes.addAttribute( + NAMESPACE, + qName, + qName, + "CDATA", + Objects.requireNonNullElse(orgCharAttributesValue, "")); + } + } + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static void buildUngroupedRange( + UCDXMLWriter writer, + AttributeResolver attributeResolver, + VersionInfo ucdVersion, + ArrayList range, + Range rangeType) + throws SAXException { + AttributesImpl rangeAttributes = + getReservedAttributes(ucdVersion, attributeResolver, range); + writer.startElement(rangeType.tag, rangeAttributes); + { + writer.endElement(rangeType.tag); + } + } + + private static boolean isWritableCodePoint( + int CodePoint, UCDXMLOUTPUTRANGE outputRange, AttributeResolver attributeResolver) { + return outputRange == UCDXMLOUTPUTRANGE.ALL + || (outputRange == UCDXMLOUTPUTRANGE.UNIHAN + && attributeResolver.isUnihanAttributeRange(CodePoint)) + || (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN + && !attributeResolver.isUnifiedIdeograph(CodePoint)); + } + + private static Range getRangeType(AttributeResolver attributeResolver, int CodePoint) { + String NChar = attributeResolver.getNChar(CodePoint); + UcdPropertyValues.General_Category_Values gc = 
attributeResolver.getgc(CodePoint); + + if (attributeResolver.isUnihanAttributeRange(CodePoint)) { + return Range.CJKUNIFIEDIDEOGRAPH; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Surrogate)) { + return Range.SURROGATE; + } + if (gc.equals(UcdPropertyValues.General_Category_Values.Private_Use)) { + return Range.CHARACTER; + } + if (NChar.equals(UcdPropertyValues.Binary.Yes.getShortName())) { + return Range.NONCHARACTER; + } + return Range.RESERVED; + } + + private static int getLastCodePointInGroup( + AttributeResolver attributeResolver, int lowCodePoint, int highCodePoint) { + String blk = attributeResolver.getAttributeValue(UcdProperty.Block, lowCodePoint); + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!blk.equals(attributeResolver.getAttributeValue(UcdProperty.Block, CodePoint))) { + return CodePoint - 1; + } + if (CodePoint == 0x20 - 1 // put the C0 controls in their own group + || CodePoint == 0xa0 - 1 // put the C1 controls in their own group + || CodePoint == 0x1160 - 1 // split the jamos into three groups + || CodePoint == 0x11a8 - 1 // split the jamos into three groups + || CodePoint == 0x1f1e6 - 1 // put the regional indicators in their own group + ) { + return CodePoint; + } + } + return highCodePoint; + } + + private static AttributesImpl getAttributes( + VersionInfo version, + AttributeResolver attributeResolver, + int CodePoint, + UCDXMLOUTPUTRANGE outputRange) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(CodePoint)); + + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { + UcdProperty prop = propDetail.getUcdProperty(); + if (version.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) < 0)) { + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); + boolean 
isAttributeIncluded = + getIsAttributeIncluded( + attrValue, + attributeResolver.isUnihanAttributeRange(CodePoint), + propDetail, + prop, + outputRange); + if (isAttributeIncluded) { + String propName = prop.getShortName(); + if (propName.startsWith("cjk")) { + propName = prop.getNames().getAllNames().get(1); + } + attributes.addAttribute(NAMESPACE, propName, propName, "CDATA", attrValue); + } + } + } + return attributes; + } + + private static AttributesImpl getGroupAttributes( + VersionInfo version, + AttributeResolver attributeResolver, + int lowCodePoint, + int highCodePoint, + UCDXMLOUTPUTRANGE outputRange) { + AttributesImpl attributes = new AttributesImpl(); + + for (UCDPropertyDetail propDetail : UCDPropertyDetail.ucdxmlValues()) { + UcdProperty prop = propDetail.getUcdProperty(); + if (version.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) < 0)) { + int totalCount = 0; + Map counters = new LinkedHashMap<>(); + + for (int CodePoint = lowCodePoint; CodePoint <= highCodePoint; CodePoint++) { + if (!attributeResolver.isUnassignedCodePoint(CodePoint)) { + String attrValue = attributeResolver.getAttributeValue(prop, CodePoint); + int currentCount = + (counters.get(attrValue) == null) ? 
0 : counters.get(attrValue); + currentCount++; + totalCount++; + counters.put(attrValue, currentCount); + } + } + int max = Integer.MIN_VALUE; + String bestAttrValue = null; + for (String attrValue : counters.keySet()) { + int thisCount = counters.get(attrValue); + if (thisCount > max) { + max = thisCount; + bestAttrValue = attrValue; + } + } + switch (prop) { + case Decomposition_Mapping: + case Simple_Uppercase_Mapping: + case Simple_Lowercase_Mapping: + case Simple_Titlecase_Mapping: + case Uppercase_Mapping: + case Lowercase_Mapping: + case Titlecase_Mapping: + case Simple_Case_Folding: + case Case_Folding: + if (bestAttrValue != null) { + bestAttrValue = "#"; + } + } + if (max > 0.2 * totalCount && max > 1) { + boolean isAttributeIncluded = + getIsAttributeIncluded( + bestAttrValue, + attributeResolver.isUnihanAttributeRange(lowCodePoint), + propDetail, + prop, + outputRange); + if (isAttributeIncluded) { + String propName = prop.getShortName(); + if (propName.startsWith("cjk")) { + propName = prop.getNames().getAllNames().get(1); + } + attributes.addAttribute( + NAMESPACE, propName, propName, "CDATA", bestAttrValue); + } + } + } + } + return attributes; + } + + private static boolean getIsAttributeIncluded( + String attrValue, + boolean isUnihanAttributeRange, + UCDPropertyDetail propDetail, + UcdProperty prop, + UCDXMLOUTPUTRANGE outputRange) { + if (attrValue == null) { + return false; + } + if (isUnihanAttributeRange) { + if (outputRange == UCDXMLOUTPUTRANGE.UNIHAN) { + if (prop.equals(UcdProperty.Numeric_Type) && !attrValue.equals("None")) { + return true; + } + if (prop.equals(UcdProperty.Numeric_Value) && !attrValue.equals("NaN")) { + return true; + } + return propDetail.isCJKAttribute() + && (propDetail.isCJKShowIfEmpty() || !attrValue.isEmpty()); + } + if (outputRange == UCDXMLOUTPUTRANGE.NOUNIHAN && propDetail.isCJKAttribute()) { + return false; + } + if (propDetail.isCJKShowIfEmpty()) { + return true; + } + } + if (propDetail.isBaseAttribute()) { + 
return true; + } + return !attrValue.isEmpty(); + } + + private static AttributesImpl getReservedAttributes( + VersionInfo version, AttributeResolver attributeResolver, ArrayList range) { + AttributesImpl attributes = new AttributesImpl(); + + if (range.size() == 1) { + attributes.addAttribute( + NAMESPACE, "cp", "cp", "CDATA", attributeResolver.getHexString(range.get(0))); + } else { + attributes.addAttribute( + NAMESPACE, + "first-cp", + "first-cp", + "CDATA", + attributeResolver.getHexString(range.get(0))); + attributes.addAttribute( + NAMESPACE, + "last-cp", + "last-cp", + "CDATA", + attributeResolver.getHexString(range.get(range.size() - 1))); + } + for (UCDPropertyDetail propDetail : UCDPropertyDetail.baseValues()) { + UcdProperty prop = propDetail.getUcdProperty(); + if (version.compareTo(propDetail.getMinVersion()) >= 0 + && (propDetail.getMaxVersion() == null + || version.compareTo(propDetail.getMaxVersion()) <= 0)) { + String attrValue = + attributeResolver.getAttributeValue( + propDetail.getUcdProperty(), range.get(0)); + + attributes.addAttribute( + NAMESPACE, prop.getShortName(), prop.getShortName(), "CDATA", attrValue); + } + } + return attributes; + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java new file mode 100644 index 000000000..7358ed26f --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/UCDXMLWriter.java @@ -0,0 +1,77 @@ +package org.unicode.xml; + +import java.io.FileOutputStream; +import java.text.SimpleDateFormat; +import java.util.Date; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import org.xml.sax.SAXException; +import 
org.xml.sax.helpers.AttributesImpl; + +/** Helper class for writing the contents for the UCDXML files. */ +public class UCDXMLWriter { + + public static final String NAMESPACE = "http://www.unicode.org/ns/2003/ucd/1.0"; + + private final TransformerHandler transformerHandler; + + public TransformerHandler getTransformerHandler() { + return transformerHandler; + } + + public UCDXMLWriter(FileOutputStream f) throws TransformerConfigurationException { + TransformerFactory tfactory = TransformerFactory.newInstance(); + SAXTransformerFactory sfactory = (SAXTransformerFactory) tfactory; + transformerHandler = sfactory.newTransformerHandler(); + Transformer transformer = transformerHandler.getTransformer(); + transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty(OutputKeys.STANDALONE, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "3"); + transformerHandler.setResult(new StreamResult(f)); + } + + public void startFile() throws SAXException { + String copyrightYear = new SimpleDateFormat("yyyy").format(new Date()); + transformerHandler.startDocument(); + char[] c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + c = (" \u00A9 " + copyrightYear + " Unicode\u00AE, Inc. 
").toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + c = " For terms of use, see http://www.unicode.org/terms_of_use.html ".toCharArray(); + transformerHandler.comment(c, 0, c.length); + c = "\n\n\n".toCharArray(); + transformerHandler.characters(c, 0, c.length); + } + + public void endFile() throws SAXException { + transformerHandler.endDocument(); + } + + public void startElement(String tagName) throws SAXException { + AttributesImpl attributes = new AttributesImpl(); + startElement(tagName, attributes); + } + + public void startElement(String tagName, AttributesImpl attributes) throws SAXException { + transformerHandler.startElement(NAMESPACE, tagName, tagName, attributes); + } + + public void addContent(String s) throws SAXException { + char[] d = s.toCharArray(); + transformerHandler.characters(d, 0, d.length); + } + + public void endElement(String tagName) throws SAXException { + transformerHandler.endElement(NAMESPACE, tagName, tagName); + } +} diff --git a/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java new file mode 100644 index 000000000..d1f6e178e --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/xml/XMLProperties.java @@ -0,0 +1,491 @@ +package org.unicode.xml; + +import com.ibm.icu.impl.UnicodeMap; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.unicode.cldr.util.XMLFileReader; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.text.utility.Utility; +import org.xml.sax.*; + +/** + * Helper class for org.unicode.xml.CompareUCDXML. Facilitates traversal of the contents of a UCDXML + * file. 
+ */ +public class XMLProperties { + + enum XmlLeaf { + // Leaf + BLOCK, + BLOCKS, + CHAR, + CJK_RADICAL, + CJK_RADICALS, + DESCRIPTION, + DO_NOT_EMIT, + EMOJI_SOURCE, + EMOJI_SOURCES, + GROUP, + INSTEAD, + NAME_ALIAS, + NAMED_SEQUENCE, + NAMED_SEQUENCES, + NONCHARACTER, + NORMALIZATION_CORRECTION, + NORMALIZATION_CORRECTIONS, + PROVISIONAL_NAMED_SEQUENCES, + REPERTOIRE, + RESERVED, + STANDARDIZED_VARIANT, + STANDARDIZED_VARIANTS, + SURROGATE, + UCD; + static final XmlLeaf GREATEST_LEAF = NAME_ALIAS; + static final XmlLeaf GREATEST_BOTH = CHAR; + + static XmlLeaf forString(String source) { + try { + return XmlLeaf.valueOf(source.toUpperCase().replace('-', '_')); + } catch (final Exception e) { + return null; + } + } + } + + static class IntRange { + int start; + int end; + } + + Map> property2data = + new EnumMap>(UcdProperty.class); + + { + for (final UcdProperty prop : UcdProperty.values()) { + property2data.put(prop, new UnicodeMap()); + } + } + + Set leavesNotHandled = new LinkedHashSet(); + + public XMLProperties(File ucdxmlFile) { + readFile(ucdxmlFile); + + for (final UcdProperty prop : property2data.keySet()) { + final UnicodeMap map = property2data.get(prop); + map.freeze(); + } + } + + public void readFile(File ucdxmlFile) { + try { + System.out.println("Reading: " + ucdxmlFile.toString()); + final FileInputStream fis = new FileInputStream(ucdxmlFile); + final XMLReader xmlReader = XMLFileReader.createXMLReader(false); + xmlReader.setErrorHandler(new MyErrorHandler()); + xmlReader.setContentHandler(new MyContentHandler()); + final InputSource is = new InputSource(fis); + is.setSystemId(ucdxmlFile.toString()); + xmlReader.parse(is); + fis.close(); + } catch (final IOException | SAXException e) { + System.out.println("\t" + "Can't read " + ucdxmlFile); + System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); + } + } + + class MyContentHandler implements ContentHandler { + IntRange cp = new IntRange(); + HashMap attributes = new HashMap(); + 
HashMap groupAttributes = new HashMap(); + private final List lastElements = new ArrayList(); + + public MyContentHandler() {} + + @Override + public void characters(char[] arg0, int arg1, int arg2) throws SAXException { + final String chars = String.valueOf(arg0, arg1, arg2).trim(); + if (!chars.trim().isEmpty() + && lastElements.get(lastElements.size() - 1) != XmlLeaf.DESCRIPTION) { + throw new IllegalArgumentException("Should have no element content"); + } + } + + @Override + public void endElement(String arg0, String arg1, String arg2) throws SAXException { + try { + if (lastElements.isEmpty()) { + System.out.println( + "endElement: can't remove last element. Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2); + } else { + final XmlLeaf removed = lastElements.remove(lastElements.size() - 1); + } + } catch (ArrayIndexOutOfBoundsException e) { + throw new IllegalArgumentException( + "endElement: can't remove last element. Args: " + + arg0 + + ", " + + arg1 + + ", " + + arg2, + e); + } + } + + @Override + public void endDocument() throws SAXException {} + + @Override + public void endPrefixMapping(String arg0) throws SAXException {} + + @Override + public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {} + + @Override + public void processingInstruction(String arg0, String arg1) throws SAXException {} + + @Override + public void setDocumentLocator(Locator arg0) {} + + @Override + public void skippedEntity(String arg0) throws SAXException {} + + @Override + public void startDocument() throws SAXException {} + + @Override + public void startPrefixMapping(String arg0, String arg1) throws SAXException {} + + @Override + public void startElement( + String namespaceURI, String localName, String qName, Attributes atts) { + try { + final XmlLeaf xmlLeaf = XmlLeaf.forString(qName); + if (xmlLeaf == null) { + throw new IllegalArgumentException(qName); + } + lastElements.add(xmlLeaf); + // System.out.println("Added:\t" + lastElements); + + 
if (xmlLeaf == XmlLeaf.GROUP) { + groupAttributes.clear(); + addAttributes(atts, groupAttributes); + return; + } + attributes.clear(); + attributes.putAll(groupAttributes); + addAttributes(atts, attributes); + String cps; + switch (xmlLeaf) { + case CHAR: + case RESERVED: + case SURROGATE: + case NONCHARACTER: + parseCp(attributes); + for (final Map.Entry entry : attributes.entrySet()) { + doAttributes(entry.getKey(), entry.getValue()); + } + if (xmlLeaf == XmlLeaf.NONCHARACTER) { + property2data + .get(UcdProperty.Noncharacter_Code_Point) + .putAll(cp.start, cp.end, "Yes"); + } + break; + case BLOCK: + parseCp(attributes); + property2data + .get(UcdProperty.Block) + .putAll(cp.start, cp.end, attributes.get("name")); + break; + case NAMED_SEQUENCE: + cps = Utility.fromHex(attributes.get("cps")); + property2data + .get(UcdProperty.Named_Sequences) + .put(cps, attributes.get("name")); + break; + case CJK_RADICAL: + final String number = attributes.get("number"); + setProp( + Utility.fromHex(attributes.get("radical")), + UcdProperty.CJK_Radical, + number); + setProp( + Utility.fromHex(attributes.get("ideograph")), + UcdProperty.CJK_Radical, + number); + break; + case EMOJI_SOURCE: + cps = Utility.fromHex(attributes.get("unicode")); + setProp(cps, UcdProperty.Emoji_DCM, attributes.get("docomo")); + setProp(cps, UcdProperty.Emoji_KDDI, attributes.get("kddi")); + setProp(cps, UcdProperty.Emoji_SB, attributes.get("softbank")); + break; + case REPERTOIRE: + case BLOCKS: + case CJK_RADICALS: + case EMOJI_SOURCES: + case NAMED_SEQUENCES: + case PROVISIONAL_NAMED_SEQUENCES: + case NORMALIZATION_CORRECTIONS: + case STANDARDIZED_VARIANTS: + case DESCRIPTION: + case DO_NOT_EMIT: + // non-informational nodes, skip + if (atts.getLength() != 0) { + throw new IllegalArgumentException("Has attributes"); + } + break; + case UCD: + if (atts.getLength() != 0) { + throw new IllegalArgumentException( + "Has wrong number of attributes: " + attributes.entrySet()); + } + break; + case 
NAME_ALIAS: + final String alias = + attributes.get("alias") + "(" + attributes.get("type") + ")"; + appendProp(cp.start, UcdProperty.Name_Alias, alias); + break; + case STANDARDIZED_VARIANT: + { + String desc = attributes.get("desc"); + final String when = attributes.get("when"); + if (!when.isEmpty()) { + desc = desc + "(" + when + ")"; + } + cps = Utility.fromHex(attributes.get("cps")); + appendProp(cps, UcdProperty.Standardized_Variant, desc); + break; + } + case NORMALIZATION_CORRECTION: + final String correction = + "old: " + + attributes.get("old") + + " new: " + + attributes.get("new") + + " version: " + + attributes.get("version"); + cps = Utility.fromHex(attributes.get("cp")); + appendProp(cps, UcdProperty.NC_Original, correction); + break; + case INSTEAD: + final String instead = + "use: " + + attributes.get("use") + + " because: " + + attributes.get("because"); + cps = attributes.get("of"); + appendProp(cps, UcdProperty.Do_Not_Emit_Preferred, instead); + break; + case GROUP: + break; // handled above. Leaving case for clarity + default: + leavesNotHandled.add(qName); + break; + } + } catch (final Exception e) { + System.out.println( + "Exception: " + + qName + + "\t" + + e.getClass().getName() + + "\t" + + e.getMessage()); + } + } + + public void addAttributes(Attributes atts, Map map) { + for (int i = 0; i < atts.getLength(); ++i) { + map.put(atts.getQName(i), atts.getValue(i)); + } + } + + public void setProp(String cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void setProp(int cps, UcdProperty ucdProperty, String docomo) { + if (docomo != null) { + property2data.get(ucdProperty).put(cps, docomo); + } + } + + public void appendProp(int cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? 
docomo : former + "; " + docomo); + } + + public void appendProp(String cps, UcdProperty ucdProperty, String docomo) { + final UnicodeMap unicodeMap = property2data.get(ucdProperty); + final String former = unicodeMap.get(cps); + unicodeMap.put(cps, former == null ? docomo : former + "; " + docomo); + } + + public void parseCp(HashMap attributes2) { + final String cpString = attributes2.get("cp"); + if (cpString != null) { + cp.start = cp.end = Integer.parseInt(cpString, 16); + } else { + cp.start = Integer.parseInt(attributes2.get("first-cp"), 16); + cp.end = Integer.parseInt(attributes2.get("last-cp"), 16); + } + } + + public UnicodeMap doAttributes(String key, String value) { + UcdProperty prop = UcdProperty.forString(key); + // if (prop == UcdProperty.Deprecated && cp.start > 0xE0000 && cp.start < + // 0xE00FF) { + // System.out.println(Utility.hex(cp.start) + "," + Utility.hex(cp.end) + + // "\t" + key + "\t" + value); + // } + if (prop == null) { + if (key.endsWith("cp")) { + if (key.equals("cp") || key.equals("last-cp") || key.equals("first-cp")) { + return null; + } + } else if (key.equals("InSC")) { + prop = UcdProperty.Indic_Syllabic_Category; + } else if (key.equals("InMC")) { + prop = UcdProperty.Indic_Syllabic_Category; + } + if (prop == null) { + return null; + } + } + final UnicodeMap data = property2data.get(prop); + if (data == null) { + System.out.println("can't get data for " + key); + return null; + } + data.putAll(cp.start, cp.end, value.intern()); + return data; + } + } + + static class MyErrorHandler implements ErrorHandler { + @Override + public void error(SAXParseException exception) throws SAXException { + // System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void fatalError(SAXParseException exception) throws SAXException { + // System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); + throw exception; + } + + @Override + public void warning(SAXParseException 
exception) throws SAXException { + // System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); + throw exception; + } + } + + public UnicodeMap getMap(UcdProperty prop) { + return property2data.get(prop); + } + + public Set getLeavesNotHandled() { + return leavesNotHandled; + } + + static String show(String ival) { + if (ival == null) { + return "null"; + } else if (ival.isEmpty()) { + return ""; + } else if (ival.codePointAt(0) < 0x20) { + return "\\u{" + Utility.hex(ival, 4) + "}"; + } + return "«" + ival + "»"; + } + + // private static final String NO_VALUE = + // IndexUnicodeProperties.DefaultValueType.NO_VALUE.toString(); + // private static final String NAN = IndexUnicodeProperties.DefaultValueType.NaN.toString(); + + static final boolean HACK_XML_DEFAULTS = false; + + public static String getXmlResolved(UcdProperty property, int codePoint, String propertyValue) { + if (property == UcdProperty.Name) { + int debug = 0; + } + switch (property.getType()) { + case Binary: + if (HACK_XML_DEFAULTS) { + if (propertyValue == null) { + propertyValue = "No"; + } else { + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + } + // $FALL-THROUGH$ + case Enumerated: + case Catalog: + if (propertyValue != null) { + propertyValue = IndexUnicodeProperties.normalizeValue(property, propertyValue); + } + break; + case Numeric: + // if (HACK_XML_DEFAULTS) { + // if (propertyValue == null || propertyValue.isEmpty()) { + // propertyValue = "NaN"; + // } + // } + switch (property) { + case kOtherNumeric: + case kPrimaryNumeric: + case kAccountingNumeric: + if (propertyValue == null || propertyValue.isEmpty()) { + propertyValue = "NaN"; + } + break; + } + break; + case Miscellaneous: + if (propertyValue != null) { + switch (property) { + case Script_Extensions: + propertyValue = + IndexUnicodeProperties.normalizeValue(property, propertyValue); + break; + // case Name: + // break; + default: + propertyValue = 
propertyValue.replace("#", Utility.hex(codePoint)); + } + } + break; + case String: + if (propertyValue != null) { + propertyValue = propertyValue.replace("#", Utility.hex(codePoint)); + propertyValue = Utility.fromHex(propertyValue); + } + break; + default: + break; + } + return propertyValue; + // return propertyValue == null ? "" : propertyValue; + } +} diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt index 7d4ce84e7..0f9cbda3d 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt @@ -67,6 +67,7 @@ CJKR ; CJK_Radical EDCM ; Emoji_DCM EKDDI ; Emoji_KDDI ESB ; Emoji_SB +EVS ; emoji_variation_sequence NS ; Named_Sequences NSP ; Named_Sequences_Prov SV ; Standardized_Variant @@ -160,6 +161,9 @@ cjkJoyoKanji ; kJoyoKanji cjkKoreanEducationHanja ; kKoreanEducationHanja cjkKoreanName ; kKoreanName cjkTGH ; kTGH +ncCorrected ; NC_Corrected +ncOriginal ; NC_Original +ncVersion ; NC_Version # 13.0 cjkSpoofingVariant ; kSpoofingVariant cjkTGHZ2013 ; kTGHZ2013 @@ -187,4 +191,4 @@ kReading ; kReading kEH_Func ; kEH_Func kEH_FVal ; kEH_FVal -kEH_UniK ; kEH_UniK \ No newline at end of file +kEH_UniK ; kEH_UniK diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt index a9b3e9f12..e280c7ff2 100644 --- a/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexPropertyRegex.txt @@ -44,9 +44,11 @@ $codePoint0 = ($codePoints)? 
# Main data Bidi_Mirroring_Glyph ; SINGLE_VALUED ; $codePoint +Bidi_Paired_Bracket ; SINGLE_VALUED ; $codePoint Simple_Lowercase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Titlecase_Mapping ; SINGLE_VALUED ; $codePoint Simple_Uppercase_Mapping ; SINGLE_VALUED ; $codePoint +Equivalent_Unified_Ideograph; SINGLE_VALUED ; $codePoint NFKC_Casefold ; SINGLE_VALUED ; $codePoint0 NFKC_Simple_Casefold ; SINGLE_VALUED ; $codePoint0 @@ -142,7 +144,7 @@ kHanYu ; MULTI_VALUED ; [1-8][0-9]{4}\.[0-3 kIRGHanyuDaZidian ; SINGLE_VALUED ; [1-8][0-9]{4}\.[0-3][0-9][01] kCNS1992 ; SINGLE_VALUED ; [1-9]-[0-9A-F]{4} kTotalStrokes ; ORDERED ; [1-9][0-9]{0,2} -kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} +kRSUnicode ; ORDERED ; [1-9][0-9]{0,2}\'?\.[0-9]{1,2} kRSJapanese ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKanWa ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} kRSKangXi ; EXTENSIBLE ; [1-9][0-9]{0,2}\.[0-9]{1,2} @@ -170,13 +172,13 @@ kHanyuPinlu ; MULTI_VALUED ; [a-z\x{308}]+[1-5]\ kCantonese ; MULTI_VALUED ; [a-z]{1,6}[1-6] kTang ; MULTI_VALUED ; \*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+ -kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? -kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) +kJinmeiyoKanji ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})? +kJoyoKanji ; MULTI_VALUED ; (20[0-9]{2})|(U\+2?[0-9A-F]{4}) kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2} -kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* -kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} +kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})* +kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3} -kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} +kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4} # Unihan properties from 13.0 and later. No regexes for now. 
@@ -199,28 +201,37 @@ kZhuangNumeric ; MULTI_VALUED ; .* kFanqie ; MULTI_VALUED ; .* kZhuang ; MULTI_VALUED ; .* +kSrc_NushuDuben ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kReading ; SINGLE_VALUED ; [a-z]{1,6}[1-6]+ +kRSTUnicode ; SINGLE_VALUED ; [0-9]+\.[0-9]+ +kTGT_MergedSrc ; SINGLE_VALUED ; L2008-[0-9A-F]{4,5}(-[0-9]{4,5})? + +NC_Original ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Corrected ; SINGLE_VALUED ; [0-9A-F]{4,5} +NC_Version ; SINGLE_VALUED ; [0-9]\.[0-9]\.[0-9] + # ============================= # Catalog/Enum/Binary Properties # All not listed are SINGLE_VALUED ; null # ============================= -Script_Extensions ; MULTI_VALUED ; -Standardized_Variant ; MULTI_VALUED ; .* +Script_Extensions ; MULTI_VALUED ; +Standardized_Variant ; MULTI_VALUED ; .* -Idn_Status ; SINGLE_VALUED ; -Idn_Mapping ; SINGLE_VALUED ; $codePoints -Idn_2008 ; SINGLE_VALUED ; +Idn_Status ; SINGLE_VALUED ; +Idn_Mapping ; SINGLE_VALUED ; $codePoints +Idn_2008 ; SINGLE_VALUED ; -Identifier_Status ; SINGLE_VALUED ; -Identifier_Type ; MULTI_VALUED ; +Identifier_Status ; SINGLE_VALUED ; +Identifier_Type ; MULTI_VALUED ; -Confusable_SL ; SINGLE_VALUED ; $codePoints -Confusable_SA ; SINGLE_VALUED ; $codePoints -Confusable_ML ; SINGLE_VALUED ; $codePoints -Confusable_MA ; SINGLE_VALUED ; $codePoints +Confusable_SL ; SINGLE_VALUED ; $codePoints +Confusable_SA ; SINGLE_VALUED ; $codePoints +Confusable_ML ; SINGLE_VALUED ; $codePoints +Confusable_MA ; SINGLE_VALUED ; $codePoints -#Emoji ; SINGLE_VALUED ; -#Emoji_Presentation ; SINGLE_VALUED ; -#Emoji_Modifier ; SINGLE_VALUED ; -#Emoji_Modifier_Base ; SINGLE_VALUED ; +#Emoji ; SINGLE_VALUED ; +#Emoji_Presentation ; SINGLE_VALUED ; +#Emoji_Modifier ; SINGLE_VALUED ; +#Emoji_Modifier_Base ; SINGLE_VALUED ; diff --git a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt index f7c9da838..5ff7cbdf5 100644 --- 
a/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt +++ b/unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt @@ -36,6 +36,8 @@ FileType ; Unihan_OtherMappings ; PropertyValue FileType ; Unihan_RadicalStrokeCounts ; PropertyValue FileType ; Unihan_Readings ; PropertyValue FileType ; Unihan_Variants ; PropertyValue +FileType ; NushuSources ; PropertyValue +FileType ; TangutSources ; PropertyValue # NameAliases File Type # Contains a multivalued property, where successive values are not in the same line, but are divided out on successive lines with the same code point @@ -43,6 +45,7 @@ FileType ; Unihan_Variants ; PropertyValue FileType ; NameAliases ; NameAliases FileType ; NameAliasesProv ; NameAliases FileType ; StandardizedVariants ; StandardizedVariants +FileType ; emoji-variation-sequences ; StandardizedVariants # CJKRadicals File Type @@ -320,6 +323,15 @@ Unihan_Variants ; kSpoofingVariant Unihan_Variants ; kTraditionalVariant Unihan_Variants ; kZVariant +NushuSources ; kSrc_NushuDuben +NushuSources ; kReading +TangutSources ; kRSTUnicode +TangutSources ; kTGT_MergedSrc + +NormalizationCorrections ; NC_Original +NormalizationCorrections ; NC_Corrected +NormalizationCorrections ; NC_Version + # Properties removed from Unihan before 5.1. # Point to a nonexistent file so that we don’t try to read them from the most recent monolithic # Unihan, as we would then get confused by the other (still-extant) properties in that file. 
@@ -438,6 +450,7 @@ EmojiSources ; Emoji_SB ; 3 NamedSequences ; Named_Sequences NamedSequencesProv ; Named_Sequences_Prov StandardizedVariants ; Standardized_Variant +emoji-variation-sequences ; emoji-variation-sequence DoNotEmit ; Do_Not_Emit_Preferred ; 1 DoNotEmit ; Do_Not_Emit_Type ; 2 @@ -488,15 +501,6 @@ emoji/*/emoji-zwj-sequences; RGI_Emoji_Zwj_Sequence #emoji/*/emoji-test ; Emoji_Short_Name - -FileType ; TangutSources ; PropertyValue -TangutSources ; kTGT_MergedSrc -TangutSources ; kRSTUnicode - -FileType ; NushuSources ; PropertyValue -NushuSources ; kSrc_NushuDuben -NushuSources ; kReading - FileType ; Unikemet ; PropertyValue Unikemet ; kEH_Cat Unikemet ; kEH_Core diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml new file mode 100644 index 000000000..617113bf2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_C { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml new file mode 100644 index 000000000..c1380221b --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Bidi_M.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Bidi_M { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml new file mode 100644 index 000000000..7c7873459 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Emoji.xml @@ -0,0 +1,20 @@ + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? 
+ + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml new file mode 100644 index 000000000..8340250dc --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InCB.xml @@ -0,0 +1,9 @@ + + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml new file mode 100644 index 000000000..a7de62387 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InPC.xml @@ -0,0 +1,21 @@ + + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml new file mode 100644 index 000000000..ddddc27a4 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/InSC.xml @@ -0,0 +1,42 @@ + + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml new file mode 100644 index 000000000..568f5e270 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/JSN.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml new file mode 100644 index 000000000..4cbf1d0f0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Join_C.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute Join_C { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml new file mode 100644 index 000000000..c2b53b2fe --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Name_Alias.xml @@ -0,0 +1,10 @@ + + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml new file mode 100644 index 000000000..8919bba32 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Nushu.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml new file mode 100644 index 000000000..a6ff2d092 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Set_of_code_points.xml @@ -0,0 +1,8 @@ + + + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml new file mode 100644 index 000000000..21e52208a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Tangut.xml @@ -0,0 +1,18 @@ + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml new file mode 100644 index 000000000..ba4c042f8 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/Unihan.xml @@ -0,0 +1,347 @@ + + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? 
+ + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? 
+ + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? 
+ + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml new file mode 100644 index 000000000..8a1722f22 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/age.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml new file mode 100644 index 000000000..d3e70a6ab --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bc.xml @@ -0,0 +1,17 @@ + + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml new file mode 100644 index 000000000..ecd721a63 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/blk.xml @@ -0,0 +1,344 @@ + + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | 
"Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | 
"Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | 
"Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml new file mode 100644 index 000000000..1d9b2beb8 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/block.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml new file mode 100644 index 000000000..d4431070d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bmg.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bmg { "" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml new file mode 100644 index 000000000..fae36d68d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boolean.xml @@ -0,0 +1,4 @@ + + + boolean = "Y" | "N" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml new file mode 100644 index 000000000..abe4ffe9a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/boundaries.xml @@ -0,0 +1,58 @@ + + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml new file mode 100644 index 000000000..3924ed3e9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpb.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml new file mode 100644 index 000000000..183c9bf3f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/bpt.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml new file mode 100644 index 000000000..8708699be --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_folding.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml new file mode 100644 index 000000000..c1296b7b9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml new file mode 100644 index 000000000..df4b97e64 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/case_other.xml @@ -0,0 +1,32 @@ + + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml new file mode 100644 index 000000000..503f05999 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/casing.xml @@ -0,0 +1,14 @@ + + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml new file mode 100644 index 000000000..8226509d7 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ccc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml new file mode 100644 index 000000000..45c49ed2c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjk-radicals.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml new file mode 100644 index 000000000..08222c4f0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkEACC.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= attribute cjkEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml new file mode 100644 index 000000000..49f9c3917 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/cjkIRG_TSource.xml @@ -0,0 +1,6 @@ + + + code-point-attributes &= attribute cjkIRG_TSource + { xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4} +| TU-[023][0-9A-F]{4}" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml new file mode 100644 index 000000000..96ce4abcf --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/composition.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml new file mode 100644 index 000000000..c26367d97 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes.xml @@ -0,0 +1,5 @@ + + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml new file mode 100644 index 000000000..c3cda88df --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/datatypes_code_points.xml @@ -0,0 +1,9 @@ + + + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml new file mode 100644 index 000000000..833a7d1e0 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/decomposition.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | 
"fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml new file mode 100644 index 000000000..97bb063e7 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/description.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element description { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml new file mode 100644 index 000000000..5381491e7 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/do-not-emit.xml @@ -0,0 +1,22 @@ + + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml new file mode 100644 index 000000000..d51bf2441 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ea.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml new file mode 100644 index 000000000..96d122953 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/emoji-sources.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml new file mode 100644 index 000000000..7ce510adc --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/function_graphic.xml @@ -0,0 +1,68 @@ + + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? 
+ + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml new file mode 100644 index 000000000..36cd1f774 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/gc.xml @@ -0,0 +1,12 @@ + + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml new file mode 100644 index 000000000..11f3b0dd9 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/groups.xml @@ -0,0 +1,8 @@ + + + + group = + element group { + code-point-attributes, + code-point* } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml new file mode 100644 index 000000000..385cd466a --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/hst.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml new file mode 100644 index 000000000..0ab95a27f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/identifier.xml @@ -0,0 +1,26 @@ + + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml new file mode 100644 index 000000000..0c758e342 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/ideographs.xml @@ -0,0 +1,23 @@ + + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml new file mode 100644 index 000000000..f19b59317 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/isc.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute isc { text }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml new file mode 100644 index 000000000..9a6820c7b --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/jis-code-point.xml @@ -0,0 +1,5 @@ + + + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml new file mode 100644 index 000000000..184fcca14 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/joining.xml @@ -0,0 +1,53 @@ + + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml new file mode 100644 index 000000000..ee1f36cac --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/lb.xml @@ -0,0 +1,24 @@ + + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml new file mode 100644 index 000000000..5dafe8c22 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/miscellaneous.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml new file mode 100644 index 000000000..4c4644c31 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na.xml @@ -0,0 +1,13 @@ + + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml new file mode 100644 index 000000000..592de98c3 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/na1.xml @@ -0,0 +1,5 @@ + + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml new file mode 100644 index 000000000..2859ea29d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/named-sequences.xml @@ -0,0 +1,15 @@ + + + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml new file mode 100644 index 000000000..e75306a26 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/namespace.xml @@ -0,0 +1,5 @@ + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml new file mode 100644 index 000000000..7231a8c26 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/normalization-corrections.xml @@ -0,0 +1,11 @@ + + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml new file mode 100644 index 000000000..24230aee1 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/numeric.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml new file mode 100644 index 000000000..baa00a73c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/pattern.xml @@ -0,0 +1,8 @@ + + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml new file mode 100644 index 000000000..224c2287e --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/quickcheck.xml @@ -0,0 +1,31 @@ + + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml new file mode 100644 index 000000000..0cfc86e40 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire.xml @@ -0,0 +1,6 @@ + + + + ucd.content &= + element repertoire { (code-point | group) + }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml new file mode 100644 index 000000000..cdfd1ad88 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/repertoire_Code_points.xml @@ -0,0 +1,23 @@ + + + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml new file mode 100644 index 000000000..b22243aaf --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/script.xml @@ -0,0 +1,49 @@ + + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | 
"Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml new file mode 100644 index 000000000..e2acb669c --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/simple_case_mapping.xml @@ -0,0 +1,11 @@ + + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? 
+ \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml new file mode 100644 index 000000000..a415a1152 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/standardized-variants.xml @@ -0,0 +1,10 @@ + + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml new file mode 100644 index 000000000..ba0e2262f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/fragments/start.xml @@ -0,0 +1,6 @@ + + + + start = + element ucd { ucd.content } + \ No newline at end of file diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index.xml b/unicodetools/src/main/resources/org/unicode/uax42/index.xml new file mode 100644 index 000000000..c0f05f5c2 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index.xml @@ -0,0 +1,1353 @@ + + +
+ + Unicode Character Database in XML + + + + + 2025 + + + + Wilcock + John + + + + + + New value for the age attribute: 16.0. + + New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_for_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. + + New values for the script attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. + + New value for the jg attribute: Kashmiri_Yeh. + New value for the InSC attribute: Reordering_Killer. + + New attributes: MCM, kFanqie, kZhuang. + + Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. + + Added the do-not-emit element. + + + + Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. + + + + New value for the age attribute: 15.1. + + New value for the blk attribute: CJK_Ext_I. + + New values for the lb attribute: AK, AP, + AS, VF, VI. + + Modified values for the number, radical attributes of the + cjk-radical + element. + + Changed single value into list for the nv code point attribute. + + New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. + + Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. + + Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. + + New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. + + + + Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. + + + + New value for the age attribute: 15.0. + + New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. + + New values for the script attribute: Kawi, Nagm. + + New Unihan attribute: kAlternateTotalStrokes. 
+ + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. + + + + Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. + + + + New value for the age attribute: 14.0. + + New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. + + New values for the script attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. + + New values for the jg attribute: Thin_Yeh, Vertical_Tail. + + New Unihan attribute: kStrange. + + Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. + + Removal of the kWubi attribute, which has never been present in + released versions of the UCD. + + + + Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. + + + + New value for the age attribute: 13.0. + + New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. + + New values for the script attribute: Chrs, Diak, + Kits, Yezi. + + New value for the InPC attribute: Top_And_Bottom_And_Left. + + New Unihan attributes kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. + + New Emoji attributes Emoji, EPres, EMod, + EBase, EComp, ExtPict. + + Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. + + + + Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. + + + + New value for the age attribute: 12.1. + + + + + + New value for the age attribute: 12.0. + + New values for the script attribute: Elym, Hmnp, + Nand, Wcho. 
+ + New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. + + Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. + + + + Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. + + + + New value for the age attribute: 11.0. + + New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. + + New values for the script attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. + + New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. + + New value for the wb attribute: WSegSpace. + + New values for the InSC attribute: Consonant_Initial_Postfixed. + + New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. + + Modified patterns for the kTGT_MergedSrc attribute. + + Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. + + + + Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. + + + + New value for the age attribute: 10.0. + + New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. + + New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. + + New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. + + New value for the InPC attribute: Bottom_And_Left. + + Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. 
+ + New code point attributes: vo, + RI + + New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. + + + + Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. + + + + New value for the age attribute: 9.0. + + New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. + + New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. + + New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. + + New values for the lb attribute: EB, EM, ZWJ. + + New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. + + New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. + + Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. + + + + Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. + + + + New value for the age attribute: 8.0. + + New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. + + New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. + + New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. + + New code point attributes: InPC, kJa. + + New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. + + Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008. + + + Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. + + + + New value for the age attribute: 7.0. + + New values for the jg attribute. + + New values for the sc attribute. + + New values for the blk attribute. + + New values for the InSC attribute. 
+ + New values for the kIICore attribute. + + New values for the kIRG_GSource attribute. + + + + Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. + + + + New value for the age attribute: 6.3. + + New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). + + New code point attributes bpt and bpb (for Unicode 6.3). + + New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). + + Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). + + Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). + + Clarified that the child elements of list-like elements are in no particular order. + + + Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. + + + + New value for the age attribute: 6.2. + + New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). + + Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). + + + + Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. + + + + Clarified the default values. + Indicate that property values may change from one release to the next. + Introduced the blk attribute, for the Block property. + + Introduced the scx attribute, for the ScriptExtensions property. + + Introduced the name-alias element, for the Name_Alias property. + + New value for the age attribute: 6.1. + + New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. + + New values for the lb attribute: HL and CJ. + + New value for the jg attribute: Rohingya_Yeh. + + The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. + + For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. + + The values of the ccc are now restricted to 0..254, instead of 0..255.
+ + Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. + + + + Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. + + + + New value for the age attribute: 6.0. + + New value for the jg attribute: + Teh_Marbuta_Goal + + New values for the script attribute: Batk, Brah, + Mand. + + Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. + + Added the InSC and InMC elements. + + Added the emoji-sources element. + + + + Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. + + + + Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point + + Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. + + Changed the type of standardized-variants/@cps from text to + two-code-points. + + New values for the jg attribute: Farsi_Yeh and Nya. + + New value for the age attribute: 5.2. + + New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. + + New value for the lb attribute: CP. + + New value for the sc attribute: Zinh. + + New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. + + New attributes kHanyuPinyin and kIRG_MSource. + + New element + cjk-radicals + + Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. + + Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to mechanically convert our schema to other schema languages.
+ + + + Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. + + + + First approved version, for Unicode 5.1.0. + For optional elements which act as collections, such as repertoire and + named-sequences, impose that there be at least one element in the collection. + + Remove the constraint that the value jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. + + Value NL added to the WB attribute (for Unicode 5.1). + + Value PP added to the GCB attribute (for Unicode 5.1). + + Corrected the Vai script value to Vaii. + + Removed the discussion of elements or attributes in different namespaces. + Removed the code-point element. + + + + + + Promoted to Draft UAX. + Changed the title from "An XML representation of the UCD" + Value 5.1 added to the age attribute (for Unicode 5.1). + + Value SM added to the gcb attribute (for Unicode 5.1). + + Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). + + Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). + + Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). + + Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). + + Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). + + + jamo + attribute renamed to + JSN + + + sfc + attribute renamed to + scf + + Attribute kXHC1983 added (for Unicode 5.1.0). + + Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). + + Element provisional-named-sequences added (for Unicode 5.0) + + + + + + First working draft. + + + + + + + This annex describes an XML representation of the Unicode Character Database. + + + + +
+ Introduction + In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. + + Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. + + Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. + + Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. + + This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. + + It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. 
Thus, we are not concerned so much by the speed of processing or + the size of our representation. + + Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. + + This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. + +
+ + + +
+ Overall schema + +
+ General principles + Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. + + Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. + + Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax, as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. + + An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. + + While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). + + Many invariants impose constraints on the values of the different properties for a given code point.
+ For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be the empty string; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. + +
+ + +
+ Namespace + The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. + + + In all our examples, we assume that this namespace is the default one. + +
+ + +
+ Datatypes + We use standard XML Schema datatypes: + + Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is, have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. + + +
+ + +
+ Root Element + The root element of valid documents is a ucd. + + +
+ + +
+ Common attributes + A large number of properties are boolean. We uniformly use the values Y and + N for those: + + +
+ + +
+ Ordering of elements + In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. + +
+
+ + +
+ Description + The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. + + It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. + + +
+ + +
+ Repertoire + The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: + + + + +
+ Sets of code points + It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. + + + This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. + + In the repertoire, there must be at most one code-point + element for a given code point. + +
+ + +
+ Code point types + When thinking about Unicode code points, it is useful to split them into four types: + + + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points + + This leads to four elements to describe sets of code points: + + +
+ + +
+ Group + While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. + + For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. + + + Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not list explicitly a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: + + + <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/> + <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/> + <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/> + is equivalent to this fragment which uses a group: + + + <group age="3.2" gc="Lo" sc="Buhd"> + <char cp="1740" na="BUHID LETTER A"/> + <char cp="1741" na="BUHID LETTER I"/> + <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/> + <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/> + </group> + The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”.
+ + As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. + + Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. + + +
+ + +
+ Properties + Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. + + The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. + + The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. + + For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. + + Note that the set of possible values for a property captured in this schema may change from one + version to the next. + + + +
+ Age property + The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or non-character. + + +
+ + +
+ Name properties + There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). + + + + The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + and + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: + + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/> + is equivalent to + + <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/> + <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> + which in turn is equivalent to: + + <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/> +
+ + +
+ Name Alias properties + The Name_Alias property is represented by zero or more name-alias + child elements: + + +
+ + +
+ Block property + The Block property is represented by the blk attribute: + + +
+ + +
+ General Category + The general category is represented by the gc attribute. + + +
+ + +
+ Combining properties + The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. + + Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. + + +
+ + +
+ Bidirectionality properties + The bidirectional class is represented by the bc attribute. + + + The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. + + + The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. + + + Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. + + The Bidi_Control property is represented by the Bidi_C attribute. + + + The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. + + + +
+ + +
+ Decomposition properties + The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. + + Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. + + + The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: + + + The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, + FC_NFKC_Closure have corresponding attributes. + + +
+ + +
+ Numeric Properties + The numeric type is represented by the nt attribute. + + The numeric value is represented by the nv attribute, represented as a whole + number or a fraction. + + +
+ + +
+ Joining properties + The joining class of a character is represented by the jt attribute. + + The jg attribute is the joining group of the character. + + + The Join_Control property is represented by the Join_C attribute. + + +
+ + +
+ Linebreak properties + The Line_Break property is represented by the lb attribute. + + +
+ + +
+ East Asian Width property + The East Asian width property is represented by the ea attribute. + + +
+ + +
+ Case properties + The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. + + + Most characters have a case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. + + The simple case mappings are recorded in the suc, slc, stc + attributes. + + + The non-simple case mappings are recorded in the uc, lc and tc + attributes. + + + The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. + + + The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: + + + Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. + +
+ + +
+ Script properties + The script and script extension properties are represented by the sc and + scx attributes respectively. + + +
+ + +
+ ISO Comment properties + The ISO 10646 comment field is represented by the isc attribute. + + +
+ + +
+ Hangul properties + The property Hangul_Syllable_Type is represented by the hst attribute. + + + The property Jamo_Short_Name is represented by the JSN attribute: + + +
+ + +
+ Indic properties + The property Indic_Syllabic_Category is represented by the InSC + attribute. + + + The property Indic_Positional_Category is represented by the InPC + attribute: + + + The property Indic_Conjunct_Break is represented by the InCB attribute: + + +
+ + +
+ Identifier and Pattern and programming language properties + + The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: + + + The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: + + +
+ + +
+ Properties related to function and graphic characteristics + The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each have a corresponding + attribute. + + +
+ + +
+ Properties related to boundaries + The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: + + +
+ + +
+ Properties related to ideographs + The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: + + +
+ + +
+ Miscellaneous properties + The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: + + +
+ + +
+ Unihan properties + The Unihan properties (from the Unihan database) are represented as attributes. + + +
+ + +
+ Tangut data + The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. + + +
+ + +
+ Nushu data + The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. Nushu common + reading is represented as kReading. + +
+ + +
+ Emoji properties + The properties Emoji, EPres, EMod, EBase, + EComp, and ExtPict have corresponding attributes: + + +
+
+
+ + +
+ Blocks + The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. + + +
+ + +
+ Named Sequences + The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. + + Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. + + +
+ + +
+ Normalization Corrections + The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. + + +
+ + +
+ Standardized Variants + The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. + + +
+ + +
+ CJK Radicals + The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. + + +
+ + +
+ Emoji sources + The emoji-sources child of the ucd describes the emoji sources. + + + + +
+ + +
+ Do Not Emit + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + + + +
+ + +
+ The full schema + Our schema is just the accumulation of the pieces we have described so far: + + + + + + + + + + + + + + + + + + + + + An expanded version is linked from the top of this document. +
+ + +
+ Examples + Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + +
+ + + + Acknowledgments + Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. + + +
diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl new file mode 100644 index 000000000..f0a95fa95 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2html.xsl @@ -0,0 +1,611 @@ + + + + + + + + + + + + + + + + + + + + + <xsl:choose> + <xsl:when test="articleinfo/unicode:tr/@class='uax'"> + <xsl:text>UAX</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='uts'"> + <xsl:text>UTS</xsl:text> + </xsl:when> + <xsl:when test="articleinfo/unicode:tr/@class='utr'"> + <xsl:text>UTR</xsl:text> + </xsl:when> + </xsl:choose> + <xsl:text> #</xsl:text> + <xsl:value-of select="articleinfo/unicode:tr/@number"/> + <xsl:text>: </xsl:text> + <xsl:value-of select="title"/> + + + + + + + + + + + + +
+ + [Unicode] +  Technical Reports +
 
+
+

+ + + + + Unicode® Standard Annex + + + Unicode® Technical Standard + + + Unicode® Technical Report + + + # + +

+

+ + +
+ + +
+ + + + + + +
+ +

Modifications

+

This section indicates the changes introduced by each revision.

+ +
+ +
+ + + + + Working draft + + + Proposed Update + + + + + + + + + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .html + + + + https://www.unicode.org/reports/tr + + / + + + + https://www.unicode.org/reports/tr + + /tr + + - + + .rnc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Version + Unicode + + + + +
+ + + Editor + + + Editors + + + + +
Date + + + + +
This Version + + + + +
Previous Version + + + n/a + + + + + + + + +
Latest Version + +
Latest Proposed Update + proposed.html +
Schema + + + + +
Revision + + + + + + +
+
+ + + + + + + + + +
+
+ + + ( + mailto: + ) + + + + + +

Summary

+ +
+ + +

+
+ + + + +

Status

+ + +

This document has been reviewed by Unicode members and other interested parties, and has been + approved for publication by the Unicode Consortium. This is a stable document and may be used as reference + material or cited as a normative reference by other specifications.

+
+ +

+ + This is a draft document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress.

+
+
+ + +
+

A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part.

+
+

Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata].

+
+ +
+

A Unicode Technical Standard (UTS) is an independent specification. Conformance to the Unicode + Standard does not imply conformance to any UTS.

+
+

Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

+
+ +
+

A Unicode Technical Report (UTR) contains informative material. Conformance to the Unicode + Standard does not imply conformance to any UTR. Other specifications, however, are free to make normative + references to a UTR.

+
+

Please submit corrigenda and other comments with the online reporting form [ + Feedback]. Related information that is useful in understanding this document is found in References. For the latest version of the Unicode Standard see [Unicode]. For a list of current Unicode Technical Reports see [Reports]. For more information about versions of the Unicode Standard, see + [Versions].

+
+
+
+ + + + +

Contents

+ +
+ + +
  • + + +
      + +
    +
    +
  • +
    + + + + + + +      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    + + + +

    +
    + + +

    +
    + + +
    +
    + + + + + + + + + _blank + + + + + + + + + + + + + + + + + + + [ + + + + + + + + + + + + : + + + , ] + + + + +

    + [, + ] + + = + + + +

    +
    + + +

    + [] + + = + +

    +
    + + + + + + + + +
    +

    + Revision +

    + +
    +
    + + +
    +

    + +

    +
    +
    + + +
      + +
    +
    + + +
  • + +
  • +
    + + + + + + + + + + + + + + + + background-color: #ffff00; border-style:dotted; border-width:1px + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    diff --git a/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl new file mode 100644 index 000000000..b7a8dfa81 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/index2rnc.xsl @@ -0,0 +1,45 @@ + + + + + + + + + + + + # Copyright © Unicode, Inc. + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.html b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html new file mode 100644 index 000000000..13bf8181d --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.html @@ -0,0 +1,3482 @@ + + + + + + + UAX #42: Unicode Character Database in XML + + + + + + + + + + + +
    + + [Unicode] +  Technical Reports +
     
    +
    +

    + Proposed Update Unicode® Standard Annex #42

    +

    Unicode Character Database in XML

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    VersionUnicode 16.0.0 +
    + Editor + + John Wilcock
    +
    Date + 2024-08-15 +
    This Version + + https://www.unicode.org/reports/tr42/tr42-36.html + +
    Previous Version + + https://www.unicode.org/reports/tr42/tr42-34.html + +
    Latest Version + https://www.unicode.org/reports/tr42/ +
    Latest Proposed Update + https://www.unicode.org/reports/tr42/proposed.html +
    Schema + + https://www.unicode.org/reports/tr42/tr42-36.rnc + +
    Revision + + 36 + +
    +

    Summary

    +

    + This annex describes an XML representation of the Unicode Character Database. +

    +

    + Status +

    +

    + This is a + draft + document which may be updated, replaced, or + superseded by other documents at any time. Publication does not imply endorsement by the Unicode + Consortium. This is not a stable document; it is inappropriate to cite this document as other than a + work in progress. +

    +
    +

    + + A Unicode Standard Annex (UAX) forms an integral part of the Unicode Standard, but is + published online as a separate document. The Unicode Standard may require conformance to normative + content in a Unicode Standard Annex, if so specified in the Conformance chapter of that version of the + Unicode Standard. The version number of a UAX document corresponds to the version of the Unicode Standard + of which it forms a part. +

    +
    +

    + Please submit corrigenda and other comments with the online reporting form [Feedback]. Related information that is useful in + understanding this annex is found in Unicode Standard Annex #41, “Common References for Unicode Standard + Annexes.” For the latest version of the Unicode Standard, see [Unicode]. For a list of current Unicode + Technical Reports, see [Reports]. For more information about + versions of the Unicode Standard, see [Versions]. For any + errata which may apply to this annex, see [Errata]. +

    +

    Contents

    + +
    +

    + 1 Introduction +

    +

    In working on Unicode implementations, it is often useful to access the full content of the Unicode + Character Database (UCD). For example, in establishing mappings from characters to glyphs in fonts, it is + convenient to see the character scalar value, the character name, the character East Asian width, along with + the shape and metrics of the proposed glyph to map to; looking at all this data simultaneously helps in + evaluating the mapping. +

    +

    Directly accessing the data files that constitute the UCD is sometimes a daunting proposition. The data is + dispersed in a number of files of various formats, and there are just enough peculiarities (all justified by + the processing power available at the time the UCD representation was designed) to require a fairly intimate + knowledge of the data format itself, in addition to the meaning of the data. +

    +

    Many programming environments (for example, Java or ICU) do give access to the UCD. However, those + environments tend to lag behind releases of the standard, or support only some of the UCD content. +

    +

    Unibook is a wonderful tool to explore the UCD and in many cases is just the ticket; however, it is + difficult to use when the task at hand has not been built-in, or when non-UCD data is to be displayed as + well. +

    +

    This annex presents an alternative representation of the UCD, which is meant to overcome these + difficulties. We have chosen an XML representation, because parsing becomes a non-issue: there are a number + of XML parsers freely available, and using them is often fairly easy. In addition, there are freely + available tools that can perform powerful operations on XML data; for example, XPATH and XQUERY engines can + be thought of as a “grep” for XML data and XSLT engines can be thought of as + “awk” for XML data. +

    +

    It is important to note that we are interested in exploring the content of the UCD, rather than in using + the UCD data to process character streams. Thus, we are not concerned so much by the speed of processing or + the size of our representation. +

    +

    Our representation supports the creation of documents that represent only parts of the UCD, either by not + representing all the characters, or by not representing all the properties. This can be useful when only + some of the data is needed. +

    +

    This annex presents only the XML representation format of the UCD. The data itself is part of the Unicode + Character Database. +

    +

    + 2 Overall schema +

    +

    + 2.1 General principles +

    +

    Our schema can be used to create and validate documents which are intended to represent properties of + Unicode code points, blocks, named sequences, normalization corrections, standardized variants, CJK + radicals and emoji sources. A document may represent the values actually assigned in a given version of + the UCD, or it may represent a draft version of the UCD, or a private agreement on Private Use + characters. The validity of an XML document with respect to the schema defined in this annex does not + assert anything about the correctness of the values. +

    +

    Valid documents may provide values for only some of the code points, or some of the Unicode + properties. Furthermore, they may also incorporate non-Unicode properties. +

    +

    Our schema is defined using English. However, a useful subset of the validity constraints can be + captured using a schema language, thereby simplifying the task of validating documents. We have chosen + Relax NG [ISO 19757], + in the compact syntax , as the schema language. It is important to stress that the schema which is + defined in English imposes more constraints on the documents than can be validated with the Relax NG + schema. +

    +

    An important characteristic of Relax NG is that its schemas do not modify or augment the infoset of + the documents. Therefore, it is possible to process our XML representation without using the schema. + Also, the schema is relatively straightforward and can be converted mechanically to other schema + languages. +

    +

    While our XML representation is not intended to be used during processing of characters and strings, + it is still a design principle for our schema to support the relatively efficient representation of the + UCD. This is achieved by an inheritance mechanism, similar to property inheritance in CSS or in XSL:FO + (see section 4.3 Group). +

    +

    Many invariants impose constraints on the values of the different properties for a given code point. + For example, if the value of the Numeric Type property is None, then the value of the + Numeric Value property should be the empty string; and if the value of the Other + Alphabetic property is true, then the value of the Alphabetic property should be + true. Those invariants are not captured in the schema. +

    +

    + 2.2 Namespace +

    +

    The namespace for our elements is “http://www.unicode.org/ns/2003/ucd/1.0”. Our + attributes are in the empty namespace. +

    +

    + + [namespace declaration, + 1] + + = + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + +

    +

    In all our examples, we assume that this namespace is the default one. +

    +

    + 2.3 Datatypes +

    +

    We use the standard XML Schema datatypes:

    +

    + + [datatypes declaration, + 2] + + = + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + +

    +

    Characters are pervasive in the UCD, and will need to be represented. Representing characters directly + by themselves would seem the most obvious choice; for example, we could express that the decomposition + of U+00E8 is “&#x0065;&#x0300;”, that is, have exactly two characters in (the + infoset of) the XML document. However, the current XML specification limits the set of characters + that can be part of a document. Another problem is that the various tools (XML parser, XPATH engine, + etc.) may equate U+00E8 with U+0065 U+0300, thus making it difficult to figure out which of the two + sequences is contained in the database (which is sometimes important for our purposes). Therefore, we + chose instead to represent characters by their code points; we follow the usual convention of four to + six hexadecimal digits (uppercase) and code points in a sequence separated by space; for example, the + decomposition of U+00E8 will be represented by the nine characters “0065 0300” in the + infoset. +

    +

    + + [datatype for code points, + 3] + + = + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + +

    +

    + 2.4 Root Element +

    +

    The root element of valid documents is a ucd. +

    +

    + + [schema start, + 4] + + = + + start = + element ucd { ucd.content } + +

    +

    + 2.5 Common attributes +

    +

    A large number of properties are boolean. We uniformly use the values Y and + N for those: +

    +

    + + [boolean, + 5] + + = + + boolean = "Y" | "N" + +

    +

    + 2.6 Ordering of elements +

    +

    In elements that hold lists of child elements, such as repertoire, + group, or standardized-variants, the schema does not require that the + child elements be in any particular order. +

    +

    + 3 Description +

    +

    The root element may have a description child element, which in turn contains any string, + which is meant to describe what the XML document purports to describe. +

    +

    It is recommended that if the document purports to represent the UCD of some Unicode version, the + description be selected in accord with the rules listed in [Versions]; and + conversely, that documents which do not purport to represent the UCD be described as such. +

    +

    + + [description, + 6] + + = + + ucd.content &= + element description { text }? + +

    +

    + 4 Repertoire +

    +

    The repertoire child element of the ucd element describes the code points and + their properties. As we will see shortly, code points can be described individually or as part of a group: +

    +

    + + [repertoire, + 7] + + = + + ucd.content &= + element repertoire { (code-point | group) + }? + +

    +

    + 4.1 Sets of code points +

    +

    It is often the case that successive code points have the same property values, for a given set of + properties. The most striking example is that of an unallocated plane, where all but the last two + code points are reserved and have the same property values. Another example is the URO (U+4E00 + .. U+9FA5) where all the code points have the same property values if we ignore their name and their + Unihan properties. +

    +

    + + [Set of code points, + 8] + + = + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + +

    +

    This observation suggests that it is profitable to represent sets of code points which share the + same properties, rather than individual code points. To make the representation of the sets simple, + we restrict them to be segments in the code point space, that is a set is defined by the first and + last code point it contains. Those are captured by the attributes first-cp and + last-cp. The attribute cp is a shorthand notation for the case where the set + has a single code point. +

    +

    In the repertoire, there must be at most one code-point + element for a given code point. +

    +

    + 4.2 Code point types +

    +

    When thinking about Unicode code points, it is useful to split them into four types: +

    + those assigned to abstract characters (PUA or not) + the noncharacters + the surrogate code points + the reserved code points +

    This leads to four elements to describe sets of code points: +

    +

    + + [Code points, + 9] + + = + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + +

    +

    + 4.3 Group +

    +

    While we already recognized the situation where a set of code points have exactly the same set of + property values, another common situation is that of code points which have almost all the same + property values. +

    +

    For example, the characters U+1740 BUHID LETTER A .. U+1753 BUHID VOWEL SIGN U all have the age + “3.2”, and all have the script “Buhd”. On the one hand, it is convenient + to support data files in which those properties are explicitly listed with every code point, as this + makes answering questions like “what is the age of U+1749?” easier, because that data + is expressed right there. On the other hand, this leads to rather large data files, and it also tends + to obscure the differences between similar characters. +

    +

    Our representation accounts for this situation with the notion of groups. A + group element is simply a container of code points that also holds default values for + the properties. If a code point inside a group does not explicitly list a property but the + group lists it, then the code point inherits that property from its + group. For example, the fragment with explicit properties: +

    +
    +    <char cp="1740" age="3.2" na="BUHID LETTER A" gc="Lo" sc="Buhd"/>
    +    <char cp="1741" age="3.2" na="BUHID LETTER I" gc="Lo" sc="Buhd"/>
    +    <char cp="1752" age="3.2" na="BUHID VOWEL SIGN I" gc="Mn" sc="Buhd"/>
    +    <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" gc="Lo" sc="Mong"/>
    +

    is equivalent to this fragment which uses a group: +

    +
    +    <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +    </group>
    +

    The element for U+1740 does not have the age attribute, and it therefore inherits it + from its enclosing group element, that is “3.2”. On the other hand, + the element for U+1820 does have this attribute, so the value is “3.0”. +

    +

    As this example illustrates, the notion of group does not necessarily align with the + notion of Unicode block. It is entirely defined and limited to our representation. In particular, the + value of a property for a code point can always be determined from the XML document alone, assuming + that this property and this code point are expressed at all. Of course, one may create an XML + representation where the groups happen to coincide with the Unicode blocks. +

    +

    Groups cannot be nested. The motivation for this limitation is to make the life of consumers + easier: either a property is defined by the element for a code point, or it is defined by the + immediately enclosing group element. +

    +

    + + [groups, + 10] + + = + + group = + element group { + code-point-attributes, + code-point* } + +

    +

    + 4.4 Properties +

    +

    Each property, except for the Special_Case_Condition and Name_Alias + properties, is represented by an attribute. In an XML data file, the absence of an attribute (possibly + only on some code-points) means that the document does not express the value + of the corresponding property. Conversely, the presence of an attribute is an expression of the + corresponding property value; the implied null value is represented by the empty string. +

    +

    The Name_Alias property is represented by zero or more name-alias child + elements. Unlike the situation for properties represented by attributes, it is not possible to determine + whether all the aliases have been represented in a data file by inspecting that data file. +

    +

    The name of an attribute is the abbreviated name of the property as given in the file + PropertyAliases.txt in the corresponding version of the UCD. For the Unihan + properties, the name is that given in the various versions of the Unihan database. +

    +

    For catalog and enumerated properties, the values are those listed in the file + PropertyValueAliases.txt in the corresponding version of the UCD; if there is an abbreviated + name, it is used, otherwise the long name is used. +

    +

    Note that the set of possible values for a property captured in this schema may change from one + version to the next. +

    +

    + 4.4.1 Age property +

    +

    The age attribute captures the version of Unicode in which a code point was + assigned to an abstract character, or made a surrogate or noncharacter. +

    +

    + + [age attribute, + 11] + + = + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? + +

    +

    + 4.4.2 Name properties +

    +

    There are two name properties: the name given by the current version of the standard + (na), and possibly the name this character had in version 1.0 of the standard + (na1). +

    +

    + + [na attribute, + 12] + + = + + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + +

    +

    + + [na1 attribute, + 13] + + = + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + +

    +

    The majority of the characters in Unicode have a name which is of the form CJK UNIFIED + IDEOGRAPH-<code point>. It also happens that character names cannot + contain the character U+0023 # NUMBER SIGN, so we adopted the following convention: if a + code point has the attribute na (either directly or by inheritance from an enclosing + group), then occurrences of the character # in the name are to be interpreted as the value of the + code point. For example: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +

    and

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    are equivalent. The # can be in any position in the value of the na + attribute. The convention also applies just as well to a set of multiple code points: +

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-3400"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-3401"/>
    +

    is equivalent to

    +
    +    <char cp="3400" na="CJK UNIFIED IDEOGRAPH-#"/>
    +    <char cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    which in turn is equivalent to:

    +
    +    <char first-cp="3400" last-cp="3401" na="CJK UNIFIED IDEOGRAPH-#"/>
    +

    + 4.4.3 Name Alias properties +

    +

    The Name_Alias property is represented by zero or more name-alias + child elements: +

    +

    + + [name-alias element, + 14] + + = + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + +

    +

    + 4.4.4 Block property +

    +

    The Block property is represented by the blk attribute: +

    +

    + + [blk attribute, + 15] + + = + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + | "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | 
"Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | "Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | 
"Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | "Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | 
"Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + +

    +

    + 4.4.5 General Category +

    +

    The general category is represented by the gc attribute. +

    +

    + + [gc attribute, + 16] + + = + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + +

    +

    + 4.4.6 Combining properties +

    +

    The combining class is represented by the ccc attribute, which holds the decimal + representation of the combining class. +

    +

    Because the set of values that this property has taken across the various versions of the UCD + is rather large, our schema does not restrict the possible values to those actually used. +

    +

    + + [ccc attribute, + 17] + + = + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + +

    +

    + 4.4.7 Bidirectionality properties +

    +

    The bidirectional class is represented by the bc attribute. +

    +

    + + [bc attribute, + 18] + + = + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + +

    +

    The mirrored property is represented by the Bidi_M attribute, which takes a + boolean value. +

    +

    + + [Bidi_M attribute, + 19] + + = + + code-point-attributes &= + attribute Bidi_M { boolean }? + +

    +

    The bmg attribute is the code point of a character whose glyph is typically + a mirrored image of the glyph for the current character. +

    +

    + + [bmg attribute, + 20] + + = + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + +

    +

    Note that we do not express the “Best Fit” element recorded in BidiMirroring.txt. + For one thing, it is not meant to be machine readable. More importantly, the idea underlying the + mirrored glyph is delicate to use, since it makes assumptions about the design of the fonts, and + the best fit goes even farther. +

    +

    The Bidi_Control property is represented by the Bidi_C attribute. +

    +

    + + [Bidi_C attribute, + 21] + + = + + code-point-attributes &= + attribute Bidi_C { boolean }? + +

    +

    The bidi paired bracket type and bidi paired bracket properties are represented by the + bpt and bpb attributes respectively. +

    +

    + + [bpt attribute, + 22] + + = + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + +

    +

    + + [bpb attribute, + 23] + + = + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + +

    +

    + 4.4.8 Decomposition properties +

    +

    The decomposition type and decomposition mapping properties are represented by the dt + and dm attributes. +

    +

    Most characters have a decomposition mapping to themselves. This is very similar to the + situation we encountered with names, and we adopted a similar convention: if the value of a + decomposition mapping is the character itself, we use the attribute value # (U+0023 # + NUMBER SIGN) as a shorthand notation; this enables those attributes to be captured in groups. +

    +

    + + [decomposition properties, + 24] + + = + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? + + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + +

    +

    The properties Composition_Exclusion and Full_Composition_Exclusion are + represented by the attributes CE and Comp_Ex: +

    +

    + + [composition properties, + 25] + + = + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + +

    +

    The properties NFC_Quick_Check, NFD_Quick_Check, + NFKC_Quick_Check, NFKD_Quick_Check, Expands_On_NFC, + Expands_On_NFD, Expands_On_NFKC, Expands_On_NFKD, and + FC_NFKC_Closure have corresponding attributes. +

    +

    + + [quick check properties, + 26] + + = + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + +

    +

    + 4.4.9 Numeric Properties +

    +

    The numeric type is represented by the nt attribute. +

    +

    The numeric value is represented by the nv attribute, expressed as a whole + number or a fraction. +

    +

    + + [numeric properties, + 27] + + = + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + +

    +

    + 4.4.10 Joining properties +

    +

    The joining type (Joining_Type) of a character is represented by the jt attribute. +

    +

    The jg attribute is the joining group of the character. +

    +

    + + [joining properties, + 28] + + = + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? + + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + +

    +

    The Join_Control property is represented by the Join_C attribute. +

    +

    + + [joining properties, + 29] + + = + + code-point-attributes &= + attribute Join_C { boolean }? + +

    +

    + 4.4.11 Linebreak properties +

    +

    The Line_Break property is represented by the lb attribute. +

    +

    + + [lb attribute, + 30] + + = + + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + +

    +

    + 4.4.12 East Asian Width property +

    +

    The East Asian width property is represented by the ea attribute. +

    +

    + + [ea attribute, + 31] + + = + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + +

    +

    + 4.4.13 Case properties +

    +

    The Uppercase, Lowercase, Other_Uppercase and + Other_Lowercase properties are represented by corresponding attributes. +

    +

    + + [casing properties, + 32] + + = + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + +

    +

    Most characters have case mapping and case folding properties that simply map or fold to + themselves. This is very similar to the situation we encountered with names, and we adopted a + similar convention: if the value of a case mapping or case folding property is the character + itself, we use the attribute value # (U+0023 # NUMBER SIGN) as a shorthand notation; this + enables those attributes to be captured in groups. +

    +

    The simple case mappings are recorded in the suc, slc, stc + attributes. +

    +

    + + [casing properties, + 33] + + = + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + +

    +

    The non-simple case mappings are recorded in the uc, lc and tc + attributes. +

    +

    + + [casing properties, + 34] + + = + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + +

    +

    The Simple_Case_Folding and Case_Folding properties are recorded in the + scf and cf attributes respectively. +

    +

    + + [casing properties, + 35] + + = + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + +

    +

    The Case_Ignorable, Cased, Changes_When_Casefolded, + Changes_When_Casemapped, Changes_When_Lowercased, + Changes_When_NFKC_Casefolded, Changes_When_Titlecased, + Changes_When_Uppercased, NFKC_Casefold, and + NFKC_Simple_Casefold properties are recorded in these attributes: +

    +

    + + [casing properties, + 36] + + = + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? + +

    +

    Note that the UCD records more information about case folding than is expressed in the + properties, specifically the entries in CaseFolding.txt with status T. +

    +

    + 4.4.14 Script properties +

    +

    The script and script extension properties are represented by the sc and + scx attributes respectively. +

    +

    + + [script properties, + 37] + + = + + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + +

    +

    + 4.4.15 ISO Comment properties +

    +

    The ISO 10646 comment field is represented by the isc attribute. +

    +

    + + [isc attribute, + 38] + + = + + code-point-attributes &= + attribute isc { text }? + +

    +

    + 4.4.16 Hangul properties +

    +

    The property Hangul_Syllable_Type is represented by the hst attribute. +

    +

    + + [hst attribute, + 39] + + = + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + +

    +

    The property Jamo_Short_Name is represented by the JSN attribute: +

    +

    + + [JSN attribute, + 40] + + = + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? + +

    +

    + 4.4.17 Indic properties +

    +

    The property Indic_Syllabic_Category is represented by the InSC + attribute. +

    +

    + + [InSC attribute, + 41] + + = + + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + +

    +

    The property Indic_Positional_Category is represented by the InPC + attribute: +

    +

    + + [InPC attribute, + 42] + + = + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + +

    +

    The property Indic_Conjunct_Break is represented by the InCB attribute: +

    +

    + + [InCB attribute, + 43] + + = + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + +

    +

    + 4.4.18 Identifier and Pattern and programming language properties +

    +

    The properties ID_Start, Other_ID_Start, XID_Start, + ID_Continue, Other_ID_Continue, XID_Continue, + ID_Compat_Math_Start, and ID_Compat_Math_Continue are represented by + corresponding attributes: +

    +

    + + [identifier properties, + 44] + + = + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + +

    +

    The properties Pattern_Syntax and Pattern_White_Space are represented + by corresponding attributes: +

    +

    + + [pattern properties, + 45] + + = + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + +

    +

    + 4.4.19 Properties related to function and graphic characteristics +

    +

    The properties Dash, Hyphen, Quotation_Mark, + Terminal_Punctuation, Sentence_Terminal, Diacritic, + Extender, Soft_Dotted, Alphabetic, + Other_Alphabetic, Math, Other_Math, Hex_Digit, + ASCII_Hex_Digit, Default_Ignorable_Code_Point, + Other_Default_Ignorable_Code_Point, Logical_Order_Exception, + Prepended_Concatenation_Mark, Modifier_Combining_Mark, + White_Space, Vertical_Orientation, and Regional_Indicator + describe the function or graphic characteristic of a character, and each has a corresponding + attribute. +

    +

    + + [properties related to function and graphic characteristics, + 46] + + = + + code-point-attributes &= + attribute Dash { boolean }? + + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + +

    +

    + 4.4.20 Properties related to boundaries +

    +

    The properties Grapheme_Base, Grapheme_Extend, + Other_Grapheme_Extend, Grapheme_Link, + Grapheme_Cluster_Break, Word_Break, and Sentence_Break each + have a corresponding attribute: +

    +

    + + [properties related to boundaries, + 47] + + = + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + +

    +

    + 4.4.21 Properties related to ideographs +

    +

    The properties Ideographic, Unified_Ideograph, + Equivalent_Unified_Ideograph, IDS_Binary_Operator, + IDS_Trinary_Operator, IDS_Unary_Operator, and Radical have + corresponding attributes: +

    +

    + + [properties related to ideographs, + 48] + + = + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + +

    +

    + 4.4.22 Miscellaneous properties +

    +

    The properties Deprecated, Variation_Selector, and + Noncharacter_Code_Point have corresponding attributes: +

    +

    + + [miscellaneous properties, + 49] + + = + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + +

    +

    + 4.4.23 Unihan properties +

    +

    The Unihan properties (from the Unihan database) are represented as attributes. +

    +

    + + [Unihan properties, + 50] + + = + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? 
+ + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? + + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? 
+ + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? + + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? 
+ + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" 
} + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? + + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" 
}+ } }? + +

    +

    + 4.4.24 Tangut data +

    +

    The Tangut data are represented as attributes. The attribute kRSTUnicode + represents the radical stroke index. The attribute kTGT_MergedSrc indicates the + source reference for the character. +

    +

    + + [Tangut data, + 51] + + = + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + +

    +

    + 4.4.25 Nushu data +

    +

    The Nushu data are represented as attributes. The attribute kSrc_NushuDuben + indicates the page number and order of the item from the NushuDuben reference source. Nushu common + reading is represented as kReading.

    +

    + + [Nushu data, + 52] + + = + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + +

    +

    + 4.4.26 Emoji properties +

    +

    The properties Emoji, EPres, EMod, EBase, + EComp, and ExtPict have corresponding attributes: +

    +

    + + [Emoji properties, + 53] + + = + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? + +

    +

    + 5 Blocks +

    +

    The blocks child of the ucd describes the blocks. It has one child + block element per block, with attributes to describe the extent and name of the block. +

    +

    + + [blocks, + 54] + + = + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? + +

    +

    + 6 Named Sequences +

    +

    The named-sequences child of the ucd describes the named sequences. It has one + child named-sequence element per named sequence, with attributes to describe the name and + sequence. +

    +

    Similarly, the provisional-named-sequences child of the ucd describes the + provisional named sequences. +

    +

    + + [named sequences, + 55] + + = + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + +

    +

    + 7 Normalization Corrections +

    +

    The normalization-corrections child of the ucd describes the normalization + corrections. It has one child normalization-correction element per correction, with + attributes to describe the code point affected, its old normalization, its new normalization and the + version of Unicode in which the correction was made. +

    +

    + + [normalization corrections, + 56] + + = + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + +

    +

    + 8 Standardized Variants +

    +

    The standardized-variants child of the ucd describes the standardized + variants. It has one child element standardized-variant per variant. The attributes on that + last element capture the variation sequence, the description of the desired appearance, and the shaping + environment under which the appearance is different. +

    +

    + + [standardized variants, + 57] + + = + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + +

    +

    + 9 CJK Radicals +

    +

    The cjk-radicals child of the ucd describes the CJK radicals. It has one + child element cjk-radical per radical. The attributes on that last element capture the + radical number, the corresponding CJK radical character, and the corresponding CJK unified ideograph. +

    +

    + + [cjk radicals, + 58] + + = + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + +

    +

    + 10 Emoji sources +

    +

    The emoji-sources child of the ucd describes the emoji sources. +

    +

    + + [emoji sources, + 59] + + = + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + +

    +

    + + [datatype for code points, + 60] + + = + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + +

    +

    + 11 Do Not Emit +

    +

    + The do-not-emit child of the ucd describes the + character sequences that should not be emitted or generated in newly authored texts. + +

    +

    + + [do-not-emit, + 61] + + = + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + +

    +

    + 12 The full schema +

    +

    Our schema is just the accumulation of the pieces we have described so far: +

    +

    + + [UCD RelaxNG schema] + + = + + + [namespace declaration: 1] + + + [datatypes: 2, 3, 60] + + + [schema start: 4] + + + [boolean: 5] + + + [description: 6] + + + [repertoire: 7, 8, 9, 10] + + + [attributes: 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] + + + [Tangut data: 51] + + + [Nushu data: 52] + + + [blocks: 54] + + + [named sequences: 55] + + + [normalization corrections: 56] + + + [standardized variants: 57] + + + [cjk radicals: 58] + + + [emoji sources: 59] + + + [Emoji properties: 53] + + + [do-not-emit: 61] + + +

    +

    An expanded version is linked from the top of this document.

    +

    + 13 Examples +

    +

    Here is a fragment of the UCD for a few representative + characters (only some of the properties are represented): +

    +
    +            
    +  <ucd xmlns="http://www.unicode.org/ns/2003/ucd/1.0">
    +    <repertoire>
    +      <char cp="001F" age="1.1" na="&lt;control&gt;" na1="UNIT SEPARATOR"
    +            gc="Cc" bc="S" lb="CM"/>
    +
    +      <char cp="0020" age="1.1" na="SPACE" gc="Zs" bc="WS" ea="Na" lb="SP"/>
    +
    +      <char cp="0026" age="1.1" na="AMPERSAND" gc="Po" bc="ON" ea="Na"/>
    +
    +      <char cp="0028" age="1.1" na="LEFT PARENTHESIS" na1="OPENING PARENTHESIS"
    +            gc="Ps" bc="ON" Bidi_M="y" bmg="0029" ea="Na" lb="OP"/>
    +
    +      <char cp="0041" age="1.1" na="LATIN CAPITAL LETTER A"
    +            gc="Lu" slc="0061" ea="Na" sc="Latn"/>
    +
    +      <char cp="AC00" age="2.0" na="HANGUL SYLLABLE GA" gc="Lo"
    +            dt="can" dm="1100 1161" ea="W" lb="ID" sc="Hang"/>
    +
    +      <char cp="20094" age="3.1" na="CJK UNIFIED IDEOGRAPH-20094"
    +            gc="Lo" ea="W" lb="ID" sc="Hani" kIRG_GSource="KX"
    +            kIRGHanyuDaZidian="10036.060" kIRG_TSource="5-214E"
    +           kRSUnicode="4.3" kIRGKangXi="0082.090"/>
    +
    +      <group age="3.2" gc="Lo" sc="Buhd">
    +        <char cp="1740" na="BUHID LETTER A"/>
    +        <char cp="1741" na="BUHID LETTER I"/>
    +        <char cp="1752" na="BUHID VOWEL SIGN I" gc="Mn"/>
    +        <char cp="1820" age="3.0" na="MONGOLIAN LETTER A" sc="Mong"/>
    +      </group>
    +    </repertoire>
    +  </ucd>
    +
    +
    +

    + Acknowledgments +

    +

    Thanks to Markus Scherer and Mark Davis for their help developing this XML representation. Thanks to + the reviewers: Julie Allen, Ernest van den Boogaard, Daniel Bünzli, John Cowan, Asmus Freytag, + Felix Sasaki, Andrew West. Special thanks to Eric Muller and Laurențiu Iancu. +

    +

    + Modifications +

    +

    This section indicates the changes introduced by each revision.

    +
    +

    + Revision 36 +

    +
      +
    • New value for the age attribute: 16.0. +
    • +
    • New values for the blk attribute: Egyptian_Hieroglyphs_Ext_A, + Garay, Gurung_Khema, Kirat_Rai, Myanmar_Ext_C, + Ol_Onal, Sunuwar, Symbols_for_Legacy_Computing_Sup, + Todhri, Tulu_Tigalari. +
    • +
    • New values for the script attribute: Gara, Gukh, + Krai, Onao, Sunu, Todr, Tutg. +
    • +
    • New value for the jg attribute: Kashmiri_Yeh.
    • +
    • New value for the InSC attribute: Reordering_Killer. +
    • +
    • New attributes: MCM, kFanqie, kZhuang. +
    • +
    • Modified patterns for the cjk-radical/@number, kRSUnicode and + kIRG_GSource + attributes. +
    • +
    • Added the do-not-emit element. +
    • +
    +
    +
    +

    Revision 35 being a proposed update, only changes between revisions 34 and 36 are + noted here. +

    +
    +
    +

    + Revision 34 +

    +
      +
    • New value for the age attribute: 15.1. +
    • +
    • New value for the blk attribute: CJK_Ext_I. +
    • +
    • New values for the lb attribute: AK, AP, + AS, VF, VI. +
    • +
    • Modified values for the number, radical attributes of the + cjk-radical + element. +
    • +
    • Changed single value into list for the nv code point attribute. +
    • +
    • New code point attributes: ID_Compat_Math_Continue, + ID_Compat_Math_Start, IDSU, NFKC_SCF, InCB. +
    • +
    • Modified patterns for the kBigFive, kIRG_GSource, + kMorohashi, kRSUnicode attributes. +
    • +
    • Changed single values into lists for the kMorohashi, kPrimaryNumeric + Unihan attributes. +
    • +
    • New Unihan attributes: kJapanese, kMojiJoho, + kSMSZD2003Index, kSMSZD2003Readings, kVietnameseNumeric, + kZhuangNumeric. +
    • +
    +
    +
    +

    Revision 33 being a proposed update, only changes between revisions 32 and 34 are + noted here. +

    +
    +
    +

    + Revision 32 +

    +
      +
    • New value for the age attribute: 15.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_C, CJK_Ext_H, + Cyrillic_Ext_D, Devanagari_Ext_A, Kaktovik_Numerals, Kawi, + Nag_Mundari. +
    • +
    • New values for the script attribute: Kawi, Nagm. +
    • +
    • New Unihan attribute: kAlternateTotalStrokes. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_TSource, kSemanticVariant, kSpecializedSemanticVariant, + kZVariant + attributes. +
    • +
    +
    +
    +

    Revision 31 being a proposed update, only changes between revisions 30 and 32 are + noted here. +

    +
    +
    +

    + Revision 30 +

    +
      +
    • New value for the age attribute: 14.0. +
    • +
    • New values for the blk attribute: Arabic_Ext_B, + Cypro_Minoan, Ethiopic_Ext_B, Kana_Ext_B, + Latin_Ext_F, Latin_Ext_G, Old_Uyghur, Tangsa, + Toto, UCAS_Ext_A, Vithkuqi, Znamenny_Music. +
    • +
    • New values for the script attribute: Cpmn, Ougr, + Tnsa, Toto, Vith. +
    • +
    • New values for the jg attribute: Thin_Yeh, Vertical_Tail. +
    • +
    • New Unihan attribute: kStrange. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_MSource, + kIRG_VSource, kPhonetic, kSpoofingVariant attributes. +
    • +
    • Removal of the kWubi attribute, which has never been present in + released versions of the UCD. +
    • +
    +
    +
    +

    Revision 29 being a proposed update, only changes between revisions 28 and 30 are + noted here. +

    +
    +
    +

    + Revision 28 +

    +
      +
    • New value for the age attribute: 13.0. +
    • +
    • New values for the blk attribute: Chorasmian, CJK_Ext_G, + Dives_Akuru, Khitan_Small_Script, Lisu_Sup, + Symbols_For_Legacy_Computing, Tangut_Sup, Yezidi. +
    • +
    • New values for the script attribute: Chrs, Diak, + Kits, Yezi. +
    • +
    • New value for the InPC attribute: Top_And_Bottom_And_Left. +
    • +
    • New Unihan attributes kSpoofingVariant, kUnihanCore2020, + kIRG_SSource, kIRG_UKSource, kTGHZ2013. +
    • +
    • New Emoji attributes Emoji, EPres, EMod, + EBase, EComp, ExtPict. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, kKangXi, + kSemanticVariant, kSimplifiedVariant, + kSpecializedSemanticVariant, kTraditionalVariant attributes. +
    • +
    +
    +
    +

    Revision 27 being a proposed update, only changes between revisions 26 and 28 are + noted here. +

    +
    +
    +

    + Revision 26 +

    +
      +
    • New value for the age attribute: 12.1. +
    • +
    +
    +
    +

    + Revision 25 +

    +
      +
    • New value for the age attribute: 12.0. +
    • +
    • New values for the script attribute: Elym, Hmnp, + Nand, Wcho. +
    • +
    • New values for the blk attribute: + Egyptian_Hieroglyph_Format_Controls, Elymaic, Nandinagari, + Nyiakeng_Puachue_Hmong, Ottoman_Siyaq_Numbers, Small_Kana_Ext, + Symbols_And_Pictographs_Ext_A, Tamil_Sup, Wancho. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_KSource, + kIRG_TSource, kTaiwanTelegraph attributes. +
    • +
    +
    +
    +

    Revision 24 being a proposed update, only changes between revisions 23 and 25 are + noted here. +

    +
    +
    +

    + Revision 23 +

    +
      +
    • New value for the age attribute: 11.0. +
    • +
    • New values for the blk attribute: Chess_Symbols, + Dogra, Georgian_Ext, Gunjala_Gondi, + Hanifi_Rohingya, Indic_Siyaq_Numbers, Makasar, + Mayan_Numerals, Medefaidrin, Old_Sogdian, Sogdian. +
    • +
    • New values for the script attribute: Dogr, Gong, + Maka, Medf, Rohg, Sogd, Sogo. +
    • +
    • New values for the jg attribute: Hanifi_Rohingya_Kinna_Ya, + Hanifi_Rohingya_Pa. +
    • +
    • New value for the wb attribute: WSegSpace. +
    • +
    • New value for the InSC attribute: Consonant_Initial_Postfixed. +
    • +
    • New attributes: EqUIdeo, kJinmeiyoKanji, kJoyoKanji, + kKoreanEducationHanja, kKoreanName, kTGH. +
    • +
    • Modified patterns for the kTGT_MergedSrc attribute. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_HSource and + kIRG_VSource + attributes. +
    • +
    +
    +
    +

    Revision 22 being a proposed update, only changes between revisions 21 and 23 are + noted here. +

    +
    +
    +

    + Revision 21 +

    +
      +
    • New value for the age attribute: 10.0. +
    • +
    • New values for the blk attribute: CJK_Ext_F, Kana_Ext_A, + Masaram_Gondi, Nushu, Soyombo, Syriac_Sup, + Zanabazar_Square. +
    • +
    • New values for the sc attribute: Gonm, Nshu, + Soyo, Zanb. +
    • +
    • New values for the jg attribute: Malayalam_Nga, + Malayalam_Ja, Malayalam_Nya, Malayalam_Tta, Malayalam_Nna, + Malayalam_Nnna, Malayalam_Bha, Malayalam_Ra, + Malayalam_Lla, Malayalam_Llla, Malayalam_Ssa. +
    • +
    • New value for the InPC attribute: Bottom_And_Left. +
    • +
    • Modified patterns for the kIRG_GSource, kIRG_JSource, + kIRG_KSource + attributes. +
    • +
    • New code point attributes: vo, + RI +
    • +
    • New code point attributes for Nushu data: kSrc_NushuDuben and + kReading. +
    • +
    +
    +
    +

    Revision 20 being a proposed update, only changes between revisions 19 and 21 are + noted here. +

    +
    +
    +

    + Revision 19 +

    +
      +
    • New value for the age attribute: 9.0. +
    • +
    • New values for the sc attribute: Adlm, Bhks, + Marc, Newa, Osge, Tang. +
    • +
    • New values for the blk attribute: Adlam, Bhaiksuki, + Cyrillic_Ext_C, Glagolitic_Sup, Ideographic_Symbols, + Marchen, Mongolian_Sup, Newa, Osage, + Tangut, Tangut_Components. +
    • +
    • New values for the gcb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the wb attribute: EB, EBG, EM, + GAZ, ZWJ. +
    • +
    • New values for the lb attribute: EB, EM, ZWJ. +
    • +
    • New values for the jg attribute: African_Feh, + African_Noon, African_Qaf. +
    • +
    • New code point attributes: PCM, kRSTUnicode and + kTGT_MergedSrc. +
    • +
    • Modified patterns for the kRSUnicode, kRSKangXi, + kMandarin, kIRG_JSource, kIRG_USource and kFennIndex + attributes. +
    • +
    +
    +
    +

    Revision 18 being a proposed update, only changes between revisions 17 and 19 are + noted here. +

    +
    +
    +

    + Revision 17 +

    +
      +
    • New value for the age attribute: 8.0. +
    • +
    • New values for the sc attribute: Ahom, Hatr, + Hluw, Hung, Mult, Sgnw. +
    • +
    • New values for the blk attribute: Ahom, + Anatolian_Hieroglyphs, Cherokee_Sup, CJK_Ext_E, + Early_Dynastic_Cuneiform, Hatran, Multani, Old_Hungarian, + Sup_Symbols_And_Pictographs, Sutton_SignWriting. +
    • +
    • New values for the InSC attribute: Consonant_Killer, + Consonant_Prefixed, Consonant_With_Stacker, Syllable_Modifier. +
    • +
    • New code point attributes: InPC, kJa. +
    • +
    • New patterns for the kIRG_GSource attribute: GFC-, GGFZ-. +
    • +
    • Switched the reference to ISO 19757 from :2003 and :2003 Amd1 to :2008.
    • +
    +
    +
    +

    Revision 16 being a proposed update, only changes between revisions 15 and 17 are + noted here. +

    +
    +
    +

    + Revision 15 +

    +
      +
    • New value for the age attribute: 7.0. +
    • +
    • New values for the jg attribute. +
    • +
    • New values for the sc attribute. +
    • +
    • New values for the blk attribute. +
    • +
    • New values for the InSC attribute. +
    • +
    • New values for the kIICore attribute. +
    • +
    • New values for the kIRG_GSource attribute. +
    • +
    +
    +
    +

    Revision 14 being a proposed update, only changes between revisions 13 and 15 are + noted here. +

    +
    +
    +

    + Revision 13 +

    +
      +
    • New value for the age attribute: 6.3. +
    • +
    • New values DQ, HL, SQ for the WB attribute (for Unicode 6.3). +
    • +
    • New code point attributes bpt and bpb (for Unicode 6.3). +
    • +
    • New values for the bc attribute: LRI, RLI, FSI, + PDI + (for Unicode 6.3). +
    • +
    • Updated the patterns for kHanyuPinlu and kTotalStrokes (for + Unicode 6.3). +
    • +
    • Updated the patterns for kIRG_HSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    • Clarified that the child elements of list-like elements are in no particular order.
    • +
    +
    +
    +

    Revision 12 being a proposed update, only changes between revisions 11 and 13 are + noted here. +

    +
    +
    +

    + Revision 11 +

    +
      +
    • New value for the age attribute: 6.2. +
    • +
    • New value for the gcb, wb and lb attributes: + RI + (for Unicode 6.2). +
    • +
    • Updated the patterns for kIRG_GSource and kIRG_HSource (for + Unicode 6.2). +
    • +
    +
    +
    +

    Revision 10 being a proposed update, only changes between revisions 9 and 11 are + noted here. +

    +
    +
    +

    + Revision 9 +

    +
      +
    • Clarified the default values.
    • +
    • Indicate that property values may change from one release to the next.
    • +
    • Introduced the blk attributes, for the Block property. +
    • +
    • Introduced the scx attribute, for the ScriptExtensions property. +
    • +
    • Introduced the name-alias element, for the Name_Alias property. +
    • +
    • New value for the age attribute: 6.1. +
    • +
    • New values for the script attribute: Cakm, Merc, + Mero, Plrd, Shrd, Sora, Takr. +
    • +
    • New values for the lb attribute: HL and CJ. +
    • +
    • New value for the jg attribute: Rohingya_Yeh. +
    • +
    • The value of the fc_nfkc attribute must now be either # or + one-or-more-code-points. +
    • +
    • For the nv attribute, the absence of a numeric value is now represented by + NaN + rather than by the empty string. +
    • +
    • The values of the ccc are now restricted to 0..254, instead of 0..255. +
    • +
    • Updated the patterns for kSemanticVariant, + kSpecializedSemanticVariant, kIRG_USource, and kMandarin. +
    • +
    +
    +
    +

    Revision 8 being a proposed update, only changes between revisions 7 and 9 are noted + here. +

    +
    +
    +

    + Revision 7 +

    +
      +
    • New value for the age attribute: 6.0. +
    • +
    • New value for the jg attribute: + Teh_Marbuta_Goal +
    • +
    • New values for the script attribute: Batk, Brah, + Mand. +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_HSource, + kIRG_JSource, kIRG_KSource, kIRG_MSource, + kIRG_TSource, kIRG_VSource. +
    • +
    • Added the InSC and InMC elements. +
    • +
    • Added the emoji-sources element. +
    • +
    +
    +
    +

    Revision 6 being a proposed update, only changes between revisions 5 and 7 are noted + here. +

    +
    +
    +

    + Revision 5 +

    +
      +
    • Changed the type of block/@first-cp, block/@last-cp and + normalization-corrections/@cp + from text to + single-code-point +
    • +
    • Changed the type of named-sequence/@cps, + provisional-named-sequences/@cps, normalization-correction/@old and + normalization-correction/@new + from text to one-or-more-code-points. +
    • +
    • Changed the type of standardized-variants/@cps from text to + two-code-points. +
    • +
    • New values for the jg attribute: Farsi_Yeh and Nya. +
    • +
    • New value for the age attribute: 5.2. +
    • +
    • New values for the sc attribute: Lana, Tavt, + Avst, Egyp, Samr, Lisu, Bamu, Java, + Mtei, Armi, Sarb, Prti, Phli, Orkh, + Kthi. +
    • +
    • New value for the lb attribute: CP. +
    • +
    • New value for the sc attribute: Zinh. +
    • +
    • New code point attributes CI, Cased, CWCF, + CWCM, CWL, CWKCF, CWT, CWU, + NFKC_CF. +
    • +
    • New attributes kHanyuPinyin and kIRG_MSource. +
    • +
    • New element + cjk-radicals +
    • +
    • Updated the patterns for kIRG_GSource, kIRG_JSource, + kIRG_KPSource, kIRG_KSource, kIRG_TSource, + kIRG_VSource, kHanyuPinlu, kMandarin, + kSemanticVariant, kSpecializedSemanticVariant, + kVietnamese, kZVariant. +
    • +
    • Point out that Relax NG schemas do not modify or augment the infoset, and that it is possible + to mechanically convert our schema to other schema languages. +
    • +
    +
    +
    +

    Revision 4 being a proposed update, only changes between revisions 3 and 5 are noted + here. +

    +
    +
    +

    + Revision 3 +

    +
      +
    • First approved version, for Unicode 5.1.0.
    • +
    • For optional elements which act as collections, such as repertoire and + named-sequences, require that there be at least one element in the collection. +
    • +
    • Remove the constraint that the value of jg is limited when jt has + certain values; similarly for bmg / Bidi_M and for nv / + nt. +
    • +
    • Value NL added to the WB attribute (for Unicode 5.1). +
    • +
    • Value PP added to the GCB attribute (for Unicode 5.1). +
    • +
    • Corrected the Vai script value to Vaii. +
    • +
    • Removed the discussion of elements or attributes in different namespace.
    • +
    • Removed the code-point element. +
    • +
    +
    +
    +

    + Revision 2 +

    +
      +
    • Promoted to Draft UAX.
    • +
    • Changed the title from "An XML representation of the UCD"
    • +
    • Value 5.1 added to the age attribute (for Unicode 5.1). +
    • +
    • Value SM added to the gcb attribute (for Unicode 5.1). +
    • +
    • Values CR, Extend, LF, MB added to the + WB + attribute (for Unicode 5.1). +
    • +
    • Values CR, EX, LF, SC added to the SB + attribute (for Unicode 5.1). +
    • +
    • Value Burushaski_Yeh_Barree added to the jg attribute (for + Unicode 5.1). +
    • +
    • Value Alef_Maqsurah added to the jg attribute (for Unicode 2.x). +
    • +
    • Values Cari, Cham, Kali, Lepc, + Lyci, Lydi, Olck, Rjng, Saur, Sund and + Vai + added to the sc attribute (for Unicode 5.0). +
    • +
    • + jamo + attribute renamed to + JSN +
    • +
    • + sfc + attribute renamed to + scf +
    • +
    • Attribute kXHC1983 added (for Unicode 5.1.0). +
    • +
    • Pattern for attribute kIRG_USource extended (for Unicode 5.1.0). +
    • +
    • Element provisional-named-sequences added (for Unicode 5.0) +
    • +
    +
    +
    +

    + Revision 1 +

    +
      +
    • First working draft.
    • +
    +
    +
    + + + +
    + + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc new file mode 100644 index 000000000..84d9b5875 --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/output/index.rnc @@ -0,0 +1,1455 @@ + + # Copyright © 2024 Unicode, Inc. + + + + default namespace ucd = "http://www.unicode.org/ns/2003/ucd/1.0" + + + # default; datatypes xsd = "http://www.w3.org/2001/XMLSchema-datatypes" + + single-code-point = xsd:string { pattern = "(|[1-9A-F]|(10))[0-9A-F]{4}" } + + one-or-more-code-points = list { single-code-point + } + zero-or-more-code-points = list { single-code-point * } + two-code-points = list { single-code-point, single-code-point } + + jis-code-point = xsd:string { pattern = "[0-9A-F]{4}" } + + + start = + element ucd { ucd.content } + + + boolean = "Y" | "N" + + + ucd.content &= + element description { text }? + + + ucd.content &= + element repertoire { (code-point | group) + }? + + set-of-code-points = + attribute cp { single-code-point } + | ( attribute first-cp { single-code-point }, + attribute last-cp { single-code-point } ) + + code-point |= + element reserved { + set-of-code-points, + code-point-attributes } + + code-point |= + element noncharacter { + set-of-code-points, + code-point-attributes } + + code-point |= + element surrogate { + set-of-code-points, + code-point-attributes } + + code-point |= + element char { + set-of-code-points, + code-point-attributes } + + group = + element group { + code-point-attributes, + code-point* } + + + code-point-attributes &= + attribute age { "1.1" + | "2.0" | "2.1" + | "3.0" | "3.1" | "3.2" + | "4.0" | "4.1" + | "5.0" | "5.1" | "5.2" + | "6.0" | "6.1" | "6.2" | "6.3" + | "7.0" + | "8.0" + | "9.0" + | "10.0" + | "11.0" + | "12.0" | "12.1" + | "13.0" + | "14.0" + | "15.0" | "15.1" + | "16.0" + | "17.0" + | "unassigned" + }? 
+ + code-point-attributes &= + attribute na { "" | + "CJK UNIFIED IDEOGRAPH-#" | + "CJK COMPATIBILITY IDEOGRAPH-#" | + "EGYPTIAN HIEROGLYPH-#" | + "TANGUT IDEOGRAPH-#" | + "KHITAN SMALL SCRIPT CHARACTER-#" | + "NUSHU CHARACTER-#" | + xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } + }? + + code-point-attributes &= + attribute na1 { "" | xsd:string { pattern="[a-zA-Z0-9]+([\-_ ][a-zA-Z0-9]+)*( \(.*\))?" } }? + + code-point-attributes &= + element name-alias { + attribute alias { xsd:string { pattern="[a-zA-Z0-9]+(( -|- |[\-_ ])[a-zA-Z0-9]+)*" } }?, + attribute type { "abbreviation" | "alternate" + | "control" | "correction" + | "figment" + }? } * + + code-point-attributes &= + attribute blk { "Adlam" + | "Aegean_Numbers" + | "Ahom" + | "Alchemical" + | "Alphabetic_PF" + | "Anatolian_Hieroglyphs" + | "Ancient_Greek_Music" + | "Ancient_Greek_Numbers" + | "Ancient_Symbols" + | "Arabic" + | "Arabic_Ext_A" + | "Arabic_Ext_B" + | "Arabic_Ext_C" + | "Arabic_Math" + | "Arabic_PF_A" + | "Arabic_PF_B" + | "Arabic_Sup" + | "Armenian" + | "Arrows" + | "ASCII" + | "Avestan" + | "Balinese" + | "Bamum" + | "Bamum_Sup" + | "Bassa_Vah" + | "Batak" + | "Bengali" + | "Bhaiksuki" + | "Block_Elements" + | "Bopomofo" + | "Bopomofo_Ext" + | "Box_Drawing" + | "Brahmi" + | "Braille" + | "Buginese" + | "Buhid" + | "Byzantine_Music" + | "Carian" + | "Caucasian_Albanian" + | "Chakma" + | "Cham" + | "Cherokee" + | "Cherokee_Sup" + | "Chess_Symbols" + | "Chorasmian" + | "CJK" + | "CJK_Compat" + | "CJK_Compat_Forms" + | "CJK_Compat_Ideographs" + | "CJK_Compat_Ideographs_Sup" + | "CJK_Ext_A" + | "CJK_Ext_B" + | "CJK_Ext_C" + | "CJK_Ext_D" + | "CJK_Ext_E" + | "CJK_Ext_F" + | "CJK_Ext_G" + | "CJK_Ext_H" + | "CJK_Ext_I" + | "CJK_Radicals_Sup" + | "CJK_Strokes" + | "CJK_Symbols" + | "Compat_Jamo" + | "Control_Pictures" + | "Coptic" + | "Coptic_Epact_Numbers" + | "Counting_Rod" + | "Cuneiform" + | "Cuneiform_Numbers" + | "Currency_Symbols" + | "Cypriot_Syllabary" + | "Cypro_Minoan" + 
| "Cyrillic" + | "Cyrillic_Ext_A" + | "Cyrillic_Ext_B" + | "Cyrillic_Ext_C" + | "Cyrillic_Ext_D" + | "Cyrillic_Sup" + | "Deseret" + | "Devanagari" + | "Devanagari_Ext" + | "Devanagari_Ext_A" + | "Diacriticals" + | "Diacriticals_Ext" + | "Diacriticals_For_Symbols" + | "Diacriticals_Sup" + | "Dingbats" + | "Dives_Akuru" + | "Dogra" + | "Domino" + | "Duployan" + | "Early_Dynastic_Cuneiform" + | "Egyptian_Hieroglyph_Format_Controls" + | "Egyptian_Hieroglyphs" + | "Egyptian_Hieroglyphs_Ext_A" + | "Elbasan" + | "Elymaic" + | "Emoticons" + | "Enclosed_Alphanum" + | "Enclosed_Alphanum_Sup" + | "Enclosed_CJK" + | "Enclosed_Ideographic_Sup" + | "Ethiopic" + | "Ethiopic_Ext" + | "Ethiopic_Ext_A" + | "Ethiopic_Ext_B" + | "Ethiopic_Sup" + | "Garay" + | "Geometric_Shapes" + | "Geometric_Shapes_Ext" + | "Georgian" + | "Georgian_Ext" + | "Georgian_Sup" + | "Glagolitic" + | "Glagolitic_Sup" + | "Gothic" + | "Grantha" + | "Greek" + | "Greek_Ext" + | "Gujarati" + | "Gunjala_Gondi" + | "Gurmukhi" + | "Gurung_Khema" + | "Half_And_Full_Forms" + | "Half_Marks" + | "Hangul" + | "Hanifi_Rohingya" + | "Hanunoo" + | "Hatran" + | "Hebrew" + | "High_PU_Surrogates" + | "High_Surrogates" + | "Hiragana" + | "IDC" + | "Ideographic_Symbols" + | "Imperial_Aramaic" + | "Indic_Number_Forms" + | "Indic_Siyaq_Numbers" + | "Inscriptional_Pahlavi" + | "Inscriptional_Parthian" + | "IPA_Ext" + | "Jamo" + | "Jamo_Ext_A" + | "Jamo_Ext_B" + | "Javanese" + | "Kaithi" + | "Kaktovik_Numerals" + | "Kana_Ext_A" + | "Kana_Ext_B" + | "Kana_Sup" + | "Kanbun" + | "Kangxi" + | "Kannada" + | "Katakana" + | "Katakana_Ext" + | "Kawi" + | "Kayah_Li" + | "Kharoshthi" + | "Khitan_Small_Script" + | "Khmer" + | "Khmer_Symbols" + | "Khojki" + | "Khudawadi" + | "Kirat_Rai" + | "Lao" + | "Latin_1_Sup" + | "Latin_Ext_A" + | "Latin_Ext_Additional" + | "Latin_Ext_B" + | "Latin_Ext_C" + | "Latin_Ext_D" + | "Latin_Ext_E" + | "Latin_Ext_F" + | "Latin_Ext_G" + | "Lepcha" + | "Letterlike_Symbols" + | "Limbu" + | "Linear_A" + | 
"Linear_B_Ideograms" + | "Linear_B_Syllabary" + | "Lisu" + | "Lisu_Sup" + | "Low_Surrogates" + | "Lycian" + | "Lydian" + | "Mahajani" + | "Mahjong" + | "Makasar" + | "Malayalam" + | "Mandaic" + | "Manichaean" + | "Marchen" + | "Masaram_Gondi" + | "Math_Alphanum" + | "Math_Operators" + | "Mayan_Numerals" + | "Medefaidrin" + | "Meetei_Mayek" + | "Meetei_Mayek_Ext" + | "Mende_Kikakui" + | "Meroitic_Cursive" + | "Meroitic_Hieroglyphs" + | "Miao" + | "Misc_Arrows" + | "Misc_Math_Symbols_A" + | "Misc_Math_Symbols_B" + | "Misc_Pictographs" + | "Misc_Symbols" + | "Misc_Technical" + | "Modi" + | "Modifier_Letters" + | "Modifier_Tone_Letters" + | "Mongolian" + | "Mongolian_Sup" + | "Mro" + | "Multani" + | "Music" + | "Myanmar" + | "Myanmar_Ext_A" + | "Myanmar_Ext_B" + | "Myanmar_Ext_C" + | "Nabataean" + | "Nag_Mundari" + | "Nandinagari" + | "NB" + | "New_Tai_Lue" + | "Newa" + | "NKo" + | "Number_Forms" + | "Nushu" + | "Nyiakeng_Puachue_Hmong" + | "OCR" + | "Ogham" + | "Ol_Chiki" + | "Ol_Onal" + | "Old_Hungarian" + | "Old_Italic" + | "Old_North_Arabian" + | "Old_Permic" + | "Old_Persian" + | "Old_Sogdian" + | "Old_South_Arabian" + | "Old_Turkic" + | "Old_Uyghur" + | "Oriya" + | "Ornamental_Dingbats" + | "Osage" + | "Osmanya" + | "Ottoman_Siyaq_Numbers" + | "Pahawh_Hmong" + | "Palmyrene" + | "Pau_Cin_Hau" + | "Phags_Pa" + | "Phaistos" + | "Phoenician" + | "Phonetic_Ext" + | "Phonetic_Ext_Sup" + | "Playing_Cards" + | "Psalter_Pahlavi" + | "PUA" + | "Punctuation" + | "Rejang" + | "Rumi" + | "Runic" + | "Samaritan" + | "Saurashtra" + | "Sharada" + | "Shavian" + | "Shorthand_Format_Controls" + | "Siddham" + | "Sinhala" + | "Sinhala_Archaic_Numbers" + | "Small_Forms" + | "Small_Kana_Ext" + | "Sogdian" + | "Sora_Sompeng" + | "Soyombo" + | "Specials" + | "Sundanese" + | "Sundanese_Sup" + | "Sunuwar" + | "Sup_Arrows_A" + | "Sup_Arrows_B" + | "Sup_Arrows_C" + | "Sup_Math_Operators" + | "Sup_PUA_A" + | "Sup_PUA_B" + | "Sup_Punctuation" + | "Sup_Symbols_And_Pictographs" + | 
"Super_And_Sub" + | "Sutton_SignWriting" + | "Syloti_Nagri" + | "Symbols_And_Pictographs_Ext_A" + | "Symbols_For_Legacy_Computing" + | "Symbols_For_Legacy_Computing_Sup" + | "Syriac" + | "Syriac_Sup" + | "Tagalog" + | "Tagbanwa" + | "Tags" + | "Tai_Le" + | "Tai_Tham" + | "Tai_Viet" + | "Tai_Xuan_Jing" + | "Takri" + | "Tamil" + | "Tamil_Sup" + | "Tangsa" + | "Tangut" + | "Tangut_Components" + | "Tangut_Sup" + | "Telugu" + | "Thaana" + | "Thai" + | "Tibetan" + | "Tifinagh" + | "Tirhuta" + | "Todhri" + | "Toto" + | "Transport_And_Map" + | "Tulu_Tigalari" + | "UCAS" + | "UCAS_Ext" + | "UCAS_Ext_A" + | "Ugaritic" + | "Vai" + | "Vedic_Ext" + | "Vertical_Forms" + | "Vithkuqi" + | "VS" + | "VS_Sup" + | "Wancho" + | "Warang_Citi" + | "Yezidi" + | "Yi_Radicals" + | "Yi_Syllables" + | "Yijing" + | "Zanabazar_Square" + | "Znamenny_Music" + }? + + code-point-attributes &= + attribute gc { "Cc" | "Cf" | "Cn" | "Co" | "Cs" + | "Ll" | "Lm" | "Lo" | "Lt" | "Lu" + | "Mc" | "Me" | "Mn" + | "Nd" | "Nl" | "No" + | "Pc" | "Pd" | "Pe" | "Pf" | "Pi" | "Po" | "Ps" + | "Sc" | "Sk" | "Sm" | "So" + | "Zl" | "Zp" | "Zs" + }? + + code-point-attributes &= + attribute ccc { xsd:integer { minInclusive="0" maxInclusive="254" } }? + + code-point-attributes &= + attribute bc { "AL" | "AN" + | "B" | "BN" + | "CS" + | "EN" | "ES" | "ET" + | "FSI" + | "L" | "LRE" | "LRI" | "LRO" + | "NSM" + | "ON" + | "PDF" | "PDI" + | "R" | "RLE" | "RLI" | "RLO" + | "S" + | "WS" + }? + + code-point-attributes &= + attribute Bidi_M { boolean }? + + code-point-attributes &= + attribute bmg { "" | single-code-point }? + + code-point-attributes &= + attribute Bidi_C { boolean }? + + code-point-attributes &= + attribute bpt { "o" | "c" | "n" }? + + code-point-attributes &= + attribute bpb { "#" | single-code-point }? + + code-point-attributes &= + attribute dt { "can" | "com" | "enc" | "fin" | "font" | "fra" + | "init" | "iso" | "med" | "nar" | "nb" | "sml" + | "sqr" | "sub" | "sup" | "vert" | "wide" | "none" + }? 
+ + code-point-attributes &= + attribute dm { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute CE { boolean }? + + code-point-attributes &= + attribute Comp_Ex { boolean }? + + code-point-attributes &= + attribute NFC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFD_QC { "Y" | "N" }? + + code-point-attributes &= + attribute NFKC_QC { "Y" | "N" | "M" }? + + code-point-attributes &= + attribute NFKD_QC { "Y" | "N" }? + + + code-point-attributes &= + attribute XO_NFC { boolean }? + + code-point-attributes &= + attribute XO_NFD { boolean }? + + code-point-attributes &= + attribute XO_NFKC { boolean }? + + code-point-attributes &= + attribute XO_NFKD { boolean }? + + + code-point-attributes &= + attribute FC_NFKC { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute nt { "De" | "Di" | "Nu" | "None" }? + + code-point-attributes &= + attribute nv { "NaN" | xsd:string { pattern="-?[0-9]+(/[0-9]+)?" } }? + + code-point-attributes &= + attribute jt { "C" | "D" | "L" | "R" | "T" | "U" }? 
+ + code-point-attributes &= + attribute jg { "African_Feh" | "African_Noon" | "African_Qaf" + | "Ain" | "Alaph" | "Alef" + | "Beh" | "Beth" | "Burushaski_Yeh_Barree" + | "Dal" | "Dalath_Rish" + | "E" + | "Farsi_Yeh" | "Fe" | "Feh" | "Final_Semkath" + | "Gaf" | "Gamal" + | "Hah" | "Hanifi_Rohingya_Kinna_Ya" + | "Hanifi_Rohingya_Pa" | "He" | "Heh" | "Heh_Goal" + | "Heth" + | "Kaf" | "Kaph" | "Kashmiri_Yeh" | "Khaph" + | "Knotted_Heh" + | "Lam" | "Lamadh" + | "Malayalam_Bha" | "Malayalam_Ja" | "Malayalam_Lla" + | "Malayalam_Llla" | "Malayalam_Nga" + | "Malayalam_Nna" | "Malayalam_Nnna" + | "Malayalam_Nya" | "Malayalam_Ra" | "Malayalam_Ssa" + | "Malayalam_Tta" | "Manichaean_Aleph" + | "Manichaean_Ayin" | "Manichaean_Beth" + | "Manichaean_Daleth" | "Manichaean_Dhamedh" + | "Manichaean_Five" | "Manichaean_Gimel" + | "Manichaean_Heth" | "Manichaean_Hundred" + | "Manichaean_Kaph" | "Manichaean_Lamedh" + | "Manichaean_Mem" | "Manichaean_Nun" + | "Manichaean_One" | "Manichaean_Pe" + | "Manichaean_Qoph" | "Manichaean_Resh" + | "Manichaean_Sadhe" | "Manichaean_Samekh" + | "Manichaean_Taw" | "Manichaean_Ten" + | "Manichaean_Teth" | "Manichaean_Thamedh" + | "Manichaean_Twenty" | "Manichaean_Waw" + | "Manichaean_Yodh" | "Manichaean_Zayin" | "Meem" + | "Mim" + | "No_Joining_Group" | "Noon" | "Nun" | "Nya" + | "Pe" + | "Qaf" | "Qaph" + | "Reh" | "Reversed_Pe" | "Rohingya_Yeh" + | "Sad" | "Sadhe" | "Seen" | "Semkath" | "Shin" + | "Straight_Waw" | "Swash_Kaf" | "Syriac_Waw" + | "Tah" | "Taw" | "Teh_Marbuta" | "Teh_Marbuta_Goal" + | "Teth" | "Thin_Yeh" + | "Vertical_Tail" + | "Waw" + | "Yeh" | "Yeh_Barree" | "Yeh_With_Tail" | "Yudh" + | "Yudh_He" + | "Zain" | "Zhain" + }? + + code-point-attributes &= + attribute Join_C { boolean }? 
+ + code-point-attributes &= + attribute lb { "AI" | "AK" | "AL" | "AP" | "AS" + | "B2" | "BA" | "BB" | "BK" + | "CB" | "CJ" | "CL" | "CM" | "CP" | "CR" + | "EB" | "EM" | "EX" + | "GL" + | "H2" | "H3" | "HL" | "HY" + | "ID" | "IN" | "IS" + | "JL" | "JT" | "JV" + | "LF" + | "NL" | "NS" | "NU" + | "OP" + | "PO" | "PR" + | "QU" + | "RI" + | "SA" | "SG" | "SP" | "SY" + | "VF" | "VI" + | "WJ" + | "XX" + | "ZW" | "ZWJ" + }? + + code-point-attributes &= + attribute ea { "A" | "F" | "H" | "N" | "Na" | "W" }? + + code-point-attributes &= + attribute Upper { boolean }? + + code-point-attributes &= + attribute Lower { boolean }? + + code-point-attributes &= + attribute OUpper { boolean }? + + code-point-attributes &= + attribute OLower { boolean }? + + code-point-attributes &= + attribute suc { "#" | single-code-point }? + + code-point-attributes &= + attribute slc { "#" | single-code-point }? + + code-point-attributes &= + attribute stc { "#" | single-code-point }? + + code-point-attributes &= + attribute uc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute lc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute tc { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute scf { "#" | single-code-point }? + + code-point-attributes &= + attribute cf { "#" | one-or-more-code-points }? + + code-point-attributes &= + attribute CI { boolean }? + + code-point-attributes &= + attribute Cased { boolean }? + + code-point-attributes &= + attribute CWCF { boolean }? + + code-point-attributes &= + attribute CWCM { boolean }? + + code-point-attributes &= + attribute CWL { boolean }? + + code-point-attributes &= + attribute CWKCF { boolean }? + + code-point-attributes &= + attribute CWT { boolean }? + + code-point-attributes &= + attribute CWU { boolean }? + + code-point-attributes &= + attribute NFKC_CF { "#" | zero-or-more-code-points }? + + code-point-attributes &= + attribute NFKC_SCF { "#" | zero-or-more-code-points }? 
+ + script = "Adlm" | "Aghb" | "Ahom" | "Arab" | "Armi" | "Armn" + | "Avst" + | "Bali" | "Bamu" | "Bass" | "Batk" | "Beng" | "Bhks" + | "Bopo" | "Brah" | "Brai" | "Bugi" | "Buhd" + | "Cakm" | "Cans" | "Cari" | "Cham" | "Cher" | "Chrs" + | "Copt" | "Cpmn" | "Cprt" | "Cyrl" + | "Deva" | "Diak" | "Dogr" | "Dsrt" | "Dupl" + | "Egyp" | "Elba" | "Elym" | "Ethi" + | "Gara" | "Geor" | "Glag" | "Gong" | "Gonm" | "Goth" + | "Gran" | "Grek" | "Gujr" | "Gukh" | "Guru" + | "Hang" | "Hani" | "Hano" | "Hatr" | "Hebr" | "Hira" + | "Hluw" | "Hmng" | "Hmnp" | "Hrkt" | "Hung" + | "Ital" + | "Java" + | "Kali" | "Kana" | "Kawi" | "Khar" | "Khmr" | "Khoj" + | "Kits" | "Knda" | "Krai" | "Kthi" + | "Lana" | "Laoo" | "Latn" | "Lepc" | "Limb" | "Lina" + | "Linb" | "Lisu" | "Lyci" | "Lydi" + | "Mahj" | "Maka" | "Mand" | "Mani" | "Marc" | "Medf" + | "Mend" | "Merc" | "Mero" | "Mlym" | "Modi" | "Mong" + | "Mroo" | "Mtei" | "Mult" | "Mymr" + | "Nagm" | "Nand" | "Narb" | "Nbat" | "Newa" | "Nkoo" + | "Nshu" + | "Ogam" | "Olck" | "Onao" | "Orkh" | "Orya" | "Osge" + | "Osma" | "Ougr" + | "Palm" | "Pauc" | "Perm" | "Phag" | "Phli" | "Phlp" + | "Phnx" | "Plrd" | "Prti" + | "Rjng" | "Rohg" | "Runr" + | "Samr" | "Sarb" | "Saur" | "Sgnw" | "Shaw" | "Shrd" + | "Sidd" | "Sind" | "Sinh" | "Sogd" | "Sogo" | "Sora" + | "Soyo" | "Sund" | "Sunu" | "Sylo" | "Syrc" + | "Tagb" | "Takr" | "Tale" | "Talu" | "Taml" | "Tang" + | "Tavt" | "Telu" | "Tfng" | "Tglg" | "Thaa" | "Thai" + | "Tibt" | "Tirh" | "Tnsa" | "Todr" | "Toto" | "Tutg" + | "Ugar" + | "Vaii" | "Vith" + | "Wara" | "Wcho" + | "Xpeo" | "Xsux" + | "Yezi" | "Yiii" + | "Zanb" | "Zinh" | "Zyyy" | "Zzzz" + + code-point-attributes &= + attribute sc { script }? + + code-point-attributes &= + attribute scx { list { script + } }? + + code-point-attributes &= + attribute isc { text }? + + code-point-attributes &= + attribute hst { "L" | "LV" | "LVT" | "NA" | "T" | "V" }? + + code-point-attributes &= + attribute JSN { xsd:string { pattern="[A-Z]{0,3}" } }? 
+ + code-point-attributes &= + attribute InSC { "Avagraha" + | "Bindu" + | "Brahmi_Joining_Number" + | "Cantillation_Mark" + | "Consonant" + | "Consonant_Dead" + | "Consonant_Final" + | "Consonant_Head_Letter" + | "Consonant_Initial_Postfixed" + | "Consonant_Killer" + | "Consonant_Medial" + | "Consonant_Placeholder" + | "Consonant_Preceding_Repha" + | "Consonant_Prefixed" + | "Consonant_Subjoined" + | "Consonant_Succeeding_Repha" + | "Consonant_With_Stacker" + | "Gemination_Mark" + | "Invisible_Stacker" + | "Joiner" + | "Modifying_Letter" + | "Non_Joiner" + | "Nukta" + | "Number" + | "Number_Joiner" + | "Other" + | "Pure_Killer" + | "Register_Shifter" + | "Reordering_Killer" + | "Syllable_Modifier" + | "Tone_Letter" + | "Tone_Mark" + | "Virama" + | "Visarga" + | "Vowel" + | "Vowel_Dependent" + | "Vowel_Independent" + }? + + code-point-attributes &= + attribute InPC { "Bottom" + | "Bottom_And_Left" + | "Bottom_And_Right" + | "Left" + | "Left_And_Right" + | "NA" + | "Overstruck" + | "Right" + | "Top" + | "Top_And_Bottom" + | "Top_And_Bottom_And_Left" + | "Top_And_Bottom_And_Right" + | "Top_And_Left" + | "Top_And_Left_And_Right" + | "Top_And_Right" + | "Visual_Order_Left" + }? + + code-point-attributes &= + attribute InCB { "Consonant" + | "Extend" + | "Linker" + | "None" + }? + + code-point-attributes &= + attribute IDS { boolean }? + + code-point-attributes &= + attribute OIDS { boolean }? + + code-point-attributes &= + attribute XIDS { boolean }? + + code-point-attributes &= + attribute IDC { boolean }? + + code-point-attributes &= + attribute OIDC { boolean }? + + code-point-attributes &= + attribute XIDC { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Start { boolean }? + + code-point-attributes &= + attribute ID_Compat_Math_Continue { boolean }? + + code-point-attributes &= + attribute Pat_Syn { boolean }? + + code-point-attributes &= + attribute Pat_WS { boolean }? + + code-point-attributes &= + attribute Dash { boolean }? 
+ + code-point-attributes &= + attribute Hyphen { boolean }? + + code-point-attributes &= + attribute QMark { boolean }? + + code-point-attributes &= + attribute Term { boolean }? + + code-point-attributes &= + attribute STerm { boolean }? + + code-point-attributes &= + attribute Dia { boolean }? + + code-point-attributes &= + attribute Ext { boolean }? + + code-point-attributes &= + attribute SD { boolean }? + + code-point-attributes &= + attribute Alpha { boolean }? + + code-point-attributes &= + attribute OAlpha { boolean }? + + code-point-attributes &= + attribute Math { boolean }? + + code-point-attributes &= + attribute OMath { boolean }? + + code-point-attributes &= + attribute Hex { boolean }? + + code-point-attributes &= + attribute AHex { boolean }? + + code-point-attributes &= + attribute DI { boolean }? + + code-point-attributes &= + attribute ODI { boolean }? + + code-point-attributes &= + attribute LOE { boolean }? + + code-point-attributes &= + attribute PCM { boolean }? + + code-point-attributes &= + attribute MCM { boolean }? + + code-point-attributes &= + attribute WSpace { boolean }? + + code-point-attributes &= + attribute vo { "R" | "Tr" | "Tu" | "U" }? + + code-point-attributes &= + attribute RI { boolean }? + + code-point-attributes &= + attribute Gr_Base { boolean }? + + code-point-attributes &= + attribute Gr_Ext { boolean }? + + code-point-attributes &= + attribute OGr_Ext { boolean }? + + code-point-attributes &= + attribute Gr_Link { boolean }? + + code-point-attributes &= + attribute GCB { "CN" | "CR" + | "EB" | "EBG" | "EM" | "EX" + | "GAZ" + | "L" | "LF" | "LV" | "LVT" + | "PP" + | "RI" + | "SM" + | "T" + | "V" + | "XX" + | "ZWJ" + }? + + code-point-attributes &= + attribute WB { "CR" + | "DQ" + | "EB" | "EBG" | "EM" | "EX" | "Extend" + | "FO" + | "GAZ" + | "HL" + | "KA" + | "LE" | "LF" + | "MB" | "ML" | "MN" + | "NL" | "NU" + | "RI" + | "SQ" + | "WSegSpace" + | "XX" + | "ZWJ" + }? 
+ + code-point-attributes &= + attribute SB { "AT" + | "CL" | "CR" + | "EX" + | "FO" + | "LE" | "LF" | "LO" + | "NU" + | "SC" | "SE" | "SP" | "ST" + | "UP" + | "XX" + }? + + code-point-attributes &= + attribute Ideo { boolean }? + + code-point-attributes &= + attribute UIdeo { boolean }? + + code-point-attributes &= + attribute EqUIdeo { single-code-point }? + + code-point-attributes &= + attribute IDSB { boolean }? + + code-point-attributes &= + attribute IDST { boolean }? + + code-point-attributes &= + attribute IDSU { boolean }? + + code-point-attributes &= + attribute Radical { boolean }? + + code-point-attributes &= + attribute Dep { boolean }? + + code-point-attributes &= + attribute VS { boolean }? + + code-point-attributes &= + attribute NChar { boolean }? + + code-point-attributes &= attribute kAccountingNumeric + { xsd:string { pattern="[0-9]+" } }? + + code-point-attributes &= attribute kAlternateTotalStrokes + { list { xsd:string { pattern="(\d+:[BHJKMPSUV]+)|-" }+ } }? + + code-point-attributes &= attribute kBigFive + { xsd:string { pattern="[0-9A-F]{4}'?" } }? + + code-point-attributes &= attribute kCangjie + { xsd:string { pattern="[A-Z]+" } }? + + code-point-attributes &= attribute kCantonese + { list { xsd:string { pattern="[a-z]{1,6}[1-6]" }+ } }? + + code-point-attributes &= attribute kCCCII + { list { xsd:string { pattern="[0-9A-F]{6}" }+ } }? + + code-point-attributes &= attribute kCheungBauer + { list { xsd:string { pattern="[0-9]{3}/[0-9]{2};[A-Z]*;[a-z1-6\[\]/,]+" }+ } }? + + code-point-attributes &= attribute kCheungBauerIndex + { list { xsd:string { pattern="[0-9]{3}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kCihaiT + { list { xsd:string { pattern="[1-9][0-9]{0,3}\.[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kCNS1986 + { xsd:string { pattern="[12E]-[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCNS1992 + { xsd:string { pattern="[1-9]-[0-9A-F]{4}" } }? 
+ + code-point-attributes &= attribute kCompatibilityVariant + { "" | xsd:string { pattern="U\+[23]?[0-9A-F]{4}" } }? + + code-point-attributes &= attribute kCowles + { list { xsd:string { pattern="[0-9]{1,4}(\.[0-9]{1,2})?" }+ } }? + + code-point-attributes &= attribute kDaeJaweon + { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" } }? + + code-point-attributes &= attribute kDefinition + { xsd:string { pattern='[^\t"]+' } }? + + code-point-attributes &= attribute kEACC + { xsd:string { pattern="[0-9A-F]{6}" } }? + + code-point-attributes &= attribute kFanqie + { list { xsd:string { pattern="[\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{20000}-\x{2A6DF}]{2}" }+ } }? + + code-point-attributes &= attribute kFenn + { list { xsd:string { pattern="[0-9]+a?[A-KP*]" }+ } }? + + code-point-attributes &= attribute kFennIndex + { list { xsd:string { pattern="[0-9][0-9]{0,2}\.[01][0-9]" }+ } }? + + code-point-attributes &= attribute kFourCornerCode + { list { xsd:string { pattern="[0-9]{4}(\.[0-9])?" }+ } }? + + code-point-attributes &= attribute kGB0 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB3 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB5 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB7 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGB8 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kGradeLevel + { xsd:string { pattern="[1-6]" } }? + + code-point-attributes &= attribute kGSR + { list { xsd:string { pattern="[0-9]{4}[a-vx-z]'?" }+ } }? + + code-point-attributes &= attribute kHangul + { list { xsd:string { pattern="[\x{1100}-\x{1112}][\x{1161}-\x{1175}][\x{11A8}-\x{11C2}]?:[01ENX]{1,3}" }+ } }? + + code-point-attributes &= attribute kHanYu + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][0-3]" }+ } }? 
+ + code-point-attributes &= attribute kHanyuPinlu + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+\([0-9]+\)" }+ } }? + + code-point-attributes &= attribute kHanyuPinyin + { list { xsd:string { pattern="(\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kHDZRadBreak + { xsd:string { pattern="[\x{2F00}-\x{2FD5}]\[U\+2F[0-9A-D][0-9A-F]\]:[1-8][0-9]{4}\.[0-3][0-9]0" } }? + + code-point-attributes &= attribute kHKGlyph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kIBMJapan + { list { xsd:string { pattern="F[ABC][0-9A-F]{2}" }+ } }? + + code-point-attributes &= attribute kIICore + { list { xsd:string { pattern="[ABC][GHJKMPT]{1,7}" }+ } }? + + code-point-attributes &= attribute kIRG_GSource + { "" | xsd:string { pattern="G[013578EKS]-[0-9A-F]{4}" } + | xsd:string { pattern="G4K(-\d{5})?" } + | xsd:string { pattern="G(DZ|GH|RM|WZ|XC|XH|ZH)-\d{4}\.\d{2}" } + | xsd:string { pattern="G(BK|CH|CY|HC)(-\d{4}\.\d{2})?" } + | xsd:string { pattern="GKX-\d{4}\.\d{2,3}" } + | xsd:string { pattern="G(HZ|HZR)-\d{5}\.\d{2}" } + | xsd:string { pattern="G(CE|FC|IDC23|OCD|XHZ)-\d{3}" } + | xsd:string { pattern="G(H|HF|LGYJ|PGLG|T)-\d{4}" } + | xsd:string { pattern="G(CYY|DM|JZ|KJ|XM|ZFY|ZJW|ZYS)-\d{5}" } + | xsd:string { pattern="G(FZ|IDC)-[0-9A-F]{4}" } + | xsd:string { pattern="GGFZ-\d{6}" } + | xsd:string { pattern="G(LK|Z)-\d{7}" } + | xsd:string { pattern="GU-[023][0-9A-F]{4}" } + | xsd:string { pattern="GZA-[123467]\d{5}" } + }? + + code-point-attributes &= attribute kIRG_HSource + { "" | xsd:string { pattern="H-[0-9A-F]{4}" } + | xsd:string { pattern="H(B[012])-[0-9A-F]{4}" } + | xsd:string { pattern="HD-[23]?[0-9A-F]{4}" } + | xsd:string { pattern="HU-[023][0-9A-F]{4}" } + }? 
+ + code-point-attributes &= attribute kIRG_JSource + { "" | xsd:string { pattern="J[014]-[0-9A-F]{4}" } + | xsd:string { pattern="J3A?-[0-9A-F]{4}" } + | xsd:string { pattern="J13A?-[0-9A-F]{4}" } + | xsd:string { pattern="J14-[0-9A-F]{4}" } + | xsd:string { pattern="JA[34]?-[0-9A-F]{4}" } + | xsd:string { pattern="JARIB-[0-9A-F]{4}" } + | xsd:string { pattern="JH-(JT[ABC][0-9A-F]{3}S?|IB\d{4}|\d{6})" } + | xsd:string { pattern="JK-\d{5}" } + | xsd:string { pattern="JMJ-\d{6}" } + }? + + code-point-attributes &= attribute kIRG_KPSource + { "" | xsd:string { pattern="KP([01]-[0-9A-F]{4}|U-[023][0-9A-F]{4})" } }? + + code-point-attributes &= attribute kIRG_KSource + { "" | xsd:string { pattern="K[0-6]-[0-9A-F]{4}" } + | xsd:string { pattern="KC-\d{5}" } + | xsd:string { pattern="KU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_MSource + { "" | xsd:string { pattern="MA-[0-9A-F]{4}" } + | xsd:string { pattern="MB[12]-[0-9A-F]{4}" } + | xsd:string { pattern="MC-\d{5}" } + | xsd:string { pattern="MDH?-[23]?[0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_SSource + { "" | xsd:string { pattern="SAT-\d{5}" } }? + + code-point-attributes &= attribute kIRG_TSource + { "" | xsd:string { pattern="T([1-7A-F]|1[1-3])-[0-9A-F]{4}" } + | xsd:string { pattern="TU-[023][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRG_UKSource + { "" | xsd:string { pattern="UK-\d{5}" } }? + + code-point-attributes &= attribute kIRG_USource + { "" | xsd:string { pattern="UTC-\d{5}" } }? + + code-point-attributes &= attribute kIRG_VSource + { "" | xsd:string { pattern="V[0-4]-[0-9A-F]{4}" } + | xsd:string { pattern="VN-[023F][0-9A-F]{4}" } + }? + + code-point-attributes &= attribute kIRGDaeJaweon + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kIRGHanyuDaZidian + { list { xsd:string { pattern="[1-8][0-9]{4}\.[0-3][0-9][01]" }+ } }? 
+ + code-point-attributes &= attribute kIRGKangXi + { list { xsd:string { pattern="[01][0-9]{3}\.[0-7][0-9][01]" }+ } }? + + code-point-attributes &= attribute kJa + { list { xsd:string { pattern="[0-9A-F]{4}S?" }+ } }? + + code-point-attributes &= attribute kJapanese + { list { xsd:string { pattern="[\x{3041}-\x{3096}\x{3099}\x{309A}\x{30A1}-\x{30FA}\x{30FC}]+" }+ } }? + + code-point-attributes &= attribute kJapaneseKun + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJapaneseOn + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kJinmeiyoKanji + { list { xsd:string { pattern="(20[0-9]{2})(:U\+[23]?[0-9A-F]{4})?" }+ } }? + + code-point-attributes &= attribute kJis0 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJis1 + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kJIS0213 + { list { xsd:string { pattern="[12],[0-9]{2},[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kJoyoKanji + { list { xsd:string { pattern="(20[0-9]{2})|(U\+[23]?[0-9A-F]{4})" }+ } }? + + code-point-attributes &= attribute kKangXi + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{2}[01]" }+ } }? + + code-point-attributes &= attribute kKarlgren + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A*]?" }+ } }? + + code-point-attributes &= attribute kKorean + { list { xsd:string { pattern="[A-Z]+" }+ } }? + + code-point-attributes &= attribute kKoreanEducationHanja + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kKoreanName + { list { xsd:string { pattern="20[0-9]{2}" }+ } }? + + code-point-attributes &= attribute kLau + { list { xsd:string { pattern="[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kMainlandTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? 
+ + code-point-attributes &= attribute kMandarin + { list { xsd:string { pattern="[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kMatthews + { list { xsd:string { pattern="[1-9][0-9]{0,3}(a|\.5)?" }+ } }? + + code-point-attributes &= attribute kMeyerWempe + { list { xsd:string { pattern="[1-9][0-9]{0,3}[a-t*]?" }+ } }? + + code-point-attributes &= attribute kMojiJoho + { list { xsd:string { pattern="MJ\d{6}(:(FE0[01]|E01[01][0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kMorohashi + { list { xsd:string { pattern="(\d{5}'{0,2}|H\d{3})(:(FE0[01]|E010[0-9A-F]))?" }+ } }? + + code-point-attributes &= attribute kNelson + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kOtherNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPhonetic + { list { xsd:string { pattern="[1-9][0-9]{0,3}[A-D]?\*?" }+ } }? + + code-point-attributes &= attribute kPrimaryNumeric + { list { xsd:string { pattern="[0-9]+" }+ } }? + + code-point-attributes &= attribute kPseudoGB1 + { xsd:string { pattern="[0-9]{4}" } }? + + code-point-attributes &= attribute kRSAdobe_Japan1_6 + { list { xsd:string { pattern="[CV]\+[0-9]{1,5}\+[1-9][0-9]{0,2}\.[1-9][0-9]?\.[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kRSUnicode + { list { xsd:string { pattern="[1-9][0-9]{0,2}'{0,3}\.-?[0-9]{1,2}" }+ } }? + + code-point-attributes &= attribute kSBGY + { list { xsd:string { pattern="[0-9]{3}\.[0-7][0-9]" }+ } }? + + code-point-attributes &= attribute kSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSimplifiedVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kSMSZD2003Index + { list { xsd:string { pattern="\d{1,3}\.\d{2}" }+ } }? 
+ + code-point-attributes &= attribute kSMSZD2003Readings + { list { xsd:string { pattern="[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{300}\x{301}\x{302}\x{304}\x{308}\x{30C}]+)*\x{7CB5}[a-z]+[1-6]([a-z]+[1-6])?(,[a-z]+[1-6]([a-z]+[1-6])?)*" }+ } }? + + code-point-attributes &= attribute kSpecializedSemanticVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZFJ]+)?(,[ks][A-Za-z0-9_]+(:[TBZFJ]+)?)*)?" }+ } }? + + code-point-attributes &= attribute kSpoofingVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kStrange + { list { ( xsd:string { pattern="[ACU]" } + | xsd:string { pattern="B:U\+31[0-2AB][0-9A-F]" } + | xsd:string { pattern="[FMOR](:U\+[23]?[0-9A-F]{4})?" } + | xsd:string { pattern="H:U\+31[3-8][0-9A-F]" } + | xsd:string { pattern="I(:U\+[23]?[0-9A-F]{4})*" } + | xsd:string { pattern="K(:U\+30[A-F][0-9A-F])+" } + | xsd:string { pattern="S:[4-9][0-9]" } + )+}}? + + code-point-attributes &= attribute kTaiwanTelegraph + { list { xsd:string { pattern="[0-9]{4}" }+ } }? + + code-point-attributes &= attribute kTang + { list { xsd:string { pattern="\*?[A-Za-z()\x{E6}\x{251}\x{259}\x{25B}\x{300}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTGH + { list { xsd:string { pattern="20[0-9]{2}:[1-9][0-9]{0,3}" }+ } }? + + code-point-attributes &= attribute kTGHZ2013 + { list { xsd:string { pattern="[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kTotalStrokes + { list { xsd:string { pattern="[1-9][0-9]{0,2}" }+ } }? + + code-point-attributes &= attribute kTraditionalVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}" }+ } }? + + code-point-attributes &= attribute kUnihanCore2020 + { xsd:string { pattern="[GHJKMPT]{1,7}" } }? 
+ + code-point-attributes &= attribute kVietnamese + { list { xsd:string { pattern="[A-Za-z\x{110}\x{111}\x{300}-\x{303}\x{306}\x{309}\x{31B}\x{323}]+" }+ } }? + + code-point-attributes &= attribute kVietnameseNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kXerox + { list { xsd:string { pattern="[0-9]{3}:[0-9]{3}" }+ } }? + + code-point-attributes &= attribute kXHC1983 + { list { xsd:string { pattern="[0-9]{4}\.[0-9]{3}\*?(,[0-9]{4}\.[0-9]{3}\*?)*:[a-z\x{300}\x{301}\x{304}\x{308}\x{30C}]+" }+ } }? + + code-point-attributes &= attribute kZhuang + { list { xsd:string { pattern="[a-z]+\*?" }+ } }? + + code-point-attributes &= attribute kZhuangNumeric + { list { xsd:string { pattern="\d+" }+ } }? + + code-point-attributes &= attribute kZVariant + { list { xsd:string { pattern="U\+[23]?[0-9A-F]{4}(<[ks][A-Za-z0-9_]+(:[TBZ]+)?(,[ks][A-Za-z0-9_]+(:[TBZ]+)?)*)?" }+ } }? + + + code-point-attributes &= + attribute kRSTUnicode { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kTGT_MergedSrc + { xsd:string {pattern="L2008-[0-9A-F]{4,5}(-[0-9]{4,5})?"} + | xsd:string {pattern="L2006-[0-9]{4}"} + | xsd:string {pattern="L1997-[0-9]{4}"} + | xsd:string {pattern="L1986-[0-9]{4}"} + | xsd:string {pattern="S1968-[0-9]{4}"} + | xsd:string {pattern="N1966-[0-9]{3}(-[0-9A-Z]{3,4})?"} + | xsd:string {pattern="H2004-[A-Z]-[0-9]{4}"} + | xsd:string {pattern="L2012-[0-9]{4}"} + | xsd:string {pattern="UTN42-[0-9]{3}"} + }? + + + code-point-attributes &= + attribute kSrc_NushuDuben { xsd:string { pattern="[0-9]+\.[0-9]+" } }? + + code-point-attributes &= + attribute kReading { xsd:string }? + + + ucd.content &= + element blocks { + element block { + attribute first-cp { single-code-point }, + attribute last-cp { single-code-point }, + attribute name { text } }+ }? 
+ + + ucd.content &= + element named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + ucd.content &= + element provisional-named-sequences { + element named-sequence { + attribute cps { one-or-more-code-points }, + attribute name { text } }+ }? + + + ucd.content &= + element normalization-corrections { + element normalization-correction { + attribute cp { single-code-point }, + attribute old { one-or-more-code-points }, + attribute new { one-or-more-code-points }, + attribute version { text } }+ }? + + + ucd.content &= + element standardized-variants { + element standardized-variant { + attribute cps { two-code-points }, + attribute desc { text }, + attribute when { text } }+ }? + + + ucd.content &= + element cjk-radicals { + element cjk-radical { + attribute number { xsd:string {pattern="[0-9]{1,3}'{0,3}"}}, + attribute radical { single-code-point? }, + attribute ideograph { single-code-point } }+ }? + + + ucd.content &= + element emoji-sources { + element emoji-source { + attribute unicode { one-or-more-code-points }, + attribute docomo { jis-code-point? }, + attribute kddi { jis-code-point? }, + attribute softbank { jis-code-point? } }+ }? + + + code-point-attributes &= + attribute Emoji { boolean }? + + code-point-attributes &= + attribute EPres { boolean }? + + code-point-attributes &= + attribute EMod { boolean }? + + code-point-attributes &= + attribute EBase { boolean }? + + code-point-attributes &= + attribute EComp { boolean }? + + code-point-attributes &= + attribute ExtPict { boolean }? 
+ + + ucd.content &= + element do-not-emit { + element instead { + attribute of { one-or-more-code-points }, + attribute use { one-or-more-code-points }, + attribute because { "Bengali_Khanda_Ta" + | "Deprecated" + | "Discouraged" + | "Dotless_Form" + | "Hamza_Form" + | "Indic_Atomic_Consonant" + | "Indic_Consonant_Conjunct" + | "Indic_Vowel_Letter" + | "Malayalam_Chillu" + | "Precomposed_Form" + | "Precomposed_Hieroglyph" + | "Preferred_Spelling" + | "Tamil_Shrii" + } }+ }? + diff --git a/unicodetools/src/main/resources/org/unicode/uax42/pom.xml b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml new file mode 100644 index 000000000..9ae81d56f --- /dev/null +++ b/unicodetools/src/main/resources/org/unicode/uax42/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + uax42 + Unicode Standard Annex #42 + + + + org.unicode.unicodetools + unicodetools-parent + 1.0.0 + + + + + + org.codehaus.mojo + xml-maven-plugin + 1.1.0 + + + + transform + + + + + + + ${project.basedir} + true + + index.xml + + index2html.xsl + ${outputdir} + + + .html + + + + + ${project.basedir} + true + + index.xml + + index2rnc.xsl + ${outputdir} + + + .rnc + + + + + + + + net.sf.saxon + Saxon-HE + 12.4 + + + + + + + diff --git a/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java b/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java index eb31c0452..9b6dd4983 100644 --- a/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java +++ b/unicodetools/src/test/java/org/unicode/unittest/TestLocaleConstruction.java @@ -377,7 +377,8 @@ void buildLocale(Multimap args) { //// AliasesFull aliases = new AliasesFull(dataType); //// Output> exception = new Output<>(); //// - //// for (Entry entry : validityInfo.get(dataType).entrySet()) + //// for (Entry entry : + // validityInfo.get(dataType).entrySet()) // { //// for (String code : entry.getValue().regularData) { //// String replacement = aliases.getCanonical( @@ -388,7 +389,8 @@ void 
buildLocale(Multimap args) { //// if (replacement != null) { //// if (DEBUG) System.out.println(code + " ==> " + replacement); //// } else if (exception.value != null){ - //// if (DEBUG) System.out.println(code + " ==> " + exception.toString()); + //// if (DEBUG) System.out.println(code + " ==> " + + // exception.toString()); //// } //// } //// }