Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweaks to Unihan property handling #1022

Merged
merged 15 commits into from
Jan 30, 2025
34 changes: 22 additions & 12 deletions unicodetools/src/main/java/org/unicode/props/UcdProperty.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public enum UcdProperty {
Numeric_Value(PropertyType.Numeric, "nv"),
kAccountingNumeric(PropertyType.Numeric, "cjkAccountingNumeric"),
kOtherNumeric(PropertyType.Numeric, "cjkOtherNumeric"),
kPrimaryNumeric(PropertyType.Numeric, "cjkPrimaryNumeric"),
kPrimaryNumeric(PropertyType.Numeric, null, ValueCardinality.Ordered, "cjkPrimaryNumeric"),

// String
Bidi_Mirroring_Glyph(PropertyType.String, "bmg"),
Expand Down Expand Up @@ -91,7 +91,11 @@ public enum UcdProperty {
Named_Sequences_Prov(PropertyType.Miscellaneous, "NSP"),
Standardized_Variant(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "SV"),
Unicode_1_Name(PropertyType.Miscellaneous, "na1"),
kAlternateTotalStrokes(PropertyType.Miscellaneous, "cjkAlternateTotalStrokes"),
kAlternateTotalStrokes(
PropertyType.Miscellaneous,
null,
ValueCardinality.Unordered,
"cjkAlternateTotalStrokes"),
kBigFive(PropertyType.Miscellaneous, "cjkBigFive"),
kCCCII(PropertyType.Miscellaneous, "cjkCCCII"),
kCNS1986(PropertyType.Miscellaneous, "cjkCNS1986"),
Expand All @@ -114,7 +118,7 @@ public enum UcdProperty {
kEH_IFAO(PropertyType.Miscellaneous, "kEH_IFAO"),
kEH_JSesh(PropertyType.Miscellaneous, "kEH_JSesh"),
kEH_UniK(PropertyType.Miscellaneous, "kEH_UniK"),
kFanqie(PropertyType.Miscellaneous, "cjkFanqie"),
kFanqie(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFanqie"),
kFenn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFenn"),
kFennIndex(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkFennIndex"),
kFourCornerCode(
Expand Down Expand Up @@ -154,7 +158,7 @@ public enum UcdProperty {
kIRG_VSource(PropertyType.Miscellaneous, "cjkIRG_VSource"),
kJIS0213(PropertyType.Miscellaneous, "cjkJIS0213"),
kJa(PropertyType.Miscellaneous, "cjkJa"),
kJapanese(PropertyType.Miscellaneous, "cjkJapanese"),
kJapanese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapanese"),
kJapaneseKun(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseKun"),
kJapaneseOn(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkJapaneseOn"),
kJinmeiyoKanji(
Expand All @@ -180,7 +184,7 @@ public enum UcdProperty {
kMandarin(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkMandarin"),
kMatthews(PropertyType.Miscellaneous, "cjkMatthews"),
kMeyerWempe(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMeyerWempe"),
kMojiJoho(PropertyType.Miscellaneous, "cjkMojiJoho"),
kMojiJoho(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkMojiJoho"),
kMorohashi(PropertyType.Miscellaneous, "cjkMorohashi"),
kNelson(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkNelson"),
kPhonetic(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkPhonetic"),
Expand All @@ -201,31 +205,37 @@ public enum UcdProperty {
"URS"),
kReading(PropertyType.Miscellaneous, "kReading"),
kSBGY(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSBGY"),
kSMSZD2003Index(PropertyType.Miscellaneous, "cjkSMSZD2003Index"),
kSMSZD2003Readings(PropertyType.Miscellaneous, "cjkSMSZD2003Readings"),
kSMSZD2003Index(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Index"),
kSMSZD2003Readings(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSMSZD2003Readings"),
kSemanticVariant(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSemanticVariant"),
kSpecializedSemanticVariant(
PropertyType.Miscellaneous,
null,
ValueCardinality.Unordered,
"cjkSpecializedSemanticVariant"),
kSpoofingVariant(PropertyType.Miscellaneous, "cjkSpoofingVariant"),
kSpoofingVariant(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkSpoofingVariant"),
kSrc_NushuDuben(PropertyType.Miscellaneous, "kSrc_NushuDuben"),
kStrange(PropertyType.Miscellaneous, "cjkStrange"),
kStrange(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkStrange"),
kTGH(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGH"),
kTGHZ2013(PropertyType.Miscellaneous, "cjkTGHZ2013"),
kTGHZ2013(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTGHZ2013"),
kTGT_MergedSrc(PropertyType.Miscellaneous, "kTGT_MergedSrc"),
kTaiwanTelegraph(PropertyType.Miscellaneous, "cjkTaiwanTelegraph"),
kTang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkTang"),
kTotalStrokes(PropertyType.Miscellaneous, null, ValueCardinality.Ordered, "cjkTotalStrokes"),
kUnihanCore2020(PropertyType.Miscellaneous, "cjkUnihanCore2020"),
kVietnamese(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnamese"),
kVietnameseNumeric(PropertyType.Miscellaneous, "cjkVietnameseNumeric"),
kVietnameseNumeric(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkVietnameseNumeric"),
kXHC1983(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkXHC1983"),
kXerox(PropertyType.Miscellaneous, "cjkXerox"),
kZVariant(PropertyType.Miscellaneous, "cjkZVariant"),
kZhuangNumeric(PropertyType.Miscellaneous, "cjkZhuangNumeric"),
kZhuang(PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuang"),
kZhuangNumeric(
PropertyType.Miscellaneous, null, ValueCardinality.Unordered, "cjkZhuangNumeric"),

// Catalog
Age(PropertyType.Catalog, Age_Values.class, null, "age"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1579,6 +1579,7 @@ public static kEH_Core_Values forName(String name) {
// kVietnameseNumeric
// kXerox
// kXHC1983
// kZhuang
// kZhuangNumeric
// kZVariant
public enum Line_Break_Values implements Named {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ cjkVietnameseNumeric ; kVietnameseNumeric
cjkZhuangNumeric ; kZhuangNumeric
# 16.0
cjkFanqie ; kFanqie
cjkZhuang ; kZhuang

kTGT_MergedSrc ; kTGT_MergedSrc
kRSTUnicode ; kRSTUnicode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ kCCCII ; EXTENSIBLE ; [0-9A-F]{6}
kEACC ; SINGLE_VALUED ; [0-9A-F]{6}
kAccountingNumeric ; SINGLE_VALUED ; [0-9]+
kOtherNumeric ; SINGLE_VALUED ; [0-9]+
kPrimaryNumeric ; SINGLE_VALUED ; [0-9]+
kPrimaryNumeric ; ORDERED ; [0-9]+
kFenn ; MULTI_VALUED ; [0-9]+a?[A-KP*]
kCowles ; MULTI_VALUED ; [0-9]{1,4}(\.[0-9]{1,2})?
kXerox ; SINGLE_VALUED ; [0-9]{3}:[0-9]{3}
Expand Down Expand Up @@ -176,11 +176,29 @@ kKoreanEducationHanja ; MULTI_VALUED ; 20[0-9]{2}
kKoreanName ; MULTI_VALUED ; (20[0-9]{2})(:U\+2?[0-9A-F]{4})*
kTGH ; MULTI_VALUED ; 20[0-9]{2}:[1-9][0-9]{0,3}


kIRG_UKSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}
kIRG_SSource ; SINGLE_VALUED ; V[0-4]-[0-9A-F]{4}


# Unihan properties from 13.0 and later. No regexes for now.
# TODO(egg): We should automate the updating of the regexes from UAX #38.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally the fields from the table would be in a machine-readable format, with both the table and our usage generated from them.

I initially generated this by dumping the table into a spreadsheet, then using formulæ to transform it a bit, e.g.:

Property kJis1
Status Provisional
Category Other Mappings
Introduced 2
Delimiter space
Syntax [0-9]{4}
Description The JIS X 0212-1990 mapping for this ideograph in row-cell form.

=>

kJapaneseKun Status Provisional
kJapaneseKun Category Readings
kJapaneseKun Introduced 2
kJapaneseKun Delimiter space
kJapaneseKun Syntax [A-Z]+
kJapaneseKun Description The Japanese pronunciation(s) of this ideograph in the Hepburn romanization.

Then I extracted the delimiter and syntax for each property, and also checked the text of the ones with delimiters to see whether they were ordered or not.

However, I didn't keep it up to date (obviously), so it needs a better process.

kSpoofingVariant ; MULTI_VALUED ; .*
kTGHZ2013 ; MULTI_VALUED ; .*
kUnihanCore2020 ; SINGLE_VALUED ; .*
# 14.0
kStrange ; MULTI_VALUED ; .*
# 15.0
kAlternateTotalStrokes ; MULTI_VALUED ; .*
# 15.1
kJapanese ; MULTI_VALUED ; .*
kMojiJoho ; MULTI_VALUED ; .*
kSMSZD2003Index ; MULTI_VALUED ; .*
kSMSZD2003Readings ; MULTI_VALUED ; .*
kVietnameseNumeric ; MULTI_VALUED ; .*
kZhuangNumeric ; MULTI_VALUED ; .*
# 16.0
kFanqie ; MULTI_VALUED ; .*
kZhuang ; MULTI_VALUED ; .*

# =============================
# Catalog/Enum/Binary Properties
# All not listed are SINGLE_VALUED ; null
Expand Down

This file was deleted.

Loading
Loading