Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature request: update to unicode 15.1.0 #5

Closed
SKalt opened this issue Feb 17, 2024 · 2 comments
Closed

Feature request: update to unicode 15.1.0 #5

SKalt opened this issue Feb 17, 2024 · 2 comments

Comments

@SKalt
Copy link

SKalt commented Feb 17, 2024

Hi @magiclen, would you be interested in re-running your code generator to handle the new characters from Unicode 15.1.0, which was released in September 2023? I'm asking since I was generating a list of code blocks and noticed that the new CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I block was missing.

(here's the hacky script I used to generate a list of all code blocks, if you're interested in using it for #3)
#!/usr/bin/env bash
# Print a Rust `ALL_UNICODE_BLOCKS` constant built from the Unicode Character
# Database file Blocks.txt. The file is downloaded once and cached at $target;
# later runs reuse the cached copy.
#
# Abort on any failed command, unset variable, or failed pipeline stage so that
# a broken download or an unparsable file never produces a truncated list.
set -euo pipefail

target="/tmp/unicode_blocks.txt"
if ! [ -f "$target" ]; then
  # Download into a sibling temp file and rename only on success. Writing
  # directly to $target would leave an HTTP error page or a partial transfer
  # behind, and the -f check above would then reuse it forever.
  tmp="$(mktemp "${target}.XXXXXX")"
  trap 'rm -f "$tmp"' EXIT
  # -f: treat HTTP errors as failures; -sS: no progress meter, keep error text;
  # -L: follow redirects.
  curl -fsSL -o "$tmp" "https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
  mv "$tmp" "$target"
fi

echo "/// all the unicode blocks ordered by their range of code points"
echo "pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &["
grep -E '^[0-9A-F]{4,}' "$target" | # keep only the "START..END; Name" lines
  tr '[:lower:]' '[:upper:]' | # upper-case the block names
  awk -F '; ' '
    {
      range_name=$2;
      gsub(/[- ]/, "_", range_name); # spaces and hyphens become underscores (UPPER_SNAKE_CASE)
      print range_name "," # print "// " $1 as well to include the code point range
    }
  ' |
  sed 's/^/    /g' | # indent the output
  grep -v "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I" # drop the block the crate did not define yet
echo "];"
output
/// all the unicode blocks ordered by their range of code points
pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &[
    BASIC_LATIN,
    LATIN_1_SUPPLEMENT,
    LATIN_EXTENDED_A,
    LATIN_EXTENDED_B,
    IPA_EXTENSIONS,
    SPACING_MODIFIER_LETTERS,
    COMBINING_DIACRITICAL_MARKS,
    GREEK_AND_COPTIC,
    CYRILLIC,
    CYRILLIC_SUPPLEMENT,
    ARMENIAN,
    HEBREW,
    ARABIC,
    SYRIAC,
    ARABIC_SUPPLEMENT,
    THAANA,
    NKO,
    SAMARITAN,
    MANDAIC,
    SYRIAC_SUPPLEMENT,
    ARABIC_EXTENDED_B,
    ARABIC_EXTENDED_A,
    DEVANAGARI,
    BENGALI,
    GURMUKHI,
    GUJARATI,
    ORIYA,
    TAMIL,
    TELUGU,
    KANNADA,
    MALAYALAM,
    SINHALA,
    THAI,
    LAO,
    TIBETAN,
    MYANMAR,
    GEORGIAN,
    HANGUL_JAMO,
    ETHIOPIC,
    ETHIOPIC_SUPPLEMENT,
    CHEROKEE,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
    OGHAM,
    RUNIC,
    TAGALOG,
    HANUNOO,
    BUHID,
    TAGBANWA,
    KHMER,
    MONGOLIAN,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED,
    LIMBU,
    TAI_LE,
    NEW_TAI_LUE,
    KHMER_SYMBOLS,
    BUGINESE,
    TAI_THAM,
    COMBINING_DIACRITICAL_MARKS_EXTENDED,
    BALINESE,
    SUNDANESE,
    BATAK,
    LEPCHA,
    OL_CHIKI,
    CYRILLIC_EXTENDED_C,
    GEORGIAN_EXTENDED,
    SUNDANESE_SUPPLEMENT,
    VEDIC_EXTENSIONS,
    PHONETIC_EXTENSIONS,
    PHONETIC_EXTENSIONS_SUPPLEMENT,
    COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
    LATIN_EXTENDED_ADDITIONAL,
    GREEK_EXTENDED,
    GENERAL_PUNCTUATION,
    SUPERSCRIPTS_AND_SUBSCRIPTS,
    CURRENCY_SYMBOLS,
    COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS,
    LETTERLIKE_SYMBOLS,
    NUMBER_FORMS,
    ARROWS,
    MATHEMATICAL_OPERATORS,
    MISCELLANEOUS_TECHNICAL,
    CONTROL_PICTURES,
    OPTICAL_CHARACTER_RECOGNITION,
    ENCLOSED_ALPHANUMERICS,
    BOX_DRAWING,
    BLOCK_ELEMENTS,
    GEOMETRIC_SHAPES,
    MISCELLANEOUS_SYMBOLS,
    DINGBATS,
    MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
    SUPPLEMENTAL_ARROWS_A,
    BRAILLE_PATTERNS,
    SUPPLEMENTAL_ARROWS_B,
    MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
    SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
    MISCELLANEOUS_SYMBOLS_AND_ARROWS,
    GLAGOLITIC,
    LATIN_EXTENDED_C,
    COPTIC,
    GEORGIAN_SUPPLEMENT,
    TIFINAGH,
    ETHIOPIC_EXTENDED,
    CYRILLIC_EXTENDED_A,
    SUPPLEMENTAL_PUNCTUATION,
    CJK_RADICALS_SUPPLEMENT,
    KANGXI_RADICALS,
    IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
    CJK_SYMBOLS_AND_PUNCTUATION,
    HIRAGANA,
    KATAKANA,
    BOPOMOFO,
    HANGUL_COMPATIBILITY_JAMO,
    KANBUN,
    BOPOMOFO_EXTENDED,
    CJK_STROKES,
    KATAKANA_PHONETIC_EXTENSIONS,
    ENCLOSED_CJK_LETTERS_AND_MONTHS,
    CJK_COMPATIBILITY,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
    YIJING_HEXAGRAM_SYMBOLS,
    CJK_UNIFIED_IDEOGRAPHS,
    YI_SYLLABLES,
    YI_RADICALS,
    LISU,
    VAI,
    CYRILLIC_EXTENDED_B,
    BAMUM,
    MODIFIER_TONE_LETTERS,
    LATIN_EXTENDED_D,
    SYLOTI_NAGRI,
    COMMON_INDIC_NUMBER_FORMS,
    PHAGS_PA,
    SAURASHTRA,
    DEVANAGARI_EXTENDED,
    KAYAH_LI,
    REJANG,
    HANGUL_JAMO_EXTENDED_A,
    JAVANESE,
    MYANMAR_EXTENDED_B,
    CHAM,
    MYANMAR_EXTENDED_A,
    TAI_VIET,
    MEETEI_MAYEK_EXTENSIONS,
    ETHIOPIC_EXTENDED_A,
    LATIN_EXTENDED_E,
    CHEROKEE_SUPPLEMENT,
    MEETEI_MAYEK,
    HANGUL_SYLLABLES,
    HANGUL_JAMO_EXTENDED_B,
    HIGH_SURROGATES,
    HIGH_PRIVATE_USE_SURROGATES,
    LOW_SURROGATES,
    PRIVATE_USE_AREA,
    CJK_COMPATIBILITY_IDEOGRAPHS,
    ALPHABETIC_PRESENTATION_FORMS,
    ARABIC_PRESENTATION_FORMS_A,
    VARIATION_SELECTORS,
    VERTICAL_FORMS,
    COMBINING_HALF_MARKS,
    CJK_COMPATIBILITY_FORMS,
    SMALL_FORM_VARIANTS,
    ARABIC_PRESENTATION_FORMS_B,
    HALFWIDTH_AND_FULLWIDTH_FORMS,
    SPECIALS,
    LINEAR_B_SYLLABARY,
    LINEAR_B_IDEOGRAMS,
    AEGEAN_NUMBERS,
    ANCIENT_GREEK_NUMBERS,
    ANCIENT_SYMBOLS,
    PHAISTOS_DISC,
    LYCIAN,
    CARIAN,
    COPTIC_EPACT_NUMBERS,
    OLD_ITALIC,
    GOTHIC,
    OLD_PERMIC,
    UGARITIC,
    OLD_PERSIAN,
    DESERET,
    SHAVIAN,
    OSMANYA,
    OSAGE,
    ELBASAN,
    CAUCASIAN_ALBANIAN,
    VITHKUQI,
    LINEAR_A,
    LATIN_EXTENDED_F,
    CYPRIOT_SYLLABARY,
    IMPERIAL_ARAMAIC,
    PALMYRENE,
    NABATAEAN,
    HATRAN,
    PHOENICIAN,
    LYDIAN,
    MEROITIC_HIEROGLYPHS,
    MEROITIC_CURSIVE,
    KHAROSHTHI,
    OLD_SOUTH_ARABIAN,
    OLD_NORTH_ARABIAN,
    MANICHAEAN,
    AVESTAN,
    INSCRIPTIONAL_PARTHIAN,
    INSCRIPTIONAL_PAHLAVI,
    PSALTER_PAHLAVI,
    OLD_TURKIC,
    OLD_HUNGARIAN,
    HANIFI_ROHINGYA,
    RUMI_NUMERAL_SYMBOLS,
    YEZIDI,
    ARABIC_EXTENDED_C,
    OLD_SOGDIAN,
    SOGDIAN,
    OLD_UYGHUR,
    CHORASMIAN,
    ELYMAIC,
    BRAHMI,
    KAITHI,
    SORA_SOMPENG,
    CHAKMA,
    MAHAJANI,
    SHARADA,
    SINHALA_ARCHAIC_NUMBERS,
    KHOJKI,
    MULTANI,
    KHUDAWADI,
    GRANTHA,
    NEWA,
    TIRHUTA,
    SIDDHAM,
    MODI,
    MONGOLIAN_SUPPLEMENT,
    TAKRI,
    AHOM,
    DOGRA,
    WARANG_CITI,
    DIVES_AKURU,
    NANDINAGARI,
    ZANABAZAR_SQUARE,
    SOYOMBO,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A,
    PAU_CIN_HAU,
    DEVANAGARI_EXTENDED_A,
    BHAIKSUKI,
    MARCHEN,
    MASARAM_GONDI,
    GUNJALA_GONDI,
    MAKASAR,
    KAWI,
    LISU_SUPPLEMENT,
    TAMIL_SUPPLEMENT,
    CUNEIFORM,
    CUNEIFORM_NUMBERS_AND_PUNCTUATION,
    EARLY_DYNASTIC_CUNEIFORM,
    CYPRO_MINOAN,
    EGYPTIAN_HIEROGLYPHS,
    EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS,
    ANATOLIAN_HIEROGLYPHS,
    BAMUM_SUPPLEMENT,
    MRO,
    TANGSA,
    BASSA_VAH,
    PAHAWH_HMONG,
    MEDEFAIDRIN,
    MIAO,
    IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION,
    TANGUT,
    TANGUT_COMPONENTS,
    KHITAN_SMALL_SCRIPT,
    TANGUT_SUPPLEMENT,
    KANA_EXTENDED_B,
    KANA_SUPPLEMENT,
    KANA_EXTENDED_A,
    SMALL_KANA_EXTENSION,
    NUSHU,
    DUPLOYAN,
    SHORTHAND_FORMAT_CONTROLS,
    ZNAMENNY_MUSICAL_NOTATION,
    BYZANTINE_MUSICAL_SYMBOLS,
    MUSICAL_SYMBOLS,
    ANCIENT_GREEK_MUSICAL_NOTATION,
    KAKTOVIK_NUMERALS,
    MAYAN_NUMERALS,
    TAI_XUAN_JING_SYMBOLS,
    COUNTING_ROD_NUMERALS,
    MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
    SUTTON_SIGNWRITING,
    LATIN_EXTENDED_G,
    GLAGOLITIC_SUPPLEMENT,
    CYRILLIC_EXTENDED_D,
    NYIAKENG_PUACHUE_HMONG,
    TOTO,
    WANCHO,
    NAG_MUNDARI,
    ETHIOPIC_EXTENDED_B,
    MENDE_KIKAKUI,
    ADLAM,
    INDIC_SIYAQ_NUMBERS,
    OTTOMAN_SIYAQ_NUMBERS,
    ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS,
    MAHJONG_TILES,
    DOMINO_TILES,
    PLAYING_CARDS,
    ENCLOSED_ALPHANUMERIC_SUPPLEMENT,
    ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
    MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,
    EMOTICONS,
    ORNAMENTAL_DINGBATS,
    TRANSPORT_AND_MAP_SYMBOLS,
    ALCHEMICAL_SYMBOLS,
    GEOMETRIC_SHAPES_EXTENDED,
    SUPPLEMENTAL_ARROWS_C,
    SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS,
    CHESS_SYMBOLS,
    SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A,
    SYMBOLS_FOR_LEGACY_COMPUTING,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
    CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H,
    TAGS,
    VARIATION_SELECTORS_SUPPLEMENT,
    SUPPLEMENTARY_PRIVATE_USE_AREA_A,
    SUPPLEMENTARY_PRIVATE_USE_AREA_B,
];
@magiclen
Copy link
Owner

Thank you for the news. I have updated the list of code blocks.

Thanks also for the script; I will take a look at it and add an array containing all blocks in the near future.

@SKalt
Copy link
Author

SKalt commented Feb 20, 2024

Excellent, thank you! 7015958 resolves this issue.

@SKalt SKalt closed this as completed Feb 20, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants