Skip to content

Commit

Permalink
syntax: add emoji and Grapheme_Cluster_Break properties
Browse files Browse the repository at this point in the history
This commit adds several emoji properties such as Emoji and
Extended_Pictographic. We also add support for the Grapheme_Cluster_Break
enumeration property.
  • Loading branch information
BurntSushi committed Nov 21, 2018
1 parent 770edd5 commit 4a970b2
Show file tree
Hide file tree
Showing 8 changed files with 2,529 additions and 1,722 deletions.
7 changes: 7 additions & 0 deletions UNICODE.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,15 @@ properties correspond to properties required by RL1.2):
* `Default_Ignorable_Code_Point` \*
* `Deprecated`
* `Diacritic`
* `Emoji`
* `Emoji_Presentation`
* `Emoji_Modifier`
* `Emoji_Modifier_Base`
* `Emoji_Component`
* `Extended_Pictographic`
* `Extender`
* `Grapheme_Base`
* `Grapheme_Cluster_Break`
* `Grapheme_Extend`
* `Hex_Digit`
* `IDS_Binary_Operator`
Expand Down
6 changes: 6 additions & 0 deletions regex-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use hir;
use unicode_tables::age;
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
use unicode_tables::general_category;
use unicode_tables::grapheme_cluster_break;
use unicode_tables::property_bool;
use unicode_tables::property_names::PROPERTY_NAMES;
use unicode_tables::property_values::PROPERTY_VALUES;
Expand Down Expand Up @@ -250,6 +251,11 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
ByValue { property_name: "Grapheme_Cluster_Break", property_value } => {
property_set(grapheme_cluster_break::BY_NAME, property_value)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
_ => {
// What else should we support?
Err(Error::PropertyNotFound)
Expand Down
455 changes: 455 additions & 0 deletions regex-syntax/src/unicode_tables/grapheme_cluster_break.rs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions regex-syntax/src/unicode_tables/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod age;
pub mod case_folding_simple;
pub mod general_category;
pub mod grapheme_cluster_break;
pub mod perl_word;
pub mod property_bool;
pub mod property_names;
Expand Down
3,736 changes: 2,019 additions & 1,717 deletions regex-syntax/src/unicode_tables/property_bool.rs

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions regex-syntax/src/unicode_tables/property_names.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names tmp/ucd-11.0.0/
// ucd-generate property-names /home/andrew/tmp/ucd-11.0.0/
//
// ucd-generate is available on crates.io.

Expand Down Expand Up @@ -47,13 +47,17 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"),
("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"),
("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"), ("emoji", "Emoji"),
("emojicomponent", "Emoji_Component"), ("emojimodifier", "Emoji_Modifier"),
("emojimodifierbase", "Emoji_Modifier_Base"),
("emojipresentation", "Emoji_Presentation"),
("equideo", "Equivalent_Unified_Ideograph"),
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"),
("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"),
("ext", "Extender"), ("extender", "Extender"),
("fcnfkc", "FC_NFKC_Closure"), ("fcnfkcclosure", "FC_NFKC_Closure"),
("ext", "Extender"), ("extendedpictographic", "Extended_Pictographic"),
("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"),
("fcnfkcclosure", "FC_NFKC_Closure"),
("fullcompositionexclusion", "Full_Composition_Exclusion"),
("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"),
("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"),
Expand Down
13 changes: 12 additions & 1 deletion regex-syntax/src/unicode_tables/property_values.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-values tmp/ucd-11.0.0/ --include gc,script,scx,age
// ucd-generate property-values /home/andrew/tmp/ucd-11.0.0/ --include gc,script,scx,age,gcb
//
// ucd-generate is available on crates.io.

Expand Down Expand Up @@ -57,6 +57,17 @@ pub const PROPERTY_VALUES: &'static [(&'static str, &'static [(&'static str, &'s
("zl", "Line_Separator"), ("zp", "Paragraph_Separator"),
("zs", "Space_Separator"), ]),

("Grapheme_Cluster_Break", &[("cn", "Control"), ("control", "Control"),
("cr", "CR"), ("eb", "E_Base"), ("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"), ("ebg", "E_Base_GAZ"), ("em", "E_Modifier"),
("emodifier", "E_Modifier"), ("ex", "Extend"), ("extend", "Extend"),
("gaz", "Glue_After_Zwj"), ("glueafterzwj", "Glue_After_Zwj"), ("l", "L"),
("lf", "LF"), ("lv", "LV"), ("lvt", "LVT"), ("other", "Other"),
("pp", "Prepend"), ("prepend", "Prepend"),
("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"),
("sm", "SpacingMark"), ("spacingmark", "SpacingMark"), ("t", "T"),
("v", "V"), ("xx", "Other"), ("zwj", "ZWJ"), ]),

("Script", &[("adlam", "Adlam"), ("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"),
Expand Down
21 changes: 21 additions & 0 deletions tests/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,24 @@ mat!(uni_class_gencat_unassigned,
r"\p{Unassigned}", "\u{10FFFF}", Some((0, 4)));
mat!(uni_class_gencat_uppercase_letter,
r"\p{Uppercase_Letter}", "Ꝋ", Some((0, 3)));

// Test a smattering of the binary emoji properties. The pattern spellings
// deliberately vary in case (`Emoji` vs `emoji`) and drop the underscore
// (`extendedpictographic`), so more than one way of writing a property name
// is exercised.
// NOTE(review): the expected spans (0, 3) / (0, 4) presumably are byte offsets
// covering the single UTF-8 encoded code point in the haystack -- confirm
// against the `mat!` definition.
mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3)));
mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4)));
mat!(uni_class_prop_picto1,
r"\p{extendedpictographic}", "\u{1FA6E}", Some((0, 4)));
mat!(uni_class_prop_picto2,
r"\p{extendedpictographic}", "\u{1FFFD}", Some((0, 4)));

// Test the Grapheme_Cluster_Break enumeration property via the
// `property=value` form, using both the full property name and its `gcb`
// alias, and several spellings of the same value (`regional_indicator`, `ri`,
// `regionalindicator`).
mat!(uni_class_gcb_prepend,
r"\p{grapheme_cluster_break=prepend}", "\u{11D46}", Some((0, 4)));
mat!(uni_class_gcb_ri1,
r"\p{gcb=regional_indicator}", "\u{1F1E6}", Some((0, 4)));
mat!(uni_class_gcb_ri2,
r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
mat!(uni_class_gcb_ri3,
r"\p{gcb=regionalindicator}", "\u{1F1FF}", Some((0, 4)));
mat!(uni_class_gcb_lvt,
r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
mat!(uni_class_gcb_zwj,
r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));

0 comments on commit 4a970b2

Please sign in to comment.