Skip to content

Commit

Permalink
syntax: add emoji and break properties
Browse files Browse the repository at this point in the history
This commit adds several emoji properties such as Emoji and
Extended_Pictographic. We also add support for the Grapheme_Cluster_Break,
Word_Break and Sentence_Break enumeration properties.
  • Loading branch information
BurntSushi committed Dec 1, 2018
1 parent 770edd5 commit ecc1a5a
Show file tree
Hide file tree
Showing 10 changed files with 3,580 additions and 1,722 deletions.
9 changes: 9 additions & 0 deletions UNICODE.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,15 @@ properties correspond to properties required by RL1.2):
* `Default_Ignorable_Code_Point` \*
* `Deprecated`
* `Diacritic`
* `Emoji`
* `Emoji_Presentation`
* `Emoji_Modifier`
* `Emoji_Modifier_Base`
* `Emoji_Component`
* `Extended_Pictographic`
* `Extender`
* `Grapheme_Base`
* `Grapheme_Cluster_Break`
* `Grapheme_Extend`
* `Hex_Digit`
* `IDS_Binary_Operator`
Expand All @@ -127,13 +134,15 @@ properties correspond to properties required by RL1.2):
* `Quotation_Mark`
* `Radical`
* `Regional_Indicator`
* `Sentence_Break`
* `Sentence_Terminal`
* `Soft_Dotted`
* `Terminal_Punctuation`
* `Unified_Ideograph`
* `Uppercase` \*
* `Variation_Selector`
* `White_Space` \*
* `Word_Break`
* `XID_Continue`
* `XID_Start`

Expand Down
18 changes: 18 additions & 0 deletions regex-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@ use hir;
use unicode_tables::age;
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
use unicode_tables::general_category;
use unicode_tables::grapheme_cluster_break;
use unicode_tables::property_bool;
use unicode_tables::property_names::PROPERTY_NAMES;
use unicode_tables::property_values::PROPERTY_VALUES;
use unicode_tables::script;
use unicode_tables::script_extension;
use unicode_tables::sentence_break;
use unicode_tables::word_break;

type Result<T> = result::Result<T, Error>;

Expand Down Expand Up @@ -250,6 +253,21 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
ByValue { property_name: "Grapheme_Cluster_Break", property_value } => {
property_set(grapheme_cluster_break::BY_NAME, property_value)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
ByValue { property_name: "Sentence_Break", property_value } => {
property_set(sentence_break::BY_NAME, property_value)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
ByValue { property_name: "Word_Break", property_value } => {
property_set(word_break::BY_NAME, property_value)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
_ => {
// What else should we support?
Err(Error::PropertyNotFound)
Expand Down
455 changes: 455 additions & 0 deletions regex-syntax/src/unicode_tables/grapheme_cluster_break.rs

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions regex-syntax/src/unicode_tables/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
// Unicode data tables used by the regex-syntax Unicode support, one
// submodule per table.
//
// NOTE(review): `property_names` and `property_values` carry a
// "DO NOT EDIT ... AUTOMATICALLY GENERATED" header naming `ucd-generate`;
// presumably the other tables are generated the same way. Confirm before
// editing any of them by hand.
pub mod age;
// Exposes `CASE_FOLDING_SIMPLE`.
pub mod case_folding_simple;
pub mod general_category;
// Exposes `BY_NAME`, which `unicode.rs` uses to resolve
// `Grapheme_Cluster_Break` property values.
pub mod grapheme_cluster_break;
pub mod perl_word;
pub mod property_bool;
// Exposes `PROPERTY_NAMES`: (normalized alias, canonical property name) pairs.
pub mod property_names;
// Exposes `PROPERTY_VALUES`: for each canonical property name, a list of
// (normalized alias, canonical value name) pairs.
pub mod property_values;
pub mod script_extension;
pub mod script;
// Exposes `BY_NAME`, which `unicode.rs` uses to resolve `Sentence_Break`
// property values.
pub mod sentence_break;
// Exposes `BY_NAME`, which `unicode.rs` uses to resolve `Word_Break`
// property values.
pub mod word_break;
3,736 changes: 2,019 additions & 1,717 deletions regex-syntax/src/unicode_tables/property_bool.rs

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions regex-syntax/src/unicode_tables/property_names.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names tmp/ucd-11.0.0/
// ucd-generate property-names /home/andrew/tmp/ucd-11.0.0/
//
// ucd-generate is available on crates.io.

Expand Down Expand Up @@ -47,13 +47,17 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"),
("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"),
("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"), ("emoji", "Emoji"),
("emojicomponent", "Emoji_Component"), ("emojimodifier", "Emoji_Modifier"),
("emojimodifierbase", "Emoji_Modifier_Base"),
("emojipresentation", "Emoji_Presentation"),
("equideo", "Equivalent_Unified_Ideograph"),
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"),
("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"),
("ext", "Extender"), ("extender", "Extender"),
("fcnfkc", "FC_NFKC_Closure"), ("fcnfkcclosure", "FC_NFKC_Closure"),
("ext", "Extender"), ("extendedpictographic", "Extended_Pictographic"),
("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"),
("fcnfkcclosure", "FC_NFKC_Closure"),
("fullcompositionexclusion", "Full_Composition_Exclusion"),
("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"),
("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"),
Expand Down
39 changes: 38 additions & 1 deletion regex-syntax/src/unicode_tables/property_values.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-values tmp/ucd-11.0.0/ --include gc,script,scx,age
// ucd-generate property-values /home/andrew/tmp/ucd-11.0.0/ --include gc,script,scx,age,gcb,wb,sb
//
// ucd-generate is available on crates.io.

Expand Down Expand Up @@ -57,6 +57,17 @@ pub const PROPERTY_VALUES: &'static [(&'static str, &'static [(&'static str, &'s
("zl", "Line_Separator"), ("zp", "Paragraph_Separator"),
("zs", "Space_Separator"), ]),

("Grapheme_Cluster_Break", &[("cn", "Control"), ("control", "Control"),
("cr", "CR"), ("eb", "E_Base"), ("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"), ("ebg", "E_Base_GAZ"), ("em", "E_Modifier"),
("emodifier", "E_Modifier"), ("ex", "Extend"), ("extend", "Extend"),
("gaz", "Glue_After_Zwj"), ("glueafterzwj", "Glue_After_Zwj"), ("l", "L"),
("lf", "LF"), ("lv", "LV"), ("lvt", "LVT"), ("other", "Other"),
("pp", "Prepend"), ("prepend", "Prepend"),
("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"),
("sm", "SpacingMark"), ("spacingmark", "SpacingMark"), ("t", "T"),
("v", "V"), ("xx", "Other"), ("zwj", "ZWJ"), ]),

("Script", &[("adlam", "Adlam"), ("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"),
Expand Down Expand Up @@ -286,4 +297,30 @@ pub const PROPERTY_VALUES: &'static [(&'static str, &'static [(&'static str, &'s
("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"),
("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"),
("zzzz", "Unknown"), ]),

("Sentence_Break", &[("at", "ATerm"), ("aterm", "ATerm"), ("cl", "Close"),
("close", "Close"), ("cr", "CR"), ("ex", "Extend"), ("extend", "Extend"),
("fo", "Format"), ("format", "Format"), ("le", "OLetter"), ("lf", "LF"),
("lo", "Lower"), ("lower", "Lower"), ("nu", "Numeric"),
("numeric", "Numeric"), ("oletter", "OLetter"), ("other", "Other"),
("sc", "SContinue"), ("scontinue", "SContinue"), ("se", "Sep"),
("sep", "Sep"), ("sp", "Sp"), ("st", "STerm"), ("sterm", "STerm"),
("up", "Upper"), ("upper", "Upper"), ("xx", "Other"), ]),

("Word_Break", &[("aletter", "ALetter"), ("cr", "CR"),
("doublequote", "Double_Quote"), ("dq", "Double_Quote"), ("eb", "E_Base"),
("ebase", "E_Base"), ("ebasegaz", "E_Base_GAZ"), ("ebg", "E_Base_GAZ"),
("em", "E_Modifier"), ("emodifier", "E_Modifier"), ("ex", "ExtendNumLet"),
("extend", "Extend"), ("extendnumlet", "ExtendNumLet"), ("fo", "Format"),
("format", "Format"), ("gaz", "Glue_After_Zwj"),
("glueafterzwj", "Glue_After_Zwj"), ("hebrewletter", "Hebrew_Letter"),
("hl", "Hebrew_Letter"), ("ka", "Katakana"), ("katakana", "Katakana"),
("le", "ALetter"), ("lf", "LF"), ("mb", "MidNumLet"),
("midletter", "MidLetter"), ("midnum", "MidNum"),
("midnumlet", "MidNumLet"), ("ml", "MidLetter"), ("mn", "MidNum"),
("newline", "Newline"), ("nl", "Newline"), ("nu", "Numeric"),
("numeric", "Numeric"), ("other", "Other"),
("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"),
("singlequote", "Single_Quote"), ("sq", "Single_Quote"),
("wsegspace", "WSegSpace"), ("xx", "Other"), ("zwj", "ZWJ"), ]),
];
Loading

0 comments on commit ecc1a5a

Please sign in to comment.