From e96ec2eb6c099b8439570d6ae8cc4343d0bc59f1 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Fri, 7 Jun 2024 21:35:47 -0400 Subject: [PATCH] Use stdlib alphabetic and numeric character tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …as long as the Unicode versions match --- scripts/unicode.py | 28 ++++++++++++++++++++-------- src/tables.rs | 26 ++++++++++++++++++-------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 293c03c..8815b7b 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -232,19 +232,27 @@ def emit_util_mod(f): #[inline] fn is_alphabetic(c: char) -> bool { - match c { - 'a' ..= 'z' | 'A' ..= 'Z' => true, - c if c > '\x7f' => super::derived_property::Alphabetic(c), - _ => false, + if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION { + c.is_alphabetic() + } else { + match c { + 'a' ..= 'z' | 'A' ..= 'Z' => true, + c if c > '\\x7f' => super::derived_property::Alphabetic(c), + _ => false, + } } } #[inline] fn is_numeric(c: char) -> bool { - match c { - '0' ..= '9' => true, - c if c > '\x7f' => super::general_category::N(c), - _ => false, + if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION { + c.is_numeric() + } else { + match c { + '0' ..= '9' => true, + c if c > '\\x7f' => super::general_category::N(c), + _ => false, + } } } @@ -388,6 +396,10 @@ def emit_break_module(f, break_table, break_cats, name): /// The version of [Unicode](http://www.unicode.org/) /// that this version of unicode-segmentation is based on. pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); +""" % UNICODE_VERSION) + + rf.write(""" +const UNICODE_VERSION_U8: (u8, u8, u8) = (%s, %s, %s); """ % UNICODE_VERSION) # download and parse all the data diff --git a/src/tables.rs b/src/tables.rs index 5bb5605..86a7cea 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -16,6 +16,8 @@ /// that this version of unicode-segmentation is based on. pub const UNICODE_VERSION: (u64, u64, u64) = (15, 1, 0); +const UNICODE_VERSION_U8: (u8, u8, u8) = (15, 1, 0); + pub mod util { #[inline] pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool { @@ -29,19 +31,27 @@ pub mod util { #[inline] fn is_alphabetic(c: char) -> bool { - match c { - 'a' ..= 'z' | 'A' ..= 'Z' => true, - c if c > '' => super::derived_property::Alphabetic(c), - _ => false, + if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION { + c.is_alphabetic() + } else { + match c { + 'a' ..= 'z' | 'A' ..= 'Z' => true, + c if c > '\x7f' => super::derived_property::Alphabetic(c), + _ => false, + } } } #[inline] fn is_numeric(c: char) -> bool { - match c { - '0' ..= '9' => true, - c if c > '' => super::general_category::N(c), - _ => false, + if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION { + c.is_numeric() + } else { + match c { + '0' ..= '9' => true, + c if c > '\x7f' => super::general_category::N(c), + _ => false, + } } }