From c18abfae844ea2b1c7643bd213599d419b578250 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 18 Feb 2017 23:36:00 -0500
Subject: [PATCH] syntax: rewrite the regex-syntax crate

This commit represents a ground-up rewrite of the regex-syntax crate. This
commit is also an intermediate state. That is, it adds a new regex-syntax-2
crate without making any serious changes to any other code. Subsequent commits
will cover the integration of the rewrite and the removal of the old crate.

The rewrite is intended to be the first phase in an effort to overhaul the
entire regex crate. To that end, this rewrite takes steps in that direction:

* The principal change in the public API is an explicit split between a
  regular expression's abstract syntax tree (AST) and a high-level
  intermediate representation (HIR) that is easier to analyze. The old version
  of this crate mixes these two concepts, but leans heavily toward an HIR. The
  AST in the rewrite has a much closer correspondence with the concrete syntax
  than the old `Expr` type does. The new HIR embraces its role: all flags are
  now compiled away (including the `i` flag), which will simplify subsequent
  passes, including literal detection and the compiler. ASTs are produced by
  ast::parse and HIR is produced by hir::translate. A top-level parser is
  provided that combines these so that callers can skip straight from concrete
  syntax to HIR.

* Error messages are vastly improved thanks to the span information that is
  now embedded in the AST. In addition to better formatting, error messages
  now also include helpful hints when trying to use features that aren't
  supported (like backreferences and look-around). In particular, octal
  support is now opt-in. (Octal support will continue to be enabled in regex
  proper for backwards compatibility, but will be disabled in 1.0.)

* More robust support for Unicode Level 1 as described in UTS#18. In
  particular, we now fully support Unicode character classes, including set
  notation (difference, intersection, symmetric difference) and correct
  support for named general categories, scripts, script extensions and age.
  That is, `\p{scx:Hira}` and `\p{age:3.0}` now work. To make this work, we
  introduce an internal interval set data structure.

* With the exception of literal extraction (which will be overhauled in a
  later phase), all code in the rewrite uses constant stack space, even while
  performing analysis that requires structural induction over the AST or HIR.
  This is done by pushing the call stack onto the heap, and is abstracted by
  the `ast::Visitor` and `hir::Visitor` traits. The point of this approach is
  to eliminate stack overflows in the general case.

* Empty sub-expressions are now properly supported. Expressions like `()`,
  `|`, `a|` and `b|()+` are now valid syntax.

The principal downsides of these changes are parse time and binary size. Both
appear to have increased (slower and bigger) by about 1.5x. Parse time is
generally peanuts compared to the compiler, so we mostly don't care about
that. Binary size is mildly unfortunate, and if it becomes a serious issue, it
should be possible to introduce a feature that disables some level of Unicode
support and/or work on compressing the Unicode tables. Compile times have
increased slightly, but are still a very small fraction of the overall time it
takes to compile `regex`.
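As a rough illustration of that pipeline, built only from the APIs this patch
adds (and mirroring the `regex-debug` commands below; exact re-exports may
shift during integration):

    // Sketch only: concrete syntax -> AST -> HIR with the new crate.
    extern crate regex_syntax2;

    use regex_syntax2::{ast, Parser};

    fn main() {
        // Concrete syntax -> AST. Every AST node carries a span pointing
        // back into the original pattern, which is what powers the
        // improved error messages.
        let ast = ast::parse::Parser::new().parse(r"(?i)a|b").unwrap();
        println!("{:#?}", ast);

        // Concrete syntax -> HIR via the top-level parser. Flags such as
        // `(?i)` are compiled away during translation to the HIR.
        let hir = Parser::new().parse(r"(?i)a|b").unwrap();
        println!("{:#?}", hir);
    }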
Fixes #174, Fixes #424 --- regex-debug/Cargo.toml | 1 + regex-debug/src/main.rs | 32 +- regex-syntax-2/Cargo.toml | 13 + regex-syntax-2/benches/bench.rs | 73 + regex-syntax-2/src/ast/mod.rs | 1507 +++++ regex-syntax-2/src/ast/parse.rs | 5257 +++++++++++++++++ regex-syntax-2/src/ast/print.rs | 591 ++ regex-syntax-2/src/ast/visitor.rs | 557 ++ regex-syntax-2/src/either.rs | 8 + regex-syntax-2/src/error.rs | 278 + regex-syntax-2/src/hir/interval.rs | 490 ++ regex-syntax-2/src/hir/literal/mod.rs | 1553 +++++ regex-syntax-2/src/hir/mod.rs | 2040 +++++++ regex-syntax-2/src/hir/translate.rs | 2506 ++++++++ regex-syntax-2/src/hir/visitor.rs | 222 + regex-syntax-2/src/lib.rs | 222 + regex-syntax-2/src/parser.rs | 201 + regex-syntax-2/src/unicode.rs | 462 ++ regex-syntax-2/src/unicode_tables/age.rs | 424 ++ .../src/unicode_tables/case_folding_simple.rs | 662 +++ .../src/unicode_tables/general_category.rs | 1844 ++++++ regex-syntax-2/src/unicode_tables/mod.rs | 9 + .../src/unicode_tables/perl_word.rs | 179 + .../src/unicode_tables/property_bool.rs | 2576 ++++++++ .../src/unicode_tables/property_names.rs | 146 + .../src/unicode_tables/property_values.rs | 277 + regex-syntax-2/src/unicode_tables/script.rs | 765 +++ .../src/unicode_tables/script_extension.rs | 785 +++ regex-syntax/benches/bench.rs | 65 + 29 files changed, 23744 insertions(+), 1 deletion(-) create mode 100644 regex-syntax-2/Cargo.toml create mode 100644 regex-syntax-2/benches/bench.rs create mode 100644 regex-syntax-2/src/ast/mod.rs create mode 100644 regex-syntax-2/src/ast/parse.rs create mode 100644 regex-syntax-2/src/ast/print.rs create mode 100644 regex-syntax-2/src/ast/visitor.rs create mode 100644 regex-syntax-2/src/either.rs create mode 100644 regex-syntax-2/src/error.rs create mode 100644 regex-syntax-2/src/hir/interval.rs create mode 100644 regex-syntax-2/src/hir/literal/mod.rs create mode 100644 regex-syntax-2/src/hir/mod.rs create mode 100644 regex-syntax-2/src/hir/translate.rs create mode 100644 regex-syntax-2/src/hir/visitor.rs create mode 100644 regex-syntax-2/src/lib.rs create mode 100644 regex-syntax-2/src/parser.rs create mode 100644 regex-syntax-2/src/unicode.rs create mode 100644 regex-syntax-2/src/unicode_tables/age.rs create mode 100644 regex-syntax-2/src/unicode_tables/case_folding_simple.rs create mode 100644 regex-syntax-2/src/unicode_tables/general_category.rs create mode 100644 regex-syntax-2/src/unicode_tables/mod.rs create mode 100644 regex-syntax-2/src/unicode_tables/perl_word.rs create mode 100644 regex-syntax-2/src/unicode_tables/property_bool.rs create mode 100644 regex-syntax-2/src/unicode_tables/property_names.rs create mode 100644 regex-syntax-2/src/unicode_tables/property_values.rs create mode 100644 regex-syntax-2/src/unicode_tables/script.rs create mode 100644 regex-syntax-2/src/unicode_tables/script_extension.rs create mode 100644 regex-syntax/benches/bench.rs diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index 54b2bb511d..1291448bbb 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -14,5 +14,6 @@ workspace = ".." docopt = "0.8" regex = { version = "0.2", path = ".." 
} regex-syntax = { version = "0.4.0", path = "../regex-syntax" } +regex-syntax2 = { version = "0.5.0", path = "../regex-syntax-2" } serde = "1" serde_derive = "1" diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs index f31dc22a9c..97f628d542 100644 --- a/regex-debug/src/main.rs +++ b/regex-debug/src/main.rs @@ -1,6 +1,7 @@ extern crate docopt; extern crate regex; extern crate regex_syntax as syntax; +extern crate regex_syntax2; extern crate serde; #[macro_use] extern crate serde_derive; @@ -17,6 +18,8 @@ use syntax::{ExprBuilder, Expr, Literals}; const USAGE: &'static str = " Usage: regex-debug [options] ast + regex-debug [options] ast2 + regex-debug [options] hir2 regex-debug [options] prefixes ... regex-debug [options] suffixes ... regex-debug [options] anchors @@ -51,6 +54,8 @@ Options: #[derive(Deserialize)] struct Args { cmd_ast: bool, + cmd_ast2: bool, + cmd_hir2: bool, cmd_prefixes: bool, cmd_suffixes: bool, cmd_anchors: bool, @@ -93,6 +98,10 @@ fn main() { fn run(args: &Args) -> Result<()> { if args.cmd_ast { cmd_ast(args) + } else if args.cmd_ast2 { + cmd_ast2(args) + } else if args.cmd_hir2 { + cmd_hir2(args) } else if args.cmd_prefixes { cmd_literals(args) } else if args.cmd_suffixes { @@ -109,7 +118,28 @@ fn run(args: &Args) -> Result<()> { } fn cmd_ast(args: &Args) -> Result<()> { - println!("{:#?}", try!(args.parse_one())); + let ast = try!(args.parse_one()); + println!("{:#?}", ast); + Ok(()) +} + +fn cmd_ast2(args: &Args) -> Result<()> { + use regex_syntax2::ast::parse::Parser; + + let mut parser = Parser::new(); + let ast = try!(parser.parse(&args.arg_pattern)); + println!("{:#?}", ast); + Ok(()) +} + +fn cmd_hir2(args: &Args) -> Result<()> { + use regex_syntax2::ParserBuilder; + + let mut parser = ParserBuilder::new() + .allow_invalid_utf8(false) + .build(); + let hir = try!(parser.parse(&args.arg_pattern)); + println!("{:#?}", hir); Ok(()) } diff --git a/regex-syntax-2/Cargo.toml b/regex-syntax-2/Cargo.toml new file mode 100644 index 0000000000..baf5b70fbc --- /dev/null +++ b/regex-syntax-2/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "regex-syntax2" +version = "0.5.0" #:version +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex-syntax" +homepage = "https://github.com/rust-lang/regex" +description = "A regular expression parser." +workspace = ".." + +[dependencies] +ucd-util = "0.1.0" diff --git a/regex-syntax-2/benches/bench.rs b/regex-syntax-2/benches/bench.rs new file mode 100644 index 0000000000..d1eeedd91d --- /dev/null +++ b/regex-syntax-2/benches/bench.rs @@ -0,0 +1,73 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#![feature(test)] + +extern crate regex_syntax2; +extern crate test; + +use regex_syntax2::Parser; +use test::Bencher; + +#[bench] +fn parse_simple1(b: &mut Bencher) { + b.iter(|| { + let re = r"^bc(d|e)*$"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_simple2(b: &mut Bencher) { + b.iter(|| { + let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_small1(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}|\p{N}|\s|.|\d"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium1(b: &mut Bencher) { + b.iter(|| { + let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium2(b: &mut Bencher) { + b.iter(|| { + let re = r"\s\S\w\W\d\D"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium3(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_huge(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}{100}"; + Parser::new().parse(re).unwrap() + }); +} diff --git a/regex-syntax-2/src/ast/mod.rs b/regex-syntax-2/src/ast/mod.rs new file mode 100644 index 0000000000..ad63a1b491 --- /dev/null +++ b/regex-syntax-2/src/ast/mod.rs @@ -0,0 +1,1507 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines an abstract syntax for regular expressions. +*/ + +use std::cmp::Ordering; +use std::error; +use std::fmt; + +pub use ast::visitor::{Visitor, visit}; + +pub mod parse; +pub mod print; +mod visitor; + +/// An error that occurred while parsing a regular expression into an abstract +/// syntax tree. +/// +/// Note that note all ASTs represents a valid regular expression. For example, +/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a +/// valid Unicode property name. That particular error is reported when +/// translating an AST to the high-level intermediate representation (`HIR`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the parser generated the error from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } + + /// Return an auxiliary span. This span exists only for some errors that + /// benefit from being able to point to two locations in the original + /// regular expression. For example, "duplicate" errors will have the + /// main error position set to the duplicate occurrence while its + /// auxiliary span will be set to the initial occurrence. 
+ pub fn auxiliary_span(&self) -> Option<&Span> { + use self::ErrorKind::*; + match self.kind { + FlagDuplicate { ref original } => Some(original), + FlagRepeatedNegation { ref original, .. } => Some(original), + GroupNameDuplicate { ref original, .. } => Some(original), + _ => None, + } + } +} + +/// The type of an error that occurred while building an AST. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// The capturing group limit was exceeded. + /// + /// Note that this represents a limit on the total number of capturing + /// groups in a regex and not necessarily the number of nested capturing + /// groups. That is, the nest limit can be low and it is still possible for + /// this error to occur. + CaptureLimitExceeded, + /// An invalid escape sequence was found in a character class set. + ClassEscapeInvalid, + /// An invalid character class range was found. An invalid range is any + /// range where the start is greater than the end. + ClassRangeInvalid, + /// An opening `[` was found with no corresponding closing `]`. + ClassUnclosed, + /// An empty decimal number was given where one was expected. + DecimalEmpty, + /// An invalid decimal number was given where one was expected. + DecimalInvalid, + /// A bracketed hex literal was empty. + EscapeHexEmpty, + /// A bracketed hex literal did not correspond to a Unicode scalar value. + EscapeHexInvalid, + /// An invalid hexadecimal digit was found. + EscapeHexInvalidDigit, + /// EOF was found before an escape sequence was completed. + EscapeUnexpectedEof, + /// An unrecognized escape sequence. + EscapeUnrecognized, + /// A dangling negation was used when setting flags, e.g., `i-`. + FlagDanglingNegation, + /// A flag was used twice, e.g., `i-i`. + FlagDuplicate { + /// The position of the original flag. The error position + /// points to the duplicate flag. + original: Span, + }, + /// The negation operator was used twice, e.g., `-i-s`. + FlagRepeatedNegation { + /// The position of the original negation operator. The error position + /// points to the duplicate negation operator. + original: Span, + }, + /// Expected a flag but got EOF, e.g., `(?`. + FlagUnexpectedEof, + /// Unrecognized flag, e.g., `a`. + FlagUnrecognized, + /// A duplicate capture name was found. + GroupNameDuplicate { + /// The position of the initial occurrence of the capture name. The + /// error position itself points to the duplicate occurrence. + original: Span, + }, + /// A capture group name is empty, e.g., `(?P<>abc)`. + GroupNameEmpty, + /// An invalid character was seen for a capture group name. This includes + /// errors where the first character is a digit (even though subsequent + /// characters are allowed to be digits). + GroupNameInvalid, + /// A closing `>` could not be found for a capture group name. + GroupNameUnexpectedEof, + /// An unclosed group, e.g., `(ab`. + /// + /// The span of this error corresponds to the unclosed parenthesis. + GroupUnclosed, + /// An unopened group, e.g., `ab)`. + GroupUnopened, + /// The nest limit was exceeded. The limit stored here is the limit + /// configured in the parser. + NestLimitExceeded(u32), + /// The range provided in a counted repetition operator is invalid. The + /// range is invalid if the start is greater than the end. + RepetitionCountInvalid, + /// An opening `{` was found with no corresponding closing `}`. + RepetitionCountUnclosed, + /// A repetition operator was applied to a missing sub-expression. This + /// occurs, for example, in the regex consisting of just a `*`. 
It is, + /// however, possible to create a repetition operating on an empty + /// sub-expression. For example, `()*` is still considered valid. + RepetitionMissing, + /// When octal support is disabled, this error is produced when an octal + /// escape is used. The octal escape is assumed to be an invocation of + /// a backreference, which is the common case. + UnsupportedBackreference, + /// When syntax similar to PCRE's look-around is used, this error is + /// returned. Some example syntaxes that are rejected include, but are + /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and + /// `(? &str { + use self::ErrorKind::*; + match self.kind { + CaptureLimitExceeded => "capture group limit exceeded", + ClassEscapeInvalid => "invalid escape sequence in character class", + ClassRangeInvalid => "invalid character class range", + ClassUnclosed => "unclosed character class", + DecimalEmpty => "empty decimal literal", + DecimalInvalid => "invalid decimal literal", + EscapeHexEmpty => "empty hexadecimal literal", + EscapeHexInvalid => "invalid hexadecimal literal", + EscapeHexInvalidDigit => "invalid hexadecimal digit", + EscapeUnexpectedEof => "unexpected eof (escape sequence)", + EscapeUnrecognized => "unrecognized escape sequence", + FlagDanglingNegation => "dangling flag negation operator", + FlagDuplicate{..} => "duplicate flag", + FlagRepeatedNegation{..} => "repeated negation", + FlagUnexpectedEof => "unexpected eof (flag)", + FlagUnrecognized => "unrecognized flag", + GroupNameDuplicate{..} => "duplicate capture group name", + GroupNameEmpty => "empty capture group name", + GroupNameInvalid => "invalid capture group name", + GroupNameUnexpectedEof => "unclosed capture group name", + GroupUnclosed => "unclosed group", + GroupUnopened => "unopened group", + NestLimitExceeded(_) => "nest limit exceeded", + RepetitionCountInvalid => "invalid repetition count range", + RepetitionCountUnclosed => "unclosed counted repetition", + RepetitionMissing => "repetition operator missing expression", + UnsupportedBackreference => "backreferences are not supported", + UnsupportedLookAround => "look-around is not supported", + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::ErrorKind::*; + match *self { + CaptureLimitExceeded => { + write!(f, "exceeded the maximum number of \ + capturing groups ({})", ::std::u32::MAX) + } + ClassEscapeInvalid => { + write!(f, "invalid escape sequence found in character class") + } + ClassRangeInvalid => { + write!(f, "invalid character class range, \ + the start must be <= the end") + } + ClassUnclosed => { + write!(f, "unclosed character class") + } + DecimalEmpty => { + write!(f, "decimal literal empty") + } + DecimalInvalid => { + write!(f, "decimal literal invalid") + } + EscapeHexEmpty => { + write!(f, "hexadecimal literal empty") + } + EscapeHexInvalid => { + write!(f, "hexadecimal literal is not a Unicode scalar value") + } + EscapeHexInvalidDigit => { + write!(f, "invalid hexadecimal digit") + } + EscapeUnexpectedEof => { + write!(f, "incomplete escape sequence, \ + reached end of pattern prematurely") + } + EscapeUnrecognized => { + write!(f, "unrecognized escape sequence") + } + FlagDanglingNegation => { + write!(f, "dangling flag negation operator") + } + FlagDuplicate{..} => { + write!(f, "duplicate flag") + } + 
FlagRepeatedNegation{..} => { + write!(f, "flag negation operator repeated") + } + FlagUnexpectedEof => { + write!(f, "expected flag but got end of regex") + } + FlagUnrecognized => { + write!(f, "unrecognized flag") + } + GroupNameDuplicate{..} => { + write!(f, "duplicate capture group name") + } + GroupNameEmpty => { + write!(f, "empty capture group name") + } + GroupNameInvalid => { + write!(f, "invalid capture group character") + } + GroupNameUnexpectedEof => { + write!(f, "unclosed capture group name") + } + GroupUnclosed => { + write!(f, "unclosed group") + } + GroupUnopened => { + write!(f, "unopened group") + } + NestLimitExceeded(limit) => { + write!(f, "exceed the maximum number of \ + nested parentheses/brackets ({})", limit) + } + RepetitionCountInvalid => { + write!(f, "invalid repetition count range, \ + the start must be <= the end") + } + RepetitionCountUnclosed => { + write!(f, "unclosed counted repetition") + } + RepetitionMissing => { + write!(f, "repetition operator missing expression") + } + UnsupportedBackreference => { + write!(f, "backreferences are not supported") + } + UnsupportedLookAround => { + write!(f, "look-around, including look-ahead and look-behind, \ + is not supported") + } + _ => unreachable!(), + } + } +} + +/// Span represents the position information of a single AST item. +/// +/// All span positions are absolute byte offsets that can be used on the +/// original regular expression that was parsed. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Span { + /// The start byte offset. + pub start: Position, + /// The end byte offset. + pub end: Position, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Span({:?}, {:?})", self.start, self.end) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Span) -> Ordering { + (&self.start, &self.end).cmp(&(&other.start, &other.end)) + } +} + +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Span) -> Option { + Some(self.cmp(other)) + } +} + +/// A single position in a regular expression. +/// +/// A position encodes one half of a span, and include the byte offset, line +/// number and column number. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Position { + /// The absolute offset of this position, starting at `0` from the + /// beginning of the regular expression pattern string. + pub offset: usize, + /// The line number, starting at `1`. + pub line: usize, + /// The approximate column number, starting at `1`. + pub column: usize, +} + +impl fmt::Debug for Position { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Position(o: {:?}, l: {:?}, c: {:?})", + self.offset, self.line, self.column) + } +} + +impl Ord for Position { + fn cmp(&self, other: &Position) -> Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for Position { + fn partial_cmp(&self, other: &Position) -> Option { + Some(self.cmp(other)) + } +} + +impl Span { + /// Create a new span with the given positions. + pub fn new(start: Position, end: Position) -> Span { + Span { start: start, end: end } + } + + /// Create a new span using the given position as the start and end. + pub fn splat(pos: Position) -> Span { + Span::new(pos, pos) + } + + /// Create a new span by replacing the starting the position with the one + /// given. + pub fn with_start(self, pos: Position) -> Span { + Span { start: pos, ..self } + } + + /// Create a new span by replacing the ending the position with the one + /// given. 
+ pub fn with_end(self, pos: Position) -> Span { + Span { end: pos, ..self } + } + + /// Returns true if and only if this span occurs on a single line. + pub fn is_one_line(&self) -> bool { + self.start.line == self.end.line + } + + /// Returns true if and only if this span is empty. That is, it points to + /// a single position in the concrete syntax of a regular expression. + pub fn is_empty(&self) -> bool { + self.start.offset == self.end.offset + } +} + +impl Position { + /// Create a new position with the given information. + /// + /// `offset` is the absolute offset of the position, starting at `0` from + /// the beginning of the regular expression pattern string. + /// + /// `line` is the line number, starting at `1`. + /// + /// `column` is the approximate column number, starting at `1`. + pub fn new(offset: usize, line: usize, column: usize) -> Position { + Position { offset: offset, line: line, column: column } + } +} + +/// An abstract syntax tree for a singular expression along with comments +/// found. +/// +/// Comments are not stored in the tree itself to avoid complexity. Each +/// comment contains a span of precisely where it occurred in the original +/// regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct WithComments { + /// The actual ast. + pub ast: Ast, + /// All comments found in the original regular expression. + pub comments: Vec, +} + +/// A comment from a regular expression with an associated span. +/// +/// A regular expression can only contain comments when the `x` flag is +/// enabled. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Comment { + /// The span of this comment, including the beginning `#` and ending `\n`. + pub span: Span, + /// The comment text, starting with the first character following the `#` + /// and ending with the last character preceding the `\n`. + pub comment: String, +} + +/// An abstract syntax tree for a single regular expression. +/// +/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap +/// space proportional to the size of the `Ast`. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the `Ast`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Ast { + /// An empty regex that matches everything. + Empty(Span), + /// A set of flags, e.g., `(?is)`. + Flags(SetFlags), + /// A single character literal, which includes escape sequences. + Literal(Literal), + /// The "any character" class. + Dot(Span), + /// A single zero-width assertion. + Assertion(Assertion), + /// A single character class. This includes all forms of character classes + /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. + Class(Class), + /// A repetition operator applied to an arbitrary regular expression. + Repetition(Repetition), + /// A grouped regular expression. + Group(Group), + /// An alternation of regular expressions. + Alternation(Alternation), + /// A concatenation of regular expressions. + Concat(Concat), +} + +impl Ast { + /// Return the span of this abstract syntax tree. + pub fn span(&self) -> &Span { + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::Class(ref x) => x.span(), + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + } + } + + /// Return true if and only if this Ast is empty. 
+ pub fn is_empty(&self) -> bool { + match *self { + Ast::Empty(_) => true, + _ => false, + } + } + + /// Returns true if and only if this AST has any (including possibly empty) + /// subexpressions. + fn has_subexprs(&self) -> bool { + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) => false, + Ast::Class(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, + } + } +} + +/// Print a display representation of this Ast. +/// +/// This does not preserve any of the original whitespace formatting that may +/// have originally been present in the concrete syntax from which this Ast +/// was generated. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Ast`. +impl fmt::Display for Ast { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use ast::print::Printer; + Printer::new().print(self, f) + } +} + +/// An alternation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Alternation { + /// The span of this alternation. + pub span: Span, + /// The alternate regular expressions. + pub asts: Vec, +} + +impl Alternation { + /// Return this alternation as an AST. + /// + /// If this alternation contains zero ASTs, then Ast::Empty is + /// returned. If this alternation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Alternation(self), + } + } +} + +/// A concatenation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Concat { + /// The span of this concatenation. + pub span: Span, + /// The concatenation regular expressions. + pub asts: Vec, +} + +impl Concat { + /// Return this concatenation as an AST. + /// + /// If this concatenation contains zero ASTs, then Ast::Empty is + /// returned. If this concatenation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Concat(self), + } + } +} + +/// A single literal expression. +/// +/// A literal corresponds to a single Unicode scalar value. Literals may be +/// represented in their literal form, e.g., `a` or in their escaped form, +/// e.g., `\x61`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Literal { + /// The span of this literal. + pub span: Span, + /// The kind of this literal. + pub kind: LiteralKind, + /// The Unicode scalar value corresponding to this literal. + pub c: char, +} + +impl Literal { + /// If this literal was written as a `\x` hex escape, then this returns + /// the corresponding byte value. Otherwise, this returns `None`. + pub fn byte(&self) -> Option { + let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); + if self.c as u32 <= 255 && self.kind == short_hex { + Some(self.c as u8) + } else { + None + } + } +} + +/// The kind of a single literal expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum LiteralKind { + /// The literal is written verbatim, e.g., `a` or `☃`. + Verbatim, + /// The literal is written as an escape because it is punctuation, e.g., + /// `\*` or `\[`. + Punctuation, + /// The literal is written as an octal escape, e.g., `\141`. 
+ Octal, + /// The literal is written as a hex code with a fixed number of digits + /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// `\U00000061`. + HexFixed(HexLiteralKind), + /// The literal is written as a hex code with a bracketed number of + /// digits. The only restriction is that the bracketed hex code must refer + /// to a valid Unicode scalar value. + HexBrace(HexLiteralKind), + /// The literal is written as a specially recognized escape, e.g., `\f` + /// or `\n`. + Special(SpecialLiteralKind), +} + +/// The type of a special literal. +/// +/// A special literal is a special escape sequence recognized by the regex +/// parser, e.g., `\f` or `\n`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SpecialLiteralKind { + /// Bell, spelled `\a` (`\x07`). + Bell, + /// Form feed, spelled `\f` (`\x0C`). + FormFeed, + /// Tab, spelled `\t` (`\x09`). + Tab, + /// Line feed, spelled `\n` (`\x0A`). + LineFeed, + /// Carriage return, spelled `\r` (`\x0D`). + CarriageReturn, + /// Vertical tab, spelled `\v` (`\x0B`). + VerticalTab, + /// Space, spelled `\ ` (`\x20`). Note that this can only appear when + /// parsing in verbose mode. + Space, +} + +/// The type of a Unicode hex literal. +/// +/// Note that all variants behave the same when used with brackets. They only +/// differ when used without brackets in the number of hex digits that must +/// follow. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HexLiteralKind { + /// A `\x` prefix. When used without brackets, this form is limited to + /// two digits. + X, + /// A `\u` prefix. When used without brackets, this form is limited to + /// four digits. + UnicodeShort, + /// A `\U` prefix. When used without brackets, this form is limited to + /// eight digits. + UnicodeLong, +} + +impl HexLiteralKind { + /// The number of digits that must be used with this literal form when + /// used without brackets. When used with brackets, there is no + /// restriction on the number of digits. + pub fn digits(&self) -> u32 { + match *self { + HexLiteralKind::X => 2, + HexLiteralKind::UnicodeShort => 4, + HexLiteralKind::UnicodeLong => 8, + } + } +} + +/// A single character class expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(ClassBracketed), +} + +impl Class { + /// Return the span of this character class. + pub fn span(&self) -> &Span { + match *self { + Class::Perl(ref x) => &x.span, + Class::Unicode(ref x) => &x.span, + Class::Bracketed(ref x) => &x.span, + } + } +} + +/// A Perl character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassPerl { + /// The span of this class. + pub span: Span, + /// The kind of Perl class. + pub kind: ClassPerlKind, + /// Whether the class is negated or not. e.g., `\d` is not negated but + /// `\D` is. + pub negated: bool, +} + +/// The available Perl character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassPerlKind { + /// Decimal numbers. + Digit, + /// Whitespace. + Space, + /// Word characters. + Word, +} + +/// An ASCII character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassAscii { + /// The span of this class. + pub span: Span, + /// The kind of ASCII class. 
+ pub kind: ClassAsciiKind, + /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated + /// but `[[:^alpha:]]` is. + pub negated: bool, +} + +/// The available ASCII character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassAsciiKind { + /// `[0-9A-Za-z]` + Alnum, + /// `[A-Za-z]` + Alpha, + /// `[\x00-\x7F]` + Ascii, + /// `[ \t]` + Blank, + /// `[\x00-\x1F\x7F]` + Cntrl, + /// `[0-9]` + Digit, + /// `[!-~]` + Graph, + /// `[a-z]` + Lower, + /// `[ -~]` + Print, + /// `[!-/:-@\[-`{-~]` + Punct, + /// `[\t\n\v\f\r ]` + Space, + /// `[A-Z]` + Upper, + /// `[0-9A-Za-z_]` + Word, + /// `[0-9A-Fa-f]` + Xdigit, +} + +impl ClassAsciiKind { + /// Return the corresponding ClassAsciiKind variant for the given name. + /// + /// The name given should correspond to the lowercase version of the + /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. + /// + /// If no variant with the corresponding name exists, then `None` is + /// returned. + pub fn from_name(name: &str) -> Option { + use self::ClassAsciiKind::*; + match name { + "alnum" => Some(Alnum), + "alpha" => Some(Alpha), + "ascii" => Some(Ascii), + "blank" => Some(Blank), + "cntrl" => Some(Cntrl), + "digit" => Some(Digit), + "graph" => Some(Graph), + "lower" => Some(Lower), + "print" => Some(Print), + "punct" => Some(Punct), + "space" => Some(Space), + "upper" => Some(Upper), + "word" => Some(Word), + "xdigit" => Some(Xdigit), + _ => None, + } + } +} + +/// A Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. + /// + /// Note: be careful when using this attribute. This specifically refers + /// to whether the class is written as `\p` or `\P`, where the latter + /// is `negated = true`. However, it also possible to write something like + /// `\P{scx!=Katakana}` which is actually equivalent to + /// `\p{scx=Katakana}` and is therefore not actually negated even though + /// `negated = true` here. To test whether this class is truly negated + /// or not, use the `is_negated` method. + pub negated: bool, + /// The kind of Unicode class. + pub kind: ClassUnicodeKind, +} + +impl ClassUnicode { + /// Returns true if this class has been negated. + /// + /// Note that this takes the Unicode op into account, if it's present. + /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. + pub fn is_negated(&self) -> bool { + match self.kind { + ClassUnicodeKind::NamedValue { + op: ClassUnicodeOpKind::NotEqual, .. + } => !self.negated, + _ => self.negated, + } + } +} + +/// The available forms of Unicode character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeKind { + /// A one letter abbreviated class, e.g., `\pN`. + OneLetter(char), + /// A binary property, general category or script. The string may be + /// empty. + Named(String), + /// A property name and an associated value. + NamedValue { + /// The type of Unicode op used to associate `name` with `value`. + op: ClassUnicodeOpKind, + /// The property name (which may be empty). + name: String, + /// The property value (which may be empty). + value: String, + }, +} + +/// The type of op used in a Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeOpKind { + /// A property set to a specific value, e.g., `\p{scx=Katakana}`. + Equal, + /// A property set to a specific value using a colon, e.g., + /// `\p{scx:Katakana}`. 
+ Colon, + /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. + NotEqual, +} + +impl ClassUnicodeOpKind { + /// Whether the op is an equality op or not. + pub fn is_equal(&self) -> bool { + match *self { + ClassUnicodeOpKind::Equal|ClassUnicodeOpKind::Colon => true, + _ => false, + } + } +} + +/// A bracketed character class, e.g., `[a-z0-9]`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBracketed { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. e.g., `[a]` is not negated but + /// `[^a]` is. + pub negated: bool, + /// The type of this set. A set is either a normal union of things, e.g., + /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. + pub kind: ClassSet, +} + +/// A character class set. +/// +/// This type corresponds to the internal structure of a bracketed character +/// class. That is, every bracketed character is one of two types: a union of +/// items (literals, ranges, other bracketed classes) or a tree of binary set +/// operations. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSet { + /// An item, which can be a single literal, range, nested character class + /// or a union of items. + Item(ClassSetItem), + /// A single binary operation (i.e., &&, -- or ~~). + BinaryOp(ClassSetBinaryOp), +} + +impl ClassSet { + /// Build a set from a union. + pub fn union(ast: ClassSetUnion) -> ClassSet { + ClassSet::Item(ClassSetItem::Union(ast)) + } + + /// Return the span of this character class set. + pub fn span(&self) -> &Span { + match *self { + ClassSet::Item(ref x) => x.span(), + ClassSet::BinaryOp(ref x) => &x.span, + } + } + + /// Return true if and only if this class set is empty. + fn is_empty(&self) -> bool { + match *self { + ClassSet::Item(ClassSetItem::Empty(_)) => true, + _ => false, + } + } +} + +/// A single component of a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSetItem { + /// An empty item. + /// + /// Note that a bracketed character class cannot contain a single empty + /// item. Empty items can appear when using one of the binary operators. + /// For example, `[&&]` is the intersection of two empty classes. + Empty(Span), + /// A single literal. + Literal(Literal), + /// A range between two literals. + Range(ClassSetRange), + /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. + Ascii(ClassAscii), + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(Box), + /// A union of items. + Union(ClassSetUnion), +} + +impl ClassSetItem { + /// Return the span of this character class set item. + pub fn span(&self) -> &Span { + match *self { + ClassSetItem::Empty(ref span) => span, + ClassSetItem::Literal(ref x) => &x.span, + ClassSetItem::Range(ref x) => &x.span, + ClassSetItem::Ascii(ref x) => &x.span, + ClassSetItem::Perl(ref x) => &x.span, + ClassSetItem::Unicode(ref x) => &x.span, + ClassSetItem::Bracketed(ref x) => &x.span, + ClassSetItem::Union(ref x) => &x.span, + } + } +} + +/// A single character class range in a set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetRange { + /// The span of this range. + pub span: Span, + /// The start of this range. + pub start: Literal, + /// The end of this range. 
+ pub end: Literal, +} + +impl ClassSetRange { + /// Returns true if and only if this character class range is valid. + /// + /// The only case where a range is invalid is if its start is greater than + /// its end. + pub fn is_valid(&self) -> bool { + self.start.c <= self.end.c + } +} + +/// A union of items inside a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetUnion { + /// The span of the items in this operation. e.g., the `a-z0-9` in + /// `[^a-z0-9]` + pub span: Span, + /// The sequence of items that make up this union. + pub items: Vec, +} + +impl ClassSetUnion { + /// Push a new item in this union. + /// + /// The ending position of this union's span is updated to the ending + /// position of the span of the item given. If the union is empty, then + /// the starting position of this union is set to the starting position + /// of this item. + /// + /// In other words, if you only use this method to add items to a union + /// and you set the spans on each item correctly, then you should never + /// need to adjust the span of the union directly. + pub fn push(&mut self, item: ClassSetItem) { + if self.items.is_empty() { + self.span.start = item.span().start; + } + self.span.end = item.span().end; + self.items.push(item); + } + + /// Return this union as a character class set item. + /// + /// If this union contains zero items, then an empty union is + /// returned. If this concatenation contains exactly 1 item, then the + /// corresponding item is returned. Otherwise, ClassSetItem::Union is + /// returned. + pub fn into_item(mut self) -> ClassSetItem { + match self.items.len() { + 0 => ClassSetItem::Empty(self.span), + 1 => self.items.pop().unwrap(), + _ => ClassSetItem::Union(self), + } + } +} + +/// A Unicode character class set operation. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetBinaryOp { + /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. + pub span: Span, + /// The type of this set operation. + pub kind: ClassSetBinaryOpKind, + /// The left hand side of the operation. + pub lhs: Box, + /// The right hand side of the operation. + pub rhs: Box, +} + +/// The type of a Unicode character class set operation. +/// +/// Note that this doesn't explicitly represent union since there is no +/// explicit union operator. Concatenation inside a character class corresponds +/// to the union operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ClassSetBinaryOpKind { + /// The intersection of two sets, e.g., `\pN&&[a-z]`. + Intersection, + /// The difference of two sets, e.g., `\pN--[0-9]`. + Difference, + /// The symmetric difference of two sets. The symmetric difference is the + /// set of elements belonging to one but not both sets. + /// e.g., `[\pL~~[:ascii:]]`. + SymmetricDifference, +} + +/// A single zero-width assertion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Assertion { + /// The span of this assertion. + pub span: Span, + /// The assertion kind, e.g., `\b` or `^`. + pub kind: AssertionKind, +} + +/// An assertion kind. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum AssertionKind { + /// `^` + StartLine, + /// `$` + EndLine, + /// `\A` + StartText, + /// `\z` + EndText, + /// `\b` + WordBoundary, + /// `\B` + NotWordBoundary, +} + +/// A repetition operation applied to a regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The span of this operation. + pub span: Span, + /// The actual operation. 
+ pub op: RepetitionOp, + /// Whether this operation was applied greedily or not. + pub greedy: bool, + /// The regular expression under repetition. + pub ast: Box, +} + +/// The repetition operator itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RepetitionOp { + /// The span of this operator. This includes things like `+`, `*?` and + /// `{m,n}`. + pub span: Span, + /// The type of operation. + pub kind: RepetitionKind, +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// `?` + ZeroOrOne, + /// `*` + ZeroOrMore, + /// `+` + OneOrMore, + /// `{m,n}` + Range(RepetitionRange), +} + +/// A range repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// `{m}` + Exactly(u32), + /// `{m,}` + AtLeast(u32), + /// `{m,n}` + Bounded(u32, u32), +} + +impl RepetitionRange { + /// Returns true if and only if this repetition range is valid. + /// + /// The only case where a repetition range is invalid is if it is bounded + /// and its start is greater than its end. + pub fn is_valid(&self) -> bool { + match *self { + RepetitionRange::Bounded(s, e) if s > e => false, + _ => true, + } + } +} + +/// A grouped regular expression. +/// +/// This includes both capturing and non-capturing groups. This does **not** +/// include flag-only groups like `(?is)`, but does contain any group that +/// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and +/// `(?is:a)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The span of this group. + pub span: Span, + /// The kind of this group. + pub kind: GroupKind, + /// The regular expression in this group. + pub ast: Box, +} + +impl Group { + /// If this group is non-capturing, then this returns the (possibly empty) + /// set of flags. Otherwise, `None` is returned. + pub fn flags(&self) -> Option<&Flags> { + match self.kind { + GroupKind::NonCapturing(ref flags) => Some(flags), + _ => None, + } + } + + /// Returns true if and only if this group is capturing. + pub fn is_capturing(&self) -> bool { + match self.kind { + GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::NonCapturing(_) => false, + } + } + + /// Returns the capture index of this group, if this is a capturing group. + /// + /// This returns a capture index precisely when `is_capturing` is `true`. + pub fn capture_index(&self) -> Option { + match self.kind { + GroupKind::CaptureIndex(i) => Some(i), + GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::NonCapturing(_) => None, + } + } +} + +/// The kind of a group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// `(a)` + CaptureIndex(u32), + /// `(?Pa)` + CaptureName(CaptureName), + /// `(?:a)` and `(?i:a)` + NonCapturing(Flags), +} + +/// A capture name. +/// +/// This corresponds to the name itself between the angle brackets in, e.g., +/// `(?Pexpr)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CaptureName { + /// The span of this capture name. + pub span: Span, + /// The capture name. + pub name: String, + /// The capture index. + pub index: u32, +} + +/// A group of flags that is not applied to a particular regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SetFlags { + /// The span of these flags, including the grouping parentheses. + pub span: Span, + /// The actual sequence of flags. + pub flags: Flags, +} + +/// A group of flags. +/// +/// This corresponds only to the sequence of flags themselves, e.g., `is-u`. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Flags { + /// The span of this group of flags. + pub span: Span, + /// A sequence of flag items. Each item is either a flag or a negation + /// operator. + pub items: Vec, +} + +impl Flags { + /// Add the given item to this sequence of flags. + /// + /// If the item was added successfully, then `None` is returned. If the + /// given item is a duplicate, then `Some(i)` is returned, where + /// `items[i].kind == item.kind`. + pub fn add_item(&mut self, item: FlagsItem) -> Option { + for (i, x) in self.items.iter().enumerate() { + if x.kind == item.kind { + return Some(i); + } + } + self.items.push(item); + None + } + + /// Returns the state of the given flag in this set. + /// + /// If the given flag is in the set but is negated, then `Some(false)` is + /// returned. + /// + /// If the given flag is in the set and is not negated, then `Some(true)` + /// is returned. + /// + /// Otherwise, `None` is returned. + pub fn flag_state(&self, flag: Flag) -> Option { + let mut negated = false; + for x in &self.items { + match x.kind { + FlagsItemKind::Negation => { + negated = true; + } + FlagsItemKind::Flag(ref xflag) if xflag == &flag => { + return Some(!negated); + } + _ => {} + } + } + None + } +} + +/// A single item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FlagsItem { + /// The span of this item. + pub span: Span, + /// The kind of this item. + pub kind: FlagsItemKind, +} + +/// The kind of an item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum FlagsItemKind { + /// A negation operator applied to all subsequent flags in the enclosing + /// group. + Negation, + /// A single flag in a group. + Flag(Flag), +} + +impl FlagsItemKind { + /// Returns true if and only if this item is a negation operator. + pub fn is_negation(&self) -> bool { + match *self { + FlagsItemKind::Negation => true, + _ => false, + } + } +} + +/// A single flag. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Flag { + /// `i` + CaseInsensitive, + /// `m` + MultiLine, + /// `s` + DotMatchesNewLine, + /// `U` + SwapGreed, + /// `u` + Unicode, + /// `x` + IgnoreWhitespace, +} + +/// A custom `Drop` impl is used for `Ast` such that it uses constant stack +/// space but heap space proportional to the depth of the `Ast`. +impl Drop for Ast { + fn drop(&mut self) { + use std::mem; + + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, + _ => {} + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_ast = || Ast::Empty(empty_span()); + let mut stack = vec![mem::replace(self, empty_ast())]; + while let Some(mut ast) = stack.pop() { + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. 
+ | Ast::Class(_) => {} + Ast::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Group(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Alternation(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + Ast::Concat(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + } + } + } +} + +/// A custom `Drop` impl is used for `ClassSet` such that it uses constant +/// stack space but heap space proportional to the depth of the `ClassSet`. +impl Drop for ClassSet { + fn drop(&mut self) { + use std::mem; + + match *self { + ClassSet::Item(ref item) => { + match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => return, + ClassSetItem::Bracketed(ref x) => { + if x.kind.is_empty() { + return; + } + } + ClassSetItem::Union(ref x) => { + if x.items.is_empty() { + return; + } + } + } + } + ClassSet::BinaryOp(ref op) => { + if op.lhs.is_empty() && op.rhs.is_empty() { + return; + } + } + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); + let mut stack = vec![mem::replace(self, empty_set())]; + while let Some(mut set) = stack.pop() { + match set { + ClassSet::Item(ref mut item) => { + match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => {} + ClassSetItem::Bracketed(ref mut x) => { + stack.push(mem::replace(&mut x.kind, empty_set())); + } + ClassSetItem::Union(ref mut x) => { + stack.extend( + x.items.drain(..).map(ClassSet::Item)); + } + } + } + ClassSet::BinaryOp(ref mut op) => { + stack.push(mem::replace(&mut op.lhs, empty_set())); + stack.push(mem::replace(&mut op.rhs, empty_set())); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We use a thread with an explicit stack size to test that our destructor + // for Ast can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let span = || Span::splat(Position::new(0, 0, 0)); + let mut ast = Ast::Empty(span()); + for i in 0..200 { + ast = Ast::Group(Group { + span: span(), + kind: GroupKind::CaptureIndex(i), + ast: Box::new(ast), + }); + } + assert!(!ast.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + thread::Builder::new() + .stack_size(1<<10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/regex-syntax-2/src/ast/parse.rs b/regex-syntax-2/src/ast/parse.rs new file mode 100644 index 0000000000..a2f53d21b9 --- /dev/null +++ b/regex-syntax-2/src/ast/parse.rs @@ -0,0 +1,5257 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This module provides a regular expression parser. 
+*/ + +use std::borrow::Borrow; +use std::cell::{Cell, RefCell}; +use std::mem; +use std::result; + +use ast::{self, Ast, Position, Span}; +use either::Either; + +use is_meta_character; + +type Result = result::Result; + +/// A primitive is an expression with no sub-expressions. This includes +/// literals, assertions and non-set character classes. This representation +/// is used as intermediate state in the parser. +/// +/// This does not include ASCII character classes, since they can only appear +/// within a set character class. +#[derive(Clone, Debug, Eq, PartialEq)] +enum Primitive { + Literal(ast::Literal), + Assertion(ast::Assertion), + Dot(Span), + Perl(ast::ClassPerl), + Unicode(ast::ClassUnicode), +} + +impl Primitive { + /// Return the span of this primitive. + fn span(&self) -> &Span { + match *self { + Primitive::Literal(ref x) => &x.span, + Primitive::Assertion(ref x) => &x.span, + Primitive::Dot(ref span) => span, + Primitive::Perl(ref x) => &x.span, + Primitive::Unicode(ref x) => &x.span, + } + } + + /// Convert this primitive into a proper AST. + fn into_ast(self) -> Ast { + match self { + Primitive::Literal(lit) => Ast::Literal(lit), + Primitive::Assertion(assert) => Ast::Assertion(assert), + Primitive::Dot(span) => Ast::Dot(span), + Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + } + } + + /// Convert this primitive into an item in a character class. + /// + /// If this primitive is not a legal item (i.e., an assertion or a dot), + /// then return an error. + fn into_class_set_item>( + self, + p: &ParserI

, + ) -> Result { + use ast::ClassSetItem; + use self::Primitive::*; + + match self { + Literal(lit) => Ok(ClassSetItem::Literal(lit)), + Perl(cls) => Ok(ClassSetItem::Perl(cls)), + Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } + + /// Convert this primitive into a literal in a character class. In + /// particular, literals are the only valid items that can appear in + /// ranges. + /// + /// If this primitive is not a legal item (i.e., a class, assertion or a + /// dot), then return an error. + fn into_class_literal>( + self, + p: &ParserI

, + ) -> Result { + use self::Primitive::*; + + match self { + Literal(lit) => Ok(lit), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } +} + +/// Returns true if the given character is a hexadecimal digit. +fn is_hex(c: char) -> bool { + ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') +} + +/// Returns true if the given character is a valid in a capture group name. +/// +/// If `first` is true, then `c` is treated as the first character in the +/// group name (which is not allowed to be a digit). +fn is_capture_char(c: char, first: bool) -> bool { + c == '_' || (!first && c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +#[derive(Clone, Debug)] +pub struct ParserBuilder { + ignore_whitespace: bool, + nest_limit: u32, + octal: bool, +} + +impl Default for ParserBuilder { + fn default() -> ParserBuilder { + ParserBuilder::new() + } +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder { + ignore_whitespace: false, + nest_limit: 250, + octal: false, + } + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { + pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), + capture_index: Cell::new(0), + nest_limit: self.nest_limit, + octal: self.octal, + initial_ignore_whitespace: self.ignore_whitespace, + ignore_whitespace: Cell::new(self.ignore_whitespace), + comments: RefCell::new(vec![]), + stack_group: RefCell::new(vec![]), + stack_class: RefCell::new(vec![]), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// lenth of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. 
For example, `a`, `\x61`, `\u0061` and
+    /// `\141` are all equivalent regular expressions, where the last example
+    /// shows octal syntax.
+    ///
+    /// While supporting octal syntax isn't in and of itself a problem, it does
+    /// make good error messages harder. That is, in PCRE based regex engines,
+    /// syntax like `\0` invokes a backreference, which is explicitly
+    /// unsupported in Rust's regex engine. However, many users expect it to
+    /// be supported. Therefore, when octal support is disabled, the error
+    /// message will explicitly mention that backreferences aren't supported.
+    ///
+    /// Octal syntax is disabled by default.
+    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
+        self.octal = yes;
+        self
+    }
+
+    /// Enable verbose mode in the regular expression.
+    ///
+    /// When enabled, verbose mode permits insignificant whitespace in many
+    /// places in the regular expression, as well as comments. Comments are
+    /// started using `#` and continue until the end of the line.
+    ///
+    /// By default, this is disabled. It may be selectively enabled in the
+    /// regular expression by using the `x` flag regardless of this setting.
+    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
+        self.ignore_whitespace = yes;
+        self
+    }
+}
+
+/// A regular expression parser.
+///
+/// This parses a string representation of a regular expression into an
+/// abstract syntax tree. The size of the tree is proportional to the length
+/// of the regular expression pattern.
+///
+/// A `Parser` can be configured in more detail via a
+/// [`ParserBuilder`](struct.ParserBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Parser {
+    /// The current position of the parser.
+    pos: Cell<Position>,
+    /// The current capture index.
+    capture_index: Cell<u32>,
+    /// The maximum number of open parens/brackets allowed. If the parser
+    /// exceeds this number, then an error is returned.
+    nest_limit: u32,
+    /// Whether to support octal syntax or not. When `false`, the parser will
+    /// return an error helpfully pointing out that backreferences are not
+    /// supported.
+    octal: bool,
+    /// The initial setting for `ignore_whitespace` as provided by the
+    /// `ParserBuilder`. This is used when resetting the parser's state.
+    initial_ignore_whitespace: bool,
+    /// Whether whitespace should be ignored. When enabled, comments are
+    /// also permitted.
+    ignore_whitespace: Cell<bool>,
+    /// A list of comments, in order of appearance.
+    comments: RefCell<Vec<ast::Comment>>,
+    /// A stack of grouped sub-expressions, including alternations.
+    stack_group: RefCell<Vec<GroupState>>,
+    /// A stack of nested character classes. This is only non-empty when
+    /// parsing a class.
+    stack_class: RefCell<Vec<ClassState>>,
+    /// A sorted sequence of capture names. This is used to detect duplicate
+    /// capture names and report an error if one is detected.
+    capture_names: RefCell<Vec<ast::CaptureName>>,
+    /// A scratch buffer used in various places. Mostly this is used to
+    /// accumulate relevant characters from parts of a pattern.
+    scratch: RefCell<String>,
+}
+
+/// ParserI is the internal parser implementation.
+///
+/// We use this separate type so that we can carry the provided pattern string
+/// along with us. In particular, a `Parser`'s internal state is not tied to
+/// any one pattern, but a `ParserI` is.
+///
+/// This type also lets us use `ParserI<&Parser>` in production code while
+/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
+/// work against the internal interface of the parser.
+#[derive(Clone, Debug)]
+struct ParserI<'s, P> {
+    /// The parser state/configuration.
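+    /// (In production code this is a `&Parser`, while tests typically use an
+    /// owned `Parser`; see the `ParserI` documentation above.)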
+ parser: P, + /// The full regular expression provided by the user. + pattern: &'s str, +} + +/// GroupState represents a single stack frame while parsing nested groups +/// and alternations. Each frame records the state up to an opening parenthesis +/// or a alternating bracket `|`. +#[derive(Clone, Debug)] +enum GroupState { + /// This state is pushed whenever an opening group is found. + Group { + /// The concatenation immediately preceding the opening group. + concat: ast::Concat, + /// The group that has been opened. Its sub-AST is always empty. + group: ast::Group, + /// Whether this group has the `x` flag enabled or not. + ignore_whitespace: bool, + }, + /// This state is pushed whenever a new alternation branch is found. If + /// an alternation branch is found and this state is at the top of the + /// stack, then this state should be modified to include the new + /// alternation. + Alternation(ast::Alternation), +} + +/// ClassState represents a single stack frame while parsing character classes. +/// Each frame records the state up to an intersection, difference, symmetric +/// difference or nested class. +/// +/// Note that a parser's character class stack is only non-empty when parsing +/// a character class. In all other cases, it is empty. +#[derive(Clone, Debug)] +enum ClassState { + /// This state is pushed whenever an opening bracket is found. + Open { + /// The union of class items immediately preceding this class. + union: ast::ClassSetUnion, + /// The class that has been opened. Typically this just corresponds + /// to the `[`, but it can also include `[^` since `^` indicates + /// negation of the class. + set: ast::ClassBracketed, + }, + /// This state is pushed when a operator is seen. When popped, the stored + /// set becomes the left hand side of the operator. + Op { + /// The type of the operation, i.e., &&, -- or ~~. + kind: ast::ClassSetBinaryOpKind, + /// The left-hand side of the operator. + lhs: ast::ClassSet, + }, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with either the `parse` or `parse_with_comments` + /// methods. The parse methods return an abstract syntax tree. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into an abstract syntax tree. + pub fn parse(&mut self, pattern: &str) -> Result { + ParserI::new(self, pattern).parse() + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + pub fn parse_with_comments( + &mut self, + pattern: &str, + ) -> Result { + ParserI::new(self, pattern).parse_with_comments() + } + + /// Reset the internal state of a parser. + /// + /// This is called at the beginning of every parse. This prevents the + /// parser from running with inconsistent state (say, if a previous + /// invocation returned an error and the parser is reused). + fn reset(&self) { + // These settings should be in line with the construction + // in `ParserBuilder::build`. + self.pos.set(Position { offset: 0, line: 1, column: 1}); + self.ignore_whitespace.set(self.initial_ignore_whitespace); + self.comments.borrow_mut().clear(); + self.stack_group.borrow_mut().clear(); + self.stack_class.borrow_mut().clear(); + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Build an internal parser from a parser configuration and a pattern. 
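As a quick orientation before the internal `ParserI` machinery, here is a sketch of how the public `Parser`/`ParserBuilder` API described above might be driven. The `regex_syntax::ast::parse` import path and the `Debug` derives used for printing are assumptions; only the methods shown in this file are relied upon.

```rust
use regex_syntax::ast::parse::{Parser, ParserBuilder};

fn main() {
    // A default parser turns a pattern string into an `Ast`.
    let ast = Parser::new().parse(r"(?P<year>\d{4})-\d{2}").unwrap();
    println!("{:?}", ast);

    // A builder exposes the options documented above. `a{2,1}` is rejected
    // because the repetition range is inverted.
    let mut parser = ParserBuilder::new().nest_limit(100).octal(false).build();
    let err = parser.parse(r"a{2,1}").unwrap_err();
    println!("{:?}", err);
}
```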
+ fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { + ParserI { parser: parser, pattern: pattern } + } + + /// Return a reference to the parser state. + fn parser(&self) -> &Parser { + self.parser.borrow() + } + + /// Return a reference to the pattern being parsed. + fn pattern(&self) -> &str { + self.pattern.borrow() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { + ast::Error { + kind: kind, + pattern: self.pattern().to_string(), + span: span, + } + } + + /// Return the current offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn offset(&self) -> usize { + self.parser().pos.get().offset + } + + /// Return the current line number of the parser. + /// + /// The line number starts at `1`. + fn line(&self) -> usize { + self.parser().pos.get().line + } + + /// Return the current column of the parser. + /// + /// The column number starts at `1` and is reset whenever a `\n` is seen. + fn column(&self) -> usize { + self.parser().pos.get().column + } + + /// Return the next capturing index. Each subsequent call increments the + /// internal index. + /// + /// The span given should correspond to the location of the opening + /// parenthesis. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self, span: Span) -> Result { + let current = self.parser().capture_index.get(); + let i = try!(current.checked_add(1).ok_or_else(|| { + self.error(span, ast::ErrorKind::CaptureLimitExceeded) + })); + self.parser().capture_index.set(i); + Ok(i) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { + let mut names = self.parser().capture_names.borrow_mut(); + match names.binary_search_by_key( + &cap.name.as_str(), + |c| c.name.as_str(), + ) { + Err(i) => { + names.insert(i, cap.clone()); + Ok(()) + } + Ok(i) => { + Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate { + original: names[i].span, + })) + } + } + } + + /// Return whether the parser should ignore whitespace or not. + fn ignore_whitespace(&self) -> bool { + self.parser().ignore_whitespace.get() + } + + /// Return the character at the current position of the parser. + /// + /// This panics if the current position does not point to a valid char. + fn char(&self) -> char { + self.char_at(self.offset()) + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..].chars().next() + .unwrap_or_else(|| { + panic!("expected char at offset {}", i) + }) + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. 
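The `bump` implementation that follows maintains the `offset`/`line`/`column` triple returned by the accessors above. A minimal standalone sketch of the same bookkeeping (not the crate's code) makes the invariants explicit: offsets advance by UTF-8 length and a `\n` resets the column.

```rust
/// Simplified position bookkeeping in the spirit of `Parser::pos` and `bump`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Pos {
    offset: usize,
    line: usize,
    column: usize,
}

/// Advance past the next char of `pattern`, or return `None` at EOF.
fn advance(pattern: &str, mut pos: Pos) -> Option<Pos> {
    let c = pattern[pos.offset..].chars().next()?;
    if c == '\n' {
        pos.line += 1;
        pos.column = 1;
    } else {
        pos.column += 1;
    }
    pos.offset += c.len_utf8();
    Some(pos)
}

fn main() {
    let pattern = "ab\ncd";
    let mut pos = Pos { offset: 0, line: 1, column: 1 };
    while let Some(next) = advance(pattern, pos) {
        pos = next;
    }
    assert_eq!(pos, Pos { offset: 5, line: 2, column: 3 });
    println!("{:?}", pos);
}
```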
+    fn bump(&self) -> bool {
+        if self.is_eof() {
+            return false;
+        }
+        let Position { mut offset, mut line, mut column } = self.pos();
+        if self.char() == '\n' {
+            line = line.checked_add(1).unwrap();
+            column = 1;
+        } else {
+            column = column.checked_add(1).unwrap();
+        }
+        offset += self.char().len_utf8();
+        self.parser().pos.set(Position {
+            offset: offset,
+            line: line,
+            column: column,
+        });
+        self.pattern()[self.offset()..].chars().next().is_some()
+    }
+
+    /// If the substring starting at the current position of the parser has
+    /// the given prefix, then bump the parser to the character immediately
+    /// following the prefix and return true. Otherwise, don't bump the parser
+    /// and return false.
+    fn bump_if(&self, prefix: &str) -> bool {
+        if self.pattern()[self.offset()..].starts_with(prefix) {
+            for _ in 0..prefix.chars().count() {
+                self.bump();
+            }
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Returns true if and only if the parser is positioned at a look-around
+    /// prefix. The conditions under which this returns true must always
+    /// correspond to a regular expression that would otherwise be considered
+    /// invalid.
+    ///
+    /// This should only be called immediately after parsing the opening of
+    /// a group or a set of flags.
+    fn is_lookaround_prefix(&self) -> bool {
+        self.bump_if("?=")
+            || self.bump_if("?!")
+            || self.bump_if("?<=")
+            || self.bump_if("?<!")
+    }
+
+    /// Bump the parser and, if the `x` flag is enabled, bump through any
+    /// subsequent whitespace and comments. Return true if and only if the
+    /// parser is not at EOF.
+    fn bump_and_bump_space(&self) -> bool {
+        if !self.bump() {
+            return false;
+        }
+        self.bump_space();
+        !self.is_eof()
+    }
+
+    /// If the `x` flag is enabled (i.e., whitespace insensitivity with
+    /// comments), then this will advance the parser through all whitespace
+    /// and comments to the next non-whitespace non-comment byte.
+    ///
+    /// If the `x` flag is disabled, then this is a no-op.
+    ///
+    /// This should be used selectively throughout the parser where
+    /// arbitrary whitespace is permitted when the `x` flag is enabled. For
+    /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
+    fn bump_space(&self) {
+        if !self.ignore_whitespace() {
+            return;
+        }
+        while !self.is_eof() {
+            if self.char().is_whitespace() {
+                self.bump();
+            } else if self.char() == '#' {
+                let start = self.pos();
+                let mut comment_text = String::new();
+                self.bump();
+                while !self.is_eof() {
+                    let c = self.char();
+                    self.bump();
+                    if c == '\n' {
+                        break;
+                    }
+                    comment_text.push(c);
+                }
+                let comment = ast::Comment {
+                    span: Span::new(start, self.pos()),
+                    comment: comment_text,
+                };
+                self.parser().comments.borrow_mut().push(comment);
+            } else {
+                break;
+            }
+        }
+    }
+
+    /// Peek at the next character in the input without advancing the parser.
+    ///
+    /// If the input has been exhausted, then this returns `None`.
+    fn peek(&self) -> Option<char> {
+        if self.is_eof() {
+            return None;
+        }
+        self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
+    }
+
+    /// Returns true if the next call to `bump` would return false.
+    fn is_eof(&self) -> bool {
+        self.offset() == self.pattern().len()
+    }
+
+    /// Return the current position of the parser, which includes the offset,
+    /// line and column.
+    fn pos(&self) -> Position {
+        self.parser().pos.get()
+    }
+
+    /// Create a span at the current position of the parser. Both the start
+    /// and end of the span are set.
+    fn span(&self) -> Span {
+        Span::splat(self.pos())
+    }
+
+    /// Create a span that covers the current character.
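Rounding out the whitespace machinery above: with verbose mode enabled, whitespace is insignificant and `#` comments are collected rather than parsed. A small sketch against the public API defined in this module; the import path and the public visibility of the `WithComments`/`Comment` fields are assumptions.

```rust
use regex_syntax::ast::parse::ParserBuilder;

fn main() {
    let mut parser = ParserBuilder::new()
        .ignore_whitespace(true)
        .build();

    // With `x`-style behaviour enabled, the spaces inside `{ 1 , 3 }` are
    // skipped and the trailing `#` comment is recorded rather than parsed.
    let with_comments = parser
        .parse_with_comments("a{ 1 , 3 }  # between one and three 'a's\n")
        .unwrap();

    assert_eq!(with_comments.comments.len(), 1);
    println!("comment: {}", with_comments.comments[0].comment.trim());
}
```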
+ fn span_char(&self) -> Span { + let mut next = Position { + offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), + line: self.line(), + column: self.column().checked_add(1).unwrap(), + }; + if self.char() == '\n' { + next.line += 1; + next.column = 1; + } + Span::new(self.pos(), next) + } + + /// Parse and push a single alternation on to the parser's internal stack. + /// If the top of the stack already has an alternation, then add to that + /// instead of pushing a new one. + /// + /// The concatenation given corresponds to a single alternation branch. + /// The concatenation returned starts the next branch and is empty. + /// + /// This assumes the parser is currently positioned at `|` and will advance + /// the parser to the character following `|`. + fn push_alternate(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '|'); + concat.span.end = self.pos(); + self.push_or_add_alternation(concat); + self.bump(); + Ok(ast::Concat { + span: self.span(), + asts: vec![], + }) + } + + /// Pushes or adds the given branch of an alternation to the parser's + /// internal stack of state. + fn push_or_add_alternation(&self, concat: ast::Concat) { + use self::GroupState::*; + + let mut stack = self.parser().stack_group.borrow_mut(); + if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { + alts.asts.push(concat.into_ast()); + return; + } + stack.push(Alternation(ast::Alternation { + span: Span::new(concat.span.start, self.pos()), + asts: vec![concat.into_ast()], + })); + } + + /// Parse and push a group AST (and its parent concatenation) on to the + /// parser's internal stack. Return a fresh concatenation corresponding + /// to the group's sub-AST. + /// + /// If a set of flags was found (with no group), then the concatenation + /// is returned with that set of flags added. + /// + /// This assumes that the parser is currently positioned on the opening + /// parenthesis. It advances the parser to the character at the start + /// of the sub-expression (or adjoining expression). + /// + /// If there was a problem parsing the start of the group, then an error + /// is returned. + fn push_group(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '('); + match try!(self.parse_group()) { + Either::Left(set) => { + let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); + if let Some(v) = ignore { + self.parser().ignore_whitespace.set(v); + } + + concat.asts.push(Ast::Flags(set)); + Ok(concat) + } + Either::Right(group) => { + let old_ignore_whitespace = self.ignore_whitespace(); + let new_ignore_whitespace = group + .flags() + .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) + .unwrap_or(old_ignore_whitespace); + self.parser().stack_group.borrow_mut().push(GroupState::Group { + concat: concat, + group: group, + ignore_whitespace: old_ignore_whitespace, + }); + self.parser().ignore_whitespace.set(new_ignore_whitespace); + Ok(ast::Concat { + span: self.span(), + asts: vec![], + }) + } + } + } + + /// Pop a group AST from the parser's internal stack and set the group's + /// AST to the given concatenation. Return the concatenation containing + /// the group. + /// + /// This assumes that the parser is currently positioned on the closing + /// parenthesis and advances the parser to the character following the `)`. + /// + /// If no such group could be popped, then an unopened group error is + /// returned. 
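The group and alternation handling above replaces recursion with an explicit `Vec` of open states. A toy standalone sketch of that idea (ignoring escapes and everything else a real pattern contains) shows how "unclosed" and "unopened" group errors fall out of the stack discipline:

```rust
/// Track open parens on an explicit stack instead of recursing, reporting the
/// offending offset for unbalanced groups. A deliberately tiny illustration,
/// not the parser's actual logic.
fn check_groups(pattern: &str) -> Result<(), String> {
    let mut open = Vec::new();
    for (offset, c) in pattern.char_indices() {
        match c {
            '(' => open.push(offset),
            ')' => {
                if open.pop().is_none() {
                    return Err(format!("unopened group at offset {}", offset));
                }
            }
            _ => {}
        }
    }
    match open.pop() {
        Some(offset) => Err(format!("unclosed group at offset {}", offset)),
        None => Ok(()),
    }
}

fn main() {
    assert!(check_groups("a|(b|(c|d))").is_ok());
    assert_eq!(check_groups("(a|b").unwrap_err(), "unclosed group at offset 0");
    assert_eq!(check_groups("a)b").unwrap_err(), "unopened group at offset 1");
}
```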
+ fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + use self::GroupState::*; + + assert_eq!(self.char(), ')'); + let mut stack = self.parser().stack_group.borrow_mut(); + let (mut prior_concat, mut group, ignore_whitespace, alt) = + match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, None) + } + Some(Alternation(alt)) => { + match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, Some(alt)) + } + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + } + } + None => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }; + self.parser().ignore_whitespace.set(ignore_whitespace); + group_concat.span.end = self.pos(); + self.bump(); + group.span.end = self.pos(); + match alt { + Some(mut alt) => { + alt.span.end = group_concat.span.end; + alt.asts.push(group_concat.into_ast()); + group.ast = Box::new(alt.into_ast()); + } + None => { + group.ast = Box::new(group_concat.into_ast()); + } + } + prior_concat.asts.push(Ast::Group(group)); + Ok(prior_concat) + } + + /// Pop the last state from the parser's internal stack, if it exists, and + /// add the given concatenation to it. There either must be no state or a + /// single alternation item on the stack. Any other scenario produces an + /// error. + /// + /// This assumes that the parser has advanced to the end. + fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + concat.span.end = self.pos(); + let mut stack = self.parser().stack_group.borrow_mut(); + let ast = match stack.pop() { + None => Ok(concat.into_ast()), + Some(GroupState::Alternation(mut alt)) => { + alt.span.end = self.pos(); + alt.asts.push(concat.into_ast()); + Ok(Ast::Alternation(alt)) + } + Some(GroupState::Group { group, .. }) => { + return Err(self.error( + group.span, + ast::ErrorKind::GroupUnclosed, + )); + } + }; + // If we try to pop again, there should be nothing. + match stack.pop() { + None => ast, + Some(GroupState::Alternation(_)) => { + // This unreachable is unfortunate. This case can't happen + // because the only way we can be here is if there were two + // `GroupState::Alternation`s adjacent in the parser's stack, + // which we guarantee to never happen because we never push a + // `GroupState::Alternation` if one is already at the top of + // the stack. + unreachable!() + } + Some(GroupState::Group { group, .. }) => { + Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) + } + } + } + + /// Parse the opening of a character class and push the current class + /// parsing context onto the parser's stack. This assumes that the parser + /// is positioned at an opening `[`. The given union should correspond to + /// the union of set items built up before seeing the `[`. + /// + /// If there was a problem parsing the opening of the class, then an error + /// is returned. Otherwise, a new union of set items for the class is + /// returned (which may be populated with either a `]` or a `-`). + fn push_class_open( + &self, + parent_union: ast::ClassSetUnion, + ) -> Result { + assert_eq!(self.char(), '['); + + let (nested_set, nested_union) = try!(self.parse_set_class_open()); + self.parser().stack_class.borrow_mut().push(ClassState::Open { + union: parent_union, + set: nested_set, + }); + Ok(nested_union) + } + + /// Parse the end of a character class set and pop the character class + /// parser stack. 
The union given corresponds to the last union built + /// before seeing the closing `]`. The union returned corresponds to the + /// parent character class set with the nested class added to it. + /// + /// This assumes that the parser is positioned at a `]` and will advance + /// the parser to the byte immediately following the `]`. + /// + /// If the stack is empty after popping, then this returns the final + /// "top-level" character class AST (where a "top-level" character class + /// is one that is not nested inside any other character class). + /// + /// If there is no corresponding opening bracket on the parser's stack, + /// then an error is returned. + fn pop_class( + &self, + nested_union: ast::ClassSetUnion, + ) -> Result> { + assert_eq!(self.char(), ']'); + + let item = ast::ClassSet::Item(nested_union.into_item()); + let prevset = self.pop_class_op(item); + let mut stack = self.parser().stack_class.borrow_mut(); + match stack.pop() { + None => { + // We can never observe an empty stack: + // + // 1) We are guaranteed to start with a non-empty stack since + // the character class parser is only initiated when it sees + // a `[`. + // 2) If we ever observe an empty stack while popping after + // seeing a `]`, then we signal the character class parser + // to terminate. + panic!("unexpected empty character class stack") + }, + Some(ClassState::Op { .. }) => { + // This panic is unfortunate, but this case is impossible + // since we already popped the Op state if one exists above. + // Namely, every push to the class parser stack is guarded by + // whether an existing Op is already on the top of the stack. + // If it is, the existing Op is modified. That is, the stack + // can never have consecutive Op states. + panic!("unexpected ClassState::Op") + } + Some(ClassState::Open { mut union, mut set }) => { + self.bump(); + set.span.end = self.pos(); + set.kind = prevset; + if stack.is_empty() { + Ok(Either::Right(ast::Class::Bracketed(set))) + } else { + union.push(ast::ClassSetItem::Bracketed(Box::new(set))); + Ok(Either::Left(union)) + } + } + } + } + + /// Return an "unclosed class" error whose span points to the most + /// recently opened class. + /// + /// This should only be called while parsing a character class. + fn unclosed_class_error(&self) -> ast::Error { + for state in self.parser().stack_class.borrow().iter().rev() { + match *state { + ClassState::Open { ref set, .. } => { + return self.error(set.span, ast::ErrorKind::ClassUnclosed); + } + _ => {} + } + } + // We are guaranteed to have a non-empty stack with at least + // one open bracket, so we should never get here. + panic!("no open character class found") + } + + /// Push the current set of class items on to the class parser's stack as + /// the left hand side of the given operator. + /// + /// A fresh set union is returned, which should be used to build the right + /// hand side of this operator. + fn push_class_op( + &self, + next_kind: ast::ClassSetBinaryOpKind, + next_union: ast::ClassSetUnion, + ) -> ast::ClassSetUnion { + + let item = ast::ClassSet::Item(next_union.into_item()); + let new_lhs = self.pop_class_op(item); + self.parser().stack_class.borrow_mut().push(ClassState::Op { + kind: next_kind, + lhs: new_lhs, + }); + ast::ClassSetUnion { span: self.span(), items: vec![] } + } + + /// Pop a character class set from the character class parser stack. If the + /// top of the stack is just an item (not an operation), then return the + /// given set unchanged. 
If the top of the stack is an operation, then the + /// given set will be used as the rhs of the operation on the top of the + /// stack. In that case, the binary operation is returned as a set. + fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { + let mut stack = self.parser().stack_class.borrow_mut(); + let (kind, lhs) = match stack.pop() { + Some(ClassState::Op { kind, lhs }) => (kind, lhs), + Some(state @ ClassState::Open { .. }) => { + stack.push(state); + return rhs; + } + None => unreachable!(), + }; + let span = Span::new(lhs.span().start, rhs.span().end); + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: kind, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Parse the regular expression into an abstract syntax tree. + fn parse(&self) -> Result { + self.parse_with_comments().map(|astc| astc.ast) + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + fn parse_with_comments(&self) -> Result { + assert_eq!(self.offset(), 0, "parser can only be used once"); + self.parser().reset(); + let mut concat = ast::Concat { + span: self.span(), + asts: vec![], + }; + loop { + self.bump_space(); + if self.is_eof() { + break; + } + match self.char() { + '(' => concat = try!(self.push_group(concat)), + ')' => concat = try!(self.pop_group(concat)), + '|' => concat = try!(self.push_alternate(concat)), + '[' => { + let class = try!(self.parse_set_class()); + concat.asts.push(Ast::Class(class)); + } + '?' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::ZeroOrOne)); + } + '*' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::ZeroOrMore)); + } + '+' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::OneOrMore)); + } + '{' => { + concat = try!(self.parse_counted_repetition(concat)); + } + _ => concat.asts.push(try!(self.parse_primitive()).into_ast()), + } + } + let ast = try!(self.pop_group_end(concat)); + try!(NestLimiter::new(self).check(&ast)); + Ok(ast::WithComments { + ast: ast, + comments: mem::replace( + &mut *self.parser().comments.borrow_mut(), + vec![], + ), + }) + } + + /// Parses an uncounted repetition operation. An uncounted repetition + /// operator includes ?, * and +, but does not include the {m,n} syntax. + /// The given `kind` should correspond to the operator observed by the + /// caller. + /// + /// This assumes that the paser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + fn parse_uncounted_repetition( + &self, + mut concat: ast::Concat, + kind: ast::RepetitionKind, + ) -> Result { + assert!( + self.char() == '?' || self.char() == '*' || self.char() == '+'); + let op_start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => return Err(self.error( + self.span(), + ast::ErrorKind::RepetitionMissing, + )), + }; + let mut greedy = true; + if self.bump() && self.char() == '?' 
{ + greedy = false; + self.bump(); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: Span::new(op_start, self.pos()), + kind: kind, + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the {m,n} syntax, and does not include the ?, * or + + /// operators. + /// + /// This assumes that the paser is currently positioned at the opening `{` + /// and advances the parser to the first character after the operator. + /// (Note that the operator may include a single additional `?`, which + /// makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + fn parse_counted_repetition( + &self, + mut concat: ast::Concat, + ) -> Result { + assert!(self.char() == '{'); + let start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => return Err(self.error( + self.span(), + ast::ErrorKind::RepetitionMissing, + )), + }; + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + let count_start = try!(self.parse_decimal()); + let mut range = ast::RepetitionRange::Exactly(count_start); + if self.is_eof() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() != '}' { + let count_end = try!(self.parse_decimal()); + range = ast::RepetitionRange::Bounded(count_start, count_end); + } else { + range = ast::RepetitionRange::AtLeast(count_start); + } + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' { + greedy = false; + self.bump(); + } + + let op_span = Span::new(start, self.pos()); + if !range.is_valid() { + return Err(self.error( + op_span, + ast::ErrorKind::RepetitionCountInvalid, + )); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: op_span, + kind: ast::RepetitionKind::Range(range), + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parse a group (which contains a sub-expression) or a set of flags. + /// + /// If a group was found, then it is returned with an empty AST. If a set + /// of flags is found, then that set is returned. + /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group) or to the closing parenthesis + /// immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. 
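Before the group parsing below, a quick illustration of the counted repetition forms handled by `parse_counted_repetition` above, exercised through the public parser (crate path assumed; one fresh parser per pattern):

```rust
use regex_syntax::ast::parse::Parser;

fn main() {
    // Exact, open-ended and bounded counts; a trailing `?` makes the
    // operator ungreedy.
    for ok in &["a{3}", "a{3,}", "a{3,6}", "a{3,6}?"] {
        assert!(Parser::new().parse(ok).is_ok(), "expected {} to parse", ok);
    }

    // An inverted range and an unclosed count are rejected.
    for bad in &["a{6,3}", "a{3,"] {
        assert!(Parser::new().parse(bad).is_err(), "expected {} to fail", bad);
    }
}
```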
+ fn parse_group(&self) -> Result> { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookaround_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAround, + )); + } + let inner_span = self.span(); + if self.bump_if("?P<") { + let capture_index = try!(self.next_capture_index(open_span)); + let cap = try!(self.parse_capture_name(capture_index)); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureName(cap), + ast: Box::new(Ast::Empty(self.span())), + })) + } else if self.bump_if("?") { + if self.is_eof() { + return Err(self.error( + open_span, + ast::ErrorKind::GroupUnclosed, + )); + } + let flags = try!(self.parse_flags()); + let char_end = self.char(); + self.bump(); + if char_end == ')' { + // We don't allow empty flags, e.g., `(?)`. We instead + // interpret it as a repetition operator missing its argument. + if flags.items.is_empty() { + return Err(self.error( + inner_span, + ast::ErrorKind::RepetitionMissing, + )); + } + Ok(Either::Left(ast::SetFlags { + span: Span { end: self.pos(), ..open_span }, + flags: flags, + })) + } else { + assert_eq!(char_end, ':'); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::NonCapturing(flags), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } else { + let capture_index = try!(self.next_capture_index(open_span)); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. + /// + /// The caller must provide the capture index of the group for this name. + fn parse_capture_name( + &self, + capture_index: u32, + ) -> Result { + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::GroupNameUnexpectedEof, + )); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupNameInvalid, + )); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::GroupNameUnexpectedEof, + )); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start.offset..end.offset]; + if name.is_empty() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::GroupNameEmpty, + )); + } + let capname = ast::CaptureName { + span: Span::new(start, end), + name: name.to_string(), + index: capture_index, + }; + try!(self.add_capture_name(&capname)); + Ok(capname) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. 
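The following sketch exercises the group and flag handling described above through the public parser (crate path assumed): named captures, inline flags, and the error cases called out in the docs (duplicate names, repeated flags, dangling negation, look-around).

```rust
use regex_syntax::ast::parse::Parser;

fn main() {
    // Named captures, non-capturing groups and inline flag settings all go
    // through `parse_group`.
    for ok in &[r"(?P<word>\w+)", "(?i:abc)", "(?im)a", "(?:a|b)"] {
        assert!(Parser::new().parse(ok).is_ok(), "expected {} to parse", ok);
    }

    // Duplicate capture names, repeated flags, a dangling negation and
    // look-around are all rejected.
    for bad in &[r"(?P<x>a)(?P<x>b)", "(?ii)a", "(?i-)a", "(?=a)"] {
        assert!(Parser::new().parse(bad).is_err(), "expected {} to fail", bad);
    }
}
```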
+ fn parse_flags(&self) -> Result { + let mut flags = ast::Flags { + span: self.span(), + items: vec![], + }; + let mut last_was_negation = None; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = Some(self.span_char()); + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Negation, + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagRepeatedNegation { + original: flags.items[i].span, + }, + )); + } + } else { + last_was_negation = None; + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Flag(try!(self.parse_flag())), + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagDuplicate { + original: flags.items[i].span, + }, + )); + } + } + if !self.bump() { + return Err(self.error( + self.span(), + ast::ErrorKind::FlagUnexpectedEof, + )); + } + } + if let Some(span) = last_was_negation { + return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); + } + flags.span.end = self.pos(); + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + fn parse_flag(&self) -> Result { + match self.char() { + 'i' => Ok(ast::Flag::CaseInsensitive), + 'm' => Ok(ast::Flag::MultiLine), + 's' => Ok(ast::Flag::DotMatchesNewLine), + 'U' => Ok(ast::Flag::SwapGreed), + 'u' => Ok(ast::Flag::Unicode), + 'x' => Ok(ast::Flag::IgnoreWhitespace), + _ => Err(self.error( + self.span_char(), + ast::ErrorKind::FlagUnrecognized, + )), + } + } + + /// Parse a primitive AST. e.g., A literal, non-set character class or + /// assertion. + /// + /// This assumes that the parser expects a primitive at the current + /// location. i.e., All other non-primitive cases have been handled. + /// For example, if the parser's position is at `|`, then `|` will be + /// treated as a literal (e.g., inside a character class). + /// + /// This advances the parser to the first character immediately following + /// the primitive. + fn parse_primitive(&self) -> Result { + match self.char() { + '\\' => self.parse_escape(), + '.' => { + let ast = Primitive::Dot(self.span_char()); + self.bump(); + Ok(ast) + } + '^' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::StartLine, + }); + self.bump(); + Ok(ast) + } + '$' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::EndLine, + }); + self.bump(); + Ok(ast) + } + c => { + let ast = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: c, + }); + self.bump(); + Ok(ast) + } + } + } + + /// Parse an escape sequence as a primitive AST. + /// + /// This assumes the parser is positioned at the start of the escape + /// sequence, i.e., `\`. It advances the parser to the first position + /// immediately following the escape sequence. + fn parse_escape(&self) -> Result { + assert_eq!(self.char(), '\\'); + let start = self.pos(); + if !self.bump() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let c = self.char(); + // Put some of the more complicated routines into helpers. 
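+        // Octal and hex escapes, `\p`/`\P` Unicode classes and the Perl
+        // classes `\d`/`\s`/`\w` are dispatched to `parse_octal`,
+        // `parse_hex`, `parse_unicode_class` and `parse_perl_class` below.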
+ match c { + '0'...'7' => { + if !self.parser().octal { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + let mut lit = self.parse_octal(); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + '8'...'9' if !self.parser().octal => { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + 'x' | 'u' | 'U' => { + let mut lit = try!(self.parse_hex()); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + 'p' | 'P' => { + let mut cls = try!(self.parse_unicode_class()); + cls.span.start = start; + return Ok(Primitive::Unicode(cls)); + } + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + let mut cls = self.parse_perl_class(); + cls.span.start = start; + return Ok(Primitive::Perl(cls)); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + let span = Span::new(start, self.pos()); + if is_meta_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + })); + } + let special = |kind, c| Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Special(kind), + c: c, + })); + match c { + 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), + 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), + 't' => special(ast::SpecialLiteralKind::Tab, '\t'), + 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), + 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), + 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), + ' ' if self.ignore_whitespace() => { + special(ast::SpecialLiteralKind::Space, ' ') + } + 'A' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::StartText, + })), + 'z' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::EndText, + })), + 'b' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::WordBoundary, + })), + 'B' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::NotWordBoundary, + })), + _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), + } + } + + /// Parse an octal representation of a Unicode codepoint up to 3 digits + /// long. This expects the parser to be positioned at the first octal + /// digit and advances the parser to the first character immediately + /// following the octal number. This also assumes that parsing octal + /// escapes is enabled. + /// + /// Assuming the preconditions are met, this routine can never fail. + fn parse_octal(&self) -> ast::Literal { + use std::char; + use std::u32; + + assert!(self.parser().octal); + assert!('0' <= self.char() && self.char() <= '7'); + let start = self.pos(); + // Parse up to two more digits. + while + self.bump() && + '0' <= self.char() && self.char() <= '7' && + self.pos().offset - start.offset <= 2 + {} + let end = self.pos(); + let octal = &self.pattern()[start.offset..end.offset]; + // Parsing the octal should never fail since the above guarantees a + // valid number. + let codepoint = + u32::from_str_radix(octal, 8).expect("valid octal number"); + // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no + // invalid Unicode scalar values. 
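+        // (0o777 == 511 and surrogates only begin at 0xD800, so
+        // `char::from_u32` cannot fail for any value parsed here.)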
+ let c = char::from_u32(codepoint).expect("Unicode scalar value"); + ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::Octal, + c: c, + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + fn parse_hex(&self) -> Result { + assert!(self.char() == 'x' + || self.char() == 'u' + || self.char() == 'U'); + + let hex_kind = match self.char() { + 'x' => ast::HexLiteralKind::X, + 'u' => ast::HexLiteralKind::UnicodeShort, + _ => ast::HexLiteralKind::UnicodeLong, + }; + if !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + if self.char() == '{' { + self.parse_hex_brace(hex_kind) + } else { + self.parse_hex_digits(hex_kind) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). + fn parse_hex_digits( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let start = self.pos(); + for i in 0..kind.digits() { + if i > 0 && !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + let end = self.pos(); + let hex = scratch.as_str(); + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::HexFixed(kind), + c: c, + }), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. 
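The hex parsing above ultimately reduces to `from_str_radix` plus a scalar-value check. A minimal standalone sketch of that core step (not the crate's code), which also shows why surrogate values must be rejected:

```rust
/// Decode the digits of a hex escape (e.g. the `61` in `\x61` or the `2603`
/// in `\u{2603}`) into a `char`, rejecting non-scalar values.
fn decode_hex_escape(digits: &str) -> Result<char, String> {
    let value = u32::from_str_radix(digits, 16)
        .map_err(|_| format!("invalid hex digits: {:?}", digits))?;
    std::char::from_u32(value)
        .ok_or_else(|| format!("not a Unicode scalar value: {:#x}", value))
}

fn main() {
    assert_eq!(decode_hex_escape("61"), Ok('a'));   // \x61
    assert_eq!(decode_hex_escape("2603"), Ok('☃')); // \u{2603}
    assert!(decode_hex_escape("D800").is_err());    // surrogate
    assert!(decode_hex_escape("zz").is_err());      // not hex
}
```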
+ fn parse_hex_brace( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let brace_pos = self.pos(); + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let end = self.pos(); + let hex = scratch.as_str(); + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if hex.is_empty() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeHexEmpty, + )); + } + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, self.pos()), + kind: ast::LiteralKind::HexBrace(kind), + c: c, + }), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. + fn parse_decimal(&self) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + while !self.is_eof() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + let span = Span::new(start, self.pos()); + while !self.is_eof() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), + } + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges, but can also contain nested character classes of + /// any type (sans `.`). + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + fn parse_set_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = ast::ClassSetUnion { + span: self.span(), + items: vec![], + }; + loop { + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + match self.char() { + '[' => { + // If we've already parsed the opening bracket, then + // attempt to treat this as the beginning of an ASCII + // class. If ASCII class parsing fails, then the parser + // backs up to `[`. 
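+                    // The class stack is only non-empty for an inner `[`,
+                    // which is why ASCII classes must be written inside
+                    // another class, e.g., `[[:alnum:]]`.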
+ if !self.parser().stack_class.borrow().is_empty() { + if let Some(cls) = self.maybe_parse_ascii_class() { + union.push(ast::ClassSetItem::Ascii(cls)); + continue; + } + } + union = try!(self.push_class_open(union)); + } + ']' => { + match try!(self.pop_class(union)) { + Either::Left(nested_union) => { union = nested_union; } + Either::Right(class) => return Ok(class), + } + } + '&' if self.peek() == Some('&') => { + assert!(self.bump_if("&&")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Intersection, union); + } + '-' if self.peek() == Some('-') => { + assert!(self.bump_if("--")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Difference, union); + } + '~' if self.peek() == Some('~') => { + assert!(self.bump_if("~~")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::SymmetricDifference, union); + } + _ => { + union.push(try!(self.parse_set_class_range())); + } + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like \w or \p{Greek}. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + fn parse_set_class_range(&self) -> Result { + let prim1 = try!(self.parse_set_class_item()); + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. + if self.char() != '-' + || self.peek() == Some(']') + || self.peek() == Some('-') + { + return prim1.into_class_set_item(self); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. + if !self.bump_and_bump_space() { + return Err(self.unclosed_class_error()); + } + let prim2 = try!(self.parse_set_class_item()); + let range = ast::ClassSetRange { + span: Span::new(prim1.span().start, prim2.span().end), + start: try!(prim1.into_class_literal(self)), + end: try!(prim2.into_class_literal(self)), + }; + if !range.is_valid() { + return Err(self.error( + range.span, + ast::ErrorKind::ClassRangeInvalid, + )); + } + Ok(ast::ClassSetItem::Range(range)) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + fn parse_set_class_item(&self) -> Result { + if self.char() == '\\' { + self.parse_escape() + } else { + let x = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: self.char(), + }); + self.bump(); + Ok(x) + } + } + + /// Parses the opening of a character class set. This includes the opening + /// bracket along with `^` if present to indicate negation. 
This also + /// starts parsing the opening set of unioned items if applicable, since + /// there are special rules applied to certain characters in the opening + /// of a character class. For example, `[^]]` is the class of all + /// characters not equal to `]`. (`]` would need to be escaped in any other + /// position.) Similarly for `-`. + /// + /// In all cases, the op inside the returned `ast::ClassBracketed` is an + /// empty union. This empty union should be replaced with the actual item + /// when it is popped from the parser's stack. + /// + /// This assumes the parser is positioned at the opening `[` and advances + /// the parser to the first non-special byte of the character class. + /// + /// An error is returned if EOF is found. + fn parse_set_class_open( + &self, + ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { + assert_eq!(self.char(), '['); + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + + let negated = + if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + true + }; + // Accept any number of `-` as literal `-`. + let mut union = ast::ClassSetUnion { + span: self.span(), + items: vec![], + }; + while self.char() == '-' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: '-', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.items.is_empty() && self.char() == ']' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: ']', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + let set = ast::ClassBracketed { + span: Span::new(start, self.pos()), + negated: negated, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: Span::new(union.span.start, union.span.start), + items: vec![], + }), + }; + Ok((set, union)) + } + + /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid ASCII character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding ASCII class is returned. + fn maybe_parse_ascii_class(&self) -> Option { + // ASCII character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an ASCII character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "ASCII character characters have the syntax `[:NAME:]` + // which can only appear within character brackets." This means that + // things like `[[:lower:]A]` are legal constructs. + // + // However, if one types an incorrect ASCII character class, e.g., + // `[[:loower:]]`, then we treat that as a normal nested character + // class containing the characters `:elorw`. 
One might argue that we + // should return an error instead since the repeated colons give away + // the intent to write an ASCII class. But what if the user typed + // `[[:lower]]` instead? How can we tell that was intended to be an + // ASCII class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense + // of better failure modes. + assert_eq!(self.char(), '['); + // If parsing fails, then we back up the parser to this starting point. + let start = self.pos(); + let mut negated = false; + if !self.bump() || self.char() != ':' { + self.parser().pos.set(start); + return None; + } + if !self.bump() { + self.parser().pos.set(start); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + self.parser().pos.set(start); + return None; + } + } + let name_start = self.offset(); + while self.char() != ':' && self.bump() {} + if self.is_eof() { + self.parser().pos.set(start); + return None; + } + let name = &self.pattern()[name_start..self.offset()]; + if !self.bump_if(":]") { + self.parser().pos.set(start); + return None; + } + let kind = match ast::ClassAsciiKind::from_name(name) { + Some(kind) => kind, + None => { + self.parser().pos.set(start); + return None; + } + }; + Some(ast::ClassAscii { + span: Span::new(start, self.pos()), + kind: kind, + negated: negated, + }) + } + + /// Parse a Unicode class in either the single character notation, `\pN` + /// or the multi-character bracketed notation, `\p{Greek}`. This assumes + /// the parser is positioned at the `p` (or `P` for negation) and will + /// advance the parser to the character immediately following the class. + /// + /// Note that this does not check whether the class name is valid or not. + fn parse_unicode_class(&self) -> Result { + assert!(self.char() == 'p' || self.char() == 'P'); + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let negated = self.char() == 'P'; + if !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let (start, kind) = + if self.char() == '{' { + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + assert_eq!(self.char(), '}'); + self.bump(); + + let name = scratch.as_str(); + if let Some(i) = name.find("!=") { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: name[..i].to_string(), + value: name[i+2..].to_string(), + }) + } else if let Some(i) = name.find(':') { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: name[..i].to_string(), + value: name[i+1..].to_string(), + }) + } else if let Some(i) = name.find('=') { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: name[..i].to_string(), + value: name[i+1..].to_string(), + }) + } else { + (start, ast::ClassUnicodeKind::Named(name.to_string())) + } + } else { + let start = self.pos(); + let c = self.char(); + self.bump_and_bump_space(); + let kind = ast::ClassUnicodeKind::OneLetter(c); + (start, kind) + }; + Ok(ast::ClassUnicode { + span: Span::new(start, self.pos()), + negated: negated, + kind: kind, + }) + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. 
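+ /// For example, `\d` maps to `ClassPerlKind::Digit` with `negated`
+ /// unset, while `\W` maps to `ClassPerlKind::Word` with `negated` set.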
This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. + fn parse_perl_class(&self) -> ast::ClassPerl { + let c = self.char(); + let span = self.span_char(); + self.bump(); + let (negated, kind) = match c { + 'd' => (false, ast::ClassPerlKind::Digit), + 'D' => (true, ast::ClassPerlKind::Digit), + 's' => (false, ast::ClassPerlKind::Space), + 'S' => (true, ast::ClassPerlKind::Space), + 'w' => (false, ast::ClassPerlKind::Word), + 'W' => (true, ast::ClassPerlKind::Word), + c => panic!("expected valid Perl class but got '{}'", c), + }; + ast::ClassPerl { span: span, kind: kind, negated: negated } + } +} + +/// A type that traverses a fully parsed Ast and checks whether its depth +/// exceeds the specified nesting limit. If it does, then an error is returned. +#[derive(Debug)] +struct NestLimiter<'p, 's: 'p, P: 'p + 's> { + /// The parser that is checking the nest limit. + p: &'p ParserI<'s, P>, + /// The current depth while walking an Ast. + depth: u32, +} + +impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { + fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { + NestLimiter { p: p, depth: 0 } + } + + fn check(self, ast: &Ast) -> Result<()> { + ast::visit(ast, self) + } + + fn increment_depth(&mut self, span: &Span) -> Result<()> { + let new = try!(self.depth.checked_add(1).ok_or_else(|| self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), + ))); + let limit = self.p.parser().nest_limit; + if new > limit { + return Err(self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(limit), + )); + } + self.depth = new; + Ok(()) + } + + fn decrement_depth(&mut self) { + // Assuming the correctness of the visitor, this should never drop + // below 0. + self.depth.checked_sub(1).unwrap(); + } +} + +impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { + type Output = (); + type Err = ast::Error; + + fn finish(self) -> Result<()> { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + let span = match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + // These are all base cases, so we don't increment depth. + return Ok(()); + } + Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + Ast::Class(ast::Class::Bracketed(_)) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + let span = match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't increment depth. 
+ return Ok(()); + } + ast::ClassSetItem::Bracketed(ref x) => &x.span, + ast::ClassSetItem::Union(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + ast::ClassSetItem::Bracketed(_) + | ast::ClassSetItem::Union(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_binary_op_pre( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.increment_depth(&ast.span) + } + + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.decrement_depth(); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use ast::{self, Ast, Position, Span}; + use super::{Parser, ParserI, ParserBuilder, Primitive}; + + // Our own assert_eq, which has slightly better formatting (but honestly + // still kind of crappy). + macro_rules! assert_eq { + ($left:expr, $right:expr) => ({ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!("assertion failed: `(left == right)`\n\n\ + left: `{:?}`\nright: `{:?}`\n\n", + left_val, right_val) + } + } + } + }); + } + + // We create these errors to compare with real ast::Errors in the tests. + // We define equality between TestError and ast::Error to disregard the + // pattern string in ast::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: ast::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &ast::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for ast::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn s(str: &str) -> String { + str.to_string() + } + + fn parser(pattern: &str) -> ParserI { + ParserI::new(Parser::new(), pattern) + } + + fn parser_octal(pattern: &str) -> ParserI { + let parser = ParserBuilder::new().octal(true).build(); + ParserI::new(parser, pattern) + } + + fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI { + let p = ParserBuilder::new().nest_limit(nest_limit).build(); + ParserI::new(p, pattern) + } + + fn parser_ignore_whitespace(pattern: &str) -> ParserI { + let p = ParserBuilder::new().ignore_whitespace(true).build(); + ParserI::new(p, pattern) + } + + /// Short alias for creating a new span. + fn nspan(start: Position, end: Position) -> Span { + Span::new(start, end) + } + + /// Short alias for creating a new position. + fn npos(offset: usize, line: usize, column: usize) -> Position { + Position::new(offset, line, column) + } + + /// Create a new span from the given offset range. This assumes a single + /// line and sets the columns based on the offsets. i.e., This only works + /// out of the box for ASCII, which is fine for most tests. + fn span(range: Range) -> Span { + let start = Position::new(range.start, 1, range.start + 1); + let end = Position::new(range.end, 1, range.end + 1); + Span::new(start, end) + } + + /// Create a new span for the corresponding byte range in the given string. 
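+ /// The line is one more than the number of `\n` characters before the
+ /// offset, and the column is one more than the number of characters
+ /// since the most recent `\n` (or since the start of the subject). For
+ /// example, `span_range("a\nbc", 2..3)` covers the `b`: it starts at
+ /// (offset 2, line 2, column 1) and ends at (offset 3, line 2, column 2).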
+ fn span_range(subject: &str, range: Range) -> Span { + let start = Position { + offset: range.start, + line: 1 + subject[..range.start].matches('\n').count(), + column: 1 + subject[..range.start] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.start].chars().count()), + }; + let end = Position { + offset: range.end, + line: 1 + subject[..range.end].matches('\n').count(), + column: 1 + subject[..range.end] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.end].chars().count()), + }; + Span::new(start, end) + } + + /// Create a verbatim literal starting at the given position. + fn lit(c: char, start: usize) -> Ast { + lit_with(c, span(start..start + c.len_utf8())) + } + + /// Create a punctuation literal starting at the given position. + fn punct_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + }) + } + + /// Create a verbatim literal with the given span. + fn lit_with(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + /// Create a concatenation with the given range. + fn concat(range: Range, asts: Vec) -> Ast { + concat_with(span(range), asts) + } + + /// Create a concatenation with the given span. + fn concat_with(span: Span, asts: Vec) -> Ast { + Ast::Concat(ast::Concat { span: span, asts: asts }) + } + + /// Create an alternation with the given span. + fn alt(range: Range, asts: Vec) -> Ast { + Ast::Alternation(ast::Alternation { span: span(range), asts: asts }) + } + + /// Create a capturing group with the given span. + fn group(range: Range, index: u32, ast: Ast) -> Ast { + Ast::Group(ast::Group { + span: span(range), + kind: ast::GroupKind::CaptureIndex(index), + ast: Box::new(ast), + }) + } + + /// Create an ast::SetFlags. + /// + /// The given pattern should be the full pattern string. The range given + /// should correspond to the byte offsets where the flag set occurs. + /// + /// If negated is true, then the set is interpreted as beginning with a + /// negation. + fn flag_set( + pat: &str, + range: Range, + flag: ast::Flag, + negated: bool, + ) -> Ast { + let mut items = vec![ + ast::FlagsItem { + span: span_range(pat, (range.end - 2)..(range.end - 1)), + kind: ast::FlagsItemKind::Flag(flag), + }, + ]; + if negated { + items.insert(0, ast::FlagsItem { + span: span_range(pat, (range.start + 2)..(range.end - 2)), + kind: ast::FlagsItemKind::Negation, + }); + } + Ast::Flags(ast::SetFlags { + span: span_range(pat, range.clone()), + flags: ast::Flags { + span: span_range(pat, (range.start + 2)..(range.end - 1)), + items: items, + }, + }) + } + + #[test] + fn parse_nest_limit() { + // A nest limit of 0 still allows some types of regexes. + assert_eq!( + parser_nest_limit("", 0).parse(), + Ok(Ast::Empty(span(0..0)))); + assert_eq!( + parser_nest_limit("a", 0).parse(), + Ok(lit('a', 0))); + + // Test repetition operations, which require one level of nesting. 
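+ // (Per the NestLimiter above, literals are base cases and contribute
+ // nothing to the depth, so `a` parses even with a limit of 0, while the
+ // repetition node in `a+` raises the depth to 1 and therefore needs a
+ // limit of at least 1.)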
+ assert_eq!( + parser_nest_limit("a+", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("a+", 1).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser_nest_limit("(a)+", 1).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("a+*", 1).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("a+*", 2).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })), + }))); + + // Test concatenations. A concatenation requires one level of nesting. + assert_eq!( + parser_nest_limit("ab", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("ab", 1).parse(), + Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))); + assert_eq!( + parser_nest_limit("abc", 1).parse(), + Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))); + + // Test alternations. An alternation requires one level of nesting. + assert_eq!( + parser_nest_limit("a|b", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("a|b", 1).parse(), + Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))); + assert_eq!( + parser_nest_limit("a|b|c", 1).parse(), + Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))); + + // Test character classes. Classes form their own mini-recursive + // syntax! + assert_eq!( + parser_nest_limit("[a]", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("[a]", 1).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::Item( + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: 'a', + }) + ), + })))); + assert_eq!( + parser_nest_limit("[ab]", 1).parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::NestLimitExceeded(2), + }); + assert_eq!( + parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(3), + }); + assert_eq!( + parser_nest_limit("[a--b]", 1).parse().unwrap_err(), + TestError { + span: span(1..5), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(2), + }); + } + + #[test] + fn parse_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. 
+bar +# This is comment 4."; + let astc = parser(pat).parse_with_comments().unwrap(); + assert_eq!( + astc.ast, + concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('f', span_range(pat, 26..27)), + lit_with('o', span_range(pat, 27..28)), + lit_with('o', span_range(pat, 28..29)), + lit_with('b', span_range(pat, 74..75)), + lit_with('a', span_range(pat, 75..76)), + lit_with('r', span_range(pat, 76..77)), + ])); + assert_eq!(astc.comments, vec![ + ast::Comment { + span: span_range(pat, 5..26), + comment: s(" This is comment 1."), + }, + ast::Comment { + span: span_range(pat, 30..51), + comment: s(" This is comment 2."), + }, + ast::Comment { + span: span_range(pat, 53..74), + comment: s(" This is comment 3."), + }, + ast::Comment { + span: span_range(pat, 78..98), + comment: s(" This is comment 4."), + }, + ]); + } + + #[test] + fn parse_holistic() { + assert_eq!( + parser("]").parse(), + Ok(lit(']', 0))); + assert_eq!( + parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), + Ok(concat(0..36, vec![ + punct_lit('\\', span(0..2)), + punct_lit('.', span(2..4)), + punct_lit('+', span(4..6)), + punct_lit('*', span(6..8)), + punct_lit('?', span(8..10)), + punct_lit('(', span(10..12)), + punct_lit(')', span(12..14)), + punct_lit('|', span(14..16)), + punct_lit('[', span(16..18)), + punct_lit(']', span(18..20)), + punct_lit('{', span(20..22)), + punct_lit('}', span(22..24)), + punct_lit('^', span(24..26)), + punct_lit('$', span(26..28)), + punct_lit('#', span(28..30)), + punct_lit('&', span(30..32)), + punct_lit('-', span(32..34)), + punct_lit('~', span(34..36)), + ]))); + } + + #[test] + fn parse_ignore_whitespace() { + // Test that basic whitespace insensitivity works. + let pat = "(?x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + ]))); + + // Test that we can toggle whitespace insensitivity. + let pat = "(?x)a b(?-x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), + lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), + lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), + lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), + ]))); + + // Test that nesting whitespace insensitive flags works. + let pat = "a (?x:a )a "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..11), vec![ + lit_with('a', span_range(pat, 0..1)), + lit_with(' ', span_range(pat, 1..2)), + Ast::Group(ast::Group { + span: span_range(pat, 2..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 4..5), + items: vec![ + ast::FlagsItem { + span: span_range(pat, 4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::IgnoreWhitespace), + }, + ], + }), + ast: Box::new(lit_with('a', span_range(pat, 6..7))), + }), + lit_with('a', span_range(pat, 9..10)), + lit_with(' ', span_range(pat, 10..11)), + ]))); + + // Test that whitespace after an opening paren is insignificant. 
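+ // (With the `x` flag enabled, spaces are also skipped between `(`, a
+ // group prefix such as `?P<name>` or `?:`, and the group body, which is
+ // why the capture name and literal spans in the asserts below begin well
+ // after the opening parenthesis.)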
+ let pat = "(?x)( ?P a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + }), + ast: Box::new(lit_with('a', span_range(pat, 14..15))), + }), + ]))); + let pat = "(?x)( a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit_with('a', span_range(pat, 7..8))), + }), + ]))); + let pat = "(?x)( ?: a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 8..8), + items: vec![], + }), + ast: Box::new(lit_with('a', span_range(pat, 11..12))), + }), + ]))); + let pat = r"(?x)\x { 53 }"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span(4..13), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: 'S', + }), + ]))); + + // Test that whitespace after an escape is OK. + let pat = r"(?x)\ "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span_range(pat, 4..6), + kind: ast::LiteralKind::Special( + ast::SpecialLiteralKind::Space), + c: ' ', + }), + ]))); + // ... but only when `x` mode is enabled. 
+ let pat = r"\ "; + assert_eq!( + parser(pat).parse().unwrap_err(), + TestError { + span: span_range(pat, 0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_newlines() { + let pat = ".\n."; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..3), vec![ + Ast::Dot(span_range(pat, 0..1)), + lit_with('\n', span_range(pat, 1..2)), + Ast::Dot(span_range(pat, 2..3)), + ]))); + + let pat = "foobar\nbaz\nquux\n"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), + lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), + lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), + lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), + lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), + lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), + lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), + lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), + lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), + lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), + lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), + lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), + lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), + lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), + ]))); + } + + #[test] + fn parse_uncounted_repetition() { + assert_eq!( + parser(r"a*").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a+").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a??").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a?b").parse(), + Ok(concat(0..3, vec![ + Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }), + lit('b', 2), + ]))); + assert_eq!( + parser(r"a??b").parse(), + Ok(concat(0..4, vec![ + Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }), + lit('b', 3), + ]))); + assert_eq!( + parser(r"ab?").parse(), + Ok(concat(0..3, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('b', 
1)), + }), + ]))); + assert_eq!( + parser(r"(ab)?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(4..5), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(group(0..4, 1, concat(1..3, vec![ + lit('a', 1), + lit('b', 2), + ]))), + }))); + assert_eq!( + parser(r"|a?").parse(), + Ok(alt(0..3, vec![ + Ast::Empty(span(0..0)), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 1)), + }), + ]))); + + assert_eq!( + parser(r"*").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(*)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(?:?)").parse().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"+").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"?").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(?)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|*").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|+").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|?").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + } + + #[test] + fn parse_counted_repetition() { + assert_eq!( + parser(r"a{5}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..4), + op: ast::RepetitionOp { + span: span(1..4), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{5,}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::AtLeast(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{5,9}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{5}?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"ab{5}").parse(), + Ok(concat(0..5, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ]))); + assert_eq!( + parser(r"ab{5}c").parse(), + Ok(concat(0..6, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + 
ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + lit('c', 5), + ]))); + + assert_eq!( + parser(r"a{ 5 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{ 5 , 9 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..10), + op: ast::RepetitionOp { + span: span(1..10), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser_ignore_whitespace(r"a{5,9} ?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..8), + op: ast::RepetitionOp { + span: span(1..8), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + + assert_eq!( + parser(r"a{").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{a").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{9999999999}").parse().unwrap_err(), + TestError { + span: span(2..12), + kind: ast::ErrorKind::DecimalInvalid, + }); + assert_eq!( + parser(r"a{9").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{9,a").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{9,9999999999}").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::DecimalInvalid, + }); + assert_eq!( + parser(r"a{9,").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{9,11").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{2,1}").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountInvalid, + }); + assert_eq!( + parser(r"{5}").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|{5}").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + } + + #[test] + fn parse_alternate() { + assert_eq!( + parser(r"a|b").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..3), + asts: vec![lit('a', 0), lit('b', 2)], + }))); + assert_eq!( + parser(r"(a|b)").parse(), + Ok(group(0..5, 1, Ast::Alternation(ast::Alternation { + span: span(1..4), + asts: vec![lit('a', 1), lit('b', 3)], + })))); + + assert_eq!( + parser(r"a|b|c").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..5), + asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], + }))); + assert_eq!( + parser(r"ax|by|cz").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..8), + asts: vec![ + concat(0..2, vec![lit('a', 0), lit('x', 1)]), + concat(3..5, vec![lit('b', 3), lit('y', 4)]), + concat(6..8, vec![lit('c', 6), lit('z', 7)]), + ], + }))); + assert_eq!( + parser(r"(ax|by|cz)").parse(), + Ok(group(0..10, 1, 
Ast::Alternation(ast::Alternation { + span: span(1..9), + asts: vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + concat(4..6, vec![lit('b', 4), lit('y', 5)]), + concat(7..9, vec![lit('c', 7), lit('z', 8)]), + ], + })))); + assert_eq!( + parser(r"(ax|(by|(cz)))").parse(), + Ok(group(0..14, 1, alt(1..13, vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + group(4..13, 2, alt(5..12, vec![ + concat(5..7, vec![lit('b', 5), lit('y', 6)]), + group(8..12, 3, concat(9..11, vec![ + lit('c', 9), + lit('z', 10), + ])), + ])), + ])))); + + assert_eq!( + parser(r"|").parse(), Ok(alt(0..1, vec![ + Ast::Empty(span(0..0)), Ast::Empty(span(1..1)), + ]))); + assert_eq!( + parser(r"||").parse(), Ok(alt(0..2, vec![ + Ast::Empty(span(0..0)), + Ast::Empty(span(1..1)), + Ast::Empty(span(2..2)), + ]))); + assert_eq!( + parser(r"a|").parse(), Ok(alt(0..2, vec![ + lit('a', 0), Ast::Empty(span(2..2)), + ]))); + assert_eq!( + parser(r"|a").parse(), Ok(alt(0..2, vec![ + Ast::Empty(span(0..0)), lit('a', 1), + ]))); + + assert_eq!( + parser(r"(|)").parse(), Ok(group(0..3, 1, alt(1..2, vec![ + Ast::Empty(span(1..1)), Ast::Empty(span(2..2)), + ])))); + assert_eq!( + parser(r"(a|)").parse(), Ok(group(0..4, 1, alt(1..3, vec![ + lit('a', 1), Ast::Empty(span(3..3)), + ])))); + assert_eq!( + parser(r"(|a)").parse(), Ok(group(0..4, 1, alt(1..3, vec![ + Ast::Empty(span(1..1)), lit('a', 2), + ])))); + + assert_eq!( + parser(r"a|b)").parse().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::GroupUnopened, + }); + assert_eq!( + parser(r"(a|b").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + }); + } + + #[test] + fn parse_unsupported_lookaround() { + assert_eq!( + parser(r"(?=a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?!a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?<=a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?z)").parse(), Ok(Ast::Group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + }), + ast: Box::new(lit('z', 6)), + }))); + assert_eq!(parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + }))); + + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + }); + assert_eq!( + parser("(?P<>z)").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameEmpty, + }); + assert_eq!( + parser("(?Py)(?Pz)").parse().unwrap_err(), + TestError { + span: span(12..13), + kind: ast::ErrorKind::GroupNameDuplicate { + original: span(4..5), + }, + }); + } + + #[test] + fn parse_flags() { + assert_eq!(parser("i:").parse_flags(), Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + })); + assert_eq!(parser("i)").parse_flags(), Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + })); + + assert_eq!(parser("isU:").parse_flags(), 
Ok(ast::Flags { + span: span(0..3), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + + assert_eq!(parser("-isU:").parse_flags(), Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + assert_eq!(parser("i-sU:").parse_flags(), Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + + assert_eq!( + parser("isU").parse_flags().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::FlagUnexpectedEof, + }); + assert_eq!( + parser("isUa:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagUnrecognized, + }); + assert_eq!( + parser("isUi:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagDuplicate { + original: span(0..1), + }, + }); + assert_eq!( + parser("i-sU-i:").parse_flags().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::FlagRepeatedNegation { + original: span(1..2), + }, + }); + assert_eq!( + parser("-)").parse_flags().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + assert_eq!( + parser("i-)").parse_flags().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + assert_eq!( + parser("iU-)").parse_flags().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + } + + #[test] + fn parse_flag() { + assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); + assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); + assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); + assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); + assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); + + assert_eq!( + parser("a").parse_flag().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagUnrecognized, + }); + assert_eq!( + parser("☃").parse_flag().unwrap_err(), + TestError { + span: span_range("☃", 0..3), + kind: ast::ErrorKind::FlagUnrecognized, + }); + } + + #[test] + fn parse_primitive_non_escape() { + assert_eq!( + parser(r".").parse_primitive(), + Ok(Primitive::Dot(span(0..1)))); + assert_eq!( + parser(r"^").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::StartLine, + }))); + assert_eq!( + parser(r"$").parse_primitive(), + 
Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::EndLine, + }))); + + assert_eq!( + parser(r"a").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: 'a', + }))); + assert_eq!( + parser(r"|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: '|', + }))); + assert_eq!( + parser(r"☃").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span_range("☃", 0..3), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }))); + } + + #[test] + fn parse_escape() { + assert_eq!( + parser(r"\|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Punctuation, + c: '|', + }))); + let specials = &[ + (r"\a", '\x07', ast::SpecialLiteralKind::Bell), + (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), + (r"\t", '\t', ast::SpecialLiteralKind::Tab), + (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), + (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), + (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), + ]; + for &(pat, c, ref kind) in specials { + assert_eq!( + parser(pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Special(kind.clone()), + c: c, + }))); + } + assert_eq!( + parser(r"\A").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::StartText, + }))); + assert_eq!( + parser(r"\z").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::EndText, + }))); + assert_eq!( + parser(r"\b").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + }))); + assert_eq!( + parser(r"\B").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::NotWordBoundary, + }))); + + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\y").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_unsupported_backreference() { + assert_eq!( + parser(r"\0").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + }); + assert_eq!( + parser(r"\9").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + }); + } + + #[test] + fn parse_octal() { + for i in 0..511 { + let pat = format!(r"\{:o}", i); + assert_eq!( + parser_octal(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::Octal, + c: ::std::char::from_u32(i).unwrap(), + }))); + } + assert_eq!( + parser_octal(r"\778").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }))); + assert_eq!( + parser_octal(r"\7777").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }))); + assert_eq!( + parser_octal(r"\778").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: 
ast::LiteralKind::Verbatim, + c: '8', + }), + ], + }))); + assert_eq!( + parser_octal(r"\7777").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..5), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: '7', + }), + ], + }))); + + assert_eq!( + parser_octal(r"\8").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_hex_two() { + for i in 0..256 { + let pat = format!(r"\x{:02x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), + c: ::std::char::from_u32(i).unwrap(), + }))); + } + + assert_eq!( + parser(r"\xF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\xG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\xFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + } + + #[test] + fn parse_hex_four() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\u{:04x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeShort), + c: c, + }))); + } + + assert_eq!( + parser(r"\uF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\uG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uD800").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + } + + #[test] + fn parse_hex_eight() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\U{:08x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeLong), + c: c, + }))); + } + + assert_eq!( + parser(r"\UF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\UG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + 
}); + assert_eq!( + parser(r"\UFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(6..7), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(8..9), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(9..10), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + } + + #[test] + fn parse_hex_brace() { + assert_eq!( + parser(r"\u{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeShort), + c: '⛄', + }))); + assert_eq!( + parser(r"\U{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeLong), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{26C4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{10fFfF}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..10), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '\u{10FFFF}', + }))); + + assert_eq!( + parser(r"\x").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{FF").parse_escape().unwrap_err(), + TestError { + span: span(2..5), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{}").parse_escape().unwrap_err(), + TestError { + span: span(2..4), + kind: ast::ErrorKind::EscapeHexEmpty, + }); + assert_eq!( + parser(r"\x{FGF}").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..9), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + assert_eq!( + parser(r"\x{D800}").parse_escape().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + assert_eq!( + parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..12), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + } + + #[test] + fn parse_decimal() { + assert_eq!(parser("123").parse_decimal(), Ok(123)); + assert_eq!(parser("0").parse_decimal(), Ok(0)); + assert_eq!(parser("01").parse_decimal(), Ok(1)); + + assert_eq!( + parser("-1").parse_decimal().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser("").parse_decimal().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser("9999999999").parse_decimal().unwrap_err(), + TestError { + span: span(0..10), + kind: 
ast::ErrorKind::DecimalInvalid, + }); + } + + #[test] + fn parse_set_class() { + fn union(span: Span, items: Vec) -> ast::ClassSet { + ast::ClassSet::union(ast::ClassSetUnion { + span: span, + items: items, + }) + } + + fn intersection( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Intersection, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn difference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Difference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn symdifference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::SymmetricDifference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { + ast::ClassSet::Item(item) + } + + fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { + ast::ClassSetItem::Ascii(cls) + } + + fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { + ast::ClassSetItem::Unicode(cls) + } + + fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { + ast::ClassSetItem::Perl(cls) + } + + fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { + ast::ClassSetItem::Bracketed(Box::new(cls)) + } + + fn lit(span: Span, c: char) -> ast::ClassSetItem { + ast::ClassSetItem::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + fn empty(span: Span) -> ast::ClassSetItem { + ast::ClassSetItem::Empty(span) + } + + fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { + let pos1 = Position { + offset: span.start.offset + start.len_utf8(), + column: span.start.column + 1, + ..span.start + }; + let pos2 = Position { + offset: span.end.offset - end.len_utf8(), + column: span.end.column - 1, + ..span.end + }; + ast::ClassSetItem::Range(ast::ClassSetRange { + span: span, + start: ast::Literal { + span: Span { end: pos1, ..span }, + kind: ast::LiteralKind::Verbatim, + c: start, + }, + end: ast::Literal { + span: Span { start: pos2, ..span }, + kind: ast::LiteralKind::Verbatim, + c: end, + }, + }) + } + + fn alnum(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Alnum, + negated: negated, + } + } + + fn lower(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Lower, + negated: negated, + } + } + + assert_eq!( + parser("[[:alnum:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..11), + negated: false, + kind: itemset(item_ascii(alnum(span(1..10), false))), + })))); + assert_eq!( + parser("[[[:alnum:]]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..13), + negated: false, + kind: itemset(item_bracket(ast::ClassBracketed { + span: span(1..12), + negated: false, + kind: itemset(item_ascii(alnum(span(2..11), false))), + })), + })))); + assert_eq!( + parser("[[:alnum:]&&[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: intersection( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + 
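+ // The `--` (difference) and `~~` (symmetric difference) cases below
+ // produce the same tree shape as the `&&` case above, differing only in
+ // the `ClassSetBinaryOpKind`.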
assert_eq!( + parser("[[:alnum:]--[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: difference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + assert_eq!( + parser("[[:alnum:]~~[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: symdifference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + + assert_eq!( + parser("[a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), 'a')), + })))); + assert_eq!( + parser(r"[a\]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union(span(1..4), vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: ']', + }), + ]), + })))); + assert_eq!( + parser(r"[a\-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union(span(1..5), vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '-', + }), + lit(span(4..5), 'z'), + ]), + })))); + assert_eq!( + parser("[ab]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), vec![ + lit(span(1..2), 'a'), + lit(span(2..3), 'b'), + ]), + })))); + assert_eq!( + parser("[a-]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), vec![ + lit(span(1..2), 'a'), + lit(span(2..3), '-'), + ]), + })))); + assert_eq!( + parser("[-a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), vec![ + lit(span(1..2), '-'), + lit(span(2..3), 'a'), + ]), + })))); + assert_eq!( + parser(r"[\pL]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(item_unicode(ast::ClassUnicode { + span: span(1..4), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('L'), + })), + })))); + assert_eq!( + parser(r"[\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + })))); + assert_eq!( + parser(r"[a\wz]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union(span(1..5), vec![ + lit(span(1..2), 'a'), + item_perl(ast::ClassPerl { + span: span(2..4), + kind: ast::ClassPerlKind::Word, + negated: false, + }), + lit(span(4..5), 'z'), + ]), + })))); + + assert_eq!( + parser("[a-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(range(span(1..4), 'a', 'z')), + })))); + assert_eq!( + parser("[a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..8), + negated: false, + kind: union(span(1..7), vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ]), + })))); + assert_eq!( + 
parser(r"[\w&&a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + union(span(5..11), vec![ + range(span(5..8), 'a', 'c'), + range(span(8..11), 'x', 'z'), + ]), + ), + })))); + assert_eq!( + parser(r"[a-cx-z&&\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + union(span(1..7), vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ]), + itemset(item_perl(ast::ClassPerl { + span: span(9..11), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + ), + })))); + assert_eq!( + parser(r"[a--b--c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: difference( + span(1..8), + difference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })))); + assert_eq!( + parser(r"[a~~b~~c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: symdifference( + span(1..8), + symdifference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })))); + assert_eq!( + parser(r"[\^&&^]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '^', + })), + itemset(lit(span(5..6), '^')), + ), + })))); + assert_eq!( + parser(r"[\&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '&', + })), + itemset(lit(span(5..6), '&')), + ), + })))); + assert_eq!( + parser(r"[&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: intersection( + span(1..5), + intersection( + span(1..3), + itemset(empty(span(1..1))), + itemset(empty(span(3..3))), + ), + itemset(empty(span(5..5))), + ), + })))); + + let pat = "[☃-⛄]"; + assert_eq!( + parser(pat).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span_range(pat, 0..9), + negated: false, + kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { + span: span_range(pat, 1..8), + start: ast::Literal { + span: span_range(pat, 1..4), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }, + end: ast::Literal { + span: span_range(pat, 5..8), + kind: ast::LiteralKind::Verbatim, + c: '⛄', + }, + })), + })))); + + assert_eq!( + parser(r"[]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), ']')), + })))); + assert_eq!( + parser(r"[]\[]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union(span(1..4), vec![ + lit(span(1..2), ']'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '[', + }), + ]), + })))); + assert_eq!( + parser(r"[\[]]").parse(), + Ok(concat(0..5, vec![ + 
Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '[', + })), + })), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ]))); + + assert_eq!( + parser("[").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[[").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[[-]").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[[[:alnum:]").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser(r"[\b]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[\w-a]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[a-\w]").parse().unwrap_err(), + TestError { + span: span(3..5), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[z-a]").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::ClassRangeInvalid, + }); + + assert_eq!( + parser_ignore_whitespace("[a ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser_ignore_whitespace("[a- ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + } + + #[test] + fn parse_set_class_open() { + assert_eq!( + parser("[a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..1), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + 
Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ - a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[--a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ] a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[-]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + + assert_eq!( + parser("[").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser_ignore_whitespace("[ ") + .parse_set_class_open() + .unwrap_err(), + 
TestError { + span: span(0..5), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[^").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[]").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[-").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[--").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::ClassUnclosed, + }); + } + + #[test] + fn maybe_parse_ascii_class() { + assert_eq!( + parser(r"[:alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + })); + assert_eq!( + parser(r"[:alnum:]A").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + })); + assert_eq!( + parser(r"[:^alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..10), + kind: ast::ClassAsciiKind::Alnum, + negated: true, + })); + + let p = parser(r"[:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:^"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[^:alnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + } + + #[test] + fn parse_unicode_class() { + assert_eq!( + parser(r"\pN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + }))); + assert_eq!( + parser(r"\PN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: true, + kind: ast::ClassUnicodeKind::OneLetter('N'), + }))); + assert_eq!( + parser(r"\p{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("N")), + }))); + assert_eq!( + parser(r"\P{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: true, + kind: ast::ClassUnicodeKind::Named(s("N")), + }))); + assert_eq!( + parser(r"\p{Greek}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + }))); + + assert_eq!( + parser(r"\p{scx:Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s("scx"), + value: s("Katakana"), + }, + }))); + assert_eq!( + parser(r"\p{scx=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s("scx"), + value: s("Katakana"), + }, + }))); + assert_eq!( + parser(r"\p{scx!=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..17), + negated: 
false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s("scx"), + value: s("Katakana"), + }, + }))); + + assert_eq!( + parser(r"\p{:}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s(""), + value: s(""), + }, + }))); + assert_eq!( + parser(r"\p{=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s(""), + value: s(""), + }, + }))); + assert_eq!( + parser(r"\p{!=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..6), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s(""), + value: s(""), + }, + }))); + + assert_eq!( + parser(r"\p").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{N").parse_escape().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{Greek").parse_escape().unwrap_err(), + TestError { + span: span(8..8), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + + assert_eq!( + parser(r"\pNz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + assert_eq!( + parser(r"\p{Greek}z").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..10), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })), + Ast::Literal(ast::Literal { + span: span(9..10), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + } + + #[test] + fn parse_perl_class() { + assert_eq!( + parser(r"\d").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + }))); + assert_eq!( + parser(r"\D").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: true, + }))); + assert_eq!( + parser(r"\s").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: false, + }))); + assert_eq!( + parser(r"\S").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: true, + }))); + assert_eq!( + parser(r"\w").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: false, + }))); + assert_eq!( + parser(r"\W").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: true, + }))); + + assert_eq!( + parser(r"\d").parse(), + Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })))); + assert_eq!( + parser(r"\dz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..3), + asts: 
vec![ + Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })), + Ast::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + } +} diff --git a/regex-syntax-2/src/ast/print.rs b/regex-syntax-2/src/ast/print.rs new file mode 100644 index 0000000000..0d6dfb0a20 --- /dev/null +++ b/regex-syntax-2/src/ast/print.rs @@ -0,0 +1,591 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This module provides a regular expression printer. +*/ + +use std::fmt; + +use ast::{self, Ast}; +use ast::visitor::{self, Visitor}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { + _priv: (), + } + } + + fn build(&self) -> Printer { + Printer { + _priv: (), + } + } +} + +/// A printer for a regular expression abstract syntax tree. +/// +/// A printer converts an abstract syntax tree (AST) to a regular expression +/// pattern string. This particular printer uses constant stack space and heap +/// space proportional to the size of the AST. +/// +/// This printer will not necessarily preserve the original formatting of the +/// regular expression pattern string. For example, all whitespace and comments +/// are ignored. +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. 
+ pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { + visitor::visit(ast, Writer { printer: self, wtr: wtr }) + } +} + +#[derive(Debug)] +struct Writer<'p, W> { + printer: &'p mut Printer, + wtr: W, +} + +impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()) + } + } + + fn visit_post(&mut self, ast: &Ast) -> fmt::Result { + use ast::Class; + + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + Ast::Class(Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_post(x) + } + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), + } + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + match *ast { + ast::ClassSetItem::Bracketed(ref x) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + use ast::ClassSetItem::*; + + match *ast { + Empty(_) => Ok(()), + Literal(ref x) => self.fmt_literal(x), + Range(ref x) => { + try!(self.fmt_literal(&x.start)); + try!(self.wtr.write_str("-")); + try!(self.fmt_literal(&x.end)); + Ok(()) + } + Ascii(ref x) => self.fmt_class_ascii(x), + Unicode(ref x) => self.fmt_class_unicode(x), + Perl(ref x) => self.fmt_class_perl(x), + Bracketed(ref x) => self.fmt_class_bracketed_post(x), + Union(_) => Ok(()), + } + } + + fn visit_class_set_binary_op_in( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + self.fmt_class_set_binary_op_kind(&ast.kind) + } +} + +impl<'p, W: fmt::Write> Writer<'p, W> { + fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { + use ast::GroupKind::*; + match ast.kind { + CaptureIndex(_) => self.wtr.write_str("("), + CaptureName(ref x) => { + try!(self.wtr.write_str("(?P<")); + try!(self.wtr.write_str(&x.name)); + try!(self.wtr.write_str(">")); + Ok(()) + } + NonCapturing(ref flags) => { + try!(self.wtr.write_str("(?")); + try!(self.fmt_flags(flags)); + try!(self.wtr.write_str(":")); + Ok(()) + } + } + } + + fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { + self.wtr.write_str(")") + } + + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { + use ast::RepetitionKind::*; + match ast.op.kind { + ZeroOrOne if ast.greedy => self.wtr.write_str("?"), + ZeroOrOne => self.wtr.write_str("??"), + ZeroOrMore if ast.greedy => self.wtr.write_str("*"), + ZeroOrMore => self.wtr.write_str("*?"), + OneOrMore if ast.greedy => self.wtr.write_str("+"), + OneOrMore => self.wtr.write_str("+?"), + Range(ref x) => { + try!(self.fmt_repetition_range(x)); + if !ast.greedy { + try!(self.wtr.write_str("?")); + } + Ok(()) + } + } + } + + fn fmt_repetition_range( + &mut self, + ast: &ast::RepetitionRange, + ) -> fmt::Result { + use 
ast::RepetitionRange::*; + match *ast { + Exactly(x) => write!(self.wtr, "{{{}}}", x), + AtLeast(x) => write!(self.wtr, "{{{},}}", x), + Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), + } + } + + fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { + use ast::LiteralKind::*; + + match ast.kind { + Verbatim => self.wtr.write_char(ast.c), + Punctuation => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + HexFixed(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{:02X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{:04X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{:08X}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + } + Special(ast::SpecialLiteralKind::Bell) => { + self.wtr.write_str(r"\a") + } + Special(ast::SpecialLiteralKind::FormFeed) => { + self.wtr.write_str(r"\f") + } + Special(ast::SpecialLiteralKind::Tab) => { + self.wtr.write_str(r"\t") + } + Special(ast::SpecialLiteralKind::LineFeed) => { + self.wtr.write_str(r"\n") + } + Special(ast::SpecialLiteralKind::CarriageReturn) => { + self.wtr.write_str(r"\r") + } + Special(ast::SpecialLiteralKind::VerticalTab) => { + self.wtr.write_str(r"\v") + } + Special(ast::SpecialLiteralKind::Space) => { + self.wtr.write_str(r"\ ") + } + } + } + + fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { + use ast::AssertionKind::*; + match ast.kind { + StartLine => self.wtr.write_str("^"), + EndLine => self.wtr.write_str("$"), + StartText => self.wtr.write_str(r"\A"), + EndText => self.wtr.write_str(r"\z"), + WordBoundary => self.wtr.write_str(r"\b"), + NotWordBoundary => self.wtr.write_str(r"\B"), + } + } + + fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { + try!(self.wtr.write_str("(?")); + try!(self.fmt_flags(&ast.flags)); + try!(self.wtr.write_str(")")); + Ok(()) + } + + fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { + use ast::{Flag, FlagsItemKind}; + + for item in &ast.items { + try!(match item.kind { + FlagsItemKind::Negation => self.wtr.write_str("-"), + FlagsItemKind::Flag(ref flag) => { + match *flag { + Flag::CaseInsensitive => self.wtr.write_str("i"), + Flag::MultiLine => self.wtr.write_str("m"), + Flag::DotMatchesNewLine => self.wtr.write_str("s"), + Flag::SwapGreed => self.wtr.write_str("U"), + Flag::Unicode => self.wtr.write_str("u"), + Flag::IgnoreWhitespace => self.wtr.write_str("x"), + } + } + }); + } + Ok(()) + } + + fn fmt_class_bracketed_pre( + &mut self, + ast: &ast::ClassBracketed, + ) -> fmt::Result { + if ast.negated { + self.wtr.write_str("[^") + } else { + self.wtr.write_str("[") + } + } + + fn fmt_class_bracketed_post( + &mut self, + _ast: &ast::ClassBracketed, + ) -> fmt::Result { + self.wtr.write_str("]") + } + + fn fmt_class_set_binary_op_kind( + &mut self, + ast: &ast::ClassSetBinaryOpKind, + ) -> fmt::Result { + use ast::ClassSetBinaryOpKind::*; + match *ast { + Intersection => self.wtr.write_str("&&"), + Difference => self.wtr.write_str("--"), + SymmetricDifference => self.wtr.write_str("~~"), + } + } + + fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { + use ast::ClassPerlKind::*; + match ast.kind { + Digit if ast.negated => 
self.wtr.write_str(r"\D"), + Digit => self.wtr.write_str(r"\d"), + Space if ast.negated => self.wtr.write_str(r"\S"), + Space => self.wtr.write_str(r"\s"), + Word if ast.negated => self.wtr.write_str(r"\W"), + Word => self.wtr.write_str(r"\w"), + } + } + + fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { + use ast::ClassAsciiKind::*; + match ast.kind { + Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), + Alnum => self.wtr.write_str("[:alnum:]"), + Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), + Alpha => self.wtr.write_str("[:alpha:]"), + Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), + Ascii => self.wtr.write_str("[:ascii:]"), + Blank if ast.negated => self.wtr.write_str("[:^blank:]"), + Blank => self.wtr.write_str("[:blank:]"), + Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), + Cntrl => self.wtr.write_str("[:cntrl:]"), + Digit if ast.negated => self.wtr.write_str("[:^digit:]"), + Digit => self.wtr.write_str("[:digit:]"), + Graph if ast.negated => self.wtr.write_str("[:^graph:]"), + Graph => self.wtr.write_str("[:graph:]"), + Lower if ast.negated => self.wtr.write_str("[:^lower:]"), + Lower => self.wtr.write_str("[:lower:]"), + Print if ast.negated => self.wtr.write_str("[:^print:]"), + Print => self.wtr.write_str("[:print:]"), + Punct if ast.negated => self.wtr.write_str("[:^punct:]"), + Punct => self.wtr.write_str("[:punct:]"), + Space if ast.negated => self.wtr.write_str("[:^space:]"), + Space => self.wtr.write_str("[:space:]"), + Upper if ast.negated => self.wtr.write_str("[:^upper:]"), + Upper => self.wtr.write_str("[:upper:]"), + Word if ast.negated => self.wtr.write_str("[:^word:]"), + Word => self.wtr.write_str("[:word:]"), + Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), + Xdigit => self.wtr.write_str("[:xdigit:]"), + } + } + + fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { + use ast::ClassUnicodeKind::*; + use ast::ClassUnicodeOpKind::*; + + if ast.negated { + try!(self.wtr.write_str(r"\P")); + } else { + try!(self.wtr.write_str(r"\p")); + } + match ast.kind { + OneLetter(c) => self.wtr.write_char(c), + Named(ref x) => write!(self.wtr, "{{{}}}", x), + NamedValue { op: Equal, ref name, ref value } => { + write!(self.wtr, "{{{}={}}}", name, value) + } + NamedValue { op: Colon, ref name, ref value } => { + write!(self.wtr, "{{{}:{}}}", name, value) + } + NamedValue { op: NotEqual, ref name, ref value } => { + write!(self.wtr, "{{{}!={}}}", name, value) + } + } + } +} + +#[cfg(test)] +mod tests { + use ast::parse::ParserBuilder; + use super::Printer; + + fn roundtrip(given: &str) { + roundtrip_with(|b| b, given); + } + + fn roundtrip_with(mut f: F, given: &str) + where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let ast = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&ast, &mut dst).unwrap(); + assert_eq!(given, dst); + } + + #[test] + fn scratch() { + roundtrip("."); + } + + #[test] + fn print_literal() { + roundtrip("a"); + roundtrip(r"\["); + roundtrip_with(|b| b.octal(true), r"\141"); + roundtrip(r"\x61"); + roundtrip(r"\x7F"); + roundtrip(r"\u0061"); + roundtrip(r"\U00000061"); + roundtrip(r"\x{61}"); + roundtrip(r"\x{7F}"); + roundtrip(r"\u{61}"); + roundtrip(r"\U{61}"); + + roundtrip(r"\a"); + roundtrip(r"\f"); + roundtrip(r"\t"); + roundtrip(r"\n"); + roundtrip(r"\r"); + roundtrip(r"\v"); + roundtrip(r"(?x)\ "); + } + + 
#[test] + fn print_dot() { + roundtrip("."); + } + + #[test] + fn print_concat() { + roundtrip("ab"); + roundtrip("abcde"); + roundtrip("a(bcd)ef"); + } + + #[test] + fn print_alternation() { + roundtrip("a|b"); + roundtrip("a|b|c|d|e"); + roundtrip("|a|b|c|d|e"); + roundtrip("|a|b|c|d|e|"); + roundtrip("a(b|c|d)|e|f"); + } + + #[test] + fn print_assertion() { + roundtrip(r"^"); + roundtrip(r"$"); + roundtrip(r"\A"); + roundtrip(r"\z"); + roundtrip(r"\b"); + roundtrip(r"\B"); + } + + #[test] + fn print_repetition() { + roundtrip("a?"); + roundtrip("a??"); + roundtrip("a*"); + roundtrip("a*?"); + roundtrip("a+"); + roundtrip("a+?"); + roundtrip("a{5}"); + roundtrip("a{5}?"); + roundtrip("a{5,}"); + roundtrip("a{5,}?"); + roundtrip("a{5,10}"); + roundtrip("a{5,10}?"); + } + + #[test] + fn print_flags() { + roundtrip("(?i)"); + roundtrip("(?-i)"); + roundtrip("(?s-i)"); + roundtrip("(?-si)"); + roundtrip("(?siUmux)"); + } + + #[test] + fn print_group() { + roundtrip("(?i:a)"); + roundtrip("(?Pa)"); + roundtrip("(a)"); + } + + #[test] + fn print_class() { + roundtrip(r"[abc]"); + roundtrip(r"[a-z]"); + roundtrip(r"[^a-z]"); + roundtrip(r"[a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[a-z0-9---]"); + roundtrip(r"[a-z&&m-n]"); + roundtrip(r"[[a-z&&m-n]]"); + roundtrip(r"[a-z--m-n]"); + roundtrip(r"[a-z~~m-n]"); + roundtrip(r"[a-z[0-9]]"); + roundtrip(r"[a-z[^0-9]]"); + + roundtrip(r"\d"); + roundtrip(r"\D"); + roundtrip(r"\s"); + roundtrip(r"\S"); + roundtrip(r"\w"); + roundtrip(r"\W"); + + roundtrip(r"[[:alnum:]]"); + roundtrip(r"[[:^alnum:]]"); + roundtrip(r"[[:alpha:]]"); + roundtrip(r"[[:^alpha:]]"); + roundtrip(r"[[:ascii:]]"); + roundtrip(r"[[:^ascii:]]"); + roundtrip(r"[[:blank:]]"); + roundtrip(r"[[:^blank:]]"); + roundtrip(r"[[:cntrl:]]"); + roundtrip(r"[[:^cntrl:]]"); + roundtrip(r"[[:digit:]]"); + roundtrip(r"[[:^digit:]]"); + roundtrip(r"[[:graph:]]"); + roundtrip(r"[[:^graph:]]"); + roundtrip(r"[[:lower:]]"); + roundtrip(r"[[:^lower:]]"); + roundtrip(r"[[:print:]]"); + roundtrip(r"[[:^print:]]"); + roundtrip(r"[[:punct:]]"); + roundtrip(r"[[:^punct:]]"); + roundtrip(r"[[:space:]]"); + roundtrip(r"[[:^space:]]"); + roundtrip(r"[[:upper:]]"); + roundtrip(r"[[:^upper:]]"); + roundtrip(r"[[:word:]]"); + roundtrip(r"[[:^word:]]"); + roundtrip(r"[[:xdigit:]]"); + roundtrip(r"[[:^xdigit:]]"); + + roundtrip(r"\pL"); + roundtrip(r"\PL"); + roundtrip(r"\p{L}"); + roundtrip(r"\P{L}"); + roundtrip(r"\p{X=Y}"); + roundtrip(r"\P{X=Y}"); + roundtrip(r"\p{X:Y}"); + roundtrip(r"\P{X:Y}"); + roundtrip(r"\p{X!=Y}"); + roundtrip(r"\P{X!=Y}"); + } +} diff --git a/regex-syntax-2/src/ast/visitor.rs b/regex-syntax-2/src/ast/visitor.rs new file mode 100644 index 0000000000..268ac45f1b --- /dev/null +++ b/regex-syntax-2/src/ast/visitor.rs @@ -0,0 +1,557 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::fmt; + +use ast::{self, Ast}; + +/// A trait for visiting an abstract syntax tree (AST) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on an abstract syntax tree without necessarily using recursion. 
+/// In particular, this permits callers to do case analysis with constant stack +/// usage, which can be important since the size of an abstract syntax tree +/// may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +/// +/// Note that the abstract syntax tree for a regular expression is quite +/// complex. Unless you specifically need it, you might be able to use the +/// much simpler +/// [high-level intermediate representation](../hir/struct.Hir.html) +/// and its +/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) +/// instead. +pub trait Visitor { + /// The result of visiting an AST. + type Output; + /// An error that visiting an AST might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the AST or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the AST. + fn start(&mut self) {} + + /// This method is called on an `Ast` before descending into child `Ast` + /// nodes. + fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Ast` after descending all of its child + /// `Ast` nodes. + fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an + /// [`Alternation`](struct.Alternation.html). + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// before descending into child nodes. + fn visit_class_set_item_pre( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// after descending into child nodes. + fn visit_class_set_item_post( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// before descending into child nodes. + fn visit_class_set_binary_op_pre( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// after descending into child nodes. + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between the left hand and right hand child nodes + /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + fn visit_class_set_binary_op_in( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Ast` while calling the +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Ast` without using a stack size proportional to the depth +/// of the `Ast`. Namely, this method will instead use constant stack size, but +/// will use heap space proportional to the size of the `Ast`. This may be +/// desirable in cases where the size of `Ast` is proportional to end user +/// input. 
+/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(ast: &Ast, visitor: V) -> Result { + HeapVisitor::new().visit(ast, visitor) +} + +/// HeapVisitor visits every item in an `Ast` recursively using constant stack +/// size and a heap size proportional to the size of the `Ast`. +struct HeapVisitor<'a> { + /// A stack of `Ast` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Ast, Frame<'a>)>, + /// Similar to the `Ast` stack above, but is used only for character + /// classes. In particular, character classes embed their own mini + /// recursive syntax. + stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Ast`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a ast::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a ast::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, +} + +/// Represents a single stack frame while performing structural induction over +/// a character class. +enum ClassFrame<'a> { + /// The stack frame used while visiting every child node of a union of + /// character class items. + Union { + /// The child node we are currently visiting. + head: &'a ast::ClassSetItem, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [ast::ClassSetItem], + }, + /// The stack frame used while a binary class operation. + Binary { + op: &'a ast::ClassSetBinaryOp, + }, + /// A stack frame allocated just before descending into a binary operator's + /// left hand child node. + BinaryLHS { + op: &'a ast::ClassSetBinaryOp, + lhs: &'a ast::ClassSet, + rhs: &'a ast::ClassSet, + }, + /// A stack frame allocated just before descending into a binary operator's + /// right hand child node. + BinaryRHS { + op: &'a ast::ClassSetBinaryOp, + rhs: &'a ast::ClassSet, + }, +} + +/// A representation of the inductive step when performing structural induction +/// over a character class. +/// +/// Note that there is no analogous explicit type for the inductive step for +/// `Ast` nodes because the inductive step is just an `Ast`. For character +/// classes, the inductive step can produce one of two possible child nodes: +/// an item or a binary operation. (An item cannot be a binary operation +/// because that would imply binary operations can be unioned in the concrete +/// syntax, which is not possible.) 
+enum ClassInduct<'a> { + Item(&'a ast::ClassSetItem), + BinaryOp(&'a ast::ClassSetBinaryOp), +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![], stack_class: vec![] } + } + + fn visit( + &mut self, + mut ast: &'a Ast, + mut visitor: V, + ) -> Result { + self.stack.clear(); + self.stack_class.clear(); + + visitor.start(); + loop { + try!(visitor.visit_pre(ast)); + if let Some(x) = try!(self.induct(ast, &mut visitor)) { + let child = x.child(); + self.stack.push((ast, x)); + ast = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + try!(visitor.visit_post(ast)); + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation {..} = x { + try!(visitor.visit_alternation_in()); + } + ast = x.child(); + self.stack.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this AST, so we can post visit it now. + try!(visitor.visit_post(post_ast)); + } + } + } + + /// Build a stack frame for the given AST if one is needed (which occurs if + /// and only if there are child nodes in the AST). Otherwise, return None. + /// + /// If this visits a class, then the underlying visitor implementation may + /// return an error which will be passed on here. + fn induct( + &mut self, + ast: &'a Ast, + visitor: &mut V, + ) -> Result>, V::Err> { + Ok(match *ast { + Ast::Class(ast::Class::Bracketed(ref x)) => { + try!(self.visit_class(x, visitor)); + None + } + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { + Some(Frame::Concat { + head: &x.asts[0], + tail: &x.asts[1..], + }) + } + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => { + Some(Frame::Alternation { + head: &x.asts[0], + tail: &x.asts[1..], + }) + } + _ => None, + }) + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { + head: &tail[0], + tail: &tail[1..], + }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } + + fn visit_class( + &mut self, + ast: &'a ast::ClassBracketed, + visitor: &mut V, + ) -> Result<(), V::Err> { + let mut ast = ClassInduct::from_bracketed(ast); + loop { + try!(self.visit_class_pre(&ast, visitor)); + if let Some(x) = self.induct_class(&ast) { + let child = x.child(); + self.stack_class.push((ast, x)); + ast = child; + continue; + } + try!(self.visit_class_post(&ast, visitor)); + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. 
+ loop { + let (post_ast, frame) = match self.stack_class.pop() { + None => return Ok(()), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a union or a binary op, then we might have + // additional inductive steps to process. + if let Some(x) = self.pop_class(frame) { + if let ClassFrame::BinaryRHS { ref op, .. } = x { + try!(visitor.visit_class_set_binary_op_in(op)); + } + ast = x.child(); + self.stack_class.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this class node, so we can post visit it now. + try!(self.visit_class_post(&post_ast, visitor)); + } + } + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_pre( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + try!(visitor.visit_class_set_item_pre(item)); + } + ClassInduct::BinaryOp(op) => { + try!(visitor.visit_class_set_binary_op_pre(op)); + } + } + Ok(()) + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_post( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + try!(visitor.visit_class_set_item_post(item)); + } + ClassInduct::BinaryOp(op) => { + try!(visitor.visit_class_set_binary_op_post(op)); + } + } + Ok(()) + } + + /// Build a stack frame for the given class node if one is needed (which + /// occurs if and only if there are child nodes). Otherwise, return None. + fn induct_class( + &self, + ast: &ClassInduct<'a>, + ) -> Option> { + match *ast { + ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { + match x.kind { + ast::ClassSet::Item(ref item) => { + Some(ClassFrame::Union { + head: item, + tail: &[], + }) + } + ast::ClassSet::BinaryOp(ref op) => { + Some(ClassFrame::Binary { op: op }) + } + } + } + ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { + if x.items.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &x.items[0], + tail: &x.items[1..], + }) + } + } + ClassInduct::BinaryOp(op) => { + Some(ClassFrame::BinaryLHS { + op: op, + lhs: &op.lhs, + rhs: &op.rhs, + }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop_class(&self, induct: ClassFrame<'a>) -> Option> { + match induct { + ClassFrame::Union { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &tail[0], + tail: &tail[1..], + }) + } + } + ClassFrame::Binary {..} => None, + ClassFrame::BinaryLHS { op, rhs, .. } => { + Some(ClassFrame::BinaryRHS { + op: op, + rhs: rhs, + }) + } + ClassFrame::BinaryRHS {..} => None, + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child AST node to visit. + fn child(&self) -> &'a Ast { + match *self { + Frame::Repetition(rep) => &rep.ast, + Frame::Group(group) => &group.ast, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} + +impl<'a> ClassFrame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child class node to visit. + fn child(&self) -> ClassInduct<'a> { + match *self { + ClassFrame::Union { head, .. } => ClassInduct::Item(head), + ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), + ClassFrame::BinaryLHS { ref lhs, .. } => { + ClassInduct::from_set(lhs) + } + ClassFrame::BinaryRHS { ref rhs, .. 
} => { + ClassInduct::from_set(rhs) + } + } + } +} + +impl<'a> ClassInduct<'a> { + fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { + ClassInduct::from_set(&ast.kind) + } + + fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { + match *ast { + ast::ClassSet::Item(ref item) => ClassInduct::Item(item), + ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), + } + } +} + +impl<'a> fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let x = match *self { + ClassFrame::Union{..} => "Union", + ClassFrame::Binary{..} => "Binary", + ClassFrame::BinaryLHS{..} => "BinaryLHS", + ClassFrame::BinaryRHS{..} => "BinaryRHS", + }; + write!(f, "{}", x) + } +} + +impl<'a> fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let x = match *self { + ClassInduct::Item(it) => { + match *it { + ast::ClassSetItem::Empty(_) => "Item(Empty)", + ast::ClassSetItem::Literal(_) => "Item(Literal)", + ast::ClassSetItem::Range(_) => "Item(Range)", + ast::ClassSetItem::Ascii(_) => "Item(Ascii)", + ast::ClassSetItem::Perl(_) => "Item(Perl)", + ast::ClassSetItem::Unicode(_) => "Item(Unicode)", + ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", + ast::ClassSetItem::Union(_) => "Item(Union)", + } + } + ClassInduct::BinaryOp(it) => { + match it.kind { + ast::ClassSetBinaryOpKind::Intersection => { + "BinaryOp(Intersection)" + } + ast::ClassSetBinaryOpKind::Difference => { + "BinaryOp(Difference)" + } + ast::ClassSetBinaryOpKind::SymmetricDifference => { + "BinaryOp(SymmetricDifference)" + } + } + } + }; + write!(f, "{}", x) + } +} diff --git a/regex-syntax-2/src/either.rs b/regex-syntax-2/src/either.rs new file mode 100644 index 0000000000..7ae41e4ced --- /dev/null +++ b/regex-syntax-2/src/either.rs @@ -0,0 +1,8 @@ +/// A simple binary sum type. +/// +/// This is occasionally useful in an ad hoc fashion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Either { + Left(Left), + Right(Right), +} diff --git a/regex-syntax-2/src/error.rs b/regex-syntax-2/src/error.rs new file mode 100644 index 0000000000..53f1231d8e --- /dev/null +++ b/regex-syntax-2/src/error.rs @@ -0,0 +1,278 @@ +use std::cmp; +use std::error; +use std::fmt; +use std::result; + +use ast; +use hir; + +/// A type alias for dealing with errors returned by this crate. +pub type Result = result::Result; + +/// This error type encompasses any error that can be returned by this crate. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + /// An error that occurred while translating concrete syntax into abstract + /// syntax (AST). + Parse(ast::Error), + /// An error that occurred while translating abstract syntax into a high + /// level intermediate representation (HIR). + Translate(hir::Error), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) 
+ #[doc(hidden)] + __Nonexhaustive, +} + +impl From for Error { + fn from(err: ast::Error) -> Error { + Error::Parse(err) + } +} + +impl From for Error { + fn from(err: hir::Error) -> Error { + Error::Translate(err) + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Parse(ref x) => x.description(), + Error::Translate(ref x) => x.description(), + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Parse(ref x) => x.fmt(f), + Error::Translate(ref x) => x.fmt(f), + _ => unreachable!(), + } + } +} + +/// A helper type for formatting nice error messages. +/// +/// This type is responsible for reporting regex parse errors in a nice human +/// readable format. Most of its complexity is from interspersing notational +/// markers pointing out the position where an error occurred. +#[derive(Debug)] +pub struct Formatter<'e, E: 'e> { + /// The original regex pattern in which the error occurred. + pattern: &'e str, + /// The error kind. It must impl fmt::Display. + err: &'e E, + /// The primary span of the error. + span: &'e ast::Span, + /// An auxiliary and optional span, in case the error needs to point to + /// two locations (e.g., when reporting a duplicate capture group name). + aux_span: Option<&'e ast::Span>, +} + +impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { + fn from(err: &'e ast::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: err.auxiliary_span(), + } + } +} + +impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { + fn from(err: &'e hir::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: None, + } + } +} + +impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let spans = Spans::from_formatter(self); + if self.pattern.contains('\n') { + let divider = repeat_char('~', 79); + + try!(writeln!(f, "regex parse error:")); + try!(writeln!(f, "{}", divider)); + let notated = spans.notate(); + try!(write!(f, "{}", notated)); + try!(writeln!(f, "{}", divider)); + // If we have error spans that cover multiple lines, then we just + // note the line numbers. + if !spans.multi_line.is_empty() { + let mut notes = vec![]; + for span in &spans.multi_line { + notes.push(format!( + "on line {} (column {}) through line {} (column {})", + span.start.line, span.start.column, + span.end.line, span.end.column - 1)); + } + try!(writeln!(f, "{}", notes.join("\n"))); + } + try!(write!(f, "error: {}", self.err)); + } else { + try!(writeln!(f, "regex parse error:")); + let notated = Spans::from_formatter(self).notate(); + try!(write!(f, "{}", notated)); + try!(write!(f, "error: {}", self.err)); + } + Ok(()) + } +} + +/// This type represents an arbitrary number of error spans in a way that makes +/// it convenient to notate the regex pattern. ("Notate" means "point out +/// exactly where the error occurred in the regex pattern.") +/// +/// Technically, we can only ever have two spans given our current error +/// structure. However, after toiling with a specific algorithm for handling +/// two spans, it became obvious that an algorithm to handle an arbitrary +/// number of spans was actually much simpler. +struct Spans<'p> { + /// The original regex pattern string. + pattern: &'p str, + /// The total width that should be used for line numbers. 
The width is + /// used for left padding the line numbers for alignment. + /// + /// A value of `0` means line numbers should not be displayed. That is, + /// the pattern is itself only one line. + line_number_width: usize, + /// All error spans that occur on a single line. This sequence always has + /// length equivalent to the number of lines in `pattern`, where the index + /// of the sequence represents a line number, starting at `0`. The spans + /// in each line are sorted in ascending order. + by_line: Vec>, + /// All error spans that occur over one or more lines. That is, the start + /// and end position of the span have different line numbers. The spans are + /// sorted in ascending order. + multi_line: Vec, +} + +impl<'p> Spans<'p> { + /// Build a sequence of spans from a formatter. + fn from_formatter<'e, E: fmt::Display>( + fmter: &'p Formatter<'e, E>, + ) -> Spans<'p> { + let line_count = fmter.pattern.lines().count(); + let line_number_width = + if line_count <= 1 { + 0 + } else { + line_count.to_string().len() + }; + let mut spans = Spans { + pattern: &fmter.pattern, + line_number_width: line_number_width, + by_line: vec![vec![]; line_count], + multi_line: vec![], + }; + spans.add(fmter.span.clone()); + if let Some(span) = fmter.aux_span { + spans.add(span.clone()); + } + spans + } + + /// Add the given span to this sequence, putting it in the right place. + fn add(&mut self, span: ast::Span) { + // This is grossly inefficient since we sort after each add, but right + // now, we only ever add two spans at most. + if span.is_one_line() { + let i = span.start.line - 1; // because lines are 1-indexed + self.by_line[i].push(span); + self.by_line[i].sort(); + } else { + self.multi_line.push(span); + self.multi_line.sort(); + } + } + + /// Notate the pattern string with carents (`^`) pointing at each span + /// location. This only applies to spans that occur within a single line. + fn notate(&self) -> String { + let mut notated = String::new(); + for (i, line) in self.pattern.lines().enumerate() { + if self.line_number_width > 0 { + notated.push_str(&self.left_pad_line_number(i + 1)); + notated.push_str(": "); + } else { + notated.push_str(" "); + } + notated.push_str(line); + notated.push('\n'); + if let Some(notes) = self.notate_line(i) { + notated.push_str(¬es); + notated.push('\n'); + } + } + notated + } + + /// Return notes for the line indexed at `i` (zero-based). If there are no + /// spans for the given line, then `None` is returned. Otherwise, an + /// appropriately space padded string with correctly positioned `^` is + /// returned, accounting for line numbers. + fn notate_line(&self, i: usize) -> Option { + let spans = &self.by_line[i]; + if spans.is_empty() { + return None; + } + let mut notes = String::new(); + for _ in 0..self.line_number_padding() { + notes.push(' '); + } + let mut pos = 0; + for span in spans { + for _ in pos..(span.start.column - 1) { + notes.push(' '); + pos += 1; + } + let note_len = span.end.column.saturating_sub(span.start.column); + for _ in 0..cmp::max(1, note_len) { + notes.push('^'); + pos += 1; + } + } + Some(notes) + } + + /// Left pad the given line number with spaces such that it is aligned with + /// other line numbers. + fn left_pad_line_number(&self, n: usize) -> String { + let n = n.to_string(); + let pad = self.line_number_width.checked_sub(n.len()).unwrap(); + let mut result = repeat_char(' ', pad); + result.push_str(&n); + result + } + + /// Return the line number padding beginning at the start of each line of + /// the pattern. 
+ /// + /// If the pattern is only one line, then this returns a fixed padding + /// for visual indentation. + fn line_number_padding(&self) -> usize { + if self.line_number_width == 0 { + 4 + } else { + 2 + self.line_number_width + } + } +} + +fn repeat_char(c: char, count: usize) -> String { + ::std::iter::repeat(c).take(count).collect() +} diff --git a/regex-syntax-2/src/hir/interval.rs b/regex-syntax-2/src/hir/interval.rs new file mode 100644 index 0000000000..a7e70ef596 --- /dev/null +++ b/regex-syntax-2/src/hir/interval.rs @@ -0,0 +1,490 @@ +use std::char; +use std::cmp; +use std::fmt::Debug; +use std::slice; +use std::u8; + +// This module contains an *internal* implementation of interval sets. +// +// The primary invariant that interval sets guards is canonical ordering. That +// is, every interval set contains an ordered sequence of intervals where +// no two intervals are overlapping or adjacent. While this invariant is +// ocassionally broken within the implementation, it should be impossible for +// callers to observe it. +// +// Since case folding (as implemented below) breaks that invariant, we roll +// that into this API even though it is a little out of place in an otherwise +// generic interval set. +// +// Some of the implementation complexity here is a result of me wanting to +// preserve the sequential representation without using additional memory. +// In many cases, we do use linear extra memory, but it is at most 2x and it +// is amortized. If we relaxed the memory requirements, this implementation +// could become much simpler. The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. (In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct IntervalSet { + ranges: Vec, +} + +impl IntervalSet { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new>(intervals: T) -> IntervalSet { + let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. 
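// A hedged usage sketch, not part of this patch: the same operation is
// surfaced through the public character class types in hir/mod.rs, assuming
// a `ClassUnicode::case_fold_simple` method that forwards to the code below.
//
//     use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange};
//
//     let mut cls = ClassUnicode::new(vec![ClassUnicodeRange::new('a', 'z')]);
//     cls.case_fold_simple();
//     // Besides a-z, the class now also contains A-Z (plus U+017F and
//     // U+212A, the simple case folding partners of 's' and 'k').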
+ pub fn case_fold_simple(&mut self) { + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + range.case_fold_simple(&mut self.ranges); + } + self.canonicalize(); + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet) { + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = (0..drain_end).into_iter(); + let mut itb = (0..other.ranges.len()).into_iter(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. + // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: + while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. 
+ // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. + let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? 
I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. +#[derive(Debug)] +pub struct IntervalSetIter<'a, I: 'a>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple(&self, intervals: &mut Vec); + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. + fn intersect(&self, other: &Self) -> Option { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. + fn difference(&self, other: &Self) -> (Option, Option) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. 
This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option, Option) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord { + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { u8::MIN } + fn max_value() -> Self { u8::MAX } + fn as_u32(self) -> u32 { self as u32 } + fn increment(self) -> Self { self.checked_add(1).unwrap() } + fn decrement(self) -> Self { self.checked_sub(1).unwrap() } +} + +impl Bound for char { + fn min_value() -> Self { '\x00' } + fn max_value() -> Self { '\u{10FFFF}' } + fn as_u32(self) -> u32 { self as u32 } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. diff --git a/regex-syntax-2/src/hir/literal/mod.rs b/regex-syntax-2/src/hir/literal/mod.rs new file mode 100644 index 0000000000..3113ec970f --- /dev/null +++ b/regex-syntax-2/src/hir/literal/mod.rs @@ -0,0 +1,1553 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Provides routines for extracting literal prefixes and suffixes from an `Hir`. +*/ + +use std::cmp; +use std::fmt; +use std::iter; +use std::mem; +use std::ops; + +use hir::{self, Hir, HirKind}; +use unicode; + +/// A set of literal byte strings extracted from a regular expression. +/// +/// Every member of the set is a `Literal`, which is represented by a +/// `Vec`. 
(Notably, it may contain invalid UTF-8.) Every member is +/// said to be either *complete* or *cut*. A complete literal means that +/// it extends until the beginning (or end) of the regular expression. In +/// some circumstances, this can be used to indicate a match in the regular +/// expression. +/// +/// A key aspect of literal extraction is knowing when to stop. It is not +/// feasible to blindly extract all literals from a regular expression, even if +/// there are finitely many. For example, the regular expression `[0-9]{10}` +/// has `10^10` distinct literals. For this reason, literal extraction is +/// bounded to some low number by default using heuristics, but the limits can +/// be tweaked. +/// +/// **WARNING**: Literal extraction uses stack space proportional to the size +/// of the `Hir` expression. At some point, this drawback will be eliminated. +/// To protect yourself, set a reasonable +/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). +/// This is done for you by default. +#[derive(Clone, Eq, PartialEq)] +pub struct Literals { + lits: Vec, + limit_size: usize, + limit_class: usize, +} + +/// A single member of a set of literals extracted from a regular expression. +/// +/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice +/// and `Vec` operations are available. +#[derive(Clone, Eq, Ord)] +pub struct Literal { + v: Vec, + cut: bool, +} + +impl Literals { + /// Returns a new empty set of literals using default limits. + pub fn empty() -> Literals { + Literals { + lits: vec![], + limit_size: 250, + limit_class: 10, + } + } + + /// Returns a set of literal prefixes extracted from the given `Hir`. + pub fn prefixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_prefixes(expr); + lits + } + + /// Returns a set of literal suffixes extracted from the given `Hir`. + pub fn suffixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_suffixes(expr); + lits + } + + /// Get the approximate size limit (in bytes) of this set. + pub fn limit_size(&self) -> usize { + self.limit_size + } + + /// Set the approximate size limit (in bytes) of this set. + /// + /// If extracting a literal would put the set over this limit, then + /// extraction stops. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { + self.limit_size = size; + self + } + + /// Get the character class size limit for this set. + pub fn limit_class(&self) -> usize { + self.limit_class + } + + /// Limits the size of character(or byte) classes considered. + /// + /// A value of `0` prevents all character classes from being considered. + /// + /// This limit also applies to case insensitive literals, since each + /// character in the case insensitive literal is converted to a class, and + /// then case folded. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { + self.limit_class = size; + self + } + + /// Returns the set of literals as a slice. Its order is unspecified. + pub fn literals(&self) -> &[Literal] { + &self.lits + } + + /// Returns the length of the smallest literal. + /// + /// Returns None is there are no literals in the set. 
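// A hedged usage sketch, not part of this patch, tying the getters below to
// the extraction entry points defined earlier in this file (paths assume the
// crate is used as `regex_syntax`):
//
//     use regex_syntax::ParserBuilder;
//     use regex_syntax::hir::literal::Literals;
//
//     let hir = ParserBuilder::new().build().parse("foo|barbaz").unwrap();
//     let lits = Literals::prefixes(&hir);
//     // Both alternates yield complete prefixes; the shortest is "foo".
//     assert_eq!(Some(3), lits.min_len());
//     assert!(lits.all_complete());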
+ pub fn min_len(&self) -> Option { + let mut min = None; + for lit in &self.lits { + match min { + None => min = Some(lit.len()), + Some(m) if lit.len() < m => min = Some(lit.len()), + _ => {} + } + } + min + } + + /// Returns true if all members in this set are complete. + pub fn all_complete(&self) -> bool { + !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) + } + + /// Returns true if any member in this set is complete. + pub fn any_complete(&self) -> bool { + self.lits.iter().any(|lit| !lit.is_cut()) + } + + /// Returns true if this set contains an empty literal. + pub fn contains_empty(&self) -> bool { + self.lits.iter().any(|lit| lit.is_empty()) + } + + /// Returns true if this set is empty or if all of its members is empty. + pub fn is_empty(&self) -> bool { + self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) + } + + /// Returns a new empty set of literals using this set's limits. + pub fn to_empty(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.limit_size) + .set_limit_class(self.limit_class); + lits + } + + /// Returns the longest common prefix of all members in this set. + pub fn longest_common_prefix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter() + .zip(lit0) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][..len] + } + + /// Returns the longest common suffix of all members in this set. + pub fn longest_common_suffix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter() + .rev() + .zip(lit0.iter().rev()) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][self.lits[0].len() - len..] + } + + /// Returns a new set of literals with the given number of bytes trimmed + /// from the suffix of each literal. + /// + /// If any literal would be cut out completely by trimming, then None is + /// returned. + /// + /// Any duplicates that are created as a result of this transformation are + /// removed. + pub fn trim_suffix(&self, num_bytes: usize) -> Option { + if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { + return None; + } + let mut new = self.to_empty(); + for mut lit in self.lits.iter().cloned() { + let new_len = lit.len() - num_bytes; + lit.truncate(new_len); + lit.cut(); + new.lits.push(lit); + } + new.lits.sort(); + new.lits.dedup(); + Some(new) + } + + /// Returns a new set of prefixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same starting position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_prefixes(&self) -> Literals { + if self.lits.is_empty() { + return self.to_empty(); + } + let mut old: Vec = self.lits.iter().cloned().collect(); + let mut new = self.to_empty(); + 'OUTER: + while let Some(mut candidate) = old.pop() { + if candidate.is_empty() { + continue; + } + if new.lits.is_empty() { + new.lits.push(candidate); + continue; + } + for lit2 in &mut new.lits { + if lit2.is_empty() { + continue; + } + if &candidate == lit2 { + // If the literal is already in the set, then we can + // just drop it. 
But make sure that cut literals are + // infectious! + candidate.cut = candidate.cut || lit2.cut; + lit2.cut = candidate.cut; + continue 'OUTER; + } + if candidate.len() < lit2.len() { + if let Some(i) = position(&candidate, &lit2) { + candidate.cut(); + let mut lit3 = lit2.clone(); + lit3.truncate(i); + lit3.cut(); + old.push(lit3); + lit2.clear(); + } + } else { + if let Some(i) = position(&lit2, &candidate) { + lit2.cut(); + let mut new_candidate = candidate.clone(); + new_candidate.truncate(i); + new_candidate.cut(); + old.push(new_candidate); + candidate.clear(); + } + } + // Oops, the candidate is already represented in the set. + if candidate.is_empty() { + continue 'OUTER; + } + } + new.lits.push(candidate); + } + new.lits.retain(|lit| !lit.is_empty()); + new.lits.sort(); + new.lits.dedup(); + new + } + + /// Returns a new set of suffixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same ending position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_suffixes(&self) -> Literals { + // This is a touch wasteful... + let mut lits = self.clone(); + lits.reverse(); + let mut unamb = lits.unambiguous_prefixes(); + unamb.reverse(); + unamb + } + + /// Unions the prefixes from the given expression to this set. + /// + /// If prefixes could not be added (for example, this set would exceed its + /// size limits or the set of prefixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the beginning of `expr` to the + /// end of `expr`. + pub fn union_prefixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + prefixes(expr, &mut lits); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions the suffixes from the given expression to this set. + /// + /// If suffixes could not be added (for example, this set would exceed its + /// size limits or the set of suffixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the end of `expr` to the + /// beginning of `expr`. + pub fn union_suffixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + suffixes(expr, &mut lits); + lits.reverse(); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions this set with another set. + /// + /// If the union would cause the set to exceed its limits, then the union + /// is skipped and it returns false. Otherwise, if the union succeeds, it + /// returns true. + pub fn union(&mut self, lits: Literals) -> bool { + if self.num_bytes() + lits.num_bytes() > self.limit_size { + return false; + } + if lits.is_empty() { + self.lits.push(Literal::empty()); + } else { + self.lits.extend(lits.lits); + } + true + } + + /// Extends this set with another set. + /// + /// The set of literals is extended via a cross product. + /// + /// If a cross product would cause this set to exceed its limits, then the + /// cross product is skipped and it returns false. Otherwise, if the cross + /// product succeeds, it returns true. 
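// A hedged worked example, not part of this patch: extending the set
// {"a", "b"} (both complete) with {"x", "y"} replaces it with the cross
// product {"ax", "bx", "ay", "by"}, provided the resulting byte count stays
// within `limit_size`. Cut members are left alone, since they can never be
// extended.
//
//     let mut set = Literals::empty();
//     set.add(Literal::new(b"a".to_vec()));
//     set.add(Literal::new(b"b".to_vec()));
//
//     let mut other = Literals::empty();
//     other.add(Literal::new(b"x".to_vec()));
//     other.add(Literal::new(b"y".to_vec()));
//
//     assert!(set.cross_product(&other));
//     assert_eq!(4, set.literals().len());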
+ pub fn cross_product(&mut self, lits: &Literals) -> bool { + if lits.is_empty() { + return true; + } + // Check that we make sure we stay in our limits. + let mut size_after; + if self.is_empty() || !self.any_complete() { + size_after = self.num_bytes(); + for lits_lit in lits.literals() { + size_after += lits_lit.len(); + } + } else { + size_after = self.lits.iter().fold(0, |accum, lit| { + accum + if lit.is_cut() { lit.len() } else { 0 } + }); + for lits_lit in lits.literals() { + for self_lit in self.literals() { + if !self_lit.is_cut() { + size_after += self_lit.len() + lits_lit.len(); + } + } + } + } + if size_after > self.limit_size { + return false; + } + + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for lits_lit in lits.literals() { + for mut self_lit in base.clone() { + self_lit.extend(&**lits_lit); + self_lit.cut = lits_lit.cut; + self.lits.push(self_lit); + } + } + true + } + + /// Extends each literal in this set with the bytes given. + /// + /// If the set is empty, then the given literal is added to the set. + /// + /// If adding any number of bytes to all members of this set causes a limit + /// to be exceeded, then no bytes are added and false is returned. If a + /// prefix of `bytes` can be fit into this set, then it is used and all + /// resulting literals are cut. + pub fn cross_add(&mut self, bytes: &[u8]) -> bool { + // N.B. This could be implemented by simply calling cross_product with + // a literal set containing just `bytes`, but we can be smarter about + // taking shorter prefixes of `bytes` if they'll fit. + if bytes.is_empty() { + return true; + } + if self.lits.is_empty() { + let i = cmp::min(self.limit_size, bytes.len()); + self.lits.push(Literal::new(bytes[..i].to_owned())); + self.lits[0].cut = i < bytes.len(); + return !self.lits[0].is_cut(); + } + let size = self.num_bytes(); + if size + self.lits.len() >= self.limit_size { + return false; + } + let mut i = 1; + while size + (i * self.lits.len()) <= self.limit_size + && i < bytes.len() { + i += 1; + } + for lit in &mut self.lits { + if !lit.is_cut() { + lit.extend(&bytes[..i]); + if i < bytes.len() { + lit.cut(); + } + } + } + true + } + + /// Adds the given literal to this set. + /// + /// Returns false if adding this literal would cause the class to be too + /// big. + pub fn add(&mut self, lit: Literal) -> bool { + if self.num_bytes() + lit.len() > self.limit_size { + return false; + } + self.lits.push(lit); + true + } + + /// Extends each literal in this set with the character class given. + /// + /// Returns false if the character class was too big to add. + pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, false) + } + + /// Extends each literal in this set with the character class given, + /// writing the bytes of each character in reverse. + /// + /// Returns false if the character class was too big to add. 
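// A hedged worked example, not part of this patch: adding the class [1-3] to
// the set {"a"} yields {"a1", "a2", "a3"}, provided the class size (3) stays
// within `limit_class` and the new byte count stays within `limit_size`. The
// reverse variant below does the same but writes each character's UTF-8
// bytes backwards, which is what suffix extraction needs since it builds
// literals back-to-front.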
+ fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, true) + } + + fn _add_char_class( + &mut self, + cls: &hir::ClassUnicode, + reverse: bool, + ) -> bool { + use std::char; + + if self.class_exceeds_limits(cls_char_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for c in (s..e).filter_map(char::from_u32) { + for mut lit in base.clone() { + let mut bytes = c.to_string().into_bytes(); + if reverse { + bytes.reverse(); + } + lit.extend(&bytes); + self.lits.push(lit); + } + } + } + true + } + + /// Extends each literal in this set with the byte class given. + /// + /// Returns false if the byte class was too big to add. + pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { + if self.class_exceeds_limits(cls_byte_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for b in (s..e).map(|b| b as u8) { + for mut lit in base.clone() { + lit.push(b); + self.lits.push(lit); + } + } + } + true + } + + /// Cuts every member of this set. When a member is cut, it can never + /// be extended. + pub fn cut(&mut self) { + for lit in &mut self.lits { + lit.cut(); + } + } + + /// Reverses all members in place. + pub fn reverse(&mut self) { + for lit in &mut self.lits { + lit.reverse(); + } + } + + /// Clears this set of all members. + pub fn clear(&mut self) { + self.lits.clear(); + } + + /// Pops all complete literals out of this set. + fn remove_complete(&mut self) -> Vec { + let mut base = vec![]; + for lit in mem::replace(&mut self.lits, vec![]) { + if lit.is_cut() { + self.lits.push(lit); + } else { + base.push(lit); + } + } + base + } + + /// Returns the total number of bytes in this set. + fn num_bytes(&self) -> usize { + self.lits.iter().fold(0, |accum, lit| accum + lit.len()) + } + + /// Returns true if a character class with the given size would cause this + /// set to exceed its limits. + /// + /// The size given should correspond to the number of items in the class. + fn class_exceeds_limits(&self, size: usize) -> bool { + if size > self.limit_class { + return true; + } + // This is an approximation since codepoints in a char class can encode + // to 1-4 bytes. + let new_byte_count = + if self.lits.is_empty() { + size + } else { + self.lits + .iter() + .fold(0, |accum, lit| { + accum + if lit.is_cut() { + // If the literal is cut, then we'll never add + // anything to it, so don't count it. + 0 + } else { + (lit.len() + 1) * size + } + }) + }; + new_byte_count > self.limit_size + } +} + +fn prefixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + let i = unicode::encode_utf8(c, &mut buf).unwrap(); + lits.cross_add(&buf[..i]); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. 
}) => { + prefixes(&**hir, lits); + } + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => { + (m, Some(m)) + } + hir::RepetitionRange::AtLeast(m) => { + (m, None) + } + hir::RepetitionRange::Bounded(m, n) => { + (m, Some(n)) + } + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, prefixes) + } + } + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es { + if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + prefixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, prefixes); + } + _ => lits.cut(), + } +} + +fn suffixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + let i = unicode::encode_utf8(c, &mut buf).unwrap(); + let mut buf = &mut buf[..i]; + buf.reverse(); + lits.cross_add(buf); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class_reverse(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. }) => { + suffixes(&**hir, lits); + } + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => { + (m, Some(m)) + } + hir::RepetitionRange::AtLeast(m) => { + (m, None) + } + hir::RepetitionRange::Bounded(m, n) => { + (m, Some(n)) + } + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, suffixes) + } + } + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es.iter().rev() { + if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + suffixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. 
+ lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, suffixes); + } + _ => lits.cut(), + } +} + +fn repeat_zero_or_one_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.add(Literal::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_zero_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.cut(); + lits2.add(Literal::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_one_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + f(e, lits); + lits.cut(); +} + +fn repeat_range_literals( + e: &Hir, + min: u32, + max: Option, + greedy: bool, + lits: &mut Literals, + mut f: F, +) { + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + f(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: greedy, + hir: Box::new(e.clone()), + }), lits); + } else { + if min > 0 { + let n = cmp::min(lits.limit_size, min as usize); + let es = iter::repeat(e.clone()).take(n).collect(); + f(&Hir::concat(es), lits); + if n < min as usize || lits.contains_empty() { + lits.cut(); + } + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals( + es: &[Hir], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + if !lits.cross_product(&lits2) { + lits.cut(); + } +} + +impl fmt::Debug for Literals { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Literals") + .field("lits", &self.lits) + .field("limit_size", &self.limit_size) + .field("limit_class", &self.limit_class) + .finish() + } +} + +impl Literal { + /// Returns a new complete literal with the bytes given. + pub fn new(bytes: Vec) -> Literal { + Literal { v: bytes, cut: false } + } + + /// Returns a new complete empty literal. + pub fn empty() -> Literal { + Literal { v: vec![], cut: false } + } + + /// Returns true if this literal was "cut." + pub fn is_cut(&self) -> bool { + self.cut + } + + /// Cuts this literal. 
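// A hedged illustration, not part of this patch: prefix extraction on `a+`
// produces Cut("a") rather than Complete("a"), since a match may continue
// with more `a`s beyond the extracted literal, while `abc` produces
// Complete("abc"). This mirrors the pfx_rep_one_or_more1 and pfx_one_lit2
// test cases further down in this file.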
+ pub fn cut(&mut self) { + self.cut = true; + } +} + +impl PartialEq for Literal { + fn eq(&self, other: &Literal) -> bool { + self.v == other.v + } +} + +impl PartialOrd for Literal { + fn partial_cmp(&self, other: &Literal) -> Option { + self.v.partial_cmp(&other.v) + } +} + +impl fmt::Debug for Literal { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", escape_unicode(&self.v)) + } else { + write!(f, "Complete({})", escape_unicode(&self.v)) + } + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { &self.v } +} + +impl ops::Deref for Literal { + type Target = Vec; + fn deref(&self) -> &Vec { &self.v } +} + +impl ops::DerefMut for Literal { + fn deref_mut(&mut self) -> &mut Vec { &mut self.v } +} + +fn position(needle: &[u8], mut haystack: &[u8]) -> Option { + let mut i = 0; + while haystack.len() >= needle.len() { + if needle == &haystack[..needle.len()] { + return Some(i); + } + i += 1; + haystack = &haystack[1..]; + } + None +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) => escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else { + if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + } + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} + +fn cls_char_count(cls: &hir::ClassUnicode) -> usize { + cls.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + .sum::() as usize +} + +fn cls_byte_count(cls: &hir::ClassBytes) -> usize { + cls.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + .sum::() as usize +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use ParserBuilder; + use hir::Hir; + use super::{Literals, Literal, escape_bytes}; + + // To make test failures easier to read. + #[derive(Debug, Eq, PartialEq)] + struct Bytes(Vec); + #[derive(Debug, Eq, PartialEq)] + struct Unicode(Vec); + + fn escape_lits(blits: &[Literal]) -> Vec { + let mut ulits = vec![]; + for blit in blits { + ulits.push(ULiteral { + v: escape_bytes(&blit), + cut: blit.is_cut(), + }); + } + ulits + } + + fn create_lits>(it: I) -> Literals { + Literals { + lits: it.into_iter().collect(), + limit_size: 0, + limit_class: 0, + } + } + + // Needs to be pub for 1.3? 
+ #[derive(Clone, Eq, PartialEq)] + pub struct ULiteral { + v: String, + cut: bool, + } + + impl ULiteral { + fn is_cut(&self) -> bool { self.cut } + } + + impl fmt::Debug for ULiteral { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", self.v) + } else { + write!(f, "Complete({})", self.v) + } + } + } + + impl PartialEq for ULiteral { + fn eq(&self, other: &Literal) -> bool { + self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() + } + } + + impl PartialEq for Literal { + fn eq(&self, other: &ULiteral) -> bool { + &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() + } + } + + #[allow(non_snake_case)] + fn C(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: true } + } + #[allow(non_snake_case)] + fn M(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: false } + } + + fn prefixes(lits: &mut Literals, expr: &Hir) { + lits.union_prefixes(expr); + } + + fn suffixes(lits: &mut Literals, expr: &Hir) { + lits.union_suffixes(expr); + } + + macro_rules! assert_lit_eq { + ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ + let expected: Vec = vec![$($expected_lit),*]; + let lits = $got_lits; + assert_eq!( + $which(expected.clone()), + $which(escape_lits(lits.literals()))); + assert_eq!( + !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), + lits.all_complete()); + assert_eq!( + expected.iter().any(|l| !l.is_cut()), + lits.any_complete()); + }}; + } + + macro_rules! test_lit { + ($name:ident, $which:ident, $re:expr) => { + test_lit!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // ************************************************************************ + // Tests for prefix literal extraction. + // ************************************************************************ + + // Elementary tests. 
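// A hedged reading guide, not part of this patch: in the cases below,
// M("...") expects a complete literal and C("...") expects a cut one. For
// example, test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")) asserts that
// prefix extraction on `a*b` yields exactly the cut literal "a" and the
// complete literal "b", under both parser configurations (Unicode and
// byte-oriented) exercised by the macro.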
+ test_lit!(pfx_one_lit1, prefixes, "a", M("a")); + test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); + test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); + test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(pfx_class1, prefixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); + test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83")); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", + M("A"), M("a")); + test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc", + M("ABC"), M("aBC"), M("AbC"), M("abC"), + M("ABc"), M("aBc"), M("Abc"), M("abc")); + test_lit!(pfx_group1, prefixes, "(a)", M("a")); + test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); + test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); + test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); + test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); + test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); + test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); + test_lit!(pfx_rep_range1, prefixes, "a{0}"); + test_lit!(pfx_rep_range2, prefixes, "a{0,}"); + test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); + test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); + test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); + test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); + test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. + test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); + test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); + test_lit!(pfx_cat3, prefixes, "(?i)[ab]z", + M("AZ"), M("BZ"), M("aZ"), M("bZ"), + M("Az"), M("Bz"), M("az"), M("bz")); + test_lit!(pfx_cat4, prefixes, "[ab][yz]", + M("ay"), M("by"), M("az"), M("bz")); + test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); + test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); + test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); + test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); + test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); + test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); + test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); + test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); + test_lit!(pfx_cat14, prefixes, "a^", C("a")); + test_lit!(pfx_cat15, prefixes, "$a"); + test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); + test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); + test_lit!(pfx_cat19, prefixes, "a.z", C("a")); + + // Test regexes with alternations. + test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); + test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(pfx_alt4, prefixes, "a|b*"); + test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); + test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); + test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c", + C("a"), C("b"), M("c"), C("a"), C("ab"), M("c")); + test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); + + // Test regexes with empty assertions. 
+ test_lit!(pfx_empty1, prefixes, "^a", M("a")); + test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); + test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + + // Make sure some curious regexes have no prefixes. + test_lit!(pfx_nothing1, prefixes, "."); + test_lit!(pfx_nothing2, prefixes, "(?s)."); + test_lit!(pfx_nothing3, prefixes, "^"); + test_lit!(pfx_nothing4, prefixes, "$"); + test_lit!(pfx_nothing6, prefixes, "(?m)$"); + test_lit!(pfx_nothing7, prefixes, r"\b"); + test_lit!(pfx_nothing8, prefixes, r"\B"); + + // Test a few regexes that defeat any prefix literal detection. + test_lit!(pfx_defeated1, prefixes, ".a"); + test_lit!(pfx_defeated2, prefixes, "(?s).a"); + test_lit!(pfx_defeated3, prefixes, "a*b*c*"); + test_lit!(pfx_defeated4, prefixes, "a|."); + test_lit!(pfx_defeated5, prefixes, ".|a"); + test_lit!(pfx_defeated6, prefixes, "a|^"); + test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); + test_lit!(pfx_defeated8, prefixes, "$a"); + test_lit!(pfx_defeated9, prefixes, "(?m)$a"); + test_lit!(pfx_defeated10, prefixes, r"\ba"); + test_lit!(pfx_defeated11, prefixes, r"\Ba"); + test_lit!(pfx_defeated12, prefixes, "^*a"); + test_lit!(pfx_defeated13, prefixes, "^+a"); + + test_lit!( + pfx_crazy1, + prefixes, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam")); + + // ************************************************************************ + // Tests for quiting prefix literal search. + // ************************************************************************ + + macro_rules! test_exhausted { + ($name:ident, $which:ident, $re:expr) => { + test_exhausted!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); + test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); + test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); + test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar", + C("FO"), C("fO"), C("Fo"), C("fo")); + test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd", + C("ababababab"), M("cd")); + test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd", + C("zababababab"), M("zcd")); + test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for suffix literal extraction. + // ************************************************************************ + + // Elementary tests. 
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a")); + test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); + test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); + test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(sfx_class1, suffixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); + test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83")); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", + M("A"), M("a")); + test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc", + M("ABC"), M("ABc"), M("AbC"), M("Abc"), + M("aBC"), M("aBc"), M("abC"), M("abc")); + test_lit!(sfx_group1, suffixes, "(a)", M("a")); + test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); + test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); + test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); + test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); + test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); + test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); + test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); + test_lit!(sfx_rep_range1, suffixes, "a{0}"); + test_lit!(sfx_rep_range2, suffixes, "a{0,}"); + test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); + test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); + test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); + test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); + test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. + test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); + test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); + test_lit!(sfx_cat3, suffixes, "(?i)[ab]z", + M("AZ"), M("Az"), M("BZ"), M("Bz"), + M("aZ"), M("az"), M("bZ"), M("bz")); + test_lit!(sfx_cat4, suffixes, "[ab][yz]", + M("ay"), M("az"), M("by"), M("bz")); + test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); + test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); + test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); + test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); + test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); + test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); + test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat12, suffixes, "ab+", C("b")); + test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); + test_lit!(sfx_cat14, suffixes, "a^"); + test_lit!(sfx_cat15, suffixes, "$a", C("a")); + test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); + test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); + test_lit!(sfx_cat19, suffixes, "a.z", C("z")); + + // Test regexes with alternations. + test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); + test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(sfx_alt4, suffixes, "a|b*"); + test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); + test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); + test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c", + C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c")); + test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(sfx_empty1, suffixes, "a$", M("a")); + test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); + + // Make sure some curious regexes have no suffixes. 
+ test_lit!(sfx_nothing1, suffixes, "."); + test_lit!(sfx_nothing2, suffixes, "(?s)."); + test_lit!(sfx_nothing3, suffixes, "^"); + test_lit!(sfx_nothing4, suffixes, "$"); + test_lit!(sfx_nothing6, suffixes, "(?m)$"); + test_lit!(sfx_nothing7, suffixes, r"\b"); + test_lit!(sfx_nothing8, suffixes, r"\B"); + + // Test a few regexes that defeat any suffix literal detection. + test_lit!(sfx_defeated1, suffixes, "a."); + test_lit!(sfx_defeated2, suffixes, "(?s)a."); + test_lit!(sfx_defeated3, suffixes, "a*b*c*"); + test_lit!(sfx_defeated4, suffixes, "a|."); + test_lit!(sfx_defeated5, suffixes, ".|a"); + test_lit!(sfx_defeated6, suffixes, "a|^"); + test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); + test_lit!(sfx_defeated8, suffixes, "a^"); + test_lit!(sfx_defeated9, suffixes, "(?m)a$"); + test_lit!(sfx_defeated10, suffixes, r"a\b"); + test_lit!(sfx_defeated11, suffixes, r"a\B"); + test_lit!(sfx_defeated12, suffixes, "a^*"); + test_lit!(sfx_defeated13, suffixes, "a^+"); + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); + test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); + test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); + test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar", + C("AR"), C("Ar"), C("aR"), C("ar")); + test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*", + C("ababababab"), M("cd")); + test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z", + C("abababababz"), M("cdz")); + test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for generating unambiguous literal sets. + // ************************************************************************ + + macro_rules! 
test_unamb { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.unambiguous_prefixes(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); + test_unamb!(unambiguous2, + vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]); + test_unamb!(unambiguous3, + vec![M("Sherlock"), M("Watson")], + vec![M("Sherlock"), M("Watson")]); + test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); + test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); + test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); + test_unamb!(unambiguous9, + vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], + vec![C("a"), C("b"), C("c")]); + test_unamb!(unambiguous10, + vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], + vec![C("Mo"), C("Mu")]); + test_unamb!(unambiguous11, + vec![M("zazb"), M("azb")], vec![C("a"), C("z")]); + test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); + test_unamb!(unambiguous13, + vec![M("ABCX"), M("CDAX"), M("BCX")], + vec![C("A"), C("BCX"), C("CD")]); + test_unamb!(unambiguous14, + vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], + vec![M("DSX"), C("I"), C("MGX"), C("MV")]); + test_unamb!(unambiguous15, + vec![M("IMG_"), M("MG_"), M("CIMG")], + vec![C("C"), C("I"), C("MG_")]); + + + // ************************************************************************ + // Tests for suffix trimming. + // ************************************************************************ + macro_rules! test_trim { + ($name:ident, $trim:expr, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.trim_suffix($trim).unwrap(); + assert_eq!($expected, escape_lits(got.literals())); + } + } + } + + test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); + test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); + test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); + test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); + + // ************************************************************************ + // Tests for longest common prefix. + // ************************************************************************ + + macro_rules! 
test_lcp { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_prefix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcp!(lcp1, vec!["a"], "a"); + test_lcp!(lcp2, vec![], ""); + test_lcp!(lcp3, vec!["a", "b"], ""); + test_lcp!(lcp4, vec!["ab", "ab"], "ab"); + test_lcp!(lcp5, vec!["ab", "a"], "a"); + test_lcp!(lcp6, vec!["a", "ab"], "a"); + test_lcp!(lcp7, vec!["ab", "b"], ""); + test_lcp!(lcp8, vec!["b", "ab"], ""); + test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); + test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); + test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); + test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); + + // ************************************************************************ + // Tests for longest common suffix. + // ************************************************************************ + + macro_rules! test_lcs { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_suffix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcs!(lcs1, vec!["a"], "a"); + test_lcs!(lcs2, vec![], ""); + test_lcs!(lcs3, vec!["a", "b"], ""); + test_lcs!(lcs4, vec!["ab", "ab"], "ab"); + test_lcs!(lcs5, vec!["ab", "a"], ""); + test_lcs!(lcs6, vec!["a", "ab"], ""); + test_lcs!(lcs7, vec!["ab", "b"], "b"); + test_lcs!(lcs8, vec!["b", "ab"], "b"); + test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); + test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); + test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); + test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); +} diff --git a/regex-syntax-2/src/hir/mod.rs b/regex-syntax-2/src/hir/mod.rs new file mode 100644 index 0000000000..d443c538b3 --- /dev/null +++ b/regex-syntax-2/src/hir/mod.rs @@ -0,0 +1,2040 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines a high-level intermediate representation for regular expressions. +*/ +use std::char; +use std::cmp; +use std::error; +use std::fmt; +use std::u8; + +use ast::Span; +use hir::interval::{Interval, IntervalSet, IntervalSetIter}; +use unicode; + +pub use hir::visitor::{Visitor, visit}; + +mod interval; +pub mod literal; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. 
+ /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + InvalidUtf8, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when the translator attempts to construct a character class + /// that is empty. + /// + /// Note that this restriction in the translator may be removed in the + /// future. + EmptyClassNotAllowed, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ErrorKind { + fn description(&self) -> &str { + use self::ErrorKind::*; + match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + EmptyClassNotAllowed => "empty character classes are not allowed", + _ => unreachable!(), + } + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.description()) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// The HIR of a regular expression represents an intermediate step between its +/// abstract syntax (a structured description of the concrete syntax) and +/// compiled byte codes. The purpose of HIR is to make regular expressions +/// easier to analyze. In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating `(?i)A` +/// to `[aA]`). +/// +/// If the HIR was produced by a translator that disallows invalid UTF-8, then +/// the HIR is guaranteed to match UTF-8 exclusively. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. +/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. 
Construction of an HIR expression *must* use the constructor methods +/// on this `Hir` type instead of building the `HirKind` values directly. +/// This permits construction to enforce invariants like "concatenations +/// always consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For +/// example, one such attribute is whether the expression must match at the +/// beginning of the text. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + info: HirInfo, +} + +/// The kind of an arbitrary `Hir` expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A single literal character that matches exactly this character. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + Class(Class), + /// An anchor assertion. An anchor assertion match always has zero length. + Anchor(Anchor), + /// A word boundary assertion, which may or may not be Unicode aware. A + /// word boundary assertion match always has zero length. + WordBoundary(WordBoundary), + /// A repetition operation applied to a child expression. + Repetition(Repetition), + /// A possibly capturing group, which contains a child expression. + Group(Group), + /// A concatenation of expressions. A concatenation always has at least two + /// child expressions. + /// + /// A concatenation matches only if each of its child expression matches + /// one after the other. + Concat(Vec), + /// An alternation of expressions. An alternation always has at least two + /// child expressions. + /// + /// An alternation matches only if at least one of its child expression + /// matches. If multiple expressions match, then the leftmost is preferred. + Alternation(Vec), +} + +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + use std::mem; + mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. + pub fn empty() -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + Hir { + kind: HirKind::Empty, + info: info, + } + } + + /// Creates a literal HIR expression. + /// + /// If the given literal has a `Byte` variant with an ASCII byte, then this + /// method panics. This enforces the invariant that `Byte` variants are + /// only used to express matching of invalid UTF-8. 
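// Illustrative sketch, not part of this patch: how `Hir::literal` and the
// `always_utf8` attribute interact. Meant to live alongside the tests at the
// bottom of this module; the module name is made up for the example.
#[cfg(test)]
mod example_literal {
    use super::*;

    #[test]
    fn byte_literals_are_not_always_utf8() {
        // A Unicode scalar value always produces valid UTF-8.
        assert!(Hir::literal(Literal::Unicode('☃')).is_always_utf8());

        // A raw byte literal is only permitted for non-ASCII bytes (an ASCII
        // byte would panic per the invariant above), and it can match
        // invalid UTF-8.
        assert!(!Hir::literal(Literal::Byte(0xFF)).is_always_utf8());
    }
}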
+ pub fn literal(lit: Literal) -> Hir { + if let Literal::Byte(b) = lit { + assert!(b > 0x7F); + } + + let mut info = HirInfo::new(); + info.set_always_utf8(lit.is_unicode()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + Hir { + kind: HirKind::Literal(lit), + info: info, + } + } + + /// Creates a class HIR expression. + pub fn class(class: Class) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(class.is_always_utf8()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + Hir { + kind: HirKind::Class(class), + info: info, + } + } + + /// Creates an anchor assertion HIR expression. + pub fn anchor(anchor: Anchor) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + if let Anchor::StartText = anchor { + info.set_anchored_start(true); + info.set_any_anchored_start(true); + } + if let Anchor::EndText = anchor { + info.set_anchored_end(true); + info.set_any_anchored_end(true); + } + Hir { + kind: HirKind::Anchor(anchor), + info: info, + } + } + + /// Creates a word boundary assertion HIR expression. + pub fn word_boundary(word_boundary: WordBoundary) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + // A negated word boundary matches the empty string, but a normal + // word boundary does not! + info.set_match_empty(word_boundary.is_negated()); + // ASCII word boundaries can match invalid UTF-8. + if let WordBoundary::Ascii = word_boundary { + info.set_always_utf8(false); + } + if let WordBoundary::AsciiNegate = word_boundary { + info.set_always_utf8(false); + } + Hir { + kind: HirKind::WordBoundary(word_boundary), + info: info, + } + } + + /// Creates a repetition HIR expression. + pub fn repetition(rep: Repetition) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(rep.hir.is_always_utf8()); + info.set_all_assertions(rep.hir.is_all_assertions()); + // If this operator can match the empty string, then it can never + // be anchored. + info.set_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start() + ); + info.set_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end() + ); + info.set_any_anchored_start(rep.hir.is_any_anchored_start()); + info.set_any_anchored_end(rep.hir.is_any_anchored_end()); + info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); + Hir { + kind: HirKind::Repetition(rep), + info: info, + } + } + + /// Creates a group HIR expression. 
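// Illustrative sketch, not part of this patch: attribute propagation in the
// word boundary constructor above. Meant to live alongside the tests at the
// bottom of this module; the module name is made up for the example.
#[cfg(test)]
mod example_word_boundary {
    use super::*;

    #[test]
    fn word_boundary_attributes() {
        // `\B` can match the empty string, while `\b` cannot.
        let negated = Hir::word_boundary(WordBoundary::UnicodeNegate);
        assert!(negated.is_match_empty());
        let normal = Hir::word_boundary(WordBoundary::Unicode);
        assert!(!normal.is_match_empty());
        assert!(normal.is_always_utf8());

        // ASCII word boundaries can match at positions that split a UTF-8
        // encoded codepoint, so they are not guaranteed to be valid UTF-8.
        assert!(!Hir::word_boundary(WordBoundary::Ascii).is_always_utf8());
    }
}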
+ pub fn group(group: Group) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(group.hir.is_always_utf8()); + info.set_all_assertions(group.hir.is_all_assertions()); + info.set_anchored_start(group.hir.is_anchored_start()); + info.set_anchored_end(group.hir.is_anchored_end()); + info.set_any_anchored_start(group.hir.is_any_anchored_start()); + info.set_any_anchored_end(group.hir.is_any_anchored_end()); + info.set_match_empty(group.hir.is_match_empty()); + Hir { + kind: HirKind::Group(group), + info: info, + } + } + + /// Returns the concatenation of the given expressions. + /// + /// This flattens the concatenation as appropriate. + pub fn concat(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + + // Some attributes require analyzing all sub-expressions. + for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = + info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() + || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() && e.is_match_empty(); + info.set_match_empty(x); + } + // Anchored attributes require something slightly more + // sophisticated. Normally, WLOG, to determine whether an + // expression is anchored to the start, we'd only need to check + // the first expression of a concatenation. However, + // expressions like `$\b^` are still anchored to the start, + // but the first expression in the concatenation *isn't* + // anchored to the start. So the "first" expression to look at + // is actually one that is either not an assertion or is + // specifically the StartText assertion. + info.set_anchored_start( + exprs.iter() + .take_while(|e| { + e.is_anchored_start() || e.is_all_assertions() + }) + .any(|e| { + e.is_anchored_start() + })); + // Similarly for the end anchor, but in reverse. + info.set_anchored_end( + exprs.iter() + .rev() + .take_while(|e| { + e.is_anchored_end() || e.is_all_assertions() + }) + .any(|e| { + e.is_anchored_end() + })); + Hir { + kind: HirKind::Concat(exprs), + info: info, + } + } + } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens the alternation as appropriate. + pub fn alternation(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(true); + info.set_anchored_end(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + + // Some attributes require analyzing all sub-expressions. 
+ for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_anchored_start() && e.is_anchored_start(); + info.set_anchored_start(x); + + let x = info.is_anchored_end() && e.is_anchored_end(); + info.set_anchored_end(x); + + let x = + info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() + || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() || e.is_match_empty(); + info.set_match_empty(x); + } + Hir { + kind: HirKind::Alternation(exprs), + info: info, + } + } + } + } + + /// Build an HIR expression for `.`. + /// + /// A `.` expression matches any character except for `\n`. To build an + /// expression that matches any character, including `\n`, use the `any` + /// method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn dot(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Build an HIR expression for `(?s).`. + /// + /// A `(?s).` expression matches any character, including `\n`. To build an + /// expression that matches any character except for `\n`, then use the + /// `dot` method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn any(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Return true if and only if this HIR will always match valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression + /// to match invalid UTF-8. + pub fn is_always_utf8(&self) -> bool { + self.info.is_always_utf8() + } + + /// Returns true if and only if this entire HIR expression is made up of + /// zero-width assertions. + /// + /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but + /// not `^a`. + pub fn is_all_assertions(&self) -> bool { + self.info.is_all_assertions() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, + /// `^foo|^bar` but not `^foo|bar`. + pub fn is_anchored_start(&self) -> bool { + self.info.is_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the end + /// of text. This includes expressions like `foo$`, `(foo|bar)$`, + /// `foo$|bar$` but not `foo$|bar`. + pub fn is_anchored_end(&self) -> bool { + self.info.is_anchored_end() + } + + /// Return true if and only if this HIR contains any sub-expression that + /// is required to match at the beginning of text. Specifically, this + /// returns true if the `^` symbol (when multiline mode is disabled) or the + /// `\A` escape appear anywhere in the regex. 
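// Illustrative sketch, not part of this patch: the anchoring attributes
// computed by the `concat` and `alternation` constructors above. Meant to
// live alongside the tests at the bottom of this module; the module name is
// made up for the example.
#[cfg(test)]
mod example_anchoring {
    use super::*;

    #[test]
    fn concat_and_alternation_anchoring() {
        // Roughly `^a`: anchored at the start, not at the end.
        let expr = Hir::concat(vec![
            Hir::anchor(Anchor::StartText),
            Hir::literal(Literal::Unicode('a')),
        ]);
        assert!(expr.is_anchored_start());
        assert!(!expr.is_anchored_end());

        // Roughly `a|^a`: only one branch is start-anchored, so the
        // alternation as a whole is not, but it does contain a start anchor
        // somewhere.
        let alt = Hir::alternation(vec![
            Hir::literal(Literal::Unicode('a')),
            expr,
        ]);
        assert!(!alt.is_anchored_start());
        assert!(alt.is_any_anchored_start());
    }
}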
+ pub fn is_any_anchored_start(&self) -> bool { + self.info.is_any_anchored_start() + } + + /// Return true if and only if this HIR contains any sub-expression that is + /// required to match at the end of text. Specifically, this returns true + /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape + /// appear anywhere in the regex. + pub fn is_any_anchored_end(&self) -> bool { + self.info.is_any_anchored_end() + } + + /// Return true if and only if the empty string is part of the language + /// matched by this regular expression. + /// + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, + /// but not `a`, `a+` or `\b`. + pub fn is_match_empty(&self) -> bool { + self.info.is_match_empty() + } +} + +impl HirKind { + /// Return true if and only if this HIR is the empty regular expression. + /// + /// Note that this is not defined inductively. That is, it only tests if + /// this kind is the `Empty` variant. To get the inductive definition, + /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). + pub fn is_empty(&self) -> bool { + match *self { + HirKind::Empty => true, + _ => false, + } + } + + /// Returns true if and only if this kind has any (including possibly + /// empty) subexpressions. + pub fn has_subexprs(&self) -> bool { + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => false, + HirKind::Group(_) + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => true, + } + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to a single character, where a character is either +/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters +/// are preferred whenever possible. In particular, a `Byte` variant is only +/// ever produced when it could match invalid UTF-8. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Literal { + /// A single character represented by a Unicode scalar value. + Unicode(char), + /// A single character represented by an arbitrary byte. + Byte(u8), +} + +impl Literal { + /// Returns true if and only if this literal corresponds to a Unicode + /// scalar value. + pub fn is_unicode(&self) -> bool { + match *self { + Literal::Unicode(_) => true, + Literal::Byte(b) if b <= 0x7F => true, + Literal::Byte(_) => false, + } + } +} + +/// The high-level intermediate representation of a character class. +/// +/// A character class corresponds to a set of characters. A character is either +/// defined by a Unicode scalar value or a byte. Unicode characters are used +/// by default, while bytes are used when Unicode mode (via the `u` flag) is +/// disabled. +/// +/// A character class, regardless of its character type, is represented by a +/// sequence of non-overlapping non-adjacent ranges of characters. +/// +/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may +/// be produced even when it exclusively matches valid UTF-8. This is because +/// a `Bytes` variant represents an intention by the author of the regular +/// expression to disable Unicode mode, which in turn impacts the semantics of +/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not +/// match the same set of strings. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A set of characters represented by Unicode scalar values. 
+ Unicode(ClassUnicode), + /// A set of characters represented by arbitrary bytes (one byte per + /// character). + Bytes(ClassBytes), +} + +impl Class { + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_always_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_all_ascii(), + } + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new(ranges: I) -> ClassUnicode + where I: IntoIterator + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassUnicodeRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassUnicodeIter { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassUnicodeRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple(); + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. 
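// Illustrative sketch, not part of this patch: the `ClassUnicode` set
// operations and simple case folding described above. Meant to live
// alongside the tests at the bottom of this module; the module name is made
// up for the example.
#[cfg(test)]
mod example_class_unicode {
    use super::*;

    #[test]
    fn union_and_case_folding() {
        let mut cls = ClassUnicode::new(vec![ClassUnicodeRange::new('a', 'f')]);

        // Union with `d-m` produces the single canonical range `a-m`.
        cls.union(&ClassUnicode::new(vec![ClassUnicodeRange::new('d', 'm')]));
        let ranges: Vec<(char, char)> =
            cls.iter().map(|r| (r.start(), r.end())).collect();
        assert_eq!(vec![('a', 'm')], ranges);

        // Simple case folding pulls in the corresponding uppercase range
        // (and, for letters like `k`, additional non-ASCII variants).
        cls.case_fold_simple();
        assert!(cls.iter().any(|r| r.start() <= 'A' && 'A' <= r.end()));
    }
}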
+ pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } +} + +/// An iterator over all ranges in a Unicode character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassUnicodeRange; + + fn next(&mut self) -> Option<&'a ClassUnicodeRange> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassUnicodeRange { + start: char, + end: char, +} + +impl fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let start = + if !self.start.is_whitespace() && !self.start.is_control() { + self.start.to_string() + } else { + format!("0x{:X}", self.start as u32) + }; + let end = + if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", self.end as u32) + }; + f.debug_struct("ClassUnicodeRange") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassUnicodeRange { + type Bound = char; + + #[inline] fn lower(&self) -> char { self.start } + #[inline] fn upper(&self) -> char { self.end } + #[inline] fn set_lower(&mut self, bound: char) { self.start = bound; } + #[inline] fn set_upper(&mut self, bound: char) { self.end = bound; } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple(&self, ranges: &mut Vec) { + if !unicode::contains_simple_case_mapping(self.start, self.end) { + return; + } + let start = self.start as u32; + let end = (self.end as u32).saturating_add(1); + let mut next_simple_cp = None; + for cp in (start..end).filter_map(char::from_u32) { + if next_simple_cp.map_or(false, |next| cp < next) { + continue; + } + let it = match unicode::simple_fold(cp) { + Ok(it) => it, + Err(next) => { + next_simple_cp = next; + continue; + } + }; + for cp_folded in it { + ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); + } + } + } +} + +impl ClassUnicodeRange { + /// Create a new Unicode scalar value range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: char, end: char) -> ClassUnicodeRange { + ClassUnicodeRange::create(start, end) + } + + /// Return the start of this range. 
+ /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> char { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> char { + self.end + } +} + +/// A set of characters represented by arbitrary bytes (where one byte +/// corresponds to one character). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBytes { + set: IntervalSet, +} + +impl ClassBytes { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new(ranges: I) -> ClassBytes + where I: IntoIterator + { + ClassBytes { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassBytes { + ClassBytes::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassBytesRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassBytesRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple(); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. 
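// Illustrative sketch, not part of this patch: byte classes only apply ASCII
// case folding, and `is_all_ascii` reports whether the class is limited to
// 7-bit bytes. Meant to live alongside the tests at the bottom of this
// module; the module name is made up for the example.
#[cfg(test)]
mod example_class_bytes {
    use super::*;

    #[test]
    fn ascii_only_folding() {
        let mut cls = ClassBytes::new(vec![ClassBytesRange::new(b'a', b'z')]);
        cls.case_fold_simple();
        let ranges: Vec<(u8, u8)> =
            cls.iter().map(|r| (r.start(), r.end())).collect();
        assert_eq!(vec![(b'A', b'Z'), (b'a', b'z')], ranges);
        assert!(cls.is_all_ascii());

        // Any byte above 0x7F makes the class non-ASCII.
        cls.push(ClassBytesRange::new(0x80, 0xFF));
        assert!(!cls.is_all_ascii());
    }
}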
+#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] fn lower(&self) -> u8 { self.start } + #[inline] fn upper(&self) -> u8 { self.end } + #[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; } + #[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple(&self, ranges: &mut Vec) { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } +} + +impl fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut debug = f.debug_struct("ClassBytesRange"); + if self.start <= 0x7F { + debug.field("start", &(self.start as char)); + } else { + debug.field("start", &self.start); + } + if self.end <= 0x7F { + debug.field("end", &(self.end as char)); + } else { + debug.field("end", &self.end); + } + debug.finish() + } +} + +/// The high-level intermediate representation for an anchor assertion. +/// +/// A matching anchor assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Anchor { + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLine, + /// Match the end of a line or the end of text. Specifically, + /// this matches at the end position of the input, or at the position + /// immediately preceding a `\n` character. + EndLine, + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + StartText, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. 
+ EndText, +} + +/// The high-level intermediate representation for a word-boundary assertion. +/// +/// A matching word boundary assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum WordBoundary { + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Unicode, + /// Match a Unicode-aware negation of a word boundary. + UnicodeNegate, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Ascii, + /// Match an ASCII-only negation of a word boundary. + AsciiNegate, +} + +impl WordBoundary { + /// Returns true if and only if this word boundary assertion is negated. + pub fn is_negated(&self) -> bool { + match *self { + WordBoundary::Unicode | WordBoundary::Ascii => false, + WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, + } + } +} + +/// The high-level intermediate representation for a group. +/// +/// This represents one of three possible group types: +/// +/// 1. A non-capturing group (e.g., `(?:expr)`). +/// 2. A capturing group (e.g., `(expr)`). +/// 3. A named capturing group (e.g., `(?Pexpr)`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The kind of this group. If it is a capturing group, then the kind + /// contains the capture group index (and the name, if it is a named + /// group). + pub kind: GroupKind, + /// The expression inside the capturing group, which may be empty. + pub hir: Box, +} + +/// The kind of group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// A normal unnamed capturing group. + /// + /// The value is the capture index of the group. + CaptureIndex(u32), + /// A named capturing group. + CaptureName { + /// The name of the group. + name: String, + /// The capture index of the group. + index: u32, + }, + /// A non-capturing group. + NonCapturing, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The kind of this repetition operator. + pub kind: RepetitionKind, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub hir: Box, +} + +impl Repetition { + /// Returns true if and only if this repetition operator makes it possible + /// to match the empty string. + /// + /// Note that this is not defined inductively. For example, while `a*` + /// will report `true`, `()+` will not, even though `()` matches the empty + /// string and one or more occurrences of something that matches the empty + /// string will always match the empty string. In order to get the + /// inductive definition, see the corresponding method on + /// [`Hir`](struct.Hir.html). 
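// Illustrative sketch, not part of this patch: `Repetition::is_match_empty`
// looks only at the operator itself, while `Hir::is_match_empty` is
// inductive over the whole expression. Meant to live alongside the tests at
// the bottom of this module; the module name is made up for the example.
#[cfg(test)]
mod example_repetition {
    use super::*;

    #[test]
    fn match_empty_is_not_inductive_on_repetition() {
        // Roughly `()+`.
        let rep = Repetition {
            kind: RepetitionKind::OneOrMore,
            greedy: true,
            hir: Box::new(Hir::empty()),
        };
        // The operator alone requires at least one repetition...
        assert!(!rep.is_match_empty());
        // ...but the HIR built from it knows that its inner expression
        // matches the empty string.
        assert!(Hir::repetition(rep).is_match_empty());
    }
}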
+ pub fn is_match_empty(&self) -> bool { + match self.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => false, + RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, + } + } +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// Matches a sub-expression zero or one times. + ZeroOrOne, + /// Matches a sub-expression zero or more times. + ZeroOrMore, + /// Matches a sub-expression one or more times. + OneOrMore, + /// Matches a sub-expression within a bounded range of times. + Range(RepetitionRange), +} + +/// The kind of a counted repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// Matches a sub-expression exactly this many times. + Exactly(u32), + /// Matches a sub-expression at least this many times. + AtLeast(u32), + /// Matches a sub-expression at least `m` times and at most `n` times. + Bounded(u32, u32), +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. +impl Drop for Hir { + fn drop(&mut self) { + use std::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => return, + HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, + HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => {} + HirKind::Group(ref mut x) => { + stack.push(mem::replace(&mut x.hir, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.hir, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} + +/// A type that documents various attributes of an HIR expression. +/// +/// These attributes are typically defined inductively on the HIR. +#[derive(Clone, Debug, Eq, PartialEq)] +struct HirInfo { + /// Represent yes/no questions by a bitfield to conserve space, since + /// this is included in every HIR expression. + /// + /// If more attributes need to be added, it is OK to increase the size of + /// this as appropriate. + bools: u8, +} + +// A simple macro for defining bitfield accessors/mutators. +macro_rules! 
define_bool { + ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { + fn $is_fn_name(&self) -> bool { + self.bools & (0b1 << $bit) > 0 + } + + fn $set_fn_name(&mut self, yes: bool) { + if yes { + self.bools |= 1 << $bit; + } else { + self.bools &= !(1 << $bit); + } + } + } +} + +impl HirInfo { + fn new() -> HirInfo { + HirInfo { + bools: 0, + } + } + + define_bool!(0, is_always_utf8, set_always_utf8); + define_bool!(1, is_all_assertions, set_all_assertions); + define_bool!(2, is_anchored_start, set_anchored_start); + define_bool!(3, is_anchored_end, set_anchored_end); + define_bool!(4, is_any_anchored_start, set_any_anchored_start); + define_bool!(5, is_any_anchored_end, set_any_anchored_end); + define_bool!(6, is_match_empty, set_match_empty); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn uclass(ranges: &[(char, char)]) -> ClassUnicode { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| ClassUnicodeRange::new(s, e)) + .collect(); + ClassUnicode::new(ranges) + } + + fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| ClassBytesRange::new(s, e)) + .collect(); + ClassBytes::new(ranges) + } + + fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassUnicodeRange::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassBytesRange::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let 
cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), + ('m', 'p'), ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'), + (b'm', b'p'), (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), + ('M', 'P'), ('L', 'S'), ('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), ('L', 'S'), + ('a', 'j'), ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = uclass(&[ + ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), + (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), + ]); + let expected = bclass(&[ + (b'A', b'J'), (b'L', b'S'), + (b'a', b'j'), (b'l', b's'), + ]); + assert_eq!(expected, 
bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[ + ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'), + ]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = 
bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let 
cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = 
uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn 
class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + #[test] + #[should_panic] + fn hir_byte_literal_non_ascii() { + Hir::literal(Literal::Byte(b'a')); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::group(Group { + kind: GroupKind::NonCapturing, + hir: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + kind: RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + info: HirInfo::new(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + info: HirInfo::new(), + }; + } + assert!(!expr.kind.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + thread::Builder::new() + .stack_size(1<<10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/regex-syntax-2/src/hir/translate.rs b/regex-syntax-2/src/hir/translate.rs new file mode 100644 index 0000000000..c56d9745a2 --- /dev/null +++ b/regex-syntax-2/src/hir/translate.rs @@ -0,0 +1,2506 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use std::cell::{Cell, RefCell}; +use std::result; + +use ast::{self, Ast, Span, Visitor}; +use hir::{self, Error, ErrorKind, Hir}; +use unicode::{self, ClassQuery}; + +type Result = result::Result; + +/// A builder for constructing an AST->HIR translator. +#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + allow_invalid_utf8: bool, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default c onfiguration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + allow_invalid_utf8: false, + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + allow_invalid_utf8: self.allow_invalid_utf8, + } + } + + /// When enabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the translator is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// translator will return an error). 
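As a rough end-to-end sketch of how this builder is meant to be used: a pattern is first parsed into an `Ast` with `ast::parse::ParserBuilder`, and then both the pattern and the `Ast` are handed to a `Translator`. The crate and module paths below are assumptions (the crate added by this patch is still named `regex-syntax-2`); the calls themselves mirror the test helpers further down in this file.

    extern crate regex_syntax;

    use regex_syntax::ast::parse::ParserBuilder;
    use regex_syntax::hir::translate::TranslatorBuilder;

    fn main() {
        let pattern = r"(?-u)\xFF";
        // Concrete syntax -> AST.
        let ast = ParserBuilder::new().build().parse(pattern).unwrap();
        // AST -> HIR, permitting expressions that can match invalid UTF-8.
        let hir = TranslatorBuilder::new()
            .allow_invalid_utf8(true)
            .build()
            .translate(pattern, &ast)
            .unwrap();
        println!("{:?}", hir);
    }

Without `allow_invalid_utf8(true)`, the same pattern is rejected with an `InvalidUtf8` error, which is exactly what the `t_err` tests below exercise.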
+ pub fn allow_invalid_utf8( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.allow_invalid_utf8 = yes; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. + pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may be benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell>, + /// The current flag settings. + flags: Cell, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + allow_invalid_utf8: bool, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). + /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. +#[derive(Clone, Debug)] +enum HirFrame { + /// An arbitrary HIR expression. These get pushed whenever we hit a base + /// case in the Ast. They get popped after an inductive (i.e., recursive) + /// step is complete. + Expr(Hir), + /// A Unicode character class. This frame is mutated as we descend into + /// the Ast of a character class (which is itself its own mini recursive + /// structure). + ClassUnicode(hir::ClassUnicode), + /// A byte-oriented character class. This frame is mutated as we descend + /// into the Ast of a character class (which is itself its own mini + /// recursive structure). 
+ /// + /// Byte character classes are created when Unicode mode (`u`) is disabled. + /// If `allow_invalid_utf8` is disabled (the default), then a byte + /// character is only permitted to match ASCII text. + ClassBytes(hir::ClassBytes), + /// This is pushed on to the stack upon first seeing any kind of group, + /// indicated by parentheses (including non-capturing groups). It is popped + /// upon leaving a group. + Group { + /// The old active flags, if any, when this group was opened. + /// + /// If this group sets flags, then the new active flags are set to the + /// result of merging the old flags with the flags introduced by this + /// group. + /// + /// When this group is popped, the active flags should be restored to + /// the flags set here. + /// + /// The "active" flags correspond to whatever flags are set in the + /// Translator. + old_flags: Option, + }, + /// This is pushed whenever a concatenation is observed. After visiting + /// every sub-expression in the concatenation, the translator's stack is + /// popped until it sees a Concat frame. + Concat, + /// This is pushed whenever an alternation is observed. After visiting + /// every sub-expression in the alternation, the translator's stack is + /// popped until it sees an Alternation frame. + Alternation, +} + +impl HirFrame { + /// Assert that the current stack frame is an Hir expression and return it. + fn unwrap_expr(self) -> Hir { + match self { + HirFrame::Expr(expr) => expr, + _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self) + } + } + + /// Assert that the current stack frame is a Unicode class expression and + /// return it. + fn unwrap_class_unicode(self) -> hir::ClassUnicode { + match self { + HirFrame::ClassUnicode(cls) => cls, + _ => panic!("tried to unwrap Unicode class \ + from HirFrame, got: {:?}", self) + } + } + + /// Assert that the current stack frame is a byte class expression and + /// return it. + fn unwrap_class_bytes(self) -> hir::ClassBytes { + match self { + HirFrame::ClassBytes(cls) => cls, + _ => panic!("tried to unwrap byte class \ + from HirFrame, got: {:?}", self) + } + } + + /// Assert that the current stack frame is a group indicator and return + /// its corresponding flags (the flags that were active at the time the + /// group was entered) if they exist. + fn unwrap_group(self) -> Option { + match self { + HirFrame::Group { old_flags } => old_flags, + _ => panic!("tried to unwrap group from HirFrame, got: {:?}", self) + } + } +} + +impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { + type Output = Hir; + type Err = Error; + + fn finish(self) -> Result { + if self.trans().stack.borrow().is_empty() { + // This can happen if the Ast given consists of a single set of + // flags. e.g., `(?i)`. /shrug + return Ok(Hir::empty()); + } + // ... otherwise, we should have exactly one HIR on the stack. 
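The `HirFrame` variants above are the heap-allocated stand-in for what would otherwise be call-stack frames: `visit_pre` pushes a marker (or an empty class), and `visit_post` pops the child `Expr` frames back off and pushes the combined result. A minimal, self-contained sketch of that discipline on a toy expression type (illustrative only; these are not the types from this file):

    // Toy expression: a literal or a concatenation of sub-expressions.
    enum Expr {
        Lit(char),
        Concat(Vec<Expr>),
    }

    // Count literals without recursing: pending work lives in a Vec on the
    // heap, so arbitrarily deep expressions need only constant stack space.
    fn count_lits(expr: &Expr) -> usize {
        let mut stack = vec![expr];
        let mut count = 0;
        while let Some(e) = stack.pop() {
            match *e {
                Expr::Lit(_) => count += 1,
                Expr::Concat(ref subs) => stack.extend(subs.iter()),
            }
        }
        count
    }

    fn main() {
        let e = Expr::Concat(vec![
            Expr::Lit('a'),
            Expr::Concat(vec![Expr::Lit('b')]),
        ]);
        assert_eq!(2, count_lits(&e));
    }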
+ assert_eq!(self.trans().stack.borrow().len(), 1); + Ok(self.pop().unwrap().unwrap_expr()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Class(ast::Class::Bracketed(_)) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + Ast::Group(ref x) => { + let old_flags = x.flags().map(|ast| self.set_flags(ast)); + self.push(HirFrame::Group { + old_flags: old_flags, + }); + } + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { + self.push(HirFrame::Concat); + } + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { + self.push(HirFrame::Alternation); + } + _ => {} + } + Ok(()) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) => { + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Flags(ref x) => { + self.set_flags(&x.flags); + } + Ast::Literal(ref x) => { + self.push(HirFrame::Expr(try!(self.hir_literal(x)))); + } + Ast::Dot(span) => { + self.push(HirFrame::Expr(try!(self.hir_dot(span)))); + } + Ast::Assertion(ref x) => { + self.push(HirFrame::Expr(self.hir_assertion(x))); + } + Ast::Class(ast::Class::Perl(ref x)) => { + if self.flags().unicode() { + let cls = self.hir_perl_unicode_class(x); + let hcls = hir::Class::Unicode(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } else { + let cls = self.hir_perl_byte_class(x); + let hcls = hir::Class::Bytes(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } + } + Ast::Class(ast::Class::Unicode(ref x)) => { + let cls = hir::Class::Unicode(try!(self.hir_unicode_class(x))); + self.push(HirFrame::Expr(Hir::class(cls))); + } + Ast::Class(ast::Class::Bracketed(ref ast)) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate(ast.negated, &mut cls); + if cls.iter().next().is_none() { + return Err(self.error( + ast.span, ErrorKind::EmptyClassNotAllowed)); + } + let expr = Hir::class(hir::Class::Unicode(cls)); + self.push(HirFrame::Expr(expr)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + try!(self.bytes_fold_and_negate( + &ast.span, ast.negated, &mut cls)); + if cls.iter().next().is_none() { + return Err(self.error( + ast.span, ErrorKind::EmptyClassNotAllowed)); + } + + let expr = Hir::class(hir::Class::Bytes(cls)); + self.push(HirFrame::Expr(expr)); + } + } + Ast::Repetition(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + self.push(HirFrame::Expr(self.hir_repetition(x, expr))); + } + Ast::Group(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + if let Some(flags) = self.pop().unwrap().unwrap_group() { + self.trans().flags.set(flags); + } + self.push(HirFrame::Expr(self.hir_group(x, expr))); + } + Ast::Concat(_) => { + let mut exprs = vec![]; + while let Some(HirFrame::Expr(expr)) = self.pop() { + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::concat(exprs))); + } + Ast::Alternation(_) => { + let mut exprs = vec![]; + while let Some(HirFrame::Expr(expr)) = self.pop() { + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::alternation(exprs))); + } + } + Ok(()) + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Bracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + 
self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + // We needn't handle the Union case here since the visitor will + // do it for us. + _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = try!(self.class_literal_byte(x)); + cls.push(hir::ClassBytesRange::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = try!(self.class_literal_byte(&x.start)); + let end = try!(self.class_literal_byte(&x.end)); + cls.push(hir::ClassBytesRange::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + for &(s, e) in ascii_class(&x.kind) { + cls.push(hir::ClassUnicodeRange::new(s, e)); + } + self.unicode_fold_and_negate(x.negated, &mut cls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + for &(s, e) in ascii_class(&x.kind) { + cls.push(hir::ClassBytesRange::new(s as u8, e as u8)); + } + try!(self.bytes_fold_and_negate( + &x.span, x.negated, &mut cls)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = try!(self.hir_unicode_class(x)); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_perl_unicode_class(x); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate(ast.negated, &mut cls1); + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + try!(self.bytes_fold_and_negate( + &ast.span, ast.negated, &mut cls1)); + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. 
+ ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans: trans, pattern: pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option { + self.trans().stack.borrow_mut().pop() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind: kind, pattern: self.pattern.to_string(), span: span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. 
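`set_flags` below merges the flags introduced by a group over whatever flags were already active, and returns the old set so that the group's `visit_post` can restore it. A minimal sketch of that three-state merge, using plain `Option<bool>` values just as the `Flags` type defined later in this file does (an illustration, not the actual implementation):

    // A flag is absent (None), explicitly disabled (Some(false)) or
    // explicitly enabled (Some(true)); the innermost setting wins.
    fn merge(new: Option<bool>, old: Option<bool>) -> Option<bool> {
        new.or(old)
    }

    fn main() {
        let outer = Some(true);           // e.g. a surrounding `(?i)`
        let inner = Some(false);          // e.g. `(?-i: ... )` inside it
        let unset: Option<bool> = None;   // a group that doesn't mention `i`

        assert_eq!(Some(false), merge(inner, outer)); // the group overrides
        assert_eq!(Some(true), merge(unset, outer));  // the group inherits
    }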
+ fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + fn hir_literal(&self, lit: &ast::Literal) -> Result { + let ch = match try!(self.literal_to_char(lit)) { + byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), + hir::Literal::Unicode(ch) => ch, + }; + if self.flags().case_insensitive() { + self.hir_from_char_case_insensitive(lit.span, ch) + } else { + self.hir_from_char(lit.span, ch) + } + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a raw byte is returned. If that + /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns + /// an error. + fn literal_to_char(&self, lit: &ast::Literal) -> Result { + if self.flags().unicode() { + return Ok(hir::Literal::Unicode(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(hir::Literal::Unicode(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(hir::Literal::Unicode(byte as char)); + } + if !self.trans().allow_invalid_utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(hir::Literal::Byte(byte)) + } + + fn hir_from_char(&self, span: Span, c: char) -> Result { + if !self.flags().unicode() && c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + Ok(Hir::literal(hir::Literal::Unicode(c))) + } + + fn hir_from_char_case_insensitive( + &self, + span: Span, + c: char, + ) -> Result { + // If case folding won't do anything, then don't bother trying. + if !unicode::contains_simple_case_mapping(c, c) { + return self.hir_from_char(span, c); + } + if self.flags().unicode() { + let mut cls = hir::ClassUnicode::new(vec![ + hir::ClassUnicodeRange::new(c, c), + ]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Unicode(cls))) + } else { + if c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + let mut cls = hir::ClassBytes::new(vec![ + hir::ClassBytesRange::new(c as u8, c as u8), + ]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Bytes(cls))) + } + } + + fn hir_dot(&self, span: Span) -> Result { + let unicode = self.flags().unicode(); + if !unicode && !self.trans().allow_invalid_utf8 { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + Ok(if self.flags().dot_matches_new_line() { + Hir::any(!unicode) + } else { + Hir::dot(!unicode) + }) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Hir { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + match asst.kind { + ast::AssertionKind::StartLine => { + Hir::anchor(if multi_line { + hir::Anchor::StartLine + } else { + hir::Anchor::StartText + }) + } + ast::AssertionKind::EndLine => { + Hir::anchor(if multi_line { + hir::Anchor::EndLine + } else { + hir::Anchor::EndText + }) + } + ast::AssertionKind::StartText => { + Hir::anchor(hir::Anchor::StartText) + } + ast::AssertionKind::EndText => { + Hir::anchor(hir::Anchor::EndText) + } + ast::AssertionKind::WordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::Unicode + } else { + hir::WordBoundary::Ascii + }) + } + ast::AssertionKind::NotWordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::UnicodeNegate + } else { + hir::WordBoundary::AsciiNegate + }) + } + } + } + + fn 
hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { + let kind = match group.kind { + ast::GroupKind::CaptureIndex(idx) => { + hir::GroupKind::CaptureIndex(idx) + } + ast::GroupKind::CaptureName(ref capname) => { + hir::GroupKind::CaptureName { + name: capname.name.clone(), + index: capname.index, + } + } + ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + }; + Hir::group(hir::Group { + kind: kind, + hir: Box::new(expr), + }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let kind = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, + ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, + ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(m,n)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) + } + }; + let greedy = + if self.flags().swap_greed() { + !rep.greedy + } else { + rep.greedy + }; + Hir::repetition(hir::Repetition { + kind: kind, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result { + use ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err(self.error( + ast_class.span, + ErrorKind::UnicodeNotAllowed, + )); + } + let query = match ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => { + ClassQuery::ByValue { + property_name: name, + property_value: value, + } + } + }; + match unicode::class(query) { + Ok(mut class) => { + self.unicode_fold_and_negate(ast_class.negated, &mut class); + Ok(class) + } + Err(unicode::Error::PropertyNotFound) => { + Err(self.error( + ast_class.span, + ErrorKind::UnicodePropertyNotFound, + )) + } + Err(unicode::Error::PropertyValueNotFound) => { + Err(self.error( + ast_class.span, + ErrorKind::UnicodePropertyValueNotFound, + )) + } + } + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassUnicode { + use ast::ClassPerlKind::*; + use unicode_tables::perl_word::PERL_WORD; + + assert!(self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => { + let query = ClassQuery::Binary("Decimal_Number"); + unicode::class(query).unwrap() + } + Space => { + let query = ClassQuery::Binary("Whitespace"); + unicode::class(query).unwrap() + } + Word => unicode::hir_class(PERL_WORD), + }; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. + if ast_class.negated { + class.negate(); + } + class + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassBytes { + use ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). 
+ if ast_class.negated { + class.negate(); + } + class + } + + fn unicode_fold_and_negate( + &self, + negated: bool, + class: &mut hir::ClassUnicode, + ) { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. + fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { + match try!(self.literal_to_char(ast)) { + hir::Literal::Byte(byte) => Ok(byte), + hir::Literal::Unicode(ch) => { + if ch <= 0x7F as char { + Ok(ch as u8) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled, or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option<bool>, + multi_line: Option<bool>, + dot_matches_new_line: Option<bool>, + swap_greed: Option<bool>, + unicode: Option<bool>, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser.
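The ordering noted in `unicode_fold_and_negate` above can be seen directly on the public class API: folding `{x}` and then negating excludes both `x` and `X`, while negating first and folding afterwards collapses back to the class of every scalar value. A small sketch (the `regex_syntax` crate name and `hir` re-exports are assumptions, as in the earlier examples):

    extern crate regex_syntax;

    use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange};

    fn main() {
        // What `(?i)[^x]` should mean: fold {x} to {x, X}, then negate.
        let mut fold_then_negate =
            ClassUnicode::new(vec![ClassUnicodeRange::new('x', 'x')]);
        fold_then_negate.case_fold_simple();
        fold_then_negate.negate();

        // The wrong order: negating {x} leaves `X` in the class, so folding
        // afterwards adds `x` back and the class matches everything.
        let mut negate_then_fold =
            ClassUnicode::new(vec![ClassUnicodeRange::new('x', 'x')]);
        negate_then_fold.negate();
        negate_then_fold.case_fold_simple();

        assert_ne!(fold_then_negate, negate_then_fold);
    }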
+} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind).iter().cloned().map(|(s, e)| { + hir::ClassBytesRange::new(s as u8, e as u8) + }).collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { + use ast::ClassAsciiKind::*; + + // TODO: Get rid of these consts, which appear necessary for older + // versions of Rust. + type T = &'static [(char, char)]; + match *kind { + Alnum => { + const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; + X + } + Alpha => { + const X: T = &[('A', 'Z'), ('a', 'z')]; + X + } + Ascii => { + const X: T = &[('\x00', '\x7F')]; + X + } + Blank => { + const X: T = &[(' ', '\t')]; + X + } + Cntrl => { + const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; + X + } + Digit => { + const X: T = &[('0', '9')]; + X + } + Graph => { + const X: T = &[('!', '~')]; + X + } + Lower => { + const X: T = &[('a', 'z')]; + X + } + Print => { + const X: T = &[(' ', '~')]; + X + } + Punct => { + const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; + X + } + Space => { + const X: T = &[ + ('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'), + ('\r', '\r'), (' ', ' '), + ]; + X + } + Upper => { + const X: T = &[('A', 'Z')]; + X + } + Word => { + const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')]; + X + } + Xdigit => { + const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')]; + X + } + } +} + +#[cfg(test)] +mod tests { + use ast::{self, Ast, Position, Span}; + use ast::parse::ParserBuilder; + use hir::{self, Hir, HirKind}; + use unicode::{self, ClassQuery}; + + use super::{TranslatorBuilder, ascii_class}; + + // We create these errors to compare with real hir::Errors in the tests. 
+ // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn hir_lit(s: &str) -> Hir { + match s.len() { + 0 => Hir::empty(), + _ => { + let lits = s + .chars() + .map(hir::Literal::Unicode) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_blit(s: &[u8]) -> Hir { + match s.len() { + 0 => Hir::empty(), + 1 => Hir::literal(hir::Literal::Byte(s[0])), + _ => { + let lits = s + .iter() + .cloned() + .map(hir::Literal::Byte) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_group(i: u32, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureIndex(i), + hir: Box::new(expr), + }) + } + + fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureName { + name: name.to_string(), + index: i, + }, + hir: Box::new(expr), + }) + } + + fn hir_group_nocap(expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::Range(range), + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec) -> Hir { + Hir::concat(exprs) + } + + fn hir_uclass_query(query: ClassQuery) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + fn hir_uclass_perl_word() -> Hir { + use unicode_tables::perl_word::PERL_WORD; + Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD))) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, 
e)) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| { + assert!(s as u32 <= 0x7F); + assert!(e as u32 <= 0x7F); + hir::ClassBytesRange::new(s as u8, e as u8) + }) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), + } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + ( + HirKind::Class(Unicode(mut c1)), + HirKind::Class(Unicode(c2)), + ) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + ( + HirKind::Class(Bytes(mut c1)), + HirKind::Class(Bytes(c2)), + ) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + ( + HirKind::Class(Unicode(mut c1)), + HirKind::Class(Unicode(c2)), + ) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + ( + HirKind::Class(Bytes(mut c1)), + HirKind::Class(Bytes(c2)), + ) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_anchor(anchor: hir::Anchor) -> Hir { + Hir::anchor(anchor) + } + + fn hir_word(wb: hir::WordBoundary) -> Hir { + Hir::word_boundary(wb) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?i)"), Hir::empty()); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?P)"), hir_group_name(1, "wat", Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!(t("()|()"), hir_alt(vec![ + hir_group(1, Hir::empty()), + hir_group(2, Hir::empty()), + ])); + assert_eq!(t("(|b)"), hir_group(1, hir_alt(vec![ + Hir::empty(), + hir_lit("b"), + ]))); + assert_eq!(t("(a|)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + Hir::empty(), + ]))); + assert_eq!(t("(a||c)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + Hir::empty(), + hir_lit("c"), + ]))); + assert_eq!(t("(||)"), hir_group(1, hir_alt(vec![ + Hir::empty(), + Hir::empty(), + Hir::empty(), + ]))); + } + + #[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t_err("(?-u)☃"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)), + }); + assert_eq!(t_err(r"(?-u)\xFF"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)), + }); + } + + #[test] + fn literal_case_insensitive() { + assert_eq!(t("(?i)a"), hir_uclass(&[ + ('A', 'A'), 
('a', 'a'), + ])); + assert_eq!(t("(?i:a)"), hir_group_nocap(hir_uclass(&[ + ('A', 'A'), ('a', 'a')], + ))); + assert_eq!(t("a(?i)a(?-i)a"), hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])); + assert_eq!(t("(?i)ab@c"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ])); + assert_eq!(t("(?i)β"), hir_uclass(&[ + ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'), + ])); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ])); + assert_eq!(t("(?i-u)ab@c"), hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ])); + + assert_eq!(t_bytes("(?i-u)a"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t_bytes("(?i-u)\x61"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t_bytes(r"(?i-u)\x61"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t_err("(?i-u)β"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 8), + ), + }); + } + + #[test] + fn dot() { + assert_eq!(t("."), hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\u{10FFFF}'), + ])); + assert_eq!(t("(?s)."), hir_uclass(&[ + ('\0', '\u{10FFFF}'), + ])); + assert_eq!(t_bytes("(?-u)."), hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\xFF'), + ])); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[ + (b'\0', b'\xFF'), + ])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. 
+ assert_eq!(t_err("(?-u)."), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(6, 1, 7)), + }); + assert_eq!(t_err("(?s-u)."), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(6, 1, 7), Position::new(7, 1, 8)), + }); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); + assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); + assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); + + assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); + assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); + assert_eq!(t(r"(?-u)\B"), hir_word(hir::WordBoundary::AsciiNegate)); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!(t("(a)(b)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ])); + assert_eq!(t("(a)|(b)"), hir_alt(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ])); + assert_eq!(t("(?P)"), hir_group_name(1, "foo", Hir::empty())); + assert_eq!(t("(?Pa)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!(t("(?Pa)(?Pb)"), hir_cat(vec![ + hir_group_name(1, "foo", hir_lit("a")), + hir_group_name(2, "bar", hir_lit("b")), + ])); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!(t("(?:a)(b)"), hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_group(1, hir_lit("b")), + ])); + assert_eq!(t("(a)(?:b)(c)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_nocap(hir_lit("b")), + hir_group(2, hir_lit("c")), + ])); + assert_eq!(t("(a)(?Pb)(c)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_name(2, "foo", hir_lit("b")), + hir_group(3, hir_lit("c")), + ])); + } + + #[test] + fn flags() { + assert_eq!(t("(?i:a)a"), hir_cat(vec![ + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), + hir_lit("a"), + ])); + assert_eq!(t("(?i-u:a)β"), hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("β"), + ])); + assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])); + assert_eq!(t("(?im)a^"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + ])); + assert_eq!(t("(?im)a^(?i-m)a^"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartText), + ])); + assert_eq!(t("(?U)a*a*?(?-U)a*a*?"), hir_cat(vec![ + hir_star(false, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ])); + assert_eq!(t("(?:a(?i)a)a"), hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])), + hir_lit("a"), + ])); + assert_eq!(t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])); + } + + #[test] + fn escape() { + assert_eq!( + 
t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#"), + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!( + t("a{1}"), + hir_range( + true, + hir::RepetitionRange::Exactly(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,}"), + hir_range( + true, + hir::RepetitionRange::AtLeast(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,2}"), + hir_range( + true, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + )); + assert_eq!( + t("a{1}?"), + hir_range( + false, + hir::RepetitionRange::Exactly(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,}?"), + hir_range( + false, + hir::RepetitionRange::AtLeast(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,2}?"), + hir_range( + false, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + )); + + assert_eq!(t("ab?"), hir_cat(vec![ + hir_lit("a"), + hir_quest(true, hir_lit("b")), + ])); + assert_eq!(t("(ab)?"), hir_quest(true, hir_group(1, hir_cat(vec![ + hir_lit("a"), + hir_lit("b"), + ])))); + assert_eq!(t("a|b?"), hir_alt(vec![ + hir_lit("a"), + hir_quest(true, hir_lit("b")), + ])); + } + + #[test] + fn cat_alt() { + assert_eq!(t("(ab)"), hir_group(1, hir_cat(vec![ + hir_lit("a"), + hir_lit("b"), + ]))); + assert_eq!(t("a|b"), hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + ])); + assert_eq!(t("a|b|c"), hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + hir_lit("c"), + ])); + assert_eq!(t("ab|bc|cd"), hir_alt(vec![ + hir_lit("ab"), + hir_lit("bc"), + hir_lit("cd"), + ])); + assert_eq!(t("(a|b)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + ]))); + assert_eq!(t("(a|b|c)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + hir_lit("c"), + ]))); + assert_eq!(t("(ab|bc|cd)"), hir_group(1, hir_alt(vec![ + hir_lit("ab"), + hir_lit("bc"), + hir_lit("cd"), + ]))); + assert_eq!(t("(ab|(bc|(cd)))"), hir_group(1, hir_alt(vec![ + hir_lit("ab"), + hir_group(2, hir_alt(vec![ + hir_lit("bc"), + hir_group(3, hir_lit("cd")), + ])), + ]))); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))); + assert_eq!( + t("[[:alpha:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))); + assert_eq!( + t("[[:ascii:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))); + assert_eq!( + t("[[:blank:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))); + assert_eq!( + t("[[:cntrl:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))); + assert_eq!( + t("[[:digit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))); + assert_eq!( + t("[[:graph:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))); + assert_eq!( + t("[[:lower:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))); + assert_eq!( + t("[[:print:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))); + assert_eq!( + t("[[:punct:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))); + assert_eq!( + t("[[:space:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t("[[:upper:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))); + assert_eq!( + t("[[:word:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))); + assert_eq!( + t("[[:xdigit:]]"), + 
hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))); + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ])); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Lower)))); + + assert_eq!(t_err("(?-u)[[:^lower:]]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(6, 1, 7), Position::new(16, 1, 17)), + }); + assert_eq!(t_err("(?i-u)[[:^lower:]]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(7, 1, 8), Position::new(17, 1, 18)), + }); + } + + #[test] + fn class_perl() { + // Unicode + assert_eq!( + t(r"\d"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"\s"), + hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!( + t(r"\w"), + hir_uclass_perl_word()); + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!( + t(r"(?i)\w"), + hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space")))); + assert_eq!( + t(r"\W"), + hir_negate(hir_uclass_perl_word())); + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space")))); + assert_eq!( + t(r"(?i)\W"), + hir_negate(hir_uclass_perl_word())); + + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))); + assert_eq!( + t(r"(?-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t(r"(?-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))); + assert_eq!( + t(r"(?i-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))); + assert_eq!( + t(r"(?i-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t(r"(?i-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))); + + // ASCII only, negated + assert_eq!( + t(r"(?-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t(r"(?-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space)))); + assert_eq!( + t(r"(?-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word)))); + assert_eq!( + t(r"(?i-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t(r"(?i-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space)))); + assert_eq!( + t(r"(?i-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word)))); + } + + #[test] + fn class_unicode() { + assert_eq!( + t(r"\pZ"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\pz"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + 
t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + assert_eq!( + t(r"\P{separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek"))); + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))); + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query( + ClassQuery::Binary("Greek"))))); + + assert_eq!( + t(r"\p{any}"), + hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned"))); + assert_eq!( + t(r"\p{ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII"))); + assert_eq!( + t(r"\p{gc:any}"), + hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{gc:assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned"))); + assert_eq!( + t(r"\p{gc:ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII"))); + + assert_eq!(t_err(r"(?-u)\pZ"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 9)), + }); + assert_eq!(t_err(r"(?-u)\p{Separator}"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(18, 1, 19)), + }); + assert_eq!(t_err(r"\pE"), TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(3, 1, 4)), + }); + assert_eq!(t_err(r"\p{Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)), + }); + assert_eq!(t_err(r"\p{gc:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"\p{sc:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"\p{scx:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)), + }); + assert_eq!(t_err(r"\p{age:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)), + }); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); + assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!( + t(r"[\d]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[\pZ]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + 
t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + assert_eq!(t("(?i)[k]"), hir_uclass(&[ + ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), + ])); + assert_eq!(t("(?i)[β]"), hir_uclass(&[ + ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'), + ])); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[ + (b'K', b'K'), (b'k', b'k'), + ])); + + assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + hir_negate(hir_bclass(&[(b'a', b'a')]))); + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query( + ClassQuery::Binary("greek"))))); + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query( + ClassQuery::Binary("greek"))))); + + // Test some weird cases. + assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!(t_err("(?-u)[^a]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)), + }); + assert_eq!(t_err(r"[^\s\S]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)), + }); + assert_eq!(t_err(r"(?-u)[^\s\S]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(12, 1, 13)), + }); + } + + #[test] + fn class_bracketed_union() { + assert_eq!( + t("[a-zA-Z]"), + hir_uclass(&[('A', 'Z'), ('a', 'z')])); + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + 
hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator"))))); + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + assert_eq!( + t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator"))))))); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!( + t(r"[a[^c]]"), + hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!( + t(r"[a-b[^c]]"), + hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!( + t(r"[a-c[^c]]"), + hir_negate(hir_uclass(&[]))); + + assert_eq!( + t(r"[^a[^c]]"), + hir_uclass(&[('c', 'c')])); + assert_eq!( + t(r"[^a-b[^c]]"), + hir_uclass(&[('c', 'c')])); + + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))); + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))); + + assert_eq!( + t(r"(?i)[^a[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')])); + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')])); + + assert_eq!(t_err(r"[^a-c[^c]]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"(?i)[^a-c[^c]]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(4, 1, 5), Position::new(14, 1, 15)), + }); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + 
t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')]))); + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')]))); + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')]))); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')]))); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')]))); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')]))); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')])); + } + + #[test] + fn class_bracketed_intersect_negate() { + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^[a-z&&a-c]]"), + hir_negate(hir_uclass(&[('a', 'c')]))); + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[[[^\w]&&[^\d]]]"), + hir_negate(hir_uclass_perl_word())); + + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')]))); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit))); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word)))); + } + + #[test] + fn class_bracketed_difference() { + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]))); + + assert_eq!( + t(r"(?-u)[[:alpha:]--[:lower:]]"), + hir_bclass(&[(b'A', b'Z')])); + } + + #[test] + fn class_bracketed_symmetric_difference() { + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ])); + assert_eq!( + t(r"[a-g~~c-j]"), + hir_uclass(&[('a', 'b'), ('h', 'j')])); + + assert_eq!( + t(r"(?-u)[a-g~~c-j]"), + hir_bclass(&[(b'a', b'b'), (b'h', b'j')])); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!(t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), hir_lit("S")); + + assert_eq!(t(r"(?x)\x 53"), 
hir_lit("S")); + assert_eq!(t(r"(?x)\x # comment + 53 # comment"), hir_lit("S")); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + assert_eq!(t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), hir_uclass_query(ClassQuery::Binary("separator"))); + + assert_eq!(t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range( + true, hir::RepetitionRange::Bounded(5, 10), hir_lit("a"))); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_always_utf8() { + // Positive examples. + assert!(t_bytes(r"a").is_always_utf8()); + assert!(t_bytes(r"ab").is_always_utf8()); + assert!(t_bytes(r"(?-u)a").is_always_utf8()); + assert!(t_bytes(r"(?-u)ab").is_always_utf8()); + assert!(t_bytes(r"\xFF").is_always_utf8()); + assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); + assert!(t_bytes(r"[^a]").is_always_utf8()); + assert!(t_bytes(r"[^a][^a]").is_always_utf8()); + assert!(t_bytes(r"\b").is_always_utf8()); + assert!(t_bytes(r"\B").is_always_utf8()); + + // Negative examples. + assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\b").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + assert!(t(r"\b").is_all_assertions()); + assert!(t(r"\B").is_all_assertions()); + assert!(t(r"^").is_all_assertions()); + assert!(t(r"$").is_all_assertions()); + assert!(t(r"\A").is_all_assertions()); + assert!(t(r"\z").is_all_assertions()); + assert!(t(r"$^\z\A\b\B").is_all_assertions()); + assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); + assert!(t(r"^$|$^").is_all_assertions()); + assert!(t(r"((\b)+())*^").is_all_assertions()); + + // Negative examples. + assert!(!t(r"^a").is_all_assertions()); + } + + #[test] + fn analysis_is_anchored() { + // Positive examples. + assert!(t(r"^").is_anchored_start()); + assert!(t(r"$").is_anchored_end()); + + assert!(t(r"^^").is_anchored_start()); + assert!(t(r"$$").is_anchored_end()); + + assert!(t(r"^$").is_anchored_start()); + assert!(t(r"^$").is_anchored_end()); + + assert!(t(r"^foo").is_anchored_start()); + assert!(t(r"foo$").is_anchored_end()); + + assert!(t(r"^foo|^bar").is_anchored_start()); + assert!(t(r"foo$|bar$").is_anchored_end()); + + assert!(t(r"^(foo|bar)").is_anchored_start()); + assert!(t(r"(foo|bar)$").is_anchored_end()); + + assert!(t(r"^+").is_anchored_start()); + assert!(t(r"$+").is_anchored_end()); + assert!(t(r"^++").is_anchored_start()); + assert!(t(r"$++").is_anchored_end()); + assert!(t(r"(^)+").is_anchored_start()); + assert!(t(r"($)+").is_anchored_end()); + + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_anchored_end()); + assert!(t(r"$^|^$").is_anchored_start()); + assert!(t(r"$^|^$").is_anchored_end()); + + assert!(t(r"\b^").is_anchored_start()); + assert!(t(r"$\b").is_anchored_end()); + assert!(t(r"^(?m:^)").is_anchored_start()); + assert!(t(r"(?m:$)$").is_anchored_end()); + assert!(t(r"(?m:^)^").is_anchored_start()); + assert!(t(r"$(?m:$)").is_anchored_end()); + + // Negative examples. 
+ assert!(!t(r"(?m)^").is_anchored_start()); + assert!(!t(r"(?m)$").is_anchored_end()); + assert!(!t(r"(?m:^$)|$^").is_anchored_start()); + assert!(!t(r"(?m:^$)|$^").is_anchored_end()); + assert!(!t(r"$^|(?m:^$)").is_anchored_start()); + assert!(!t(r"$^|(?m:^$)").is_anchored_end()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + + assert!(!t(r"^foo|bar").is_anchored_start()); + assert!(!t(r"foo|bar$").is_anchored_end()); + + assert!(!t(r"^*").is_anchored_start()); + assert!(!t(r"$*").is_anchored_end()); + assert!(!t(r"^*+").is_anchored_start()); + assert!(!t(r"$*+").is_anchored_end()); + assert!(!t(r"^+*").is_anchored_start()); + assert!(!t(r"$+*").is_anchored_end()); + assert!(!t(r"(^)*").is_anchored_start()); + assert!(!t(r"($)*").is_anchored_end()); + } + + #[test] + fn analysis_is_any_anchored() { + // Positive examples. + assert!(t(r"^").is_any_anchored_start()); + assert!(t(r"$").is_any_anchored_end()); + assert!(t(r"\A").is_any_anchored_start()); + assert!(t(r"\z").is_any_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_any_anchored_start()); + assert!(!t(r"(?m)$").is_any_anchored_end()); + assert!(!t(r"$").is_any_anchored_start()); + assert!(!t(r"^").is_any_anchored_end()); + } + + #[test] + fn analysis_is_match_empty() { + // Positive examples. + assert!(t(r"").is_match_empty()); + assert!(t(r"()").is_match_empty()); + assert!(t(r"()*").is_match_empty()); + assert!(t(r"()+").is_match_empty()); + assert!(t(r"()?").is_match_empty()); + assert!(t(r"a*").is_match_empty()); + assert!(t(r"a?").is_match_empty()); + assert!(t(r"a{0}").is_match_empty()); + assert!(t(r"a{0,}").is_match_empty()); + assert!(t(r"a{0,1}").is_match_empty()); + assert!(t(r"a{0,10}").is_match_empty()); + assert!(t(r"\pL*").is_match_empty()); + assert!(t(r"a*|b").is_match_empty()); + assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a*a?(abcd)*").is_match_empty()); + assert!(t(r"^").is_match_empty()); + assert!(t(r"$").is_match_empty()); + assert!(t(r"(?m)^").is_match_empty()); + assert!(t(r"(?m)$").is_match_empty()); + assert!(t(r"\A").is_match_empty()); + assert!(t(r"\z").is_match_empty()); + assert!(t(r"\B").is_match_empty()); + assert!(t(r"(?-u)\B").is_match_empty()); + + // Negative examples. + assert!(!t(r"a+").is_match_empty()); + assert!(!t(r"a{1}").is_match_empty()); + assert!(!t(r"a{1,}").is_match_empty()); + assert!(!t(r"a{1,2}").is_match_empty()); + assert!(!t(r"a{1,10}").is_match_empty()); + assert!(!t(r"b|a").is_match_empty()); + assert!(!t(r"a*a+(abcd)*").is_match_empty()); + assert!(!t(r"\b").is_match_empty()); + assert!(!t(r"(?-u)\b").is_match_empty()); + } +} diff --git a/regex-syntax-2/src/hir/visitor.rs b/regex-syntax-2/src/hir/visitor.rs new file mode 100644 index 0000000000..716a96d9b4 --- /dev/null +++ b/regex-syntax-2/src/hir/visitor.rs @@ -0,0 +1,222 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use hir::{self, Hir, HirKind}; + +/// A trait for visiting the high-level IR (HIR) in depth first order. 
+///
+/// The principal aim of this trait is to enable callers to perform case
+/// analysis on a high-level intermediate representation of a regular
+/// expression without necessarily using recursion. In particular, this permits
+/// callers to do case analysis with constant stack usage, which can be
+/// important since the size of an HIR may be proportional to end user input.
+///
+/// Typical usage of this trait involves providing an implementation and then
+/// running it using the [`visit`](fn.visit.html) function.
+pub trait Visitor {
+    /// The result of visiting an HIR.
+    type Output;
+    /// An error that visiting an HIR might return.
+    type Err;
+
+    /// All implementors of `Visitor` must provide a `finish` method, which
+    /// yields the result of visiting the HIR or an error.
+    fn finish(self) -> Result<Self::Output, Self::Err>;
+
+    /// This method is called before beginning traversal of the HIR.
+    fn start(&mut self) {}
+
+    /// This method is called on an `Hir` before descending into child `Hir`
+    /// nodes.
+    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+        Ok(())
+    }
+
+    /// This method is called on an `Hir` after descending all of its child
+    /// `Hir` nodes.
+    fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
+        Ok(())
+    }
+
+    /// This method is called between child nodes of an alternation.
+    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
+        Ok(())
+    }
+}
+
+/// Executes an implementation of `Visitor` in constant stack space.
+///
+/// This function will visit every node in the given `Hir` while calling
+/// appropriate methods provided by the
+/// [`Visitor`](trait.Visitor.html) trait.
+///
+/// The primary use case for this method is when one wants to perform case
+/// analysis over an `Hir` without using a stack size proportional to the depth
+/// of the `Hir`. Namely, this method will instead use constant stack space,
+/// but will use heap space proportional to the size of the `Hir`. This may be
+/// desirable in cases where the size of `Hir` is proportional to end user
+/// input.
+///
+/// If the visitor returns an error at any point, then visiting is stopped and
+/// the error is returned.
+pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
+    HeapVisitor::new().visit(hir, visitor)
+}
+
+/// HeapVisitor visits every item in an `Hir` recursively using constant stack
+/// size and a heap size proportional to the size of the `Hir`.
+struct HeapVisitor<'a> {
+    /// A stack of `Hir` nodes. This is roughly analogous to the call stack
+    /// used in a typical recursive visitor.
+    stack: Vec<(&'a Hir, Frame<'a>)>,
+}
+
+/// Represents a single stack frame while performing structural induction over
+/// an `Hir`.
+enum Frame<'a> {
+    /// A stack frame allocated just before descending into a repetition
+    /// operator's child node.
+    Repetition(&'a hir::Repetition),
+    /// A stack frame allocated just before descending into a group's child
+    /// node.
+    Group(&'a hir::Group),
+    /// The stack frame used while visiting every child node of a concatenation
+    /// of expressions.
+    Concat {
+        /// The child node we are currently visiting.
+        head: &'a Hir,
+        /// The remaining child nodes to visit (which may be empty).
+        tail: &'a [Hir],
+    },
+    /// The stack frame used while visiting every child node of an alternation
+    /// of expressions.
+    Alternation {
+        /// The child node we are currently visiting.
+        head: &'a Hir,
+        /// The remaining child nodes to visit (which may be empty).
+ tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result { + self.stack.clear(); + + visitor.start(); + loop { + try!(visitor.visit_pre(hir)); + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + try!(visitor.visit_post(hir)); + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation {..} = x { + try!(visitor.visit_alternation_in()); + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + try!(visitor.visit_post(post_hir)); + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. + fn induct(&mut self, hir: &'a Hir) -> Option> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { + head: &x[0], + tail: &x[1..], + }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { + head: &x[0], + tail: &x[1..], + }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { + head: &tail[0], + tail: &tail[1..], + }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.hir, + Frame::Group(group) => &group.hir, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} diff --git a/regex-syntax-2/src/lib.rs b/regex-syntax-2/src/lib.rs new file mode 100644 index 0000000000..9469fe23b8 --- /dev/null +++ b/regex-syntax-2/src/lib.rs @@ -0,0 +1,222 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This crate provides a robust regular expression parser. + +This crate defines two primary types: + +* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. 
+ An abstract syntax corresponds to a *structured representation* of the + concrete syntax of a regular expression, where the concrete syntax is the + pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it + can be converted back to the original concrete syntax (modulo some details, + like whitespace). To a first approximation, the abstract syntax is complex + and difficult to analyze. +* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation + ("HIR" or "high-level IR" for short) of regular expression. It corresponds to + an intermediate state of a regular expression that sits between the abstract + syntax and the low level compiled opcodes that are eventually responsible for + executing a regular expression search. Given some high-level IR, it is not + possible to produce the original concrete syntax (although it is possible to + produce an equivalent conrete syntax, but it will likely scarcely resemble + the original pattern). To a first approximation, the high-level IR is simple + and easy to analyze. + +These two types come with conversion routines: + +* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete + syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). +* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) + converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). + +As a convenience, the above two conversion routines are combined into one via +the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first +convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. + + +# Example + +This example shows how to parse a pattern string into its HIR: + +``` +use regex_syntax2::Parser; +use regex_syntax2::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +# Concrete syntax supported + +The concrete syntax is documented as part of the public API of the +[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax). + + +# Input safety + +A key feature of this library is that it is safe to use with end user facing +input. This plays a significant role in the internal implementation. In +particular: + +1. Parsers provide a `nest_limit` option that permits callers to control how + deeply nested a regular expression is allowed to be. This makes it possible + to do case analysis over an `Ast` or an `Hir` using recursion without + worrying about stack overflow. +2. Since relying on a particular stack size is brittle, this crate goes to + great lengths to ensure that all interactions with both the `Ast` and the + `Hir` do not use recursion. Namely, they use constant stack space and heap + space proportional to the size of the original pattern string (in bytes). + This includes the type's corresponding destructors. (One exception to this + is literal extraction, but this will eventually get fixed.) + + +# Error reporting + +The `Display` implementations on all `Error` types exposed in this library +provide nice human readable errors that are suitable for showing to end users +in a monospace font. + + +# Literal extraction + +This crate provides limited support for +[literal extraction from `Hir` values](hir/literal/struct.Literals.html). +Be warned that literal extraction currently uses recursion, and therefore, +stack size proportional to the size of the `Hir`. 
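+
+As a quick sketch of what literal extraction looks like in practice (this
+assumes the `Literals::prefixes` constructor and `literals` accessor from the
+`hir::literal` module; see its documentation for the full API):
+
+```
+use regex_syntax2::Parser;
+use regex_syntax2::hir::literal::Literals;
+
+let expr = Parser::new().parse("foo|far").unwrap();
+// Extract prefix literals from the HIR. Both branches start with `f`,
+// so at least one prefix literal should be found.
+let prefixes = Literals::prefixes(&expr);
+assert!(!prefixes.literals().is_empty());
+```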
+ +The purpose of literal extraction is to speed up searches. That is, if you +know a regular expression must match a prefix or suffix literal, then it is +often quicker to search for instances of that literal, and then confirm or deny +the match using the full regular expression engine. These optimizations are +done automatically in the `regex` crate. +*/ + +#![deny(missing_docs)] + +extern crate ucd_util; + +pub use error::{Error, Result}; +pub use parser::{Parser, ParserBuilder}; + +pub mod ast; +mod either; +mod error; +pub mod hir; +mod parser; +mod unicode; +mod unicode_tables; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + let mut quoted = String::with_capacity(text.len()); + escape_into(text, &mut quoted); + quoted +} + +/// Escapes all meta characters in `text` and writes the result into `buf`. +/// +/// This will append escape characters into the given buffer. The characters +/// that are appended are safe to use as a literal in a regular expression. +pub fn escape_into(text: &str, buf: &mut String) { + for c in text.chars() { + if is_meta_character(c) { + buf.push('\\'); + } + buf.push(c); + } +} + +/// Returns true if the give character has significance in a regex. +/// +/// These are the only characters that are allowed to be escaped, with one +/// exception: an ASCII space character may be escaped when extended mode (with +/// the `x` flag) is enabld. In particular, `is_meta_character(' ')` returns +/// `false`. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +pub fn is_word_character(c: char) -> bool { + use std::cmp::Ordering; + use unicode_tables::perl_word::PERL_WORD; + + if c <= 0x7F as char && is_word_byte(c as u8) { + return true; + } + PERL_WORD + .binary_search_by(|&(start, end)| { + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok() +} + +/// Returns true if and only if the given character is an ASCII word character. +/// +/// An ASCII word character is defined by the following character class: +/// `[_0-9a-zA-Z]'. +pub fn is_word_byte(c: u8) -> bool { + match c { + b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... 
b'Z' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_meta() { + assert_eq!( + escape(r"\.+*?()|[]{}^$#&-~"), + r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string(), + ); + } + + #[test] + fn word() { + assert!(is_word_byte(b'a')); + assert!(!is_word_byte(b'-')); + + assert!(is_word_character('a')); + assert!(is_word_character('β')); + assert!(!is_word_character('-')); + assert!(!is_word_character('☃')); + } +} diff --git a/regex-syntax-2/src/parser.rs b/regex-syntax-2/src/parser.rs new file mode 100644 index 0000000000..2afd2fe234 --- /dev/null +++ b/regex-syntax-2/src/parser.rs @@ -0,0 +1,201 @@ +use ast; +use hir; + +use Result; + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +/// +/// This type combines the builder options for both the +/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) +/// and the +/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +#[derive(Clone, Debug, Default)] +pub struct ParserBuilder { + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder::default() + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { + ast: self.ast.build(), + hir: self.hir.build(), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// lenth of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.ast.nest_limit(limit); + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. 
That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.octal(yes); + self + } + + /// When enabled, the parser will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the parser is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// parser will return an error). + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.allow_invalid_utf8(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.ignore_whitespace(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.case_insensitive(yes); + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.multi_line(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut ParserBuilder { + self.hir.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.unicode(yes); + self + } +} + +/// A convenience parser for regular expressions. +/// +/// This parser takes as input a regular expression pattern string (the +/// "concrete syntax") and returns a high-level intermediate representation +/// (the HIR) suitable for most types of analysis. 
In particular, this parser +/// hides the intermediate state of producing an AST (the "abstract syntax"). +/// The AST is itself far more complex than the HIR, so this parser serves as a +/// convenience for never having to deal with it at all. +/// +/// If callers have more fine grained use cases that need an AST, then please +/// see the [`ast::parse`](ast/parse/index.html) module. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + ast: ast::parse::Parser, + hir: hir::translate::Translator, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with `parse` method. The parse method returns + /// a high level intermediate representation of the given regular + /// expression. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into a high level intermediate + /// representation. + pub fn parse(&mut self, pattern: &str) -> Result { + let ast = try!(self.ast.parse(pattern)); + let hir = try!(self.hir.translate(pattern, &ast)); + Ok(hir) + } +} diff --git a/regex-syntax-2/src/unicode.rs b/regex-syntax-2/src/unicode.rs new file mode 100644 index 0000000000..e76b203fad --- /dev/null +++ b/regex-syntax-2/src/unicode.rs @@ -0,0 +1,462 @@ +use std::cmp::Ordering; +use std::result; + +use ucd_util::{self, PropertyValues}; + +use hir; +use unicode_tables::age; +use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; +use unicode_tables::general_category; +use unicode_tables::property_bool; +use unicode_tables::property_names::PROPERTY_NAMES; +use unicode_tables::property_values::PROPERTY_VALUES; +use unicode_tables::script; +use unicode_tables::script_extension; + +type Result = result::Result; + +/// An error that occurs when dealing with Unicode. +/// +/// We don't impl the Error trait here because these always get converted +/// into other public errors. (This error type isn't exported.) +#[derive(Debug)] +pub enum Error { + PropertyNotFound, + PropertyValueNotFound, +} + +/// Encode the given Unicode character to `dst` as a single UTF-8 sequence. +/// +/// If `dst` is not long enough, then `None` is returned. Otherwise, the number +/// of bytes written is returned. +pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option { + // TODO: Remove this function once we move to at least Rust 1.15, which + // provides char::encode_utf8 for us. + const TAG_CONT: u8 = 0b1000_0000; + const TAG_TWO: u8 = 0b1100_0000; + const TAG_THREE: u8 = 0b1110_0000; + const TAG_FOUR: u8 = 0b1111_0000; + + let code = character as u32; + if code <= 0x7F && !dst.is_empty() { + dst[0] = code as u8; + Some(1) + } else if code <= 0x7FF && dst.len() >= 2 { + dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO; + dst[1] = (code & 0x3F) as u8 | TAG_CONT; + Some(2) + } else if code <= 0xFFFF && dst.len() >= 3 { + dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE; + dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code & 0x3F) as u8 | TAG_CONT; + Some(3) + } else if dst.len() >= 4 { + dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR; + dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[3] = (code & 0x3F) as u8 | TAG_CONT; + Some(4) + } else { + None + } +} + +/// An iterator over a codepoint's simple case equivalence class. 
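+///
+/// As a small sketch (mirroring the unit tests at the bottom of this file),
+/// iterating the class for `'k'` yields its uppercase form and the Kelvin
+/// sign:
+///
+/// ```ignore
+/// let folded: Vec<char> = simple_fold('k').unwrap().collect();
+/// assert_eq!(folded, vec!['K', '\u{212A}']);
+/// ```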
+#[derive(Debug)] +pub struct SimpleFoldIter(::std::slice::Iter<'static, char>); + +impl Iterator for SimpleFoldIter { + type Item = char; + + fn next(&mut self) -> Option { + self.0.next().map(|c| *c) + } +} + +/// Return an iterator over the equivalence class of simple case mappings +/// for the given codepoint. The equivalence class does not include the +/// given codepoint. +/// +/// If the equivalence class is empty, then this returns the next scalar +/// value that has a non-empty equivalence class, if it exists. If no such +/// scalar value exists, then `None` is returned. The point of this behavior +/// is to permit callers to avoid calling `simple_fold` more than they need +/// to, since there is some cost to fetching the equivalence class. +pub fn simple_fold(c: char) -> result::Result> { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter())) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + }) +} + +/// Returns true if and only if the given (inclusive) range contains at least +/// one Unicode scalar value that has a non-empty non-trivial simple case +/// mapping. +/// +/// This function panics if `end < start`. +pub fn contains_simple_case_mapping(start: char, end: char) -> bool { + assert!(start <= end); + CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok() +} + +/// A query for finding a character class defined by Unicode. This supports +/// either use of a property name directly, or lookup by property value. The +/// former generally refers to Binary properties (see UTS#44, Table 8), but +/// as a special exception (see UTS#18, Section 1.2) both general categories +/// (an enumeration) and scripts (a catalog) are supported as if each of their +/// possible values were a binary property. +/// +/// In all circumstances, property names and values are normalized and +/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. +/// +/// The lifetime `'a` refers to the shorter of the lifetimes of property name +/// and property value. +#[derive(Debug)] +pub enum ClassQuery<'a> { + /// Return a class corresponding to a Unicode binary property, named by + /// a single letter. + OneLetter(char), + /// Return a class corresponding to a Unicode binary property. + /// + /// Note that, by special exception (see UTS#18, Section 1.2), both + /// general category values and script values are permitted here as if + /// they were a binary property. + Binary(&'a str), + /// Return a class corresponding to all codepoints whose property + /// (identified by `property_name`) corresponds to the given value + /// (identified by `property_value`). + ByValue { + /// A property name. + property_name: &'a str, + /// A property value. 
+ property_value: &'a str, + }, +} + +impl<'a> ClassQuery<'a> { + fn canonicalize(&self) -> Result { + match *self { + ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), + ClassQuery::Binary(name) => self.canonical_binary(name), + ClassQuery::ByValue { property_name, property_value } => { + let property_name = normalize(property_name); + let property_value = normalize(property_value); + + let canon_name = match canonical_prop(&property_name) { + None => return Err(Error::PropertyNotFound), + Some(canon_name) => canon_name, + }; + Ok(match canon_name { + "General_Category" => { + let canon = match canonical_gencat(&property_value) { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::GeneralCategory(canon) + } + "Script" => { + let canon = match canonical_script(&property_value) { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::Script(canon) + } + _ => { + let vals = match property_values(canon_name) { + None => return Err(Error::PropertyValueNotFound), + Some(vals) => vals, + }; + let canon_val = match canonical_value( + vals, + &property_value, + ) { + None => return Err(Error::PropertyValueNotFound), + Some(canon_val) => canon_val, + }; + CanonicalClassQuery::ByValue { + property_name: canon_name, + property_value: canon_val, + } + } + }) + } + } + } + + fn canonical_binary(&self, name: &str) -> Result { + let norm = normalize(name); + + if let Some(canon) = canonical_prop(&norm) { + return Ok(CanonicalClassQuery::Binary(canon)); + } + if let Some(canon) = canonical_gencat(&norm) { + return Ok(CanonicalClassQuery::GeneralCategory(canon)); + } + if let Some(canon) = canonical_script(&norm) { + return Ok(CanonicalClassQuery::Script(canon)); + } + Err(Error::PropertyNotFound) + } +} + +/// Like ClassQuery, but its parameters have been canonicalized. This also +/// differentiates binary properties from flattened general categories and +/// scripts. +#[derive(Debug)] +enum CanonicalClassQuery { + /// The canonical binary property name. + Binary(&'static str), + /// The canonical general category name. + GeneralCategory(&'static str), + /// The canonical script name. + Script(&'static str), + /// An arbitrary association between property and value, both of which + /// have been canonicalized. + /// + /// Note that by construction, the property name of ByValue will never + /// be General_Category or Script. Those two cases are subsumed by the + /// eponymous variants. + ByValue { + /// The canonical property name. + property_name: &'static str, + /// The canonical property value. + property_value: &'static str, + }, +} + +/// Looks up a Unicode class given a query. If one doesn't exist, then +/// `None` is returned. 
+pub fn class<'a>(query: ClassQuery<'a>) -> Result { + use self::CanonicalClassQuery::*; + + match try!(query.canonicalize()) { + Binary(name) => { + property_set(property_bool::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + GeneralCategory("Any") => { + Ok(hir_class(&[('\0', '\u{10FFFF}')])) + } + GeneralCategory("Assigned") => { + let mut cls = + try!(property_set(general_category::BY_NAME, "Unassigned") + .map(hir_class) + .ok_or(Error::PropertyNotFound)); + cls.negate(); + Ok(cls) + } + GeneralCategory("ASCII") => { + Ok(hir_class(&[('\0', '\x7F')])) + } + GeneralCategory(name) => { + property_set(general_category::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + Script(name) => { + property_set(script::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + ByValue { property_name: "Age", property_value } => { + let mut class = hir::ClassUnicode::empty(); + for set in try!(ages(property_value)) { + class.union(&hir_class(set)); + } + Ok(class) + } + ByValue { property_name: "Script_Extensions", property_value } => { + property_set(script_extension::BY_NAME, property_value) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + _ => { + // What else should we support? + Err(Error::PropertyNotFound) + } + } +} + +/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. +pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { + let hir_ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::ClassUnicode::new(hir_ranges) +} + +fn canonical_prop(normalized_name: &str) -> Option<&'static str> { + ucd_util::canonical_property_name(PROPERTY_NAMES, normalized_name) +} + +fn canonical_gencat(normalized_value: &str) -> Option<&'static str> { + match normalized_value { + "any" => Some("Any"), + "assigned" => Some("Assigned"), + "ascii" => Some("ASCII"), + _ => { + let gencats = property_values("General_Category").unwrap(); + canonical_value(gencats, normalized_value) + } + } +} + +fn canonical_script(normalized_value: &str) -> Option<&'static str> { + let scripts = property_values("Script").unwrap(); + canonical_value(scripts, normalized_value) +} + +fn canonical_value( + vals: PropertyValues, + normalized_value: &str, +) -> Option<&'static str> { + ucd_util::canonical_property_value(vals, normalized_value) +} + +fn normalize(x: &str) -> String { + let mut x = x.to_string(); + ucd_util::symbolic_name_normalize(&mut x); + x +} + +fn property_values( + canonical_property_name: &'static str, +) -> Option +{ + ucd_util::property_values(PROPERTY_VALUES, canonical_property_name) +} + +fn property_set( + name_map: &'static [(&'static str, &'static [(char, char)])], + canonical: &'static str, +) -> Option<&'static [(char, char)]> { + name_map + .binary_search_by_key(&canonical, |x| x.0) + .ok() + .map(|i| name_map[i].1) +} + +/// An iterator over Unicode Age sets. Each item corresponds to a set of +/// codepoints that were added in a particular revision of Unicode. The +/// iterator yields items in chronological order. 
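+///
+/// For example, `ages("V2_0")` yields the `V1_1` ranges followed by the
+/// `V2_0` ranges, i.e. every codepoint that was assigned as of Unicode 2.0.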
+#[derive(Debug)]
+struct AgeIter {
+    ages: &'static [(&'static str, &'static [(char, char)])],
+}
+
+fn ages(canonical_age: &str) -> Result<AgeIter, Error> {
+    const AGES: &'static [(&'static str, &'static [(char, char)])] = &[
+        ("V1_1", age::V1_1),
+        ("V2_0", age::V2_0),
+        ("V2_1", age::V2_1),
+        ("V3_0", age::V3_0),
+        ("V3_1", age::V3_1),
+        ("V3_2", age::V3_2),
+        ("V4_0", age::V4_0),
+        ("V4_1", age::V4_1),
+        ("V5_0", age::V5_0),
+        ("V5_1", age::V5_1),
+        ("V5_2", age::V5_2),
+        ("V6_0", age::V6_0),
+        ("V6_1", age::V6_1),
+        ("V6_2", age::V6_2),
+        ("V6_3", age::V6_3),
+        ("V7_0", age::V7_0),
+        ("V8_0", age::V8_0),
+        ("V9_0", age::V9_0),
+        ("V10_0", age::V10_0),
+    ];
+    assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
+
+    let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
+    match pos {
+        None => Err(Error::PropertyValueNotFound),
+        Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }),
+    }
+}
+
+impl Iterator for AgeIter {
+    type Item = &'static [(char, char)];
+
+    fn next(&mut self) -> Option<&'static [(char, char)]> {
+        if self.ages.is_empty() {
+            None
+        } else {
+            let set = self.ages[0];
+            self.ages = &self.ages[1..];
+            Some(set.1)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{contains_simple_case_mapping, simple_fold};
+
+    #[test]
+    fn simple_fold_k() {
+        let xs: Vec<char> = simple_fold('k').unwrap().collect();
+        assert_eq!(xs, vec!['K', 'K']);
+
+        let xs: Vec<char> = simple_fold('K').unwrap().collect();
+        assert_eq!(xs, vec!['k', 'K']);
+
+        let xs: Vec<char> = simple_fold('K').unwrap().collect();
+        assert_eq!(xs, vec!['K', 'k']);
+    }
+
+    #[test]
+    fn simple_fold_a() {
+        let xs: Vec<char> = simple_fold('a').unwrap().collect();
+        assert_eq!(xs, vec!['A']);
+
+        let xs: Vec<char> = simple_fold('A').unwrap().collect();
+        assert_eq!(xs, vec!['a']);
+    }
+
+    #[test]
+    fn simple_fold_empty() {
+        assert_eq!(Some('A'), simple_fold('?').unwrap_err());
+        assert_eq!(Some('A'), simple_fold('@').unwrap_err());
+        assert_eq!(Some('a'), simple_fold('[').unwrap_err());
+        assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
+    }
+
+    #[test]
+    fn simple_fold_max() {
+        assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
+        assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
+    }
+
+    #[test]
+    fn range_contains() {
+        assert!(contains_simple_case_mapping('A', 'A'));
+        assert!(contains_simple_case_mapping('Z', 'Z'));
+        assert!(contains_simple_case_mapping('A', 'Z'));
+        assert!(contains_simple_case_mapping('@', 'A'));
+        assert!(contains_simple_case_mapping('Z', '['));
+        assert!(contains_simple_case_mapping('☃', 'Ⰰ'));
+
+        assert!(!contains_simple_case_mapping('[', '['));
+        assert!(!contains_simple_case_mapping('[', '`'));
+
+        assert!(!contains_simple_case_mapping('☃', '☃'));
+    }
+}
diff --git a/regex-syntax-2/src/unicode_tables/age.rs b/regex-syntax-2/src/unicode_tables/age.rs
new file mode 100644
index 0000000000..afba3d3ff4
--- /dev/null
+++ b/regex-syntax-2/src/unicode_tables/age.rs
@@ -0,0 +1,424 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate age tmp/ucd-10.0.0/ --chars
+//
+// ucd-generate is available on crates.io.
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V10_0", V10_0), ("V1_1", V1_1), ("V2_0", V2_0), ("V2_1", V2_1), + ("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2), ("V4_0", V4_0), + ("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1), ("V5_2", V5_2), + ("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2), ("V6_3", V6_3), + ("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0), +]; + +pub const V10_0: &'static [(char, char)] = &[ + ('ࡠ', 'ࡪ'), ('ৼ', '৽'), ('ૺ', '૿'), ('ഀ', 'ഀ'), + ('഻', '഼'), ('᳷', '᳷'), ('᷶', '᷹'), ('₿', '₿'), + ('⏿', '⏿'), ('⯒', '⯒'), ('⹅', '⹉'), ('ㄮ', 'ㄮ'), + ('鿖', '鿪'), ('𐌭', '𐌯'), ('𑨀', '𑩇'), ('𑩐', '𑪃'), + ('𑪆', '𑪜'), ('𑪞', '𑪢'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵇'), + ('𑵐', '𑵙'), ('𖿡', '𖿡'), ('𛀂', '𛄞'), ('𛅰', '𛋻'), + ('🉠', '🉥'), ('🛓', '🛔'), ('🛷', '🛸'), ('🤀', '🤋'), + ('🤟', '🤟'), ('🤨', '🤯'), ('🤱', '🤲'), ('🥌', '🥌'), + ('🥟', '🥫'), ('🦒', '🦗'), ('🧐', '🧦'), ('𬺰', '𮯠'), +]; + +pub const V1_1: &'static [(char, char)] = &[ + ('\u{0}', 'ǵ'), ('Ǻ', 'ȗ'), ('ɐ', 'ʨ'), ('ʰ', '˞'), ('ˠ', '˩'), + ('̀', 'ͅ'), ('͠', '͡'), ('ʹ', '͵'), ('ͺ', 'ͺ'), (';', ';'), + ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ώ'), ('ϐ', 'ϖ'), + ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'ϳ'), + ('Ё', 'Ќ'), ('Ў', 'я'), ('ё', 'ќ'), ('ў', '҆'), ('Ґ', 'ӄ'), + ('Ӈ', 'ӈ'), ('Ӌ', 'ӌ'), ('Ӑ', 'ӫ'), ('Ӯ', 'ӵ'), ('Ӹ', 'ӹ'), + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '։'), ('ְ', 'ֹ'), + ('ֻ', '׃'), ('א', 'ת'), ('װ', '״'), ('،', '،'), ('؛', '؛'), + ('؟', '؟'), ('ء', 'غ'), ('ـ', 'ْ'), ('٠', '٭'), ('ٰ', 'ڷ'), + ('ں', 'ھ'), ('ۀ', 'ێ'), ('ې', 'ۭ'), ('۰', '۹'), ('ँ', 'ः'), + ('अ', 'ह'), ('़', '्'), ('ॐ', '॔'), ('क़', '॰'), + ('ঁ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', '়'), + ('া', 'ৄ'), ('ে', 'ৈ'), ('ো', '্'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', '৺'), ('ਂ', 'ਂ'), + ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), + ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), ('੦', 'ੴ'), ('ઁ', 'ઃ'), ('અ', 'ઋ'), + ('ઍ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૠ'), ('૦', '૯'), + ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଶ', 'ହ'), ('଼', 'ୃ'), + ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), ('୦', '୰'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'வ'), + ('ஷ', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), ('ொ', '்'), + ('ௗ', 'ௗ'), ('௧', '௲'), ('ఁ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'ళ'), ('వ', 'హ'), + ('ా', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), + ('ౠ', 'ౡ'), ('౦', '౯'), ('ಂ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಾ', 'ೄ'), ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ം', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ന'), ('പ', 'ഹ'), + ('ാ', 'ൃ'), ('െ', 'ൈ'), ('ൊ', '്'), ('ൗ', 'ൗ'), + ('ൠ', 'ൡ'), ('൦', '൯'), ('ก', 'ฺ'), ('฿', '๛'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໝ'), ('Ⴀ', 'Ⴥ'), ('ა', 'ჶ'), + ('჻', '჻'), ('ᄀ', 'ᅙ'), ('ᅟ', 'ᆢ'), ('ᆨ', 'ᇹ'), + ('Ḁ', 'ẚ'), ('Ạ', 'ỹ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), ('ῆ', 
'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), + ('ῲ', 'ῴ'), ('ῶ', '῾'), ('\u{2000}', '\u{202e}'), ('‰', '⁆'), + ('\u{206a}', '⁰'), ('⁴', '₎'), ('₠', '₪'), ('⃐', '⃡'), + ('℀', 'ℸ'), ('⅓', 'ↂ'), ('←', '⇪'), ('∀', '⋱'), + ('⌀', '⌀'), ('⌂', '⍺'), ('␀', '␤'), ('⑀', '⑊'), + ('①', '⓪'), ('─', '▕'), ('■', '◯'), ('☀', '☓'), + ('☚', '♯'), ('✁', '✄'), ('✆', '✉'), ('✌', '✧'), + ('✩', '❋'), ('❍', '❍'), ('❏', '❒'), ('❖', '❖'), + ('❘', '❞'), ('❡', '❧'), ('❶', '➔'), ('➘', '➯'), + ('➱', '➾'), ('\u{3000}', '〷'), ('〿', '〿'), ('ぁ', 'ゔ'), + ('゙', 'ゞ'), ('ァ', 'ヾ'), ('ㄅ', 'ㄬ'), ('ㄱ', 'ㆎ'), + ('㆐', '㆟'), ('㈀', '㈜'), ('㈠', '㉃'), ('㉠', '㉻'), + ('㉿', '㊰'), ('㋀', '㋋'), ('㋐', '㋾'), ('㌀', '㍶'), + ('㍻', '㏝'), ('㏠', '㏾'), ('一', '龥'), ('\u{e000}', '鶴'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('ﬞ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), + ('ﯓ', '﴿'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), + ('︠', '︣'), ('︰', '﹄'), ('﹉', '﹒'), ('﹔', '﹦'), + ('﹨', '﹫'), ('ﹰ', 'ﹲ'), ('ﹴ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('\u{feff}', '\u{feff}'), ('!', '~'), ('。', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), + ('│', '○'), ('�', '\u{ffff}'), +]; + +pub const V2_0: &'static [(char, char)] = &[ + ('֑', '֡'), ('֣', '֯'), ('ׄ', 'ׄ'), ('ༀ', 'ཇ'), ('ཉ', 'ཀྵ'), + ('ཱ', 'ྋ'), ('ྐ', 'ྕ'), ('ྗ', 'ྗ'), ('ྙ', 'ྭ'), + ('ྱ', 'ྷ'), ('ྐྵ', 'ྐྵ'), ('ẛ', 'ẛ'), ('₫', '₫'), + ('가', '힣'), ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{10ffff}'), +]; + +pub const V2_1: &'static [(char, char)] = &[ + ('€', '€'), ('', ''), +]; + +pub const V3_0: &'static [(char, char)] = &[ + ('Ƕ', 'ǹ'), ('Ș', 'ȟ'), ('Ȣ', 'ȳ'), ('ʩ', 'ʭ'), ('˟', '˟'), + ('˪', 'ˮ'), ('͆', '͎'), ('͢', '͢'), ('ϗ', 'ϗ'), ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('Ѐ', 'Ѐ'), ('Ѝ', 'Ѝ'), + ('ѐ', 'ѐ'), ('ѝ', 'ѝ'), ('҈', '҉'), ('Ҍ', 'ҏ'), ('Ӭ', 'ӭ'), + ('֊', '֊'), ('ٓ', 'ٕ'), ('ڸ', 'ڹ'), ('ڿ', 'ڿ'), ('ۏ', 'ۏ'), + ('ۺ', '۾'), ('܀', '܍'), ('\u{70f}', 'ܬ'), ('ܰ', '݊'), ('ހ', 'ް'), + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('ෲ', '෴'), ('ཪ', 'ཪ'), + ('ྖ', 'ྖ'), ('ྮ', 'ྰ'), ('ྸ', 'ྸ'), ('ྺ', 'ྼ'), + ('྾', '࿌'), ('࿏', '࿏'), ('က', 'အ'), ('ဣ', 'ဧ'), + ('ဩ', 'ဪ'), ('ာ', 'ဲ'), ('ံ', '္'), ('၀', 'ၙ'), + ('ሀ', 'ሆ'), ('ለ', 'ቆ'), ('ቈ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኆ'), + ('ኈ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኮ'), ('ኰ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዎ'), ('ዐ', 'ዖ'), ('ዘ', 'ዮ'), ('ደ', 'ጎ'), + ('ጐ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ጞ'), ('ጠ', 'ፆ'), + ('ፈ', 'ፚ'), ('፡', '፼'), ('Ꭰ', 'Ᏼ'), ('ᐁ', 'ᙶ'), + ('\u{1680}', '᚜'), ('ᚠ', 'ᛰ'), ('ក', 'ៜ'), ('០', '៩'), + ('᠀', '\u{180e}'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢩ'), + ('\u{202f}', '\u{202f}'), ('⁈', '⁍'), ('₭', '₯'), ('⃢', '⃣'), + ('ℹ', '℺'), ('Ↄ', 'Ↄ'), ('⇫', '⇳'), ('⌁', '⌁'), + ('⍻', '⍻'), ('⍽', '⎚'), ('␥', '␦'), ('◰', '◷'), + ('☙', '☙'), ('♰', '♱'), ('⠀', '⣿'), ('⺀', '⺙'), + ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('〸', '〺'), + ('〾', '〾'), ('ㆠ', 'ㆷ'), ('㐀', '䶵'), ('ꀀ', 'ꒌ'), + ('꒐', '꒡'), ('꒤', '꒳'), ('꒵', '꓀'), ('꓂', '꓄'), + ('꓆', '꓆'), ('יִ', 'יִ'), ('\u{fff9}', '\u{fffb}'), +]; + +pub const V3_1: &'static [(char, char)] = &[ + ('ϴ', 'ϵ'), ('\u{fdd0}', '\u{fdef}'), ('𐌀', '𐌞'), 
('𐌠', '𐌣'), + ('𐌰', '𐍊'), ('𐐀', '𐐥'), ('𐐨', '𐑍'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄪', '𝇝'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓀'), ('𝓂', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚣'), ('𝚨', '𝟉'), ('𝟎', '𝟿'), + ('𠀀', '𪛖'), ('丽', '𪘀'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const V3_2: &'static [(char, char)] = &[ + ('Ƞ', 'Ƞ'), ('͏', '͏'), ('ͣ', 'ͯ'), ('Ϙ', 'ϙ'), ('϶', '϶'), + ('Ҋ', 'ҋ'), ('Ӆ', 'ӆ'), ('Ӊ', 'ӊ'), ('Ӎ', 'ӎ'), ('Ԁ', 'ԏ'), + ('ٮ', 'ٯ'), ('ޱ', 'ޱ'), ('ჷ', 'ჸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜶'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('⁇', '⁇'), ('⁎', '⁒'), ('⁗', '⁗'), + ('\u{205f}', '\u{2063}'), ('ⁱ', 'ⁱ'), ('₰', '₱'), ('⃤', '⃪'), + ('ℽ', '⅋'), ('⇴', '⇿'), ('⋲', '⋿'), ('⍼', '⍼'), + ('⎛', '⏎'), ('⓫', '⓾'), ('▖', '▟'), ('◸', '◿'), + ('☖', '☗'), ('♲', '♽'), ('⚀', '⚉'), ('❨', '❵'), + ('⟐', '⟫'), ('⟰', '⟿'), ('⤀', '⫿'), ('〻', '〽'), + ('ゕ', 'ゖ'), ('ゟ', '゠'), ('ヿ', 'ヿ'), ('ㇰ', 'ㇿ'), + ('㉑', '㉟'), ('㊱', '㊿'), ('꒢', '꒣'), ('꒴', '꒴'), + ('꓁', '꓁'), ('꓅', '꓅'), ('侮', '頻'), ('﷼', '﷼'), + ('︀', '️'), ('﹅', '﹆'), ('ﹳ', 'ﹳ'), ('⦅', '⦆'), +]; + +pub const V4_0: &'static [(char, char)] = &[ + ('ȡ', 'ȡ'), ('ȴ', 'ȶ'), ('ʮ', 'ʯ'), ('˯', '˿'), ('͐', '͗'), + ('͝', '͟'), ('Ϸ', 'ϻ'), ('\u{600}', '\u{603}'), ('؍', 'ؕ'), + ('ٖ', '٘'), ('ۮ', 'ۯ'), ('ۿ', 'ۿ'), ('ܭ', 'ܯ'), ('ݍ', 'ݏ'), + ('ऄ', 'ऄ'), ('ঽ', 'ঽ'), ('ਁ', 'ਁ'), ('ਃ', 'ਃ'), + ('ઌ', 'ઌ'), ('ૡ', 'ૣ'), ('૱', '૱'), ('ଵ', 'ଵ'), + ('ୱ', 'ୱ'), ('௳', '௺'), ('಼', 'ಽ'), ('៝', '៝'), + ('៰', '៹'), ('ᤀ', 'ᤜ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), + ('᥀', '᥀'), ('᥄', 'ᥭ'), ('ᥰ', 'ᥴ'), ('᧠', '᧿'), + ('ᴀ', 'ᵫ'), ('⁓', '⁔'), ('℻', '℻'), ('⏏', '⏐'), + ('⓿', '⓿'), ('☔', '☕'), ('⚊', '⚑'), ('⚠', '⚡'), + ('⬀', '⬍'), ('㈝', '㈞'), ('㉐', '㉐'), ('㉼', '㉽'), + ('㋌', '㋏'), ('㍷', '㍺'), ('㏞', '㏟'), ('㏿', '㏿'), + ('䷀', '䷿'), ('﷽', '﷽'), ('﹇', '﹈'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), + ('𐄷', '𐄿'), ('𐎀', '𐎝'), ('𐎟', '𐎟'), ('𐐦', '𐐧'), + ('𐑎', '𐒝'), ('𐒠', '𐒩'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿'), + ('𝌀', '𝍖'), ('𝓁', '𝓁'), ('󠄀', '󠇯'), +]; + +pub const V4_1: &'static [(char, char)] = &[ + ('ȷ', 'Ɂ'), ('͘', '͜'), ('ϼ', 'Ͽ'), ('Ӷ', 'ӷ'), ('֢', '֢'), + ('ׅ', 'ׇ'), ('؋', '؋'), ('؞', '؞'), ('ٙ', 'ٞ'), ('ݐ', 'ݭ'), + ('ॽ', 'ॽ'), ('ৎ', 'ৎ'), ('ஶ', 'ஶ'), ('௦', '௦'), + ('࿐', '࿑'), ('ჹ', 'ჺ'), ('ჼ', 'ჼ'), ('ሇ', 'ሇ'), + ('ቇ', 'ቇ'), ('ኇ', 'ኇ'), ('ኯ', 'ኯ'), ('ዏ', 'ዏ'), + ('ዯ', 'ዯ'), ('ጏ', 'ጏ'), ('ጟ', 'ጟ'), ('ፇ', 'ፇ'), + ('፟', '፠'), ('ᎀ', '᎙'), ('ᦀ', 'ᦩ'), ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), ('᧞', '᧟'), ('ᨀ', 'ᨛ'), ('᨞', '᨟'), + ('ᵬ', '᷃'), ('⁕', '⁖'), ('⁘', '⁞'), ('ₐ', 'ₔ'), + ('₲', '₵'), ('⃫', '⃫'), ('ℼ', 'ℼ'), ('⅌', '⅌'), + ('⏑', '⏛'), ('☘', '☘'), ('♾', '♿'), ('⚒', '⚜'), + ('⚢', '⚱'), ('⟀', '⟆'), ('⬎', '⬓'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⲁ', '⳪'), ('⳹', 'ⴥ'), ('ⴰ', 'ⵥ'), + ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⸗'), ('⸜', '⸝'), + ('㇀', '㇏'), ('㉾', '㉾'), ('龦', '龻'), ('꜀', '꜖'), + ('ꠀ', '꠫'), ('並', '龎'), ('︐', '︙'), ('𐅀', '𐆊'), + ('𐎠', '𐏃'), ('𐏈', '𐏕'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), + ('𐨿', '𐩇'), ('𐩐', '𐩘'), ('𝈀', '𝉅'), ('𝚤', '𝚥'), +]; + +pub const V5_0: &'static [(char, char)] = &[ + ('ɂ', 'ɏ'), ('ͻ', 'ͽ'), ('ӏ', 'ӏ'), ('Ӻ', 'ӿ'), ('Ԑ', 'ԓ'), + ('ֺ', 'ֺ'), ('߀', 'ߺ'), ('ॻ', 'ॼ'), 
('ॾ', 'ॿ'), ('ೢ', 'ೣ'), + ('ೱ', 'ೲ'), ('ᬀ', 'ᭋ'), ('᭐', '᭼'), ('᷄', '᷊'), + ('᷾', '᷿'), ('⃬', '⃯'), ('⅍', 'ⅎ'), ('ↄ', 'ↄ'), + ('⏜', '⏧'), ('⚲', '⚲'), ('⟇', '⟊'), ('⬔', '⬚'), + ('⬠', '⬣'), ('Ⱡ', 'ⱬ'), ('ⱴ', 'ⱷ'), ('ꜗ', 'ꜚ'), + ('꜠', '꜡'), ('ꡀ', '꡷'), ('𐤀', '𐤙'), ('𐤟', '𐤟'), + ('𒀀', '𒍮'), ('𒐀', '𒑢'), ('𒑰', '𒑳'), ('𝍠', '𝍱'), + ('𝟊', '𝟋'), +]; + +pub const V5_1: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϗ', 'Ϗ'), ('҇', '҇'), ('Ԕ', 'ԣ'), + ('؆', '؊'), ('ؖ', 'ؚ'), ('ػ', 'ؿ'), ('ݮ', 'ݿ'), ('ॱ', 'ॲ'), + ('ੑ', 'ੑ'), ('ੵ', 'ੵ'), ('ୄ', 'ୄ'), ('ୢ', 'ୣ'), + ('ௐ', 'ௐ'), ('ఽ', 'ఽ'), ('ౘ', 'ౙ'), ('ౢ', 'ౣ'), + ('౸', '౿'), ('ഽ', 'ഽ'), ('ൄ', 'ൄ'), ('ൢ', 'ൣ'), + ('൰', '൵'), ('൹', 'ൿ'), ('ཫ', 'ཬ'), ('࿎', '࿎'), + ('࿒', '࿔'), ('ဢ', 'ဢ'), ('ဨ', 'ဨ'), ('ါ', 'ါ'), + ('ဳ', 'ဵ'), ('်', 'ဿ'), ('ၚ', '႙'), ('႞', '႟'), + ('ᢪ', 'ᢪ'), ('ᮀ', '᮪'), ('ᮮ', '᮹'), ('ᰀ', '᰷'), + ('᰻', '᱉'), ('ᱍ', '᱿'), ('᷋', 'ᷦ'), ('ẜ', 'ẟ'), + ('Ỻ', 'ỿ'), ('\u{2064}', '\u{2064}'), ('⃰', '⃰'), ('⅏', '⅏'), + ('ↅ', 'ↈ'), ('⚝', '⚝'), ('⚳', '⚼'), ('⛀', '⛃'), + ('⟌', '⟌'), ('⟬', '⟯'), ('⬛', '⬟'), ('⬤', '⭌'), + ('⭐', '⭔'), ('Ɑ', 'Ɐ'), ('ⱱ', 'ⱳ'), ('ⱸ', 'ⱽ'), + ('ⷠ', 'ⷿ'), ('⸘', '⸛'), ('⸞', '⸰'), ('ㄭ', 'ㄭ'), + ('㇐', '㇣'), ('龼', '鿃'), ('ꔀ', 'ꘫ'), ('Ꙁ', 'ꙟ'), + ('Ꙣ', '꙳'), ('꙼', 'ꚗ'), ('ꜛ', 'ꜟ'), ('Ꜣ', 'ꞌ'), + ('ꟻ', 'ꟿ'), ('ꢀ', '꣄'), ('꣎', '꣙'), ('꤀', '꥓'), + ('꥟', '꥟'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), + ('꩜', '꩟'), ('︤', '︦'), ('𐆐', '𐆛'), ('𐇐', '𐇽'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐤠', '𐤹'), ('𐤿', '𐤿'), + ('𝄩', '𝄩'), ('🀀', '🀫'), ('🀰', '🂓'), +]; + +pub const V5_2: &'static [(char, char)] = &[ + ('Ԥ', 'ԥ'), ('ࠀ', '࠭'), ('࠰', '࠾'), ('ऀ', 'ऀ'), + ('ॎ', 'ॎ'), ('ॕ', 'ॕ'), ('ॹ', 'ॺ'), ('৻', '৻'), + ('࿕', '࿘'), ('ႚ', 'ႝ'), ('ᅚ', 'ᅞ'), ('ᆣ', 'ᆧ'), + ('ᇺ', 'ᇿ'), ('᐀', '᐀'), ('ᙷ', 'ᙿ'), ('ᢰ', 'ᣵ'), + ('ᦪ', 'ᦫ'), ('᧚', '᧚'), ('ᨠ', 'ᩞ'), ('᩠', '᩼'), + ('᩿', '᪉'), ('᪐', '᪙'), ('᪠', '᪭'), ('᳐', 'ᳲ'), + ('᷽', '᷽'), ('₶', '₸'), ('⅐', '⅒'), ('↉', '↉'), + ('⏨', '⏨'), ('⚞', '⚟'), ('⚽', '⚿'), ('⛄', '⛍'), + ('⛏', '⛡'), ('⛣', '⛣'), ('⛨', '⛿'), ('❗', '❗'), + ('⭕', '⭙'), ('Ɒ', 'Ɒ'), ('Ȿ', 'Ɀ'), ('Ⳬ', '⳱'), + ('⸱', '⸱'), ('㉄', '㉏'), ('鿄', '鿋'), ('ꓐ', '꓿'), + ('ꚠ', '꛷'), ('꠰', '꠹'), ('꣠', 'ꣻ'), ('ꥠ', 'ꥼ'), + ('ꦀ', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟'), ('ꩠ', 'ꩻ'), + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), ('ꯀ', '꯭'), ('꯰', '꯹'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('恵', '舘'), ('𐡀', '𐡕'), + ('𐡗', '𐡟'), ('𐤚', '𐤛'), ('𐩠', '𐩿'), ('𐬀', '𐬵'), + ('𐬹', '𐭕'), ('𐭘', '𐭲'), ('𐭸', '𐭿'), ('𐰀', '𐱈'), + ('𐹠', '𐹾'), ('𑂀', '𑃁'), ('𓀀', '𓐮'), ('🄀', '🄊'), + ('🄐', '🄮'), ('🄱', '🄱'), ('🄽', '🄽'), ('🄿', '🄿'), + ('🅂', '🅂'), ('🅆', '🅆'), ('🅊', '🅎'), ('🅗', '🅗'), + ('🅟', '🅟'), ('🅹', '🅹'), ('🅻', '🅼'), ('🅿', '🅿'), + ('🆊', '🆍'), ('🆐', '🆐'), ('🈀', '🈀'), ('🈐', '🈱'), + ('🉀', '🉈'), ('𪜀', '𫜴'), +]; + +pub const V6_0: &'static [(char, char)] = &[ + ('Ԧ', 'ԧ'), ('ؠ', 'ؠ'), ('ٟ', 'ٟ'), ('ࡀ', '࡛'), ('࡞', '࡞'), + ('ऺ', 'ऻ'), ('ॏ', 'ॏ'), ('ॖ', 'ॗ'), ('ॳ', 'ॷ'), + ('୲', '୷'), ('ഩ', 'ഩ'), ('ഺ', 'ഺ'), ('ൎ', 'ൎ'), + ('ྌ', 'ྏ'), ('࿙', '࿚'), ('፝', '፞'), ('ᯀ', '᯳'), + ('᯼', '᯿'), ('᷼', '᷼'), ('ₕ', 'ₜ'), ('₹', '₹'), + ('⏩', '⏳'), ('⛎', '⛎'), ('⛢', '⛢'), ('⛤', '⛧'), + ('✅', '✅'), ('✊', '✋'), ('✨', '✨'), ('❌', '❌'), + ('❎', '❎'), ('❓', '❕'), ('❟', '❠'), ('➕', '➗'), + ('➰', '➰'), ('➿', '➿'), ('⟎', '⟏'), ('⵰', '⵰'), + ('⵿', '⵿'), ('ㆸ', 'ㆺ'), ('Ꙡ', 'ꙡ'), ('Ɥ', 'ꞎ'), + ('Ꞑ', 'ꞑ'), ('Ꞡ', 'ꞩ'), ('ꟺ', 'ꟺ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), + ('﮲', '﯁'), ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𖠀', '𖨸'), + ('𛀀', '𛀁'), ('🂠', '🂮'), ('🂱', '🂾'), ('🃁', '🃏'), + ('🃑', '🃟'), ('🄰', '🄰'), ('🄲', '🄼'), ('🄾', '🄾'), + ('🅀', '🅁'), ('🅃', '🅅'), ('🅇', '🅉'), ('🅏', 
'🅖'), + ('🅘', '🅞'), ('🅠', '🅩'), ('🅰', '🅸'), ('🅺', '🅺'), + ('🅽', '🅾'), ('🆀', '🆉'), ('🆎', '🆏'), ('🆑', '🆚'), + ('🇦', '🇿'), ('🈁', '🈂'), ('🈲', '🈺'), ('🉐', '🉑'), + ('🌀', '🌠'), ('🌰', '🌵'), ('🌷', '🍼'), ('🎀', '🎓'), + ('🎠', '🏄'), ('🏆', '🏊'), ('🏠', '🏰'), ('🐀', '🐾'), + ('👀', '👀'), ('👂', '📷'), ('📹', '📼'), ('🔀', '🔽'), + ('🕐', '🕧'), ('🗻', '🗿'), ('😁', '😐'), ('😒', '😔'), + ('😖', '😖'), ('😘', '😘'), ('😚', '😚'), ('😜', '😞'), + ('😠', '😥'), ('😨', '😫'), ('😭', '😭'), ('😰', '😳'), + ('😵', '🙀'), ('🙅', '🙏'), ('🚀', '🛅'), ('🜀', '🝳'), + ('𫝀', '𫠝'), +]; + +pub const V6_1: &'static [(char, char)] = &[ + ('֏', '֏'), ('\u{604}', '\u{604}'), ('ࢠ', 'ࢠ'), ('ࢢ', 'ࢬ'), + ('ࣤ', 'ࣾ'), ('૰', '૰'), ('ໞ', 'ໟ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ჽ', 'ჿ'), ('᮫', 'ᮭ'), ('ᮺ', 'ᮿ'), + ('᳀', '᳇'), ('ᳳ', 'ᳶ'), ('⟋', '⟋'), ('⟍', '⟍'), + ('Ⳳ', 'ⳳ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⵦ', 'ⵧ'), + ('⸲', '⸻'), ('鿌', '鿌'), ('ꙴ', 'ꙻ'), ('ꚟ', 'ꚟ'), + ('Ꞓ', 'ꞓ'), ('Ɦ', 'Ɦ'), ('ꟸ', 'ꟹ'), ('ꫠ', '꫶'), + ('郞', '隷'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𑃐', '𑃨'), + ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑅃'), ('𑆀', '𑇈'), + ('𑇐', '𑇙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𖼀', '𖽄'), + ('𖽐', '𖽾'), ('𖾏', '𖾟'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), + ('🅪', '🅫'), ('🕀', '🕃'), ('😀', '😀'), ('😑', '😑'), + ('😕', '😕'), ('😗', '😗'), ('😙', '😙'), ('😛', '😛'), + ('😟', '😟'), ('😦', '😧'), ('😬', '😬'), ('😮', '😯'), + ('😴', '😴'), +]; + +pub const V6_2: &'static [(char, char)] = &[ + ('₺', '₺'), +]; + +pub const V6_3: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}'), +]; + +pub const V7_0: &'static [(char, char)] = &[ + ('Ϳ', 'Ϳ'), ('Ԩ', 'ԯ'), ('֍', '֎'), ('\u{605}', '\u{605}'), + ('ࢡ', 'ࢡ'), ('ࢭ', 'ࢲ'), ('ࣿ', 'ࣿ'), ('ॸ', 'ॸ'), + ('ঀ', 'ঀ'), ('ఀ', 'ఀ'), ('ఴ', 'ఴ'), ('ಁ', 'ಁ'), + ('ഁ', 'ഁ'), ('෦', '෯'), ('ᛱ', 'ᛸ'), ('ᤝ', 'ᤞ'), + ('᪰', '᪾'), ('᳸', '᳹'), ('ᷧ', '᷵'), ('₻', '₽'), + ('⏴', '⏺'), ('✀', '✀'), ('⭍', '⭏'), ('⭚', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯑'), + ('⸼', '⹂'), ('Ꚙ', 'ꚝ'), ('ꞔ', 'ꞟ'), ('Ɜ', 'Ɬ'), + ('Ʞ', 'Ʇ'), ('ꟷ', 'ꟷ'), ('ꧠ', 'ꧾ'), ('ꩼ', 'ꩿ'), + ('ꬰ', 'ꭟ'), ('ꭤ', 'ꭥ'), ('︧', '︭'), ('𐆋', '𐆌'), + ('𐆠', '𐆠'), ('𐋠', '𐋻'), ('𐌟', '𐌟'), ('𐍐', '𐍺'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕯'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐡠', '𐢞'), ('𐢧', '𐢯'), + ('𐪀', '𐪟'), ('𐫀', '𐫦'), ('𐫫', '𐫶'), ('𐮀', '𐮑'), + ('𐮙', '𐮜'), ('𐮩', '𐮯'), ('𑁿', '𑁿'), ('𑅐', '𑅶'), + ('𑇍', '𑇍'), ('𑇚', '𑇚'), ('𑇡', '𑇴'), ('𑈀', '𑈑'), + ('𑈓', '𑈽'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), ('𑌁', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍍'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), + ('𑖸', '𑗉'), ('𑘀', '𑙄'), ('𑙐', '𑙙'), ('𑢠', '𑣲'), + ('𑣿', '𑣿'), ('𑫀', '𑫸'), ('𒍯', '𒎘'), ('𒑣', '𒑮'), + ('𒑴', '𒑴'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), + ('𖫐', '𖫭'), ('𖫰', '𖫵'), ('𖬀', '𖭅'), ('𖭐', '𖭙'), + ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}'), + ('𞠀', '𞣄'), ('𞣇', '𞣖'), ('🂿', '🂿'), ('🃠', '🃵'), + ('🄋', '🄌'), ('🌡', '🌬'), ('🌶', '🌶'), ('🍽', '🍽'), + ('🎔', '🎟'), ('🏅', '🏅'), ('🏋', '🏎'), ('🏔', '🏟'), + ('🏱', '🏷'), ('🐿', '🐿'), ('👁', '👁'), ('📸', '📸'), + ('📽', '📾'), ('🔾', '🔿'), ('🕄', '🕊'), ('🕨', '🕹'), + ('🕻', '🖣'), 
('🖥', '🗺'), ('🙁', '🙂'), ('🙐', '🙿'), + ('🛆', '🛏'), ('🛠', '🛬'), ('🛰', '🛳'), ('🞀', '🟔'), + ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), + ('🢐', '🢭'), +]; + +pub const V8_0: &'static [(char, char)] = &[ + ('ࢳ', 'ࢴ'), ('ࣣ', 'ࣣ'), ('ૹ', 'ૹ'), ('ౚ', 'ౚ'), + ('ൟ', 'ൟ'), ('Ᏽ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('₾', '₾'), + ('↊', '↋'), ('⯬', '⯯'), ('鿍', '鿕'), ('ꚞ', 'ꚞ'), + ('ꞏ', 'ꞏ'), ('Ʝ', 'ꞷ'), ('꣼', 'ꣽ'), ('ꭠ', 'ꭣ'), + ('ꭰ', 'ꮿ'), ('︮', '︯'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐣻', '𐣿'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), ('𐧒', '𐧿'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), ('𑇉', '𑇌'), + ('𑇛', '𑇟'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('𑌀', '𑌀'), ('𑍐', '𑍐'), + ('𑗊', '𑗝'), ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), + ('𒎙', '𒎙'), ('𒒀', '𒕃'), ('𔐀', '𔙆'), ('𝇞', '𝇨'), + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('🌭', '🌯'), + ('🍾', '🍿'), ('🏏', '🏓'), ('🏸', '🏿'), ('📿', '📿'), + ('🕋', '🕏'), ('🙃', '🙄'), ('🛐', '🛐'), ('🤐', '🤘'), + ('🦀', '🦄'), ('🧀', '🧀'), ('𫠠', '𬺡'), +]; + +pub const V9_0: &'static [(char, char)] = &[ + ('ࢶ', 'ࢽ'), ('ࣔ', '\u{8e2}'), ('ಀ', 'ಀ'), ('൏', '൏'), + ('ൔ', 'ൖ'), ('൘', '൞'), ('൶', '൸'), ('ᲀ', 'ᲈ'), + ('᷻', '᷻'), ('⏻', '⏾'), ('⹃', '⹄'), ('Ɪ', 'Ɪ'), + ('ꣅ', 'ꣅ'), ('𐆍', '𐆎'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𑈾', '𑈾'), ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), + ('𑙠', '𑙬'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), + ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), ('🆛', '🆬'), + ('🈻', '🈻'), ('🕺', '🕺'), ('🖤', '🖤'), ('🛑', '🛒'), + ('🛴', '🛶'), ('🤙', '🤞'), ('🤠', '🤧'), ('🤰', '🤰'), + ('🤳', '🤾'), ('🥀', '🥋'), ('🥐', '🥞'), ('🦅', '🦑'), +]; diff --git a/regex-syntax-2/src/unicode_tables/case_folding_simple.rs b/regex-syntax-2/src/unicode_tables/case_folding_simple.rs new file mode 100644 index 0000000000..72ec79f0dc --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/case_folding_simple.rs @@ -0,0 +1,662 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple /home/andrew/tmp/ucd-10.0.0/ --chars --all-pairs +// +// ucd-generate is available on crates.io. 
+ +pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ + ('A', &['a']), ('B', &['b']), ('C', &['c']), ('D', &['d']), ('E', &['e']), + ('F', &['f']), ('G', &['g']), ('H', &['h']), ('I', &['i']), ('J', &['j']), + ('K', &['k', 'K', ]), ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &[ + 'o']), ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s', 'ſ', ]), + ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), ('X', &['x']), + ('Y', &['y']), ('Z', &['z']), ('a', &['A']), ('b', &['B']), ('c', &['C']), + ('d', &['D']), ('e', &['E']), ('f', &['F']), ('g', &['G']), ('h', &['H']), + ('i', &['I']), ('j', &['J']), ('k', &['K', 'K', ]), ('l', &['L']), ('m', &[ + 'M']), ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), ('r', &[ + 'R']), ('s', &['S', 'ſ', ]), ('t', &['T']), ('u', &['U']), ('v', &['V']), + ('w', &['W']), ('x', &['X']), ('y', &['Y']), ('z', &['Z']), ('µ', &['Μ', + 'μ', ]), ('À', &['à']), ('Á', &['á']), ('Â', &['â']), ('Ã', &['ã' + ]), ('Ä', &['ä']), ('Å', &['å', 'Å', ]), ('Æ', &['æ']), ('Ç', &['ç' + ]), ('È', &['è']), ('É', &['é']), ('Ê', &['ê']), ('Ë', &['ë']), + ('Ì', &['ì']), ('Í', &['í']), ('Î', &['î']), ('Ï', &['ï']), ('Ð', &[ + 'ð']), ('Ñ', &['ñ']), ('Ò', &['ò']), ('Ó', &['ó']), ('Ô', &['ô']), + ('Õ', &['õ']), ('Ö', &['ö']), ('Ø', &['ø']), ('Ù', &['ù']), ('Ú', &[ + 'ú']), ('Û', &['û']), ('Ü', &['ü']), ('Ý', &['ý']), ('Þ', &['þ']), + ('ß', &['ẞ']), ('à', &['À']), ('á', &['Á']), ('â', &['Â']), + ('ã', &['Ã']), ('ä', &['Ä']), ('å', &['Å', 'Å', ]), ('æ', &['Æ']), + ('ç', &['Ç']), ('è', &['È']), ('é', &['É']), ('ê', &['Ê']), ('ë', &[ + 'Ë']), ('ì', &['Ì']), ('í', &['Í']), ('î', &['Î']), ('ï', &['Ï']), + ('ð', &['Ð']), ('ñ', &['Ñ']), ('ò', &['Ò']), ('ó', &['Ó']), ('ô', &[ + 'Ô']), ('õ', &['Õ']), ('ö', &['Ö']), ('ø', &['Ø']), ('ù', &['Ù']), + ('ú', &['Ú']), ('û', &['Û']), ('ü', &['Ü']), ('ý', &['Ý']), ('þ', &[ + 'Þ']), ('ÿ', &['Ÿ']), ('Ā', &['ā']), ('ā', &['Ā']), ('Ă', &['ă']), + ('ă', &['Ă']), ('Ą', &['ą']), ('ą', &['Ą']), ('Ć', &['ć']), ('ć', &[ + 'Ć']), ('Ĉ', &['ĉ']), ('ĉ', &['Ĉ']), ('Ċ', &['ċ']), ('ċ', &['Ċ']), + ('Č', &['č']), ('č', &['Č']), ('Ď', &['ď']), ('ď', &['Ď']), ('Đ', &[ + 'đ']), ('đ', &['Đ']), ('Ē', &['ē']), ('ē', &['Ē']), ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), ('Ė', &['ė']), ('ė', &['Ė']), ('Ę', &['ę']), ('ę', &[ + 'Ę']), ('Ě', &['ě']), ('ě', &['Ě']), ('Ĝ', &['ĝ']), ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), ('ğ', &['Ğ']), ('Ġ', &['ġ']), ('ġ', &['Ġ']), ('Ģ', &[ + 'ģ']), ('ģ', &['Ģ']), ('Ĥ', &['ĥ']), ('ĥ', &['Ĥ']), ('Ħ', &['ħ']), + ('ħ', &['Ħ']), ('Ĩ', &['ĩ']), ('ĩ', &['Ĩ']), ('Ī', &['ī']), ('ī', &[ + 'Ī']), ('Ĭ', &['ĭ']), ('ĭ', &['Ĭ']), ('Į', &['į']), ('į', &['Į']), + ('IJ', &['ij']), ('ij', &['IJ']), ('Ĵ', &['ĵ']), ('ĵ', &['Ĵ']), ('Ķ', &[ + 'ķ']), ('ķ', &['Ķ']), ('Ĺ', &['ĺ']), ('ĺ', &['Ĺ']), ('Ļ', &['ļ']), + ('ļ', &['Ļ']), ('Ľ', &['ľ']), ('ľ', &['Ľ']), ('Ŀ', &['ŀ']), ('ŀ', &[ + 'Ŀ']), ('Ł', &['ł']), ('ł', &['Ł']), ('Ń', &['ń']), ('ń', &['Ń']), + ('Ņ', &['ņ']), ('ņ', &['Ņ']), ('Ň', &['ň']), ('ň', &['Ň']), ('Ŋ', &[ + 'ŋ']), ('ŋ', &['Ŋ']), ('Ō', &['ō']), ('ō', &['Ō']), ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), ('Ő', &['ő']), ('ő', &['Ő']), ('Œ', &['œ']), ('œ', &[ + 'Œ']), ('Ŕ', &['ŕ']), ('ŕ', &['Ŕ']), ('Ŗ', &['ŗ']), ('ŗ', &['Ŗ']), + ('Ř', &['ř']), ('ř', &['Ř']), ('Ś', &['ś']), ('ś', &['Ś']), ('Ŝ', &[ + 'ŝ']), ('ŝ', &['Ŝ']), ('Ş', &['ş']), ('ş', &['Ş']), ('Š', &['š']), + ('š', &['Š']), ('Ţ', &['ţ']), ('ţ', &['Ţ']), ('Ť', &['ť']), ('ť', &[ + 'Ť']), ('Ŧ', &['ŧ']), ('ŧ', &['Ŧ']), ('Ũ', &['ũ']), ('ũ', &['Ũ']), + ('Ū', &['ū']), ('ū', &['Ū']), ('Ŭ', &['ŭ']), ('ŭ', 
&['Ŭ']), ('Ů', &[ + 'ů']), ('ů', &['Ů']), ('Ű', &['ű']), ('ű', &['Ű']), ('Ų', &['ų']), + ('ų', &['Ų']), ('Ŵ', &['ŵ']), ('ŵ', &['Ŵ']), ('Ŷ', &['ŷ']), ('ŷ', &[ + 'Ŷ']), ('Ÿ', &['ÿ']), ('Ź', &['ź']), ('ź', &['Ź']), ('Ż', &['ż']), + ('ż', &['Ż']), ('Ž', &['ž']), ('ž', &['Ž']), ('ſ', &['S', 's', ]), + ('ƀ', &['Ƀ']), ('Ɓ', &['ɓ']), ('Ƃ', &['ƃ']), ('ƃ', &['Ƃ']), ('Ƅ', &[ + 'ƅ']), ('ƅ', &['Ƅ']), ('Ɔ', &['ɔ']), ('Ƈ', &['ƈ']), ('ƈ', &['Ƈ']), + ('Ɖ', &['ɖ']), ('Ɗ', &['ɗ']), ('Ƌ', &['ƌ']), ('ƌ', &['Ƌ']), ('Ǝ', &[ + 'ǝ']), ('Ə', &['ə']), ('Ɛ', &['ɛ']), ('Ƒ', &['ƒ']), ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), ('Ɣ', &['ɣ']), ('ƕ', &['Ƕ']), ('Ɩ', &['ɩ']), ('Ɨ', &[ + 'ɨ']), ('Ƙ', &['ƙ']), ('ƙ', &['Ƙ']), ('ƚ', &['Ƚ']), ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), ('ƞ', &['Ƞ']), ('Ɵ', &['ɵ']), ('Ơ', &['ơ']), ('ơ', &[ + 'Ơ']), ('Ƣ', &['ƣ']), ('ƣ', &['Ƣ']), ('Ƥ', &['ƥ']), ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), ('Ƨ', &['ƨ']), ('ƨ', &['Ƨ']), ('Ʃ', &['ʃ']), ('Ƭ', &[ + 'ƭ']), ('ƭ', &['Ƭ']), ('Ʈ', &['ʈ']), ('Ư', &['ư']), ('ư', &['Ư']), + ('Ʊ', &['ʊ']), ('Ʋ', &['ʋ']), ('Ƴ', &['ƴ']), ('ƴ', &['Ƴ']), ('Ƶ', &[ + 'ƶ']), ('ƶ', &['Ƶ']), ('Ʒ', &['ʒ']), ('Ƹ', &['ƹ']), ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), ('ƽ', &['Ƽ']), ('ƿ', &['Ƿ']), ('DŽ', &['Dž', 'dž', ]), + ('Dž', &['DŽ', 'dž', ]), ('dž', &['DŽ', 'Dž', ]), ('LJ', &['Lj', 'lj', ]), + ('Lj', &['LJ', 'lj', ]), ('lj', &['LJ', 'Lj', ]), ('NJ', &['Nj', 'nj', ]), + ('Nj', &['NJ', 'nj', ]), ('nj', &['NJ', 'Nj', ]), ('Ǎ', &['ǎ']), ('ǎ', &[ + 'Ǎ']), ('Ǐ', &['ǐ']), ('ǐ', &['Ǐ']), ('Ǒ', &['ǒ']), ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), ('ǔ', &['Ǔ']), ('Ǖ', &['ǖ']), ('ǖ', &['Ǖ']), ('Ǘ', &[ + 'ǘ']), ('ǘ', &['Ǘ']), ('Ǚ', &['ǚ']), ('ǚ', &['Ǚ']), ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), ('ǝ', &['Ǝ']), ('Ǟ', &['ǟ']), ('ǟ', &['Ǟ']), ('Ǡ', &[ + 'ǡ']), ('ǡ', &['Ǡ']), ('Ǣ', &['ǣ']), ('ǣ', &['Ǣ']), ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), ('Ǧ', &['ǧ']), ('ǧ', &['Ǧ']), ('Ǩ', &['ǩ']), ('ǩ', &[ + 'Ǩ']), ('Ǫ', &['ǫ']), ('ǫ', &['Ǫ']), ('Ǭ', &['ǭ']), ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), ('ǯ', &['Ǯ']), ('DZ', &['Dz', 'dz', ]), ('Dz', &['DZ', + 'dz', ]), ('dz', &['DZ', 'Dz', ]), ('Ǵ', &['ǵ']), ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), ('Ƿ', &['ƿ']), ('Ǹ', &['ǹ']), ('ǹ', &['Ǹ']), ('Ǻ', &[ + 'ǻ']), ('ǻ', &['Ǻ']), ('Ǽ', &['ǽ']), ('ǽ', &['Ǽ']), ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), ('Ȁ', &['ȁ']), ('ȁ', &['Ȁ']), ('Ȃ', &['ȃ']), ('ȃ', &[ + 'Ȃ']), ('Ȅ', &['ȅ']), ('ȅ', &['Ȅ']), ('Ȇ', &['ȇ']), ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), ('ȉ', &['Ȉ']), ('Ȋ', &['ȋ']), ('ȋ', &['Ȋ']), ('Ȍ', &[ + 'ȍ']), ('ȍ', &['Ȍ']), ('Ȏ', &['ȏ']), ('ȏ', &['Ȏ']), ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), ('Ȓ', &['ȓ']), ('ȓ', &['Ȓ']), ('Ȕ', &['ȕ']), ('ȕ', &[ + 'Ȕ']), ('Ȗ', &['ȗ']), ('ȗ', &['Ȗ']), ('Ș', &['ș']), ('ș', &['Ș']), + ('Ț', &['ț']), ('ț', &['Ț']), ('Ȝ', &['ȝ']), ('ȝ', &['Ȝ']), ('Ȟ', &[ + 'ȟ']), ('ȟ', &['Ȟ']), ('Ƞ', &['ƞ']), ('Ȣ', &['ȣ']), ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), ('ȥ', &['Ȥ']), ('Ȧ', &['ȧ']), ('ȧ', &['Ȧ']), ('Ȩ', &[ + 'ȩ']), ('ȩ', &['Ȩ']), ('Ȫ', &['ȫ']), ('ȫ', &['Ȫ']), ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), ('Ȯ', &['ȯ']), ('ȯ', &['Ȯ']), ('Ȱ', &['ȱ']), ('ȱ', &[ + 'Ȱ']), ('Ȳ', &['ȳ']), ('ȳ', &['Ȳ']), ('Ⱥ', &['ⱥ']), ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), ('Ƚ', &['ƚ']), ('Ⱦ', &['ⱦ']), ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), ('Ɂ', &['ɂ']), ('ɂ', &['Ɂ']), ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), ('Ʌ', &['ʌ']), ('Ɇ', &['ɇ']), ('ɇ', &['Ɇ']), ('Ɉ', &[ + 'ɉ']), ('ɉ', &['Ɉ']), ('Ɋ', &['ɋ']), ('ɋ', &['Ɋ']), ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), ('Ɏ', &['ɏ']), ('ɏ', &['Ɏ']), ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), ('ɒ', &['Ɒ']), ('ɓ', &['Ɓ']), ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), ('ɗ', &['Ɗ']), ('ə', &['Ə']), ('ɛ', &['Ɛ']), ('ɜ', &[ + 'Ɜ']), ('ɠ', &['Ɠ']), ('ɡ', &['Ɡ']), ('ɣ', 
&['Ɣ']), ('ɥ', &['Ɥ' + ]), ('ɦ', &['Ɦ']), ('ɨ', &['Ɨ']), ('ɩ', &['Ɩ']), ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), ('ɬ', &['Ɬ']), ('ɯ', &['Ɯ']), ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), ('ɵ', &['Ɵ']), ('ɽ', &['Ɽ']), ('ʀ', &['Ʀ']), + ('ʃ', &['Ʃ']), ('ʇ', &['Ʇ']), ('ʈ', &['Ʈ']), ('ʉ', &['Ʉ']), + ('ʊ', &['Ʊ']), ('ʋ', &['Ʋ']), ('ʌ', &['Ʌ']), ('ʒ', &['Ʒ']), ('ʝ', &[ + 'Ʝ']), ('ʞ', &['Ʞ']), ('ͅ', &['Ι', 'ι', 'ι', ]), ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), ('Ͳ', &['ͳ']), ('ͳ', &['Ͳ']), ('Ͷ', &['ͷ']), ('ͷ', &[ + 'Ͷ']), ('ͻ', &['Ͻ']), ('ͼ', &['Ͼ']), ('ͽ', &['Ͽ']), ('Ϳ', &['ϳ']), + ('Ά', &['ά']), ('Έ', &['έ']), ('Ή', &['ή']), ('Ί', &['ί']), ('Ό', &[ + 'ό']), ('Ύ', &['ύ']), ('Ώ', &['ώ']), ('Α', &['α']), ('Β', &['β', + 'ϐ', ]), ('Γ', &['γ']), ('Δ', &['δ']), ('Ε', &['ε', 'ϵ', ]), + ('Ζ', &['ζ']), ('Η', &['η']), ('Θ', &['θ', 'ϑ', 'ϴ', ]), ('Ι', &[ + 'ͅ', 'ι', 'ι', ]), ('Κ', &['κ', 'ϰ', ]), ('Λ', &['λ']), ('Μ', &[ + 'µ', 'μ', ]), ('Ν', &['ν']), ('Ξ', &['ξ']), ('Ο', &['ο']), ('Π', &[ + 'π', 'ϖ', ]), ('Ρ', &['ρ', 'ϱ', ]), ('Σ', &['ς', 'σ', ]), ('Τ', &[ + 'τ']), ('Υ', &['υ']), ('Φ', &['φ', 'ϕ', ]), ('Χ', &['χ']), ('Ψ', &[ + 'ψ']), ('Ω', &['ω', 'Ω', ]), ('Ϊ', &['ϊ']), ('Ϋ', &['ϋ']), ('ά', &[ + 'Ά']), ('έ', &['Έ']), ('ή', &['Ή']), ('ί', &['Ί']), ('α', &['Α']), + ('β', &['Β', 'ϐ', ]), ('γ', &['Γ']), ('δ', &['Δ']), ('ε', &['Ε', + 'ϵ', ]), ('ζ', &['Ζ']), ('η', &['Η']), ('θ', &['Θ', 'ϑ', 'ϴ', ]), + ('ι', &['ͅ', 'Ι', 'ι', ]), ('κ', &['Κ', 'ϰ', ]), ('λ', &['Λ']), + ('μ', &['µ', 'Μ', ]), ('ν', &['Ν']), ('ξ', &['Ξ']), ('ο', &['Ο']), + ('π', &['Π', 'ϖ', ]), ('ρ', &['Ρ', 'ϱ', ]), ('ς', &['Σ', 'σ', ]), + ('σ', &['Σ', 'ς', ]), ('τ', &['Τ']), ('υ', &['Υ']), ('φ', &['Φ', + 'ϕ', ]), ('χ', &['Χ']), ('ψ', &['Ψ']), ('ω', &['Ω', 'Ω', ]), + ('ϊ', &['Ϊ']), ('ϋ', &['Ϋ']), ('ό', &['Ό']), ('ύ', &['Ύ']), ('ώ', &[ + 'Ώ']), ('Ϗ', &['ϗ']), ('ϐ', &['Β', 'β', ]), ('ϑ', &['Θ', 'θ', 'ϴ', + ]), ('ϕ', &['Φ', 'φ', ]), ('ϖ', &['Π', 'π', ]), ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), ('ϙ', &['Ϙ']), ('Ϛ', &['ϛ']), ('ϛ', &['Ϛ']), ('Ϝ', &[ + 'ϝ']), ('ϝ', &['Ϝ']), ('Ϟ', &['ϟ']), ('ϟ', &['Ϟ']), ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), ('Ϣ', &['ϣ']), ('ϣ', &['Ϣ']), ('Ϥ', &['ϥ']), ('ϥ', &[ + 'Ϥ']), ('Ϧ', &['ϧ']), ('ϧ', &['Ϧ']), ('Ϩ', &['ϩ']), ('ϩ', &['Ϩ']), + ('Ϫ', &['ϫ']), ('ϫ', &['Ϫ']), ('Ϭ', &['ϭ']), ('ϭ', &['Ϭ']), ('Ϯ', &[ + 'ϯ']), ('ϯ', &['Ϯ']), ('ϰ', &['Κ', 'κ', ]), ('ϱ', &['Ρ', 'ρ', ]), + ('ϲ', &['Ϲ']), ('ϳ', &['Ϳ']), ('ϴ', &['Θ', 'θ', 'ϑ', ]), ('ϵ', &[ + 'Ε', 'ε', ]), ('Ϸ', &['ϸ']), ('ϸ', &['Ϸ']), ('Ϲ', &['ϲ']), ('Ϻ', &[ + 'ϻ']), ('ϻ', &['Ϻ']), ('Ͻ', &['ͻ']), ('Ͼ', &['ͼ']), ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), ('Ё', &['ё']), ('Ђ', &['ђ']), ('Ѓ', &['ѓ']), ('Є', &[ + 'є']), ('Ѕ', &['ѕ']), ('І', &['і']), ('Ї', &['ї']), ('Ј', &['ј']), + ('Љ', &['љ']), ('Њ', &['њ']), ('Ћ', &['ћ']), ('Ќ', &['ќ']), ('Ѝ', &[ + 'ѝ']), ('Ў', &['ў']), ('Џ', &['џ']), ('А', &['а']), ('Б', &['б']), + ('В', &['в', 'ᲀ', ]), ('Г', &['г']), ('Д', &['д', 'ᲁ', ]), + ('Е', &['е']), ('Ж', &['ж']), ('З', &['з']), ('И', &['и']), ('Й', &[ + 'й']), ('К', &['к']), ('Л', &['л']), ('М', &['м']), ('Н', &['н']), + ('О', &['о', 'ᲂ', ]), ('П', &['п']), ('Р', &['р']), ('С', &['с', + 'ᲃ', ]), ('Т', &['т', 'ᲄ', 'ᲅ', ]), ('У', &['у']), ('Ф', &['ф' + ]), ('Х', &['х']), ('Ц', &['ц']), ('Ч', &['ч']), ('Ш', &['ш']), + ('Щ', &['щ']), ('Ъ', &['ъ', 'ᲆ', ]), ('Ы', &['ы']), ('Ь', &['ь']), + ('Э', &['э']), ('Ю', &['ю']), ('Я', &['я']), ('а', &['А']), ('б', &[ + 'Б']), ('в', &['В', 'ᲀ', ]), ('г', &['Г']), ('д', &['Д', 'ᲁ', ]), + ('е', &['Е']), ('ж', &['Ж']), ('з', &['З']), ('и', &['И']), ('й', &[ + 'Й']), ('к', &['К']), ('л', &['Л']), ('м', &['М']), ('н', 
&['Н']), + ('о', &['О', 'ᲂ', ]), ('п', &['П']), ('р', &['Р']), ('с', &['С', + 'ᲃ', ]), ('т', &['Т', 'ᲄ', 'ᲅ', ]), ('у', &['У']), ('ф', &['Ф' + ]), ('х', &['Х']), ('ц', &['Ц']), ('ч', &['Ч']), ('ш', &['Ш']), + ('щ', &['Щ']), ('ъ', &['Ъ', 'ᲆ', ]), ('ы', &['Ы']), ('ь', &['Ь']), + ('э', &['Э']), ('ю', &['Ю']), ('я', &['Я']), ('ѐ', &['Ѐ']), ('ё', &[ + 'Ё']), ('ђ', &['Ђ']), ('ѓ', &['Ѓ']), ('є', &['Є']), ('ѕ', &['Ѕ']), + ('і', &['І']), ('ї', &['Ї']), ('ј', &['Ј']), ('љ', &['Љ']), ('њ', &[ + 'Њ']), ('ћ', &['Ћ']), ('ќ', &['Ќ']), ('ѝ', &['Ѝ']), ('ў', &['Ў']), + ('џ', &['Џ']), ('Ѡ', &['ѡ']), ('ѡ', &['Ѡ']), ('Ѣ', &['ѣ', 'ᲇ', ]), + ('ѣ', &['Ѣ', 'ᲇ', ]), ('Ѥ', &['ѥ']), ('ѥ', &['Ѥ']), ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), ('Ѩ', &['ѩ']), ('ѩ', &['Ѩ']), ('Ѫ', &['ѫ']), ('ѫ', &[ + 'Ѫ']), ('Ѭ', &['ѭ']), ('ѭ', &['Ѭ']), ('Ѯ', &['ѯ']), ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), ('ѱ', &['Ѱ']), ('Ѳ', &['ѳ']), ('ѳ', &['Ѳ']), ('Ѵ', &[ + 'ѵ']), ('ѵ', &['Ѵ']), ('Ѷ', &['ѷ']), ('ѷ', &['Ѷ']), ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), ('Ѻ', &['ѻ']), ('ѻ', &['Ѻ']), ('Ѽ', &['ѽ']), ('ѽ', &[ + 'Ѽ']), ('Ѿ', &['ѿ']), ('ѿ', &['Ѿ']), ('Ҁ', &['ҁ']), ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), ('ҋ', &['Ҋ']), ('Ҍ', &['ҍ']), ('ҍ', &['Ҍ']), ('Ҏ', &[ + 'ҏ']), ('ҏ', &['Ҏ']), ('Ґ', &['ґ']), ('ґ', &['Ґ']), ('Ғ', &['ғ']), + ('ғ', &['Ғ']), ('Ҕ', &['ҕ']), ('ҕ', &['Ҕ']), ('Җ', &['җ']), ('җ', &[ + 'Җ']), ('Ҙ', &['ҙ']), ('ҙ', &['Ҙ']), ('Қ', &['қ']), ('қ', &['Қ']), + ('Ҝ', &['ҝ']), ('ҝ', &['Ҝ']), ('Ҟ', &['ҟ']), ('ҟ', &['Ҟ']), ('Ҡ', &[ + 'ҡ']), ('ҡ', &['Ҡ']), ('Ң', &['ң']), ('ң', &['Ң']), ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), ('Ҧ', &['ҧ']), ('ҧ', &['Ҧ']), ('Ҩ', &['ҩ']), ('ҩ', &[ + 'Ҩ']), ('Ҫ', &['ҫ']), ('ҫ', &['Ҫ']), ('Ҭ', &['ҭ']), ('ҭ', &['Ҭ']), + ('Ү', &['ү']), ('ү', &['Ү']), ('Ұ', &['ұ']), ('ұ', &['Ұ']), ('Ҳ', &[ + 'ҳ']), ('ҳ', &['Ҳ']), ('Ҵ', &['ҵ']), ('ҵ', &['Ҵ']), ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), ('Ҹ', &['ҹ']), ('ҹ', &['Ҹ']), ('Һ', &['һ']), ('һ', &[ + 'Һ']), ('Ҽ', &['ҽ']), ('ҽ', &['Ҽ']), ('Ҿ', &['ҿ']), ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), ('Ӂ', &['ӂ']), ('ӂ', &['Ӂ']), ('Ӄ', &['ӄ']), ('ӄ', &[ + 'Ӄ']), ('Ӆ', &['ӆ']), ('ӆ', &['Ӆ']), ('Ӈ', &['ӈ']), ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), ('ӊ', &['Ӊ']), ('Ӌ', &['ӌ']), ('ӌ', &['Ӌ']), ('Ӎ', &[ + 'ӎ']), ('ӎ', &['Ӎ']), ('ӏ', &['Ӏ']), ('Ӑ', &['ӑ']), ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), ('ӓ', &['Ӓ']), ('Ӕ', &['ӕ']), ('ӕ', &['Ӕ']), ('Ӗ', &[ + 'ӗ']), ('ӗ', &['Ӗ']), ('Ә', &['ә']), ('ә', &['Ә']), ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), ('Ӝ', &['ӝ']), ('ӝ', &['Ӝ']), ('Ӟ', &['ӟ']), ('ӟ', &[ + 'Ӟ']), ('Ӡ', &['ӡ']), ('ӡ', &['Ӡ']), ('Ӣ', &['ӣ']), ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), ('ӥ', &['Ӥ']), ('Ӧ', &['ӧ']), ('ӧ', &['Ӧ']), ('Ө', &[ + 'ө']), ('ө', &['Ө']), ('Ӫ', &['ӫ']), ('ӫ', &['Ӫ']), ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), ('Ӯ', &['ӯ']), ('ӯ', &['Ӯ']), ('Ӱ', &['ӱ']), ('ӱ', &[ + 'Ӱ']), ('Ӳ', &['ӳ']), ('ӳ', &['Ӳ']), ('Ӵ', &['ӵ']), ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), ('ӷ', &['Ӷ']), ('Ӹ', &['ӹ']), ('ӹ', &['Ӹ']), ('Ӻ', &[ + 'ӻ']), ('ӻ', &['Ӻ']), ('Ӽ', &['ӽ']), ('ӽ', &['Ӽ']), ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), ('Ԁ', &['ԁ']), ('ԁ', &['Ԁ']), ('Ԃ', &['ԃ']), ('ԃ', &[ + 'Ԃ']), ('Ԅ', &['ԅ']), ('ԅ', &['Ԅ']), ('Ԇ', &['ԇ']), ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), ('ԉ', &['Ԉ']), ('Ԋ', &['ԋ']), ('ԋ', &['Ԋ']), ('Ԍ', &[ + 'ԍ']), ('ԍ', &['Ԍ']), ('Ԏ', &['ԏ']), ('ԏ', &['Ԏ']), ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), ('Ԓ', &['ԓ']), ('ԓ', &['Ԓ']), ('Ԕ', &['ԕ']), ('ԕ', &[ + 'Ԕ']), ('Ԗ', &['ԗ']), ('ԗ', &['Ԗ']), ('Ԙ', &['ԙ']), ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), ('ԛ', &['Ԛ']), ('Ԝ', &['ԝ']), ('ԝ', &['Ԝ']), ('Ԟ', &[ + 'ԟ']), ('ԟ', &['Ԟ']), ('Ԡ', &['ԡ']), ('ԡ', &['Ԡ']), ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), ('Ԥ', &['ԥ']), ('ԥ', &['Ԥ']), ('Ԧ', 
&['ԧ']), ('ԧ', &[ + 'Ԧ']), ('Ԩ', &['ԩ']), ('ԩ', &['Ԩ']), ('Ԫ', &['ԫ']), ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), ('ԭ', &['Ԭ']), ('Ԯ', &['ԯ']), ('ԯ', &['Ԯ']), ('Ա', &[ + 'ա']), ('Բ', &['բ']), ('Գ', &['գ']), ('Դ', &['դ']), ('Ե', &['ե']), + ('Զ', &['զ']), ('Է', &['է']), ('Ը', &['ը']), ('Թ', &['թ']), ('Ժ', &[ + 'ժ']), ('Ի', &['ի']), ('Լ', &['լ']), ('Խ', &['խ']), ('Ծ', &['ծ']), + ('Կ', &['կ']), ('Հ', &['հ']), ('Ձ', &['ձ']), ('Ղ', &['ղ']), ('Ճ', &[ + 'ճ']), ('Մ', &['մ']), ('Յ', &['յ']), ('Ն', &['ն']), ('Շ', &['շ']), + ('Ո', &['ո']), ('Չ', &['չ']), ('Պ', &['պ']), ('Ջ', &['ջ']), ('Ռ', &[ + 'ռ']), ('Ս', &['ս']), ('Վ', &['վ']), ('Տ', &['տ']), ('Ր', &['ր']), + ('Ց', &['ց']), ('Ւ', &['ւ']), ('Փ', &['փ']), ('Ք', &['ք']), ('Օ', &[ + 'օ']), ('Ֆ', &['ֆ']), ('ա', &['Ա']), ('բ', &['Բ']), ('գ', &['Գ']), + ('դ', &['Դ']), ('ե', &['Ե']), ('զ', &['Զ']), ('է', &['Է']), ('ը', &[ + 'Ը']), ('թ', &['Թ']), ('ժ', &['Ժ']), ('ի', &['Ի']), ('լ', &['Լ']), + ('խ', &['Խ']), ('ծ', &['Ծ']), ('կ', &['Կ']), ('հ', &['Հ']), ('ձ', &[ + 'Ձ']), ('ղ', &['Ղ']), ('ճ', &['Ճ']), ('մ', &['Մ']), ('յ', &['Յ']), + ('ն', &['Ն']), ('շ', &['Շ']), ('ո', &['Ո']), ('չ', &['Չ']), ('պ', &[ + 'Պ']), ('ջ', &['Ջ']), ('ռ', &['Ռ']), ('ս', &['Ս']), ('վ', &['Վ']), + ('տ', &['Տ']), ('ր', &['Ր']), ('ց', &['Ց']), ('ւ', &['Ւ']), ('փ', &[ + 'Փ']), ('ք', &['Ք']), ('օ', &['Օ']), ('ֆ', &['Ֆ']), ('Ⴀ', &['ⴀ' + ]), ('Ⴁ', &['ⴁ']), ('Ⴂ', &['ⴂ']), ('Ⴃ', &['ⴃ']), ('Ⴄ', &['ⴄ' + ]), ('Ⴅ', &['ⴅ']), ('Ⴆ', &['ⴆ']), ('Ⴇ', &['ⴇ']), ('Ⴈ', &['ⴈ' + ]), ('Ⴉ', &['ⴉ']), ('Ⴊ', &['ⴊ']), ('Ⴋ', &['ⴋ']), ('Ⴌ', &['ⴌ' + ]), ('Ⴍ', &['ⴍ']), ('Ⴎ', &['ⴎ']), ('Ⴏ', &['ⴏ']), ('Ⴐ', &['ⴐ' + ]), ('Ⴑ', &['ⴑ']), ('Ⴒ', &['ⴒ']), ('Ⴓ', &['ⴓ']), ('Ⴔ', &['ⴔ' + ]), ('Ⴕ', &['ⴕ']), ('Ⴖ', &['ⴖ']), ('Ⴗ', &['ⴗ']), ('Ⴘ', &['ⴘ' + ]), ('Ⴙ', &['ⴙ']), ('Ⴚ', &['ⴚ']), ('Ⴛ', &['ⴛ']), ('Ⴜ', &['ⴜ' + ]), ('Ⴝ', &['ⴝ']), ('Ⴞ', &['ⴞ']), ('Ⴟ', &['ⴟ']), ('Ⴠ', &['ⴠ' + ]), ('Ⴡ', &['ⴡ']), ('Ⴢ', &['ⴢ']), ('Ⴣ', &['ⴣ']), ('Ⴤ', &['ⴤ' + ]), ('Ⴥ', &['ⴥ']), ('Ⴧ', &['ⴧ']), ('Ⴭ', &['ⴭ']), ('Ꭰ', &['ꭰ' + ]), ('Ꭱ', &['ꭱ']), ('Ꭲ', &['ꭲ']), ('Ꭳ', &['ꭳ']), ('Ꭴ', &['ꭴ' + ]), ('Ꭵ', &['ꭵ']), ('Ꭶ', &['ꭶ']), ('Ꭷ', &['ꭷ']), ('Ꭸ', &['ꭸ' + ]), ('Ꭹ', &['ꭹ']), ('Ꭺ', &['ꭺ']), ('Ꭻ', &['ꭻ']), ('Ꭼ', &['ꭼ' + ]), ('Ꭽ', &['ꭽ']), ('Ꭾ', &['ꭾ']), ('Ꭿ', &['ꭿ']), ('Ꮀ', &['ꮀ' + ]), ('Ꮁ', &['ꮁ']), ('Ꮂ', &['ꮂ']), ('Ꮃ', &['ꮃ']), ('Ꮄ', &['ꮄ' + ]), ('Ꮅ', &['ꮅ']), ('Ꮆ', &['ꮆ']), ('Ꮇ', &['ꮇ']), ('Ꮈ', &['ꮈ' + ]), ('Ꮉ', &['ꮉ']), ('Ꮊ', &['ꮊ']), ('Ꮋ', &['ꮋ']), ('Ꮌ', &['ꮌ' + ]), ('Ꮍ', &['ꮍ']), ('Ꮎ', &['ꮎ']), ('Ꮏ', &['ꮏ']), ('Ꮐ', &['ꮐ' + ]), ('Ꮑ', &['ꮑ']), ('Ꮒ', &['ꮒ']), ('Ꮓ', &['ꮓ']), ('Ꮔ', &['ꮔ' + ]), ('Ꮕ', &['ꮕ']), ('Ꮖ', &['ꮖ']), ('Ꮗ', &['ꮗ']), ('Ꮘ', &['ꮘ' + ]), ('Ꮙ', &['ꮙ']), ('Ꮚ', &['ꮚ']), ('Ꮛ', &['ꮛ']), ('Ꮜ', &['ꮜ' + ]), ('Ꮝ', &['ꮝ']), ('Ꮞ', &['ꮞ']), ('Ꮟ', &['ꮟ']), ('Ꮠ', &['ꮠ' + ]), ('Ꮡ', &['ꮡ']), ('Ꮢ', &['ꮢ']), ('Ꮣ', &['ꮣ']), ('Ꮤ', &['ꮤ' + ]), ('Ꮥ', &['ꮥ']), ('Ꮦ', &['ꮦ']), ('Ꮧ', &['ꮧ']), ('Ꮨ', &['ꮨ' + ]), ('Ꮩ', &['ꮩ']), ('Ꮪ', &['ꮪ']), ('Ꮫ', &['ꮫ']), ('Ꮬ', &['ꮬ' + ]), ('Ꮭ', &['ꮭ']), ('Ꮮ', &['ꮮ']), ('Ꮯ', &['ꮯ']), ('Ꮰ', &['ꮰ' + ]), ('Ꮱ', &['ꮱ']), ('Ꮲ', &['ꮲ']), ('Ꮳ', &['ꮳ']), ('Ꮴ', &['ꮴ' + ]), ('Ꮵ', &['ꮵ']), ('Ꮶ', &['ꮶ']), ('Ꮷ', &['ꮷ']), ('Ꮸ', &['ꮸ' + ]), ('Ꮹ', &['ꮹ']), ('Ꮺ', &['ꮺ']), ('Ꮻ', &['ꮻ']), ('Ꮼ', &['ꮼ' + ]), ('Ꮽ', &['ꮽ']), ('Ꮾ', &['ꮾ']), ('Ꮿ', &['ꮿ']), ('Ᏸ', &['ᏸ' + ]), ('Ᏹ', &['ᏹ']), ('Ᏺ', &['ᏺ']), ('Ᏻ', &['ᏻ']), ('Ᏼ', &['ᏼ' + ]), ('Ᏽ', &['ᏽ']), ('ᏸ', &['Ᏸ']), ('ᏹ', &['Ᏹ']), ('ᏺ', &['Ᏺ' + ]), ('ᏻ', &['Ᏻ']), ('ᏼ', &['Ᏼ']), ('ᏽ', &['Ᏽ']), ('ᲀ', &['В', + 'в', ]), ('ᲁ', &['Д', 'д', ]), ('ᲂ', &['О', 'о', ]), ('ᲃ', &[ + 'С', 'с', ]), ('ᲄ', &['Т', 'т', 'ᲅ', ]), ('ᲅ', &['Т', 'т', + 
'ᲄ', ]), ('ᲆ', &['Ъ', 'ъ', ]), ('ᲇ', &['Ѣ', 'ѣ', ]), ('ᲈ', &[ + 'Ꙋ', 'ꙋ', ]), ('ᵹ', &['Ᵹ']), ('ᵽ', &['Ᵽ']), ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), ('Ḃ', &['ḃ']), ('ḃ', &['Ḃ']), ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), ('Ḇ', &['ḇ']), ('ḇ', &['Ḇ']), ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), ('Ḋ', &['ḋ']), ('ḋ', &['Ḋ']), ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), ('Ḏ', &['ḏ']), ('ḏ', &['Ḏ']), ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), ('Ḓ', &['ḓ']), ('ḓ', &['Ḓ']), ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), ('Ḗ', &['ḗ']), ('ḗ', &['Ḗ']), ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), ('Ḛ', &['ḛ']), ('ḛ', &['Ḛ']), ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), ('Ḟ', &['ḟ']), ('ḟ', &['Ḟ']), ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), ('Ḣ', &['ḣ']), ('ḣ', &['Ḣ']), ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), ('Ḧ', &['ḧ']), ('ḧ', &['Ḧ']), ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), ('Ḫ', &['ḫ']), ('ḫ', &['Ḫ']), ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), ('Ḯ', &['ḯ']), ('ḯ', &['Ḯ']), ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), ('Ḳ', &['ḳ']), ('ḳ', &['Ḳ']), ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), ('Ḷ', &['ḷ']), ('ḷ', &['Ḷ']), ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), ('Ḻ', &['ḻ']), ('ḻ', &['Ḻ']), ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), ('Ḿ', &['ḿ']), ('ḿ', &['Ḿ']), ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), ('Ṃ', &['ṃ']), ('ṃ', &['Ṃ']), ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), ('Ṇ', &['ṇ']), ('ṇ', &['Ṇ']), ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), ('Ṋ', &['ṋ']), ('ṋ', &['Ṋ']), ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), ('Ṏ', &['ṏ']), ('ṏ', &['Ṏ']), ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), ('Ṓ', &['ṓ']), ('ṓ', &['Ṓ']), ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), ('Ṗ', &['ṗ']), ('ṗ', &['Ṗ']), ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), ('Ṛ', &['ṛ']), ('ṛ', &['Ṛ']), ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), ('Ṟ', &['ṟ']), ('ṟ', &['Ṟ']), ('Ṡ', &['ṡ', + 'ẛ', ]), ('ṡ', &['Ṡ', 'ẛ', ]), ('Ṣ', &['ṣ']), ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), ('ṥ', &['Ṥ']), ('Ṧ', &['ṧ']), ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), ('ṩ', &['Ṩ']), ('Ṫ', &['ṫ']), ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), ('ṭ', &['Ṭ']), ('Ṯ', &['ṯ']), ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), ('ṱ', &['Ṱ']), ('Ṳ', &['ṳ']), ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), ('ṵ', &['Ṵ']), ('Ṷ', &['ṷ']), ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), ('ṹ', &['Ṹ']), ('Ṻ', &['ṻ']), ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), ('ṽ', &['Ṽ']), ('Ṿ', &['ṿ']), ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), ('ẁ', &['Ẁ']), ('Ẃ', &['ẃ']), ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), ('ẅ', &['Ẅ']), ('Ẇ', &['ẇ']), ('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), ('ẉ', &['Ẉ']), ('Ẋ', &['ẋ']), ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), ('ẍ', &['Ẍ']), ('Ẏ', &['ẏ']), ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), ('ẑ', &['Ẑ']), ('Ẓ', &['ẓ']), ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), ('ẕ', &['Ẕ']), ('ẛ', &['Ṡ', 'ṡ', ]), ('ẞ', &[ + 'ß']), ('Ạ', &['ạ']), ('ạ', &['Ạ']), ('Ả', &['ả']), ('ả', &[ + 'Ả']), ('Ấ', &['ấ']), ('ấ', &['Ấ']), ('Ầ', &['ầ']), ('ầ', &[ + 'Ầ']), ('Ẩ', &['ẩ']), ('ẩ', &['Ẩ']), ('Ẫ', &['ẫ']), ('ẫ', &[ + 'Ẫ']), ('Ậ', &['ậ']), ('ậ', &['Ậ']), ('Ắ', &['ắ']), ('ắ', &[ + 'Ắ']), ('Ằ', &['ằ']), ('ằ', &['Ằ']), ('Ẳ', &['ẳ']), ('ẳ', &[ + 'Ẳ']), ('Ẵ', &['ẵ']), ('ẵ', &['Ẵ']), ('Ặ', &['ặ']), ('ặ', &[ + 'Ặ']), ('Ẹ', &['ẹ']), ('ẹ', &['Ẹ']), ('Ẻ', &['ẻ']), ('ẻ', &[ + 'Ẻ']), ('Ẽ', &['ẽ']), ('ẽ', &['Ẽ']), ('Ế', &['ế']), ('ế', &[ + 'Ế']), ('Ề', &['ề']), ('ề', &['Ề']), ('Ể', &['ể']), ('ể', &[ + 'Ể']), ('Ễ', &['ễ']), ('ễ', &['Ễ']), ('Ệ', &['ệ']), ('ệ', &[ + 'Ệ']), ('Ỉ', &['ỉ']), ('ỉ', &['Ỉ']), ('Ị', &['ị']), ('ị', &[ + 'Ị']), ('Ọ', &['ọ']), ('ọ', &['Ọ']), ('Ỏ', &['ỏ']), ('ỏ', &[ + 'Ỏ']), ('Ố', &['ố']), ('ố', &['Ố']), ('Ồ', &['ồ']), ('ồ', &[ + 'Ồ']), ('Ổ', &['ổ']), ('ổ', &['Ổ']), ('Ỗ', &['ỗ']), ('ỗ', &[ + 'Ỗ']), ('Ộ', &['ộ']), ('ộ', &['Ộ']), ('Ớ', &['ớ']), ('ớ', &[ + 'Ớ']), ('Ờ', &['ờ']), ('ờ', &['Ờ']), ('Ở', &['ở']), ('ở', &[ + 'Ở']), ('Ỡ', &['ỡ']), ('ỡ', &['Ỡ']), ('Ợ', &['ợ']), ('ợ', &[ + 
'Ợ']), ('Ụ', &['ụ']), ('ụ', &['Ụ']), ('Ủ', &['ủ']), ('ủ', &[ + 'Ủ']), ('Ứ', &['ứ']), ('ứ', &['Ứ']), ('Ừ', &['ừ']), ('ừ', &[ + 'Ừ']), ('Ử', &['ử']), ('ử', &['Ử']), ('Ữ', &['ữ']), ('ữ', &[ + 'Ữ']), ('Ự', &['ự']), ('ự', &['Ự']), ('Ỳ', &['ỳ']), ('ỳ', &[ + 'Ỳ']), ('Ỵ', &['ỵ']), ('ỵ', &['Ỵ']), ('Ỷ', &['ỷ']), ('ỷ', &[ + 'Ỷ']), ('Ỹ', &['ỹ']), ('ỹ', &['Ỹ']), ('Ỻ', &['ỻ']), ('ỻ', &[ + 'Ỻ']), ('Ỽ', &['ỽ']), ('ỽ', &['Ỽ']), ('Ỿ', &['ỿ']), ('ỿ', &[ + 'Ỿ']), ('ἀ', &['Ἀ']), ('ἁ', &['Ἁ']), ('ἂ', &['Ἂ']), ('ἃ', &[ + 'Ἃ']), ('ἄ', &['Ἄ']), ('ἅ', &['Ἅ']), ('ἆ', &['Ἆ']), ('ἇ', &[ + 'Ἇ']), ('Ἀ', &['ἀ']), ('Ἁ', &['ἁ']), ('Ἂ', &['ἂ']), ('Ἃ', &[ + 'ἃ']), ('Ἄ', &['ἄ']), ('Ἅ', &['ἅ']), ('Ἆ', &['ἆ']), ('Ἇ', &[ + 'ἇ']), ('ἐ', &['Ἐ']), ('ἑ', &['Ἑ']), ('ἒ', &['Ἒ']), ('ἓ', &[ + 'Ἓ']), ('ἔ', &['Ἔ']), ('ἕ', &['Ἕ']), ('Ἐ', &['ἐ']), ('Ἑ', &[ + 'ἑ']), ('Ἒ', &['ἒ']), ('Ἓ', &['ἓ']), ('Ἔ', &['ἔ']), ('Ἕ', &[ + 'ἕ']), ('ἠ', &['Ἠ']), ('ἡ', &['Ἡ']), ('ἢ', &['Ἢ']), ('ἣ', &[ + 'Ἣ']), ('ἤ', &['Ἤ']), ('ἥ', &['Ἥ']), ('ἦ', &['Ἦ']), ('ἧ', &[ + 'Ἧ']), ('Ἠ', &['ἠ']), ('Ἡ', &['ἡ']), ('Ἢ', &['ἢ']), ('Ἣ', &[ + 'ἣ']), ('Ἤ', &['ἤ']), ('Ἥ', &['ἥ']), ('Ἦ', &['ἦ']), ('Ἧ', &[ + 'ἧ']), ('ἰ', &['Ἰ']), ('ἱ', &['Ἱ']), ('ἲ', &['Ἲ']), ('ἳ', &[ + 'Ἳ']), ('ἴ', &['Ἴ']), ('ἵ', &['Ἵ']), ('ἶ', &['Ἶ']), ('ἷ', &[ + 'Ἷ']), ('Ἰ', &['ἰ']), ('Ἱ', &['ἱ']), ('Ἲ', &['ἲ']), ('Ἳ', &[ + 'ἳ']), ('Ἴ', &['ἴ']), ('Ἵ', &['ἵ']), ('Ἶ', &['ἶ']), ('Ἷ', &[ + 'ἷ']), ('ὀ', &['Ὀ']), ('ὁ', &['Ὁ']), ('ὂ', &['Ὂ']), ('ὃ', &[ + 'Ὃ']), ('ὄ', &['Ὄ']), ('ὅ', &['Ὅ']), ('Ὀ', &['ὀ']), ('Ὁ', &[ + 'ὁ']), ('Ὂ', &['ὂ']), ('Ὃ', &['ὃ']), ('Ὄ', &['ὄ']), ('Ὅ', &[ + 'ὅ']), ('ὑ', &['Ὑ']), ('ὓ', &['Ὓ']), ('ὕ', &['Ὕ']), ('ὗ', &[ + 'Ὗ']), ('Ὑ', &['ὑ']), ('Ὓ', &['ὓ']), ('Ὕ', &['ὕ']), ('Ὗ', &[ + 'ὗ']), ('ὠ', &['Ὠ']), ('ὡ', &['Ὡ']), ('ὢ', &['Ὢ']), ('ὣ', &[ + 'Ὣ']), ('ὤ', &['Ὤ']), ('ὥ', &['Ὥ']), ('ὦ', &['Ὦ']), ('ὧ', &[ + 'Ὧ']), ('Ὠ', &['ὠ']), ('Ὡ', &['ὡ']), ('Ὢ', &['ὢ']), ('Ὣ', &[ + 'ὣ']), ('Ὤ', &['ὤ']), ('Ὥ', &['ὥ']), ('Ὦ', &['ὦ']), ('Ὧ', &[ + 'ὧ']), ('ὰ', &['Ὰ']), ('ά', &['Ά']), ('ὲ', &['Ὲ']), ('έ', &[ + 'Έ']), ('ὴ', &['Ὴ']), ('ή', &['Ή']), ('ὶ', &['Ὶ']), ('ί', &[ + 'Ί']), ('ὸ', &['Ὸ']), ('ό', &['Ό']), ('ὺ', &['Ὺ']), ('ύ', &[ + 'Ύ']), ('ὼ', &['Ὼ']), ('ώ', &['Ώ']), ('ᾀ', &['ᾈ']), ('ᾁ', &[ + 'ᾉ']), ('ᾂ', &['ᾊ']), ('ᾃ', &['ᾋ']), ('ᾄ', &['ᾌ']), ('ᾅ', &[ + 'ᾍ']), ('ᾆ', &['ᾎ']), ('ᾇ', &['ᾏ']), ('ᾈ', &['ᾀ']), ('ᾉ', &[ + 'ᾁ']), ('ᾊ', &['ᾂ']), ('ᾋ', &['ᾃ']), ('ᾌ', &['ᾄ']), ('ᾍ', &[ + 'ᾅ']), ('ᾎ', &['ᾆ']), ('ᾏ', &['ᾇ']), ('ᾐ', &['ᾘ']), ('ᾑ', &[ + 'ᾙ']), ('ᾒ', &['ᾚ']), ('ᾓ', &['ᾛ']), ('ᾔ', &['ᾜ']), ('ᾕ', &[ + 'ᾝ']), ('ᾖ', &['ᾞ']), ('ᾗ', &['ᾟ']), ('ᾘ', &['ᾐ']), ('ᾙ', &[ + 'ᾑ']), ('ᾚ', &['ᾒ']), ('ᾛ', &['ᾓ']), ('ᾜ', &['ᾔ']), ('ᾝ', &[ + 'ᾕ']), ('ᾞ', &['ᾖ']), ('ᾟ', &['ᾗ']), ('ᾠ', &['ᾨ']), ('ᾡ', &[ + 'ᾩ']), ('ᾢ', &['ᾪ']), ('ᾣ', &['ᾫ']), ('ᾤ', &['ᾬ']), ('ᾥ', &[ + 'ᾭ']), ('ᾦ', &['ᾮ']), ('ᾧ', &['ᾯ']), ('ᾨ', &['ᾠ']), ('ᾩ', &[ + 'ᾡ']), ('ᾪ', &['ᾢ']), ('ᾫ', &['ᾣ']), ('ᾬ', &['ᾤ']), ('ᾭ', &[ + 'ᾥ']), ('ᾮ', &['ᾦ']), ('ᾯ', &['ᾧ']), ('ᾰ', &['Ᾰ']), ('ᾱ', &[ + 'Ᾱ']), ('ᾳ', &['ᾼ']), ('Ᾰ', &['ᾰ']), ('Ᾱ', &['ᾱ']), ('Ὰ', &[ + 'ὰ']), ('Ά', &['ά']), ('ᾼ', &['ᾳ']), ('ι', &['ͅ', 'Ι', 'ι', + ]), ('ῃ', &['ῌ']), ('Ὲ', &['ὲ']), ('Έ', &['έ']), ('Ὴ', &['ὴ' + ]), ('Ή', &['ή']), ('ῌ', &['ῃ']), ('ῐ', &['Ῐ']), ('ῑ', &['Ῑ' + ]), ('Ῐ', &['ῐ']), ('Ῑ', &['ῑ']), ('Ὶ', &['ὶ']), ('Ί', &['ί' + ]), ('ῠ', &['Ῠ']), ('ῡ', &['Ῡ']), ('ῥ', &['Ῥ']), ('Ῠ', &['ῠ' + ]), ('Ῡ', &['ῡ']), ('Ὺ', &['ὺ']), ('Ύ', &['ύ']), ('Ῥ', &['ῥ' + ]), ('ῳ', &['ῼ']), ('Ὸ', &['ὸ']), ('Ό', &['ό']), ('Ὼ', &['ὼ' + ]), ('Ώ', &['ώ']), ('ῼ', &['ῳ']), ('Ω', &['Ω', 'ω', ]), + ('K', &['K', 'k', ]), ('Å', 
&['Å', 'å', ]), ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), ('Ⅰ', &['ⅰ']), ('Ⅱ', &['ⅱ']), ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), ('Ⅴ', &['ⅴ']), ('Ⅵ', &['ⅵ']), ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), ('Ⅸ', &['ⅸ']), ('Ⅹ', &['ⅹ']), ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), ('Ⅼ', &['ⅼ']), ('Ⅽ', &['ⅽ']), ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), ('ⅰ', &['Ⅰ']), ('ⅱ', &['Ⅱ']), ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), ('ⅴ', &['Ⅴ']), ('ⅵ', &['Ⅵ']), ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), ('ⅸ', &['Ⅸ']), ('ⅹ', &['Ⅹ']), ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), ('ⅼ', &['Ⅼ']), ('ⅽ', &['Ⅽ']), ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), ('Ↄ', &['ↄ']), ('ↄ', &['Ↄ']), ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), ('Ⓒ', &['ⓒ']), ('Ⓓ', &['ⓓ']), ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), ('Ⓖ', &['ⓖ']), ('Ⓗ', &['ⓗ']), ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), ('Ⓚ', &['ⓚ']), ('Ⓛ', &['ⓛ']), ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), ('Ⓞ', &['ⓞ']), ('Ⓟ', &['ⓟ']), ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), ('Ⓢ', &['ⓢ']), ('Ⓣ', &['ⓣ']), ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), ('Ⓦ', &['ⓦ']), ('Ⓧ', &['ⓧ']), ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), ('ⓐ', &['Ⓐ']), ('ⓑ', &['Ⓑ']), ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), ('ⓔ', &['Ⓔ']), ('ⓕ', &['Ⓕ']), ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), ('ⓘ', &['Ⓘ']), ('ⓙ', &['Ⓙ']), ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), ('ⓜ', &['Ⓜ']), ('ⓝ', &['Ⓝ']), ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), ('ⓠ', &['Ⓠ']), ('ⓡ', &['Ⓡ']), ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), ('ⓤ', &['Ⓤ']), ('ⓥ', &['Ⓥ']), ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), ('ⓨ', &['Ⓨ']), ('ⓩ', &['Ⓩ']), ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), ('Ⰲ', &['ⰲ']), ('Ⰳ', &['ⰳ']), ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), ('Ⰶ', &['ⰶ']), ('Ⰷ', &['ⰷ']), ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), ('Ⰺ', &['ⰺ']), ('Ⰻ', &['ⰻ']), ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), ('Ⰾ', &['ⰾ']), ('Ⰿ', &['ⰿ']), ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), ('Ⱂ', &['ⱂ']), ('Ⱃ', &['ⱃ']), ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), ('Ⱆ', &['ⱆ']), ('Ⱇ', &['ⱇ']), ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), ('Ⱊ', &['ⱊ']), ('Ⱋ', &['ⱋ']), ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), ('Ⱎ', &['ⱎ']), ('Ⱏ', &['ⱏ']), ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), ('Ⱒ', &['ⱒ']), ('Ⱓ', &['ⱓ']), ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), ('Ⱖ', &['ⱖ']), ('Ⱗ', &['ⱗ']), ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), ('Ⱚ', &['ⱚ']), ('Ⱛ', &['ⱛ']), ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), ('Ⱞ', &['ⱞ']), ('ⰰ', &['Ⰰ']), ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), ('ⰳ', &['Ⰳ']), ('ⰴ', &['Ⰴ']), ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), ('ⰷ', &['Ⰷ']), ('ⰸ', &['Ⰸ']), ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), ('ⰻ', &['Ⰻ']), ('ⰼ', &['Ⰼ']), ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), ('ⰿ', &['Ⰿ']), ('ⱀ', &['Ⱀ']), ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), ('ⱃ', &['Ⱃ']), ('ⱄ', &['Ⱄ']), ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), ('ⱇ', &['Ⱇ']), ('ⱈ', &['Ⱈ']), ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), ('ⱋ', &['Ⱋ']), ('ⱌ', &['Ⱌ']), ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), ('ⱏ', &['Ⱏ']), ('ⱐ', &['Ⱐ']), ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), ('ⱓ', &['Ⱓ']), ('ⱔ', &['Ⱔ']), ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), ('ⱗ', &['Ⱗ']), ('ⱘ', &['Ⱘ']), ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), ('ⱛ', &['Ⱛ']), ('ⱜ', &['Ⱜ']), ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), ('Ⱡ', &['ⱡ']), ('ⱡ', &['Ⱡ']), ('Ɫ', &['ɫ']), + ('Ᵽ', &['ᵽ']), ('Ɽ', &['ɽ']), ('ⱥ', &['Ⱥ']), ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), ('ⱨ', &['Ⱨ']), ('Ⱪ', &['ⱪ']), ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), ('ⱬ', &['Ⱬ']), ('Ɑ', &['ɑ']), ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), ('Ɒ', &['ɒ']), ('Ⱳ', &['ⱳ']), ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), ('ⱶ', &['Ⱶ']), ('Ȿ', &['ȿ']), ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), ('ⲁ', &['Ⲁ']), ('Ⲃ', &['ⲃ']), ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), ('ⲅ', &['Ⲅ']), ('Ⲇ', &['ⲇ']), ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), ('ⲉ', &['Ⲉ']), ('Ⲋ', &['ⲋ']), ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), ('ⲍ', &['Ⲍ']), ('Ⲏ', &['ⲏ']), ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), ('ⲑ', &['Ⲑ']), ('Ⲓ', &['ⲓ']), ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), ('ⲕ', &['Ⲕ']), ('Ⲗ', &['ⲗ']), 
('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), ('ⲙ', &['Ⲙ']), ('Ⲛ', &['ⲛ']), ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), ('ⲝ', &['Ⲝ']), ('Ⲟ', &['ⲟ']), ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), ('ⲡ', &['Ⲡ']), ('Ⲣ', &['ⲣ']), ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), ('ⲥ', &['Ⲥ']), ('Ⲧ', &['ⲧ']), ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), ('ⲩ', &['Ⲩ']), ('Ⲫ', &['ⲫ']), ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), ('ⲭ', &['Ⲭ']), ('Ⲯ', &['ⲯ']), ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), ('ⲱ', &['Ⲱ']), ('Ⲳ', &['ⲳ']), ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), ('ⲵ', &['Ⲵ']), ('Ⲷ', &['ⲷ']), ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), ('ⲹ', &['Ⲹ']), ('Ⲻ', &['ⲻ']), ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), ('ⲽ', &['Ⲽ']), ('Ⲿ', &['ⲿ']), ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), ('ⳁ', &['Ⳁ']), ('Ⳃ', &['ⳃ']), ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), ('ⳅ', &['Ⳅ']), ('Ⳇ', &['ⳇ']), ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), ('ⳉ', &['Ⳉ']), ('Ⳋ', &['ⳋ']), ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), ('ⳍ', &['Ⳍ']), ('Ⳏ', &['ⳏ']), ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), ('ⳑ', &['Ⳑ']), ('Ⳓ', &['ⳓ']), ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), ('ⳕ', &['Ⳕ']), ('Ⳗ', &['ⳗ']), ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), ('ⳙ', &['Ⳙ']), ('Ⳛ', &['ⳛ']), ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), ('ⳝ', &['Ⳝ']), ('Ⳟ', &['ⳟ']), ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), ('ⳡ', &['Ⳡ']), ('Ⳣ', &['ⳣ']), ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), ('ⳬ', &['Ⳬ']), ('Ⳮ', &['ⳮ']), ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), ('ⳳ', &['Ⳳ']), ('ⴀ', &['Ⴀ']), ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), ('ⴃ', &['Ⴃ']), ('ⴄ', &['Ⴄ']), ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), ('ⴇ', &['Ⴇ']), ('ⴈ', &['Ⴈ']), ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), ('ⴋ', &['Ⴋ']), ('ⴌ', &['Ⴌ']), ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), ('ⴏ', &['Ⴏ']), ('ⴐ', &['Ⴐ']), ('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), ('ⴓ', &['Ⴓ']), ('ⴔ', &['Ⴔ']), ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), ('ⴗ', &['Ⴗ']), ('ⴘ', &['Ⴘ']), ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), ('ⴛ', &['Ⴛ']), ('ⴜ', &['Ⴜ']), ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), ('ⴟ', &['Ⴟ']), ('ⴠ', &['Ⴠ']), ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), ('ⴣ', &['Ⴣ']), ('ⴤ', &['Ⴤ']), ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), ('ⴭ', &['Ⴭ']), ('Ꙁ', &['ꙁ']), ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), ('ꙃ', &['Ꙃ']), ('Ꙅ', &['ꙅ']), ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), ('ꙇ', &['Ꙇ']), ('Ꙉ', &['ꙉ']), ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ', ]), ('ꙋ', &['ᲈ', 'Ꙋ', ]), ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), ('Ꙏ', &['ꙏ']), ('ꙏ', &['Ꙏ']), ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), ('Ꙓ', &['ꙓ']), ('ꙓ', &['Ꙓ']), ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), ('Ꙗ', &['ꙗ']), ('ꙗ', &['Ꙗ']), ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), ('Ꙛ', &['ꙛ']), ('ꙛ', &['Ꙛ']), ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), ('Ꙟ', &['ꙟ']), ('ꙟ', &['Ꙟ']), ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), ('Ꙣ', &['ꙣ']), ('ꙣ', &['Ꙣ']), ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), ('Ꙧ', &['ꙧ']), ('ꙧ', &['Ꙧ']), ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), ('Ꙫ', &['ꙫ']), ('ꙫ', &['Ꙫ']), ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), ('Ꚁ', &['ꚁ']), ('ꚁ', &['Ꚁ']), ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), ('Ꚅ', &['ꚅ']), ('ꚅ', &['Ꚅ']), ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), ('Ꚉ', &['ꚉ']), ('ꚉ', &['Ꚉ']), ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), ('Ꚍ', &['ꚍ']), ('ꚍ', &['Ꚍ']), ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), ('Ꚑ', &['ꚑ']), ('ꚑ', &['Ꚑ']), ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), ('Ꚕ', &['ꚕ']), ('ꚕ', &['Ꚕ']), ('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), ('Ꚙ', &['ꚙ']), ('ꚙ', &['Ꚙ']), ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), ('Ꜣ', &['ꜣ']), ('ꜣ', &['Ꜣ']), ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), ('Ꜧ', &['ꜧ']), ('ꜧ', &['Ꜧ']), ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), ('Ꜫ', &['ꜫ']), ('ꜫ', &['Ꜫ']), ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), ('Ꜯ', &['ꜯ']), ('ꜯ', &['Ꜯ']), ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), ('Ꜵ', &['ꜵ']), ('ꜵ', &['Ꜵ']), ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), ('Ꜹ', &['ꜹ']), ('ꜹ', &['Ꜹ']), ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), ('Ꜽ', &['ꜽ']), ('ꜽ', &['Ꜽ']), ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), ('Ꝁ', &['ꝁ']), ('ꝁ', &['Ꝁ']), ('Ꝃ', &['ꝃ']), + 
('ꝃ', &['Ꝃ']), ('Ꝅ', &['ꝅ']), ('ꝅ', &['Ꝅ']), ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), ('Ꝉ', &['ꝉ']), ('ꝉ', &['Ꝉ']), ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), ('Ꝍ', &['ꝍ']), ('ꝍ', &['Ꝍ']), ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), ('Ꝑ', &['ꝑ']), ('ꝑ', &['Ꝑ']), ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), ('Ꝕ', &['ꝕ']), ('ꝕ', &['Ꝕ']), ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), ('Ꝙ', &['ꝙ']), ('ꝙ', &['Ꝙ']), ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), ('Ꝝ', &['ꝝ']), ('ꝝ', &['Ꝝ']), ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), ('Ꝡ', &['ꝡ']), ('ꝡ', &['Ꝡ']), ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), ('Ꝥ', &['ꝥ']), ('ꝥ', &['Ꝥ']), ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), ('Ꝩ', &['ꝩ']), ('ꝩ', &['Ꝩ']), ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), ('Ꝭ', &['ꝭ']), ('ꝭ', &['Ꝭ']), ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), ('Ꝺ', &['ꝺ']), ('ꝺ', &['Ꝺ']), ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), ('Ᵹ', &['ᵹ']), ('Ꝿ', &['ꝿ']), ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), ('ꞁ', &['Ꞁ']), ('Ꞃ', &['ꞃ']), ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), ('ꞅ', &['Ꞅ']), ('Ꞇ', &['ꞇ']), ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), ('ꞌ', &['Ꞌ']), ('Ɥ', &['ɥ']), ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), ('Ꞓ', &['ꞓ']), ('ꞓ', &['Ꞓ']), ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), ('Ꞙ', &['ꞙ']), ('ꞙ', &['Ꞙ']), ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), ('Ꞝ', &['ꞝ']), ('ꞝ', &['Ꞝ']), ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), ('Ꞡ', &['ꞡ']), ('ꞡ', &['Ꞡ']), ('Ꞣ', &['ꞣ']), + ('ꞣ', &['Ꞣ']), ('Ꞥ', &['ꞥ']), ('ꞥ', &['Ꞥ']), ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), ('Ꞩ', &['ꞩ']), ('ꞩ', &['Ꞩ']), ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), ('Ɡ', &['ɡ']), ('Ɬ', &['ɬ']), ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), ('Ʇ', &['ʇ']), ('Ʝ', &['ʝ']), ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), ('ꞵ', &['Ꞵ']), ('Ꞷ', &['ꞷ']), ('ꞷ', &['Ꞷ']), + ('ꭓ', &['Ꭓ']), ('ꭰ', &['Ꭰ']), ('ꭱ', &['Ꭱ']), ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), ('ꭴ', &['Ꭴ']), ('ꭵ', &['Ꭵ']), ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), ('ꭸ', &['Ꭸ']), ('ꭹ', &['Ꭹ']), ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), ('ꭼ', &['Ꭼ']), ('ꭽ', &['Ꭽ']), ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), ('ꮀ', &['Ꮀ']), ('ꮁ', &['Ꮁ']), ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), ('ꮄ', &['Ꮄ']), ('ꮅ', &['Ꮅ']), ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), ('ꮈ', &['Ꮈ']), ('ꮉ', &['Ꮉ']), ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), ('ꮌ', &['Ꮌ']), ('ꮍ', &['Ꮍ']), ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), ('ꮐ', &['Ꮐ']), ('ꮑ', &['Ꮑ']), ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), ('ꮔ', &['Ꮔ']), ('ꮕ', &['Ꮕ']), ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), ('ꮘ', &['Ꮘ']), ('ꮙ', &['Ꮙ']), ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), ('ꮜ', &['Ꮜ']), ('ꮝ', &['Ꮝ']), ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), ('ꮠ', &['Ꮠ']), ('ꮡ', &['Ꮡ']), ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), ('ꮤ', &['Ꮤ']), ('ꮥ', &['Ꮥ']), ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), ('ꮨ', &['Ꮨ']), ('ꮩ', &['Ꮩ']), ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), ('ꮬ', &['Ꮬ']), ('ꮭ', &['Ꮭ']), ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), ('ꮰ', &['Ꮰ']), ('ꮱ', &['Ꮱ']), ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), ('ꮴ', &['Ꮴ']), ('ꮵ', &['Ꮵ']), ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), ('ꮸ', &['Ꮸ']), ('ꮹ', &['Ꮹ']), ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), ('ꮼ', &['Ꮼ']), ('ꮽ', &['Ꮽ']), ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), ('A', &['a']), ('B', &['b']), ('C', &['c']), + ('D', &['d']), ('E', &['e']), ('F', &['f']), ('G', &['g']), + ('H', &['h']), ('I', &['i']), ('J', &['j']), ('K', &['k']), + ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &['o']), + ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s']), + ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), + ('X', &['x']), ('Y', &['y']), ('Z', &['z']), ('a', &['A']), + ('b', &['B']), ('c', &['C']), ('d', &['D']), ('e', &['E']), + ('f', &['F']), ('g', &['G']), ('h', &['H']), ('i', &['I']), + ('j', &['J']), ('k', &['K']), ('l', &['L']), ('m', &['M']), + ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), + ('r', &['R']), ('s', &['S']), ('t', &['T']), ('u', &['U']), + ('v', &['V']), ('w', 
&['W']), ('x', &['X']), ('y', &['Y']), + ('z', &['Z']), ('𐐀', &['𐐨']), ('𐐁', &['𐐩']), ('𐐂', &[ + '𐐪']), ('𐐃', &['𐐫']), ('𐐄', &['𐐬']), ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), ('𐐇', &['𐐯']), ('𐐈', &['𐐰']), ('𐐉', &[ + '𐐱']), ('𐐊', &['𐐲']), ('𐐋', &['𐐳']), ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), ('𐐎', &['𐐶']), ('𐐏', &['𐐷']), ('𐐐', &[ + '𐐸']), ('𐐑', &['𐐹']), ('𐐒', &['𐐺']), ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), ('𐐕', &['𐐽']), ('𐐖', &['𐐾']), ('𐐗', &[ + '𐐿']), ('𐐘', &['𐑀']), ('𐐙', &['𐑁']), ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), ('𐐜', &['𐑄']), ('𐐝', &['𐑅']), ('𐐞', &[ + '𐑆']), ('𐐟', &['𐑇']), ('𐐠', &['𐑈']), ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), ('𐐣', &['𐑋']), ('𐐤', &['𐑌']), ('𐐥', &[ + '𐑍']), ('𐐦', &['𐑎']), ('𐐧', &['𐑏']), ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), ('𐐪', &['𐐂']), ('𐐫', &['𐐃']), ('𐐬', &[ + '𐐄']), ('𐐭', &['𐐅']), ('𐐮', &['𐐆']), ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), ('𐐱', &['𐐉']), ('𐐲', &['𐐊']), ('𐐳', &[ + '𐐋']), ('𐐴', &['𐐌']), ('𐐵', &['𐐍']), ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), ('𐐸', &['𐐐']), ('𐐹', &['𐐑']), ('𐐺', &[ + '𐐒']), ('𐐻', &['𐐓']), ('𐐼', &['𐐔']), ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), ('𐐿', &['𐐗']), ('𐑀', &['𐐘']), ('𐑁', &[ + '𐐙']), ('𐑂', &['𐐚']), ('𐑃', &['𐐛']), ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), ('𐑆', &['𐐞']), ('𐑇', &['𐐟']), ('𐑈', &[ + '𐐠']), ('𐑉', &['𐐡']), ('𐑊', &['𐐢']), ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), ('𐑍', &['𐐥']), ('𐑎', &['𐐦']), ('𐑏', &[ + '𐐧']), ('𐒰', &['𐓘']), ('𐒱', &['𐓙']), ('𐒲', &['𐓚']), + ('𐒳', &['𐓛']), ('𐒴', &['𐓜']), ('𐒵', &['𐓝']), ('𐒶', &[ + '𐓞']), ('𐒷', &['𐓟']), ('𐒸', &['𐓠']), ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), ('𐒻', &['𐓣']), ('𐒼', &['𐓤']), ('𐒽', &[ + '𐓥']), ('𐒾', &['𐓦']), ('𐒿', &['𐓧']), ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), ('𐓂', &['𐓪']), ('𐓃', &['𐓫']), ('𐓄', &[ + '𐓬']), ('𐓅', &['𐓭']), ('𐓆', &['𐓮']), ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), ('𐓉', &['𐓱']), ('𐓊', &['𐓲']), ('𐓋', &[ + '𐓳']), ('𐓌', &['𐓴']), ('𐓍', &['𐓵']), ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), ('𐓐', &['𐓸']), ('𐓑', &['𐓹']), ('𐓒', &[ + '𐓺']), ('𐓓', &['𐓻']), ('𐓘', &['𐒰']), ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), ('𐓛', &['𐒳']), ('𐓜', &['𐒴']), ('𐓝', &[ + '𐒵']), ('𐓞', &['𐒶']), ('𐓟', &['𐒷']), ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), ('𐓢', &['𐒺']), ('𐓣', &['𐒻']), ('𐓤', &[ + '𐒼']), ('𐓥', &['𐒽']), ('𐓦', &['𐒾']), ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), ('𐓩', &['𐓁']), ('𐓪', &['𐓂']), ('𐓫', &[ + '𐓃']), ('𐓬', &['𐓄']), ('𐓭', &['𐓅']), ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), ('𐓰', &['𐓈']), ('𐓱', &['𐓉']), ('𐓲', &[ + '𐓊']), ('𐓳', &['𐓋']), ('𐓴', &['𐓌']), ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), ('𐓷', &['𐓏']), ('𐓸', &['𐓐']), ('𐓹', &[ + '𐓑']), ('𐓺', &['𐓒']), ('𐓻', &['𐓓']), ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), ('𐲂', &['𐳂']), ('𐲃', &['𐳃']), ('𐲄', &[ + '𐳄']), ('𐲅', &['𐳅']), ('𐲆', &['𐳆']), ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), ('𐲉', &['𐳉']), ('𐲊', &['𐳊']), ('𐲋', &[ + '𐳋']), ('𐲌', &['𐳌']), ('𐲍', &['𐳍']), ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), ('𐲐', &['𐳐']), ('𐲑', &['𐳑']), ('𐲒', &[ + '𐳒']), ('𐲓', &['𐳓']), ('𐲔', &['𐳔']), ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), ('𐲗', &['𐳗']), ('𐲘', &['𐳘']), ('𐲙', &[ + '𐳙']), ('𐲚', &['𐳚']), ('𐲛', &['𐳛']), ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), ('𐲞', &['𐳞']), ('𐲟', &['𐳟']), ('𐲠', &[ + '𐳠']), ('𐲡', &['𐳡']), ('𐲢', &['𐳢']), ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), ('𐲥', &['𐳥']), ('𐲦', &['𐳦']), ('𐲧', &[ + '𐳧']), ('𐲨', &['𐳨']), ('𐲩', &['𐳩']), ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), ('𐲬', &['𐳬']), ('𐲭', &['𐳭']), ('𐲮', &[ + '𐳮']), ('𐲯', &['𐳯']), ('𐲰', &['𐳰']), ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), ('𐳀', &['𐲀']), ('𐳁', &['𐲁']), ('𐳂', &[ + '𐲂']), ('𐳃', &['𐲃']), ('𐳄', &['𐲄']), ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), ('𐳇', &['𐲇']), ('𐳈', &['𐲈']), ('𐳉', &[ + '𐲉']), ('𐳊', &['𐲊']), ('𐳋', &['𐲋']), ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), ('𐳎', &['𐲎']), ('𐳏', &['𐲏']), ('𐳐', &[ + '𐲐']), ('𐳑', &['𐲑']), ('𐳒', &['𐲒']), ('𐳓', 
&['𐲓']), + ('𐳔', &['𐲔']), ('𐳕', &['𐲕']), ('𐳖', &['𐲖']), ('𐳗', &[ + '𐲗']), ('𐳘', &['𐲘']), ('𐳙', &['𐲙']), ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), ('𐳜', &['𐲜']), ('𐳝', &['𐲝']), ('𐳞', &[ + '𐲞']), ('𐳟', &['𐲟']), ('𐳠', &['𐲠']), ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), ('𐳣', &['𐲣']), ('𐳤', &['𐲤']), ('𐳥', &[ + '𐲥']), ('𐳦', &['𐲦']), ('𐳧', &['𐲧']), ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), ('𐳪', &['𐲪']), ('𐳫', &['𐲫']), ('𐳬', &[ + '𐲬']), ('𐳭', &['𐲭']), ('𐳮', &['𐲮']), ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), ('𐳱', &['𐲱']), ('𐳲', &['𐲲']), ('𑢠', &[ + '𑣀']), ('𑢡', &['𑣁']), ('𑢢', &['𑣂']), ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), ('𑢥', &['𑣅']), ('𑢦', &['𑣆']), ('𑢧', &[ + '𑣇']), ('𑢨', &['𑣈']), ('𑢩', &['𑣉']), ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), ('𑢬', &['𑣌']), ('𑢭', &['𑣍']), ('𑢮', &[ + '𑣎']), ('𑢯', &['𑣏']), ('𑢰', &['𑣐']), ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), ('𑢳', &['𑣓']), ('𑢴', &['𑣔']), ('𑢵', &[ + '𑣕']), ('𑢶', &['𑣖']), ('𑢷', &['𑣗']), ('𑢸', &['𑣘']), + ('𑢹', &['𑣙']), ('𑢺', &['𑣚']), ('𑢻', &['𑣛']), ('𑢼', &[ + '𑣜']), ('𑢽', &['𑣝']), ('𑢾', &['𑣞']), ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), ('𑣁', &['𑢡']), ('𑣂', &['𑢢']), ('𑣃', &[ + '𑢣']), ('𑣄', &['𑢤']), ('𑣅', &['𑢥']), ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), ('𑣈', &['𑢨']), ('𑣉', &['𑢩']), ('𑣊', &[ + '𑢪']), ('𑣋', &['𑢫']), ('𑣌', &['𑢬']), ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), ('𑣏', &['𑢯']), ('𑣐', &['𑢰']), ('𑣑', &[ + '𑢱']), ('𑣒', &['𑢲']), ('𑣓', &['𑢳']), ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), ('𑣖', &['𑢶']), ('𑣗', &['𑢷']), ('𑣘', &[ + '𑢸']), ('𑣙', &['𑢹']), ('𑣚', &['𑢺']), ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), ('𑣝', &['𑢽']), ('𑣞', &['𑢾']), ('𑣟', &[ + '𑢿']), ('𞤀', &['𞤢']), ('𞤁', &['𞤣']), ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), ('𞤄', &['𞤦']), ('𞤅', &['𞤧']), ('𞤆', &[ + '𞤨']), ('𞤇', &['𞤩']), ('𞤈', &['𞤪']), ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), ('𞤋', &['𞤭']), ('𞤌', &['𞤮']), ('𞤍', &[ + '𞤯']), ('𞤎', &['𞤰']), ('𞤏', &['𞤱']), ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), ('𞤒', &['𞤴']), ('𞤓', &['𞤵']), ('𞤔', &[ + '𞤶']), ('𞤕', &['𞤷']), ('𞤖', &['𞤸']), ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), ('𞤙', &['𞤻']), ('𞤚', &['𞤼']), ('𞤛', &[ + '𞤽']), ('𞤜', &['𞤾']), ('𞤝', &['𞤿']), ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), ('𞤠', &['𞥂']), ('𞤡', &['𞥃']), ('𞤢', &[ + '𞤀']), ('𞤣', &['𞤁']), ('𞤤', &['𞤂']), ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), ('𞤧', &['𞤅']), ('𞤨', &['𞤆']), ('𞤩', &[ + '𞤇']), ('𞤪', &['𞤈']), ('𞤫', &['𞤉']), ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), ('𞤮', &['𞤌']), ('𞤯', &['𞤍']), ('𞤰', &[ + '𞤎']), ('𞤱', &['𞤏']), ('𞤲', &['𞤐']), ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), ('𞤵', &['𞤓']), ('𞤶', &['𞤔']), ('𞤷', &[ + '𞤕']), ('𞤸', &['𞤖']), ('𞤹', &['𞤗']), ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), ('𞤼', &['𞤚']), ('𞤽', &['𞤛']), ('𞤾', &[ + '𞤜']), ('𞤿', &['𞤝']), ('𞥀', &['𞤞']), ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), ('𞥃', &['𞤡']), +]; diff --git a/regex-syntax-2/src/unicode_tables/general_category.rs b/regex-syntax-2/src/unicode_tables/general_category.rs new file mode 100644 index 0000000000..451a0b27c7 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/general_category.rs @@ -0,0 +1,1844 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category tmp/ucd-10.0.0/ --chars --exclude surrogate +// +// ucd-generate is available on crates.io. 
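A quick orientation for these generated tables may be useful before the general-category data: both the simple case-folding table that closes above (entries of shape `(char, &[char])`) and the `(char, char)` range tables that follow are emitted by ucd-generate in sorted order, so lookups can presumably be done with a binary search rather than a linear scan. The sketch below is illustrative only; the actual lookup code added by this patch lives in regex-syntax-2/src/unicode.rs, and the constants and helpers here (UPPERCASE_ASCII, FOLD_ASCII, contains, simple_fold) are invented names for the example, not part of the patch.

    use std::cmp::Ordering;

    // Same shape as the general-category tables below: sorted, inclusive ranges.
    const UPPERCASE_ASCII: &'static [(char, char)] = &[('A', 'Z')];

    // Same shape as CASE_FOLDING_SIMPLE above: a char mapped to its simple folds.
    const FOLD_ASCII: &'static [(char, &'static [char])] =
        &[('A', &['a']), ('a', &['A'])];

    /// Returns true if `c` falls inside one of the sorted, inclusive ranges.
    fn contains(ranges: &[(char, char)], c: char) -> bool {
        ranges.binary_search_by(|&(start, end)| {
            if end < c {
                Ordering::Less
            } else if start > c {
                Ordering::Greater
            } else {
                Ordering::Equal
            }
        }).is_ok()
    }

    /// Returns the simple case-folding equivalents of `c`, if any.
    fn simple_fold(table: &[(char, &'static [char])], c: char) -> &'static [char] {
        table.binary_search_by_key(&c, |&(k, _)| k)
            .map(|i| table[i].1)
            .unwrap_or(&[])
    }

    fn main() {
        assert!(contains(UPPERCASE_ASCII, 'Q'));
        assert!(!contains(UPPERCASE_ASCII, 'q'));
        assert_eq!(simple_fold(FOLD_ASCII, 'A'), &['a'][..]);
    }

Keeping the tables as flat, sorted slices means lookups stay O(log n) with no auxiliary data structures, at the cost of the binary size increase noted at the top of this patch.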
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK), + ("Symbol", SYMBOL), ("Titlecase_Letter", TITLECASE_LETTER), + ("Unassigned", UNASSIGNED), ("Uppercase_Letter", UPPERCASE_LETTER), +]; + +pub const CASED_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), + ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), ('ʕ', 'ʯ'), ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶚ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⱻ'), ('Ȿ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꝯ'), + ('ꝱ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟺ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭥ'), ('ꭰ', 'ꮿ'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), + ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), + ('𞤀', '𞥃'), +]; + +pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ + (')', ')'), (']', ']'), ('}', '}'), ('༻', '༻'), ('༽', '༽'), + ('᚜', '᚜'), ('⁆', '⁆'), ('⁾', '⁾'), ('₎', '₎'), + ('⌉', '⌉'), ('⌋', '⌋'), ('〉', '〉'), ('❩', '❩'), + ('❫', '❫'), ('❭', '❭'), ('❯', '❯'), ('❱', '❱'), + ('❳', '❳'), ('❵', '❵'), ('⟆', '⟆'), ('⟧', '⟧'), + ('⟩', '⟩'), ('⟫', '⟫'), ('⟭', '⟭'), ('⟯', '⟯'), + ('⦄', '⦄'), ('⦆', '⦆'), ('⦈', '⦈'), ('⦊', '⦊'), + ('⦌', '⦌'), ('⦎', '⦎'), ('⦐', '⦐'), ('⦒', '⦒'), + ('⦔', '⦔'), ('⦖', '⦖'), ('⦘', '⦘'), ('⧙', '⧙'), + ('⧛', '⧛'), ('⧽', '⧽'), ('⸣', 
'⸣'), ('⸥', '⸥'), + ('⸧', '⸧'), ('⸩', '⸩'), ('〉', '〉'), ('》', '》'), + ('」', '」'), ('』', '』'), ('】', '】'), ('〕', '〕'), + ('〗', '〗'), ('〙', '〙'), ('〛', '〛'), ('〞', '〟'), + ('﴾', '﴾'), ('︘', '︘'), ('︶', '︶'), ('︸', '︸'), + ('︺', '︺'), ('︼', '︼'), ('︾', '︾'), ('﹀', '﹀'), + ('﹂', '﹂'), ('﹄', '﹄'), ('﹈', '﹈'), ('﹚', '﹚'), + ('﹜', '﹜'), ('﹞', '﹞'), (')', ')'), (']', ']'), + ('}', '}'), ('⦆', '⦆'), ('」', '」'), +]; + +pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ + ('_', '_'), ('‿', '⁀'), ('⁔', '⁔'), ('︳', '︴'), ('﹍', '﹏'), + ('_', '_'), +]; + +pub const CONTROL: &'static [(char, char)] = &[ + ('\u{0}', '\u{1f}'), ('\u{7f}', '\u{9f}'), +]; + +pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), ('¢', '¥'), ('֏', '֏'), ('؋', '؋'), ('৲', '৳'), + ('৻', '৻'), ('૱', '૱'), ('௹', '௹'), ('฿', '฿'), + ('៛', '៛'), ('₠', '₿'), ('꠸', '꠸'), ('﷼', '﷼'), + ('﹩', '﹩'), ('$', '$'), ('¢', '£'), ('¥', '₩'), +]; + +pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ + ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), + ('‐', '―'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), + ('⹀', '⹀'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), + ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), +]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), ('٠', '٩'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), + ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), + ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), + ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), + ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), + ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), + ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), + ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), + ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), + ('𐒠', '𐒩'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), + ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), + ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), + ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𖩠', '𖩩'), ('𖭐', '𖭙'), + ('𝟎', '𝟿'), ('𞥐', '𞥙'), +]; + +pub const ENCLOSING_MARK: &'static [(char, char)] = &[ + ('҈', '҉'), ('᪾', '᪾'), ('⃝', '⃠'), ('⃢', '⃤'), + ('꙰', '꙲'), +]; + +pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('»', '»'), ('’', '’'), ('”', '”'), ('›', '›'), + ('⸃', '⸃'), ('⸅', '⸅'), ('⸊', '⸊'), ('⸍', '⸍'), + ('⸝', '⸝'), ('⸡', '⸡'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), ('\u{600}', '\u{605}'), ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), ('\u{110bd}', '\u{110bd}'), + ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ + ('«', '«'), ('‘', '‘'), ('‛', '“'), ('‟', '‟'), + ('‹', '‹'), ('⸂', '⸂'), ('⸄', '⸄'), ('⸉', '⸉'), + ('⸌', '⸌'), ('⸜', '⸜'), ('⸠', '⸠'), +]; + +pub const LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 
'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), + ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), + ('ᳵ', 'ᳶ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), + ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), + ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), ('ⸯ', 'ⸯ'), ('々', '〆'), ('〱', '〵'), + ('〻', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), + ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), + ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), + ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛥ'), ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), + ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), 
('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), + ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), + ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), + ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), + ('𐂀', '𐃺'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), + ('𐌭', '𐍀'), ('𐍂', '𐍉'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), + ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), + ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), + ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀃', '𑀷'), + ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅐', '𑅲'), + ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑊀', '𑊆'), + ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), + ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), + ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), + ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), + ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), + ('𑜀', '𑜙'), ('𑢠', '𑣟'), ('𑣿', '𑣿'), ('𑨀', '𑨀'), + ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪃'), + ('𑪆', '𑪉'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), + ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𒀀', '𒎙'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), + ('𞤀', '𞥃'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const LETTER_NUMBER: &'static [(char, char)] = &[ + ('ᛮ', 'ᛰ'), ('Ⅰ', 'ↂ'), ('ↅ', 'ↈ'), ('〇', '〇'), + ('〡', '〩'), ('〸', '〺'), ('ꛦ', 'ꛯ'), ('𐅀', '𐅴'), + ('𐍁', '𐍁'), ('𐍊', '𐍊'), ('𐏑', '𐏕'), ('𒐀', '𒑮'), +]; + +pub const LINE_SEPARATOR: &'static [(char, 
char)] = &[ + ('\u{2028}', '\u{2028}'), +]; + +pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), + ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), + ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), + ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), + ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), + ('ʕ', 'ʯ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᴫ'), ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), 
('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), + ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), + ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), + ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), + ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), + ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), + ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), + ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), + ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), + ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), + ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ↄ', 'ↄ'), ('ⰰ', 'ⱞ'), + ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱻ'), + ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), 
('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝱ', 'ꝸ'), + ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭥ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), + ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𝐚', '𝐳'), ('𝑎', '𝑔'), + ('𝑖', '𝑧'), ('𝒂', '𝒛'), ('𝒶', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝓏'), ('𝓪', '𝔃'), ('𝔞', '𝔷'), + ('𝕒', '𝕫'), ('𝖆', '𝖟'), ('𝖺', '𝗓'), ('𝗮', '𝘇'), + ('𝘢', '𝘻'), ('𝙖', '𝙯'), ('𝚊', '𝚥'), ('𝛂', '𝛚'), + ('𝛜', '𝛡'), ('𝛼', '𝜔'), ('𝜖', '𝜛'), ('𝜶', '𝝎'), + ('𝝐', '𝝕'), ('𝝰', '𝞈'), ('𝞊', '𝞏'), ('𝞪', '𝟂'), + ('𝟄', '𝟉'), ('𝟋', '𝟋'), ('𞤢', '𞥃'), +]; + +pub const MARK: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҉'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ः'), ('ऺ', '़'), ('ा', 'ॏ'), ('॑', 'ॗ'), + ('ॢ', 'ॣ'), ('ঁ', 'ঃ'), ('়', '়'), ('া', 'ৄ'), + ('ে', 'ৈ'), ('ো', '্'), ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), + ('ਁ', 'ਃ'), ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ઃ'), ('઼', '઼'), ('ા', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଃ'), + ('଼', '଼'), ('ା', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௗ', 'ௗ'), ('ఀ', 'ః'), + ('ా', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), + ('ౢ', 'ౣ'), ('ಁ', 'ಃ'), ('಼', '಼'), ('ಾ', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), + ('ഀ', 'ഃ'), ('഻', '഼'), ('ാ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', '്'), ('ൗ', 'ൗ'), ('ൢ', 'ൣ'), ('ං', 'ඃ'), + ('්', '්'), ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), + ('ෲ', 'ෳ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('็', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('່', 'ໍ'), + ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('༾', '༿'), ('ཱ', '྄'), ('྆', '྇'), ('ྍ', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ါ', 'ှ'), ('ၖ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('ၱ', 'ၴ'), + ('ႂ', 'ႍ'), ('ႏ', 'ႏ'), ('ႚ', 'ႝ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '៓'), ('៝', '៝'), ('᠋', '᠍'), ('ᢅ', 'ᢆ'), + ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('ᨗ', 'ᨛ'), + ('ᩕ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᩿'), ('᪰', '᪾'), + ('ᬀ', 'ᬄ'), ('᬴', '᭄'), ('᭫', '᭳'), ('ᮀ', 'ᮂ'), + ('ᮡ', 'ᮭ'), ('᯦', '᯳'), ('ᰤ', '᰷'), ('᳐', '᳒'), + ('᳔', '᳨'), ('᳭', '᳭'), ('ᳲ', '᳴'), ('᳷', '᳹'), + ('᷀', '᷹'), ('᷻', '᷿'), ('⃐', '⃰'), ('⳯', '⳱'), + ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〯'), ('゙', '゚'), + ('꙯', '꙲'), ('ꙴ', '꙽'), ('ꚞ', 'ꚟ'), ('꛰', '꛱'), + ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠣ', 'ꠧ'), + ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), + ('ꥇ', '꥓'), ('ꦀ', 'ꦃ'), ('꦳', '꧀'), ('ꧥ', 'ꧥ'), + ('ꨩ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩍ'), ('ꩻ', 'ꩽ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫫ', 'ꫯ'), ('ꫵ', '꫶'), ('ꯣ', 'ꯪ'), + ('꯬', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), ('︠', '︯'), + ('𐇽', '𐇽'), ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐫥', '𐫦'), ('𑀀', '𑀂'), ('𑀸', '𑁆'), ('𑁿', '𑂂'), + ('𑂰', '𑂺'), ('𑄀', '𑄂'), ('𑄧', '𑄴'), ('𑅳', '𑅳'), + ('𑆀', '𑆂'), ('𑆳', '𑇀'), ('𑇊', '𑇌'), ('𑈬', '𑈷'), + ('𑈾', '𑈾'), ('𑋟', '𑋪'), ('𑌀', '𑌃'), ('𑌼', '𑌼'), + 
('𑌾', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍗', '𑍗'), + ('𑍢', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐵', '𑑆'), + ('𑒰', '𑓃'), ('𑖯', '𑖵'), ('𑖸', '𑗀'), ('𑗜', '𑗝'), + ('𑘰', '𑙀'), ('𑚫', '𑚷'), ('𑜝', '𑜫'), ('𑨁', '𑨊'), + ('𑨳', '𑨹'), ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩛'), + ('𑪊', '𑪙'), ('𑰯', '𑰶'), ('𑰸', '𑰿'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), + ('𖽑', '𖽾'), ('𖾏', '𖾒'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), + ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), + ('𝉂', '𝉄'), ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), + ('𝪄', '𝪄'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞣐', '𞣖'), ('𞥄', '𞥊'), ('󠄀', '󠇯'), +]; + +pub const MATH_SYMBOL: &'static [(char, char)] = &[ + ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('±', '±'), + ('×', '×'), ('÷', '÷'), ('϶', '϶'), ('؆', '؈'), ('⁄', '⁄'), + ('⁒', '⁒'), ('⁺', '⁼'), ('₊', '₌'), ('℘', '℘'), + ('⅀', '⅄'), ('⅋', '⅋'), ('←', '↔'), ('↚', '↛'), + ('↠', '↠'), ('↣', '↣'), ('↦', '↦'), ('↮', '↮'), + ('⇎', '⇏'), ('⇒', '⇒'), ('⇔', '⇔'), ('⇴', '⋿'), + ('⌠', '⌡'), ('⍼', '⍼'), ('⎛', '⎳'), ('⏜', '⏡'), + ('▷', '▷'), ('◁', '◁'), ('◸', '◿'), ('♯', '♯'), + ('⟀', '⟄'), ('⟇', '⟥'), ('⟰', '⟿'), ('⤀', '⦂'), + ('⦙', '⧗'), ('⧜', '⧻'), ('⧾', '⫿'), ('⬰', '⭄'), + ('⭇', '⭌'), ('﬩', '﬩'), ('﹢', '﹢'), ('﹤', '﹦'), + ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), + ('¬', '¬'), ('←', '↓'), ('𝛁', '𝛁'), ('𝛛', '𝛛'), + ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), ('𝝏', '𝝏'), + ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), ('𝟃', '𝟃'), + ('𞻰', '𞻱'), +]; + +pub const MODIFIER_LETTER: &'static [(char, char)] = &[ + ('ʰ', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), ('ͺ', 'ͺ'), ('ՙ', 'ՙ'), ('ـ', 'ـ'), ('ۥ', 'ۦ'), + ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), + ('ॱ', 'ॱ'), ('ๆ', 'ๆ'), ('ໆ', 'ໆ'), ('ჼ', 'ჼ'), + ('ៗ', 'ៗ'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), ('ᱸ', 'ᱽ'), + ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⱼ', 'ⱽ'), ('ⵯ', 'ⵯ'), + ('ⸯ', 'ⸯ'), ('々', '々'), ('〱', '〵'), ('〻', '〻'), + ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('ꜗ', 'ꜟ'), + ('ꝰ', 'ꝰ'), ('ꞈ', 'ꞈ'), ('ꟸ', 'ꟹ'), ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), ('ꫳ', 'ꫴ'), + ('ꭜ', 'ꭟ'), ('ー', 'ー'), ('゙', '゚'), ('𖭀', '𖭃'), + ('𖾓', '𖾟'), ('𖿠', '𖿡'), +]; + +pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ + ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', '´'), + ('¸', '¸'), ('˂', '˅'), ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), + ('˯', '˿'), ('͵', '͵'), ('΄', '΅'), ('᾽', '᾽'), ('᾿', '῁'), + ('῍', '῏'), ('῝', '῟'), ('῭', '`'), ('´', '῾'), + ('゛', '゜'), ('꜀', '꜖'), ('꜠', '꜡'), ('꞉', '꞊'), + ('꭛', '꭛'), ('﮲', '﯁'), ('^', '^'), ('`', '`'), + (' ̄', ' ̄'), ('🏻', '🏿'), +]; + +pub const NONSPACING_MARK: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҇'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ं'), ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), + ('्', '्'), ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ঁ', 'ঁ'), + ('়', '়'), ('ু', 'ৄ'), ('্', '্'), ('ৢ', 'ৣ'), + ('ਁ', 'ਂ'), ('਼', '਼'), ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ં'), ('઼', '઼'), ('ુ', 'ૅ'), ('ે', 'ૈ'), + ('્', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଁ'), + ('଼', '଼'), ('ି', 'ି'), ('ୁ', 'ୄ'), ('୍', '୍'), + ('ୖ', 'ୖ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ீ', 
'ீ'), + ('்', '்'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೆ', 'ೆ'), ('ೌ', '್'), + ('ೢ', 'ೣ'), ('ഀ', 'ഁ'), ('഻', '഼'), ('ു', 'ൄ'), + ('്', '്'), ('ൢ', 'ൣ'), ('්', '්'), ('ි', 'ු'), + ('ූ', 'ූ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('็', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('່', 'ໍ'), + ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('ཱ', 'ཾ'), ('ྀ', '྄'), ('྆', '྇'), ('ྍ', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ိ', 'ူ'), ('ဲ', '့'), + ('္', '်'), ('ွ', 'ှ'), ('ၘ', 'ၙ'), ('ၞ', 'ၠ'), + ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), ('ႅ', 'ႆ'), ('ႍ', 'ႍ'), + ('ႝ', 'ႝ'), ('፝', '፟'), ('ᜒ', '᜔'), ('ᜲ', '᜴'), + ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), ('឴', '឵'), ('ិ', 'ួ'), + ('ំ', 'ំ'), ('៉', '៓'), ('៝', '៝'), ('᠋', '᠍'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), + ('ᤲ', 'ᤲ'), ('᤹', '᤻'), ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), + ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), ('᩠', '᩠'), ('ᩢ', 'ᩢ'), + ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), ('᩿', '᩿'), ('᪰', '᪽'), + ('ᬀ', 'ᬃ'), ('᬴', '᬴'), ('ᬶ', 'ᬺ'), ('ᬼ', 'ᬼ'), + ('ᭂ', 'ᭂ'), ('᭫', '᭳'), ('ᮀ', 'ᮁ'), ('ᮢ', 'ᮥ'), + ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), ('᯦', '᯦'), ('ᯨ', 'ᯩ'), + ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), ('ᰬ', 'ᰳ'), ('ᰶ', '᰷'), + ('᳐', '᳒'), ('᳔', '᳠'), ('᳢', '᳨'), ('᳭', '᳭'), + ('᳴', '᳴'), ('᳸', '᳹'), ('᷀', '᷹'), ('᷻', '᷿'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), ('⳯', '⳱'), + ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〭'), ('゙', '゚'), + ('꙯', '꙯'), ('ꙴ', '꙽'), ('ꚞ', 'ꚟ'), ('꛰', '꛱'), + ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠥ', 'ꠦ'), + ('꣄', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), ('ꥇ', 'ꥑ'), + ('ꦀ', 'ꦂ'), ('꦳', '꦳'), ('ꦶ', 'ꦹ'), ('ꦼ', 'ꦼ'), + ('ꧥ', 'ꧥ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), ('ꨵ', 'ꨶ'), + ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩼ', 'ꩼ'), ('ꪰ', 'ꪰ'), + ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), ('꫁', '꫁'), + ('ꫬ', 'ꫭ'), ('꫶', '꫶'), ('ꯥ', 'ꯥ'), ('ꯨ', 'ꯨ'), + ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), ('︠', '︯'), + ('𐇽', '𐇽'), ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐫥', '𐫦'), ('𑀁', '𑀁'), ('𑀸', '𑁆'), ('𑁿', '𑂁'), + ('𑂳', '𑂶'), ('𑂹', '𑂺'), ('𑄀', '𑄂'), ('𑄧', '𑄫'), + ('𑄭', '𑄴'), ('𑅳', '𑅳'), ('𑆀', '𑆁'), ('𑆶', '𑆾'), + ('𑇊', '𑇌'), ('𑈯', '𑈱'), ('𑈴', '𑈴'), ('𑈶', '𑈷'), + ('𑈾', '𑈾'), ('𑋟', '𑋟'), ('𑋣', '𑋪'), ('𑌀', '𑌁'), + ('𑌼', '𑌼'), ('𑍀', '𑍀'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), + ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), ('𑒳', '𑒸'), + ('𑒺', '𑒺'), ('𑒿', '𑓀'), ('𑓂', '𑓃'), ('𑖲', '𑖵'), + ('𑖼', '𑖽'), ('𑖿', '𑗀'), ('𑗜', '𑗝'), ('𑘳', '𑘺'), + ('𑘽', '𑘽'), ('𑘿', '𑙀'), ('𑚫', '𑚫'), ('𑚭', '𑚭'), + ('𑚰', '𑚵'), ('𑚷', '𑚷'), ('𑜝', '𑜟'), ('𑜢', '𑜥'), + ('𑜧', '𑜫'), ('𑨁', '𑨆'), ('𑨉', '𑨊'), ('𑨳', '𑨸'), + ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩖'), ('𑩙', '𑩛'), + ('𑪊', '𑪖'), ('𑪘', '𑪙'), ('𑰰', '𑰶'), ('𑰸', '𑰽'), + ('𑰿', '𑰿'), ('𑲒', '𑲧'), ('𑲪', '𑲰'), ('𑲲', '𑲳'), + ('𑲵', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), + ('𖾏', '𖾒'), ('𛲝', '𛲞'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), ('𝨀', '𝨶'), + ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), ('𝪛', '𝪟'), + ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), ('𞥄', '𞥊'), + ('󠄀', '󠇯'), +]; + +pub const NUMBER: &'static [(char, char)] = &[ + ('0', '9'), ('²', '³'), ('¹', '¹'), ('¼', '¾'), ('٠', '٩'), + ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('৴', '৹'), + ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('୲', '୷'), + ('௦', '௲'), ('౦', '౯'), ('౸', '౾'), ('೦', '೯'), + ('൘', '൞'), ('൦', '൸'), ('෦', '෯'), ('๐', '๙'), + ('໐', '໙'), ('༠', '༳'), ('၀', '၉'), ('႐', '႙'), + ('፩', '፼'), ('ᛮ', 'ᛰ'), ('០', '៩'), ('៰', '៹'), + ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧚'), ('᪀', '᪉'), + ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), + 
('᱐', '᱙'), ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), + ('⅐', 'ↂ'), ('ↅ', '↉'), ('①', '⒛'), ('⓪', '⓿'), + ('❶', '➓'), ('⳽', '⳽'), ('〇', '〇'), ('〡', '〩'), + ('〸', '〺'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), + ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꘠', '꘩'), + ('ꛦ', 'ꛯ'), ('꠰', '꠵'), ('꣐', '꣙'), ('꤀', '꤉'), + ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), + ('0', '9'), ('𐄇', '𐄳'), ('𐅀', '𐅸'), ('𐆊', '𐆋'), + ('𐋡', '𐋻'), ('𐌠', '𐌣'), ('𐍁', '𐍁'), ('𐍊', '𐍊'), + ('𐏑', '𐏕'), ('𐒠', '𐒩'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), + ('𐢧', '𐢯'), ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), + ('𐧀', '𐧏'), ('𐧒', '𐧿'), ('𐩀', '𐩇'), ('𐩽', '𐩾'), + ('𐪝', '𐪟'), ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), + ('𐮩', '𐮯'), ('𐳺', '𐳿'), ('𐹠', '𐹾'), ('𑁒', '𑁯'), + ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑇡', '𑇴'), + ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), + ('𑛀', '𑛉'), ('𑜰', '𑜻'), ('𑣠', '𑣲'), ('𑱐', '𑱬'), + ('𑵐', '𑵙'), ('𒐀', '𒑮'), ('𖩠', '𖩩'), ('𖭐', '𖭙'), + ('𖭛', '𖭡'), ('𝍠', '𝍱'), ('𝟎', '𝟿'), ('𞣇', '𞣏'), + ('𞥐', '𞥙'), ('🄀', '🄌'), +]; + +pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ + ('(', '('), ('[', '['), ('{', '{'), ('༺', '༺'), ('༼', '༼'), + ('᚛', '᚛'), ('‚', '‚'), ('„', '„'), ('⁅', '⁅'), + ('⁽', '⁽'), ('₍', '₍'), ('⌈', '⌈'), ('⌊', '⌊'), + ('〈', '〈'), ('❨', '❨'), ('❪', '❪'), ('❬', '❬'), + ('❮', '❮'), ('❰', '❰'), ('❲', '❲'), ('❴', '❴'), + ('⟅', '⟅'), ('⟦', '⟦'), ('⟨', '⟨'), ('⟪', '⟪'), + ('⟬', '⟬'), ('⟮', '⟮'), ('⦃', '⦃'), ('⦅', '⦅'), + ('⦇', '⦇'), ('⦉', '⦉'), ('⦋', '⦋'), ('⦍', '⦍'), + ('⦏', '⦏'), ('⦑', '⦑'), ('⦓', '⦓'), ('⦕', '⦕'), + ('⦗', '⦗'), ('⧘', '⧘'), ('⧚', '⧚'), ('⧼', '⧼'), + ('⸢', '⸢'), ('⸤', '⸤'), ('⸦', '⸦'), ('⸨', '⸨'), + ('⹂', '⹂'), ('〈', '〈'), ('《', '《'), ('「', '「'), + ('『', '『'), ('【', '【'), ('〔', '〔'), ('〖', '〖'), + ('〘', '〘'), ('〚', '〚'), ('〝', '〝'), ('﴿', '﴿'), + ('︗', '︗'), ('︵', '︵'), ('︷', '︷'), ('︹', '︹'), + ('︻', '︻'), ('︽', '︽'), ('︿', '︿'), ('﹁', '﹁'), + ('﹃', '﹃'), ('﹇', '﹇'), ('﹙', '﹙'), ('﹛', '﹛'), + ('﹝', '﹝'), ('(', '('), ('[', '['), ('{', '{'), + ('⦅', '⦅'), ('「', '「'), +]; + +pub const OTHER: &'static [(char, char)] = &[ + ('\u{0}', '\u{1f}'), ('\u{7f}', '\u{9f}'), ('\u{ad}', '\u{ad}'), + ('\u{378}', '\u{379}'), ('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), ('\u{560}', '\u{560}'), ('\u{588}', '\u{588}'), + ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ef}'), ('\u{5f5}', '\u{605}'), ('\u{61c}', '\u{61d}'), + ('\u{6dd}', '\u{6dd}'), ('\u{70e}', '\u{70f}'), ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), ('\u{7fb}', '\u{7ff}'), ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{89f}'), ('\u{8b5}', '\u{8b5}'), ('\u{8be}', '\u{8d3}'), + ('\u{8e2}', '\u{8e2}'), ('\u{984}', '\u{984}'), ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), ('\u{9fe}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), ('\u{a5f}', '\u{a65}'), + ('\u{a76}', '\u{a80}'), ('\u{a84}', '\u{a84}'), 
('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), ('\u{af2}', '\u{af8}'), ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b55}'), ('\u{b58}', '\u{b5b}'), ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), ('\u{bfb}', '\u{bff}'), + ('\u{c04}', '\u{c04}'), ('\u{c0d}', '\u{c0d}'), ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3c}'), ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), ('\u{c70}', '\u{c77}'), + ('\u{c84}', '\u{c84}'), ('\u{c8d}', '\u{c8d}'), ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdd}'), ('\u{cdf}', '\u{cdf}'), ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), ('\u{cf3}', '\u{cff}'), ('\u{d04}', '\u{d04}'), + ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d81}'), ('\u{d84}', '\u{d84}'), ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), ('\u{dbc}', '\u{dbc}'), ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), ('\u{e85}', '\u{e86}'), ('\u{e89}', '\u{e89}'), + ('\u{e8b}', '\u{e8c}'), ('\u{e8e}', '\u{e93}'), ('\u{e98}', '\u{e98}'), + ('\u{ea0}', '\u{ea0}'), ('\u{ea4}', '\u{ea4}'), ('\u{ea6}', '\u{ea6}'), + ('\u{ea8}', '\u{ea9}'), ('\u{eac}', '\u{eac}'), ('\u{eba}', '\u{eba}'), + ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), ('\u{ec7}', '\u{ec7}'), + ('\u{ece}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), ('\u{170d}', '\u{170d}'), + ('\u{1715}', 
'\u{171f}'), ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), ('\u{180e}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), ('\u{1878}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), ('\u{1abf}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), ('\u{1b7d}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1cbf}'), + ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfa}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{206f}'), + ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b97}'), + ('\u{2bba}', '\u{2bbc}'), ('\u{2bc9}', '\u{2bc9}'), + ('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bff}'), + ('\u{2c2f}', '\u{2c2f}'), ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', '\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), ('\u{2e4a}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), ('\u{312f}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), ('\u{31bb}', '\u{31bf}'), + ('\u{31e4}', '\u{31ef}'), ('\u{321f}', '\u{321f}'), + ('\u{32ff}', '\u{32ff}'), ('\u{4db6}', '\u{4dbf}'), + ('\u{9feb}', '\u{9fff}'), ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), ('\u{a7af}', '\u{a7af}'), + ('\u{a7b8}', '\u{a7f6}'), ('\u{a82c}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), + ('\u{a8fe}', '\u{a8ff}'), ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), + 
('\u{ab66}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{f8ff}'), + ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', '\u{fd4f}'), ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fffb}'), ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), ('\u{1018f}', '\u{1018f}'), + ('\u{1019c}', '\u{1019f}'), ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), ('\u{10564}', '\u{1056e}'), + ('\u{10570}', '\u{105ff}'), ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), ('\u{10768}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), ('\u{10a34}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), ('\u{10a48}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), ('\u{10d00}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10fff}'), ('\u{1104e}', '\u{11051}'), + ('\u{11070}', '\u{1107e}'), ('\u{110bd}', '\u{110bd}'), + ('\u{110c2}', '\u{110cf}'), ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), + ('\u{11144}', '\u{1114f}'), ('\u{11177}', '\u{1117f}'), + ('\u{111ce}', '\u{111cf}'), ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', 
'\u{11304}'), ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133b}'), ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), ('\u{1145a}', '\u{1145a}'), + ('\u{1145c}', '\u{1145c}'), ('\u{1145e}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), ('\u{116b8}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), ('\u{1171a}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), ('\u{11740}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), ('\u{11900}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), ('\u{11a84}', '\u{11a85}'), + ('\u{11a9d}', '\u{11a9d}'), ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11fff}'), ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), ('\u{1342f}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16eff}'), + ('\u{16f45}', '\u{16f4f}'), ('\u{16f7f}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe2}', '\u{16fff}'), + ('\u{187ed}', '\u{187ff}'), ('\u{18af3}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca0}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), ('\u{1d173}', '\u{1d17a}'), + ('\u{1d1e9}', '\u{1d1ff}'), ('\u{1d246}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), ('\u{1d372}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1dfff}'), ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), ('\u{1e02b}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94b}', '\u{1e94f}'), ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1edff}'), ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), 
('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), ('\u{1f10d}', '\u{1f10f}'), + ('\u{1f12f}', '\u{1f12f}'), ('\u{1f16c}', '\u{1f16f}'), + ('\u{1f1ad}', '\u{1f1e5}'), ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d5}', '\u{1f6df}'), ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6f9}', '\u{1f6ff}'), ('\u{1f774}', '\u{1f77f}'), + ('\u{1f7d5}', '\u{1f7ff}'), ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), ('\u{1f8ae}', '\u{1f8ff}'), + ('\u{1f90c}', '\u{1f90f}'), ('\u{1f93f}', '\u{1f93f}'), + ('\u{1f94d}', '\u{1f94f}'), ('\u{1f96c}', '\u{1f97f}'), + ('\u{1f998}', '\u{1f9bf}'), ('\u{1f9c1}', '\u{1f9cf}'), + ('\u{1f9e7}', '\u{1ffff}'), ('\u{2a6d7}', '\u{2a6ff}'), + ('\u{2b735}', '\u{2b73f}'), ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{e00ff}'), ('\u{e01f0}', '\u{10ffff}'), +]; + +pub const OTHER_LETTER: &'static [(char, char)] = &[ + ('ª', 'ª'), ('º', 'º'), ('ƻ', 'ƻ'), ('ǀ', 'ǃ'), ('ʔ', 'ʔ'), + ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), + ('ࠀ', 'ࠕ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), + ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), + ('क़', 'ॡ'), ('ॲ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), + ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), + ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), + ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), + ('ව', 'ෆ'), ('ก', 'ะ'), 
('า', 'ำ'), ('เ', 'ๅ'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ະ'), + ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), + ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), ('ა', 'ჺ'), ('ჽ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡂ'), ('ᡄ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱷ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ℵ', 'ℸ'), ('ⴰ', 'ⵧ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('〆', '〆'), + ('〼', '〼'), ('ぁ', 'ゖ'), ('ゟ', 'ゟ'), ('ァ', 'ヺ'), + ('ヿ', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), + ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꀔ'), + ('ꀖ', 'ꒌ'), ('ꓐ', 'ꓷ'), ('ꔀ', 'ꘋ'), ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), ('ꙮ', 'ꙮ'), ('ꚠ', 'ꛥ'), ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), ('ꟻ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧠ', 'ꧤ'), ('ꧧ', 'ꧯ'), + ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩯ'), ('ꩱ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫜ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫲ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꯀ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('ヲ', 'ッ'), ('ア', 'ン'), ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍀'), ('𐍂', '𐍉'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), + ('𐑐', '𐒝'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), + ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), + ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), + ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), + ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), + ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), + ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), + ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), ('𑣿', '𑣿'), + ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), + ('𑩜', '𑪃'), ('𑪆', '𑪉'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), + ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), + ('𑴈', '𑴉'), 
('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𒀀', '𒎙'), + ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), + ('𖩀', '𖩞'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𞠀', '𞣄'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const OTHER_NUMBER: &'static [(char, char)] = &[ + ('²', '³'), ('¹', '¹'), ('¼', '¾'), ('৴', '৹'), ('୲', '୷'), + ('௰', '௲'), ('౸', '౾'), ('൘', '൞'), ('൰', '൸'), + ('༪', '༳'), ('፩', '፼'), ('៰', '៹'), ('᧚', '᧚'), + ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), ('⅐', '⅟'), + ('↉', '↉'), ('①', '⒛'), ('⓪', '⓿'), ('❶', '➓'), + ('⳽', '⳽'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), + ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꠰', '꠵'), + ('𐄇', '𐄳'), ('𐅵', '𐅸'), ('𐆊', '𐆋'), ('𐋡', '𐋻'), + ('𐌠', '𐌣'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), ('𐢧', '𐢯'), + ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), + ('𐧒', '𐧿'), ('𐩀', '𐩇'), ('𐩽', '𐩾'), ('𐪝', '𐪟'), + ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), ('𐮩', '𐮯'), + ('𐳺', '𐳿'), ('𐹠', '𐹾'), ('𑁒', '𑁥'), ('𑇡', '𑇴'), + ('𑜺', '𑜻'), ('𑣪', '𑣲'), ('𑱚', '𑱬'), ('𖭛', '𖭡'), + ('𝍠', '𝍱'), ('𞣇', '𞣏'), ('🄀', '🄌'), +]; + +pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), ('%', '\''), ('*', '*'), (',', ','), ('.', '/'), (':', ';'), + ('?', '@'), ('\\', '\\'), ('¡', '¡'), ('§', '§'), ('¶', '·'), + ('¿', '¿'), (';', ';'), ('·', '·'), ('՚', '՟'), ('։', '։'), + ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), + ('،', '؍'), ('؛', '؛'), ('؞', '؟'), ('٪', '٭'), ('۔', '۔'), + ('܀', '܍'), ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), + ('॰', '॰'), ('৽', '৽'), ('૰', '૰'), ('෴', '෴'), + ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), + ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), ('၊', '၏'), + ('჻', '჻'), ('፠', '፨'), ('᙭', '᙮'), ('᛫', '᛭'), + ('᜵', '᜶'), ('។', '៖'), ('៘', '៚'), ('᠀', '᠅'), + ('᠇', '᠊'), ('᥄', '᥅'), ('᨞', '᨟'), ('᪠', '᪦'), + ('᪨', '᪭'), ('᭚', '᭠'), ('᯼', '᯿'), ('᰻', '᰿'), + ('᱾', '᱿'), ('᳀', '᳇'), ('᳓', '᳓'), ('‖', '‗'), + ('†', '‧'), ('‰', '‸'), ('※', '‾'), ('⁁', '⁃'), + ('⁇', '⁑'), ('⁓', '⁓'), ('⁕', '⁞'), ('⳹', '⳼'), + ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸁'), ('⸆', '⸈'), + ('⸋', '⸋'), ('⸎', '⸖'), ('⸘', '⸙'), ('⸛', '⸛'), + ('⸞', '⸟'), ('⸪', '⸮'), ('⸰', '⸹'), ('⸼', '⸿'), + ('⹁', '⹁'), ('⹃', '⹉'), ('、', '〃'), ('〽', '〽'), + ('・', '・'), ('꓾', '꓿'), ('꘍', '꘏'), ('꙳', '꙳'), + ('꙾', '꙾'), ('꛲', '꛷'), ('꡴', '꡷'), ('꣎', '꣏'), + ('꣸', '꣺'), ('꣼', '꣼'), ('꤮', '꤯'), ('꥟', '꥟'), + ('꧁', '꧍'), ('꧞', '꧟'), ('꩜', '꩟'), ('꫞', '꫟'), + ('꫰', '꫱'), ('꯫', '꯫'), ('︐', '︖'), ('︙', '︙'), + ('︰', '︰'), ('﹅', '﹆'), ('﹉', '﹌'), ('﹐', '﹒'), + ('﹔', '﹗'), ('﹟', '﹡'), ('﹨', '﹨'), ('﹪', '﹫'), + ('!', '#'), ('%', '''), ('*', '*'), (',', ','), + ('.', '/'), (':', ';'), ('?', '@'), ('\', '\'), + ('。', '。'), ('、', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), + ('𐏐', '𐏐'), ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), + ('𐤿', '𐤿'), ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), + ('𐬹', '𐬿'), ('𐮙', '𐮜'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), + ('𑂾', '𑃁'), ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇉'), + ('𑇍', '𑇍'), ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), + ('𑊩', '𑊩'), ('𑑋', '𑑏'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), + ('𑓆', '𑓆'), 
('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), + ('𑜼', '𑜾'), ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), + ('𑱁', '𑱅'), ('𑱰', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), + ('𖫵', '𖫵'), ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), + ('𝪇', '𝪋'), ('𞥞', '𞥟'), +]; + +pub const OTHER_SYMBOL: &'static [(char, char)] = &[ + ('¦', '¦'), ('©', '©'), ('®', '®'), ('°', '°'), ('҂', '҂'), + ('֍', '֎'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), + ('߶', '߶'), ('৺', '৺'), ('୰', '୰'), ('௳', '௸'), + ('௺', '௺'), ('౿', '౿'), ('൏', '൏'), ('൹', '൹'), + ('༁', '༃'), ('༓', '༓'), ('༕', '༗'), ('༚', '༟'), + ('༴', '༴'), ('༶', '༶'), ('༸', '༸'), ('྾', '࿅'), + ('࿇', '࿌'), ('࿎', '࿏'), ('࿕', '࿘'), ('႞', '႟'), + ('᎐', '᎙'), ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), + ('᭴', '᭼'), ('℀', '℁'), ('℃', '℆'), ('℈', '℉'), + ('℔', '℔'), ('№', '℗'), ('℞', '℣'), ('℥', '℥'), + ('℧', '℧'), ('℩', '℩'), ('℮', '℮'), ('℺', '℻'), + ('⅊', '⅊'), ('⅌', '⅍'), ('⅏', '⅏'), ('↊', '↋'), + ('↕', '↙'), ('↜', '↟'), ('↡', '↢'), ('↤', '↥'), + ('↧', '↭'), ('↯', '⇍'), ('⇐', '⇑'), ('⇓', '⇓'), + ('⇕', '⇳'), ('⌀', '⌇'), ('⌌', '⌟'), ('⌢', '⌨'), + ('⌫', '⍻'), ('⍽', '⎚'), ('⎴', '⏛'), ('⏢', '␦'), + ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '▶'), ('▸', '◀'), + ('◂', '◷'), ('☀', '♮'), ('♰', '❧'), ('➔', '➿'), + ('⠀', '⣿'), ('⬀', '⬯'), ('⭅', '⭆'), ('⭍', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('⳥', '⳪'), ('⺀', '⺙'), ('⺛', '⻳'), + ('⼀', '⿕'), ('⿰', '⿻'), ('〄', '〄'), ('〒', '〓'), + ('〠', '〠'), ('〶', '〷'), ('〾', '〿'), ('㆐', '㆑'), + ('㆖', '㆟'), ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), + ('㉐', '㉐'), ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㋾'), + ('㌀', '㏿'), ('䷀', '䷿'), ('꒐', '꓆'), ('꠨', '꠫'), + ('꠶', '꠷'), ('꠹', '꠹'), ('꩷', '꩹'), ('﷽', '﷽'), + ('¦', '¦'), ('│', '│'), ('■', '○'), ('', '�'), + ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), ('𐆐', '𐆛'), + ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), ('𐫈', '𐫈'), + ('𑜿', '𑜿'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), ('𛲜', '𛲜'), + ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅪', '𝅬'), + ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝈀', '𝉁'), + ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), + ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), + ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🏺'), ('🐀', '🛔'), ('🛠', '🛬'), + ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), + ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), + ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), + ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), +]; + +pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = &[ + ('\u{2029}', '\u{2029}'), +]; + +pub const PRIVATE_USE: &'static [(char, char)] = &[ + ('\u{e000}', '\u{f8ff}'), ('\u{f0000}', '\u{ffffd}'), + ('\u{100000}', '\u{10fffd}'), +]; + +pub const PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), ('[', ']'), + ('_', '_'), ('{', '{'), ('}', '}'), ('¡', '¡'), ('§', '§'), + ('«', '«'), ('¶', '·'), ('»', '»'), ('¿', '¿'), (';', ';'), + ('·', '·'), ('՚', '՟'), ('։', '֊'), ('־', '־'), ('׀', '׀'), + ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), ('،', '؍'), + ('؛', '؛'), ('؞', '؟'), ('٪', '٭'), ('۔', '۔'), ('܀', '܍'), + ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), + ('॰', '॰'), ('৽', '৽'), ('૰', '૰'), ('෴', '෴'), + ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), + ('༺', '༽'), ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), + ('၊', '၏'), ('჻', '჻'), ('፠', '፨'), ('᐀', '᐀'), + ('᙭', '᙮'), ('᚛', '᚜'), ('᛫', '᛭'), ('᜵', '᜶'), + ('។', '៖'), ('៘', '៚'), ('᠀', '᠊'), ('᥄', '᥅'), + ('᨞', '᨟'), ('᪠', '᪦'), ('᪨', '᪭'), ('᭚', '᭠'), + ('᯼', '᯿'), ('᰻', '᰿'), ('᱾', 
'᱿'), ('᳀', '᳇'), + ('᳓', '᳓'), ('‐', '‧'), ('‰', '⁃'), ('⁅', '⁑'), + ('⁓', '⁞'), ('⁽', '⁾'), ('₍', '₎'), ('⌈', '⌋'), + ('〈', '〉'), ('❨', '❵'), ('⟅', '⟆'), ('⟦', '⟯'), + ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), ('⳹', '⳼'), + ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸮'), ('⸰', '⹉'), + ('、', '〃'), ('〈', '】'), ('〔', '〟'), ('〰', '〰'), + ('〽', '〽'), ('゠', '゠'), ('・', '・'), ('꓾', '꓿'), + ('꘍', '꘏'), ('꙳', '꙳'), ('꙾', '꙾'), ('꛲', '꛷'), + ('꡴', '꡷'), ('꣎', '꣏'), ('꣸', '꣺'), ('꣼', '꣼'), + ('꤮', '꤯'), ('꥟', '꥟'), ('꧁', '꧍'), ('꧞', '꧟'), + ('꩜', '꩟'), ('꫞', '꫟'), ('꫰', '꫱'), ('꯫', '꯫'), + ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), ('﹔', '﹡'), + ('﹣', '﹣'), ('﹨', '﹨'), ('﹪', '﹫'), ('!', '#'), + ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), + ('[', ']'), ('_', '_'), ('{', '{'), ('}', '}'), + ('⦅', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), ('𐏐', '𐏐'), + ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐤿', '𐤿'), + ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), ('𐬹', '𐬿'), + ('𐮙', '𐮜'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), ('𑂾', '𑃁'), + ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇉'), ('𑇍', '𑇍'), + ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), ('𑊩', '𑊩'), + ('𑑋', '𑑏'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), ('𑓆', '𑓆'), + ('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), ('𑜼', '𑜾'), + ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), ('𑱁', '𑱅'), + ('𑱰', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), + ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), +]; + +pub const SPACING_MARK: &'static [(char, char)] = &[ + ('ः', 'ः'), ('ऻ', 'ऻ'), ('ा', 'ी'), ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), ('ং', 'ঃ'), ('া', 'ী'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৗ', 'ৗ'), ('ਃ', 'ਃ'), ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), ('ા', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), + ('ଂ', 'ଃ'), ('ା', 'ା'), ('ୀ', 'ୀ'), ('େ', 'ୈ'), + ('ୋ', 'ୌ'), ('ୗ', 'ୗ'), ('ா', 'ி'), ('ு', 'ூ'), + ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௗ', 'ௗ'), ('ఁ', 'ః'), + ('ు', 'ౄ'), ('ಂ', 'ಃ'), ('ಾ', 'ಾ'), ('ೀ', 'ೄ'), + ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), ('ೕ', 'ೖ'), ('ം', 'ഃ'), + ('ാ', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൗ', 'ൗ'), + ('ං', 'ඃ'), ('ා', 'ෑ'), ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), + ('༾', '༿'), ('ཿ', 'ཿ'), ('ါ', 'ာ'), ('ေ', 'ေ'), + ('း', 'း'), ('ျ', 'ြ'), ('ၖ', 'ၗ'), ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႏ', 'ႏ'), + ('ႚ', 'ႜ'), ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), ('ᩕ', 'ᩕ'), ('ᩗ', 'ᩗ'), ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('ᬄ', 'ᬄ'), ('ᬵ', 'ᬵ'), + ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', '᭄'), ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), ('᯲', '᯳'), ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), ('᳡', '᳡'), ('ᳲ', 'ᳳ'), ('᳷', '᳷'), + ('〮', '〯'), ('ꠣ', 'ꠤ'), ('ꠧ', 'ꠧ'), ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), ('ꥒ', '꥓'), ('ꦃ', 'ꦃ'), ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), ('ꦽ', '꧀'), ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), ('ꩻ', 'ꩻ'), ('ꩽ', 'ꩽ'), ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯤ'), ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), ('꯬', '꯬'), ('𑀀', '𑀀'), ('𑀂', '𑀂'), + ('𑂂', '𑂂'), ('𑂰', '𑂲'), ('𑂷', '𑂸'), ('𑄬', '𑄬'), + ('𑆂', '𑆂'), ('𑆳', '𑆵'), ('𑆿', '𑇀'), ('𑈬', '𑈮'), + ('𑈲', '𑈳'), ('𑈵', '𑈵'), ('𑋠', '𑋢'), ('𑌂', '𑌃'), + ('𑌾', '𑌿'), ('𑍁', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍗', '𑍗'), ('𑍢', '𑍣'), ('𑐵', '𑐷'), ('𑑀', '𑑁'), + ('𑑅', '𑑅'), ('𑒰', 
'𑒲'), ('𑒹', '𑒹'), ('𑒻', '𑒾'), + ('𑓁', '𑓁'), ('𑖯', '𑖱'), ('𑖸', '𑖻'), ('𑖾', '𑖾'), + ('𑘰', '𑘲'), ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑚬', '𑚬'), + ('𑚮', '𑚯'), ('𑚶', '𑚶'), ('𑜠', '𑜡'), ('𑜦', '𑜦'), + ('𑨇', '𑨈'), ('𑨹', '𑨹'), ('𑩗', '𑩘'), ('𑪗', '𑪗'), + ('𑰯', '𑰯'), ('𑰾', '𑰾'), ('𑲩', '𑲩'), ('𑲱', '𑲱'), + ('𑲴', '𑲴'), ('𖽑', '𖽾'), ('𝅥', '𝅦'), ('𝅭', '𝅲'), +]; + +pub const SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), ('|', '|'), + ('~', '~'), ('¢', '¦'), ('¨', '©'), ('¬', '¬'), ('®', '±'), + ('´', '´'), ('¸', '¸'), ('×', '×'), ('÷', '÷'), ('˂', '˅'), + ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), ('˯', '˿'), ('͵', '͵'), + ('΄', '΅'), ('϶', '϶'), ('҂', '҂'), ('֍', '֏'), ('؆', '؈'), + ('؋', '؋'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), + ('߶', '߶'), ('৲', '৳'), ('৺', '৻'), ('૱', '૱'), + ('୰', '୰'), ('௳', '௺'), ('౿', '౿'), ('൏', '൏'), + ('൹', '൹'), ('฿', '฿'), ('༁', '༃'), ('༓', '༓'), + ('༕', '༗'), ('༚', '༟'), ('༴', '༴'), ('༶', '༶'), + ('༸', '༸'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿏'), + ('࿕', '࿘'), ('႞', '႟'), ('᎐', '᎙'), ('៛', '៛'), + ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), ('᭴', '᭼'), + ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), + ('῭', '`'), ('´', '῾'), ('⁄', '⁄'), ('⁒', '⁒'), + ('⁺', '⁼'), ('₊', '₌'), ('₠', '₿'), ('℀', '℁'), + ('℃', '℆'), ('℈', '℉'), ('℔', '℔'), ('№', '℘'), + ('℞', '℣'), ('℥', '℥'), ('℧', '℧'), ('℩', '℩'), + ('℮', '℮'), ('℺', '℻'), ('⅀', '⅄'), ('⅊', '⅍'), + ('⅏', '⅏'), ('↊', '↋'), ('←', '⌇'), ('⌌', '⌨'), + ('⌫', '␦'), ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '❧'), + ('➔', '⟄'), ('⟇', '⟥'), ('⟰', '⦂'), ('⦙', '⧗'), + ('⧜', '⧻'), ('⧾', '⭳'), ('⭶', '⮕'), ('⮘', '⮹'), + ('⮽', '⯈'), ('⯊', '⯒'), ('⯬', '⯯'), ('⳥', '⳪'), + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), + ('〄', '〄'), ('〒', '〓'), ('〠', '〠'), ('〶', '〷'), + ('〾', '〿'), ('゛', '゜'), ('㆐', '㆑'), ('㆖', '㆟'), + ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), ('㉐', '㉐'), + ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㋾'), ('㌀', '㏿'), + ('䷀', '䷿'), ('꒐', '꓆'), ('꜀', '꜖'), ('꜠', '꜡'), + ('꞉', '꞊'), ('꠨', '꠫'), ('꠶', '꠹'), ('꩷', '꩹'), + ('꭛', '꭛'), ('﬩', '﬩'), ('﮲', '﯁'), ('﷼', '﷽'), + ('﹢', '﹢'), ('﹤', '﹦'), ('﹩', '﹩'), ('$', '$'), + ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), + ('|', '|'), ('~', '~'), ('¢', '₩'), ('│', '○'), + ('', '�'), ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), + ('𐆐', '𐆛'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), + ('𐫈', '𐫈'), ('𑜿', '𑜿'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), + ('𛲜', '𛲜'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), + ('𝈀', '𝉁'), ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝛁', '𝛁'), + ('𝛛', '𝛛'), ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), + ('𝝏', '𝝏'), ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), + ('𝟃', '𝟃'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), ('𝩭', '𝩴'), + ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('𞻰', '𞻱'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), + ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), ('🛰', '🛸'), + ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), ('🠐', '🡇'), + ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🤀', '🤋'), + ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), ('🦀', '🦗'), + ('🧀', '🧀'), ('🧐', '🧦'), +]; + +pub const TITLECASE_LETTER: &'static [(char, char)] = &[ + ('Dž', 'Dž'), ('Lj', 'Lj'), ('Nj', 'Nj'), ('Dz', 'Dz'), ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('ᾼ', 'ᾼ'), ('ῌ', 'ῌ'), + ('ῼ', 'ῼ'), +]; + +pub const UNASSIGNED: &'static [(char, char)] = &[ + ('\u{378}', '\u{379}'), ('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), ('\u{560}', '\u{560}'), 
('\u{588}', '\u{588}'), + ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ef}'), ('\u{5f5}', '\u{5ff}'), ('\u{61d}', '\u{61d}'), + ('\u{70e}', '\u{70e}'), ('\u{74b}', '\u{74c}'), ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7ff}'), ('\u{82e}', '\u{82f}'), ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), ('\u{86b}', '\u{89f}'), + ('\u{8b5}', '\u{8b5}'), ('\u{8be}', '\u{8d3}'), ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), ('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), + ('\u{9fe}', '\u{a00}'), ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), ('\u{a76}', '\u{a80}'), ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), ('\u{ae4}', '\u{ae5}'), ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), ('\u{b4e}', '\u{b55}'), ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), ('\u{c04}', '\u{c04}'), ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3c}'), + ('\u{c45}', '\u{c45}'), ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), ('\u{c5b}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c77}'), ('\u{c84}', '\u{c84}'), ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), ('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), ('\u{cd7}', '\u{cdd}'), ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), ('\u{cf0}', '\u{cf0}'), ('\u{cf3}', '\u{cff}'), + ('\u{d04}', '\u{d04}'), ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), ('\u{d80}', '\u{d81}'), ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), ('\u{db2}', '\u{db2}'), ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), ('\u{e83}', '\u{e83}'), ('\u{e85}', '\u{e86}'), + ('\u{e89}', '\u{e89}'), ('\u{e8b}', '\u{e8c}'), 
('\u{e8e}', '\u{e93}'), + ('\u{e98}', '\u{e98}'), ('\u{ea0}', '\u{ea0}'), ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), ('\u{ea8}', '\u{ea9}'), ('\u{eac}', '\u{eac}'), + ('\u{eba}', '\u{eba}'), ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), ('\u{ece}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), ('\u{170d}', '\u{170d}'), + ('\u{1715}', '\u{171f}'), ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), ('\u{180f}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), ('\u{1878}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), ('\u{1abf}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), ('\u{1b7d}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1cbf}'), + ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfa}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), ('\u{2065}', '\u{2065}'), + ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b97}'), + ('\u{2bba}', '\u{2bbc}'), ('\u{2bc9}', '\u{2bc9}'), + ('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bff}'), + ('\u{2c2f}', '\u{2c2f}'), ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', '\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), 
('\u{2e4a}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), ('\u{312f}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), ('\u{31bb}', '\u{31bf}'), + ('\u{31e4}', '\u{31ef}'), ('\u{321f}', '\u{321f}'), + ('\u{32ff}', '\u{32ff}'), ('\u{4db6}', '\u{4dbf}'), + ('\u{9feb}', '\u{9fff}'), ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), ('\u{a7af}', '\u{a7af}'), + ('\u{a7b8}', '\u{a7f6}'), ('\u{a82c}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), + ('\u{a8fe}', '\u{a8ff}'), ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab66}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{d7ff}'), + ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', '\u{fd4f}'), ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{fefe}'), + ('\u{ff00}', '\u{ff00}'), ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), ('\u{ffef}', '\u{fff8}'), + ('\u{fffe}', '\u{ffff}'), ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), ('\u{1019c}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), ('\u{10570}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{107ff}'), ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', 
'\u{10a14}'), ('\u{10a18}', '\u{10a18}'), + ('\u{10a34}', '\u{10a37}'), ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a48}', '\u{10a4f}'), ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d00}', '\u{10e5f}'), ('\u{10e7f}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), ('\u{11070}', '\u{1107e}'), + ('\u{110c2}', '\u{110cf}'), ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), + ('\u{11144}', '\u{1114f}'), ('\u{11177}', '\u{1117f}'), + ('\u{111ce}', '\u{111cf}'), ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133b}'), ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), ('\u{1145a}', '\u{1145a}'), + ('\u{1145c}', '\u{1145c}'), ('\u{1145e}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), ('\u{116b8}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), ('\u{1171a}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), ('\u{11740}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), ('\u{11900}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), ('\u{11a84}', '\u{11a85}'), + ('\u{11a9d}', '\u{11a9d}'), ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11fff}'), ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), ('\u{1342f}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16eff}'), + ('\u{16f45}', '\u{16f4f}'), ('\u{16f7f}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe2}', '\u{16fff}'), + ('\u{187ed}', '\u{187ff}'), ('\u{18af3}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca4}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), 
('\u{1d1e9}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2ff}'), ('\u{1d357}', '\u{1d35f}'), + ('\u{1d372}', '\u{1d3ff}'), ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), ('\u{1dab0}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e7ff}'), ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), ('\u{1e94b}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), ('\u{1e960}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f10d}', '\u{1f10f}'), ('\u{1f12f}', '\u{1f12f}'), + ('\u{1f16c}', '\u{1f16f}'), ('\u{1f1ad}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), ('\u{1f6d5}', '\u{1f6df}'), + ('\u{1f6ed}', '\u{1f6ef}'), ('\u{1f6f9}', '\u{1f6ff}'), + ('\u{1f774}', '\u{1f77f}'), ('\u{1f7d5}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8ff}'), ('\u{1f90c}', '\u{1f90f}'), + ('\u{1f93f}', '\u{1f93f}'), ('\u{1f94d}', '\u{1f94f}'), + ('\u{1f96c}', '\u{1f97f}'), ('\u{1f998}', '\u{1f9bf}'), + ('\u{1f9c1}', '\u{1f9cf}'), ('\u{1f9e7}', '\u{1ffff}'), + ('\u{2a6d7}', '\u{2a6ff}'), ('\u{2b735}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), ('\u{2fa1e}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{effff}'), ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 
'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), + ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), 
('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), ('Ↄ', 'Ↄ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), 
('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('A', 'Z'), + ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), + ('𝐀', '𝐙'), ('𝐴', '𝑍'), ('𝑨', '𝒁'), ('𝒜', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒵'), ('𝓐', '𝓩'), ('𝔄', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔸', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕬', '𝖅'), + ('𝖠', '𝖹'), ('𝗔', '𝗭'), ('𝘈', '𝘡'), ('𝘼', '𝙕'), + ('𝙰', '𝚉'), ('𝚨', '𝛀'), ('𝛢', '𝛺'), ('𝜜', '𝜴'), + ('𝝖', '𝝮'), ('𝞐', '𝞨'), ('𝟊', '𝟊'), ('𞤀', '𞤡'), +]; diff --git a/regex-syntax-2/src/unicode_tables/mod.rs b/regex-syntax-2/src/unicode_tables/mod.rs new file mode 100644 index 0000000000..6c2e9e7736 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/mod.rs @@ -0,0 +1,9 @@ +pub mod age; +pub mod case_folding_simple; +pub mod general_category; +pub mod perl_word; +pub mod property_bool; +pub mod property_names; +pub mod property_values; +pub mod script_extension; +pub mod script; diff --git a/regex-syntax-2/src/unicode_tables/perl_word.rs b/regex-syntax-2/src/unicode_tables/perl_word.rs new file mode 100644 index 0000000000..d33f79a02b --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/perl_word.rs @@ -0,0 +1,179 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word /home/andrew/tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. + +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', 'ԯ'), ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), ('ؐ', 'ؚ'), + ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), ('۪', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), + ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), + ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), ('०', '९'), + ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', 'ৄ'), + ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), + ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('ਁ', 'ਃ'), + ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), + ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), + ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), ('ઁ', 'ઃ'), + ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), ('૦', '૯'), + ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), + ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), + ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), ('ୱ', 'ୱ'), + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('௦', '௯'), + ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), + ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), + ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('౦', '౯'), + ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), + ('೦', '೯'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), ('അ', 'ഌ'), + ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), + ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), ('ൺ', 'ൿ'), + ('ං', 'ඃ'), ('අ', 'ඖ'), 
('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), ('ෲ', 'ෳ'), + ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), + ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), + ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), ('ົ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), ('໐', '໙'), + ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), ('༠', '༩'), + ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), ('༾', 'ཇ'), + ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), ('ྙ', 'ྼ'), + ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፝', '፟'), + ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), ('ក', '៓'), + ('ៗ', 'ៗ'), ('ៜ', '៝'), ('០', '៩'), ('᠋', '᠍'), + ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'), + ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), + ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('᪰', '᪾'), ('ᬀ', 'ᭋ'), + ('᭐', '᭙'), ('᭫', '᭳'), ('ᮀ', '᯳'), ('ᰀ', '᰷'), + ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('᳐', '᳒'), + ('᳔', '᳹'), ('ᴀ', '᷹'), ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('\u{200c}', '\u{200d}'), ('‿', '⁀'), ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃐', '⃰'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), + ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), + ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('ⸯ', 'ⸯ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', '゚'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), + ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '꙲'), ('ꙴ', '꙽'), + ('ꙿ', '꛱'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), + ('꣐', '꣙'), ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('꤀', '꤭'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), + ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), + ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('︀', '️'), ('︠', '︯'), + ('︳', '︴'), ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), + ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), + ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), + ('𐅀', '𐅴'), ('𐇽', '𐇽'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), + ('𐋠', '𐋠'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍺'), + 
('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), + ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), + ('𐫀', '𐫇'), ('𐫉', '𐫦'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), + ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑀀', '𑁆'), ('𑁦', '𑁯'), ('𑁿', '𑂺'), + ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑄿'), + ('𑅐', '𑅳'), ('𑅶', '𑅶'), ('𑆀', '𑇄'), ('𑇊', '𑇌'), + ('𑇐', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈷'), + ('𑈾', '𑈾'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), + ('𑌀', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), + ('𑍝', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐀', '𑑊'), + ('𑑐', '𑑙'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), + ('𑖀', '𑖵'), ('𑖸', '𑗀'), ('𑗘', '𑗝'), ('𑘀', '𑙀'), + ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜹'), ('𑢠', '𑣩'), + ('𑣿', '𑣿'), ('𑨀', '𑨾'), ('𑩇', '𑩇'), ('𑩐', '𑪃'), + ('𑪆', '𑪙'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), + ('𑰸', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), + ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), + ('𖫐', '𖫭'), ('𖫰', '𖫴'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), + ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽄'), + ('𖽐', '𖽾'), ('𖾏', '𖾟'), ('𖿠', '𖿡'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲝', '𛲞'), + ('𝅥', '𝅩'), ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), + ('𝆪', '𝆭'), ('𝉂', '𝉄'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), + ('𝟎', '𝟿'), ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), + ('𝪄', '𝪄'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞠀', '𞣄'), ('𞣐', '𞣖'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), ('󠄀', '󠇯'), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_bool.rs b/regex-syntax-2/src/unicode_tables/property_bool.rs new file mode 100644 index 0000000000..ae867e3007 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_bool.rs @@ -0,0 +1,2576 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC), + ("Bidi_Control", BIDI_CONTROL), ("Case_Ignorable", CASE_IGNORABLE), + ("Cased", CASED), ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), + ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), + ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), + ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), + ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), ("Dash", DASH), + ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), + ("Deprecated", DEPRECATED), ("Diacritic", DIACRITIC), + ("Extender", EXTENDER), ("Grapheme_Base", GRAPHEME_BASE), + ("Grapheme_Extend", GRAPHEME_EXTEND), ("Grapheme_Link", GRAPHEME_LINK), + ("Hex_Digit", HEX_DIGIT), ("Hyphen", HYPHEN), + ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), + ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), + ("ID_Continue", ID_CONTINUE), ("ID_Start", ID_START), + ("Ideographic", IDEOGRAPHIC), ("Join_Control", JOIN_CONTROL), + ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), + ("Lowercase", LOWERCASE), ("Math", MATH), + ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), + ("Other_Alphabetic", OTHER_ALPHABETIC), + ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT), + ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), + ("Other_ID_Continue", OTHER_ID_CONTINUE), + ("Other_ID_Start", OTHER_ID_START), ("Other_Lowercase", OTHER_LOWERCASE), + ("Other_Math", OTHER_MATH), ("Other_Uppercase", OTHER_UPPERCASE), + ("Pattern_Syntax", PATTERN_SYNTAX), + ("Pattern_White_Space", PATTERN_WHITE_SPACE), + ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), + ("Quotation_Mark", QUOTATION_MARK), ("Radical", RADICAL), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Sentence_Terminal", SENTENCE_TERMINAL), ("Soft_Dotted", SOFT_DOTTED), + ("Terminal_Punctuation", TERMINAL_PUNCTUATION), + ("Unified_Ideograph", UNIFIED_IDEOGRAPH), ("Uppercase", UPPERCASE), + ("Variation_Selector", VARIATION_SELECTOR), ("White_Space", WHITE_SPACE), + ("XID_Continue", XID_CONTINUE), ("XID_Start", XID_START), +]; + +pub const ASCII_HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'F'), ('a', 'f'), +]; + +pub const ALPHABETIC: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('ͅ', 'ͅ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), ('ա', 'և'), ('ְ', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), ('ؐ', 'ؚ'), + ('ؠ', 'ٗ'), ('ٙ', 'ٟ'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('ۡ', 'ۨ'), + ('ۭ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܿ'), ('ݍ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠗ'), ('ࠚ', 'ࠬ'), + ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), + ('ࣔ', 'ࣟ'), ('ࣣ', 'ࣩ'), ('ࣰ', 'ऻ'), ('ऽ', 'ौ'), + ('ॎ', 'ॐ'), ('ॕ', 'ॣ'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৌ'), + ('ৎ', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), ('য়', 'ৣ'), + ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', 'ੌ'), ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੰ', 'ੵ'), ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), + ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), + ('ઽ', 
'ૅ'), ('ે', 'ૉ'), ('ો', 'ૌ'), ('ૐ', 'ૐ'), + ('ૠ', 'ૣ'), ('ૹ', 'ૼ'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ୄ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('ୱ', 'ୱ'), + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', 'ௌ'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('ఀ', 'ః'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', 'ౌ'), ('ౕ', 'ౖ'), + ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಽ', 'ೄ'), ('ೆ', 'ೈ'), ('ೊ', 'ೌ'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ൄ'), + ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൎ', 'ൎ'), ('ൔ', 'ൗ'), + ('ൟ', 'ൣ'), ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), + ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), + ('ก', 'ฺ'), ('เ', 'ๆ'), ('ํ', 'ํ'), ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), + ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), + ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), ('ົ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໍ', 'ໍ'), ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ཱྀ'), + ('ྈ', 'ྗ'), ('ྙ', 'ྼ'), ('က', 'ံ'), ('း', 'း'), + ('ျ', 'ဿ'), ('ၐ', 'ၢ'), ('ၥ', 'ၨ'), ('ၮ', 'ႆ'), + ('ႎ', 'ႎ'), ('ႜ', 'ႝ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፟', '፟'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜓ'), ('ᜠ', 'ᜳ'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), ('ក', 'ឳ'), ('ា', 'ៈ'), + ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', 'ᤸ'), + ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), ('ᩡ', 'ᩴ'), ('ᪧ', 'ᪧ'), + ('ᬀ', 'ᬳ'), ('ᬵ', 'ᭃ'), ('ᭅ', 'ᭋ'), ('ᮀ', 'ᮩ'), + ('ᮬ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᯧ', 'ᯱ'), ('ᰀ', 'ᰵ'), + ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᴀ', 'ᶿ'), ('ᷧ', 'ᷴ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), + ('ⷠ', 'ⷿ'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '〩'), + ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), + ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙴ', 'ꙻ'), ('ꙿ', 'ꛯ'), + ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠧ'), + ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣃ'), ('ꣅ', 'ꣅ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤪ'), ('ꤰ', 'ꥒ'), + ('ꥠ', 'ꥼ'), ('ꦀ', 'ꦲ'), ('ꦴ', 'ꦿ'), ('ꧏ', 'ꧏ'), + ('ꧠ', 
'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨶ'), + ('ꩀ', 'ꩍ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪾ'), + ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), + ('ꫲ', 'ꫵ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꯪ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), + ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), + ('𐍐', '𐍺'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), + ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀀', '𑁅'), + ('𑂂', '𑂸'), ('𑃐', '𑃨'), ('𑄀', '𑄲'), ('𑅐', '𑅲'), + ('𑅶', '𑅶'), ('𑆀', '𑆿'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈴'), ('𑈷', '𑈷'), + ('𑈾', '𑈾'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋨'), ('𑌀', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍌'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), + ('𑐀', '𑑁'), ('𑑃', '𑑅'), ('𑑇', '𑑊'), ('𑒀', '𑓁'), + ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖵'), ('𑖸', '𑖾'), + ('𑗘', '𑗝'), ('𑘀', '𑘾'), ('𑙀', '𑙀'), ('𑙄', '𑙄'), + ('𑚀', '𑚵'), ('𑜀', '𑜙'), ('𑜝', '𑜪'), ('𑢠', '𑣟'), + ('𑣿', '𑣿'), ('𑨀', '𑨲'), ('𑨵', '𑨾'), ('𑩐', '𑪃'), + ('𑪆', '𑪗'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), + ('𑰸', '𑰾'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), + ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵁'), ('𑵃', '𑵃'), + ('𑵆', '𑵇'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲞', '𛲞'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), + ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), + ('𞀦', '𞀪'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥇', '𞥇'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const BIDI_CONTROL: &'static [(char, char)] = &[ + 
('\u{61c}', '\u{61c}'), ('\u{200e}', '\u{200f}'), ('\u{202a}', '\u{202e}'), + ('\u{2066}', '\u{2069}'), +]; + +pub const CASE_IGNORABLE: &'static [(char, char)] = &[ + ('\'', '\''), ('.', '.'), (':', ':'), ('^', '^'), ('`', '`'), ('¨', '¨'), + ('\u{ad}', '\u{ad}'), ('¯', '¯'), ('´', '´'), ('·', '¸'), + ('ʰ', 'ͯ'), ('ʹ', '͵'), ('ͺ', 'ͺ'), ('΄', '΅'), ('·', '·'), + ('҃', '҉'), ('ՙ', 'ՙ'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('״', '״'), ('\u{600}', '\u{605}'), + ('ؐ', 'ؚ'), ('\u{61c}', '\u{61c}'), ('ـ', 'ـ'), ('ً', 'ٟ'), + ('ٰ', 'ٰ'), ('ۖ', '\u{6dd}'), ('۟', 'ۨ'), ('۪', 'ۭ'), + ('\u{70f}', '\u{70f}'), ('ܑ', 'ܑ'), ('ܰ', '݊'), ('ަ', 'ް'), + ('߫', 'ߵ'), ('ߺ', 'ߺ'), ('ࠖ', '࠭'), ('࡙', '࡛'), ('ࣔ', 'ं'), + ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), ('्', '्'), + ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ॱ', 'ॱ'), ('ঁ', 'ঁ'), + ('়', '়'), ('ু', 'ৄ'), ('্', '্'), ('ৢ', 'ৣ'), + ('ਁ', 'ਂ'), ('਼', '਼'), ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ં'), ('઼', '઼'), ('ુ', 'ૅ'), ('ે', 'ૈ'), + ('્', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଁ'), + ('଼', '଼'), ('ି', 'ି'), ('ୁ', 'ୄ'), ('୍', '୍'), + ('ୖ', 'ୖ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ீ', 'ீ'), + ('்', '்'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೆ', 'ೆ'), ('ೌ', '್'), + ('ೢ', 'ೣ'), ('ഀ', 'ഁ'), ('഻', '഼'), ('ു', 'ൄ'), + ('്', '്'), ('ൢ', 'ൣ'), ('්', '්'), ('ි', 'ු'), + ('ූ', 'ූ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('ๆ', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('ໆ', 'ໆ'), + ('່', 'ໍ'), ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), + ('༹', '༹'), ('ཱ', 'ཾ'), ('ྀ', '྄'), ('྆', '྇'), + ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ိ', 'ူ'), + ('ဲ', '့'), ('္', '်'), ('ွ', 'ှ'), ('ၘ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), ('ႅ', 'ႆ'), + ('ႍ', 'ႍ'), ('ႝ', 'ႝ'), ('ჼ', 'ჼ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '឵'), ('ិ', 'ួ'), ('ំ', 'ំ'), ('៉', '៓'), + ('ៗ', 'ៗ'), ('៝', '៝'), ('᠋', '\u{180e}'), ('ᡃ', 'ᡃ'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), + ('ᤲ', 'ᤲ'), ('᤹', '᤻'), ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), + ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), ('᩠', '᩠'), ('ᩢ', 'ᩢ'), + ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), ('᩿', '᩿'), ('ᪧ', 'ᪧ'), + ('᪰', '᪾'), ('ᬀ', 'ᬃ'), ('᬴', '᬴'), ('ᬶ', 'ᬺ'), + ('ᬼ', 'ᬼ'), ('ᭂ', 'ᭂ'), ('᭫', '᭳'), ('ᮀ', 'ᮁ'), + ('ᮢ', 'ᮥ'), ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), ('᯦', '᯦'), + ('ᯨ', 'ᯩ'), ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), ('ᰬ', 'ᰳ'), + ('ᰶ', '᰷'), ('ᱸ', 'ᱽ'), ('᳐', '᳒'), ('᳔', '᳠'), + ('᳢', '᳨'), ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), + ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', '᷹'), ('᷻', '᷿'), + ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), + ('῭', '`'), ('´', '῾'), ('\u{200b}', '\u{200f}'), ('‘', '’'), + ('․', '․'), ('‧', '‧'), ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃐', '⃰'), ('ⱼ', 'ⱽ'), + ('⳯', '⳱'), ('ⵯ', 'ⵯ'), ('⵿', '⵿'), ('ⷠ', 'ⷿ'), + ('ⸯ', 'ⸯ'), ('々', '々'), ('〪', '〭'), ('〱', '〵'), + ('〻', '〻'), ('゙', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), ('ꘌ', 'ꘌ'), ('꙯', '꙲'), ('ꙴ', '꙽'), + ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚟ'), ('꛰', '꛱'), ('꜀', '꜡'), + ('ꝰ', 'ꝰ'), ('ꞈ', '꞊'), ('ꟸ', 'ꟹ'), ('ꠂ', 'ꠂ'), + ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠥ', 'ꠦ'), ('꣄', 'ꣅ'), + ('꣠', '꣱'), ('ꤦ', '꤭'), ('ꥇ', 'ꥑ'), ('ꦀ', 'ꦂ'), + ('꦳', '꦳'), ('ꦶ', 'ꦹ'), ('ꦼ', 'ꦼ'), ('ꧏ', 'ꧏ'), + ('ꧥ', 'ꧦ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), ('ꨵ', 'ꨶ'), + ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩰ', 'ꩰ'), ('ꩼ', 'ꩼ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫝ', 'ꫝ'), ('ꫬ', 'ꫭ'), ('ꫳ', 'ꫴ'), + ('꫶', '꫶'), ('꭛', 'ꭟ'), ('ꯥ', 'ꯥ'), ('ꯨ', 'ꯨ'), + ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('﮲', 
'﯁'), ('︀', '️'), + ('︓', '︓'), ('︠', '︯'), ('﹒', '﹒'), ('﹕', '﹕'), + ('\u{feff}', '\u{feff}'), (''', '''), ('.', '.'), (':', ':'), + ('^', '^'), ('`', '`'), ('ー', 'ー'), ('゙', '゚'), + (' ̄', ' ̄'), ('\u{fff9}', '\u{fffb}'), ('𐇽', '𐇽'), + ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐫥', '𐫦'), + ('𑀁', '𑀁'), ('𑀸', '𑁆'), ('𑁿', '𑂁'), ('𑂳', '𑂶'), + ('𑂹', '𑂺'), ('\u{110bd}', '\u{110bd}'), ('𑄀', '𑄂'), + ('𑄧', '𑄫'), ('𑄭', '𑄴'), ('𑅳', '𑅳'), ('𑆀', '𑆁'), + ('𑆶', '𑆾'), ('𑇊', '𑇌'), ('𑈯', '𑈱'), ('𑈴', '𑈴'), + ('𑈶', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋟'), ('𑋣', '𑋪'), + ('𑌀', '𑌁'), ('𑌼', '𑌼'), ('𑍀', '𑍀'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), + ('𑒳', '𑒸'), ('𑒺', '𑒺'), ('𑒿', '𑓀'), ('𑓂', '𑓃'), + ('𑖲', '𑖵'), ('𑖼', '𑖽'), ('𑖿', '𑗀'), ('𑗜', '𑗝'), + ('𑘳', '𑘺'), ('𑘽', '𑘽'), ('𑘿', '𑙀'), ('𑚫', '𑚫'), + ('𑚭', '𑚭'), ('𑚰', '𑚵'), ('𑚷', '𑚷'), ('𑜝', '𑜟'), + ('𑜢', '𑜥'), ('𑜧', '𑜫'), ('𑨁', '𑨆'), ('𑨉', '𑨊'), + ('𑨳', '𑨸'), ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩖'), + ('𑩙', '𑩛'), ('𑪊', '𑪖'), ('𑪘', '𑪙'), ('𑰰', '𑰶'), + ('𑰸', '𑰽'), ('𑰿', '𑰿'), ('𑲒', '𑲧'), ('𑲪', '𑲰'), + ('𑲲', '𑲳'), ('𑲵', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), + ('𖬰', '𖬶'), ('𖭀', '𖭃'), ('𖾏', '𖾟'), ('𖿠', '𖿡'), + ('𛲝', '𛲞'), ('\u{1bca0}', '\u{1bca3}'), ('𝅧', '𝅩'), + ('\u{1d173}', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), + ('𞥄', '𞥊'), ('🏻', '🏿'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), ('󠄀', '󠇯'), +]; + +pub const CASED: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), + ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚝ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), + ('a', 'z'), ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), + ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), + ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), + ('𝟄', '𝟋'), ('𞤀', '𞥃'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ + ('A', 'Z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ß'), ('Ā', 'Ā'), + ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), + ('Č', 'Č'), ('Ď', 
'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), + ('ʼn', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), + ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), + ('Ž', 'Ž'), ('ſ', 'ſ'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), + ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), ('ͅ', 'ͅ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), + ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('ς', 'ς'), ('Ϗ', 'ϑ'), ('ϕ', 'ϖ'), + ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('և', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), 
('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẚ', 'ẛ'), ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), + ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), + ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾀ', 'ᾯ'), ('ᾲ', 'ᾴ'), + ('ᾷ', 'ᾼ'), ('ῂ', 'ῄ'), ('ῇ', 'ῌ'), ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῷ', 'ῼ'), ('Ω', 'Ω'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), 
('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), + ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), + ('ø', 'ķ'), ('Ĺ', 'ƌ'), ('Ǝ', 'ƚ'), ('Ɯ', 'Ʃ'), ('Ƭ', 'ƹ'), + ('Ƽ', 'ƽ'), ('ƿ', 'ƿ'), ('DŽ', 'Ƞ'), ('Ȣ', 'ȳ'), ('Ⱥ', 'ɔ'), + ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϑ'), ('ϕ', 'ϵ'), ('Ϸ', 'ϻ'), ('Ͻ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), ('Ḁ', 'ẛ'), ('ẞ', 'ẞ'), ('Ạ', 'ἕ'), + ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('Ω', 'Ω'), ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'Ɒ'), + ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ⳣ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꜯ'), ('Ꜳ', 'ꝯ'), + ('Ꝺ', 'ꞇ'), ('Ꞌ', 'Ɥ'), ('Ꞑ', 'ꞓ'), ('Ꞗ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), ('𐐀', '𐑏'), + ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), + ('𑢠', '𑣟'), ('𞤀', '𞥃'), +]; + +pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), + ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + 
('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('Ᾰ', 'ᾼ'), ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'ῼ'), ('Ω', 'Ω'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('Ⅰ', 
'Ⅿ'), ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('A', 'Z'), ('𐐀', '𐐧'), + ('𐒰', '𐓓'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), + ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), + ('DŽ', 'DŽ'), ('dž', 'LJ'), ('lj', 'NJ'), ('nj', 'nj'), ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), + ('ǯ', 'DZ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), 
('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), + ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), + ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), + ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), + ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), + ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), + ('ա', 'և'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), + ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), + ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), + ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), + ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), + ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), + ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), + ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), + ('ộ', 'ộ'), ('ớ', 
'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), + ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), + ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), + ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), + ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱞ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞓ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), + ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𞤢', '𞥃'), +]; + +pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + 
('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), + ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), + ('Dž', 'dž'), ('Lj', 'lj'), ('Nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), + ('Dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), + ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ɔ'), ('ɖ', 'ɗ'), + ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), ('ʝ', 'ʞ'), + ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), ('ᵽ', 'ᵽ'), + ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), ('ṻ', 
'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), ('ạ', 'ạ'), + ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), + ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), + ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), + ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), + ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), + ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), + ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), + ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), + ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾷ'), + ('ᾼ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), + ('ῌ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ῼ', 'ῼ'), ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱞ'), + ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞓ'), + ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), + ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐳀', '𐳲'), + ('𑣀', '𑣟'), ('𞤢', '𞥃'), +]; + +pub const DASH: &'static [(char, char)] = &[ + ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), + ('‐', '―'), ('⁓', '⁓'), ('⁻', '⁻'), ('₋', '₋'), + ('−', '−'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), + ('⹀', '⹀'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), + ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), +]; + +pub const 
DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), ('͏', '͏'), ('\u{61c}', '\u{61c}'), ('ᅟ', 'ᅠ'), + ('឴', '឵'), ('᠋', '\u{180e}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{206f}'), ('ㅤ', 'ㅤ'), + ('︀', '️'), ('\u{feff}', '\u{feff}'), ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), ('\u{e0000}', '\u{e0fff}'), +]; + +pub const DEPRECATED: &'static [(char, char)] = &[ + ('ʼn', 'ʼn'), ('ٳ', 'ٳ'), ('ཷ', 'ཷ'), ('ཹ', 'ཹ'), ('ឣ', 'ឤ'), + ('\u{206a}', '\u{206f}'), ('〈', '〉'), ('\u{e0001}', '\u{e0001}'), +]; + +pub const DIACRITIC: &'static [(char, char)] = &[ + ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', '´'), + ('·', '¸'), ('ʰ', '͎'), ('͐', '͗'), ('͝', '͢'), ('ʹ', '͵'), + ('ͺ', 'ͺ'), ('΄', '΅'), ('҃', '҇'), ('ՙ', 'ՙ'), ('֑', '֡'), + ('֣', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), ('ׄ', 'ׄ'), ('ً', 'ْ'), + ('ٗ', '٘'), ('۟', '۠'), ('ۥ', 'ۦ'), ('۪', '۬'), ('ܰ', '݊'), + ('ަ', 'ް'), ('߫', 'ߵ'), ('࠘', '࠙'), ('ࣣ', 'ࣾ'), ('़', '़'), + ('्', '्'), ('॑', '॔'), ('ॱ', 'ॱ'), ('়', '়'), + ('্', '্'), ('਼', '਼'), ('੍', '੍'), ('઼', '઼'), + ('્', '્'), ('૽', '૿'), ('଼', '଼'), ('୍', '୍'), + ('்', '்'), ('్', '్'), ('಼', '಼'), ('್', '್'), + ('഻', '഼'), ('്', '്'), ('්', '්'), ('็', '์'), + ('๎', '๎'), ('່', '໌'), ('༘', '༙'), ('༵', '༵'), + ('༷', '༷'), ('༹', '༹'), ('༾', '༿'), ('ྂ', '྄'), + ('྆', '྇'), ('࿆', '࿆'), ('့', '့'), ('္', '်'), + ('ႇ', 'ႍ'), ('ႏ', 'ႏ'), ('ႚ', 'ႛ'), ('៉', '៓'), + ('៝', '៝'), ('᤹', '᤻'), ('᩵', '᩼'), ('᩿', '᩿'), + ('᪰', '᪽'), ('᬴', '᬴'), ('᭄', '᭄'), ('᭫', '᭳'), + ('᮪', '᮫'), ('ᰶ', '᰷'), ('ᱸ', 'ᱽ'), ('᳐', '᳨'), + ('᳭', '᳭'), ('᳴', '᳴'), ('᳷', '᳹'), ('ᴬ', 'ᵪ'), + ('᷄', '᷏'), ('᷵', '᷹'), ('᷽', '᷿'), ('᾽', '᾽'), + ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), ('῭', '`'), + ('´', '῾'), ('⳯', '⳱'), ('ⸯ', 'ⸯ'), ('〪', '〯'), + ('゙', '゜'), ('ー', 'ー'), ('꙯', '꙯'), ('꙼', '꙽'), + ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('꛰', '꛱'), ('ꜗ', '꜡'), + ('ꞈ', 'ꞈ'), ('ꟸ', 'ꟹ'), ('꣄', '꣄'), ('꣠', '꣱'), + ('꤫', '꤮'), ('꥓', '꥓'), ('꦳', '꦳'), ('꧀', '꧀'), + ('ꧥ', 'ꧥ'), ('ꩻ', 'ꩽ'), ('꪿', 'ꫂ'), ('꫶', '꫶'), + ('꭛', 'ꭟ'), ('꯬', '꯭'), ('ﬞ', 'ﬞ'), ('︠', '︯'), + ('^', '^'), ('`', '`'), ('ー', 'ー'), ('゙', '゚'), + (' ̄', ' ̄'), ('𐋠', '𐋠'), ('𐫥', '𐫦'), ('𑂹', '𑂺'), + ('𑄳', '𑄴'), ('𑅳', '𑅳'), ('𑇀', '𑇀'), ('𑇊', '𑇌'), + ('𑈵', '𑈶'), ('𑋩', '𑋪'), ('𑌼', '𑌼'), ('𑍍', '𑍍'), + ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑑂', '𑑂'), ('𑑆', '𑑆'), + ('𑓂', '𑓃'), ('𑖿', '𑗀'), ('𑘿', '𑘿'), ('𑚶', '𑚷'), + ('𑜫', '𑜫'), ('𑨴', '𑨴'), ('𑩇', '𑩇'), ('𑪙', '𑪙'), + ('𑰿', '𑰿'), ('𑵂', '𑵂'), ('𑵄', '𑵅'), ('𖫰', '𖫴'), + ('𖾏', '𖾟'), ('𝅧', '𝅩'), ('𝅭', '𝅲'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𞣐', '𞣖'), ('𞥄', '𞥆'), + ('𞥈', '𞥊'), +]; + +pub const EXTENDER: &'static [(char, char)] = &[ + ('·', '·'), ('ː', 'ˑ'), ('ـ', 'ـ'), ('ߺ', 'ߺ'), ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), ('᠊', '᠊'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), + ('ᰶ', 'ᰶ'), ('ᱻ', 'ᱻ'), ('々', '々'), ('〱', '〵'), + ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꘌ', 'ꘌ'), + ('ꧏ', 'ꧏ'), ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), ('ー', 'ー'), ('𑍝', '𑍝'), ('𑗆', '𑗈'), + ('𑪘', '𑪘'), ('𖭂', '𖭃'), ('𖿠', '𖿡'), ('𞥄', '𞥆'), +]; + +pub const GRAPHEME_BASE: &'static [(char, char)] = &[ + (' ', '~'), ('\u{a0}', '¬'), ('®', '˿'), ('Ͱ', 'ͷ'), ('ͺ', 'Ϳ'), + ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', '҂'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '֊'), ('֍', '֏'), + ('־', '־'), ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('א', 'ת'), + ('װ', '״'), ('؆', '؏'), ('؛', '؛'), ('؞', 'ي'), ('٠', 'ٯ'), + ('ٱ', 'ە'), ('۞', '۞'), ('ۥ', 'ۦ'), ('۩', '۩'), ('ۮ', '܍'), + ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), 
('ޱ', 'ޱ'), ('߀', 'ߪ'), + ('ߴ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), ('࠰', '࠾'), ('ࡀ', 'ࡘ'), ('࡞', '࡞'), + ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ः', 'ह'), + ('ऻ', 'ऻ'), ('ऽ', 'ी'), ('ॉ', 'ौ'), ('ॎ', 'ॐ'), + ('क़', 'ॡ'), ('।', 'ঀ'), ('ং', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ি', 'ী'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), + ('০', '৽'), ('ਃ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), ('ਾ', 'ੀ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('੦', '੯'), ('ੲ', 'ੴ'), ('ઃ', 'ઃ'), ('અ', 'ઍ'), + ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), + ('વ', 'હ'), ('ઽ', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('૦', '૱'), ('ૹ', 'ૹ'), + ('ଂ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), + ('ୀ', 'ୀ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), ('୦', '୷'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), + ('ி', 'ி'), ('ு', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), ('௦', '௺'), ('ఁ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), + ('ు', 'ౄ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('౦', '౯'), + ('౸', 'ಀ'), ('ಂ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಾ'), + ('ೀ', 'ು'), ('ೃ', 'ೄ'), ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ೱ', 'ೲ'), + ('ം', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ി', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), + ('ൎ', '൏'), ('ൔ', 'ൖ'), ('൘', 'ൡ'), ('൦', 'ൿ'), + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ැ', 'ෑ'), ('ෘ', 'ෞ'), + ('෦', '෯'), ('ෲ', '෴'), ('ก', 'ะ'), ('า', 'ำ'), + ('฿', 'ๆ'), ('๏', '๛'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), + ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), + ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), + ('ສ', 'ຫ'), ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('໐', '໙'), ('ໜ', 'ໟ'), + ('ༀ', '༗'), ('༚', '༴'), ('༶', '༶'), ('༸', '༸'), + ('༺', 'ཇ'), ('ཉ', 'ཬ'), ('ཿ', 'ཿ'), ('྅', '྅'), + ('ྈ', 'ྌ'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿚'), + ('က', 'ာ'), ('ေ', 'ေ'), ('း', 'း'), ('ျ', 'ြ'), + ('ဿ', 'ၗ'), ('ၚ', 'ၝ'), ('ၡ', 'ၰ'), ('ၵ', 'ႁ'), + ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႎ', 'ႜ'), ('႞', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፠', '፼'), ('ᎀ', '᎙'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('᐀', '᚜'), ('ᚠ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('᜵', '᜶'), + ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), + ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), ('។', 'ៜ'), + ('០', '៩'), ('៰', '៹'), ('᠀', '᠊'), ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), ('᥀', '᥀'), ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), + ('᧞', 'ᨖ'), ('ᨙ', 'ᨚ'), ('᨞', 'ᩕ'), ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('᪀', '᪉'), + ('᪐', '᪙'), ('᪠', '᪭'), ('ᬄ', 'ᬳ'), ('ᬵ', 'ᬵ'), + ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', 'ᭋ'), ('᭐', '᭪'), + ('᭴', '᭼'), ('ᮂ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), + ('ᮮ', 'ᯥ'), ('ᯧ', 'ᯧ'), ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), ('᯼', 'ᰫ'), ('ᰴ', 'ᰵ'), ('᰻', '᱉'), + ('ᱍ', 'ᲈ'), ('᳀', '᳇'), ('᳓', '᳓'), ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', '᳷'), ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 
'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), + ('\u{2000}', '\u{200a}'), ('‐', '‧'), ('\u{202f}', '\u{205f}'), + ('⁰', 'ⁱ'), ('⁴', '₎'), ('ₐ', 'ₜ'), ('₠', '₿'), + ('℀', '↋'), ('←', '␦'), ('⑀', '⑊'), ('①', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('⳹', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⹉'), + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), + ('\u{3000}', '〩'), ('〰', '〿'), ('ぁ', 'ゖ'), ('゛', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('㆐', 'ㆺ'), ('㇀', '㇣'), + ('ㇰ', '㈞'), ('㈠', '㋾'), ('㌀', '䶵'), ('䷀', '鿪'), + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), ('ꓐ', 'ꘫ'), ('Ꙁ', 'ꙮ'), + ('꙳', '꙳'), ('꙾', 'ꚝ'), ('ꚠ', 'ꛯ'), ('꛲', '꛷'), + ('꜀', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠤ'), ('ꠧ', '꠫'), ('꠰', '꠹'), + ('ꡀ', '꡷'), ('ꢀ', 'ꣃ'), ('꣎', '꣙'), ('ꣲ', 'ꣽ'), + ('꤀', 'ꤥ'), ('꤮', 'ꥆ'), ('ꥒ', '꥓'), ('꥟', 'ꥼ'), + ('ꦃ', 'ꦲ'), ('ꦴ', 'ꦵ'), ('ꦺ', 'ꦻ'), ('ꦽ', '꧍'), + ('ꧏ', '꧙'), ('꧞', 'ꧤ'), ('ꧦ', 'ꧾ'), ('ꨀ', 'ꨨ'), + ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), + ('ꩍ', 'ꩍ'), ('꩐', '꩙'), ('꩜', 'ꩻ'), ('ꩽ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫫ'), ('ꫮ', 'ꫵ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭥ'), ('ꭰ', 'ꯤ'), ('ꯦ', 'ꯧ'), ('ꯩ', '꯬'), + ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), + ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), ('ײַ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', '﯁'), ('ﯓ', '﴿'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('︐', '︙'), + ('︰', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('!', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), + ('│', '○'), ('', '�'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), + ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), + ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐆎'), + ('𐆐', '𐆛'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐋡', '𐋻'), ('𐌀', '𐌣'), ('𐌭', '𐍊'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎟', '𐏃'), ('𐏈', '𐏕'), + ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕯'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡗', '𐢞'), ('𐢧', '𐢯'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐣻', '𐤛'), ('𐤟', '𐤹'), ('𐤿', '𐤿'), ('𐦀', '𐦷'), + ('𐦼', '𐧏'), ('𐧒', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐩀', '𐩇'), ('𐩐', '𐩘'), ('𐩠', '𐪟'), + ('𐫀', '𐫤'), ('𐫫', '𐫶'), ('𐬀', '𐬵'), ('𐬹', '𐭕'), + ('𐭘', '𐭲'), ('𐭸', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), + ('𐹠', '𐹾'), ('𑀀', '𑀀'), ('𑀂', '𑀷'), ('𑁇', '𑁍'), + ('𑁒', '𑁯'), ('𑂂', '𑂲'), ('𑂷', '𑂸'), ('𑂻', '𑂼'), + ('𑂾', '𑃁'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('𑄃', '𑄦'), + ('𑄬', '𑄬'), ('𑄶', '𑅃'), ('𑅐', '𑅲'), ('𑅴', '𑅶'), + ('𑆂', '𑆵'), ('𑆿', '𑇉'), ('𑇍', '𑇍'), ('𑇐', '𑇟'), + ('𑇡', '𑇴'), ('𑈀', '𑈑'), ('𑈓', '𑈮'), ('𑈲', '𑈳'), + ('𑈵', '𑈵'), ('𑈸', '𑈽'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), + ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('𑊰', '𑋞'), + ('𑋠', '𑋢'), ('𑋰', '𑋹'), ('𑌂', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑌿', '𑌿'), ('𑍁', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍝', '𑍣'), + ('𑐀', '𑐷'), ('𑑀', '𑑁'), ('𑑅', '𑑅'), ('𑑇', '𑑙'), + ('𑑛', '𑑛'), ('𑑝', '𑑝'), ('𑒀', '𑒯'), ('𑒱', '𑒲'), + ('𑒹', '𑒹'), ('𑒻', '𑒼'), ('𑒾', '𑒾'), ('𑓁', '𑓁'), + ('𑓄', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖮'), 
('𑖰', '𑖱'), + ('𑖸', '𑖻'), ('𑖾', '𑖾'), ('𑗁', '𑗛'), ('𑘀', '𑘲'), + ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑙁', '𑙄'), ('𑙐', '𑙙'), + ('𑙠', '𑙬'), ('𑚀', '𑚪'), ('𑚬', '𑚬'), ('𑚮', '𑚯'), + ('𑚶', '𑚶'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), ('𑜠', '𑜡'), + ('𑜦', '𑜦'), ('𑜰', '𑜿'), ('𑢠', '𑣲'), ('𑣿', '𑣿'), + ('𑨀', '𑨀'), ('𑨇', '𑨈'), ('𑨋', '𑨲'), ('𑨹', '𑨺'), + ('𑨿', '𑩆'), ('𑩐', '𑩐'), ('𑩗', '𑩘'), ('𑩜', '𑪃'), + ('𑪆', '𑪉'), ('𑪗', '𑪗'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰯'), ('𑰾', '𑰾'), + ('𑱀', '𑱅'), ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('𑲩', '𑲩'), + ('𑲱', '𑲱'), ('𑲴', '𑲴'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), + ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), + ('𖩮', '𖩯'), ('𖫐', '𖫭'), ('𖫵', '𖫵'), ('𖬀', '𖬯'), + ('𖬷', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲜', '𛲜'), ('𛲟', '𛲟'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅦', '𝅦'), ('𝅪', '𝅭'), + ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝈀', '𝉁'), + ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝍠', '𝍱'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝧿'), + ('𝨷', '𝨺'), ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪋'), + ('𞠀', '𞣄'), ('𞣇', '𞣏'), ('𞤀', '𞥃'), ('𞥐', '𞥙'), + ('𞥞', '𞥟'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄀', '🄌'), ('🄐', '🄮'), ('🄰', '🅫'), + ('🅰', '🆬'), ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), + ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), + ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), + ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), + ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), + ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), +]; + +pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҉'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ं'), ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), + ('्', '्'), ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ঁ', 'ঁ'), + ('়', '়'), ('া', 'া'), ('ু', 'ৄ'), ('্', '্'), + ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), ('ਁ', 'ਂ'), ('਼', '਼'), + ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), + ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), ('ઁ', 'ં'), ('઼', '઼'), + ('ુ', 'ૅ'), ('ે', 'ૈ'), ('્', '્'), ('ૢ', 'ૣ'), + ('ૺ', '૿'), ('ଁ', 'ଁ'), ('଼', '଼'), ('ା', 'ି'), + ('ୁ', 'ୄ'), ('୍', '୍'), ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), + ('ஂ', 'ஂ'), ('ா', 'ா'), ('ீ', 'ீ'), ('்', '்'), + ('ௗ', 'ௗ'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೂ', 'ೂ'), ('ೆ', 'ೆ'), + ('ೌ', '್'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), ('ഀ', 'ഁ'), + ('഻', '഼'), ('ാ', 'ാ'), ('ു', 'ൄ'), ('്', '്'), + ('ൗ', 'ൗ'), 
('ൢ', 'ൣ'), ('්', '්'), ('ා', 'ා'), + ('ි', 'ු'), ('ූ', 'ූ'), ('ෟ', 'ෟ'), ('ั', 'ั'), + ('ิ', 'ฺ'), ('็', '๎'), ('ັ', 'ັ'), ('ິ', 'ູ'), + ('ົ', 'ຼ'), ('່', 'ໍ'), ('༘', '༙'), ('༵', '༵'), + ('༷', '༷'), ('༹', '༹'), ('ཱ', 'ཾ'), ('ྀ', '྄'), + ('྆', '྇'), ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), ('࿆', '࿆'), + ('ိ', 'ူ'), ('ဲ', '့'), ('္', '်'), ('ွ', 'ှ'), + ('ၘ', 'ၙ'), ('ၞ', 'ၠ'), ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), + ('ႅ', 'ႆ'), ('ႍ', 'ႍ'), ('ႝ', 'ႝ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '឵'), ('ិ', 'ួ'), ('ំ', 'ំ'), ('៉', '៓'), + ('៝', '៝'), ('᠋', '᠍'), ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), + ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), ('ᤲ', 'ᤲ'), ('᤹', '᤻'), + ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), + ('᩠', '᩠'), ('ᩢ', 'ᩢ'), ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), + ('᩿', '᩿'), ('᪰', '᪾'), ('ᬀ', 'ᬃ'), ('᬴', '᬴'), + ('ᬶ', 'ᬺ'), ('ᬼ', 'ᬼ'), ('ᭂ', 'ᭂ'), ('᭫', '᭳'), + ('ᮀ', 'ᮁ'), ('ᮢ', 'ᮥ'), ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), + ('᯦', '᯦'), ('ᯨ', 'ᯩ'), ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), + ('ᰬ', 'ᰳ'), ('ᰶ', '᰷'), ('᳐', '᳒'), ('᳔', '᳠'), + ('᳢', '᳨'), ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), + ('᷀', '᷹'), ('᷻', '᷿'), ('\u{200c}', '\u{200c}'), ('⃐', '⃰'), + ('⳯', '⳱'), ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〯'), + ('゙', '゚'), ('꙯', '꙲'), ('ꙴ', '꙽'), ('ꚞ', 'ꚟ'), + ('꛰', '꛱'), ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), + ('ꠥ', 'ꠦ'), ('꣄', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), + ('ꥇ', 'ꥑ'), ('ꦀ', 'ꦂ'), ('꦳', '꦳'), ('ꦶ', 'ꦹ'), + ('ꦼ', 'ꦼ'), ('ꧥ', 'ꧥ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), + ('ꨵ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩼ', 'ꩼ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫬ', 'ꫭ'), ('꫶', '꫶'), ('ꯥ', 'ꯥ'), + ('ꯨ', 'ꯨ'), ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), + ('︠', '︯'), ('゙', '゚'), ('𐇽', '𐇽'), ('𐋠', '𐋠'), + ('𐍶', '𐍺'), ('𐨁', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨏'), + ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐫥', '𐫦'), ('𑀁', '𑀁'), + ('𑀸', '𑁆'), ('𑁿', '𑂁'), ('𑂳', '𑂶'), ('𑂹', '𑂺'), + ('𑄀', '𑄂'), ('𑄧', '𑄫'), ('𑄭', '𑄴'), ('𑅳', '𑅳'), + ('𑆀', '𑆁'), ('𑆶', '𑆾'), ('𑇊', '𑇌'), ('𑈯', '𑈱'), + ('𑈴', '𑈴'), ('𑈶', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋟'), + ('𑋣', '𑋪'), ('𑌀', '𑌁'), ('𑌼', '𑌼'), ('𑌾', '𑌾'), + ('𑍀', '𑍀'), ('𑍗', '𑍗'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), + ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), ('𑒰', '𑒰'), + ('𑒳', '𑒸'), ('𑒺', '𑒺'), ('𑒽', '𑒽'), ('𑒿', '𑓀'), + ('𑓂', '𑓃'), ('𑖯', '𑖯'), ('𑖲', '𑖵'), ('𑖼', '𑖽'), + ('𑖿', '𑗀'), ('𑗜', '𑗝'), ('𑘳', '𑘺'), ('𑘽', '𑘽'), + ('𑘿', '𑙀'), ('𑚫', '𑚫'), ('𑚭', '𑚭'), ('𑚰', '𑚵'), + ('𑚷', '𑚷'), ('𑜝', '𑜟'), ('𑜢', '𑜥'), ('𑜧', '𑜫'), + ('𑨁', '𑨆'), ('𑨉', '𑨊'), ('𑨳', '𑨸'), ('𑨻', '𑨾'), + ('𑩇', '𑩇'), ('𑩑', '𑩖'), ('𑩙', '𑩛'), ('𑪊', '𑪖'), + ('𑪘', '𑪙'), ('𑰰', '𑰶'), ('𑰸', '𑰽'), ('𑰿', '𑰿'), + ('𑲒', '𑲧'), ('𑲪', '𑲰'), ('𑲲', '𑲳'), ('𑲵', '𑲶'), + ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵅'), + ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), ('𖾏', '𖾒'), + ('𛲝', '𛲞'), ('𝅥', '𝅥'), ('𝅧', '𝅩'), ('𝅮', '𝅲'), + ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), + ('𞥄', '𞥊'), ('\u{e0020}', '\u{e007f}'), ('󠄀', '󠇯'), +]; + +pub const GRAPHEME_LINK: &'static [(char, char)] = &[ + ('्', '्'), ('্', '্'), ('੍', '੍'), ('્', '્'), + ('୍', '୍'), ('்', '்'), ('్', '్'), ('್', '್'), + ('഻', '഼'), ('്', '്'), ('්', '්'), ('ฺ', 'ฺ'), + ('྄', '྄'), ('္', '်'), ('᜔', '᜔'), ('᜴', '᜴'), + ('្', '្'), ('᩠', '᩠'), ('᭄', '᭄'), ('᮪', '᮫'), + ('᯲', '᯳'), ('⵿', '⵿'), ('꠆', '꠆'), ('꣄', '꣄'), + ('꥓', '꥓'), ('꧀', '꧀'), ('꫶', '꫶'), ('꯭', '꯭'), + ('𐨿', '𐨿'), ('𑁆', '𑁆'), ('𑁿', '𑁿'), ('𑂹', '𑂹'), + ('𑄳', '𑄴'), ('𑇀', '𑇀'), ('𑈵', '𑈵'), ('𑋪', '𑋪'), + ('𑍍', '𑍍'), ('𑑂', '𑑂'), ('𑓂', '𑓂'), ('𑖿', '𑖿'), + ('𑘿', '𑘿'), ('𑚶', '𑚶'), ('𑜫', '𑜫'), 
('𑨴', '𑨴'), + ('𑩇', '𑩇'), ('𑪙', '𑪙'), ('𑰿', '𑰿'), ('𑵄', '𑵅'), +]; + +pub const HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'F'), ('a', 'f'), ('0', '9'), ('A', 'F'), + ('a', 'f'), +]; + +pub const HYPHEN: &'static [(char, char)] = &[ + ('-', '-'), ('\u{ad}', '\u{ad}'), ('֊', '֊'), ('᠆', '᠆'), + ('‐', '‑'), ('⸗', '⸗'), ('・', '・'), ('﹣', '﹣'), + ('-', '-'), ('・', '・'), +]; + +pub const IDS_BINARY_OPERATOR: &'static [(char, char)] = &[ + ('⿰', '⿱'), ('⿴', '⿻'), +]; + +pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[ + ('⿲', '⿳'), +]; + +pub const ID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), + ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), + ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', '҇'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), + ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), + ('ؐ', 'ؚ'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), + ('۪', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), + ('ߺ', 'ߺ'), ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), + ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૯'), ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), + ('ୱ', 'ୱ'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), + ('௦', '௯'), ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), + ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), + ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), + ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', 'ൎ'), ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), + ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', 'ෳ'), ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), + ('༠', '༩'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + 
('፝', '፟'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('ក', '៓'), ('ៗ', 'ៗ'), ('ៜ', '៝'), + ('០', '៩'), ('᠋', '᠍'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), + ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), + ('ᤰ', '᤻'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), + ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), + ('᪰', '᪽'), ('ᬀ', 'ᭋ'), ('᭐', '᭙'), ('᭫', '᭳'), + ('ᮀ', '᯳'), ('ᰀ', '᰷'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('᳐', '᳒'), ('᳔', '᳹'), ('ᴀ', '᷹'), + ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), + ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), ('Ꙁ', '꙯'), ('ꙴ', '꙽'), ('ꙿ', '꛱'), + ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), ('꣐', '꣙'), + ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('꤀', '꤭'), + ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), + ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), ('가', '힣'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), + ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), ('︀', '️'), ('︠', '︯'), ('︳', '︴'), + ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'), + ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐇽', '𐇽'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐋠', '𐋠'), + ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍺'), ('𐎀', '𐎝'), + ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), + ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), + ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), + ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), + ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), + ('𐨿', '𐨿'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), + ('𐫉', '𐫦'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), + ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), + ('𑀀', '𑁆'), ('𑁦', '𑁯'), ('𑁿', '𑂺'), ('𑃐', '𑃨'), + ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑄿'), ('𑅐', '𑅳'), + ('𑅶', '𑅶'), ('𑆀', '𑇄'), ('𑇊', '𑇌'), ('𑇐', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈷'), ('𑈾', '𑈾'), + ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), + ('𑊟', '𑊨'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), 
('𑌀', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), + ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), + ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), + ('𑖸', '𑗀'), ('𑗘', '𑗝'), ('𑘀', '𑙀'), ('𑙄', '𑙄'), + ('𑙐', '𑙙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), + ('𑜝', '𑜫'), ('𑜰', '𑜹'), ('𑢠', '𑣩'), ('𑣿', '𑣿'), + ('𑨀', '𑨾'), ('𑩇', '𑩇'), ('𑩐', '𑪃'), ('𑪆', '𑪙'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱀'), + ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), + ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), + ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖫐', '𖫭'), + ('𖫰', '𖫴'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), + ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), + ('𖾏', '𖾟'), ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), + ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), + ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), + ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), + ('𝉂', '𝉄'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞠀', '𞣄'), + ('𞣐', '𞣖'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), + ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), + ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), + ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), + ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), + ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), + ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), + ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), + ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), ('󠄀', '󠇯'), +]; + +pub const ID_START: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), 
('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), + ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), + ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゛', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐊀', '𐊜'), 
('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), + ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), + ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐩠', '𐩼'), + ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), + ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), + ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), + ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), + ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), + ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), + ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), + ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), + ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), + ('𑢠', '𑣟'), ('𑣿', '𑣿'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), + ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪃'), ('𑪆', '𑪉'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), + ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), + ('𑵆', '𑵆'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), + ('𞤀', '𞥃'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const IDEOGRAPHIC: &'static [(char, char)] = &[ + ('〆', '〇'), ('〡', '〩'), ('〸', '〺'), ('㐀', '䶵'), + ('一', '鿪'), ('豈', '舘'), ('並', '龎'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛅰', '𛋻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const JOIN_CONTROL: &'static [(char, char)] = &[ + ('\u{200c}', '\u{200d}'), +]; + +pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ + ('เ', 'ไ'), ('ເ', 'ໄ'), ('ᦵ', 'ᦷ'), ('ᦺ', 'ᦺ'), + ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪹ'), ('ꪻ', 'ꪼ'), +]; + +pub const LOWERCASE: &'static [(char, char)] = &[ + ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('ß', 'ö'), + ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), + ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), + ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), + ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), + ('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), + ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), + ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), + ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 
'ŏ'), + ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), + ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), + ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), + ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), + ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), + ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), + ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), + ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), 
('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), ('ẟ', 'ẟ'), ('ạ', 'ạ'), + ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), + ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), + ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), + ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), + ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), + ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), + ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), + ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), + ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱞ'), ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱽ'), ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝸ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), + ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), + ('𝐚', '𝐳'), ('𝑎', '𝑔'), ('𝑖', '𝑧'), 
('𝒂', '𝒛'), + ('𝒶', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝓏'), + ('𝓪', '𝔃'), ('𝔞', '𝔷'), ('𝕒', '𝕫'), ('𝖆', '𝖟'), + ('𝖺', '𝗓'), ('𝗮', '𝘇'), ('𝘢', '𝘻'), ('𝙖', '𝙯'), + ('𝚊', '𝚥'), ('𝛂', '𝛚'), ('𝛜', '𝛡'), ('𝛼', '𝜔'), + ('𝜖', '𝜛'), ('𝜶', '𝝎'), ('𝝐', '𝝕'), ('𝝰', '𝞈'), + ('𝞊', '𝞏'), ('𝞪', '𝟂'), ('𝟄', '𝟉'), ('𝟋', '𝟋'), + ('𞤢', '𞥃'), +]; + +pub const MATH: &'static [(char, char)] = &[ + ('+', '+'), ('<', '>'), ('^', '^'), ('|', '|'), ('~', '~'), ('¬', '¬'), + ('±', '±'), ('×', '×'), ('÷', '÷'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), ('ϴ', '϶'), ('؆', '؈'), ('‖', '‖'), ('′', '‴'), + ('⁀', '⁀'), ('⁄', '⁄'), ('⁒', '⁒'), ('\u{2061}', '\u{2064}'), + ('⁺', '⁾'), ('₊', '₎'), ('⃐', '⃜'), ('⃡', '⃡'), + ('⃥', '⃦'), ('⃫', '⃯'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), + ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), + ('ℼ', 'ⅉ'), ('⅋', '⅋'), ('←', '↧'), ('↩', '↮'), + ('↰', '↱'), ('↶', '↷'), ('↼', '⇛'), ('⇝', '⇝'), + ('⇤', '⇥'), ('⇴', '⋿'), ('⌈', '⌋'), ('⌠', '⌡'), + ('⍼', '⍼'), ('⎛', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), + ('⏜', '⏢'), ('■', '□'), ('▮', '▷'), ('▼', '◁'), + ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), + ('◤', '◤'), ('◧', '◬'), ('◸', '◿'), ('★', '☆'), + ('♀', '♀'), ('♂', '♂'), ('♠', '♣'), ('♭', '♯'), + ('⟀', '⟿'), ('⤀', '⫿'), ('⬰', '⭄'), ('⭇', '⭌'), + ('﬩', '﬩'), ('﹡', '﹦'), ('﹨', '﹨'), ('+', '+'), + ('<', '>'), ('\', '\'), ('^', '^'), ('|', '|'), + ('~', '~'), ('¬', '¬'), ('←', '↓'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ + ('\u{fdd0}', '\u{fdef}'), ('\u{fffe}', '\u{ffff}'), + ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ + ('ͅ', 'ͅ'), ('ְ', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), + ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٗ'), ('ٙ', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('ۡ', 'ۤ'), ('ۧ', 'ۨ'), ('ۭ', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', 'ܿ'), ('ަ', 'ް'), ('ࠖ', 'ࠗ'), ('ࠛ', 'ࠣ'), ('ࠥ', 'ࠧ'), + ('ࠩ', 'ࠬ'), ('ࣔ', 'ࣟ'), ('ࣣ', 'ࣩ'), ('ࣰ', 'ः'), + ('ऺ', 'ऻ'), ('ा', 'ौ'), ('ॎ', 'ॏ'), ('ॕ', 'ॗ'), + ('ॢ', 'ॣ'), ('ঁ', 'ঃ'), ('া', 'ৄ'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), ('ਁ', 'ਃ'), + ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', 'ੌ'), ('ੑ', 'ੑ'), + ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), ('ઁ', 'ઃ'), ('ા', 'ૅ'), + ('ે', 'ૉ'), ('ો', 'ૌ'), ('ૢ', 'ૣ'), ('ૺ', 'ૼ'), + ('ଁ', 'ଃ'), ('ା', 'ୄ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), + ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௗ', 'ௗ'), ('ఀ', 'ః'), + ('ా', 'ౄ'), ('ె', 'ై'), ('ొ', 
'ౌ'), ('ౕ', 'ౖ'), + ('ౢ', 'ౣ'), ('ಁ', 'ಃ'), ('ಾ', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', 'ೌ'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), ('ഀ', 'ഃ'), + ('ാ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൗ', 'ൗ'), + ('ൢ', 'ൣ'), ('ං', 'ඃ'), ('ා', 'ු'), ('ූ', 'ූ'), + ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), ('ั', 'ั'), ('ิ', 'ฺ'), + ('ํ', 'ํ'), ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), + ('ໍ', 'ໍ'), ('ཱ', 'ཱྀ'), ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), + ('ါ', 'ံ'), ('း', 'း'), ('ျ', 'ှ'), ('ၖ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၢ', 'ၢ'), ('ၧ', 'ၨ'), ('ၱ', 'ၴ'), + ('ႂ', 'ႆ'), ('ႜ', 'ႝ'), ('፟', '፟'), ('ᜒ', 'ᜓ'), + ('ᜲ', 'ᜳ'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), ('ា', 'ៈ'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤫ'), ('ᤰ', 'ᤸ'), + ('ᨗ', 'ᨛ'), ('ᩕ', 'ᩞ'), ('ᩡ', 'ᩴ'), ('ᬀ', 'ᬄ'), + ('ᬵ', 'ᭃ'), ('ᮀ', 'ᮂ'), ('ᮡ', 'ᮩ'), ('ᮬ', 'ᮭ'), + ('ᯧ', 'ᯱ'), ('ᰤ', 'ᰵ'), ('ᳲ', 'ᳳ'), ('ᷧ', 'ᷴ'), + ('Ⓐ', 'ⓩ'), ('ⷠ', 'ⷿ'), ('ꙴ', 'ꙻ'), ('ꚞ', 'ꚟ'), + ('ꠣ', 'ꠧ'), ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣃ'), ('ꣅ', 'ꣅ'), + ('ꤦ', 'ꤪ'), ('ꥇ', 'ꥒ'), ('ꦀ', 'ꦃ'), ('ꦴ', 'ꦿ'), + ('ꨩ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩍ'), ('ꪰ', 'ꪰ'), + ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', 'ꪾ'), ('ꫫ', 'ꫯ'), + ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯪ'), ('ﬞ', 'ﬞ'), ('𐍶', '𐍺'), + ('𐨁', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𑀀', '𑀂'), + ('𑀸', '𑁅'), ('𑂂', '𑂂'), ('𑂰', '𑂸'), ('𑄀', '𑄂'), + ('𑄧', '𑄲'), ('𑆀', '𑆂'), ('𑆳', '𑆿'), ('𑈬', '𑈴'), + ('𑈷', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋨'), ('𑌀', '𑌃'), + ('𑌾', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍌'), ('𑍗', '𑍗'), + ('𑍢', '𑍣'), ('𑐵', '𑑁'), ('𑑃', '𑑅'), ('𑒰', '𑓁'), + ('𑖯', '𑖵'), ('𑖸', '𑖾'), ('𑗜', '𑗝'), ('𑘰', '𑘾'), + ('𑙀', '𑙀'), ('𑚫', '𑚵'), ('𑜝', '𑜪'), ('𑨁', '𑨊'), + ('𑨵', '𑨹'), ('𑨻', '𑨾'), ('𑩑', '𑩛'), ('𑪊', '𑪗'), + ('𑰯', '𑰶'), ('𑰸', '𑰾'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵁'), + ('𑵃', '𑵃'), ('𑵇', '𑵇'), ('𖬰', '𖬶'), ('𖽑', '𖽾'), + ('𛲞', '𛲞'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞥇', '𞥇'), ('🄰', '🅉'), + ('🅐', '🅩'), ('🅰', '🆉'), +]; + +pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('͏', '͏'), ('ᅟ', 'ᅠ'), ('឴', '឵'), ('\u{2065}', '\u{2065}'), + ('ㅤ', 'ㅤ'), ('ᅠ', 'ᅠ'), ('\u{fff0}', '\u{fff8}'), + ('\u{e0000}', '\u{e0000}'), ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('া', 'া'), ('ৗ', 'ৗ'), ('ା', 'ା'), ('ୗ', 'ୗ'), + ('ா', 'ா'), ('ௗ', 'ௗ'), ('ೂ', 'ೂ'), ('ೕ', 'ೖ'), + ('ാ', 'ാ'), ('ൗ', 'ൗ'), ('ා', 'ා'), ('ෟ', 'ෟ'), + ('\u{200c}', '\u{200c}'), ('〮', '〯'), ('゙', '゚'), ('𑌾', '𑌾'), + ('𑍗', '𑍗'), ('𑒰', '𑒰'), ('𑒽', '𑒽'), ('𑖯', '𑖯'), + ('𝅥', '𝅥'), ('𝅮', '𝅲'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const OTHER_ID_CONTINUE: &'static [(char, char)] = &[ + ('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚'), +]; + +pub const OTHER_ID_START: &'static [(char, char)] = &[ + ('ᢅ', 'ᢆ'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜'), +]; + +pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ + ('ª', 'ª'), ('º', 'º'), ('ʰ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), + ('ͅ', 'ͅ'), ('ͺ', 'ͺ'), ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⅰ', 'ⅿ'), + ('ⓐ', 'ⓩ'), ('ⱼ', 'ⱽ'), ('ꚜ', 'ꚝ'), ('ꝰ', 'ꝰ'), + ('ꟸ', 'ꟹ'), ('ꭜ', 'ꭟ'), +]; + +pub const OTHER_MATH: &'static [(char, char)] = &[ + ('^', '^'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), + ('‖', '‖'), ('′', '‴'), ('⁀', '⁀'), ('\u{2061}', '\u{2064}'), + ('⁽', '⁾'), ('₍', '₎'), ('⃐', '⃜'), ('⃡', '⃡'), + ('⃥', '⃦'), ('⃫', '⃯'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('↕', '↙'), ('↜', '↟'), + ('↡', '↢'), ('↤', '↥'), ('↧', '↧'), ('↩', '↭'), + ('↰', '↱'), ('↶', '↷'), ('↼', 
'⇍'), ('⇐', '⇑'), + ('⇓', '⇓'), ('⇕', '⇛'), ('⇝', '⇝'), ('⇤', '⇥'), + ('⌈', '⌋'), ('⎴', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), + ('⏢', '⏢'), ('■', '□'), ('▮', '▶'), ('▼', '◀'), + ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), + ('◤', '◤'), ('◧', '◬'), ('★', '☆'), ('♀', '♀'), + ('♂', '♂'), ('♠', '♣'), ('♭', '♮'), ('⟅', '⟆'), + ('⟦', '⟯'), ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), + ('﹡', '﹡'), ('﹣', '﹣'), ('﹨', '﹨'), ('\', '\'), + ('^', '^'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), +]; + +pub const OTHER_UPPERCASE: &'static [(char, char)] = &[ + ('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ + ('!', '/'), (':', '@'), ('[', '^'), ('`', '`'), ('{', '~'), ('¡', '§'), + ('©', '©'), ('«', '¬'), ('®', '®'), ('°', '±'), ('¶', '¶'), + ('»', '»'), ('¿', '¿'), ('×', '×'), ('÷', '÷'), ('‐', '‧'), + ('‰', '‾'), ('⁁', '⁓'), ('⁕', '⁞'), ('←', '\u{245f}'), + ('─', '❵'), ('➔', '\u{2bff}'), ('⸀', '\u{2e7f}'), ('、', '〃'), + ('〈', '〠'), ('〰', '〰'), ('﴾', '﴿'), ('﹅', '﹆'), +]; + +pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{2029}'), +]; + +pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), ('\u{110bd}', '\u{110bd}'), +]; + +pub const QUOTATION_MARK: &'static [(char, char)] = &[ + ('\"', '\"'), ('\'', '\''), ('«', '«'), ('»', '»'), ('‘', '‟'), + ('‹', '›'), ('⹂', '⹂'), ('「', '』'), ('〝', '〟'), + ('﹁', '﹄'), ('"', '"'), (''', '''), ('「', '」'), +]; + +pub const RADICAL: &'static [(char, char)] = &[ + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[ + ('🇦', '🇿'), +]; + +pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ + ('!', '!'), ('.', '.'), ('?', '?'), ('։', '։'), ('؟', '؟'), + ('۔', '۔'), ('܀', '܂'), ('߹', '߹'), ('।', '॥'), ('၊', '။'), + ('።', '።'), ('፧', '፨'), ('᙮', '᙮'), ('᜵', '᜶'), + ('᠃', '᠃'), ('᠉', '᠉'), ('᥄', '᥅'), ('᪨', '᪫'), + ('᭚', '᭛'), ('᭞', '᭟'), ('᰻', '᰼'), ('᱾', '᱿'), + ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), ('⸼', '⸼'), + ('。', '。'), ('꓿', '꓿'), ('꘎', '꘏'), ('꛳', '꛳'), + ('꛷', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), ('꤯', '꤯'), + ('꧈', '꧉'), ('꩝', '꩟'), ('꫰', '꫱'), ('꯫', '꯫'), + ('﹒', '﹒'), ('﹖', '﹗'), ('!', '!'), ('.', '.'), + ('?', '?'), ('。', '。'), ('𐩖', '𐩗'), ('𑁇', '𑁈'), + ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), + ('𑇞', '𑇟'), ('𑈸', '𑈹'), ('𑈻', '𑈼'), ('𑊩', '𑊩'), + ('𑑋', '𑑌'), ('𑗂', '𑗃'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), + ('𑜼', '𑜾'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑱁', '𑱂'), + ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬸'), ('𖭄', '𖭄'), + ('𛲟', '𛲟'), ('𝪈', '𝪈'), +]; + +pub const SOFT_DOTTED: &'static [(char, char)] = &[ + ('i', 'j'), 
('į', 'į'), ('ɉ', 'ɉ'), ('ɨ', 'ɨ'), ('ʝ', 'ʝ'), + ('ʲ', 'ʲ'), ('ϳ', 'ϳ'), ('і', 'і'), ('ј', 'ј'), ('ᵢ', 'ᵢ'), + ('ᶖ', 'ᶖ'), ('ᶤ', 'ᶤ'), ('ᶨ', 'ᶨ'), ('ḭ', 'ḭ'), + ('ị', 'ị'), ('ⁱ', 'ⁱ'), ('ⅈ', 'ⅉ'), ('ⱼ', 'ⱼ'), + ('𝐢', '𝐣'), ('𝑖', '𝑗'), ('𝒊', '𝒋'), ('𝒾', '𝒿'), + ('𝓲', '𝓳'), ('𝔦', '𝔧'), ('𝕚', '𝕛'), ('𝖎', '𝖏'), + ('𝗂', '𝗃'), ('𝗶', '𝗷'), ('𝘪', '𝘫'), ('𝙞', '𝙟'), + ('𝚒', '𝚓'), +]; + +pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '!'), (',', ','), ('.', '.'), (':', ';'), ('?', '?'), (';', ';'), + ('·', '·'), ('։', '։'), ('׃', '׃'), ('،', '،'), ('؛', '؛'), + ('؟', '؟'), ('۔', '۔'), ('܀', '܊'), ('܌', '܌'), ('߸', '߹'), + ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), ('๚', '๛'), + ('༈', '༈'), ('།', '༒'), ('၊', '။'), ('፡', '፨'), + ('᙭', '᙮'), ('᛫', '᛭'), ('᜵', '᜶'), ('។', '៖'), + ('៚', '៚'), ('᠂', '᠅'), ('᠈', '᠉'), ('᥄', '᥅'), + ('᪨', '᪫'), ('᭚', '᭛'), ('᭝', '᭟'), ('᰻', '᰿'), + ('᱾', '᱿'), ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), + ('⸼', '⸼'), ('⹁', '⹁'), ('、', '。'), ('꓾', '꓿'), + ('꘍', '꘏'), ('꛳', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), + ('꤯', '꤯'), ('꧇', '꧉'), ('꩝', '꩟'), ('꫟', '꫟'), + ('꫰', '꫱'), ('꯫', '꯫'), ('﹐', '﹒'), ('﹔', '﹗'), + ('!', '!'), (',', ','), ('.', '.'), (':', ';'), + ('?', '?'), ('。', '。'), ('、', '、'), ('𐎟', '𐎟'), + ('𐏐', '𐏐'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐩖', '𐩗'), + ('𐫰', '𐫵'), ('𐬺', '𐬿'), ('𐮙', '𐮜'), ('𑁇', '𑁍'), + ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), + ('𑇞', '𑇟'), ('𑈸', '𑈼'), ('𑊩', '𑊩'), ('𑑋', '𑑍'), + ('𑑛', '𑑛'), ('𑗂', '𑗅'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), + ('𑜼', '𑜾'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑪡', '𑪢'), + ('𑱁', '𑱃'), ('𑱱', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), + ('𖫵', '𖫵'), ('𖬷', '𖬹'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), + ('𝪇', '𝪊'), +]; + +pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ + ('㐀', '䶵'), ('一', '鿪'), ('﨎', '﨏'), ('﨑', '﨑'), + ('﨓', '﨔'), ('﨟', '﨟'), ('﨡', '﨡'), ('﨣', '﨤'), + ('﨧', '﨩'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), +]; + +pub const UPPERCASE: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), + ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), 
('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 
'Ῥ'), ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), + ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𝐀', '𝐙'), ('𝐴', '𝑍'), + ('𝑨', '𝒁'), ('𝒜', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒵'), ('𝓐', '𝓩'), + ('𝔄', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔸', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕬', '𝖅'), ('𝖠', '𝖹'), ('𝗔', '𝗭'), + ('𝘈', '𝘡'), ('𝘼', '𝙕'), ('𝙰', '𝚉'), ('𝚨', '𝛀'), + ('𝛢', '𝛺'), ('𝜜', '𝜴'), ('𝝖', '𝝮'), ('𝞐', '𝞨'), + ('𝟊', '𝟊'), ('𞤀', '𞤡'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ + ('᠋', '᠍'), ('︀', '️'), ('󠄀', '󠇯'), +]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), +]; + +pub const XID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), + ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), + ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', '҇'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 
'ֽ'), ('ֿ', 'ֿ'), + ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), + ('ؐ', 'ؚ'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), + ('۪', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), + ('ߺ', 'ߺ'), ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), + ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૯'), ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), + ('ୱ', 'ୱ'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), + ('௦', '௯'), ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), + ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), + ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), + ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', 'ൎ'), ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), + ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', 'ෳ'), ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), + ('༠', '༩'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፟'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('ក', '៓'), ('ៗ', 'ៗ'), ('ៜ', '៝'), + ('០', '៩'), ('᠋', '᠍'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), + ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), + ('ᤰ', '᤻'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), + ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), + ('᪰', '᪽'), ('ᬀ', 'ᭋ'), ('᭐', '᭙'), ('᭫', '᭳'), + ('ᮀ', '᯳'), ('ᰀ', '᰷'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('᳐', '᳒'), ('᳔', '᳹'), ('ᴀ', '᷹'), + ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), + ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), 
('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', '゚'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), + ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '꙯'), ('ꙴ', '꙽'), + ('ꙿ', '꛱'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), + ('꣐', '꣙'), ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('꤀', '꤭'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), + ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), + ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), ('︀', '️'), + ('︠', '︯'), ('︳', '︴'), ('﹍', '﹏'), ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('0', '9'), ('A', 'Z'), + ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐇽', '𐇽'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐋠', '𐋠'), ('𐌀', '𐌟'), + ('𐌭', '𐍊'), ('𐍐', '𐍺'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), + ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), + ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), + ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), + ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), + ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), + ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), + ('𐦾', '𐦿'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), + ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫦'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀀', '𑁆'), + ('𑁦', '𑁯'), ('𑁿', '𑂺'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), + ('𑄀', '𑄴'), ('𑄶', '𑄿'), ('𑅐', '𑅳'), ('𑅶', '𑅶'), + ('𑆀', '𑇄'), ('𑇊', '𑇌'), ('𑇐', '𑇚'), ('𑇜', '𑇜'), + ('𑈀', '𑈑'), ('𑈓', '𑈷'), ('𑈾', '𑈾'), ('𑊀', '𑊆'), + ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), + ('𑊰', '𑋪'), ('𑋰', '𑋹'), ('𑌀', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('𑒀', '𑓅'), + ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), ('𑖸', '𑗀'), + ('𑗘', '𑗝'), ('𑘀', '𑙀'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), + ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), ('𑜝', '𑜫'), + ('𑜰', '𑜹'), ('𑢠', '𑣩'), ('𑣿', '𑣿'), ('𑨀', '𑨾'), + ('𑩇', '𑩇'), ('𑩐', '𑪃'), ('𑪆', '𑪙'), ('𑫀', '𑫸'), + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱀'), ('𑱐', '𑱙'), + ('𑱲', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), ('𑴀', '𑴆'), + ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵇'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), + ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖫐', '𖫭'), ('𖫰', '𖫴'), + ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), 
('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), ('𝅭', '𝅲'), + ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), + ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), + ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), + ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('𝨀', '𝨶'), + ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), ('𝪛', '𝪟'), + ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞠀', '𞣄'), ('𞣐', '𞣖'), + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), ('󠄀', '󠇯'), +]; + +pub const XID_START: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'า'), + ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'າ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), 
('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), + ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), + ('ﹱ', 'ﹱ'), ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('A', 'Z'), + ('a', 'z'), ('ヲ', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), + ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), + ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), + ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), + ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), + ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), + ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), + ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), + ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), + ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), + ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), + ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), 
('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), + ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), + ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), + ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), ('𑢠', '𑣟'), + ('𑣿', '𑣿'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), + ('𑩐', '𑩐'), ('𑩜', '𑪃'), ('𑪆', '𑪉'), ('𑫀', '𑫸'), + ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖫐', '𖫭'), + ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), + ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), + ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), + ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), + ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), + ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_names.rs b/regex-syntax-2/src/unicode_tables/property_names.rs new file mode 100644 index 0000000000..1d1032d337 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_names.rs @@ -0,0 +1,146 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-names tmp/ucd-10.0.0/ +// +// ucd-generate is available on crates.io. 
+ +pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ + ("age", "Age"), ("ahex", "ASCII_Hex_Digit"), ("alpha", "Alphabetic"), + ("alphabetic", "Alphabetic"), ("asciihexdigit", "ASCII_Hex_Digit"), + ("bc", "Bidi_Class"), ("bidic", "Bidi_Control"), + ("bidiclass", "Bidi_Class"), ("bidicontrol", "Bidi_Control"), + ("bidim", "Bidi_Mirrored"), ("bidimirrored", "Bidi_Mirrored"), + ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), + ("bidipairedbracket", "Bidi_Paired_Bracket"), + ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), ("blk", "Block"), + ("block", "Block"), ("bmg", "Bidi_Mirroring_Glyph"), + ("bpb", "Bidi_Paired_Bracket"), ("bpt", "Bidi_Paired_Bracket_Type"), + ("c", "ISO_Comment"), + ("canonicalcombiningclass", "Canonical_Combining_Class"), + ("cased", "Cased"), ("casefolding", "Case_Folding"), + ("caseignorable", "Case_Ignorable"), ("ccc", "Canonical_Combining_Class"), + ("ce", "Composition_Exclusion"), ("cf", "Case_Folding"), + ("changeswhencasefolded", "Changes_When_Casefolded"), + ("changeswhencasemapped", "Changes_When_Casemapped"), + ("changeswhenlowercased", "Changes_When_Lowercased"), + ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), + ("changeswhentitlecased", "Changes_When_Titlecased"), + ("changeswhenuppercased", "Changes_When_Uppercased"), + ("ci", "Case_Ignorable"), ("cjkaccountingnumeric", "kAccountingNumeric"), + ("cjkcompatibilityvariant", "kCompatibilityVariant"), + ("cjkiicore", "kIICore"), ("cjkirggsource", "kIRG_GSource"), + ("cjkirghsource", "kIRG_HSource"), ("cjkirgjsource", "kIRG_JSource"), + ("cjkirgkpsource", "kIRG_KPSource"), ("cjkirgksource", "kIRG_KSource"), + ("cjkirgmsource", "kIRG_MSource"), ("cjkirgtsource", "kIRG_TSource"), + ("cjkirgusource", "kIRG_USource"), ("cjkirgvsource", "kIRG_VSource"), + ("cjkothernumeric", "kOtherNumeric"), + ("cjkprimarynumeric", "kPrimaryNumeric"), ("cjkrsunicode", "kRSUnicode"), + ("compex", "Full_Composition_Exclusion"), + ("compositionexclusion", "Composition_Exclusion"), + ("cwcf", "Changes_When_Casefolded"), ("cwcm", "Changes_When_Casemapped"), + ("cwkcf", "Changes_When_NFKC_Casefolded"), + ("cwl", "Changes_When_Lowercased"), ("cwt", "Changes_When_Titlecased"), + ("cwu", "Changes_When_Uppercased"), ("dash", "Dash"), + ("decompositionmapping", "Decomposition_Mapping"), + ("decompositiontype", "Decomposition_Type"), + ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), + ("dep", "Deprecated"), ("deprecated", "Deprecated"), + ("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"), + ("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"), + ("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"), + ("eastasianwidth", "East_Asian_Width"), ("expandsonnfc", "Expands_On_NFC"), + ("expandsonnfd", "Expands_On_NFD"), ("expandsonnfkc", "Expands_On_NFKC"), + ("expandsonnfkd", "Expands_On_NFKD"), ("ext", "Extender"), + ("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"), + ("fcnfkcclosure", "FC_NFKC_Closure"), + ("fullcompositionexclusion", "Full_Composition_Exclusion"), + ("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"), + ("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"), + ("graphemeclusterbreak", "Grapheme_Cluster_Break"), + ("graphemeextend", "Grapheme_Extend"), ("graphemelink", "Grapheme_Link"), + ("grbase", "Grapheme_Base"), ("grext", "Grapheme_Extend"), + ("grlink", "Grapheme_Link"), ("hangulsyllabletype", "Hangul_Syllable_Type"), + ("hex", "Hex_Digit"), ("hexdigit", "Hex_Digit"), + ("hst", "Hangul_Syllable_Type"), 
("hyphen", "Hyphen"), + ("idc", "ID_Continue"), ("idcontinue", "ID_Continue"), + ("ideo", "Ideographic"), ("ideographic", "Ideographic"), + ("ids", "ID_Start"), ("idsb", "IDS_Binary_Operator"), + ("idsbinaryoperator", "IDS_Binary_Operator"), + ("idst", "IDS_Trinary_Operator"), ("idstart", "ID_Start"), + ("idstrinaryoperator", "IDS_Trinary_Operator"), + ("indicpositionalcategory", "Indic_Positional_Category"), + ("indicsyllabiccategory", "Indic_Syllabic_Category"), + ("inpc", "Indic_Positional_Category"), ("insc", "Indic_Syllabic_Category"), + ("jamoshortname", "Jamo_Short_Name"), ("jg", "Joining_Group"), + ("joinc", "Join_Control"), ("joincontrol", "Join_Control"), + ("joininggroup", "Joining_Group"), ("joiningtype", "Joining_Type"), + ("jsn", "Jamo_Short_Name"), ("jt", "Joining_Type"), + ("kaccountingnumeric", "kAccountingNumeric"), + ("kcompatibilityvariant", "kCompatibilityVariant"), ("kiicore", "kIICore"), + ("kirggsource", "kIRG_GSource"), ("kirghsource", "kIRG_HSource"), + ("kirgjsource", "kIRG_JSource"), ("kirgkpsource", "kIRG_KPSource"), + ("kirgksource", "kIRG_KSource"), ("kirgmsource", "kIRG_MSource"), + ("kirgtsource", "kIRG_TSource"), ("kirgusource", "kIRG_USource"), + ("kirgvsource", "kIRG_VSource"), ("kothernumeric", "kOtherNumeric"), + ("kprimarynumeric", "kPrimaryNumeric"), ("krsunicode", "kRSUnicode"), + ("lb", "Line_Break"), ("lc", "Lowercase_Mapping"), + ("linebreak", "Line_Break"), ("loe", "Logical_Order_Exception"), + ("logicalorderexception", "Logical_Order_Exception"), + ("lower", "Lowercase"), ("lowercase", "Lowercase"), + ("lowercasemapping", "Lowercase_Mapping"), ("math", "Math"), ("na", "Name"), + ("na1", "Unicode_1_Name"), ("name", "Name"), ("namealias", "Name_Alias"), + ("nchar", "Noncharacter_Code_Point"), ("nfcqc", "NFC_Quick_Check"), + ("nfcquickcheck", "NFC_Quick_Check"), ("nfdqc", "NFD_Quick_Check"), + ("nfdquickcheck", "NFD_Quick_Check"), ("nfkccasefold", "NFKC_Casefold"), + ("nfkccf", "NFKC_Casefold"), ("nfkcqc", "NFKC_Quick_Check"), + ("nfkcquickcheck", "NFKC_Quick_Check"), ("nfkdqc", "NFKD_Quick_Check"), + ("nfkdquickcheck", "NFKD_Quick_Check"), + ("noncharactercodepoint", "Noncharacter_Code_Point"), + ("nt", "Numeric_Type"), ("numerictype", "Numeric_Type"), + ("numericvalue", "Numeric_Value"), ("nv", "Numeric_Value"), + ("oalpha", "Other_Alphabetic"), ("ocomment", "ISO_Comment"), + ("odi", "Other_Default_Ignorable_Code_Point"), + ("ogrext", "Other_Grapheme_Extend"), ("oidc", "Other_ID_Continue"), + ("oids", "Other_ID_Start"), ("olower", "Other_Lowercase"), + ("omath", "Other_Math"), ("otheralphabetic", "Other_Alphabetic"), + ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), + ("othergraphemeextend", "Other_Grapheme_Extend"), + ("otheridcontinue", "Other_ID_Continue"), + ("otheridstart", "Other_ID_Start"), ("otherlowercase", "Other_Lowercase"), + ("othermath", "Other_Math"), ("otheruppercase", "Other_Uppercase"), + ("oupper", "Other_Uppercase"), ("patsyn", "Pattern_Syntax"), + ("patternsyntax", "Pattern_Syntax"), + ("patternwhitespace", "Pattern_White_Space"), + ("patws", "Pattern_White_Space"), ("pcm", "Prepended_Concatenation_Mark"), + ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), + ("qmark", "Quotation_Mark"), ("quotationmark", "Quotation_Mark"), + ("radical", "Radical"), ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), ("sb", "Sentence_Break"), ("sc", "Script"), + ("scf", "Simple_Case_Folding"), ("script", "Script"), + ("scriptextensions", "Script_Extensions"), ("scx", 
"Script_Extensions"), + ("sd", "Soft_Dotted"), ("sentencebreak", "Sentence_Break"), + ("sentenceterminal", "Sentence_Terminal"), ("sfc", "Simple_Case_Folding"), + ("simplecasefolding", "Simple_Case_Folding"), + ("simplelowercasemapping", "Simple_Lowercase_Mapping"), + ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), + ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), + ("slc", "Simple_Lowercase_Mapping"), ("softdotted", "Soft_Dotted"), + ("space", "White_Space"), ("stc", "Simple_Titlecase_Mapping"), + ("sterm", "Sentence_Terminal"), ("suc", "Simple_Uppercase_Mapping"), + ("tc", "Titlecase_Mapping"), ("term", "Terminal_Punctuation"), + ("terminalpunctuation", "Terminal_Punctuation"), + ("titlecasemapping", "Titlecase_Mapping"), ("uc", "Uppercase_Mapping"), + ("uideo", "Unified_Ideograph"), ("unicode1name", "Unicode_1_Name"), + ("unicoderadicalstroke", "kRSUnicode"), + ("unifiedideograph", "Unified_Ideograph"), ("upper", "Uppercase"), + ("uppercase", "Uppercase"), ("uppercasemapping", "Uppercase_Mapping"), + ("urs", "kRSUnicode"), ("variationselector", "Variation_Selector"), + ("verticalorientation", "Vertical_Orientation"), + ("vo", "Vertical_Orientation"), ("vs", "Variation_Selector"), + ("wb", "Word_Break"), ("whitespace", "White_Space"), + ("wordbreak", "Word_Break"), ("wspace", "White_Space"), + ("xidc", "XID_Continue"), ("xidcontinue", "XID_Continue"), + ("xids", "XID_Start"), ("xidstart", "XID_Start"), + ("xonfc", "Expands_On_NFC"), ("xonfd", "Expands_On_NFD"), + ("xonfkc", "Expands_On_NFKC"), ("xonfkd", "Expands_On_NFKD"), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_values.rs b/regex-syntax-2/src/unicode_tables/property_values.rs new file mode 100644 index 0000000000..1ce9795b1c --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_values.rs @@ -0,0 +1,277 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-values tmp/ucd-10.0.0 --include gc,script,scx,age +// +// ucd-generate is available on crates.io. 
+ +pub const PROPERTY_VALUES: &'static [(&'static str, &'static [(&'static str, &'static str)])] = &[ + ("Age", &[("1.1", "V1_1"), ("10.0", "V10_0"), ("2.0", "V2_0"), + ("2.1", "V2_1"), ("3.0", "V3_0"), ("3.1", "V3_1"), ("3.2", "V3_2"), + ("4.0", "V4_0"), ("4.1", "V4_1"), ("5.0", "V5_0"), ("5.1", "V5_1"), + ("5.2", "V5_2"), ("6.0", "V6_0"), ("6.1", "V6_1"), ("6.2", "V6_2"), + ("6.3", "V6_3"), ("7.0", "V7_0"), ("8.0", "V8_0"), ("9.0", "V9_0"), + ("na", "Unassigned"), ("unassigned", "Unassigned"), ("v100", "V10_0"), + ("v11", "V1_1"), ("v20", "V2_0"), ("v21", "V2_1"), ("v30", "V3_0"), + ("v31", "V3_1"), ("v32", "V3_2"), ("v40", "V4_0"), ("v41", "V4_1"), + ("v50", "V5_0"), ("v51", "V5_1"), ("v52", "V5_2"), ("v60", "V6_0"), + ("v61", "V6_1"), ("v62", "V6_2"), ("v63", "V6_3"), ("v70", "V7_0"), + ("v80", "V8_0"), ("v90", "V9_0"), ]), + + ("General_Category", &[("c", "Other"), ("casedletter", "Cased_Letter"), + ("cc", "Control"), ("cf", "Format"), + ("closepunctuation", "Close_Punctuation"), ("cn", "Unassigned"), + ("cntrl", "Control"), ("co", "Private_Use"), ("combiningmark", "Mark"), + ("connectorpunctuation", "Connector_Punctuation"), ("control", "Control"), + ("cs", "Surrogate"), ("currencysymbol", "Currency_Symbol"), + ("dashpunctuation", "Dash_Punctuation"), + ("decimalnumber", "Decimal_Number"), ("digit", "Decimal_Number"), + ("enclosingmark", "Enclosing_Mark"), + ("finalpunctuation", "Final_Punctuation"), ("format", "Format"), + ("initialpunctuation", "Initial_Punctuation"), ("l", "Letter"), + ("lc", "Cased_Letter"), ("letter", "Letter"), + ("letternumber", "Letter_Number"), ("lineseparator", "Line_Separator"), + ("ll", "Lowercase_Letter"), ("lm", "Modifier_Letter"), + ("lo", "Other_Letter"), ("lowercaseletter", "Lowercase_Letter"), + ("lt", "Titlecase_Letter"), ("lu", "Uppercase_Letter"), ("m", "Mark"), + ("mark", "Mark"), ("mathsymbol", "Math_Symbol"), ("mc", "Spacing_Mark"), + ("me", "Enclosing_Mark"), ("mn", "Nonspacing_Mark"), + ("modifierletter", "Modifier_Letter"), + ("modifiersymbol", "Modifier_Symbol"), ("n", "Number"), + ("nd", "Decimal_Number"), ("nl", "Letter_Number"), ("no", "Other_Number"), + ("nonspacingmark", "Nonspacing_Mark"), ("number", "Number"), + ("openpunctuation", "Open_Punctuation"), ("other", "Other"), + ("otherletter", "Other_Letter"), ("othernumber", "Other_Number"), + ("otherpunctuation", "Other_Punctuation"), ("othersymbol", "Other_Symbol"), + ("p", "Punctuation"), ("paragraphseparator", "Paragraph_Separator"), + ("pc", "Connector_Punctuation"), ("pd", "Dash_Punctuation"), + ("pe", "Close_Punctuation"), ("pf", "Final_Punctuation"), + ("pi", "Initial_Punctuation"), ("po", "Other_Punctuation"), + ("privateuse", "Private_Use"), ("ps", "Open_Punctuation"), + ("punct", "Punctuation"), ("punctuation", "Punctuation"), ("s", "Symbol"), + ("sc", "Currency_Symbol"), ("separator", "Separator"), + ("sk", "Modifier_Symbol"), ("sm", "Math_Symbol"), ("so", "Other_Symbol"), + ("spaceseparator", "Space_Separator"), ("spacingmark", "Spacing_Mark"), + ("surrogate", "Surrogate"), ("symbol", "Symbol"), + ("titlecaseletter", "Titlecase_Letter"), ("unassigned", "Unassigned"), + ("uppercaseletter", "Uppercase_Letter"), ("z", "Separator"), + ("zl", "Line_Separator"), ("zp", "Paragraph_Separator"), + ("zs", "Space_Separator"), ]), + + ("Script", &[("adlam", "Adlam"), ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), + ("arabic", "Arabic"), ("armenian", "Armenian"), + ("armi", 
"Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), + ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), + ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), + ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), ("brahmi", "Brahmi"), ("brai", "Braille"), + ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), + ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), + ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), + ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), + ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), + ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("dsrt", "Deseret"), + ("dupl", "Duployan"), ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), + ("elbasan", "Elbasan"), ("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), + ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), + ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), + ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), + ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), + ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), + ("hira", "Hiragana"), ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), + ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), + ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), ("khmer", "Khmer"), ("khmr", "Khmer"), + ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), + ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), + ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), + ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), + ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), + ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), + ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), + ("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), + ("malayalam", "Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), + ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), + ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), + ("meeteimayek", "Meetei_Mayek"), ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), 
("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), + ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), + ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), + ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), + ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), + ("nshu", "Nushu"), ("nushu", "Nushu"), ("ogam", "Ogham"), + ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), + ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), + ("osma", "Osmanya"), ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), + ("rjng", "Rejang"), ("runic", "Runic"), ("runr", "Runic"), + ("samaritan", "Samaritan"), ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), + ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), + ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), + ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), + ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), + ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), + ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), + ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), + ("tibetan", "Tibetan"), ("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), + ("vaii", "Vai"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), + ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yi", "Yi"), + ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), + ("zzzz", "Unknown"), ]), + + ("Script_Extensions", &[("adlam", "Adlam"), ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), + 
("arabic", "Arabic"), ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), + ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), + ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), + ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), ("brahmi", "Brahmi"), ("brai", "Braille"), + ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), + ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), + ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), + ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), + ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), + ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("dsrt", "Deseret"), + ("dupl", "Duployan"), ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), + ("elbasan", "Elbasan"), ("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), + ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), + ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), + ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), + ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), + ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), + ("hira", "Hiragana"), ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), + ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), + ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), ("khmer", "Khmer"), ("khmr", "Khmer"), + ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), + ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), + ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), + ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), + ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), + ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), + ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), + ("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), + ("malayalam", "Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), + ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), + ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), + ("meeteimayek", "Meetei_Mayek"), 
("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), + ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), + ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), + ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), + ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), + ("nshu", "Nushu"), ("nushu", "Nushu"), ("ogam", "Ogham"), + ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), + ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), + ("osma", "Osmanya"), ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), + ("rjng", "Rejang"), ("runic", "Runic"), ("runr", "Runic"), + ("samaritan", "Samaritan"), ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), + ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), + ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), + ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), + ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), + ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), + ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), + ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), + ("tibetan", "Tibetan"), ("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), + ("vaii", "Vai"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), + ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yi", "Yi"), + ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), + ("zzzz", "Unknown"), ]), +]; diff --git a/regex-syntax-2/src/unicode_tables/script.rs b/regex-syntax-2/src/unicode_tables/script.rs new file mode 100644 index 
0000000000..99c5786dea --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/script.rs @@ -0,0 +1,765 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), + ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), + ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), + ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), + ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), + ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), + ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), + ("Gujarati", GUJARATI), ("Gurmukhi", GURMUKHI), ("Han", HAN), + ("Hangul", HANGUL), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), + ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), + ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khmer", KHMER), + ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), ("Lao", LAO), + ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), + ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), + ("Malayalam", MALAYALAM), ("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), + ("Meetei_Mayek", MEETEI_MAYEK), ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), + ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), + ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), + ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), + ("Nushu", NUSHU), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), ("Oriya", ORIYA), ("Osage", OSAGE), + ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), + ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), + ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), ("Tagalog", TAGALOG), ("Tagbanwa", TAGBANWA), + ("Tai_Le", 
TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangut", TANGUT), ("Telugu", TELUGU), + ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Ugaritic", UGARITIC), + ("Vai", VAI), ("Warang_Citi", WARANG_CITI), ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = &[ + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), +]; + +pub const AHOM: &'static [(char, char)] = &[ + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), +]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𔐀', '𔙆'), +]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), ('؆', '؋'), ('؍', 'ؚ'), ('\u{61c}', '\u{61c}'), + ('؞', '؞'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('ٖ', 'ٯ'), ('ٱ', 'ۜ'), + ('۞', 'ۿ'), ('ݐ', 'ݿ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), + ('ࣣ', 'ࣿ'), ('ﭐ', '﯁'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('𐹠', '𐹾'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = &[ + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('֊', '֊'), ('֍', '֏'), + ('ﬓ', 'ﬗ'), +]; + +pub const AVESTAN: &'static [(char, char)] = &[ + ('𐬀', '𐬵'), ('𐬹', '𐬿'), +]; + +pub const BALINESE: &'static [(char, char)] = &[ + ('ᬀ', 'ᭋ'), ('᭐', '᭼'), +]; + +pub const BAMUM: &'static [(char, char)] = &[ + ('ꚠ', '꛷'), ('𖠀', '𖨸'), +]; + +pub const BASSA_VAH: &'static [(char, char)] = &[ + ('𖫐', '𖫭'), ('𖫰', '𖫵'), +]; + +pub const BATAK: &'static [(char, char)] = &[ + ('ᯀ', '᯳'), ('᯼', '᯿'), +]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('ঀ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', 'ৄ'), + ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), + ('য়', 'ৣ'), ('০', '৽'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = &[ + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), ('𑱐', '𑱬'), +]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), ('ㄅ', 'ㄮ'), ('ㆠ', 'ㆺ'), +]; + +pub const BRAHMI: &'static [(char, char)] = &[ + ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𑁿', '𑁿'), +]; + +pub const BRAILLE: &'static [(char, char)] = &[ + ('⠀', '⣿'), +]; + +pub const BUGINESE: &'static [(char, char)] = &[ + ('ᨀ', 'ᨛ'), ('᨞', '᨟'), +]; + +pub const BUHID: &'static [(char, char)] = &[ + ('ᝀ', 'ᝓ'), +]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[ + ('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), +]; + +pub const CARIAN: &'static [(char, char)] = &[ + ('𐊠', '𐋐'), +]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[ + ('𐔰', '𐕣'), ('𐕯', '𐕯'), +]; + +pub const CHAKMA: &'static [(char, char)] = &[ + ('𑄀', '𑄴'), ('𑄶', '𑅃'), +]; + +pub const CHAM: &'static [(char, char)] = &[ + ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟'), +]; + +pub const CHEROKEE: &'static [(char, char)] = &[ + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ'), +]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), + ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), + ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), ('։', '։'), + ('\u{605}', '\u{605}'), ('،', '،'), ('؛', '؛'), ('؟', '؟'), + ('ـ', 'ـ'), 
('\u{6dd}', '\u{6dd}'), ('\u{8e2}', '\u{8e2}'), + ('।', '॥'), ('฿', '฿'), ('࿕', '࿘'), ('჻', '჻'), + ('᛫', '᛭'), ('᜵', '᜶'), ('᠂', '᠃'), ('᠅', '᠅'), + ('᳓', '᳓'), ('᳡', '᳡'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), ('\u{2000}', '\u{200b}'), ('\u{200e}', '\u{2064}'), + ('\u{2066}', '⁰'), ('⁴', '⁾'), ('₀', '₎'), ('₠', '₿'), + ('℀', '℥'), ('℧', '℩'), ('ℬ', 'ℱ'), ('ℳ', '⅍'), + ('⅏', '⅟'), ('↉', '↋'), ('←', '␦'), ('⑀', '⑊'), + ('①', '⟿'), ('⤀', '⭳'), ('⭶', '⮕'), ('⮘', '⮹'), + ('⮽', '⯈'), ('⯊', '⯒'), ('⯬', '⯯'), ('⸀', '⹉'), + ('⿰', '⿻'), ('\u{3000}', '〄'), ('〆', '〆'), ('〈', '〠'), + ('〰', '〷'), ('〼', '〿'), ('゛', '゜'), ('゠', '゠'), + ('・', 'ー'), ('㆐', '㆟'), ('㇀', '㇣'), ('㈠', '㉟'), + ('㉿', '㋏'), ('㍘', '㏿'), ('䷀', '䷿'), ('꜀', '꜡'), + ('ꞈ', '꞊'), ('꠰', '꠹'), ('꤮', '꤮'), ('ꧏ', 'ꧏ'), + ('꭛', '꭛'), ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), + ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), + ('[', '`'), ('{', '・'), ('ー', 'ー'), ('゙', '゚'), + ('¢', '₩'), ('│', '○'), ('\u{fff9}', '�'), ('𐄀', '𐄂'), + ('𐄇', '𐄳'), ('𐄷', '𐄿'), ('𐆐', '𐆛'), ('𐇐', '𐇼'), + ('𐋡', '𐋻'), ('\u{1bca0}', '\u{1bca3}'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), + ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝌀', '𝍖'), ('𝍠', '𝍱'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), + ('𝟎', '𝟿'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), + ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🄌'), + ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), ('🇦', '🇿'), + ('🈁', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), ('🛰', '🛸'), + ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), ('🠐', '🡇'), + ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🤀', '🤋'), + ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), ('🦀', '🦗'), + ('🧀', '🧀'), ('🧐', '🧦'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = &[ + ('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), +]; + +pub const CUNEIFORM: &'static [(char, char)] = &[ + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), +]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐠿'), +]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', '҄'), ('҇', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 'ᵸ'), + ('ⷠ', 'ⷿ'), ('Ꙁ', 'ꚟ'), ('︮', '︯'), +]; + +pub const DESERET: &'static [(char, char)] = &[ + ('𐐀', '𐑏'), +]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('ऀ', 'ॐ'), ('॓', 'ॣ'), ('०', 'ॿ'), ('꣠', 'ꣽ'), +]; + +pub const DUPLOYAN: &'static [(char, char)] = &[ + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𛲜', '𛲟'), +]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𓀀', '𓐮'), +]; + +pub const ELBASAN: &'static [(char, char)] = &[ + ('𐔀', '𐔧'), +]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + 
('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[ + ('𐌰', '𐍊'), +]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('𑌀', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), + ('𑍝', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('΄', '΄'), + ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), ('ᵦ', 'ᵪ'), + ('ᶿ', 'ᶿ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), + ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), ('𐅀', '𐆎'), + ('𐆠', '𐆠'), ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૱'), ('ૹ', '૿'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('々', '々'), + ('〇', '〇'), ('〡', '〩'), ('〸', '〻'), ('㐀', '䶵'), + ('一', '鿪'), ('豈', '舘'), ('並', '龎'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), ('〮', '〯'), ('ㄱ', 'ㆎ'), ('㈀', '㈞'), + ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[ + ('ᜠ', '᜴'), +]; + +pub const HATRAN: &'static [(char, char)] = &[ + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿'), +]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('֑', 'ׇ'), ('א', 'ת'), ('װ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('𛀁', '𛄞'), ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[ + ('𐡀', '𐡕'), ('𐡗', '𐡟'), +]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҅', '҆'), ('ً', 'ٕ'), ('ٰ', 'ٰ'), ('॑', '॒'), + ('᪰', '᪾'), ('᳐', '᳒'), ('᳔', '᳠'), ('᳢', '᳨'), + ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), ('᷀', '᷹'), + ('᷻', '᷿'), ('\u{200c}', '\u{200d}'), ('⃐', '⃰'), ('〪', '〭'), + ('゙', '゚'), ('︀', '️'), ('︠', '︭'), ('𐇽', '𐇽'), + ('𐋠', '𐋠'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), + ('𝆪', '𝆭'), ('󠄀', '󠇯'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[ + ('𐭠', '𐭲'), ('𐭸', '𐭿'), +]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[ + ('𐭀', '𐭕'), ('𐭘', '𐭟'), +]; + +pub const JAVANESE: &'static [(char, char)] = &[ + ('ꦀ', '꧍'), ('꧐', '꧙'), ('꧞', '꧟'), +]; + +pub const KAITHI: &'static [(char, char)] = &[ + ('𑂀', '𑃁'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), + ('೦', '೯'), ('ೱ', 'ೲ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ァ', 'ヺ'), ('ヽ', 'ヿ'), ('ㇰ', 'ㇿ'), ('㋐', '㋾'), + ('㌀', '㍗'), ('ヲ', 'ッ'), ('ア', 
'ン'), ('𛀀', '𛀀'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[ + ('꤀', '꤭'), ('꤯', '꤯'), +]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐩇'), ('𐩐', '𐩘'), +]; + +pub const KHMER: &'static [(char, char)] = &[ + ('ក', '៝'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿'), +]; + +pub const KHOJKI: &'static [(char, char)] = &[ + ('𑈀', '𑈑'), ('𑈓', '𑈾'), +]; + +pub const KHUDAWADI: &'static [(char, char)] = &[ + ('𑊰', '𑋪'), ('𑋰', '𑋹'), +]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), + ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⱡ', 'Ɀ'), + ('Ꜣ', 'ꞇ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꟿ'), + ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), ('ff', 'st'), ('A', 'Z'), + ('a', 'z'), +]; + +pub const LEPCHA: &'static [(char, char)] = &[ + ('ᰀ', '᰷'), ('᰻', '᱉'), ('ᱍ', 'ᱏ'), +]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = &[ + ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), +]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), +]; + +pub const LISU: &'static [(char, char)] = &[ + ('ꓐ', '꓿'), +]; + +pub const LYCIAN: &'static [(char, char)] = &[ + ('𐊀', '𐊜'), +]; + +pub const LYDIAN: &'static [(char, char)] = &[ + ('𐤠', '𐤹'), ('𐤿', '𐤿'), +]; + +pub const MAHAJANI: &'static [(char, char)] = &[ + ('𑅐', '𑅶'), +]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('ഀ', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), + ('െ', 'ൈ'), ('ൊ', '൏'), ('ൔ', 'ൣ'), ('൦', 'ൿ'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[ + ('ࡀ', '࡛'), ('࡞', '࡞'), +]; + +pub const MANICHAEAN: &'static [(char, char)] = &[ + ('𐫀', '𐫦'), ('𐫫', '𐫶'), +]; + +pub const MARCHEN: &'static [(char, char)] = &[ + ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), +]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), +]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = &[ + ('ꫠ', '꫶'), ('ꯀ', '꯭'), ('꯰', '꯹'), +]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = &[ + ('𞠀', '𞣄'), ('𞣇', '𞣖'), +]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[ + ('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿'), +]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𐦀', '𐦟'), +]; + +pub const MIAO: &'static [(char, char)] = &[ + ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), +]; + +pub const MODI: &'static [(char, char)] = &[ + ('𑘀', '𑙄'), ('𑙐', '𑙙'), +]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '\u{180e}'), ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[ + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), +]; + +pub const MULTANI: &'static [(char, char)] = &[ + ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), + ('𑊟', '𑊩'), +]; + +pub const MYANMAR: &'static [(char, char)] = &[ + ('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ'), +]; + +pub const 
NABATAEAN: &'static [(char, char)] = &[ + ('𐢀', '𐢞'), ('𐢧', '𐢯'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = &[ + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟'), +]; + +pub const NEWA: &'static [(char, char)] = &[ + ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), +]; + +pub const NKO: &'static [(char, char)] = &[ + ('߀', 'ߺ'), +]; + +pub const NUSHU: &'static [(char, char)] = &[ + ('𖿡', '𖿡'), ('𛅰', '𛋻'), +]; + +pub const OGHAM: &'static [(char, char)] = &[ + ('\u{1680}', '᚜'), +]; + +pub const OL_CHIKI: &'static [(char, char)] = &[ + ('᱐', '᱿'), +]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = &[ + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), +]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[ + ('𐌀', '𐌣'), ('𐌭', '𐌯'), +]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[ + ('𐪀', '𐪟'), +]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[ + ('𐍐', '𐍺'), +]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[ + ('𐎠', '𐏃'), ('𐏈', '𐏕'), +]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[ + ('𐩠', '𐩿'), +]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[ + ('𐰀', '𐱈'), +]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('଼', 'ୄ'), + ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୣ'), ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[ + ('𐒰', '𐓓'), ('𐓘', '𐓻'), +]; + +pub const OSMANYA: &'static [(char, char)] = &[ + ('𐒀', '𐒝'), ('𐒠', '𐒩'), +]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = &[ + ('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), +]; + +pub const PALMYRENE: &'static [(char, char)] = &[ + ('𐡠', '𐡿'), +]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[ + ('𑫀', '𑫸'), +]; + +pub const PHAGS_PA: &'static [(char, char)] = &[ + ('ꡀ', '꡷'), +]; + +pub const PHOENICIAN: &'static [(char, char)] = &[ + ('𐤀', '𐤛'), ('𐤟', '𐤟'), +]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[ + ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), +]; + +pub const REJANG: &'static [(char, char)] = &[ + ('ꤰ', '꥓'), ('꥟', '꥟'), +]; + +pub const RUNIC: &'static [(char, char)] = &[ + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), +]; + +pub const SAMARITAN: &'static [(char, char)] = &[ + ('ࠀ', '࠭'), ('࠰', '࠾'), +]; + +pub const SAURASHTRA: &'static [(char, char)] = &[ + ('ꢀ', 'ꣅ'), ('꣎', '꣙'), +]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('𑆀', '𑇍'), ('𑇐', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[ + ('𐑐', '𐑿'), +]; + +pub const SIDDHAM: &'static [(char, char)] = &[ + ('𑖀', '𑖵'), ('𑖸', '𑗝'), +]; + +pub const SIGNWRITING: &'static [(char, char)] = &[ + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), +]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[ + ('𑃐', '𑃨'), ('𑃰', '𑃹'), +]; + +pub const SOYOMBO: &'static [(char, char)] = &[ + ('𑩐', '𑪃'), ('𑪆', '𑪜'), ('𑪞', '𑪢'), +]; + +pub const SUNDANESE: &'static [(char, char)] = &[ + ('ᮀ', 'ᮿ'), ('᳀', '᳇'), +]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[ + ('ꠀ', '꠫'), +]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('܀', '܍'), ('\u{70f}', '݊'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ'), +]; + +pub const TAGALOG: &'static [(char, char)] = &[ + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), +]; + +pub const TAGBANWA: &'static [(char, char)] = &[ + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), +]; + +pub const 
TAI_LE: &'static [(char, char)] = &[ + ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), +]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[ + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), +]; + +pub const TAKRI: &'static [(char, char)] = &[ + ('𑚀', '𑚷'), ('𑛀', '𑛉'), +]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('௦', '௺'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), + ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), + ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('౦', '౯'), + ('౸', '౿'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('ހ', 'ޱ'), +]; + +pub const THAI: &'static [(char, char)] = &[ + ('ก', 'ฺ'), ('เ', '๛'), +]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ྗ'), ('ྙ', 'ྼ'), + ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = &[ + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('⵿', '⵿'), +]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('𑒀', '𑓇'), ('𑓐', '𑓙'), +]; + +pub const UGARITIC: &'static [(char, char)] = &[ + ('𐎀', '𐎝'), ('𐎟', '𐎟'), +]; + +pub const VAI: &'static [(char, char)] = &[ + ('ꔀ', 'ꘫ'), +]; + +pub const WARANG_CITI: &'static [(char, char)] = &[ + ('𑢠', '𑣲'), ('𑣿', '𑣿'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[ + ('𑨀', '𑩇'), +]; diff --git a/regex-syntax-2/src/unicode_tables/script_extension.rs b/regex-syntax-2/src/unicode_tables/script_extension.rs new file mode 100644 index 0000000000..10b6c3e03f --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/script_extension.rs @@ -0,0 +1,785 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script-extension tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), + ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), + ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), + ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), + ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), + ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), + ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), + ("Gujarati", GUJARATI), ("Gurmukhi", GURMUKHI), ("Han", HAN), + ("Hangul", HANGUL), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), + ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), + ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khmer", KHMER), + ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), ("Lao", LAO), + ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), + ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), + ("Malayalam", MALAYALAM), ("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), + ("Meetei_Mayek", MEETEI_MAYEK), ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), + ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), + ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), + ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), + ("Nushu", NUSHU), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), ("Oriya", ORIYA), ("Osage", OSAGE), + ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), + ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), + ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), ("Tagalog", TAGALOG), ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangut", TANGUT), ("Telugu", TELUGU), + ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Ugaritic", UGARITIC), 
+ ("Vai", VAI), ("Warang_Citi", WARANG_CITI), ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), +]; + +pub const AHOM: &'static [(char, char)] = &[ + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), +]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𔐀', '𔙆'), +]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), ('؆', '\u{61c}'), ('؞', 'ۜ'), ('۞', 'ۿ'), + ('ݐ', 'ݿ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), + ('ࣣ', 'ࣿ'), ('ﭐ', '﯁'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('𐋠', '𐋻'), ('𐹠', '𐹾'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = &[ + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '֊'), ('֍', '֏'), + ('ﬓ', 'ﬗ'), +]; + +pub const AVESTAN: &'static [(char, char)] = &[ + ('𐬀', '𐬵'), ('𐬹', '𐬿'), +]; + +pub const BALINESE: &'static [(char, char)] = &[ + ('ᬀ', 'ᭋ'), ('᭐', '᭼'), +]; + +pub const BAMUM: &'static [(char, char)] = &[ + ('ꚠ', '꛷'), ('𖠀', '𖨸'), +]; + +pub const BASSA_VAH: &'static [(char, char)] = &[ + ('𖫐', '𖫭'), ('𖫰', '𖫵'), +]; + +pub const BATAK: &'static [(char, char)] = &[ + ('ᯀ', '᯳'), ('᯼', '᯿'), +]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ঀ', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), + ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', '৽'), + ('᳷', '᳷'), ('꣱', '꣱'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = &[ + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), ('𑱐', '𑱬'), +]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), + ('〪', '〭'), ('〰', '〰'), ('〷', '〷'), ('・', '・'), + ('ㄅ', 'ㄮ'), ('ㆠ', 'ㆺ'), ('﹅', '﹆'), ('。', '・'), +]; + +pub const BRAHMI: &'static [(char, char)] = &[ + ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𑁿', '𑁿'), +]; + +pub const BRAILLE: &'static [(char, char)] = &[ + ('⠀', '⣿'), +]; + +pub const BUGINESE: &'static [(char, char)] = &[ + ('ᨀ', 'ᨛ'), ('᨞', '᨟'), ('ꧏ', 'ꧏ'), +]; + +pub const BUHID: &'static [(char, char)] = &[ + ('᜵', '᜶'), ('ᝀ', 'ᝓ'), +]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[ + ('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), +]; + +pub const CARIAN: &'static [(char, char)] = &[ + ('𐊠', '𐋐'), +]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[ + ('𐔰', '𐕣'), ('𐕯', '𐕯'), +]; + +pub const CHAKMA: &'static [(char, char)] = &[ + ('০', '৯'), ('၀', '၉'), ('𑄀', '𑄴'), ('𑄶', '𑅃'), +]; + +pub const CHAM: &'static [(char, char)] = &[ + ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟'), +]; + +pub const CHEROKEE: &'static [(char, char)] = &[ + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ'), +]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), + ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), + ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), + ('\u{605}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{8e2}', '\u{8e2}'), + ('฿', '฿'), ('࿕', '࿘'), ('᛫', '᛭'), ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{2064}'), ('\u{2066}', '⁰'), ('⁴', '⁾'), + ('₀', 
'₎'), ('₠', '₿'), ('℀', '℥'), ('℧', '℩'), + ('ℬ', 'ℱ'), ('ℳ', '⅍'), ('⅏', '⅟'), ('↉', '↋'), + ('←', '␦'), ('⑀', '⑊'), ('①', '⟿'), ('⤀', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('⸀', '⹂'), ('⹄', '⹉'), ('⿰', '⿻'), + ('\u{3000}', '\u{3000}'), ('〄', '〄'), ('〒', '〒'), ('〠', '〠'), + ('〶', '〶'), ('㉈', '㉟'), ('㉿', '㉿'), ('㊱', '㊿'), + ('㋌', '㋏'), ('㍱', '㍺'), ('㎀', '㏟'), ('㏿', '㏿'), + ('䷀', '䷿'), ('꜀', '꜡'), ('ꞈ', '꞊'), ('꭛', '꭛'), + ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹄'), ('﹇', '﹒'), + ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), + ('[', '`'), ('{', '⦆'), ('¢', '₩'), ('│', '○'), + ('\u{fff9}', '�'), ('𐆐', '𐆛'), ('𐇐', '𐇼'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), + ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝌀', '𝍖'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), + ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), + ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🄌'), ('🄐', '🄮'), + ('🄰', '🅫'), ('🅰', '🆬'), ('🇦', '🇿'), ('🈁', '🈂'), + ('🈐', '🈻'), ('🉀', '🉈'), ('🉠', '🉥'), ('🌀', '🛔'), + ('🛠', '🛬'), ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), + ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), + ('🢐', '🢭'), ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), + ('🥐', '🥫'), ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), + ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = &[ + ('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('𐋠', '𐋻'), +]; + +pub const CUNEIFORM: &'static [(char, char)] = &[ + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), +]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐄿'), ('𐠀', '𐠅'), + ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), + ('𐠿', '𐠿'), +]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 'ᵸ'), + ('ⷠ', 'ⷿ'), ('⹃', '⹃'), ('Ꙁ', 'ꚟ'), ('︮', '︯'), +]; + +pub const DESERET: &'static [(char, char)] = &[ + ('𐐀', '𐑏'), +]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('ऀ', 'ॿ'), ('᳐', 'ᳶ'), ('᳸', '᳹'), ('⃰', '⃰'), + ('꠰', '꠹'), ('꣠', 'ꣽ'), +]; + +pub const DUPLOYAN: &'static [(char, char)] = &[ + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𛲜', '\u{1bca3}'), +]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𓀀', '𓐮'), +]; + +pub const ELBASAN: &'static [(char, char)] = &[ + ('𐔀', '𐔧'), +]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('։', '։'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('ა', 'ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('҄', '҄'), ('҇', '҇'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('⹃', '⹃'), + ('꙯', '꙯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[ + ('𐌰', '𐍊'), +]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ப', 'ப'), ('வ', 'வ'), + ('௦', '௲'), ('᳐', '᳐'), ('᳒', '᳓'), ('ᳲ', '᳴'), + ('᳸', 
'᳹'), ('⃰', '⃰'), ('𑌀', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('͂', '͂'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('΄', '΄'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϡ'), ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), ('ᶿ', '᷁'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), + ('ῲ', 'ῴ'), ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), ('𐆠', '𐆠'), ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ઁ', 'ઃ'), ('અ', 'ઍ'), + ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), + ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), ('ો', '્'), + ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), ('૦', '૱'), ('ૹ', '૿'), + ('꠰', '꠹'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), ('ਾ', 'ੂ'), + ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), ('꠰', '꠹'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('、', '〃'), + ('々', '】'), ('〓', '〟'), ('〡', '〭'), ('〰', '〰'), + ('〷', '〿'), ('・', '・'), ('㆐', '㆟'), ('㇀', '㇣'), + ('㈠', '㉇'), ('㊀', '㊰'), ('㋀', '㋋'), ('㍘', '㍰'), + ('㍻', '㍿'), ('㏠', '㏾'), ('㐀', '䶵'), ('一', '鿪'), + ('豈', '舘'), ('並', '龎'), ('﹅', '﹆'), ('。', '・'), + ('𝍠', '𝍱'), ('🉐', '🉑'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), + ('〮', '〰'), ('〷', '〷'), ('・', '・'), ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('﹅', '﹆'), ('。', '・'), + ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[ + ('ᜠ', '᜶'), +]; + +pub const HATRAN: &'static [(char, char)] = &[ + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿'), +]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('֑', 'ׇ'), ('א', 'ת'), ('װ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), + ('〷', '〷'), ('〼', '〽'), ('ぁ', 'ゖ'), ('゙', '゠'), + ('・', 'ー'), ('﹅', '﹆'), ('。', '・'), ('ー', 'ー'), + ('゙', '゚'), ('𛀁', '𛄞'), ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[ + ('𐡀', '𐡕'), ('𐡗', '𐡟'), +]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('̀', '́'), ('̓', '̈́'), ('͆', '͢'), ('᪰', '᪾'), ('᷂', '᷹'), + ('᷻', '᷿'), ('\u{200c}', '\u{200d}'), ('⃐', '⃯'), ('︀', '️'), + ('︠', '︭'), ('𐇽', '𐇽'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('󠄀', '󠇯'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[ + ('𐭠', '𐭲'), ('𐭸', '𐭿'), +]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[ + ('𐭀', '𐭕'), ('𐭘', '𐭟'), +]; + +pub const JAVANESE: &'static [(char, char)] = &[ + ('ꦀ', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟'), +]; + +pub const KAITHI: &'static [(char, char)] = &[ + ('०', '९'), ('꠰', '꠹'), ('𑂀', '𑃁'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('಼', 
'ೄ'), ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), + ('᳚', '᳚'), ('ᳵ', 'ᳵ'), ('꠰', '꠵'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), + ('〷', '〷'), ('〼', '〽'), ('゙', '゜'), ('゠', 'ヿ'), + ('ㇰ', 'ㇿ'), ('㋐', '㋾'), ('㌀', '㍗'), ('﹅', '﹆'), + ('。', '゚'), ('𛀀', '𛀀'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[ + ('꤀', '꤯'), +]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐩇'), ('𐩐', '𐩘'), +]; + +pub const KHMER: &'static [(char, char)] = &[ + ('ក', '៝'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿'), +]; + +pub const KHOJKI: &'static [(char, char)] = &[ + ('૦', '૯'), ('𑈀', '𑈑'), ('𑈓', '𑈾'), +]; + +pub const KHUDAWADI: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), +]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), + ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('ͣ', 'ͯ'), ('҅', '҆'), + ('॑', '॒'), ('჻', '჻'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃰', '⃰'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꟿ'), ('꤮', '꤮'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), + ('ff', 'st'), ('A', 'Z'), ('a', 'z'), +]; + +pub const LEPCHA: &'static [(char, char)] = &[ + ('ᰀ', '᰷'), ('᰻', '᱉'), ('ᱍ', 'ᱏ'), +]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('॥', '॥'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), + ('᥀', '᥀'), ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = &[ + ('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), +]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), + ('𐄇', '𐄳'), ('𐄷', '𐄿'), +]; + +pub const LISU: &'static [(char, char)] = &[ + ('ꓐ', '꓿'), +]; + +pub const LYCIAN: &'static [(char, char)] = &[ + ('𐊀', '𐊜'), +]; + +pub const LYDIAN: &'static [(char, char)] = &[ + ('𐤠', '𐤹'), ('𐤿', '𐤿'), +]; + +pub const MAHAJANI: &'static [(char, char)] = &[ + ('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶'), +]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ഀ', 'ഃ'), ('അ', 'ഌ'), + ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', '൏'), + ('ൔ', 'ൣ'), ('൦', 'ൿ'), ('᳚', '᳚'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('ࡀ', '࡛'), ('࡞', '࡞'), +]; + +pub const MANICHAEAN: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𐫀', '𐫦'), ('𐫫', '𐫶'), +]; + +pub const MARCHEN: &'static [(char, char)] = &[ + ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), +]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), +]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = &[ + ('ꫠ', '꫶'), ('ꯀ', '꯭'), ('꯰', '꯹'), +]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = &[ + ('𞠀', '𞣄'), ('𞣇', '𞣖'), +]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[ + ('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿'), +]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𐦀', '𐦟'), +]; + +pub const MIAO: &'static [(char, char)] = &[ + 
('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), +]; + +pub const MODI: &'static [(char, char)] = &[ + ('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙'), +]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '\u{180e}'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), + ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[ + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), +]; + +pub const MULTANI: &'static [(char, char)] = &[ + ('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊩'), +]; + +pub const MYANMAR: &'static [(char, char)] = &[ + ('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ'), +]; + +pub const NABATAEAN: &'static [(char, char)] = &[ + ('𐢀', '𐢞'), ('𐢧', '𐢯'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = &[ + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟'), +]; + +pub const NEWA: &'static [(char, char)] = &[ + ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), +]; + +pub const NKO: &'static [(char, char)] = &[ + ('߀', 'ߺ'), +]; + +pub const NUSHU: &'static [(char, char)] = &[ + ('𖿡', '𖿡'), ('𛅰', '𛋻'), +]; + +pub const OGHAM: &'static [(char, char)] = &[ + ('\u{1680}', '᚜'), +]; + +pub const OL_CHIKI: &'static [(char, char)] = &[ + ('᱐', '᱿'), +]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = &[ + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), +]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[ + ('𐌀', '𐌣'), ('𐌭', '𐌯'), +]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[ + ('𐪀', '𐪟'), +]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[ + ('҃', '҃'), ('𐍐', '𐍺'), +]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[ + ('𐎠', '𐏃'), ('𐏈', '𐏕'), +]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[ + ('𐩠', '𐩿'), +]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[ + ('𐰀', '𐱈'), +]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[ + ('𐒰', '𐓓'), ('𐓘', '𐓻'), +]; + +pub const OSMANYA: &'static [(char, char)] = &[ + ('𐒀', '𐒝'), ('𐒠', '𐒩'), +]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = &[ + ('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), +]; + +pub const PALMYRENE: &'static [(char, char)] = &[ + ('𐡠', '𐡿'), +]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[ + ('𑫀', '𑫸'), +]; + +pub const PHAGS_PA: &'static [(char, char)] = &[ + ('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷'), +]; + +pub const PHOENICIAN: &'static [(char, char)] = &[ + ('𐤀', '𐤛'), ('𐤟', '𐤟'), +]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), +]; + +pub const REJANG: &'static [(char, char)] = &[ + ('ꤰ', '꥓'), ('꥟', '꥟'), +]; + +pub const RUNIC: &'static [(char, char)] = &[ + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), +]; + +pub const SAMARITAN: &'static [(char, char)] = &[ + ('ࠀ', '࠭'), ('࠰', '࠾'), +]; + +pub const SAURASHTRA: &'static [(char, char)] = &[ + ('ꢀ', 'ꣅ'), ('꣎', '꣙'), +]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('॑', '॑'), ('᳗', '᳗'), ('᳙', '᳙'), ('᳜', '᳝'), + ('᳠', '᳠'), ('𑆀', '𑇍'), ('𑇐', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[ + ('𐑐', '𐑿'), +]; + +pub const SIDDHAM: &'static [(char, char)] = &[ + ('𑖀', '𑖵'), ('𑖸', '𑗝'), +]; + +pub const SIGNWRITING: &'static [(char, char)] = &[ + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), +]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('।', '॥'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), 
('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', '෴'), ('𑇡', '𑇴'), +]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[ + ('𑃐', '𑃨'), ('𑃰', '𑃹'), +]; + +pub const SOYOMBO: &'static [(char, char)] = &[ + ('𑩐', '𑪃'), ('𑪆', '𑪜'), ('𑪞', '𑪢'), +]; + +pub const SUNDANESE: &'static [(char, char)] = &[ + ('ᮀ', 'ᮿ'), ('᳀', '᳇'), +]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[ + ('।', '॥'), ('০', '৯'), ('ꠀ', '꠫'), +]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('ـ', 'ـ'), ('ً', 'ٕ'), + ('ٰ', 'ٰ'), ('܀', '܍'), ('\u{70f}', '݊'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ'), +]; + +pub const TAGALOG: &'static [(char, char)] = &[ + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), ('᜵', '᜶'), +]; + +pub const TAGBANWA: &'static [(char, char)] = &[ + ('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), +]; + +pub const TAI_LE: &'static [(char, char)] = &[ + ('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), +]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[ + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), +]; + +pub const TAKRI: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), +]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), + ('ா', 'ூ'), ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), + ('ௗ', 'ௗ'), ('௦', '௺'), ('᳚', '᳚'), ('ꣳ', 'ꣳ'), + ('𑌁', '𑌁'), ('𑌃', '𑌃'), ('𑌼', '𑌼'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ఀ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), + ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), + ('ౠ', 'ౣ'), ('౦', '౯'), ('౸', '౿'), ('᳚', '᳚'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('٠', '٩'), ('ހ', 'ޱ'), + ('ﷲ', 'ﷲ'), ('﷽', '﷽'), +]; + +pub const THAI: &'static [(char, char)] = &[ + ('ก', 'ฺ'), ('เ', '๛'), +]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ྗ'), ('ྙ', 'ྼ'), + ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = &[ + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('⵿', '⵿'), +]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), +]; + +pub const UGARITIC: &'static [(char, char)] = &[ + ('𐎀', '𐎝'), ('𐎟', '𐎟'), +]; + +pub const VAI: &'static [(char, char)] = &[ + ('ꔀ', 'ꘫ'), +]; + +pub const WARANG_CITI: &'static [(char, char)] = &[ + ('𑢠', '𑣲'), ('𑣿', '𑣿'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('、', '。'), ('〈', '】'), ('〔', '〛'), ('・', '・'), + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), ('。', '・'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[ + ('𑨀', '𑩇'), +]; diff --git a/regex-syntax/benches/bench.rs b/regex-syntax/benches/bench.rs new file mode 100644 index 0000000000..f887772b27 --- /dev/null +++ b/regex-syntax/benches/bench.rs @@ -0,0 +1,65 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#![feature(test)] + +extern crate regex_syntax; +extern crate test; + +use regex_syntax::Expr; +use test::Bencher; + +#[bench] +fn parse_simple1(b: &mut Bencher) { + b.iter(|| { + let re = r"^bc(d|e)*$"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_simple2(b: &mut Bencher) { + b.iter(|| { + let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_small1(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}|\p{N}|\s|.|\d"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium1(b: &mut Bencher) { + b.iter(|| { + let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium2(b: &mut Bencher) { + b.iter(|| { + let re = r"\s\S\w\W\d\D"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_huge(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}{100}"; + Expr::parse(re).unwrap() + }); +}
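
A note on usage: the benchmark file above exercises the old crate's `Expr::parse` entry point and needs a nightly toolchain because of `#![feature(test)]`; it runs with `cargo bench`. The generated script tables earlier in the patch are sorted slices of inclusive `(char, char)` ranges, so membership can be tested with a plain binary search. The sketch below is illustrative only and is not code from this patch; `in_ranges` is a hypothetical helper, and the two ranges are inlined copies resembling the `ADLAM` entry so the example stays self-contained.

    use std::cmp::Ordering;

    /// Returns true if `c` falls inside any of the sorted, non-overlapping,
    /// inclusive `(start, end)` ranges.
    fn in_ranges(ranges: &[(char, char)], c: char) -> bool {
        ranges
            .binary_search_by(|&(start, end)| {
                if c < start {
                    // This range lies entirely after `c`.
                    Ordering::Greater
                } else if c > end {
                    // This range lies entirely before `c`.
                    Ordering::Less
                } else {
                    Ordering::Equal
                }
            })
            .is_ok()
    }

    fn main() {
        // Hypothetical table: Adlam letters and digits, mirroring the shape
        // of the generated tables in unicode_tables/script.rs.
        let adlam: &[(char, char)] = &[
            ('\u{1E900}', '\u{1E94A}'),
            ('\u{1E950}', '\u{1E959}'),
        ];
        assert!(in_ranges(adlam, '\u{1E900}'));
        assert!(!in_ranges(adlam, 'a'));
    }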