diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a474af1b..5b88d9e80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ TBD * [BUG #1046](https://github.com/rust-lang/regex/issues/1046): Fix a bug that could result in incorrect match spans when using a Unicode word boundary and searching non-ASCII strings. +* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088): +Remove guarantees in the API that connect the `u` flag with a specific HIR +representation. 1.9.6 (2023-09-30) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6c1d2745e..f8a3d4a9e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -797,13 +797,18 @@ impl core::fmt::Debug for Literal { /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either -/// defined by a Unicode scalar value or a byte. Unicode characters are used -/// by default, while bytes are used when Unicode mode (via the `u` flag) is -/// disabled. +/// defined by a Unicode scalar value or a byte. /// /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// /// Note that `Bytes` variant may be produced even when it exclusively matches /// valid UTF-8. This is because a `Bytes` variant represents an intention by /// the author of the regular expression to disable Unicode mode, which in turn @@ -1326,8 +1331,9 @@ impl ClassUnicodeRange { } } -/// A set of characters represented by arbitrary bytes (where one byte -/// corresponds to one character). +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBytes { set: IntervalSet,