From b45421223c0fba7dab5c047d5371c2fa24521a2d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 28 Feb 2023 16:49:39 -0500 Subject: [PATCH] syntax: polish and doc updates This updates docs in a number of places, including adding examples. We also make it so zero-width matches never impact the 'utf8' property. In practice, this means '(?-u:\B)' is now considered to match valid UTF-8, which is consistent with the fact that 'a*' is considered to match valid UTF-8 too. We also do a refresh of the 'Look' and 'LookSet' APIs. --- regex-syntax/README.md | 9 +- regex-syntax/src/error.rs | 17 +- regex-syntax/src/hir/literal.rs | 26 +- regex-syntax/src/hir/mod.rs | 914 ++++++++++++++++++++++-------- regex-syntax/src/hir/translate.rs | 47 +- regex-syntax/src/lib.rs | 13 +- regex-syntax/src/parser.rs | 18 +- regex-syntax/test | 2 +- src/compile.rs | 2 +- src/exec.rs | 14 +- 10 files changed, 746 insertions(+), 316 deletions(-) diff --git a/regex-syntax/README.md b/regex-syntax/README.md index 592f84268..ff4fe094c 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -30,13 +30,12 @@ concrete syntax that produced the `Hir`. This example shows how to parse a pattern string into its HIR: ```rust -use regex_syntax::Parser; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::{hir::Hir, parse}; -let hir = Parser::new().parse("a|b").unwrap(); +let hir = parse("a|b").unwrap(); assert_eq!(hir, Hir::alternation(vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])); ``` diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index a10230a87..98869c4f7 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -1,5 +1,3 @@ -use core::{cmp, fmt, result}; - use alloc::{ format, string::{String, ToString}, @@ -9,9 +7,6 @@ use alloc::{ use crate::{ast, hir}; -/// A type alias for dealing with errors returned by this crate. 
-pub type Result = result::Result; - /// This error type encompasses any error that can be returned by this crate. /// /// This error type is marked as `non_exhaustive`. This means that adding a @@ -42,8 +37,8 @@ impl From for Error { #[cfg(feature = "std")] impl std::error::Error for Error {} -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { Error::Parse(ref x) => x.fmt(f), Error::Translate(ref x) => x.fmt(f), @@ -91,8 +86,8 @@ impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { } } -impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let spans = Spans::from_formatter(self); if self.pattern.contains('\n') { let divider = repeat_char('~', 79); @@ -158,7 +153,7 @@ struct Spans<'p> { impl<'p> Spans<'p> { /// Build a sequence of spans from a formatter. 
- fn from_formatter<'e, E: fmt::Display>( + fn from_formatter<'e, E: core::fmt::Display>( fmter: &'p Formatter<'e, E>, ) -> Spans<'p> { let mut line_count = fmter.pattern.lines().count(); @@ -238,7 +233,7 @@ impl<'p> Spans<'p> { pos += 1; } let note_len = span.end.column.saturating_sub(span.start.column); - for _ in 0..cmp::max(1, note_len) { + for _ in 0..core::cmp::max(1, note_len) { notes.push('^'); pos += 1; } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 121216ae5..bd3a2d143 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -101,10 +101,9 @@ use crate::hir::{self, Hir}; /// This shows how to extract prefixes: /// /// ``` -/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; -/// -/// let hir = Parser::new().parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; /// +/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; /// let got = Extractor::new().extract(&hir); /// // All literals returned are "inexact" because none of them reach the /// // match state. @@ -129,11 +128,10 @@ use crate::hir::{self, Hir}; /// ``` /// use regex_syntax::{ /// hir::literal::{Extractor, ExtractKind, Literal, Seq}, -/// Parser, +/// parse, /// }; /// -/// let hir = Parser::new().parse(r"foo|[A-Z]+bar")?; -/// +/// let hir = parse(r"foo|[A-Z]+bar")?; /// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); /// // Since 'foo' gets to a match state, it is considered exact. But 'bar' /// // does not because of the '[A-Z]+', and thus is marked inexact. @@ -237,9 +235,9 @@ impl Extractor { /// for character classes being turned into literal sequences. 
/// /// ``` - /// use regex_syntax::{hir::literal::{Extractor, Seq}, Parser}; + /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; /// - /// let hir = Parser::new().parse(r"[0-9]")?; + /// let hir = parse(r"[0-9]")?; /// /// let got = Extractor::new().extract(&hir); /// let expected = Seq::new([ @@ -274,9 +272,9 @@ impl Extractor { /// This shows how to decrease the limit and compares it with the default. /// /// ``` - /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; /// - /// let hir = Parser::new().parse(r"(abc){8}")?; + /// let hir = parse(r"(abc){8}")?; /// /// let got = Extractor::new().extract(&hir); /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); @@ -311,9 +309,9 @@ impl Extractor { /// This shows how to decrease the limit and compares it with the default. /// /// ``` - /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; /// - /// let hir = Parser::new().parse(r"(abc){2}{2}{2}")?; + /// let hir = parse(r"(abc){2}{2}{2}")?; /// /// let got = Extractor::new().extract(&hir); /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); @@ -353,9 +351,9 @@ impl Extractor { /// sequence returned. /// /// ``` - /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; /// - /// let hir = Parser::new().parse(r"[ab]{2}{2}")?; + /// let hir = parse(r"[ab]{2}{2}")?; /// /// let got = Extractor::new().extract(&hir); /// let expected = Seq::new([ diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 9193366a9..f9bd51345 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1,5 +1,20 @@ /*! -Defines a high-level intermediate representation for regular expressions. 
+Defines a high-level intermediate representation (HIR) for regular expressions. + +The HIR is represented by the [`Hir`] type, and it is principally constructed via +[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users +may use the smart constructors defined on `Hir` to build their own by hand. The +smart constructors simultaneously simplify and "optimize" the HIR, and are also +the same routines used by translation. + +Most regex engines only have an HIR like this, and usually construct it +directly from the concrete syntax. This crate however first parses the +concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, +as mentioned above. It's done this way to facilitate better error reporting, +and to have a structured representation of a regex that faithfully represents +its concrete syntax. Namely, while an `Hir` value can be converted back to an +equivalent regex pattern string, it is unlikely to look like the original due +to its simplified structure. */ use core::{char, cmp}; @@ -122,6 +137,12 @@ impl core::fmt::Display for ErrorKind { /// A high-level intermediate representation (HIR) for a regular expression. /// +/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. +/// An `HirKind` indicates what kind of regular expression it is (a literal, +/// a repetition, a look-around assertion, etc.), whereas a `Properties` +/// describes various facts about the regular expression. For example, whether +/// it matches UTF-8 or if it matches the empty string. +/// /// The HIR of a regular expression represents an intermediate step between /// its abstract syntax (a structured description of the concrete syntax) and /// an actual regex matcher. The purpose of HIR is to make regular expressions @@ -133,24 +154,39 @@ impl core::fmt::Display for ErrorKind { /// it is handled automatically by the translator (e.g., by translating /// `(?i:A)` to `[aA]`). 
/// -/// If the HIR was produced by a translator that disallows invalid UTF-8, then -/// the HIR is guaranteed to match UTF-8 exclusively. -/// -/// This type defines its own destructor that uses constant stack space and -/// heap space proportional to the size of the HIR. -/// /// The specific type of an HIR expression can be accessed via its `kind` /// or `into_kind` methods. This extra level of indirection exists for two /// reasons: /// -/// 1. Construction of an HIR expression *must* use the constructor methods -/// on this `Hir` type instead of building the `HirKind` values directly. -/// This permits construction to enforce invariants like "concatenations -/// always consist of two or more sub-expressions." +/// 1. Construction of an HIR expression *must* use the constructor methods on +/// this `Hir` type instead of building the `HirKind` values directly. This +/// permits construction to enforce invariants like "concatenations always +/// consist of two or more sub-expressions." /// 2. Every HIR expression contains attributes that are defined inductively, -/// and can be computed cheaply during the construction process. For -/// example, one such attribute is whether the expression must match at the -/// beginning of the haystack. +/// and can be computed cheaply during the construction process. For example, +/// one such attribute is whether the expression must match at the beginning of +/// the haystack. +/// +/// In particular, if you have an `HirKind` value, then there is intentionally +/// no way to build an `Hir` value from it. You instead need to do case +/// analysis on the `HirKind` value and build the `Hir` value using its smart +/// constructors. +/// +/// # UTF-8 +/// +/// If the HIR was produced by a translator with +/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, +/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty +/// matches. +/// +/// For empty matches, those can occur at any position. 
It is the +/// responsibility of the regex engine to determine whether empty matches are +/// permitted between the code units of a single codepoint. +/// +/// # Stack space +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. /// /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular /// expression pattern string, and uses constant stack space and heap space @@ -169,38 +205,6 @@ pub struct Hir { props: Properties, } -/// The kind of an arbitrary `Hir` expression. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum HirKind { - /// The empty regular expression, which matches everything, including the - /// empty string. - Empty, - /// A literalstring that matches exactly these bytes. - Literal(Literal), - /// A single character class that matches any of the characters in the - /// class. A class can either consist of Unicode scalar values as - /// characters, or it can use bytes. - Class(Class), - /// A look-around assertion. A look-around match always has zero length. - Look(Look), - /// A repetition operation applied to a child expression. - Repetition(Repetition), - /// A capturing group, which contains a child expression. - Capture(Capture), - /// A concatenation of expressions. A concatenation always has at least two - /// child expressions. - /// - /// A concatenation matches only if each of its child expression matches - /// one after the other. - Concat(Vec<Hir>), - /// An alternation of expressions. An alternation always has at least two - /// child expressions. - /// - /// An alternation matches only if at least one of its child expression - /// matches. If multiple expressions match, then the leftmost is preferred. - Alternation(Vec<Hir>), -} - /// Methods for accessing the underlying `HirKind` and `Properties`. impl Hir { /// Returns a reference to the underlying HIR kind. @@ -233,7 +237,7 @@ impl Hir { /// Smart constructors for HIR values. 
/// -/// These constructors are called "smart" because they inductive work or +/// These constructors are called "smart" because they do inductive work or /// simplifications. For example, calling `Hir::repetition` with a repetition /// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind /// since it is equivalent to an empty regex. Another example is calling @@ -252,8 +256,9 @@ impl Hir { Hir { kind: HirKind::Empty, props } } - /// Returns an HIR expression that can never match anything. That is, the - /// set of strings in the language described by the HIR returned is `0`. + /// Returns an HIR expression that can never match anything. That is, + /// the size of the set of strings in the language described by the HIR + /// returned is `0`. /// /// This is distinct from [`Hir::empty`] in that the empty string matches /// the HIR returned by `Hir::empty`. That is, the set of strings in the @@ -278,9 +283,41 @@ impl Hir { /// Creates a literal HIR expression. /// - /// If the given literal has a `Byte` variant with an ASCII byte, then this - /// method panics. This enforces the invariant that `Byte` variants are - /// only used to express matching of invalid UTF-8. + /// This accepts anything that can be converted into a `Box<[u8]>`. + /// + /// Note that there is no mechanism for storing a `char` or a `Box` + /// in an HIR. Everything is "just bytes." Whether a `Literal` (or + /// any HIR node) matches valid UTF-8 exclusively can be queried via + /// [`Properties::is_utf8`]. + /// + /// # Example + /// + /// This example shows that concatenations of `Literal` HIR values will + /// automatically get flattened and combined together. So for example, even + /// if you concat multiple `Literal` values that are themselves not valid + /// UTF-8, they might add up to valid UTF-8. This also demonstrates just + /// how "smart" Hir's smart constructors are. 
+ /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let literals = vec![ + /// Hir::literal([0xE2]), + /// Hir::literal([0x98]), + /// Hir::literal([0x83]), + /// ]; + /// // Each literal, on its own, is invalid UTF-8. + /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); + /// + /// let concat = Hir::concat(literals); + /// // But the concatenation is valid UTF-8! + /// assert!(concat.properties().is_utf8()); + /// + /// // And also notice that the literals have been concatenated into a + /// // single `Literal`, to the point where there is no explicit `Concat`! + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, concat.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); @@ -293,7 +330,11 @@ impl Hir { Hir { kind: HirKind::Literal(lit), props } } - /// Creates a class HIR expression. + /// Creates a class HIR expression. The class may either be defined over + /// ranges of Unicode codepoints or ranges of raw byte values. + /// + /// Note that an empty class is permitted. An empty class is equivalent to + /// `Hir::fail()`. #[inline] pub fn class(class: Class) -> Hir { if class.is_empty() { @@ -330,6 +371,12 @@ impl Hir { } /// Creates a capture HIR expression. + /// + /// Note that there is no explicit HIR value for a non-capturing group. + /// Since a non-capturing group only exists to override precedence in the + /// concrete syntax and since an HIR already does its own grouping based on + /// what is parsed, there is no need to explicitly represent non-capturing + /// groups in the HIR. #[inline] pub fn capture(capture: Capture) -> Hir { let props = Properties::capture(&capture); @@ -338,8 +385,32 @@ impl Hir { /// Returns the concatenation of the given expressions. /// - /// This flattens the concatenation as appropriate. 
- pub fn concat(hirs: Vec) -> Hir { + /// This attempts to flatten and simplify the concatenation as appropriate. + /// + /// # Example + /// + /// This shows a simple example of basic flattening of both concatenations + /// and literals. + /// + /// ``` + /// use regex_syntax::hir::Hir; + /// + /// let hir = Hir::concat(vec![ + /// Hir::concat(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal([b'x']), + /// Hir::literal([b'y']), + /// Hir::literal([b'z']), + /// ]), + /// ]); + /// let expected = Hir::literal("abcxyz".as_bytes()); + /// assert_eq!(expected, hir); + /// ``` + pub fn concat(subs: Vec) -> Hir { // We rebuild the concatenation by simplifying it. Would be nice to do // it in place, but that seems a little tricky? let mut new = vec![]; @@ -348,8 +419,8 @@ impl Hir { // to 'prior_lit', and whenever we see anything else, we first take // any bytes in 'prior_lit' and add it to the 'new' concatenation. let mut prior_lit: Option> = None; - for hir in hirs { - let (kind, props) = hir.into_parts(); + for sub in subs { + let (kind, props) = sub.into_parts(); match kind { HirKind::Literal(Literal(bytes)) => { if let Some(ref mut prior_bytes) = prior_lit { @@ -362,9 +433,9 @@ impl Hir { // concat. We only need to do this one level deep since // Hir::concat is the only way to build concatenations, and so // flattening happens inductively. - HirKind::Concat(hirs2) => { - for hir2 in hirs2 { - let (kind2, props2) = hir2.into_parts(); + HirKind::Concat(subs2) => { + for sub2 in subs2 { + let (kind2, props2) = sub2.into_parts(); match kind2 { HirKind::Literal(Literal(bytes)) => { if let Some(ref mut prior_bytes) = prior_lit { @@ -406,17 +477,82 @@ impl Hir { /// Returns the alternation of the given expressions. /// - /// This flattens the alternation as appropriate. - pub fn alternation(hirs: Vec) -> Hir { + /// This flattens and simplifies the alternation as appropriate. 
This may + /// include factoring out common prefixes or even rewriting the alternation + /// as a character class. + /// + /// Note that an empty alternation is equivalent to `Hir::fail()`. (It + /// is not possible for one to write an empty alternation, or even an + /// alternation with a single sub-expression, in the concrete syntax of a + /// regex.) + /// + /// # Example + /// + /// This is a simple example showing how an alternation might get + /// simplified. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::literal([b'a']), + /// Hir::literal([b'b']), + /// Hir::literal([b'c']), + /// Hir::literal([b'd']), + /// Hir::literal([b'e']), + /// Hir::literal([b'f']), + /// ]); + /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'f'), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + /// + /// And another example showing how common prefixes might get factored + /// out. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; + /// + /// let hir = Hir::alternation(vec![ + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// ]), + /// Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// let expected = Hir::concat(vec![ + /// Hir::literal("abc".as_bytes()), + /// Hir::alternation(vec![ + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('A', 'Z'), + /// ]))), + /// Hir::class(Class::Unicode(ClassUnicode::new([ + /// ClassUnicodeRange::new('a', 'z'), + /// ]))), + /// ]), + /// ]); + /// assert_eq!(expected, hir); + /// ``` + /// + /// Note that these sorts of simplifications are not guaranteed. 
+ pub fn alternation(subs: Vec) -> Hir { // We rebuild the alternation by simplifying it. We proceed similarly // as the concatenation case. But in this case, there's no literal // simplification happening. We're just flattening alternations. let mut new = vec![]; - for hir in hirs { - let (kind, props) = hir.into_parts(); + for sub in subs { + let (kind, props) = sub.into_parts(); match kind { - HirKind::Alternation(hirs2) => { - new.extend(hirs2); + HirKind::Alternation(subs2) => { + new.extend(subs2); } kind => { new.push(Hir { kind, props }); @@ -478,9 +614,21 @@ impl Hir { /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// + /// # Example + /// /// Note that this is a convenience routine for constructing the correct /// character class based on the value of `Dot`. There is no explicit "dot" /// HIR value. It is just an abbreviation for a common character class. + /// + /// ``` + /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::dot(Dot::AnyByte); + /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ + /// ClassBytesRange::new(0x00, 0xFF), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` #[inline] pub fn dot(dot: Dot) -> Hir { match dot { @@ -524,31 +672,65 @@ impl Hir { } } -impl HirKind { - /// Return true if and only if this HIR is the empty regular expression. +/// The underlying kind of an arbitrary [`Hir`] expression. +/// +/// An `HirKind` is principally useful for doing case analysis on the type +/// of a regular expression. If you're looking to build new `Hir` values, +/// then you _must_ use the smart constructors defined on `Hir`, like +/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does +/// not expose any way of building an `Hir` directly from an `HirKind`. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A literal string that matches exactly these bytes. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. /// - /// Note that this is not defined inductively. That is, it only tests if - /// this kind is the `Empty` variant. To get the inductive definition, use - /// the `is_match_empty` method on [`Hir`]. - pub fn is_empty(&self) -> bool { - match *self { - HirKind::Empty => true, - _ => false, - } - } + /// A class may be empty, in which case it matches nothing. + Class(Class), + /// A look-around assertion. A look-around match always has zero length. + Look(Look), + /// A repetition operation applied to a sub-expression. + Repetition(Repetition), + /// A capturing group, which contains a sub-expression. + Capture(Capture), + /// A concatenation of expressions. + /// + /// A concatenation matches only if each of its sub-expressions matches one + /// after the other. + /// + /// Concatenations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Concat(Vec<Hir>), + /// An alternation of expressions. + /// + /// An alternation matches only if at least one of its sub-expressions + /// matches. If multiple sub-expressions match, then the leftmost is + /// preferred. + /// + /// Alternations are guaranteed by `Hir`'s smart constructors to always + /// have at least two sub-expressions. + Alternation(Vec<Hir>), +} + +impl HirKind { + /// Returns a slice of this kind's sub-expressions, if any. + pub fn subs(&self) -> &[Hir] { + use core::slice::from_ref; - /// Returns true if and only if this kind has any (including possibly - /// empty) subexpressions. 
- pub fn has_subexprs(&self) -> bool { match *self { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Look(_) => false, - HirKind::Capture(_) - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => true, + | HirKind::Look(_) => &[], + HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), + HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), + HirKind::Concat(ref subs) => subs, + HirKind::Alternation(ref subs) => subs, } } } @@ -573,10 +755,14 @@ impl core::fmt::Display for Hir { /// The high-level intermediate representation of a literal. /// -/// A literal corresponds to a single character, where a character is either -/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters -/// are preferred whenever possible. In particular, a `Byte` variant is only -/// ever produced when it could match invalid UTF-8. +/// A literal corresponds to `0` or more bytes that should be matched +/// literally. The smart constructors defined on `Hir` will automatically +/// concatenate adjacent literals into one literal, and will even automatically +/// replace empty literals with `Hir::empty()`. +/// +/// Note that despite a literal being represented by a sequence of bytes, its +/// `Debug` implementation will attempt to print it as a normal string. (That +/// is, not a sequence of decimal numbers.) #[derive(Clone, Eq, PartialEq)] pub struct Literal(pub Box<[u8]>); @@ -681,7 +867,7 @@ impl Class { pub fn is_utf8(&self) -> bool { match *self { Class::Unicode(_) => true, - Class::Bytes(ref x) => x.is_all_ascii(), + Class::Bytes(ref x) => x.is_ascii(), } } @@ -699,25 +885,25 @@ impl Class { /// minimum length, if any. /// /// ``` - /// use regex_syntax::{hir::Properties, Parser}; + /// use regex_syntax::{hir::Properties, parse}; /// /// // The empty string has a min length of 0. 
- /// let hir = Parser::new().parse(r"")?; + /// let hir = parse(r"")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // As do other types of regexes that only match the empty string. - /// let hir = Parser::new().parse(r"^$\b\B")?; + /// let hir = parse(r"^$\b\B")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // A regex that can match the empty string but match more is still 0. - /// let hir = Parser::new().parse(r"a*")?; + /// let hir = parse(r"a*")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // A regex that matches nothing has no minimum defined. - /// let hir = Parser::new().parse(r"[a&&b]")?; + /// let hir = parse(r"[a&&b]")?; /// assert_eq!(None, hir.properties().minimum_len()); /// // Character classes usually have a minimum length of 1. - /// let hir = Parser::new().parse(r"\w")?; + /// let hir = parse(r"\w")?; /// assert_eq!(Some(1), hir.properties().minimum_len()); /// // But sometimes Unicode classes might be bigger! - /// let hir = Parser::new().parse(r"\p{Cyrillic}")?; + /// let hir = parse(r"\p{Cyrillic}")?; /// assert_eq!(Some(2), hir.properties().minimum_len()); /// /// # Ok::<(), Box>(()) @@ -743,28 +929,28 @@ impl Class { /// maximum length, if any. /// /// ``` - /// use regex_syntax::{hir::Properties, Parser}; + /// use regex_syntax::{hir::Properties, parse}; /// /// // The empty string has a max length of 0. - /// let hir = Parser::new().parse(r"")?; + /// let hir = parse(r"")?; /// assert_eq!(Some(0), hir.properties().maximum_len()); /// // As do other types of regexes that only match the empty string. - /// let hir = Parser::new().parse(r"^$\b\B")?; + /// let hir = parse(r"^$\b\B")?; /// assert_eq!(Some(0), hir.properties().maximum_len()); /// // A regex that matches nothing has no maximum defined. - /// let hir = Parser::new().parse(r"[a&&b]")?; + /// let hir = parse(r"[a&&b]")?; /// assert_eq!(None, hir.properties().maximum_len()); /// // Bounded repeats work as you expect. 
- /// let hir = Parser::new().parse(r"x{2,10}")?; + /// let hir = parse(r"x{2,10}")?; /// assert_eq!(Some(10), hir.properties().maximum_len()); /// // An unbounded repeat means there is no maximum. - /// let hir = Parser::new().parse(r"x{2,}")?; + /// let hir = parse(r"x{2,}")?; /// assert_eq!(None, hir.properties().maximum_len()); /// // With Unicode enabled, \w can match up to 4 bytes! - /// let hir = Parser::new().parse(r"\w")?; + /// let hir = parse(r"\w")?; /// assert_eq!(Some(4), hir.properties().maximum_len()); /// // Without Unicode enabled, \w matches at most 1 byte. - /// let hir = Parser::new().parse(r"(?-u)\w")?; + /// let hir = parse(r"(?-u)\w")?; /// assert_eq!(Some(1), hir.properties().maximum_len()); /// /// # Ok::<(), Box>(()) @@ -810,7 +996,8 @@ impl ClassUnicode { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. pub fn new(ranges: I) -> ClassUnicode where I: IntoIterator, @@ -819,6 +1006,9 @@ impl ClassUnicode { } /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. pub fn empty() -> ClassUnicode { ClassUnicode::new(vec![]) } @@ -914,7 +1104,7 @@ impl ClassUnicode { /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII codepoint. - pub fn is_all_ascii(&self) -> bool { + pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') } @@ -955,7 +1145,7 @@ impl ClassUnicode { /// If this class consists of only ASCII ranges, then return its /// corresponding and equivalent byte class. 
pub fn to_byte_class(&self) -> Option { - if !self.is_all_ascii() { + if !self.is_ascii() { return None; } Some(ClassBytes::new(self.ranges().iter().map(|r| { @@ -1115,7 +1305,8 @@ impl ClassBytes { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. pub fn new(ranges: I) -> ClassBytes where I: IntoIterator, @@ -1124,6 +1315,9 @@ impl ClassBytes { } /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. pub fn empty() -> ClassBytes { ClassBytes::new(vec![]) } @@ -1193,7 +1387,7 @@ impl ClassBytes { /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII byte. - pub fn is_all_ascii(&self) -> bool { + pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } @@ -1238,7 +1432,7 @@ impl ClassBytes { /// If this class consists of only ASCII ranges, then return its /// corresponding and equivalent Unicode class. pub fn to_unicode_class(&self) -> Option { - if !self.is_all_ascii() { + if !self.is_ascii() { return None; } Some(ClassUnicode::new(self.ranges().iter().map(|r| { @@ -1369,82 +1563,111 @@ impl core::fmt::Debug for ClassBytesRange { pub enum Look { /// Match the beginning of text. Specifically, this matches at the starting /// position of the input. - Start, + Start = 1 << 0, /// Match the end of text. Specifically, this matches at the ending /// position of the input. - End, + End = 1 << 1, /// Match the beginning of a line or the beginning of text. Specifically, /// this matches at the starting position of the input, or at the position /// immediately following a `\n` character. 
- StartLF, + StartLF = 1 << 2, /// Match the end of a line or the end of text. Specifically, this matches /// at the end position of the input, or at the position immediately /// preceding a `\n` character. - EndLF, + EndLF = 1 << 3, /// Match the beginning of a line or the beginning of text. Specifically, /// this matches at the starting position of the input, or at the position /// immediately following either a `\r` or `\n` character, but never after /// a `\r` when a `\n` follows. - StartCRLF, + StartCRLF = 1 << 4, /// Match the end of a line or the end of text. Specifically, this matches /// at the end position of the input, or at the position immediately /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` /// precedes it. - EndCRLF, + EndCRLF = 1 << 5, /// Match an ASCII-only word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. - WordAscii, + WordAscii = 1 << 6, /// Match an ASCII-only negation of a word boundary. - WordAsciiNegate, + WordAsciiNegate = 1 << 7, /// Match a Unicode-aware word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. - WordUnicode, + WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. - WordUnicodeNegate, + WordUnicodeNegate = 1 << 9, } impl Look { - fn from_repr(repr: u16) -> Option { - match repr { - 0 => Some(Look::Start), - 1 => Some(Look::End), - 2 => Some(Look::StartLF), - 3 => Some(Look::EndLF), - 4 => Some(Look::StartCRLF), - 5 => Some(Look::EndCRLF), - 6 => Some(Look::WordAscii), - 7 => Some(Look::WordAsciiNegate), - 8 => Some(Look::WordUnicode), - 9 => Some(Look::WordUnicodeNegate), - _ => None, + /// Flip the look-around assertion to its equivalent for reverse searches. 
+ /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, } } - fn as_repr(&self) -> u16 { - match *self { - Look::Start => 0, - Look::End => 1, - Look::StartLF => 2, - Look::EndLF => 3, - Look::StartCRLF => 5, - Look::EndCRLF => 5, - Look::WordAscii => 6, - Look::WordAsciiNegate => 7, - Look::WordUnicode => 8, - Look::WordUnicodeNegate => 9, + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u16 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u16 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. 
+ #[inline] + pub const fn from_repr(repr: u16) -> Option { + match repr { + 0b00_0000_0001 => Some(Look::Start), + 0b00_0000_0010 => Some(Look::End), + 0b00_0000_0100 => Some(Look::StartLF), + 0b00_0000_1000 => Some(Look::EndLF), + 0b00_0001_0000 => Some(Look::StartCRLF), + 0b00_0010_0000 => Some(Look::EndCRLF), + 0b00_0100_0000 => Some(Look::WordAscii), + 0b00_1000_0000 => Some(Look::WordAsciiNegate), + 0b01_0000_0000 => Some(Look::WordUnicode), + 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + _ => None, } } - fn as_char(self) -> char { + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { match self { Look::Start => 'A', Look::End => 'z', Look::StartLF => '^', Look::EndLF => '$', - Look::StartCRLF => '^', - Look::EndCRLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', Look::WordAscii => 'b', Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', @@ -1482,6 +1705,9 @@ pub struct Repetition { /// /// Note that special cases like `?`, `+` and `*` all get translated into /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. pub min: u32, /// The maximum range of the repetition. /// @@ -1502,23 +1728,6 @@ pub struct Repetition { pub sub: Box, } -impl Repetition { - /// Returns true if and only if this repetition operator makes it possible - /// to match the empty string. - /// - /// Note that this is not defined inductively. 
For example, while `a*` - /// will report `true`, `()+` will not, even though `()` matches the empty - /// string and one or more occurrences of something that matches the - /// empty string will always match the empty string. In order to get the - /// inductive definition, see the corresponding method on [`Hir`]. - /// - /// This returns true in precisely the cases that [`Repetition::min`] - /// is equal to `0`. - pub fn is_match_empty(&self) -> bool { - self.min == 0 - } -} - /// A type describing the different flavors of `.`. /// /// This type is meant to be used with [`Hir::dot`], which is a convenience @@ -1564,8 +1773,10 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => return, - HirKind::Capture(ref x) if !x.sub.kind.has_subexprs() => return, - HirKind::Repetition(ref x) if !x.sub.kind.has_subexprs() => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } HirKind::Concat(ref x) if x.is_empty() => return, HirKind::Alternation(ref x) if x.is_empty() => return, _ => {} @@ -1601,7 +1812,8 @@ impl Drop for Hir { /// computed inductively on an HIR value. Properties are defined for all /// HIR values. /// -/// All methods on a `Properties` value take constant time. +/// All methods on a `Properties` value take constant time and are meant to +/// be cheap to call. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Properties(Box); @@ -1635,6 +1847,7 @@ impl Properties { /// `None` is returned when there is no minimum length. This occurs in /// precisely the cases where the HIR matches nothing. i.e., The language /// the regex matches is empty. An example of such a regex is `\P{any}`. + #[inline] pub fn minimum_len(&self) -> Option { self.0.minimum_len } @@ -1649,12 +1862,14 @@ impl Properties { /// occurs when the HIR matches nothing or when there is no upper bound on /// the length of matching strings. 
Example of such regexes are `\P{any}` /// (matches nothing) and `a+` (has no upper bound). + #[inline] pub fn maximum_len(&self) -> Option { self.0.maximum_len } /// Returns a set of all look-around assertions that appear at least once /// in this HIR value. + #[inline] pub fn look_set(&self) -> LookSet { self.0.look_set } @@ -1665,6 +1880,7 @@ impl Properties { /// /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true /// if and only if the HIR is fully anchored at the start. + #[inline] pub fn look_set_prefix(&self) -> LookSet { self.0.look_set_prefix } @@ -1676,6 +1892,7 @@ impl Properties { /// /// For example, `hir.look_set_suffix().contains(Look::End)` returns true /// if and only if the HIR is fully anchored at the end. + #[inline] pub fn look_set_suffix(&self) -> LookSet { self.0.look_set_suffix } @@ -1684,7 +1901,8 @@ impl Properties { /// valid UTF-8. /// /// When this returns false, then it is possible for this HIR expression to - /// match invalid UTF-8. + /// match invalid UTF-8, including by matching between the code units of + /// a single UTF-8 encoded codepoint. /// /// Note that this returns true even when the corresponding HIR can match /// the empty string. Since an empty string can technically appear between @@ -1693,6 +1911,57 @@ impl Properties { /// However, it is generally assumed that such empty matches are handled /// specially by the search routine if it is absolutely required that /// matches not split a codepoint. + /// + /// # Example + /// + /// This code example shows the UTF-8 property of a variety of patterns. + /// + /// ``` + /// use regex_syntax::{ParserBuilder, parse}; + /// + /// // Examples of 'is_utf8() == true'. 
+ /// assert!(parse(r"a")?.properties().is_utf8()); + /// assert!(parse(r"[^a]")?.properties().is_utf8()); + /// assert!(parse(r".")?.properties().is_utf8()); + /// assert!(parse(r"\W")?.properties().is_utf8()); + /// assert!(parse(r"\b")?.properties().is_utf8()); + /// assert!(parse(r"\B")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); + /// // Unicode mode is enabled by default, and in + /// // that mode, all \x hex escapes are treated as + /// // codepoints. So this actually matches the UTF-8 + /// // encoding of U+00FF. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// + /// // Now we show examples of 'is_utf8() == false'. + /// // The only way to do this is to force the parser + /// // to permit invalid UTF-8, otherwise all of these + /// // would fail to parse! + /// let parse = |pattern| { + /// ParserBuilder::new().utf8(false).build().parse(pattern) + /// }; + /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); + /// // Conversely to the equivalent example above, + /// // when Unicode mode is disabled, \x hex escapes + /// // are treated as their raw byte values. + /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); + /// // Note that just because we disabled UTF-8 in the + /// // parser doesn't mean we still can't use Unicode. + /// // It is enabled by default, so \xFF is still + /// // equivalent to matching the UTF-8 encoding of + /// // U+00FF by default. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// // Even though we use raw bytes that individually + /// // are not valid UTF-8, when combined together, the + /// // overall expression *does* match valid UTF-8! 
+ /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] pub fn is_utf8(&self) -> bool { self.0.utf8 } @@ -1702,18 +1971,33 @@ impl Properties { /// /// Note that this does not include the implicit capturing group /// corresponding to the entire match that is typically included by regex - /// engines. So for example, this method will return `0` for `a` and `1` - /// for `(a)`. + /// engines. + /// + /// # Example + /// + /// This method will return `0` for `a` and `1` for `(a)`: + /// + /// ``` + /// use regex_syntax::parse; + /// + /// assert_eq!(0, parse("a")?.properties().captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().captures_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] pub fn captures_len(&self) -> usize { self.0.captures_len } - /// Return true if and only if this HIR is a simple literal. This is only - /// true when this HIR expression is either itself a `Literal` or a + /// Return true if and only if this HIR is a simple literal. This is + /// only true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. /// - /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, - /// `` are not (even though that contain sub-expressions that are literals). + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and + /// the empty string are not (even though they contain sub-expressions that + /// are literals). + #[inline] pub fn is_literal(&self) -> bool { self.0.literal } @@ -1726,6 +2010,7 @@ impl Properties { /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation /// literals, but `f+`, `(foo)`, `foo()`, `` /// are not (even though that contain sub-expressions that are literals). + #[inline] pub fn is_alternation_literal(&self) -> bool { self.0.alternation_literal } @@ -1746,17 +2031,17 @@ impl Properties { /// like the minimum and maximum lengths. 
/// /// ``` - /// use regex_syntax::{hir::Properties, Parser}; + /// use regex_syntax::{hir::Properties, parse}; /// - /// let hir1 = Parser::new().parse("ab?c?")?; + /// let hir1 = parse("ab?c?")?; /// assert_eq!(Some(1), hir1.properties().minimum_len()); /// assert_eq!(Some(3), hir1.properties().maximum_len()); /// - /// let hir2 = Parser::new().parse(r"[a&&b]")?; + /// let hir2 = parse(r"[a&&b]")?; /// assert_eq!(None, hir2.properties().minimum_len()); /// assert_eq!(None, hir2.properties().maximum_len()); /// - /// let hir3 = Parser::new().parse(r"wxy?z?")?; + /// let hir3 = parse(r"wxy?z?")?; /// assert_eq!(Some(2), hir3.properties().minimum_len()); /// assert_eq!(Some(4), hir3.properties().maximum_len()); /// @@ -1776,17 +2061,17 @@ impl Properties { /// unaffected: /// /// ``` - /// use regex_syntax::{hir::Properties, Parser}; + /// use regex_syntax::{hir::Properties, parse}; /// - /// let hir1 = Parser::new().parse("ab?c?")?; + /// let hir1 = parse("ab?c?")?; /// assert_eq!(Some(1), hir1.properties().minimum_len()); /// assert_eq!(Some(3), hir1.properties().maximum_len()); /// - /// let hir2 = Parser::new().parse(r"a+")?; + /// let hir2 = parse(r"a+")?; /// assert_eq!(Some(1), hir2.properties().minimum_len()); /// assert_eq!(None, hir2.properties().maximum_len()); /// - /// let hir3 = Parser::new().parse(r"wxy?z?")?; + /// let hir3 = parse(r"wxy?z?")?; /// assert_eq!(Some(2), hir3.properties().minimum_len()); /// assert_eq!(Some(4), hir3.properties().maximum_len()); /// @@ -1834,9 +2119,9 @@ impl Properties { // Handle properties that need to visit every child hir. 
for prop in it { let p = prop.borrow(); - props.look_set.union(p.look_set()); - props.look_set_prefix.intersect(p.look_set_prefix()); - props.look_set_suffix.intersect(p.look_set_suffix()); + props.look_set.set_union(p.look_set()); + props.look_set_prefix.set_intersect(p.look_set_prefix()); + props.look_set_suffix.set_intersect(p.look_set_suffix()); props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); @@ -1935,29 +2220,26 @@ impl Properties { /// Create a new set of HIR properties for a look-around assertion. fn look(look: Look) -> Properties { - use self::Look::*; - - let utf8 = match look { - Start | End | StartLF | EndLF | WordAscii | WordUnicode - | WordUnicodeNegate => true, - // FIXME: Negated ASCII word boundaries can match invalid UTF-8. - // But why is this 'false' when 'HirKind::Empty' is true? After - // all, isn't WordAsciiNegate just a subset of HirKind::Empty? It - // seems to me that if we handle HirKind::Empty correctly even when - // it splits a codepoint, then we should be able to automatically - // handle WordAsciiNegate correctly too... - // - // For now, this returns 'false' because that's what it did before. - // But we should revisit this before the next release. - WordAsciiNegate => false, - }; let inner = PropertiesI { minimum_len: Some(0), maximum_len: Some(0), look_set: LookSet::singleton(look), look_set_prefix: LookSet::singleton(look), look_set_suffix: LookSet::singleton(look), - utf8, + // This requires a little explanation. Basically, we don't consider + // matching an empty string to be equivalent to matching invalid + // UTF-8, even though technically matching every empty string will + // split the UTF-8 encoding of a single codepoint when treating a + // UTF-8 encoded string as a sequence of bytes. 
Our defense here is + // that in such a case, a codepoint should logically be treated as + // the fundamental atom for matching, and thus the only valid match + // points are between codepoints and not bytes. + // + // More practically, this is true here because it's also true + // for 'Hir::empty()', otherwise something like 'a*' would be + // considered to match invalid UTF-8. That in turn makes this + // property borderline useless. + utf8: true, captures_len: 0, literal: false, alternation_literal: false, @@ -1989,7 +2271,10 @@ impl Properties { literal: false, alternation_literal: false, }; - if !rep.is_match_empty() { + // If the repetition operator can match the empty string, then its + // lookset prefix and suffix remain empty since they are no longer + // required to match. + if rep.min > 0 { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); } @@ -2027,7 +2312,7 @@ impl Properties { // Handle properties that need to visit every child hir. for x in concat.iter() { let p = x.properties(); - props.look_set.union(p.look_set()); + props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); @@ -2051,7 +2336,7 @@ impl Properties { // child exprs until one matches more than the empty string. let mut it = concat.iter(); while let Some(x) = it.next() { - props.look_set_prefix.union(x.properties().look_set_prefix()); + props.look_set_prefix.set_union(x.properties().look_set_prefix()); if x.properties().maximum_len().map_or(true, |x| x > 0) { break; } @@ -2059,7 +2344,7 @@ impl Properties { // Same thing for the suffix properties, but in reverse. 
let mut it = concat.iter().rev(); while let Some(x) = it.next() { - props.look_set_suffix.union(x.properties().look_set_suffix()); + props.look_set_suffix.set_union(x.properties().look_set_suffix()); if x.properties().maximum_len().map_or(true, |x| x > 0) { break; } @@ -2079,7 +2364,16 @@ impl Properties { /// example, an [`Hir`] provides properties that return `LookSet`s. #[derive(Clone, Copy, Default, Eq, PartialEq)] pub struct LookSet { - bits: u16, + /// The underlying representation of this set is exposed to make it + /// possible to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies the single bit that is set + /// in its `Look::as_repr()` value. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u16, } impl LookSet { @@ -2103,14 +2397,12 @@ impl LookSet { /// one look-around assertions. #[inline] pub fn singleton(look: Look) -> LookSet { - let mut set = LookSet::empty(); - set.insert(look); - set + LookSet::empty().insert(look) } /// Returns the total number of look-around assertions in this set. #[inline] - pub fn len(&self) -> usize { + pub fn len(self) -> usize { // OK because max value always fits in a u8, which in turn always // fits in a usize, regardless of target. usize::try_from(self.bits.count_ones()).unwrap() @@ -2118,43 +2410,70 @@ impl LookSet { /// Returns true if and only if this set is empty. #[inline] - pub fn is_empty(&self) -> bool { + pub fn is_empty(self) -> bool { self.len() == 0 } - /// Insert the given look-around assertions into this set. If the assertion - /// is already in the set, then this is a no-op. + /// Returns true if and only if the given look-around assertion is in this + /// set. 
#[inline] - pub fn insert(&mut self, look: Look) { - self.bits |= 1 << look.as_repr(); + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 } - /// Remove the given look-around assertion from this set. If it wasn't - /// previously in the set, then this is a no-op. + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." #[inline] - pub fn remove(&mut self, look: Look) { - self.bits &= !(1 << look.as_repr()); + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() } - /// Returns true if and only if the given look-around assertion is in this - /// set. + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. 
#[inline] - pub fn contains(&self, look: Look) -> bool { - self.bits & (1 << look.as_repr()) != 0 + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) } /// Returns true if and only if this set contains any word boundary or /// negated word boundary assertions. This include both Unicode and ASCII /// word boundaries. #[inline] - pub fn contains_word(&self) -> bool { + pub fn contains_word(self) -> bool { self.contains_word_unicode() || self.contains_word_ascii() } /// Returns true if and only if this set contains any Unicode word boundary /// or negated Unicode word boundary assertions. #[inline] - pub fn contains_word_unicode(&self) -> bool { + pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) } @@ -2162,26 +2481,109 @@ impl LookSet { /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] - pub fn contains_word_ascii(&self) -> bool { + pub fn contains_word_ascii(self) -> bool { self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) } - /// Modifies this set to be the union of itself and the set given. + /// Returns an iterator over all of the look-around assertions in this set. #[inline] - pub fn union(&mut self, other: LookSet) { - self.bits |= other.bits; + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } } - /// Modifies this set to be the intersection of itself and the set given. + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. 
#[inline] - pub fn intersect(&mut self, other: LookSet) { - self.bits &= other.bits; + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } } - /// Returns an iterator over all of the look-around assertions in this set. + /// Updates this set in place with the result of inserting the given + /// assertion into this set. #[inline] - pub fn iter(self) -> LookSetIter { - LookSetIter { set: self } + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. + #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. 
+ #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 16-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; } } @@ -2210,11 +2612,14 @@ impl Iterator for LookSetIter { #[inline] fn next(&mut self) -> Option { + if self.set.is_empty() { + return None; + } // We'll never have more than u8::MAX distinct look-around assertions, // so 'repr' will always fit into a u16. 
let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(repr)?; - self.set.remove(look); + let look = Look::from_repr(1 << repr)?; + self.set = self.set.remove(look); Some(look) } } @@ -3113,7 +3518,7 @@ mod tests { props: Properties::empty(), }; } - assert!(!expr.kind.is_empty()); + assert!(!matches!(*expr.kind(), HirKind::Empty)); }; // We run our test on a thread with a small stack size so we can @@ -3125,4 +3530,31 @@ mod tests { .join() .unwrap(); } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(10, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + fn look_set_debug() { + let res = format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩", res); + } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index c1ebf85c2..81ae9b898 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -9,7 +9,7 @@ use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ ast::{self, Ast, Span, Visitor}, either::Either, - hir::{self, Error, ErrorKind, Hir}, + hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, }; @@ -425,7 +425,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { - if !expr.kind().is_empty() { + if !matches!(*expr.kind(), HirKind::Empty) { exprs.push(expr); } } @@ -899,21 +899,11 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAscii }), - 
ast::AssertionKind::NotWordBoundary => { - Hir::look(if unicode { - hir::Look::WordUnicodeNegate - } else { - // It is possible for negated ASCII word boundaries to - // match at invalid UTF-8 boundaries, even when searching - // valid UTF-8. - if self.trans().utf8 { - return Err( - self.error(asst.span, ErrorKind::InvalidUtf8) - ); - } - hir::Look::WordAsciiNegate - }) - } + ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { + hir::Look::WordUnicodeNegate + } else { + hir::Look::WordAsciiNegate + }), }) } @@ -1055,7 +1045,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // Negating a Perl byte class is likely to cause it to match invalid // UTF-8. That's only OK if the translator is configured to allow such // things. - if self.trans().utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_ascii() { return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); } Ok(class) @@ -1123,7 +1113,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if negated { class.negate(); } - if self.trans().utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) @@ -1796,18 +1786,7 @@ mod tests { assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); - assert_eq!(t_bytes(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); - - assert_eq!( - t_err(r"(?-u)\B"), - TestError { - kind: hir::ErrorKind::InvalidUtf8, - span: Span::new( - Position::new(5, 1, 6), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } #[test] @@ -3199,21 +3178,21 @@ mod tests { assert!(props_bytes(r"\b").is_utf8()); assert!(props_bytes(r"\B").is_utf8()); assert!(props_bytes(r"(?-u)\b").is_utf8()); + assert!(props_bytes(r"(?-u)\B").is_utf8()); // Negative examples. 
assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); - assert!(!props_bytes(r"(?-u)\B").is_utf8()); } #[test] fn analysis_captures_len() { assert_eq!(0, props(r"a").captures_len()); assert_eq!(0, props(r"(?:a)").captures_len()); - assert_eq!(0, props(r"(?i:a)").captures_len()); - assert_eq!(0, props(r"(?i)a").captures_len()); + assert_eq!(0, props(r"(?i-u:a)").captures_len()); + assert_eq!(0, props(r"(?i-u)a").captures_len()); assert_eq!(1, props(r"(a)").captures_len()); assert_eq!(1, props(r"(?Pa)").captures_len()); assert_eq!(1, props(r"()").captures_len()); diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index c56f9d1ff..10540cab5 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -29,7 +29,8 @@ These two types come with conversion routines: As a convenience, the above two conversion routines are combined into one via the top-level [`Parser`] type. This `Parser` will first convert your pattern to -an `Ast` and then convert the `Ast` to an `Hir`. +an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as a +top-level [`parse`] free function. # Example @@ -37,14 +38,14 @@ an `Ast` and then convert the `Ast` to an `Hir`. 
This example shows how to parse a pattern string into its HIR: ``` -use regex_syntax::Parser; -use regex_syntax::hir::Hir; +use regex_syntax::{hir::Hir, parse}; -let hir = Parser::new().parse("a|b").unwrap(); +let hir = parse("a|b")?; assert_eq!(hir, Hir::alternation(vec![ Hir::literal("a".as_bytes()), Hir::literal("b".as_bytes()), ])); +# Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -170,8 +171,8 @@ extern crate std; extern crate alloc; pub use crate::{ - error::{Error, Result}, - parser::{Parser, ParserBuilder}, + error::Error, + parser::{parse, Parser, ParserBuilder}, unicode::UnicodeWordError, }; diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 2851cda33..2e7a2bb80 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -1,4 +1,18 @@ -use crate::{ast, hir, Result}; +use crate::{ast, hir, Error}; + +/// A convenience routine for parsing a regex using default options. +/// +/// This is equivalent to `Parser::new().parse(pattern)`. +/// +/// If you need to set non-default options, then use a [`ParserBuilder`]. +/// +/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically +/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator +/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then +/// you should use an [`ast::parse::Parser`]. +pub fn parse(pattern: &str) -> Result<hir::Hir, Error> { + Parser::new().parse(pattern) +} /// A builder for a regular expression parser. /// @@ -207,7 +221,7 @@ impl Parser { /// Parse the regular expression into a high level intermediate /// representation. 
- pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> { + pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> { let ast = self.ast.parse(pattern)?; let hir = self.hir.translate(pattern, &ast)?; Ok(hir) diff --git a/regex-syntax/test b/regex-syntax/test index 50854d2b3..a4d6cfaba 100755 --- a/regex-syntax/test +++ b/regex-syntax/test @@ -18,7 +18,7 @@ features=( unicode-segment ) for f in "${features[@]}"; do - echo "===== FEATURE: $f ===" + echo "=== FEATURE: $f ===" # We only run library tests because I couldn't figure out how to easily # make doc tests run in 'no_std' mode. In particular, without the Error # trait, using '?' in doc tests seems tricky. diff --git a/src/compile.rs b/src/compile.rs index 20eebf0ed..c29196a72 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -290,7 +290,7 @@ impl Compiler { if self.compiled.uses_bytes() { self.c_class_bytes(cls.ranges()) } else { - assert!(cls.is_all_ascii()); + assert!(cls.is_ascii()); let mut char_ranges = vec![]; for r in cls.iter() { let (s, e) = (r.start() as char, r.end() as char); diff --git a/src/exec.rs b/src/exec.rs index dc41ee6fc..b50d2399f 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -252,7 +252,19 @@ impl ExecBuilder { let expr = parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; let props = expr.properties(); - bytes = bytes || !props.is_utf8(); + // This used to just check whether the HIR matched valid UTF-8 + // or not, but in regex-syntax 0.7, we changed our definition of + // "matches valid UTF-8" to exclude zero-width matches. And in + // particular, previously, we considered WordAsciiNegate (that + // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our + // matcher engines were built under this assumption and fixing + // them is not worth it with the imminent plan to switch over to + // regex-automata. So for now, we retain the previous behavior by + // just explicitly treating the presence of a negated ASCII word + // boundary as forcing us to use a byte oriented automaton. 
+ bytes = bytes + || !props.is_utf8() + || props.look_set().contains(Look::WordAsciiNegate); if cfg!(feature = "perf-literal") { if !props.look_set_prefix().contains(Look::Start)