From 4a752474d50b3fea21af58235446ae5c807025d6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Apr 2023 07:20:59 -0400 Subject: [PATCH] impl: fix prefix literal matching bug This commit fixes a bug where it was possible to report a match where none existed. Basically, in the current regex crate, it just cannot deal with a mixture of look-around assertions in the prefix of a pattern and prefix literal optimizations. Before 1.8, this was handled by simply refusing to extract literals in that case. But in 1.8, with a rewrite of the literal extractor, literals are now extracted for patterns like this: (?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_)) So in 1.8, since it was still using the old engines that can't deal with this, I added some extra logic to throw away any extracted prefix literals if a look-around assertion occurred in the prefix of the pattern. The problem is that the logic I used was "always occurs in the prefix of the pattern" instead of "may occur in the prefix of the pattern." In the pattern above, it's the latter case. So it slipped by and the regex engine tried to use the prefix literals to accelerat the search. This in turn caused mishandling of the `\b` and led to a false positive match. The specific reason why the current regex engines can't deal with this is because they weren't designed to handle searches that took the surrounding context into account when resolving look-around assertions. It was a pretty big oversight on my part many years ago. The new engines we'll be migrating to Real Soon Now don't have this problem and can deal with the prefix literal optimizations while correctly handling look-around assertions in the prefix. Fixes #981 --- regex-syntax/src/hir/mod.rs | 56 +++++++++++++++++++++++++++++-- regex-syntax/src/hir/translate.rs | 6 ++++ src/exec.rs | 13 +++---- tests/regression.rs | 19 +++++++++++ 4 files changed, 85 insertions(+), 9 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index bdf2fb2c6..e5ea3701b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1854,6 +1854,8 @@ struct PropertiesI { look_set: LookSet, look_set_prefix: LookSet, look_set_suffix: LookSet, + look_set_prefix_any: LookSet, + look_set_suffix_any: LookSet, utf8: bool, explicit_captures_len: usize, static_explicit_captures_len: Option, @@ -1909,6 +1911,19 @@ impl Properties { self.0.look_set_prefix } + /// Returns a set of all look-around assertions that appear as a _possible_ + /// prefix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion before consuming any input. + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + /// Returns a set of all look-around assertions that appear as a suffix for /// this HIR value. That is, the set returned corresponds to the set of /// assertions that must be passed in order to be considered a match after @@ -1921,6 +1936,19 @@ impl Properties { self.0.look_set_suffix } + /// Returns a set of all look-around assertions that appear as a _possible_ + /// suffix for this HIR value. That is, the set returned corresponds to the + /// set of assertions that _may_ be passed before matching any bytes in a + /// haystack. + /// + /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns + /// true if and only if it's possible for the regex to match through a + /// anchored assertion at the end of a match without consuming any input. + #[inline] + pub fn look_set_suffix_any(&self) -> LookSet { + self.0.look_set_suffix_any + } + /// Return true if and only if the corresponding HIR will always match /// valid UTF-8. /// @@ -2188,6 +2216,8 @@ impl Properties { look_set: LookSet::empty(), look_set_prefix: fix, look_set_suffix: fix, + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), utf8: true, explicit_captures_len: 0, static_explicit_captures_len, @@ -2201,6 +2231,8 @@ impl Properties { props.look_set.set_union(p.look_set()); props.look_set_prefix.set_intersect(p.look_set_prefix()); props.look_set_suffix.set_intersect(p.look_set_suffix()); + props.look_set_prefix_any.set_union(p.look_set_prefix_any()); + props.look_set_suffix_any.set_union(p.look_set_suffix_any()); props.utf8 = props.utf8 && p.is_utf8(); props.explicit_captures_len = props .explicit_captures_len @@ -2246,6 +2278,8 @@ impl Properties { look_set: LookSet::empty(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), // It is debatable whether an empty regex always matches at valid // UTF-8 boundaries. Strictly speaking, at a byte oriented view, // it is clearly false. There are, for example, many empty strings @@ -2280,6 +2314,8 @@ impl Properties { look_set: LookSet::empty(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2297,6 +2333,8 @@ impl Properties { look_set: LookSet::empty(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), utf8: class.is_utf8(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2314,6 +2352,8 @@ impl Properties { look_set: LookSet::singleton(look), look_set_prefix: LookSet::singleton(look), look_set_suffix: LookSet::singleton(look), + look_set_prefix_any: LookSet::singleton(look), + look_set_suffix_any: LookSet::singleton(look), // This requires a little explanation. Basically, we don't consider // matching an empty string to be equivalent to matching invalid // UTF-8, even though technically matching every empty string will @@ -2355,15 +2395,17 @@ impl Properties { look_set: p.look_set(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), + look_set_prefix_any: p.look_set_prefix_any(), + look_set_suffix_any: p.look_set_suffix_any(), utf8: p.is_utf8(), explicit_captures_len: p.explicit_captures_len(), static_explicit_captures_len: p.static_explicit_captures_len(), literal: false, alternation_literal: false, }; - // The repetition operator can match the empty string, then its lookset - // prefix and suffixes themselves remain empty since they are no longer - // required to match. + // If the repetition operator can match the empty string, then its + // lookset prefix and suffixes themselves remain empty since they are + // no longer required to match. if rep.min > 0 { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); @@ -2414,6 +2456,8 @@ impl Properties { look_set: LookSet::empty(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), + look_set_prefix_any: LookSet::empty(), + look_set_suffix_any: LookSet::empty(), utf8: true, explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2455,6 +2499,9 @@ impl Properties { let mut it = concat.iter(); while let Some(x) = it.next() { props.look_set_prefix.set_union(x.properties().look_set_prefix()); + props + .look_set_prefix_any + .set_union(x.properties().look_set_prefix_any()); if x.properties().maximum_len().map_or(true, |x| x > 0) { break; } @@ -2463,6 +2510,9 @@ impl Properties { let mut it = concat.iter().rev(); while let Some(x) = it.next() { props.look_set_suffix.set_union(x.properties().look_set_suffix()); + props + .look_set_suffix_any + .set_union(x.properties().look_set_suffix_any()); if x.properties().maximum_len().map_or(true, |x| x > 0) { break; } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 8751372e8..4063d334a 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3287,6 +3287,12 @@ mod tests { assert_eq!(p.minimum_len(), Some(1)); } + #[test] + fn analysis_look_set_prefix_any() { + let p = props(r"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"); + assert!(p.look_set_prefix_any().contains(Look::WordUnicode)); + } + #[test] fn analysis_is_anchored() { let is_start = |p| props(p).look_set_prefix().contains(Look::Start); diff --git a/src/exec.rs b/src/exec.rs index 778a39d4c..c449b38cf 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -274,18 +274,18 @@ impl ExecBuilder { // prefixes, so disable them. prefixes = None; } else if is_set - && props.look_set_prefix().contains(Look::Start) + && props.look_set_prefix_any().contains(Look::Start) { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; - } else if props.look_set_prefix().contains_word() { + } else if props.look_set_prefix_any().contains_word() { // The new literal extractor ignores look-around while // the old one refused to extract prefixes from regexes // that began with a \b. These old creaky regex internals // can't deal with it, so we drop it. prefixes = None; - } else if props.look_set().contains(Look::StartLF) { + } else if props.look_set_prefix_any().contains(Look::StartLF) { // Similar to the reasoning for word boundaries, this old // regex engine can't handle literal prefixes with '(?m:^)' // at the beginning of a regex. @@ -298,15 +298,16 @@ impl ExecBuilder { // Partial anchors unfortunately make it hard to use // suffixes, so disable them. suffixes = None; - } else if is_set && props.look_set_suffix().contains(Look::End) + } else if is_set + && props.look_set_suffix_any().contains(Look::End) { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; - } else if props.look_set_suffix().contains_word() { + } else if props.look_set_suffix_any().contains_word() { // See the prefix case for reasoning here. suffixes = None; - } else if props.look_set().contains(Look::EndLF) { + } else if props.look_set_suffix_any().contains(Look::EndLF) { // See the prefix case for reasoning here. suffixes = None; } diff --git a/tests/regression.rs b/tests/regression.rs index e8b252538..bc22e245f 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -220,3 +220,22 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); // See: https://github.com/rust-lang/regex/issues/862 mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/981 +#[cfg(feature = "unicode")] +#[test] +fn regression_bad_word_boundary() { + let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap(); + let hay = "ubi-Darwin-x86_64.tar.gz"; + assert!(!re.is_match(text!(hay))); + let hay = "ubi-Windows-x86_64.zip"; + assert!(re.is_match(text!(hay))); +} + +// See: https://github.com/rust-lang/regex/issues/982 +#[test] +fn regression_unicode_perl_not_enabled() { + let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?"; + let re = regex_new!(pat); + assert!(re.is_ok()); +}