Skip to content

Commit

Permalink
regex-automata: fix bug in DFA quit behavior
Browse files Browse the repository at this point in the history
It turns out that the way we were dealing with quit states in the DFA
was not quite right. Basically, if we entered a quit state and a match
had been found, then we were returning the match instead of the error.
But the match might not be the correct leftmost-first match, and so, we
really shouldn't return it. Otherwise a regex like '\B.*' could match
much less than it should.

This was caught by a differential fuzzer developed in #848.
  • Loading branch information
BurntSushi committed May 21, 2023
1 parent 00f9023 commit 2147215
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 34 deletions.
6 changes: 3 additions & 3 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ impl Config {
///
/// // The match occurs before the search ever observes the snowman
/// // character, so no error occurs.
/// let haystack = "foo 123 ☃".as_bytes();
/// let haystack = "foo 123 ☃".as_bytes();
/// let expected = Some(HalfMatch::must(0, 7));
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
Expand All @@ -572,8 +572,8 @@ impl Config {
/// // routines read one byte past the end of the search to account for
/// // look-around, and indeed, this is required here to determine whether
/// // the trailing \b matches.
/// let haystack = "foo 123☃".as_bytes();
/// let expected = MatchError::quit(0xE2, 7);
/// let haystack = "foo 123 ☃".as_bytes();
/// let expected = MatchError::quit(0xE2, 8);
/// let got = dfa.try_search_fwd(&Input::new(haystack));
/// assert_eq!(Err(expected), got);
///
Expand Down
12 changes: 0 additions & 12 deletions regex-automata/src/dfa/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,6 @@ fn find_fwd_imp<A: Automaton + ?Sized>(
// actually be tripped even if DFA::from_bytes succeeds and
// returns a supposedly valid DFA.
debug_assert!(dfa.is_quit_state(sid));
if mat.is_some() {
return Ok(mat);
}
return Err(MatchError::quit(input.haystack()[at], at));
}
}
Expand Down Expand Up @@ -307,9 +304,6 @@ fn find_rev_imp<A: Automaton + ?Sized>(
return Ok(mat);
} else {
debug_assert!(dfa.is_quit_state(sid));
if mat.is_some() {
return Ok(mat);
}
return Err(MatchError::quit(input.haystack()[at], at));
}
}
Expand Down Expand Up @@ -611,9 +605,6 @@ fn eoi_fwd<A: Automaton + ?Sized>(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if dfa.is_quit_state(*sid) {
if mat.is_some() {
return Ok(());
}
return Err(MatchError::quit(b, sp.end));
}
}
Expand Down Expand Up @@ -646,9 +637,6 @@ fn eoi_rev<A: Automaton + ?Sized>(
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if dfa.is_quit_state(*sid) {
if mat.is_some() {
return Ok(());
}
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
Expand Down
6 changes: 3 additions & 3 deletions regex-automata/src/hybrid/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3226,7 +3226,7 @@ impl Config {
///
/// // The match occurs before the search ever observes the snowman
/// // character, so no error occurs.
/// let haystack = "foo 123 ☃";
/// let haystack = "foo 123 ☃";
/// let expected = Some(HalfMatch::must(0, 7));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
/// assert_eq!(expected, got);
Expand All @@ -3236,8 +3236,8 @@ impl Config {
/// // routines read one byte past the end of the search to account for
/// // look-around, and indeed, this is required here to determine whether
/// // the trailing \b matches.
/// let haystack = "foo 123☃";
/// let expected = MatchError::quit(0xE2, 7);
/// let haystack = "foo 123 ☃";
/// let expected = MatchError::quit(0xE2, 8);
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
/// assert_eq!(Err(expected), got);
///
Expand Down
12 changes: 0 additions & 12 deletions regex-automata/src/hybrid/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,6 @@ fn find_fwd_imp(
return Ok(mat);
} else if sid.is_quit() {
cache.search_finish(at);
if mat.is_some() {
return Ok(mat);
}
return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
Expand Down Expand Up @@ -432,9 +429,6 @@ fn find_rev_imp(
return Ok(mat);
} else if sid.is_quit() {
cache.search_finish(at);
if mat.is_some() {
return Ok(mat);
}
return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
Expand Down Expand Up @@ -730,9 +724,6 @@ fn eoi_fwd(
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if sid.is_quit() {
if mat.is_some() {
return Ok(());
}
return Err(MatchError::quit(b, sp.end));
}
}
Expand Down Expand Up @@ -770,9 +761,6 @@ fn eoi_rev(
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if sid.is_quit() {
if mat.is_some() {
return Ok(());
}
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
Expand Down
8 changes: 4 additions & 4 deletions regex-automata/src/meta/strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ impl Strategy for Core {
// We manually inline try_search_mayfail here because letting the
// compiler do it seems to produce pretty crappy codegen.
return if let Some(e) = self.dfa.get(input) {
trace!("using full DFA for search at {:?}", input.get_span());
trace!("using full DFA for full search at {:?}", input.get_span());
match e.try_search(input) {
Ok(x) => x,
Err(_err) => {
Expand All @@ -645,7 +645,7 @@ impl Strategy for Core {
}
}
} else if let Some(e) = self.hybrid.get(input) {
trace!("using lazy DFA for search at {:?}", input.get_span());
trace!("using lazy DFA for full search at {:?}", input.get_span());
match e.try_search(&mut cache.hybrid, input) {
Ok(x) => x,
Err(_err) => {
Expand All @@ -668,7 +668,7 @@ impl Strategy for Core {
// can use a single forward scan without needing to run the reverse
// DFA.
return if let Some(e) = self.dfa.get(input) {
trace!("using full DFA for search at {:?}", input.get_span());
trace!("using full DFA for half search at {:?}", input.get_span());
match e.try_search_half_fwd(input) {
Ok(x) => x,
Err(_err) => {
Expand All @@ -677,7 +677,7 @@ impl Strategy for Core {
}
}
} else if let Some(e) = self.hybrid.get(input) {
trace!("using lazy DFA for search at {:?}", input.get_span());
trace!("using lazy DFA for half search at {:?}", input.get_span());
match e.try_search_half_fwd(&mut cache.hybrid, input) {
Ok(x) => x,
Err(_err) => {
Expand Down
30 changes: 30 additions & 0 deletions testdata/regression.toml
Original file line number Diff line number Diff line change
Expand Up @@ -689,3 +689,33 @@ name = "captures-wrong-order"
regex = '(a){0}(a)'
haystack = 'a'
matches = [[[0, 1], [], [0, 1]]]

# This tests a bug in how quit states are handled in the DFA. At some point
# during development, the DFAs were tweaked slightly such that if they hit
# a quit state (which means, they hit a byte that the caller configured should
# stop the search), then it might not return an error necessarily. Namely, if a
# match had already been found, then it would be returned instead of an error.
#
# But this is actually wrong! Why? Because even though a match had been found,
# it wouldn't be fully correct to return it once a quit state has been seen
# because you can't determine whether the match offset returned is the correct
# greedy/leftmost-first match. Since you can't complete the search as requested
# by the caller, the DFA should just stop and return an error.
#
# Interestingly, this does seem to produce an unavoidable difference between
# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs.
# The former will stop immediately once a match is known to occur and return
# 'Ok(true)', where as the latter could find the match but quit with an
# 'Err(..)' first.
#
# Thankfully, I believe this inconsistency between 'is_match()' and 'find()'
# cannot be observed in the higher level meta regex API because it specifically
# will try another engine that won't fail in the case of a DFA failing.
#
# This regression happened in the regex crate rewrite, but before anything got
# released.
[[test]]
name = "negated-unicode-word-boundary-dfa-fail"
regex = '\B.*'
haystack = "!\u02D7"
matches = [[0, 3]]

0 comments on commit 2147215

Please sign in to comment.