From 1d98af133599790f6c7ff2a2a3838705596e5c85 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sun, 9 Jul 2023 17:31:56 +0200 Subject: [PATCH] allow starting DFA in noncontinuous bytes regex-automata already supports traversing the DFA one byte at a time with `next_state`. This is potentially very useful when scanning non-contiguous data such as a network stream or the rope data structures commonly used in editors. However, starting the DFA with `start_state_forward`/`start_state_reverse` currently requires an `Input` and will look one byte ahead of/behind the span boundaries. A streaming use case (especially one using prefilters/literal optimizations) cannot easily provide such a haystack (it can be worked around with a temporary array and copying one byte over, but that is extremely brittle/hacky). This commit adds the `start_state_forward_with`/`start_state_reverse_with` functions, which allow passing the information extracted from the `Input` directly. --- regex-automata/src/dfa/automaton.rs | 37 ++++++++ regex-automata/src/dfa/dense.rs | 68 ++++++++++---- regex-automata/src/dfa/sparse.rs | 69 ++++++++++---- regex-automata/src/hybrid/dfa.rs | 135 ++++++++++++++++++++++------ regex-automata/src/util/start.rs | 36 +++++--- 5 files changed, 274 insertions(+), 71 deletions(-) diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a15..8e09255d0 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -8,6 +8,7 @@ use crate::{ primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, }, + Span, }; /// A trait describing the interface of a deterministic finite automaton (DFA). 
@@ -253,6 +254,14 @@ pub unsafe trait Automaton { input: &Input<'_>, ) -> Result; + /// TODO + fn start_state_forward_with( + &self, + mode: Anchored, + look_behind: Option, + span: Span, + ) -> Result; + /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// @@ -280,6 +289,14 @@ pub unsafe trait Automaton { input: &Input<'_>, ) -> Result; + /// TODO + fn start_state_reverse_with( + &self, + mode: Anchored, + look_ahead: Option, + span: Span, + ) -> Result; + /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that /// state's identifier. @@ -1806,6 +1823,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).start_state_forward(input) } + #[inline] + fn start_state_forward_with( + &self, + mode: Anchored, + look_behind: Option, + span: Span, + ) -> Result { + (**self).start_state_forward_with(mode, look_behind, span) + } + #[inline] fn start_state_reverse( &self, @@ -1814,6 +1841,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).start_state_reverse(input) } + #[inline] + fn start_state_reverse_with( + &self, + mode: Anchored, + look_behind: Option, + span: Span, + ) -> Result { + (**self).start_state_reverse_with(mode, look_behind, span) + } + #[inline] fn universal_start_state(&self, mode: Anchored) -> Option { (**self).universal_start_state(mode) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 35f037ca6..c28091cc0 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -44,6 +44,7 @@ use crate::{ start::{Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, + Span, }; /// The label that is pre-pended to a serialized DFA. 
@@ -2883,7 +2884,9 @@ impl OwnedDFA { let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st + .start(inp.get_anchored(), start) + .expect("valid Input configuration") }; if self.start_kind().has_unanchored() { let inp = Input::new("").anchored(Anchored::No); @@ -3215,15 +3218,33 @@ unsafe impl> Automaton for DFA { &self, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_forward_with( + input.get_anchored(), + input.start().checked_sub(1).map(|i| input.haystack()[i]), + input.get_span(), + ) + } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_forward_with( + &self, + mode: Anchored, + look_behind: Option, + span: Span, + ) -> Result { + debug_assert_eq!( + span.start != 0, + look_behind.is_some(), + "look_behind should be provided if and only if the DFA starts at an offset" + ); + if !self.quitset.is_empty() { + if let Some(byte) = look_behind { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.start - 1)); + } } } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) + let start = self.st.start_map.fwd_with(look_behind); + self.st.start(mode, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -3231,15 +3252,29 @@ unsafe impl> Automaton for DFA { &self, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_reverse_with( + input.get_anchored(), + 
input.haystack().get(input.end()).copied(), + input.get_span(), + ) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_reverse_with( + &self, + mode: Anchored, + look_ahead: Option, + span: Span, + ) -> Result { + if !self.quitset.is_empty() { + if let Some(byte) = look_ahead { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.end)); + } } } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + let start = self.st.start_map.rev_with(look_ahead); + self.st.start(mode, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -4175,11 +4210,10 @@ impl> StartTable { #[cfg_attr(feature = "perf-inline", inline(always))] fn start( &self, - input: &Input<'_>, + mode: Anchored, start: Start, ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); let index = match mode { Anchored::No => { if !self.kind.has_unanchored() { diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec2340..373744311 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -67,6 +67,7 @@ use crate::{ start::{Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, + Span, }; const LABEL: &str = "rust-regex-automata-dfa-sparse"; @@ -1206,36 +1207,69 @@ unsafe impl> Automaton for DFA { self.flags.is_always_start_anchored } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn start_state_forward( &self, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_forward_with( + input.get_anchored(), + input.start().checked_sub(1).map(|i| input.haystack()[i]), + input.get_span(), + ) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_forward_with( + &self, + mode: 
Anchored, + look_behind: Option, + span: Span, + ) -> Result { + debug_assert_eq!( + span.start != 0, + look_behind.is_some(), + "look_behind should be provided if and only if the DFA starts at an offset" + ); + if !self.quitset.is_empty() { + if let Some(byte) = look_behind { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.start - 1)); + } } } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) + let start = self.st.start_map.fwd_with(look_behind); + self.st.start(mode, start) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn start_state_reverse( &self, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_reverse_with( + input.get_anchored(), + input.haystack().get(input.end()).copied(), + input.get_span(), + ) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_reverse_with( + &self, + mode: Anchored, + look_ahead: Option, + span: Span, + ) -> Result { + if !self.quitset.is_empty() { + if let Some(byte) = look_ahead { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.end)); + } } } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + let start = self.st.start_map.rev_with(look_ahead); + self.st.start(mode, start) } #[inline] @@ -2145,11 +2179,10 @@ impl> StartTable { /// panics. 
fn start( &self, - input: &Input<'_>, + mode: Anchored, start: Start, ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); let index = match mode { Anchored::No => { if !self.kind.has_unanchored() { diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 86963248f..3fa056cdf 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -30,6 +30,7 @@ use crate::{ sparse_set::SparseSets, start::{Start, StartByteMap}, }, + Span, }; /// The minimum number of states that a lazy DFA's cache size must support. @@ -1547,20 +1548,64 @@ impl DFA { cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_forward_with( + cache, + input.get_anchored(), + input.start().checked_sub(1).map(|i| input.haystack()[i]), + input.get_span(), + ) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] (not a [`CacheError`]!) 
if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state_forward_with( + &self, + cache: &mut Cache, + mode: Anchored, + look_behind: Option, + span: Span, + ) -> Result { + debug_assert_eq!( + span.start != 0, + look_behind.is_some(), + "look_behind should be provided if and only if the DFA starts at an offset" + ); + if !self.quitset.is_empty() { + if let Some(byte) = look_behind { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.start - 1)); + } } } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; + let start_type = self.start_map.fwd_with(look_behind); + let start = + LazyRef::new(self, cache).get_cached_start_id(mode, start_type)?; if !start.is_unknown() { return Ok(start); } - Lazy::new(self, cache).cache_start_group(input, start_type) + Lazy::new(self, cache).cache_start_group(mode, start_type, span) } /// Return the ID of the start state for this lazy DFA when executing a @@ -1592,20 +1637,59 @@ impl DFA { cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + self.start_state_reverse_with( + cache, + input.get_anchored(), + input.haystack().get(input.end()).copied(), + input.get_span(), + ) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. 
+ /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, i.e. the end of `span`. This + /// and the byte immediately following the end of the span (if one + /// exists) influence which look-ahead assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `mode` is an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state_reverse_with( + &self, + cache: &mut Cache, + mode: Anchored, + look_ahead: Option, + span: Span, + ) -> Result { + if !self.quitset.is_empty() { + if let Some(byte) = look_ahead { + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, span.end)); + } } } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; + let start_type = self.start_map.rev_with(look_ahead); + let start = + LazyRef::new(self, cache).get_cached_start_id(mode, start_type)?; if !start.is_unknown() { return Ok(start); } - Lazy::new(self, cache).cache_start_group(mode, start_type, span) } /// Returns the total number of patterns that match in this state. 
@@ -2122,10 +2206,10 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + mode: Anchored, start: Start, + span: Span, ) -> Result { - let mode = input.get_anchored(); let nfa_start_id = match mode { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), @@ -2142,8 +2226,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(|_| MatchError::gave_up(span.start))?; + self.set_start_state(mode, start, id); Ok(id) } @@ -2574,13 +2658,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. fn set_start_state( &mut self, - input: &Input<'_>, + mode: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match mode { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,11 +2726,10 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + mode: Anchored, start: Start, ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); let index = match mode { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083..0f1072f0c 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -73,23 +73,39 @@ impl StartByteMap { /// Return the forward starting configuration for the given `input`. 
#[cfg_attr(feature = "perf-inline", inline(always))] + #[cfg(test)] pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } + self.fwd_with( + input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i)) + .copied(), + ) } /// Return the reverse starting configuration for the given `input`. #[cfg_attr(feature = "perf-inline", inline(always))] + #[cfg(test)] pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { + self.rev_with(input.haystack().get(input.end()).copied()) + } + + /// Return the forward starting configuration for the given `look_behind` + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn fwd_with(&self, look_behind: Option) -> Start { + match look_behind { + None => Start::Text, + Some(byte) => self.get(byte), + } + } + + /// Return the reverse starting configuration for the given `look_ahead`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn rev_with(&self, look_ahead: Option) -> Start { + match look_ahead { None => Start::Text, - Some(&byte) => self.get(byte), + Some(byte) => self.get(byte), } }