Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow starting DFA in noncontinuous bytes #1031

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions regex-automata/src/dfa/automaton.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::{
primitives::{PatternID, StateID},
search::{Anchored, HalfMatch, Input, MatchError},
},
Span,
};

/// A trait describing the interface of a deterministic finite automaton (DFA).
Expand Down Expand Up @@ -253,6 +254,14 @@ pub unsafe trait Automaton {
input: &Input<'_>,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for a forward search, given the
/// pieces of search configuration directly rather than a full `Input`.
///
/// This is the lower level counterpart of `start_state_forward`. The dense
/// and sparse DFAs in this crate implement `start_state_forward` by calling
/// this method with the input's anchored mode, the byte immediately before
/// the start of the search (if there is one) and the input's span.
///
/// * `mode` selects the anchored mode used to pick the start state.
/// * `look_behind` is the byte immediately preceding the start of the
///   search, or `None` when the search starts at offset `0`. The dense and
///   sparse DFAs assert in debug builds that `look_behind` is `Some` if and
///   only if `span.start != 0`.
/// * `span` is the span being searched. The dense and sparse DFAs only use
///   `span.start`, to compute the offset reported in a quit error.
///
/// # Errors
///
/// The dense and sparse DFAs return a quit error at offset
/// `span.start - 1` when `look_behind` is a byte in the DFA's quit set.
/// They may also return whatever error their start table lookup produces
/// for the given `mode`.
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for this lazy DFA when executing a
/// reverse search.
///
Expand Down Expand Up @@ -280,6 +289,14 @@ pub unsafe trait Automaton {
input: &Input<'_>,
) -> Result<StateID, MatchError>;

/// Return the ID of the start state for a reverse search, given the
/// pieces of search configuration directly rather than a full `Input`.
///
/// This is the lower level counterpart of `start_state_reverse`. The dense
/// and sparse DFAs in this crate implement `start_state_reverse` by calling
/// this method with the input's anchored mode, the byte at the end offset
/// of the search (if it is in bounds) and the input's span.
///
/// * `mode` selects the anchored mode used to pick the start state.
/// * `look_ahead` is the byte immediately following the end of the search,
///   or `None` when the search ends at the end of the haystack.
/// * `span` is the span being searched. The dense and sparse DFAs only use
///   `span.end`, as the offset reported in a quit error.
///
/// # Errors
///
/// The dense and sparse DFAs return a quit error at offset `span.end` when
/// `look_ahead` is a byte in the DFA's quit set. They may also return
/// whatever error their start table lookup produces for the given `mode`.
fn start_state_reverse_with(
&self,
mode: Anchored,
look_ahead: Option<u8>,
span: Span,
) -> Result<StateID, MatchError>;

/// If this DFA has a universal starting state for the given anchor mode
/// and the DFA supports universal starting states, then this returns that
/// state's identifier.
Expand Down Expand Up @@ -1806,6 +1823,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
(**self).start_state_forward(input)
}

#[inline]
fn start_state_forward_with(
&self,
mode: Anchored,
look_behind: Option<u8>,
span: Span,
) -> Result<StateID, MatchError> {
(**self).start_state_forward_with(mode, look_behind, span)
}

#[inline]
fn start_state_reverse(
&self,
Expand All @@ -1814,6 +1841,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
(**self).start_state_reverse(input)
}

#[inline]
fn start_state_reverse_with(
    &self,
    mode: Anchored,
    // Named `look_ahead` to match the trait declaration: for a reverse
    // search this is the byte *after* the end of the span, not the byte
    // before its start.
    look_ahead: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    // Forward to the underlying automaton behind the reference.
    (**self).start_state_reverse_with(mode, look_ahead, span)
}

#[inline]
fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
(**self).universal_start_state(mode)
Expand Down
68 changes: 51 additions & 17 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ use crate::{
start::{Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
Span,
};

/// The label that is pre-pended to a serialized DFA.
Expand Down Expand Up @@ -2883,7 +2884,9 @@ impl OwnedDFA {
let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
// This OK because we only call 'start' under conditions
// in which we know it will succeed.
dfa.st.start(inp, start).expect("valid Input configuration")
dfa.st
.start(inp.get_anchored(), start)
.expect("valid Input configuration")
};
if self.start_kind().has_unanchored() {
let inp = Input::new("").anchored(Anchored::No);
Expand Down Expand Up @@ -3215,31 +3218,63 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
if !self.quitset.is_empty() && input.start() > 0 {
let offset = input.start() - 1;
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
self.start_state_forward_with(
input.get_anchored(),
input.start().checked_sub(1).map(|i| input.haystack()[i]),
input.get_span(),
)
}
/// Compute the forward start state from an explicit anchored mode, the
/// byte preceding the search (if any) and the search span.
///
/// Returns a quit error at offset `span.start - 1` when `look_behind` is a
/// byte in this DFA's quit set. Otherwise the start configuration is
/// derived from `look_behind` and looked up in the start table for `mode`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward_with(
    &self,
    mode: Anchored,
    look_behind: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    // The quit offset below is computed as `span.start - 1`, which is only
    // meaningful when a look-behind byte exists exactly when the span does
    // not begin at offset zero.
    debug_assert_eq!(
        span.start != 0,
        look_behind.is_some(),
        "look_behind should be provided if and only if the DFA starts at an offset"
    );
    // Skip the byte lookup entirely when no quit bytes are configured.
    if !self.quitset.is_empty() {
        if let Some(byte) = look_behind {
            if self.quitset.contains(byte) {
                return Err(MatchError::quit(byte, span.start - 1));
            }
        }
    }
    let start = self.st.start_map.fwd_with(look_behind);
    self.st.start(mode, start)
}

#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
let offset = input.end();
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
self.start_state_reverse_with(
input.get_anchored(),
input.haystack().get(input.end()).copied(),
input.get_span(),
)
}

/// Compute the reverse start state from an explicit anchored mode, the
/// byte following the search (if any) and the search span.
///
/// Returns a quit error at offset `span.end` when `look_ahead` is a byte
/// in this DFA's quit set. Otherwise the start configuration is derived
/// from `look_ahead` and looked up in the start table for `mode`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse_with(
    &self,
    mode: Anchored,
    look_ahead: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    // Skip the byte lookup entirely when no quit bytes are configured.
    if !self.quitset.is_empty() {
        if let Some(byte) = look_ahead {
            if self.quitset.contains(byte) {
                return Err(MatchError::quit(byte, span.end));
            }
        }
    }
    let start = self.st.start_map.rev_with(look_ahead);
    self.st.start(mode, start)
}

#[cfg_attr(feature = "perf-inline", inline(always))]
Expand Down Expand Up @@ -4175,11 +4210,10 @@ impl<T: AsRef<[u32]>> StartTable<T> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start(
&self,
input: &Input<'_>,
mode: Anchored,
start: Start,
) -> Result<StateID, MatchError> {
let start_index = start.as_usize();
let mode = input.get_anchored();
let index = match mode {
Anchored::No => {
if !self.kind.has_unanchored() {
Expand Down
69 changes: 51 additions & 18 deletions regex-automata/src/dfa/sparse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ use crate::{
start::{Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
Span,
};

const LABEL: &str = "rust-regex-automata-dfa-sparse";
Expand Down Expand Up @@ -1206,36 +1207,69 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
self.flags.is_always_start_anchored
}

#[inline]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
if !self.quitset.is_empty() && input.start() > 0 {
let offset = input.start() - 1;
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
self.start_state_forward_with(
input.get_anchored(),
input.start().checked_sub(1).map(|i| input.haystack()[i]),
input.get_span(),
)
}

/// Compute the forward start state from an explicit anchored mode, the
/// byte preceding the search (if any) and the search span.
///
/// Returns a quit error at offset `span.start - 1` when `look_behind` is a
/// byte in this DFA's quit set. Otherwise the start configuration is
/// derived from `look_behind` and looked up in the start table for `mode`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward_with(
    &self,
    mode: Anchored,
    look_behind: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    // The quit offset below is computed as `span.start - 1`, which is only
    // meaningful when a look-behind byte exists exactly when the span does
    // not begin at offset zero.
    debug_assert_eq!(
        span.start != 0,
        look_behind.is_some(),
        "look_behind should be provided if and only if the DFA starts at an offset"
    );
    // Skip the byte lookup entirely when no quit bytes are configured.
    if !self.quitset.is_empty() {
        if let Some(byte) = look_behind {
            if self.quitset.contains(byte) {
                return Err(MatchError::quit(byte, span.start - 1));
            }
        }
    }
    let start = self.st.start_map.fwd_with(look_behind);
    self.st.start(mode, start)
}

#[inline]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse(
&self,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
let offset = input.end();
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
self.start_state_reverse_with(
input.get_anchored(),
input.haystack().get(input.end()).copied(),
input.get_span(),
)
}

/// Compute the reverse start state from an explicit anchored mode, the
/// byte following the search (if any) and the search span.
///
/// Returns a quit error at offset `span.end` when `look_ahead` is a byte
/// in this DFA's quit set. Otherwise the start configuration is derived
/// from `look_ahead` and looked up in the start table for `mode`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse_with(
    &self,
    mode: Anchored,
    look_ahead: Option<u8>,
    span: Span,
) -> Result<StateID, MatchError> {
    // Skip the byte lookup entirely when no quit bytes are configured.
    if !self.quitset.is_empty() {
        if let Some(byte) = look_ahead {
            if self.quitset.contains(byte) {
                return Err(MatchError::quit(byte, span.end));
            }
        }
    }
    let start = self.st.start_map.rev_with(look_ahead);
    self.st.start(mode, start)
}

#[inline]
Expand Down Expand Up @@ -2145,11 +2179,10 @@ impl<T: AsRef<[u8]>> StartTable<T> {
/// panics.
fn start(
&self,
input: &Input<'_>,
mode: Anchored,
start: Start,
) -> Result<StateID, MatchError> {
let start_index = start.as_usize();
let mode = input.get_anchored();
let index = match mode {
Anchored::No => {
if !self.kind.has_unanchored() {
Expand Down
Loading