Skip to content

Commit

Permalink
exec: add Aho-Corasick optimization
Browse files Browse the repository at this point in the history
Finally, if a regex is just `foo|bar|baz|...|quux`, we will now use plain
old Aho-Corasick. The reason why we weren't doing this before is because
Aho-Corasick didn't support proper leftmost-first match semantics. But
since aho-corasick 0.7, it does, so we can now use it as a drop-in
replacement.

This basically fixes a pretty bad performance bug in a really common case,
but it is otherwise really hacked. First of all, this only happens when a
regex is literally `foo|bar|...|baz`. Even something like
`foo|b(a)r|...|baz` will prevent this optimization from happening, which
is a little silly. Second of all, this optimization only kicks in after
we've compiled the full pattern, which adds quite a bit of overhead. Fixing
this isn't trivial, since we may need the compiled program to resolve
capturing groups. The way to do this is probably to specialize compilation
for certain types of expressions. Maybe.

Anyway, we hack this in for now, and punt on further improvements until
we can really re-think how this should all work.
  • Loading branch information
BurntSushi committed Mar 29, 2019
1 parent 461673d commit d7c01cc
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 0 deletions.
96 changes: 96 additions & 0 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use thread_local::CachedThreadLocal;
use syntax::ParserBuilder;
use syntax::hir::Hir;
Expand Down Expand Up @@ -86,6 +87,16 @@ struct ExecReadOnly {
/// Prefix literals are stored on the `Program`, since they are used inside
/// the matching engines.
suffixes: LiteralSearcher,
/// An Aho-Corasick automaton with leftmost-first match semantics.
///
/// This is only set when the entire regex is a simple unanchored
/// alternation of literals. We could probably use it in more circumstances,
/// but this is already hacky enough in this architecture.
///
/// N.B. We use u32 as a state ID representation under the assumption that
/// if we were to exhaust the ID space, we probably would have long
/// surpassed the compilation size limit.
ac: Option<AhoCorasick<u32>>,
/// match_type encodes as much upfront knowledge about how we're going to
/// execute a search as possible.
match_type: MatchType,
Expand Down Expand Up @@ -287,6 +298,7 @@ impl ExecBuilder {
dfa: Program::new(),
dfa_reverse: Program::new(),
suffixes: LiteralSearcher::empty(),
ac: None,
match_type: MatchType::Nothing,
});
return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
Expand Down Expand Up @@ -319,12 +331,32 @@ impl ExecBuilder {
dfa.dfa_size_limit = self.options.dfa_size_limit;
dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;

let mut ac = None;
if parsed.exprs.len() == 1 {
if let Some(lits) = alternation_literals(&parsed.exprs[0]) {
// If we have a small number of literals, then let Teddy
// handle things (see literal/mod.rs).
if lits.len() > 32 {
let fsm = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostFirst)
.auto_configure(&lits)
// We always want this to reduce size, regardless of
// what auto-configure does.
.byte_classes(true)
.build_with_size::<u32, _, _>(&lits)
.expect("AC automaton too big");
ac = Some(fsm);
}
}
}

let mut ro = ExecReadOnly {
res: self.options.pats,
nfa: nfa,
dfa: dfa,
dfa_reverse: dfa_reverse,
suffixes: LiteralSearcher::suffixes(suffixes),
ac: ac,
match_type: MatchType::Nothing,
};
ro.match_type = ro.choose_match_type(self.match_type);
Expand Down Expand Up @@ -633,6 +665,11 @@ impl<'c> ExecNoSync<'c> {
lits.find_end(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AhoCorasick => {
self.ro.ac.as_ref().unwrap()
.find(&text[start..])
.map(|m| (start + m.start(), start + m.end()))
}
}
}

Expand Down Expand Up @@ -1163,6 +1200,9 @@ impl ExecReadOnly {
// aren't anchored. We would then only search for all of them when at
// the beginning of the input and use the subset in all other cases.
if self.res.len() == 1 {
if self.ac.is_some() {
return Literal(MatchLiteralType::AhoCorasick);
}
if self.nfa.prefixes.complete() {
return if self.nfa.is_anchored_start {
Literal(MatchLiteralType::AnchoredStart)
Expand Down Expand Up @@ -1254,6 +1294,9 @@ enum MatchLiteralType {
AnchoredStart,
/// Match literals only at the end of text.
AnchoredEnd,
/// Use an Aho-Corasick automaton. This requires `ac` to be Some on
/// ExecReadOnly.
AhoCorasick,
}

#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -1295,6 +1338,59 @@ impl ProgramCacheInner {
}
}

/// Extracts the byte-string literals from an HIR that is nothing more than
/// an alternation of literals.
///
/// Returns `Some` with one byte vector per alternation branch, in branch
/// order, when `expr` is an alternation whose branches are each a single
/// literal or a concatenation of literals. Returns `None` when `expr` is not
/// such an alternation, including when it is just a lone literal.
///
/// Unicode literals are appended as their UTF-8 encoding; byte literals are
/// appended verbatim.
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
    use syntax::hir::{HirKind, Literal};

    // Hacky: once `is_alternation_literal` has returned true, we rely on the
    // HIR having a very specific shape, which is the justification for the
    // `unreachable!` arms further down.
    //
    // This ought to be reworked when the crate's optimization pipeline is
    // overhauled, since this approach is terribly inflexible.

    if !expr.is_alternation_literal() {
        return None;
    }
    let branches = match *expr.kind() {
        HirKind::Alternation(ref branches) => branches,
        // A single literal is not worth the trouble.
        _ => return None,
    };

    // Appends the byte representation of one HIR literal to `out`.
    let append = |lit: &Literal, out: &mut Vec<u8>| match *lit {
        Literal::Unicode(ch) => {
            let mut utf8 = [0; 4];
            out.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
        }
        Literal::Byte(byte) => out.push(byte),
    };

    let literals = branches
        .iter()
        .map(|branch| {
            let mut bytes = vec![];
            match *branch.kind() {
                HirKind::Literal(ref single) => append(single, &mut bytes),
                HirKind::Concat(ref pieces) => {
                    for piece in pieces {
                        match *piece.kind() {
                            HirKind::Literal(ref part) => {
                                append(part, &mut bytes)
                            }
                            _ => unreachable!(
                                "expected literal, got {:?}",
                                piece
                            ),
                        }
                    }
                }
                _ => unreachable!(
                    "expected literal or concat, got {:?}",
                    branch
                ),
            }
            bytes
        })
        .collect();
    Some(literals)
}

#[cfg(test)]
mod test {
#[test]
Expand Down
10 changes: 10 additions & 0 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,13 @@ ismatch!(
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
true);

// Tests that our Aho-Corasick optimization works correctly. It only
// kicks in when we have >32 literals.
//
// The pattern below is a plain alternation of 54 literals (`samwise`, `sam`,
// and every ASCII letter in both cases), which is enough to cross that
// threshold. `samwise` is listed before `sam` and before the single letters,
// so on the haystack "samwise" the expected span is the full (0, 7) rather
// than a shorter match such as `sam` or `s` starting at the same position.
// NOTE(review): `mat!` is defined elsewhere; presumably it asserts the span
// of the first match -- confirm against the macro definition.
mat!(
ahocorasick1,
"samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\
A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z",
"samwise",
Some((0, 7))
);

0 comments on commit d7c01cc

Please sign in to comment.