diff --git a/HACKING.md b/HACKING.md
index 6114475fcc..bba1f55b2c 100644
--- a/HACKING.md
+++ b/HACKING.md
@@ -112,7 +112,7 @@ the NFA algorithm, because it was one fewer epsilon transition that it had to
 follow.
 
 There exist more instructions and they are defined and documented in
-src/inst.rs.
+src/prog.rs.
 
 Compilation has several knobs and a few unfortunately complicated invariants.
 Namely, the output of compilation can be one of two types of programs: a
@@ -163,7 +163,7 @@ engine (or engines) to use.
 
 The logic for choosing which engine to execute is in src/exec.rs and is
 documented on the Exec type. Exec values collection regular expression
-Programs (defined in src/program.rs), which contain all the necessary tidbits
+Programs (defined in src/prog.rs), which contain all the necessary tidbits
 for actually executing a regular expression on search text.
 
 For the most part, the execution logic is straight-forward and follows the
diff --git a/README.md b/README.md
index 30056bb63c..b1be030702 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,34 @@ fn some_helper_function(text: &str) -> bool {
 Specifically, in this example, the regex will be compiled when it is used for
 the first time. On subsequent uses, it will reuse the previous compilation.
 
+### Usage: match multiple regular expressions simultaneously
+
+This demonstrates how to use a `RegexSet` to match multiple (possibly
+overlapping) regular expressions in a single scan of the search text:
+
+```rust
+use regex::RegexSet;
+
+let set = RegexSet::new(&[
+    r"\w+",
+    r"\d+",
+    r"\pL+",
+    r"foo",
+    r"bar",
+    r"barfoo",
+    r"foobar",
+]).unwrap();
+
+// Iterate over and collect all of the matches.
+let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+
+// You can also test whether a particular regex matched:
+let matches = set.matches("foobar");
+assert!(!matches.matched(5));
+assert!(matches.matched(6));
+```
+
 ### Usage: `regex!` compiler plugin
 
 The `regex!` compiler plugin will compile your regexes at compile time. **This
diff --git a/benches/bench_dynamic_compile.rs b/benches/bench_dynamic_compile.rs
index 17ab319b30..436e3a7ae9 100644
--- a/benches/bench_dynamic_compile.rs
+++ b/benches/bench_dynamic_compile.rs
@@ -8,54 +8,55 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use regex_syntax::Expr;
 use test::Bencher;
 
-use regex::internal::ProgramBuilder;
+use regex::internal::Compiler;
 
 #[bench]
 fn compile_simple(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"^bc(d|e)*$";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"^bc(d|e)*$").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_simple_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"^bc(d|e)*$";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"^bc(d|e)*$").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_small(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}|\p{N}|\s|.|\d";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
    });
 }
 
 #[bench]
 fn compile_small_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}|\p{N}|\s|.|\d";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_huge(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}{100}";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"\p{L}{100}").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_huge_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}{100}";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"\p{L}{100}").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
diff --git a/examples/set.rs b/examples/set.rs
new file mode 100644
index 0000000000..caf0fead51
--- /dev/null
+++ b/examples/set.rs
@@ -0,0 +1,19 @@
+extern crate regex;
+
+use regex::RegexSet;
+
+fn main() {
+    let res = &[
+        "abc",
+        "xyzz",
+        "^[ga-fh-z]+$",
+    ];
+    let text = "abcggggggggxyz";
+    let set = RegexSet::new(res).unwrap();
+    println!("{:?}", set);
+    let m = set.is_match("abcggggggggxyz");
+    println!("match? {:?}", m);
+    for mi in set.matches(text) {
+        println!("{:?}", mi);
+    }
+}
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index 3a77bd3f88..274c318049 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -177,6 +177,19 @@ pub enum Repeater {
     },
 }
 
+impl Repeater {
+    /// Returns true if and only if this repetition can match the empty string.
+    fn matches_empty(&self) -> bool {
+        use self::Repeater::*;
+        match *self {
+            ZeroOrOne => true,
+            ZeroOrMore => true,
+            OneOrMore => false,
+            Range { min, .. } => min == 0,
+        }
+    }
+}
+
 /// A character class.
 ///
 /// A character class has a canonical format that the parser guarantees. Its
@@ -315,7 +328,9 @@ impl Expr {
     /// the beginning of text.
     pub fn is_anchored_start(&self) -> bool {
         match *self {
-            Repeat { ref e, .. } => e.is_anchored_start(),
+            Repeat { ref e, r, .. } => {
+                !r.matches_empty() && e.is_anchored_start()
+            }
             Group { ref e, .. } => e.is_anchored_start(),
             Concat(ref es) => es[0].is_anchored_start(),
             Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
@@ -328,7 +343,9 @@ impl Expr {
     /// end of the text.
     pub fn is_anchored_end(&self) -> bool {
         match *self {
-            Repeat { ref e, .. } => e.is_anchored_end(),
+            Repeat { ref e, r, .. } => {
+                !r.matches_empty() && e.is_anchored_end()
+            }
             Group { ref e, ..
} => e.is_anchored_end(), Concat(ref es) => es[es.len() - 1].is_anchored_end(), Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()), @@ -1059,9 +1076,6 @@ mod tests { assert!(e("^a|^b").is_anchored_start()); assert!(e("(^a)|(^b)").is_anchored_start()); assert!(e("(^(a|b))").is_anchored_start()); - assert!(e("^*").is_anchored_start()); - assert!(e("(^)*").is_anchored_start()); - assert!(e("((^)*)*").is_anchored_start()); assert!(!e("^a|b").is_anchored_start()); assert!(!e("a|^b").is_anchored_start()); @@ -1074,9 +1088,6 @@ mod tests { assert!(e("a$|b$").is_anchored_end()); assert!(e("(a$)|(b$)").is_anchored_end()); assert!(e("((a|b)$)").is_anchored_end()); - assert!(e("$*").is_anchored_end()); - assert!(e("($)*").is_anchored_end()); - assert!(e("(($)*)*").is_anchored_end()); assert!(!e("a$|b").is_anchored_end()); assert!(!e("a|b$").is_anchored_end()); diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml index 8f90a034cc..d36839f3fe 100644 --- a/regex_macros/Cargo.toml +++ b/regex_macros/Cargo.toml @@ -22,6 +22,10 @@ path = ".." version = "0.1" features = ["pattern"] +[dependencies.regex-syntax] +path = "../regex-syntax" +version = "0.2" + [dev-dependencies] lazy_static = "0.1" rand = "0.3" diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index d96aae9ea2..37e96566d6 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -18,8 +18,12 @@ #![feature(plugin_registrar, quote, rustc_private)] extern crate regex; -extern crate syntax; +extern crate regex_syntax; extern crate rustc_plugin; +extern crate syntax; + +use std::collections::BTreeMap; +use std::usize; use syntax::ast; use syntax::codemap; @@ -32,7 +36,8 @@ use syntax::ptr::P; use rustc_plugin::Registry; -use regex::internal::{Inst, EmptyLook, Program, ProgramBuilder}; +use regex::internal::{Compiler, EmptyLook, Inst, Program}; +use regex_syntax::Expr; /// For the `regex!` syntax extension. Do not use. #[plugin_registrar] @@ -67,15 +72,21 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) }; // We use the largest possible size limit because this is happening at // compile time. We trust the programmer. - let bprog = ProgramBuilder::new(®ex).size_limit(::std::usize::MAX); - let prog = match bprog.compile() { + let expr = match Expr::parse(®ex) { + Ok(expr) => expr, + Err(err) => { + cx.span_err(sp, &err.to_string()); + return DummyResult::any(sp) + } + }; + let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) { Ok(re) => re, Err(err) => { cx.span_err(sp, &err.to_string()); return DummyResult::any(sp) } }; - let names = prog.cap_names.iter().cloned().collect(); + let names = prog.captures.iter().cloned().collect(); let mut gen = NfaGen { cx: &*cx, sp: sp, @@ -98,8 +109,8 @@ impl<'a> NfaGen<'a> { fn code(&mut self) -> P { // Most or all of the following things are used in the quasiquoted // expression returned. 
- let num_cap_locs = 2 * self.prog.num_captures(); - let num_insts = self.prog.insts.len(); + let num_cap_locs = 2 * self.prog.captures.len(); + let num_insts = self.prog.len(); let cap_names = self.vec_expr(self.names.iter(), &mut |cx, name| match *name { Some(ref name) => { @@ -109,21 +120,20 @@ impl<'a> NfaGen<'a> { None => cx.expr_none(self.sp), } ); - let named_groups = { - let mut named_groups = ::std::collections::BTreeMap::new(); + let capture_name_idx = { + let mut capture_name_idx = BTreeMap::new(); for (i, name) in self.names.iter().enumerate() { if let Some(ref name) = *name { - named_groups.insert(name.to_owned(), i); + capture_name_idx.insert(name.to_owned(), i); } } - self.vec_expr(named_groups.iter(), + self.vec_expr(capture_name_idx.iter(), &mut |cx, (name, group_idx)| quote_expr!(cx, ($name, $group_idx)) ) }; - let prefix_anchor = self.prog.anchored_begin; - + let is_anchored_start = self.prog.is_anchored_start; let step_insts = self.step_insts(); let add_insts = self.add_insts(); let regex = &*self.original; @@ -135,9 +145,9 @@ impl<'a> NfaGen<'a> { // the user is only warned about *their* unused variable/code, and not the // unused code generated by regex!. See #14185 for an example. #[allow(dead_code)] -static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names; +static CAPTURES: &'static [Option<&'static str>] = &$cap_names; #[allow(dead_code)] -static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups; +static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx; #[allow(dead_code)] fn exec<'t>( @@ -175,14 +185,14 @@ fn exec<'t>( clist.empty(); nlist.empty(); 'LOOP: loop { if clist.size == 0 { - if matched || (!at.is_beginning() && $prefix_anchor) { + if matched || (!at.is_start() && $is_anchored_start) { break; } // TODO: Prefix matching... Hmm. // Prefix matching now uses a DFA, so I think this is // going to require encoding that DFA statically. } - if clist.size == 0 || (!$prefix_anchor && !matched) { + if clist.size == 0 || (!$is_anchored_start && !matched) { self.add(clist, &mut caps, 0, at); } let at_next = self.input.at(at.next_pos()); @@ -322,8 +332,8 @@ fn exec<'t>( ::regex::Regex::Native(::regex::internal::ExNative { original: $regex, - names: &CAP_NAMES, - groups: &NAMED_GROUPS, + names: &CAPTURES, + groups: &CAPTURE_NAME_IDX, prog: exec, }) }) @@ -332,7 +342,7 @@ fn exec<'t>( // Generates code for the `add` method, which is responsible for adding // zero-width states to the next queue of states to visit. fn add_insts(&self) -> P { - let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let arms = self.prog.iter().enumerate().map(|(pc, inst)| { let body = match *inst { Inst::EmptyLook(ref inst) => { let nextpc = inst.goto; @@ -422,7 +432,7 @@ fn exec<'t>( // Generates the code for the `step` method, which processes all states // in the current queue that consume a single character. fn step_insts(&self) -> P { - let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let arms = self.prog.iter().enumerate().map(|(pc, inst)| { let body = match *inst { Inst::Match => quote_expr!(self.cx, { for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) { diff --git a/src/backtrack.rs b/src/backtrack.rs index 6238f296d1..b80ff9cf60 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -26,10 +26,9 @@ // the bitset has to be zeroed on each execution, which becomes quite expensive // on large bitsets. 
+use exec::Search; use input::{Input, InputAt}; -use inst::InstPtr; -use program::Program; -use re::CaptureIdxs; +use prog::{Program, InstPtr}; /// Returns true iff the given regex and input should be executed by this /// engine with reasonable memory usage. @@ -51,10 +50,10 @@ const MAX_INPUT_SIZE: usize = 128 * (1 << 10); /// A backtracking matching engine. #[derive(Debug)] -pub struct Backtrack<'a, 'r, 'c, I> { +pub struct Backtrack<'a, 'b, 'c: 'b, 'm: 'b, 'r, I> { prog: &'r Program, input: I, - caps: &'c mut CaptureIdxs, + search: &'b mut Search<'m, 'c>, m: &'a mut BacktrackCache, } @@ -85,14 +84,14 @@ enum Job { SaveRestore { slot: usize, old_pos: Option }, } -impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { +impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { /// Execute the backtracking matching engine. /// /// If there's a match, `exec` returns `true` and populates the given /// captures accordingly. pub fn exec( prog: &'r Program, - mut caps: &mut CaptureIdxs, + search: &'b mut Search<'c, 'm>, input: I, start: usize, ) -> bool { @@ -101,7 +100,7 @@ impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { let mut b = Backtrack { prog: prog, input: input, - caps: caps, + search: search, m: &mut m, }; b.exec_(start) @@ -124,7 +123,7 @@ impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { // (Probably because backtracking is limited to such small // inputs/regexes in the first place.) let visited_len = - (self.prog.insts.len() * (self.input.len() + 1) + BIT_SIZE - 1) + (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) / BIT_SIZE; self.m.visited.truncate(visited_len); @@ -146,8 +145,8 @@ impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { self.clear(); // If this is an anchored regex at the beginning of the input, then // we're either already done or we only need to try backtracking once. - if self.prog.anchored_begin { - return if !at.is_beginning() { + if self.prog.is_anchored_start { + return if !at.is_start() { false } else { self.backtrack(at) @@ -184,11 +183,16 @@ impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { match job { Job::Inst { ip, at } => { if self.step(ip, at) { - return true; + // Only quit if we're matching one regex. + // If we're matching a regex set, then mush on and + // try to find other matches. + if self.search.matches.len() <= 1 { + return true; + } } } Job::SaveRestore { slot, old_pos } => { - self.caps[slot] = old_pos; + self.search.captures[slot] = old_pos; } } } @@ -196,26 +200,29 @@ impl<'a, 'r, 'c, I: Input> Backtrack<'a, 'r, 'c, I> { } fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { - use inst::Inst::*; + use prog::Inst::*; loop { // This loop is an optimization to avoid constantly pushing/popping // from the stack. Namely, if we're pushing a job only to run it // next, avoid the push and just mutate `ip` (and possibly `at`) // in place. - match self.prog.insts[ip] { - Match => return true, + match self.prog[ip] { + Match(slot) => { + self.search.set_match(slot); + return true; + } Save(ref inst) => { - if inst.slot < self.caps.len() { + if inst.slot < self.search.captures.len() { // If this path doesn't work out, then we save the old // capture index (if one exists) in an alternate // job. If the next path fails, then the alternate // job is popped and the old capture index is restored. 
- let old_pos = self.caps[inst.slot]; + let old_pos = self.search.captures[inst.slot]; self.m.jobs.push(Job::SaveRestore { slot: inst.slot, old_pos: old_pos, }); - self.caps[inst.slot] = Some(at.pos()); + self.search.captures[inst.slot] = Some(at.pos()); } ip = inst.goto; } diff --git a/src/compile.rs b/src/compile.rs index d49f1a9deb..a3471d80b1 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -8,23 +8,20 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use std::collections::HashSet; +use std::collections::HashMap; use std::iter; use std::result; +use std::sync::Arc; use syntax::{Expr, Repeater, CharClass, ClassRange}; use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences}; -use Error; -use inst::{ - Insts, Inst, InstPtr, EmptyLook, +use prog::{ + Program, Inst, InstPtr, EmptyLook, InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes, }; -pub struct Compiled { - pub insts: Insts, - pub cap_names: Vec>, -} +use Error; type InstHoleIdx = InstPtr; @@ -38,12 +35,10 @@ struct Patch { pub struct Compiler { insts: Vec, - cap_names: Vec>, - seen_caps: HashSet, + compiled: Program, + capture_name_idx: HashMap, + num_exprs: usize, size_limit: usize, - bytes: bool, - dfa: bool, - reverse: bool, suffix_cache: SuffixCache, utf8_seqs: Option, byte_classes: ByteClassSet, @@ -56,12 +51,10 @@ impl Compiler { pub fn new() -> Self { Compiler { insts: vec![], - cap_names: vec![None], - seen_caps: HashSet::new(), + compiled: Program::new(), + capture_name_idx: HashMap::new(), + num_exprs: 0, size_limit: 10 * (1 << 20), - bytes: false, - dfa: false, - reverse: false, suffix_cache: SuffixCache::new(1000), utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), byte_classes: ByteClassSet::new(), @@ -88,7 +81,7 @@ impl Compiler { /// /// Note that `dfa(true)` implies `bytes(true)`. pub fn bytes(mut self, yes: bool) -> Self { - self.bytes = yes; + self.compiled.is_bytes = yes; self } @@ -100,15 +93,15 @@ impl Compiler { /// based engines handle the preceding `.*?` explicitly, which is difficult /// or impossible in the DFA engine.) pub fn dfa(mut self, yes: bool) -> Self { - self.dfa = yes; - self.bytes = yes; + self.compiled.is_dfa = yes; + self.compiled.is_bytes = yes; self } /// When set, the machine returned is suitable for matching text in /// reverse. In particular, all concatenations are flipped. pub fn reverse(mut self, yes: bool) -> Self { - self.reverse = yes; + self.compiled.is_reverse = yes; self } @@ -117,8 +110,27 @@ impl Compiler { /// The compiler is guaranteed to succeed unless the program exceeds the /// specified size limit. If the size limit is exceeded, then compilation /// stops and returns an error. - pub fn compile(mut self, expr: &Expr) -> result::Result { - if self.dfa && !self.reverse && !expr.is_anchored_start() { + pub fn compile( + mut self, + exprs: &[Expr], + ) -> result::Result { + debug_assert!(exprs.len() >= 1); + self.num_exprs = exprs.len(); + if exprs.len() == 1 { + self.compile_one(&exprs[0]) + } else { + self.compile_many(exprs) + } + } + + fn compile_one(mut self, expr: &Expr) -> result::Result { + // If we're compiling a forward DFA and we aren't anchored, then + // add a `.*?` before the first capture group. + // Other matching engines handle this by baking the logic into the + // matching engine itself. 
+ self.compiled.is_anchored_start = expr.is_anchored_start(); + self.compiled.is_anchored_end = expr.is_anchored_end(); + if self.compiled.needs_dotstar() { let patch = try!(self.c(&Expr::Repeat { e: Box::new(Expr::AnyChar), r: Repeater::ZeroOrMore, @@ -126,20 +138,64 @@ impl Compiler { })); self.fill_to_next(patch.hole); } + self.compiled.captures = vec![None]; + self.compiled.start = self.insts.len(); let patch = try!(self.c_capture(0, expr)); self.fill_to_next(patch.hole); - self.push_compiled(Inst::Match); + self.compiled.matches = vec![self.insts.len()]; + self.push_compiled(Inst::Match(0)); + self.compile_finish() + } + + fn compile_many( + mut self, + exprs: &[Expr], + ) -> result::Result { + debug_assert!(exprs.len() > 1); + + self.compiled.is_anchored_start = + exprs.iter().all(|e| e.is_anchored_start()); + self.compiled.is_anchored_end = + exprs.iter().all(|e| e.is_anchored_end()); + if self.compiled.needs_dotstar() { + let patch = try!(self.c(&Expr::Repeat { + e: Box::new(Expr::AnyChar), + r: Repeater::ZeroOrMore, + greedy: false, + })); + self.fill_to_next(patch.hole); + } - let byte_classes = self.byte_classes.byte_classes(); - let insts = self.insts.into_iter().map(|inst| inst.unwrap()).collect(); - Ok(Compiled { - insts: Insts::new(insts, self.bytes, self.reverse, byte_classes), - cap_names: self.cap_names, - }) + self.compiled.start = self.insts.len(); + for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { + let split = self.push_split_hole(); + let Patch { hole, entry } = try!(self.c_capture(0, expr)); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + + let next = self.insts.len(); + self.fill_split(split, Some(entry), Some(next)); + } + let i = exprs.len() - 1; + let Patch { hole, .. 
} = try!(self.c_capture(0, &exprs[i])); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + + self.compile_finish() + } + + fn compile_finish(mut self) -> result::Result { + self.compiled.insts = + self.insts.into_iter().map(|inst| inst.unwrap()).collect(); + self.compiled.byte_classes = self.byte_classes.byte_classes(); + self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); + Ok(self.compiled) } fn c(&mut self, expr: &Expr) -> Result { - use inst; + use prog; use syntax::Expr::*; try!(self.check_size()); @@ -159,50 +215,52 @@ impl Compiler { Class(ref cls) => { self.c_class(cls) } - StartLine if self.reverse => { + StartLine if self.compiled.is_reverse => { self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(inst::EmptyLook::EndLine) + self.c_empty_look(prog::EmptyLook::EndLine) } StartLine => { self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(inst::EmptyLook::StartLine) + self.c_empty_look(prog::EmptyLook::StartLine) } - EndLine if self.reverse => { + EndLine if self.compiled.is_reverse => { self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(inst::EmptyLook::StartLine) + self.c_empty_look(prog::EmptyLook::StartLine) } EndLine => { self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(inst::EmptyLook::EndLine) + self.c_empty_look(prog::EmptyLook::EndLine) } - StartText if self.reverse => { - self.c_empty_look(inst::EmptyLook::EndText) + StartText if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) } StartText => { - self.c_empty_look(inst::EmptyLook::StartText) + self.c_empty_look(prog::EmptyLook::StartText) } - EndText if self.reverse => { - self.c_empty_look(inst::EmptyLook::StartText) + EndText if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) } EndText => { - self.c_empty_look(inst::EmptyLook::EndText) + self.c_empty_look(prog::EmptyLook::EndText) } - WordBoundary => self.c_empty_look(inst::EmptyLook::WordBoundary), + WordBoundary => self.c_empty_look(prog::EmptyLook::WordBoundary), NotWordBoundary => { - self.c_empty_look(inst::EmptyLook::NotWordBoundary) + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } Group { ref e, i: None, name: None } => self.c(e), Group { ref e, i, ref name } => { // it's impossible to have a named capture without an index let i = i.expect("capture index"); - if !self.seen_caps.contains(&i) { - self.cap_names.push(name.clone()); - self.seen_caps.insert(i); + if i >= self.compiled.captures.len() { + self.compiled.captures.push(name.clone()); + if let Some(ref name) = *name { + self.capture_name_idx.insert(name.to_owned(), i); + } } self.c_capture(2 * i, e) } Concat(ref es) => { - if self.reverse { + if self.compiled.is_reverse { self.c_concat(es.iter().rev()) } else { self.c_concat(es) @@ -214,22 +272,30 @@ impl Compiler { } fn c_capture(&mut self, first_slot: usize, expr: &Expr) -> Result { - let entry = self.insts.len(); - let hole = self.push_hole(InstHole::Save { slot: first_slot }); - let patch = try!(self.c(expr)); - self.fill(hole, patch.entry); - self.fill_to_next(patch.hole); - let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); - Ok(Patch { hole: hole, entry: entry }) + if self.num_exprs > 1 || self.compiled.is_dfa { + // Don't ever compile Save instructions for regex sets because + // they are never used. They are also never used in DFA programs + // because DFAs can't handle captures. 
+ self.c(expr) + } else { + let entry = self.insts.len(); + let hole = self.push_hole(InstHole::Save { slot: first_slot }); + let patch = try!(self.c(expr)); + self.fill(hole, patch.entry); + self.fill_to_next(patch.hole); + let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); + Ok(Patch { hole: hole, entry: entry }) + } } fn c_literal(&mut self, chars: &[char], casei: bool) -> Result { assert!(!chars.is_empty()); - let mut chars: Box> = if self.reverse { - Box::new(chars.iter().rev()) - } else { - Box::new(chars.iter()) - }; + let mut chars: Box> = + if self.compiled.is_reverse { + Box::new(chars.iter().rev()) + } else { + Box::new(chars.iter()) + }; let first = *chars.next().expect("non-empty literal"); let Patch { mut hole, entry } = try!(self.c_char(first, casei)); for &c in chars { @@ -251,7 +317,7 @@ impl Compiler { } fn c_class(&mut self, ranges: &[ClassRange]) -> Result { - if self.bytes { + if self.compiled.is_bytes { CompileClass { c: self, ranges: ranges, @@ -694,7 +760,7 @@ impl<'a, 'b> CompileClass<'a, 'b> { } fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { - if self.c.reverse { + if self.c.compiled.is_reverse { self.c_utf8_seq_(seq) } else { self.c_utf8_seq_(seq.into_iter().rev()) @@ -704,9 +770,7 @@ impl<'a, 'b> CompileClass<'a, 'b> { fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator { // The initial instruction for each UTF-8 sequence should be the same. - // Since the 0th instruction has always been created by this point, - // it's safe to use it as a sentinel here. - let mut from_inst = 0; + let mut from_inst = ::std::usize::MAX; let mut last_hole = Hole::None; for byte_range in seq { let key = SuffixCacheKey { @@ -722,7 +786,7 @@ impl<'a, 'b> CompileClass<'a, 'b> { } } self.c.byte_classes.set_range(byte_range.start, byte_range.end); - if from_inst == 0 { + if from_inst == ::std::usize::MAX { last_hole = self.c.push_hole(InstHole::Bytes { start: byte_range.start, end: byte_range.end, @@ -735,8 +799,9 @@ impl<'a, 'b> CompileClass<'a, 'b> { })); } from_inst = self.c.insts.len().checked_sub(1).unwrap(); + assert!(from_inst < ::std::usize::MAX); } - assert!(from_inst > 0); + assert!(from_inst < ::std::usize::MAX); Ok(Patch { hole: last_hole, entry: from_inst }) } } diff --git a/src/dfa.rs b/src/dfa.rs index d8e5c5f6c4..239042a320 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -45,8 +45,8 @@ use std::collections::HashMap; use std::fmt; use std::mem; -use inst::{Insts, Inst}; -use program::Program; +use exec::Search; +use prog::{Inst, Program}; use sparse::SparseSet; /// The cache limit specifies approximately how much space we're willing to @@ -70,8 +70,9 @@ const CACHE_LIMIT: usize = 2 * (1<<20); /// Generally, a DFA is possible only when there are no word boundary /// assertions. This is due to the difficulty (but likely not impossibility) /// of tracking multi-byte assertions in the DFA. -pub fn can_exec(insts: &Insts) -> bool { - use inst::EmptyLook::*; +pub fn can_exec(insts: &Program) -> bool { + use prog::Inst::*; + use prog::EmptyLook::*; // If for some reason we manage to allocate a regex program with more // than 2^32-1 instructions, then we can't execute the DFA because we // use 32 bit pointers. 
@@ -80,37 +81,19 @@ pub fn can_exec(insts: &Insts) -> bool { } for inst in insts { match *inst { - Inst::Char(_) | Inst::Ranges(_) => return false, - Inst::EmptyLook(ref inst) => { + Char(_) | Ranges(_) => return false, + EmptyLook(ref inst) => { match inst.look { WordBoundary | NotWordBoundary => return false, StartLine | EndLine | StartText | EndText => {} } } - Inst::Match | Inst::Save(_) | Inst::Split(_) | Inst::Bytes(_) => {} + Match(_) | Save(_) | Split(_) | Bytes(_) => {} } } true } -/// The result of running the DFA. -/// -/// Conceptually, this is essentially equivalent to an `Option`, where -/// the value indicates where the end of a match was found, if any. We split -/// this out into a third state called EarlyMatch, which indicates both that -/// the caller specified that they didn't care about *where* a match was found, -/// and that the position at which the earliest match occurred may not be the -/// correct leftmost-first ending match position. -/// -/// NoMatch indicates that no match will ever be found and that processing can -/// quit immediately. -#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] -pub enum DfaResult { - Match(usize), - EarlyMatch, - NoMatch, -} - /// A reusable cache of DFA states. /// /// This cache is reused between multiple invocations of the same regex @@ -169,7 +152,7 @@ pub struct DfaCache { /// N.B. We only use a single lifetime here since all pointers are taken /// from the same cache. #[derive(Debug)] -pub struct Dfa<'a> { +pub struct Dfa<'a, 'b, 'c: 'b, 'm: 'b> { /// prog contains the NFA instruction opcodes. DFA execution uses either /// the `dfa` instructions or the `dfa_reverse` instructions from /// `exec::Executor`. (It never uses `Executor.prog`, which may have @@ -178,10 +161,10 @@ pub struct Dfa<'a> { /// The start state. We record it here because the pointer may change /// when the cache is wiped. start: StatePtr, - /// When set, we can stop searching immediately after we enter a match - /// state. (Normally we keep searching in order to provide leftmost-first - /// semantics.) - quit_on_first_match: bool, + /// The search configuration, which includes capture groups. It also + /// includes space for indicating which regex matched if executing a + /// regex set. + search: &'b mut Search<'c, 'm>, /// These are all from DfaCache. (Only {qcur,qnext} are missing.) compiled: &'a mut HashMap, states: &'a mut Vec, @@ -305,7 +288,7 @@ impl DfaCache { } } -impl<'a> Dfa<'a> { +impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// The main entry point to executing a DFA, which returns the *end* of /// a match if one exists, using Perl's "leftmost-first" semantics. /// @@ -318,42 +301,42 @@ impl<'a> Dfa<'a> { /// it may seem like we should omit `at` and just rely on the caller to /// slice `text` appropriately, it is necessary to tell whether `at` is /// at the beginning of `text` or not (i.e., for empty assertions). - /// - /// quit_on_first_match should be set if the caller doesn't care about - /// where the match ends. If a match is found, DfaResult::EarlyMatch is - /// returned. pub fn exec( prog: &'a Program, + search: &'b mut Search<'c, 'm>, text: &[u8], at: usize, - quit_on_first_match: bool, - ) -> DfaResult { + ) -> bool { // Retrieve our DFA cache from the program. If another thread tries to // execute this DFA *simultaneously*, then a new independent cache is // created. 
let mut _cache = prog.cache_dfa(); let mut cache = &mut **_cache; - cache.resize(prog.insts.len()); + cache.resize(prog.len()); let mut dfa = Dfa { prog: prog, + start: 0, // filled in below + search: search, compiled: &mut cache.compiled, states: &mut cache.states, start_states: &mut cache.start_states, - start: 0, // filled in below stack: &mut cache.stack, - quit_on_first_match: quit_on_first_match, }; dfa.start = match dfa.start_state(&mut cache.qcur, text, at) { - STATE_DEAD => return DfaResult::NoMatch, + STATE_DEAD => return false, si => si, }; debug_assert!(dfa.start != STATE_UNKNOWN); - if prog.insts.is_reversed() { + let matched = if prog.is_reverse { dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text, at) } else { dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text, at) + }; + if matched && dfa.search.matches.len() <= 1 { + dfa.search.set_match(0); } + matched } /// Executes the DFA on a forward NFA. @@ -365,7 +348,7 @@ impl<'a> Dfa<'a> { qnext: &mut SparseSet, text: &[u8], at: usize, - ) -> DfaResult { + ) -> bool { // For the most part, the DFA is basically: // // last_match = null @@ -394,22 +377,22 @@ impl<'a> Dfa<'a> { // 6. We can't actually do state.next[byte]. Instead, we have to do // state.next[byte_classes[byte]], which permits us to keep the // 'next' list very small. - debug_assert!(!self.prog.is_reversed()); + debug_assert!(!self.prog.is_reverse); - // last_match is the currently known ending match position. It is + // The last match is the currently known ending match position. It is // reported as an index to the most recent byte that resulted in a - // transition to a match state. Its maximum value is `text.len()`, + // transition to a match state and is always stored in capture slot `1` + // when searching forwards. Its maximum value is `text.len()`, // which can only happen after the special EOF sentinel value is fed // to the DFA. - let mut last_match = DfaResult::NoMatch; - let (mut si, mut i) = (self.start, at); + let (mut si, mut i, mut matched) = (self.start, at, false); while i < text.len() { // Our set of literal prefixes can itself be a DFA, but it is // offline and can generally be quite a bit faster. (For instance, // memchr is used if possible.) if !self.prog.prefixes.is_empty() && si == self.start { i = match self.prefix_at(text, i) { - None => return DfaResult::NoMatch, + None => return false, Some(i) => i, }; } @@ -418,26 +401,27 @@ impl<'a> Dfa<'a> { // but we inline it manually here to avoid the extra branch and // also because we know we have a real `u8` (not a `Byte`, which // may be the special EOF sentinel value). - let cls = self.prog.insts.byte_classes()[text[i] as usize]; + let cls = self.prog.byte_classes[text[i] as usize]; let mut next_si = self.states[si as usize].next[cls as usize]; if next_si <= STATE_DEAD { if next_si == STATE_DEAD { - return last_match; + return matched; } // The next state may not have been cached, so re-compute it // (i.e., follow epsilon transitions). 
next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i])); debug_assert!(next_si != STATE_UNKNOWN); if next_si == STATE_DEAD { - return last_match; + return matched; } } si = next_si; if self.states[si as usize].is_match { - if self.quit_on_first_match { - return DfaResult::EarlyMatch; + if self.search.quit_after_first_match() { + return true; } - last_match = DfaResult::Match(i); + matched = true; + self.search.set_end(Some(i)); } i += 1; } @@ -445,12 +429,23 @@ impl<'a> Dfa<'a> { si = self.next_state(qcur, qnext, si, Byte::eof()); debug_assert!(si != STATE_UNKNOWN); if si == STATE_DEAD { - return last_match; + return matched; } if self.states[si as usize].is_match { - last_match = DfaResult::Match(text.len()); + if self.search.quit_after_first_match() { + return true; + } + matched = true; + self.search.set_end(Some(text.len())); } - last_match + if matched && self.search.matches.len() != 1 { + for &ip in &self.states[si as usize].insts { + if let Inst::Match(slot) = self.prog[ip as usize] { + self.search.set_match(slot); + } + } + } + matched } /// Executes the DFA on a reverse NFA. @@ -460,50 +455,54 @@ impl<'a> Dfa<'a> { qnext: &mut SparseSet, text: &[u8], at: usize, - ) -> DfaResult { + ) -> bool { // The comments in `exec_at` above mostly apply here too. The main // difference is that we move backwards over the input and we look for // the longest possible match instead of the leftmost-first match. // // N.B. The code duplication here is regrettable. Efforts to improve // it without sacrificing performance are welcome. ---AG - debug_assert!(self.prog.is_reversed()); - let mut last_match = DfaResult::NoMatch; - let (mut si, mut i) = (self.start, at); + debug_assert!(self.prog.is_reverse); + let (mut si, mut i, mut matched) = (self.start, at, false); while i > 0 { i -= 1; - let cls = self.prog.insts.byte_classes()[text[i] as usize]; + let cls = self.prog.byte_classes[text[i] as usize]; let mut next_si = self.states[si as usize].next[cls as usize]; if next_si <= STATE_DEAD { if next_si == STATE_DEAD { - return last_match; + return matched; } // The next state may not have been cached, so re-compute it // (i.e., follow epsilon transitions). next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i])); debug_assert!(next_si != STATE_UNKNOWN); if next_si == STATE_DEAD { - return last_match; + return matched; } } si = next_si; if self.states[si as usize].is_match { - if self.quit_on_first_match { - return DfaResult::EarlyMatch; + if self.search.quit_after_first_match() { + return true; } - last_match = DfaResult::Match(i+1); + matched = true; + self.search.set_start(Some(i+1)); } } si = self.next_state(qcur, qnext, si, Byte::eof()); debug_assert!(si != STATE_UNKNOWN); if si == STATE_DEAD { - return last_match; + return matched; } if self.states[si as usize].is_match { - last_match = DfaResult::Match(0); + if self.search.quit_after_first_match() { + return true; + } + matched = true; + self.search.set_start(Some(0)); } - last_match + matched } /// Computes the next state given the current state and the current input @@ -521,7 +520,7 @@ impl<'a> Dfa<'a> { mut si: StatePtr, b: Byte, ) -> StatePtr { - use inst::Inst::*; + use prog::Inst::*; // Initialize a queue with the current DFA state's NFA states. qcur.clear(); @@ -573,15 +572,21 @@ impl<'a> Dfa<'a> { // the current byte. qnext.clear(); for &ip in &*qcur { - match self.prog.insts[ip as usize] { + match self.prog[ip as usize] { // These states never happen in a byte-based program. 
Char(_) | Ranges(_) => unreachable!(), // These states are handled when following epsilon transitions. Save(_) | Split(_) | EmptyLook(_) => {} - Match => { + Match(_) => { flags.set_match(true); - if !self.prog.is_reversed() { + if !self.continue_past_first_match() { break; + } else if self.search.matches.len() != 1 { + // If we are continuing on to find other matches, + // then keep a record of the match states we've seen. + if !qnext.contains_ip(ip as usize) { + qnext.add(ip); + } } } Bytes(ref inst) => { @@ -592,6 +597,16 @@ impl<'a> Dfa<'a> { } } } + let mut cache = true; + if b.is_eof() && self.search.matches.len() != 1 { + // If we're processing the last byte of the input and we're + // matching a regex set, then make the next state contain the + // previous states transitions. We do this so that the main + // matching loop can extract all of the match instructions. + mem::swap(qcur, qnext); + // And don't cache this state because it's totally bunk. + cache = false; + } // We've now built up the set of NFA states that ought to comprise the // next DFA state, so try to find it in the cache, and if it doesn't // exist, cache it. @@ -603,7 +618,9 @@ impl<'a> Dfa<'a> { debug_assert!(next != STATE_UNKNOWN); // And now store our state in the current state's next list. let cls = self.byte_class(b); - self.states[si as usize].next[cls] = next; + if cache { + self.states[si as usize].next[cls] = next; + } next } @@ -636,8 +653,8 @@ impl<'a> Dfa<'a> { q: &mut SparseSet, flags: Flags, ) { - use inst::Inst::*; - use inst::EmptyLook::*; + use prog::Inst::*; + use prog::EmptyLook::*; // We need to traverse the NFA to follow epsilon transitions, so avoid // recursion with an explicit stack. @@ -648,9 +665,9 @@ impl<'a> Dfa<'a> { continue; } q.add(ip as usize); - match self.prog.insts[ip as usize] { + match self.prog[ip as usize] { Char(_) | Ranges(_) => unreachable!(), - Match | Bytes(_) => {} + Match(_) | Bytes(_) => {} EmptyLook(ref inst) => { // Only follow empty assertion states if our flags satisfy // the assertion. @@ -755,8 +772,8 @@ impl<'a> Dfa<'a> { q: &SparseSet, is_match: bool, ) -> Option<(StateKey, Flags)> { - use inst::Inst::*; - use inst::EmptyLook::*; + use prog::Inst::*; + use prog::EmptyLook::*; // We need to build up enough information to recognize pre-built states // in the DFA. Generally speaking, this includes every instruction @@ -770,7 +787,7 @@ impl<'a> Dfa<'a> { let mut insts = vec![]; for &ip in q { let ip = usize_to_u32(ip); - match self.prog.insts[ip as usize] { + match self.prog[ip as usize] { Char(_) | Ranges(_) => unreachable!(), Save(_) => {} Split(_) => {} @@ -796,12 +813,9 @@ impl<'a> Dfa<'a> { WordBoundary | NotWordBoundary => unreachable!(), } } - Match => { + Match(_) => { insts.push(ip); - // If this is a reverse program, then we want to continue - // executing to find the longest possible match. Otherwise, - // we only support leftmost-first semantics, so bail out. - if !self.prog.insts.is_reversed() { + if !self.continue_past_first_match() { break; } } @@ -918,7 +932,7 @@ impl<'a> Dfa<'a> { text: &[u8], at: usize, ) -> StatePtr { - let start_flags = if self.prog.insts.is_reversed() { + let start_flags = if self.prog.is_reverse { self.start_flags_reverse(text, at) } else { self.start_flags(text, at) @@ -980,7 +994,7 @@ impl<'a> Dfa<'a> { /// invariant: num_byte_classes() == len(State.next) fn num_byte_classes(&self) -> usize { // We add 1 to account for the special EOF byte. 
- ((self.prog.insts.byte_classes()[255] + 1) + 1) as usize + (self.prog.byte_classes[255] as usize + 1) + 1 } /// Given an input byte or the special EOF sentinel, return its @@ -989,10 +1003,22 @@ impl<'a> Dfa<'a> { if b.is_eof() { self.num_byte_classes() - 1 } else { - self.prog.insts.byte_classes()[b.0 as usize] as usize + self.prog.byte_classes[b.0 as usize] as usize } } + /// Returns true if the DFA should continue searching past the first match. + /// + /// Leftmost first semantics in the DFA are preserved by not following NFA + /// transitions after the first match is seen. + /// + /// On occasion, we want to avoid leftmost first semantics to find either + /// the longest match (for reverse search) or all possible matches (for + /// regex sets). + fn continue_past_first_match(&self) -> bool { + self.prog.is_reverse || self.search.matches.len() != 1 + } + /// Approximate size returns the approximate heap space currently used by /// the DFA. It is used to determine whether the DFA's state cache needs to /// be wiped. Namely, it is possible that for certain regexes on certain diff --git a/src/exec.rs b/src/exec.rs index f048c0572f..bd80b2ac18 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -12,14 +12,74 @@ use std::collections::HashMap; use std::sync::Arc; use backtrack::{self, Backtrack}; -use dfa::{self, Dfa, DfaResult}; +use compile::Compiler; +use dfa::{self, Dfa}; use input::{ByteInput, CharInput}; +use literals::BuildPrefixes; use nfa::Nfa; -use program::{Program, ProgramBuilder}; -use re::CaptureIdxs; +use prog::{Program, InstPtr}; +use syntax; use {Regex, Error}; +pub type CaptureSlots<'a> = &'a mut [CaptureSlot]; + +pub type CaptureSlot = Option; + +/// The parameters to running one of the four match engines. +#[derive(Debug)] +pub struct Search<'caps, 'matches> { + /// The matching engine writes capture locations to this slice. + /// + /// Note that some matching engines, like the DFA, have limited support + /// for this. The DFA can only fill in one capture location (the end + /// location of the match). + pub captures: CaptureSlots<'caps>, + /// The matching engine indicates which match instructions were executed + /// when searching stopped. + /// + /// In standard searches, there is exactly one value in this slice and it + /// should be initialized to `false`. When executing sets of regexes, + /// there should be a location for each regex. + pub matches: &'matches mut [bool], +} + +impl<'caps, 'matches> Search<'caps, 'matches> { + pub fn quit_after_first_match(&self) -> bool { + self.captures.is_empty() && self.matches.len() == 1 + } + + pub fn all_matched(&self) -> bool { + self.matches.iter().all(|m| *m) + } + + pub fn copy_captures_from(&mut self, caps: &[Option]) { + for (slot, val) in self.captures.iter_mut().zip(caps.iter()) { + *slot = *val; + } + } + + pub fn set_match(&mut self, match_slot: usize) { + if let Some(old) = self.matches.get_mut(match_slot) { + *old = true; + } + } + + pub fn set_start(&mut self, pos: Option) { + self.set_capture(0, pos); + } + + pub fn set_end(&mut self, pos: Option) { + self.set_capture(1, pos); + } + + fn set_capture(&mut self, i: usize, pos: Option) { + if let Some(old_pos) = self.captures.get_mut(i) { + *old_pos = pos; + } + } +} + /// Exec manages the execution of a regular expression. /// /// In particular, this manages the various compiled forms of a single regular @@ -27,6 +87,8 @@ use {Regex, Error}; /// regular expression. 
#[derive(Clone, Debug)] pub struct Exec { + /// The original regular expressions given by the caller to compile. + res: Vec, /// A compiled program that is used in the NFA simulation and backtracking. /// It can be byte-based or Unicode codepoint based. /// @@ -60,22 +122,32 @@ pub struct Exec { /// Facilitates the construction of an executor by exposing various knobs /// to control how a regex is executed and what kinds of resources it's /// permitted to use. -pub struct ExecBuilder<'r> { - re: &'r str, +pub struct ExecBuilder { + res: Vec, match_engine: MatchEngine, size_limit: usize, bytes: bool, } -impl<'r> ExecBuilder<'r> { +impl ExecBuilder { /// Create a regex execution builder. /// /// This uses default settings for everything except the regex itself, /// which must be provided. Further knobs can be set by calling methods, /// and then finally, `build` to actually create the executor. - pub fn new(re: &'r str) -> Self { + pub fn new(re: &str) -> Self { + Self::new_many(&[re]) + } + + /// Like new, but compiles the union of the given regular expressions. + /// + /// Note that when compiling 2 or more regular expressions, capture groups + /// are completely unsupported. (This means both `find` and `captures` + /// wont work.) + pub fn new_many(res: I) -> Self + where S: AsRef, I: IntoIterator { ExecBuilder { - re: re, + res: res.into_iter().map(|s| s.as_ref().to_owned()).collect(), match_engine: MatchEngine::Automatic, size_limit: 10 * (1 << 20), bytes: false, @@ -145,28 +217,40 @@ impl<'r> ExecBuilder<'r> { /// Build an executor that can run a regular expression. pub fn build(self) -> Result { - let prog = try!( - ProgramBuilder::new(self.re) - .size_limit(self.size_limit) - .bytes(self.bytes) - .compile()); + if self.res.is_empty() { + return Err(Error::InvalidSet); + } + let mut exprs = vec![]; + for re in &self.res { + exprs.push(try!(syntax::Expr::parse(re))); + } + let mut prog = try!( + Compiler::new() + .size_limit(self.size_limit) + .bytes(self.bytes) + .compile(&exprs)); let mut dfa = try!( - ProgramBuilder::new(self.re) - .size_limit(self.size_limit) - .dfa(true) - .compile()); - // Because the literal finder on byte-based programs is sub-optimal. - // We can use the literals found from a Unicode-based program just - // fine for now. - dfa.prefixes = prog.prefixes.clone(); + Compiler::new() + .size_limit(self.size_limit) + .dfa(true) + .compile(&exprs)); let dfa_reverse = try!( - ProgramBuilder::new(self.re) - .size_limit(self.size_limit) - .dfa(true) - .reverse(true) - .compile()); - let can_dfa = dfa::can_exec(&dfa.insts); + Compiler::new() + .size_limit(self.size_limit) + .dfa(true) + .reverse(true) + .compile(&exprs)); + + // Compute literal prefixes for only `prog`, which is likely a Unicode + // based program. Literal prefix extract currently works better on + // Unicode programs. + prog.prefixes = BuildPrefixes::new(&prog).literals().into_matcher(); + // And give it to the DFA too, which can use Unicode prefixes even + // though the program itself is byte based. + dfa.prefixes = prog.prefixes.clone(); + let can_dfa = dfa::can_exec(&dfa); Ok(Exec { + res: self.res, prog: prog, dfa: dfa, dfa_reverse: dfa_reverse, @@ -196,9 +280,9 @@ impl Exec { /// choosing the engine to use. If self.match_engine is Nfa or Backtrack, /// then that engine is always used. Otherwise, one is selected /// automatically. 
- pub fn exec( + pub fn exec<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { @@ -206,132 +290,129 @@ impl Exec { // only possible to execute those engines in exec_auto. See comment on // MatchEngine below for more details. match self.match_engine { - MatchEngine::Automatic => self.exec_auto(caps, text, start), - MatchEngine::Backtrack => self.exec_backtrack(caps, text, start), - MatchEngine::Nfa => self.exec_nfa(caps, text, start), + MatchEngine::Automatic => self.exec_auto(search, text, start), + MatchEngine::Backtrack => self.exec_backtrack(search, text, start), + MatchEngine::Nfa => self.exec_nfa(search, text, start), } } /// Like exec, but always selects the engine automatically. - pub fn exec_auto( + fn exec_auto<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { - if caps.len() <= 2 && self.prog.is_prefix_match() { + if search.captures.len() <= 2 && self.prog.prefixes.at_match() { // We should be able to execute the literal engine even if there // are more captures by falling back to the NFA engine after a // match. However, that's effectively what the NFA engine does // already (since it will use the literal engine if it exists). - self.exec_literals(caps, text, start) + self.exec_literals(search, text, start) } else if self.can_dfa { - self.exec_dfa(caps, text, start) + self.exec_dfa(search, text, start) } else { - self.exec_auto_nfa(caps, text, start) + self.exec_auto_nfa(search, text, start) } } /// Like exec, but always tries to execute the lazy DFA. /// /// Note that self.can_dfa must be true. This will panic otherwise. - fn exec_dfa( + fn exec_dfa<'a, 'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &'a mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { debug_assert!(self.can_dfa); let btext = text.as_bytes(); - let search = Dfa::exec(&self.dfa, btext, start, caps.is_empty()); - let match_end = match search { - DfaResult::Match(match_end) => match_end, - DfaResult::EarlyMatch => return true, - DfaResult::NoMatch => return false, - }; - // If caller has not requested any captures, then we don't need to - // find the start position. - if caps.is_empty() { - return true; + if !Dfa::exec(&self.dfa, search, btext, start) { + return false; } + let match_end = match search.captures.get(1) { + Some(&Some(i)) => i, + // The DFA returned true for a match, but did not set any capture + // location because the caller didn't ask for them. Therefore, we + // can quit immediately. + _ => return true, + }; // invariant: caps.len() >= 2 && caps.len() % 2 == 0 // If the reported end of the match is the same as the start, then we // have an empty match and we can quit now. if start == match_end { // Be careful... If the caller wants sub-captures, than we are // obliged to run the NFA to get them. - if caps.len() == 2 { + if search.captures.len() == 2 { // The caller only needs the start/end, so we can avoid the // NFA here. - caps[0] = Some(start); - caps[1] = Some(start); + search.captures[0] = Some(start); + search.captures[1] = Some(start); return true; } - return self.exec_auto_nfa(caps, text, start); + return self.exec_auto_nfa(search, text, start); } // OK, now we find the start of the match by running the DFA backwards // on the text. We *start* the search at the end of the match. 
- let search = Dfa::exec( - &self.dfa_reverse, &btext[start..], match_end - start, false); - let match_start = match search { - DfaResult::Match(match_start) => start + match_start, - DfaResult::EarlyMatch => { - panic!("BUG: early matches can't happen on reverse search") - } - DfaResult::NoMatch => { - panic!("BUG: forward match implies backward match") - } + let matched = Dfa::exec( + &self.dfa_reverse, search, &btext[start..], match_end - start); + if !matched { + panic!("BUG: forward match implies backward match"); + } + let match_start = match search.captures.get(0) { + Some(&Some(i)) => start + i, + _ => panic!("BUG: early match can't happen on reverse search"), }; - if caps.len() == 2 { + if search.captures.len() == 2 { // If the caller doesn't care about capture locations, then we can // avoid running the NFA to fill them in. - caps[0] = Some(match_start); - caps[1] = Some(match_end); + search.captures[0] = Some(match_start); + search.captures[1] = Some(match_end); return true; } - self.exec_auto_nfa(caps, text, match_start) + self.exec_auto_nfa(search, text, match_start) } /// This is like exec_auto, except it always chooses between either the /// full NFA simulation or the bounded backtracking engine. - fn exec_auto_nfa( + fn exec_auto_nfa<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { - if backtrack::should_exec(self.prog.insts.len(), text.len()) { - self.exec_backtrack(caps, text, start) + if backtrack::should_exec(self.prog.len(), text.len()) { + self.exec_backtrack(search, text, start) } else { - self.exec_nfa(caps, text, start) + self.exec_nfa(search, text, start) } } /// Always run the NFA algorithm. - fn exec_nfa( + fn exec_nfa<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { - if self.prog.insts.is_bytes() { - Nfa::exec(&self.prog, caps, ByteInput::new(text), start) + if self.prog.is_bytes { + Nfa::exec(&self.prog, search, ByteInput::new(text), start) } else { - Nfa::exec(&self.prog, caps, CharInput::new(text), start) + Nfa::exec(&self.prog, search, CharInput::new(text), start) } } /// Always runs the NFA using bounded backtracking. - fn exec_backtrack( + fn exec_backtrack<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { - if self.prog.insts.is_bytes() { - Backtrack::exec(&self.prog, caps, ByteInput::new(text), start) + if self.prog.is_bytes { + Backtrack::exec(&self.prog, search, ByteInput::new(text), start) } else { - Backtrack::exec(&self.prog, caps, CharInput::new(text), start) + Backtrack::exec(&self.prog, search, CharInput::new(text), start) } } @@ -342,19 +423,19 @@ impl Exec { /// regex machinery and use specialized DFAs. /// /// This panics if the set of literals do not correspond to matches. - fn exec_literals( + fn exec_literals<'c, 'm>( &self, - caps: &mut CaptureIdxs, + search: &mut Search<'c, 'm>, text: &str, start: usize, ) -> bool { - debug_assert!(self.prog.is_prefix_match()); + debug_assert!(self.prog.prefixes.at_match()); match self.prog.prefixes.find(&text.as_bytes()[start..]) { None => false, Some((s, e)) => { - if caps.len() == 2 { - caps[0] = Some(start + s); - caps[1] = Some(start + e); + if search.captures.len() == 2 { + search.captures[0] = Some(start + s); + search.captures[1] = Some(start + e); } true } @@ -366,28 +447,30 @@ impl Exec { Regex::Dynamic(self) } - /// Return the original regular expression string. 
- pub fn regex_str(&self) -> &str { - &self.prog.original + /// The original regular expressions given by the caller that were + /// compiled. + pub fn regex_strings(&self) -> &[String] { + &self.res + } + + /// Return a slice of instruction pointers to match slots. + /// + /// There is a match slot for every regular expression in this executor. + pub fn matches(&self) -> &[InstPtr] { + &self.prog.matches } /// Return a slice of capture names. /// /// Any capture that isn't named is None. - pub fn capture_names(&self) -> &[Option] { - &self.prog.cap_names + pub fn captures(&self) -> &[Option] { + &self.prog.captures } /// Return a reference to named groups mapping (from group name to /// group position). - pub fn named_groups(&self) -> &Arc> { - &self.prog.named_groups - } - - /// Return a fresh allocation for storing all possible captures in the - /// underlying regular expression. - pub fn alloc_captures(&self) -> Vec> { - self.prog.alloc_captures() + pub fn capture_name_idx(&self) -> &Arc> { + &self.prog.capture_name_idx } } diff --git a/src/input.rs b/src/input.rs index dacb36cc0f..04442bf93f 100644 --- a/src/input.rs +++ b/src/input.rs @@ -24,7 +24,7 @@ pub struct InputAt { impl InputAt { /// Returns true iff this position is at the beginning of the input. - pub fn is_beginning(&self) -> bool { + pub fn is_start(&self) -> bool { self.pos == 0 } diff --git a/src/lib.rs b/src/lib.rs index 84d26fb64d..86acba164c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -210,6 +210,34 @@ //! # } //! ``` //! +//! # Example: match multiple regular expressions simultaneously +//! +//! This demonstrates how to use a `RegexSet` to match multiple (possibly +//! overlapping) regular expressions in a single scan of the search text: +//! +//! ```rust +//! use regex::RegexSet; +//! +//! let set = RegexSet::new(&[ +//! r"\w+", +//! r"\d+", +//! r"\pL+", +//! r"foo", +//! r"bar", +//! r"barfoo", +//! r"foobar", +//! ]).unwrap(); +//! +//! // Iterate over and collect all of the matches. +//! let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +//! assert_eq!(matches, vec![0, 2, 3, 4, 6]); +//! +//! // You can also test whether a particular regex matched: +//! let matches = set.matches("foobar"); +//! assert!(!matches.matched(5)); +//! assert!(matches.matched(6)); +//! ``` +//! //! # Pay for what you use //! //! With respect to searching text with a regular expression, there are three @@ -438,7 +466,9 @@ //! allowed to store a fixed number of states. (When the limit is reached, its //! states are wiped and continues on, possibly duplicating previous work.) 
-#![deny(missing_docs)] +#![allow(dead_code, unused_imports, unused_variables)] + +// #![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", @@ -457,6 +487,7 @@ pub use re::{ Replacer, NoExpand, RegexSplits, RegexSplitsN, quote, is_match, }; +pub use set::{RegexSet, SetMatches, SetMatchesIntoIter, SetMatchesIter}; mod backtrack; mod char; @@ -465,12 +496,12 @@ mod compile; mod dfa; mod exec; mod input; -mod inst; mod literals; mod nfa; mod pool; -mod program; +mod prog; mod re; +mod set; mod sparse; /// The `internal` module exists to support the `regex!` macro and other @@ -478,9 +509,10 @@ mod sparse; #[doc(hidden)] pub mod internal { pub use char::Char; + pub use compile::Compiler; pub use exec::{Exec, ExecBuilder}; pub use input::{Input, CharInput, InputAt}; - pub use inst::{Inst, EmptyLook, InstRanges}; - pub use program::{Program, ProgramBuilder}; + pub use literals::{BuildPrefixes, Literals}; + pub use prog::{Program, Inst, EmptyLook, InstRanges}; pub use re::ExNative; } diff --git a/src/literals.rs b/src/literals.rs index 13d2e150a5..cd3d6d7ffd 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -17,7 +17,7 @@ use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; use memchr::{memchr, memchr2, memchr3}; use char_utf8::encode_utf8; -use inst::{Insts, Inst, InstBytes, InstRanges}; +use prog::{Program, Inst, InstBytes, InstRanges}; #[derive(Clone, Eq, PartialEq)] pub struct AlternateLiterals { @@ -200,13 +200,13 @@ impl AlternateLiterals { } pub struct BuildPrefixes<'a> { - insts: &'a Insts, + insts: &'a Program, limit: usize, alts: AlternateLiterals, } impl<'a> BuildPrefixes<'a> { - pub fn new(insts: &'a Insts) -> Self { + pub fn new(insts: &'a Program) -> Self { BuildPrefixes { insts: insts, limit: 250, @@ -215,7 +215,7 @@ impl<'a> BuildPrefixes<'a> { } pub fn literals(mut self) -> AlternateLiterals { - let mut stack = vec![self.insts.skip(self.insts.start())]; + let mut stack = vec![self.insts.skip(self.insts.start)]; let mut seen = HashSet::new(); while let Some(mut pc) = stack.pop() { seen.insert(pc); @@ -272,13 +272,13 @@ impl<'a> BuildPrefixes<'a> { } pub struct BuildRequiredLiterals<'a> { - insts: &'a Insts, + insts: &'a Program, limit: usize, alts: AlternateLiterals, } impl<'a> BuildRequiredLiterals<'a> { - pub fn new(insts: &'a Insts) -> Self { + pub fn new(insts: &'a Program) -> Self { BuildRequiredLiterals { insts: insts, limit: 250, @@ -292,7 +292,7 @@ impl<'a> BuildRequiredLiterals<'a> { } fn literals(mut self, mut pc: usize) -> AlternateLiterals { - use inst::Inst::*; + use prog::Inst::*; loop { let inst = &self.insts[pc]; match *inst { @@ -318,7 +318,7 @@ impl<'a> BuildRequiredLiterals<'a> { } pc = inst.goto; } - Split(_) | EmptyLook(_) | Match => { + Split(_) | EmptyLook(_) | Match(_) => { self.alts.at_match = self.insts.leads_to_match(pc); break; } @@ -662,31 +662,32 @@ impl fmt::Debug for Literals { #[cfg(test)] mod tests { - use program::ProgramBuilder; - use super::AlternateLiterals; + use compile::Compiler; + use super::{AlternateLiterals, BuildPrefixes}; + use syntax::Expr; macro_rules! prog { - ($re:expr) => { ProgramBuilder::new($re).compile().unwrap() } - } - - macro_rules! byte_prog { - ($re:expr) => { - ProgramBuilder::new($re).bytes(true).compile().unwrap() - } + ($re:expr) => {{ + let expr = Expr::parse($re).unwrap(); + let prog = Compiler::new().compile(&[expr]).unwrap(); + prog + }} } macro_rules! 
prefixes { ($re:expr) => {{ let p = prog!($re); - assert!(!p.prefixes.at_match()); - p.prefixes.prefixes() + let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); + assert!(!prefixes.at_match()); + prefixes.prefixes() }} } macro_rules! prefixes_complete { ($re:expr) => {{ let p = prog!($re); - assert!(p.prefixes.at_match()); - p.prefixes.prefixes() + let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); + assert!(prefixes.at_match()); + prefixes.prefixes() }} } diff --git a/src/nfa.rs b/src/nfa.rs index 634d1ffaaf..b78b768d5d 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -27,10 +27,9 @@ use std::mem; +use exec::Search; use input::{Input, InputAt}; -use inst::InstPtr; -use program::Program; -use re::CaptureIdxs; +use prog::{Program, InstPtr}; use sparse::SparseSet; /// An NFA simulation matching engine. @@ -101,27 +100,27 @@ impl<'r, I: Input> Nfa<'r, I> { /// captures accordingly. pub fn exec( prog: &'r Program, - mut caps: &mut CaptureIdxs, + search: &mut Search, input: I, start: usize, ) -> bool { let mut _cache = prog.cache_nfa(); let mut cache = &mut **_cache; - cache.clist.resize(prog.insts.len(), prog.num_captures()); - cache.nlist.resize(prog.insts.len(), prog.num_captures()); + cache.clist.resize(prog.len(), prog.captures.len()); + cache.nlist.resize(prog.len(), prog.captures.len()); let at = input.at(start); Nfa { prog: prog, stack: &mut cache.stack, input: input, - }.exec_(&mut cache.clist, &mut cache.nlist, &mut caps, at) + }.exec_(&mut cache.clist, &mut cache.nlist, search, at) } fn exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, - mut caps: &mut CaptureIdxs, + mut search: &mut Search, mut at: InputAt, ) -> bool { let mut matched = false; @@ -137,8 +136,7 @@ impl<'r, I: Input> Nfa<'r, I> { // // 2. If the expression starts with a '^' we can terminate as // soon as the last thread dies. - if matched - || (!at.is_beginning() && self.prog.anchored_begin) { + if matched || (!at.is_start() && self.prog.is_anchored_start) { break; } @@ -156,9 +154,8 @@ impl<'r, I: Input> Nfa<'r, I> { // This simulates a preceding '.*?' for every regex by adding // a state starting at the current position in the input for the // beginning of the program only if we don't already have a match. - if clist.set.is_empty() - || (!self.prog.anchored_begin && !matched) { - self.add(&mut clist, &mut caps, 0, at) + if clist.set.is_empty() || (!self.prog.is_anchored_start && !matched) { + self.add(&mut clist, &mut search.captures, 0, at) } // The previous call to "add" actually inspects the position just // before the current character. For stepping through the machine, @@ -167,19 +164,34 @@ impl<'r, I: Input> Nfa<'r, I> { let at_next = self.input.at(at.next_pos()); for i in 0..clist.set.len() { let ip = clist.set[i]; - let tcaps = clist.caps(ip); - if self.step(&mut nlist, caps, tcaps, ip, at, at_next) { - matched = true; - if caps.len() == 0 { + let step = self.step( + &mut nlist, + search, + clist.caps(ip), + ip, + at, + at_next, + ); + if step { + if !matched { + matched = search.all_matched(); + } + if search.quit_after_first_match() { // If we only care if a match occurs (not its // position), then we can quit right now. break 'LOOP; } - // We don't need to check the rest of the threads in this - // set because we've matched something ("leftmost-first"). - // However, we still need to check threads in the next set - // to support things like greedy matching. 
-                    break;
+                    if search.matches.len() <= 1 {
+                        // We don't need to check the rest of the threads
+                        // in this set because we've matched something
+                        // ("leftmost-first"). However, we still need to check
+                        // threads in the next set to support things like
+                        // greedy matching.
+                        //
+                        // This is only true on normal regexes. For regex sets,
+                        // we need to mush on to observe other matches.
+                        break;
+                    }
                 }
             }
             if at.is_end() {
@@ -207,18 +219,17 @@ impl<'r, I: Input> Nfa<'r, I> {
     fn step(
         &mut self,
         nlist: &mut Threads,
-        caps: &mut [Option<usize>],
+        search: &mut Search,
         thread_caps: &mut [Option<usize>],
         ip: usize,
         at: InputAt,
         at_next: InputAt,
     ) -> bool {
-        use inst::Inst::*;
-        match self.prog.insts[ip] {
-            Match => {
-                for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) {
-                    *slot = *val;
-                }
+        use prog::Inst::*;
+        match self.prog[ip] {
+            Match(match_slot) => {
+                search.copy_captures_from(thread_caps);
+                search.set_match(match_slot);
                 true
             }
             Char(ref inst) => {
@@ -283,14 +294,14 @@
         // traverse the set of states. We only push to the stack when we
        // absolutely need recursion (restoring captures or following a
        // branch).
-        use inst::Inst::*;
+        use prog::Inst::*;
         loop {
             // Don't visit states we've already added.
             if nlist.set.contains_ip(ip) {
                 return;
             }
             nlist.set.add(ip);
-            match self.prog.insts[ip] {
+            match self.prog[ip] {
                 EmptyLook(ref inst) => {
                     let prev = self.input.previous_char(at);
                     let next = self.input.next_char(at);
@@ -312,7 +323,7 @@
                     self.stack.push(FollowEpsilon::IP(inst.goto2));
                     ip = inst.goto1;
                 }
-                Match | Char(_) | Ranges(_) | Bytes(_) => {
+                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
                     let mut t = &mut nlist.caps(ip);
                     for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
                         *slot = *val;
diff --git a/src/inst.rs b/src/prog.rs
similarity index 73%
rename from src/inst.rs
rename to src/prog.rs
index 302a428be8..99ae2e420d 100644
--- a/src/inst.rs
+++ b/src/prog.rs
@@ -1,57 +1,61 @@
+use std::collections::HashMap;
 use std::cmp::Ordering;
 use std::fmt;
 use std::ops::Deref;
 use std::mem;
 use std::slice;
+use std::sync::Arc;
 
+use backtrack::BacktrackCache;
 use char::Char;
-use literals::{BuildPrefixes, Literals};
+use dfa::DfaCache;
+use literals::Literals;
+use nfa::NfaCache;
+use pool::{Pool, PoolGuard};
 
 /// InstPtr represents the index of an instruction in a regex program.
 pub type InstPtr = usize;
 
-/// Insts is a sequence of instructions.
+/// Program is a sequence of instructions and various facts about those
+/// instructions.
 #[derive(Clone)]
-pub struct Insts {
-    insts: Vec<Inst>,
-    bytes: bool,
-    reverse: bool,
-    byte_classes: Vec<u8>,
+pub struct Program {
+    pub insts: Vec<Inst>,
+    pub matches: Vec<InstPtr>,
+    pub captures: Vec<Option<String>>,
+    pub capture_name_idx: Arc<HashMap<String, usize>>,
+    pub start: InstPtr,
+    pub byte_classes: Vec<u8>,
+    pub is_bytes: bool,
+    pub is_dfa: bool,
+    pub is_reverse: bool,
+    pub is_anchored_start: bool,
+    pub is_anchored_end: bool,
+    pub prefixes: Literals,
+    pub cache: EngineCache,
 }
 
-impl Insts {
-    /// Create a new instruction sequence.
-    ///
-    /// If `bytes` is true, then this instruction sequence must run on raw
-    /// bytes. Otherwise, it is executed on Unicode codepoints.
-    ///
-    /// A Vec<Inst> can be created with the compiler.
-    pub fn new(
-        insts: Vec<Inst>,
-        bytes: bool,
-        reverse: bool,
-        byte_classes: Vec<u8>,
-    ) -> Self {
-        assert!(byte_classes.len() == 256);
-        Insts {
-            insts: insts,
-            bytes: bytes,
-            reverse: reverse,
-            byte_classes: byte_classes,
+impl Program {
+    /// Creates an empty instruction sequence. Fields are given default
+    /// values.
+ pub fn new() -> Self { + Program { + insts: vec![], + matches: vec![], + captures: vec![], + capture_name_idx: Arc::new(HashMap::new()), + start: 0, + byte_classes: vec![], + is_bytes: false, + is_dfa: false, + is_reverse: false, + is_anchored_start: false, + is_anchored_end: false, + prefixes: Literals::empty(), + cache: EngineCache::new(), } } - /// Returns true if and only if this instruction sequence must be executed - /// on byte strings. - pub fn is_bytes(&self) -> bool { - self.bytes - } - - /// Returns true if and only if this instruction sequence is reversed. - pub fn is_reversed(&self) -> bool { - self.reverse - } - /// If pc is an index to a no-op instruction (like Save), then return the /// next pc that is not a no-op instruction. pub fn skip(&self, mut pc: usize) -> usize { @@ -63,73 +67,40 @@ impl Insts { } } - /// Returns a map from input byte to byte class. Each class represents - /// a set of bytes that are indistinguishable to the underlying - /// instructions. - /// - /// It is guaranteed to have length 256. - pub fn byte_classes(&self) -> &[u8] { - &self.byte_classes - } - - /// Returns the location of the `Save(0)` instruction, which is present - /// in every program and always indicates the logical start of a match. - /// - /// (DFA programs compile a `.*?` into the program, preceding the `Save(0)` - /// instruction, to support unanchored matches. Generally, we want to - /// ignore that `.*?` when doing analysis, like extracting prefixes.) - pub fn start(&self) -> InstPtr { - for (i, inst) in self.iter().enumerate() { - match *inst { - Inst::Save(ref inst) if inst.slot == 0 => return i, - _ => {} - } - } - unreachable!() - } - /// Return true if and only if an execution engine at instruction `pc` will /// always lead to a match. pub fn leads_to_match(&self, pc: usize) -> bool { + if self.matches.len() > 1 { + // If we have a regex set, then we have more than one ending + // state, so leading to one of those states is generally + // meaningless. + return false; + } match self[self.skip(pc)] { - Inst::Match => true, + Inst::Match(_) => true, _ => false, } } - /// Return true if and only if the regex is anchored at the start of - /// search text. - pub fn anchored_begin(&self) -> bool { - match self.get(1) { - Some(&Inst::EmptyLook(ref inst)) => { - inst.look == EmptyLook::StartText - } - _ => false, - } + /// Returns true if the current configuration demands that an implicit + /// `.*?` be prepended to the instruction sequence. + pub fn needs_dotstar(&self) -> bool { + self.is_dfa && !self.is_reverse && !self.is_anchored_start } - /// Return true if and only if the regex is anchored at the end of - /// search text. - pub fn anchored_end(&self) -> bool { - match self.get(self.len() - 3) { - Some(&Inst::EmptyLook(ref inst)) => { - inst.look == EmptyLook::EndText - } - _ => false, - } + /// Retrieve cached state for NFA execution. + pub fn cache_nfa(&self) -> PoolGuard> { + self.cache.nfa.get() } - /// Build a matching engine for all prefix literals in this instruction - /// sequence. - /// - /// If there are no prefix literals (or there are too many), then a - /// matching engine that never matches is returned. - pub fn prefix_matcher(&self) -> Literals { - if self.is_bytes() || self.is_reversed() { - Literals::empty() - } else { - BuildPrefixes::new(self).literals().into_matcher() - } + /// Retrieve cached state for backtracking execution. + pub fn cache_backtrack(&self) -> PoolGuard> { + self.cache.backtrack.get() + } + + /// Retrieve cached state for DFA execution. 
+ pub fn cache_dfa(&self) -> PoolGuard> { + self.cache.dfa.get() } /// Return the approximate heap usage of this instruction sequence in @@ -142,7 +113,7 @@ impl Insts { } } -impl Deref for Insts { +impl Deref for Program { type Target = [Inst]; fn deref(&self) -> &Self::Target { @@ -150,7 +121,7 @@ impl Deref for Insts { } } -impl fmt::Debug for Insts { +impl fmt::Debug for Program { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Inst::*; @@ -171,7 +142,9 @@ impl fmt::Debug for Insts { try!(writeln!(f, "--------------------------------")); for (pc, inst) in self.iter().enumerate() { match *inst { - Match => try!(writeln!(f, "{:04} Match", pc)), + Match(slot) => { + try!(writeln!(f, "{:04} Match({:?})", pc, slot)) + } Save(ref inst) => { let s = format!("{:04} Save({})", pc, inst.slot); try!(writeln!(f, "{}", with_goto(pc, inst.goto, s))); @@ -215,12 +188,45 @@ impl fmt::Debug for Insts { } } -impl<'a> IntoIterator for &'a Insts { +impl<'a> IntoIterator for &'a Program { type Item = &'a Inst; type IntoIter = slice::Iter<'a, Inst>; fn into_iter(self) -> Self::IntoIter { self.iter() } } +/// EngineCache maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// The allocations are created lazily, so we don't pay for caches that +/// aren't used. +/// +/// N.B. These are all behind a pointer because it's fewer bytes to memcpy. +/// These caches are pushed/popped from the pool a lot, and a smaller +/// footprint can have an impact on matching small inputs. See, for example, +/// the hard_32 benchmark. +#[derive(Debug)] +pub struct EngineCache { + nfa: Pool>, + backtrack: Pool>, + dfa: Pool>, +} + +impl EngineCache { + fn new() -> Self { + EngineCache { + nfa: Pool::new(Box::new(|| Box::new(NfaCache::new()))), + backtrack: Pool::new(Box::new(|| Box::new(BacktrackCache::new()))), + dfa: Pool::new(Box::new(|| Box::new(DfaCache::new()))), + } + } +} + +impl Clone for EngineCache { + fn clone(&self) -> EngineCache { + EngineCache::new() + } +} + /// Inst is an instruction code in a Regex program. /// /// Regrettably, a regex program either contains Unicode codepoint @@ -241,7 +247,13 @@ impl<'a> IntoIterator for &'a Insts { #[derive(Clone, Debug)] pub enum Inst { /// Match indicates that the program has reached a match state. - Match, + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), /// Save causes the program to save the current location of the input in /// the slot indicated by InstSave. Save(InstSave), diff --git a/src/program.rs b/src/program.rs deleted file mode 100644 index 9b5dcddc17..0000000000 --- a/src/program.rs +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -use std::collections::HashMap; -use std::sync::Arc; - -use syntax; - -use backtrack::BacktrackCache; -use compile::{Compiled, Compiler}; -use dfa::DfaCache; -use inst::Insts; -use nfa::NfaCache; -use pool::{Pool, PoolGuard}; -use literals::Literals; -use Error; - - -/// Program represents a compiled regular expression. Once an expression is -/// compiled, its representation is immutable and will never change. -/// (Well, almost. In fact, the matching engines cache state that can be -/// reused on subsequent searches. But this is interior mutability that -/// shouldn't be observable by the caller.) -/// -/// A compiled regular expression contains quite a bit more than justs its -/// opcodes. It also contains capture group names, literal prefixes, the -/// original regular expression string and some facts about the expression -/// (like whether it is anchored to the beginning or end of the search text). -#[derive(Clone, Debug)] -pub struct Program { - /// The original regular expression string. - pub original: String, - /// A sequence of instructions. - pub insts: Insts, - /// The sequence of capture group names. There is an entry for each capture - /// group index and a name exists only if the capture group is named. - pub cap_names: Vec>, - /// The map of named capture groups. The keys are group names and - /// the values are group indices. - pub named_groups: Arc>, - /// If the regular expression requires a literal prefix in order to have a - /// match, that prefix is stored here as a DFA. - pub prefixes: Literals, - /// True iff program is anchored at the beginning. - pub anchored_begin: bool, - /// True iff program is anchored at the end. - pub anchored_end: bool, - /// Cached reusable state for matching engines. - pub cache: EngineCache, -} - -/// A builder for compiling a regular expression program. -pub struct ProgramBuilder { - re: String, - compiler: Compiler, -} - -impl ProgramBuilder { - /// Create a new program builder for the given regular expression. - /// - /// Afer new is called, it is legal to call compile immediately. Default - /// values for other knobs are set automatically. - pub fn new(re: &str) -> Self { - ProgramBuilder { - re: re.to_owned(), - compiler: Compiler::new(), - } - } - - /// Set a size limit that the compiler uses to limit the total number of - /// bytes occupied by the opcodes for this regex. - pub fn size_limit(mut self, size_limit: usize) -> Self { - self.compiler = self.compiler.size_limit(size_limit); - self - } - - /// Enable compilation of a byte based program. - /// - /// By default, programs operate on Unicode codepoints. - pub fn bytes(mut self, yes: bool) -> Self { - self.compiler = self.compiler.bytes(yes); - self - } - - /// Enable compilation of a byte based DFA program. - /// - /// This does instruct the compiler to compile a byte based program, but - /// it also does other things that are specifically required by the lazy - /// DFA, such as adding a `.*?` before the first capture save for - /// unanchored regular expressions. - pub fn dfa(mut self, yes: bool) -> Self { - self.compiler = self.compiler.dfa(yes); - self - } - - /// Compile the regular expression in reverse. - /// - /// This is generally only used by the lazy DFA to find the start location - /// of a match. - pub fn reverse(mut self, yes: bool) -> Self { - self.compiler = self.compiler.reverse(yes); - self - } - - /// Compile the given regular expression under the given configuration. 
- /// - /// If the regular expression could not be compiled (e.g., it is too big), - /// then return an error. - pub fn compile(self) -> Result { - let expr = try!(syntax::Expr::parse(&self.re)); - let Compiled { insts, cap_names } = try!(self.compiler.compile(&expr)); - let (prefixes, anchored_begin, anchored_end) = ( - insts.prefix_matcher(), - insts.anchored_begin(), - insts.anchored_end(), - ); - let mut named_groups = HashMap::new(); - for (i, name) in cap_names.iter().enumerate() { - if let Some(ref name) = *name { - named_groups.insert(name.to_owned(), i); - } - } - Ok(Program { - original: self.re, - insts: insts, - cap_names: cap_names, - named_groups: Arc::new(named_groups), - prefixes: prefixes, - anchored_begin: anchored_begin, - anchored_end: anchored_end, - cache: EngineCache::new(), - }) - } -} - -impl Program { - /// Returns true if the set of literal prefixes implies a match and - /// preserves leftmost first matching semantics. - /// - /// If this returns true, then it is possible to avoid running any of the - /// NFA or DFA based matching engines entirely. - pub fn is_prefix_match(&self) -> bool { - self.prefixes.at_match() - } - - /// Returns true if the underlying program is reversed. - pub fn is_reversed(&self) -> bool { - self.insts.is_reversed() - } - - /// Returns the total number of capture groups in the regular expression. - /// This includes the zeroth capture. - pub fn num_captures(&self) -> usize { - self.cap_names.len() - } - - /// Allocate new capture groups. - pub fn alloc_captures(&self) -> Vec> { - vec![None; 2 * self.num_captures()] - } - - /// Retrieve cached state for NFA execution. - pub fn cache_nfa(&self) -> PoolGuard> { - self.cache.nfa.get() - } - - /// Retrieve cached state for backtracking execution. - pub fn cache_backtrack(&self) -> PoolGuard> { - self.cache.backtrack.get() - } - - /// Retrieve cached state for DFA execution. - pub fn cache_dfa(&self) -> PoolGuard> { - self.cache.dfa.get() - } - - /// Return the approximate heap usage of this Program in bytes. - /// - /// Note that this does not include cached engine data. - pub fn approximate_size(&self) -> usize { - // ignore capture names - self.original.len() - + self.insts.approximate_size() - + self.prefixes.approximate_size() - } -} - -/// EngineCache maintains reusable allocations for each matching engine -/// available to a particular program. -/// -/// The allocations are created lazily, so we don't pay for caches that -/// aren't used. -/// -/// N.B. These are all behind a pointer because it's fewer bytes to memcpy. -/// These caches are pushed/popped from the pool a lot, and a smaller -/// footprint can have an impact on matching small inputs. See, for example, -/// the hard_32 benchmark. 
-#[derive(Debug)] -pub struct EngineCache { - nfa: Pool>, - backtrack: Pool>, - dfa: Pool>, -} - -impl EngineCache { - fn new() -> Self { - EngineCache { - nfa: Pool::new(Box::new(|| Box::new(NfaCache::new()))), - backtrack: Pool::new(Box::new(|| Box::new(BacktrackCache::new()))), - dfa: Pool::new(Box::new(|| Box::new(DfaCache::new()))), - } - } -} - -impl Clone for EngineCache { - fn clone(&self) -> EngineCache { - EngineCache::new() - } -} diff --git a/src/re.rs b/src/re.rs index 9ce1ef4561..624c74c111 100644 --- a/src/re.rs +++ b/src/re.rs @@ -17,7 +17,7 @@ use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; use std::sync::Arc; -use exec::{Exec, ExecBuilder}; +use exec::{CaptureSlots, Exec, ExecBuilder, Search}; use syntax; const REPLACE_EXPAND: &'static str = r"(?x) @@ -30,9 +30,6 @@ const REPLACE_EXPAND: &'static str = r"(?x) ) "; -/// Type alias for representing capture indices. -pub type CaptureIdxs = [Option]; - /// Escapes all regular expression meta characters in `text`. /// /// The string returned may be safely used as a literal in a regular @@ -67,6 +64,8 @@ pub enum Error { /// The compiled program exceeded the set size limit. /// The argument is the size limit imposed. CompiledTooBig(usize), + /// An invalid set is a regex set with fewer than 2 regular expressions. + InvalidSet, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients @@ -81,6 +80,9 @@ impl ::std::error::Error for Error { match *self { Error::Syntax(ref err) => err.description(), Error::CompiledTooBig(_) => "compiled program too big", + Error::InvalidSet => { + "sets must contain 2 or more regular expressions" + } Error::__Nonexhaustive => unreachable!(), } } @@ -101,6 +103,9 @@ impl fmt::Display for Error { write!(f, "Compiled regex exceeds size limit of {} bytes.", limit) } + Error::InvalidSet => { + write!(f, "Sets must contain 2 or more regular expressions.") + } Error::__Nonexhaustive => unreachable!(), } } @@ -188,7 +193,7 @@ pub struct ExNative { #[doc(hidden)] pub groups: &'static &'static [(&'static str, usize)], #[doc(hidden)] - pub prog: fn(&mut CaptureIdxs, &str, usize) -> bool, + pub prog: fn(CaptureSlots, &str, usize) -> bool, } impl Copy for ExNative {} @@ -634,7 +639,7 @@ impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { match *self { - Regex::Dynamic(ref exec) => exec.regex_str(), + Regex::Dynamic(ref exec) => &exec.regex_strings()[0], Regex::Native(ExNative { ref original, .. 
            }) => original,
        }
    }
 
@@ -644,7 +649,7 @@
         match *self {
             Regex::Native(ref n) => CaptureNames::Native(n.names.iter()),
             Regex::Dynamic(ref d) => {
-                CaptureNames::Dynamic(d.capture_names().iter())
+                CaptureNames::Dynamic(d.captures().iter())
             }
         }
     }
@@ -653,14 +658,14 @@
     pub fn captures_len(&self) -> usize {
         match *self {
             Regex::Native(ref n) => n.names.len(),
-            Regex::Dynamic(ref d) => d.capture_names().len()
+            Regex::Dynamic(ref d) => d.captures().len()
         }
     }
 
     fn alloc_captures(&self) -> Vec<Option<usize>> {
         match *self {
             Regex::Native(ref n) => vec![None; 2 * n.names.len()],
-            Regex::Dynamic(ref d) => d.alloc_captures(),
+            Regex::Dynamic(ref d) => vec![None; 2 * d.captures().len()],
         }
     }
 }
@@ -811,7 +816,6 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
 }
 
 enum NamedGroups {
-    Empty,
     Native(&'static [(&'static str, usize)]),
     Dynamic(Arc<HashMap<String, usize>>),
 }
@@ -819,22 +823,17 @@ enum NamedGroups {
 impl NamedGroups {
     fn from_regex(regex: &Regex) -> NamedGroups {
         match *regex {
-            Regex::Native(ExNative { ref groups, .. }) =>
-                NamedGroups::Native(groups),
+            Regex::Native(ExNative { ref groups, .. }) => {
+                NamedGroups::Native(groups)
+            }
             Regex::Dynamic(ref exec) => {
-                let groups = exec.named_groups();
-                if groups.is_empty() {
-                    NamedGroups::Empty
-                } else {
-                    NamedGroups::Dynamic(groups.clone())
-                }
+                NamedGroups::Dynamic(exec.capture_name_idx().clone())
             }
         }
     }
 
     fn pos(&self, name: &str) -> Option<usize> {
         match *self {
-            NamedGroups::Empty => None,
             NamedGroups::Native(groups) => {
                 groups.binary_search_by(|&(n, _)| n.cmp(name))
                       .ok().map(|i| groups[i].1)
@@ -847,7 +846,6 @@ impl NamedGroups {
 
     fn iter<'n>(&'n self) -> NamedGroupsIter<'n> {
         match *self {
-            NamedGroups::Empty => NamedGroupsIter::Empty,
             NamedGroups::Native(g) => NamedGroupsIter::Native(g.iter()),
             NamedGroups::Dynamic(ref g) => NamedGroupsIter::Dynamic(g.iter()),
         }
@@ -1233,10 +1231,16 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> {
     }
 }
 
-fn exec(re: &Regex, caps: &mut CaptureIdxs, text: &str, start: usize) -> bool {
+fn exec(re: &Regex, caps: CaptureSlots, text: &str, start: usize) -> bool {
     match *re {
         Regex::Native(ExNative { ref prog, .. }) => (*prog)(caps, text, start),
-        Regex::Dynamic(ref prog) => prog.exec(caps, text, start),
+        Regex::Dynamic(ref prog) => {
+            let mut search = Search {
+                captures: caps,
+                matches: &mut [false],
+            };
+            prog.exec(&mut search, text, start)
+        }
     }
 }
diff --git a/src/set.rs b/src/set.rs
new file mode 100644
index 0000000000..fefe009eca
--- /dev/null
+++ b/src/set.rs
@@ -0,0 +1,315 @@
+// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::fmt;
+use std::iter;
+use std::slice;
+use std::vec;
+
+use syntax::Expr;
+
+use exec::{Exec, ExecBuilder, Search};
+use Error;
+
+/// Match multiple (possibly overlapping) regular expressions in a single scan.
+///
+/// A regex set corresponds to the union of two or more regular expressions.
+/// That is, a regex set will match text where at least one of its
+/// constituent regular expressions matches. A regex set as it's formulated here
+/// provides a touch more power: it will also report *which* regular
+/// expressions in the set match.
Indeed, this is the key difference between +/// regex sets and a single `Regex` with many alternates, since only one +/// alternate can match at a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the text +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the text. The key advantage of using a regex set is that it +/// will report the matching regexes using a *single pass through the text*. +/// If one has hundreds or thousands of regexes to match repeatedly (like a URL +/// router for a complex web application or a user agent matcher), then a regex +/// set can realize huge performance gains. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +/// ```rust +/// use regex::RegexSet; +/// +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match("foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```ignore +/// (?P[a-z]+@(?P[a-z]+[.](com|org|net)))|(?P[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? +/// 2. If so, which regexes in the set match? +/// +/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) +/// since the matching engines can stop after the first match is found. +/// +/// Other features like finding the location of successive matches or their +/// sub-captures aren't supported. If you need this functionality, the +/// recommended approach is to compile each regex in the set independently and +/// selectively match them based on which regexes in the set matched. +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(mn)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the search text. +#[derive(Clone)] +pub struct RegexSet(Exec); + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. 
If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ```rust + /// use regex::RegexSet; + /// + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new(exprs: I) -> Result + where S: AsRef, I: IntoIterator { + let exec = try!(ExecBuilder::new_many(exprs).build()); + if exec.regex_strings().len() < 2 { + return Err(Error::InvalidSet); + } + Ok(RegexSet(exec)) + } + + /// Returns true if and only if one of the regexes in this set matches + /// the text given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests whether a set matches some text: + /// + /// ```rust + /// use regex::RegexSet; + /// + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + pub fn is_match(&self, text: &str) -> bool { + let mut search = Search { captures: &mut [], matches: &mut [] }; + self.0.exec(&mut search, text, 0) + } + + /// Returns the set of regular expressions that match in the given text. + /// + /// The set returned contains the index of each regular expression that + /// matches in the given text. The index is in correspondence with the + /// order of regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests which regular expressions match the given text: + /// + /// ```rust + /// use regex::RegexSet; + /// + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + pub fn matches(&self, text: &str) -> SetMatches { + let mut matches = vec![false; self.0.matches().len()]; + let matched_any = { + let mut search = Search { + captures: &mut [], + matches: &mut matches + }; + self.0.exec(&mut search, text, 0) + }; + SetMatches { + matched_any: matched_any, + matches: matches, + } + } + + /// Returns the total number of regular expressions in this set. 
+    pub fn len(&self) -> usize {
+        self.0.regex_strings().len()
+    }
+}
+
+impl fmt::Debug for RegexSet {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "RegexSet({:?})", self.0.regex_strings())
+    }
+}
+
+/// A set of matches returned by a regex set.
+#[derive(Clone, Debug)]
+pub struct SetMatches {
+    matched_any: bool,
+    matches: Vec<bool>,
+}
+
+impl SetMatches {
+    /// Whether this set contains any matches.
+    pub fn matched_any(&self) -> bool {
+        self.matched_any
+    }
+
+    /// Whether the regex at the given index matched.
+    ///
+    /// The index for a regex is determined by its insertion order upon the
+    /// initial construction of a `RegexSet`, starting at `0`.
+    ///
+    /// # Panics
+    ///
+    /// If `regex_index` is greater than or equal to `self.len()`.
+    pub fn matched(&self, regex_index: usize) -> bool {
+        self.matches[regex_index]
+    }
+
+    /// The total number of regexes in the set that created these matches.
+    pub fn len(&self) -> usize {
+        self.matches.len()
+    }
+
+    /// Returns an iterator over indexes in the regex that matched.
+    pub fn iter(&self) -> SetMatchesIter {
+        SetMatchesIter((&*self.matches).into_iter().enumerate())
+    }
+}
+
+impl IntoIterator for SetMatches {
+    type IntoIter = SetMatchesIntoIter;
+    type Item = usize;
+
+    fn into_iter(self) -> Self::IntoIter {
+        SetMatchesIntoIter(self.matches.into_iter().enumerate())
+    }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+    type IntoIter = SetMatchesIter<'a>;
+    type Item = usize;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
+
+impl Iterator for SetMatchesIntoIter {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        loop {
+            match self.0.next() {
+                None => return None,
+                Some((_, false)) => {}
+                Some((i, true)) => return Some(i),
+            }
+        }
+    }
+}
+
+/// A borrowed iterator over the set of matches from a regex set.
+///
+/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
+#[derive(Clone)]
+pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
+
+impl<'a> Iterator for SetMatchesIter<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        loop {
+            match self.0.next() {
+                None => return None,
+                Some((_, &false)) => {}
+                Some((i, &true)) => return Some(i),
+            }
+        }
+    }
+}
diff --git a/tests/test_dynamic.rs b/tests/test_dynamic.rs
index 98837d70dd..18596ad30b 100644
--- a/tests/test_dynamic.rs
+++ b/tests/test_dynamic.rs
@@ -21,8 +21,15 @@ extern crate regex;
 // regex and the input. Other dynamic tests explicitly set the engine to use.
 macro_rules! regex {
     ($re:expr) => {{
-        use regex::internal::ExecBuilder;
-        ExecBuilder::new($re).build().unwrap().into_regex()
+        use regex::Regex;
+        Regex::new($re).unwrap()
+    }}
+}
+
+macro_rules! regex_set {
+    ($res:expr) => {{
+        use regex::RegexSet;
+        RegexSet::new($res).unwrap()
     }}
 }
 
@@ -32,6 +39,7 @@ macro_rules! searcher_expr { ($e:expr) => ($e) }
 macro_rules!
searcher_expr { ($e:expr) => ({}) } mod tests; +mod tests_set; // Regression test for https://github.com/rust-lang/regex/issues/98 // @@ -42,3 +50,23 @@ fn regression_many_repeat_stack_overflow() { let re = regex!("^.{1,2500}"); assert_eq!(re.find("a"), Some((0, 1))); } + +#[test] +fn set_empty() { + use regex::{Error, RegexSet}; + let err = RegexSet::new::<&[String], &String>(&[]).unwrap_err(); + match err { + Error::InvalidSet => {} + err => panic!("expected Error::InvalidSet but got {:?}", err), + } +} + +#[test] +fn set_one() { + use regex::{Error, RegexSet}; + let err = RegexSet::new(&["foo"]).unwrap_err(); + match err { + Error::InvalidSet => {} + err => panic!("expected Error::InvalidSet but got {:?}", err), + } +} diff --git a/tests/tests_set.rs b/tests/tests_set.rs new file mode 100644 index 0000000000..a0712c89ee --- /dev/null +++ b/tests/tests_set.rs @@ -0,0 +1,45 @@ +macro_rules! mat { + ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => { + #[test] + fn $name() { + let set = regex_set!($res); + assert!(set.is_match($text)); + let expected = vec![$($match_index),*]; + let matches = set.matches($text); + assert!(matches.matched_any()); + let got: Vec<_> = matches.into_iter().collect(); + assert_eq!(expected, got); + } + } +} + +mat!(set1, &["a", "a"], "a", 0, 1); +mat!(set2, &["a", "a"], "ba", 0, 1); +mat!(set3, &["a", "b"], "a", 0); +mat!(set4, &["a", "b"], "b", 1); +mat!(set5, &["a|b", "b|a"], "b", 0, 1); +mat!(set6, &["foo", "oo"], "foo", 0, 1); +mat!(set7, &["^foo", "bar$"], "foo", 0); +mat!(set8, &["^foo", "bar$"], "foo bar", 0, 1); +mat!(set9, &["^foo", "bar$"], "bar", 1); +mat!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1); +mat!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); +mat!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); +mat!(set13, &[r".*", "a"], "zzzzzza", 0, 1); +mat!(set14, &[r".*", "a"], "zzzzzz", 0); + +macro_rules! nomat { + ($name:ident, $res:expr, $text:expr) => { + #[test] + fn $name() { + let set = regex_set!($res); + assert!(!set.is_match($text)); + let matches = set.matches($text); + assert!(!matches.matched_any()); + assert_eq!(0, matches.into_iter().count()); + } + } +} + +nomat!(nset1, &["a", "a"], "b"); +nomat!(nset2, &["^foo", "bar$"], "bar foo");