Add regex sets.

Regex sets permit matching multiple (possibly overlapping) regular expressions in a single scan of the search text. This adds a few new types, with `RegexSet` being the primary one. All matching engines support regex sets, including the lazy DFA. This commit also refactors much of the capture-handling code into a central `Search`, which now also includes a set of matches that regex sets use to determine which regexes have matched. We also merged the `Program` and `Insts` types, which were split up when adding the lazy DFA, because the split seemed to make the code more complicated. Closes #156.
rust-lang · Feb 22, 2016 · 94d0ad4 · 94d0ad4
1 parent 640bfa7
commit 94d0ad4
Show file tree

Hide file tree

Showing 21 changed files with 1,207 additions and 731 deletions.
diff --git a/HACKING.md b/HACKING.md
@@ -112,7 +112,7 @@ the NFA algorithm, because it was one fewer epsilon transition that it had to
 follow.
 
 There exist more instructions and they are defined and documented in
-src/inst.rs.
+src/prog.rs.
 
 Compilation has several knobs and a few unfortunately complicated invariants.
 Namely, the output of compilation can be one of two types of programs: a
@@ -163,7 +163,7 @@ engine (or engines) to use.
 
 The logic for choosing which engine to execute is in src/exec.rs and is
 documented on the Exec type. Exec values collection regular expression
-Programs (defined in src/program.rs), which contain all the necessary tidbits
+Programs (defined in src/prog.rs), which contain all the necessary tidbits
 for actually executing a regular expression on search text.
 
 For the most part, the execution logic is straight-forward and follows the

diff --git a/README.md b/README.md
@@ -128,6 +128,34 @@ fn some_helper_function(text: &str) -> bool {
 Specifically, in this example, the regex will be compiled when it is used for
 the first time. On subsequent uses, it will reuse the previous compilation.
 
+### Usage: match multiple regular expressions simultaneously
+
+This demonstrates how to use a `RegexSet` to match multiple (possibly
+overlapping) regular expressions in a single scan of the search text:
+
+```rust
+use regex::RegexSet;
+
+let set = RegexSet::new(&[
+    r"\w+",
+    r"\d+",
+    r"\pL+",
+    r"foo",
+    r"bar",
+    r"barfoo",
+    r"foobar",
+]).unwrap();
+
+// Iterate over and collect all of the matches.
+let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+
+// You can also test whether a particular regex matched:
+let matches = set.matches("foobar");
+assert!(!matches.matched(5));
+assert!(matches.matched(6));
+```
+
 ### Usage: `regex!` compiler plugin
 
 The `regex!` compiler plugin will compile your regexes at compile time. **This

diff --git a/benches/bench_dynamic_compile.rs b/benches/bench_dynamic_compile.rs
@@ -8,54 +8,55 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use regex_syntax::Expr;
 use test::Bencher;
 
-use regex::internal::ProgramBuilder;
+use regex::internal::Compiler;
 
 #[bench]
 fn compile_simple(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"^bc(d|e)*$";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"^bc(d|e)*$").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_simple_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"^bc(d|e)*$";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"^bc(d|e)*$").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_small(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}|\p{N}|\s|.|\d";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_small_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}|\p{N}|\s|.|\d";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_huge(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}{100}";
-        ProgramBuilder::new(&re).compile().unwrap()
+        let re = Expr::parse(r"\p{L}{100}").unwrap();
+        Compiler::new().compile(&[re]).unwrap()
     });
 }
 
 #[bench]
 fn compile_huge_bytes(b: &mut Bencher) {
     b.iter(|| {
-        let re = r"\p{L}{100}";
-        ProgramBuilder::new(&re).bytes(true).compile().unwrap()
+        let re = Expr::parse(r"\p{L}{100}").unwrap();
+        Compiler::new().bytes(true).compile(&[re]).unwrap()
     });
 }
diff --git a/examples/set.rs b/examples/set.rs
@@ -0,0 +1,19 @@
+extern crate regex;
+
+use regex::RegexSet;
+
+fn main() {
+    let res = &[
+        "abc",
+        "xyzz",
+        "^[ga-fh-z]+$",
+    ];
+    let text = "abcggggggggxyz";
+    let set = RegexSet::new(res).unwrap();
+    println!("{:?}", set);
+    let m = set.is_match("abcggggggggxyz");
+    println!("match? {:?}", m);
+    for mi in set.matches(text) {
+        println!("{:?}", mi);
+    }
+}
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -177,6 +177,19 @@ pub enum Repeater {
     },
 }
 
+impl Repeater {
+    /// Returns true if and only if this repetition can match the empty string.
+    fn matches_empty(&self) -> bool {
+        use self::Repeater::*;
+        match *self {
+            ZeroOrOne => true,
+            ZeroOrMore => true,
+            OneOrMore => false,
+            Range { min, .. } => min == 0,
+        }
+    }
+}
+
 /// A character class.
 ///
 /// A character class has a canonical format that the parser guarantees. Its
@@ -315,7 +328,9 @@ impl Expr {
     /// the beginning of text.
     pub fn is_anchored_start(&self) -> bool {
         match *self {
-            Repeat { ref e, .. } => e.is_anchored_start(),
+            Repeat { ref e, r, .. } => {
+                !r.matches_empty() && e.is_anchored_start()
+            }
             Group { ref e, .. } => e.is_anchored_start(),
             Concat(ref es) => es[0].is_anchored_start(),
             Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
@@ -328,7 +343,9 @@ impl Expr {
     /// end of the text.
     pub fn is_anchored_end(&self) -> bool {
         match *self {
-            Repeat { ref e, .. } => e.is_anchored_end(),
+            Repeat { ref e, r, .. } => {
+                !r.matches_empty() && e.is_anchored_end()
+            }
             Group { ref e, .. } => e.is_anchored_end(),
             Concat(ref es) => es[es.len() - 1].is_anchored_end(),
             Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
@@ -1059,9 +1076,6 @@ mod tests {
         assert!(e("^a|^b").is_anchored_start());
         assert!(e("(^a)|(^b)").is_anchored_start());
         assert!(e("(^(a|b))").is_anchored_start());
-        assert!(e("^*").is_anchored_start());
-        assert!(e("(^)*").is_anchored_start());
-        assert!(e("((^)*)*").is_anchored_start());
 
         assert!(!e("^a|b").is_anchored_start());
         assert!(!e("a|^b").is_anchored_start());
@@ -1074,9 +1088,6 @@ mod tests {
         assert!(e("a$|b$").is_anchored_end());
         assert!(e("(a$)|(b$)").is_anchored_end());
         assert!(e("((a|b)$)").is_anchored_end());
-        assert!(e("$*").is_anchored_end());
-        assert!(e("($)*").is_anchored_end());
-        assert!(e("(($)*)*").is_anchored_end());
 
         assert!(!e("a$|b").is_anchored_end());
         assert!(!e("a|b$").is_anchored_end());

diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml
@@ -22,6 +22,10 @@ path = ".."
 version = "0.1"
 features = ["pattern"]
 
+[dependencies.regex-syntax]
+path = "../regex-syntax"
+version = "0.2"
+
 [dev-dependencies]
 lazy_static = "0.1"
 rand = "0.3"

diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs
@@ -18,8 +18,12 @@
 #![feature(plugin_registrar, quote, rustc_private)]
 
 extern crate regex;
-extern crate syntax;
+extern crate regex_syntax;
 extern crate rustc_plugin;
+extern crate syntax;
+
+use std::collections::BTreeMap;
+use std::usize;
 
 use syntax::ast;
 use syntax::codemap;
@@ -32,7 +36,8 @@ use syntax::ptr::P;
 
 use rustc_plugin::Registry;
 
-use regex::internal::{Inst, EmptyLook, Program, ProgramBuilder};
+use regex::internal::{Compiler, EmptyLook, Inst, Program};
+use regex_syntax::Expr;
 
 /// For the `regex!` syntax extension. Do not use.
 #[plugin_registrar]
@@ -67,15 +72,21 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
     };
     // We use the largest possible size limit because this is happening at
     // compile time. We trust the programmer.
-    let bprog = ProgramBuilder::new(&regex).size_limit(::std::usize::MAX);
-    let prog = match bprog.compile() {
+    let expr = match Expr::parse(&regex) {
+        Ok(expr) => expr,
+        Err(err) => {
+            cx.span_err(sp, &err.to_string());
+            return DummyResult::any(sp)
+        }
+    };
+    let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) {
         Ok(re) => re,
         Err(err) => {
             cx.span_err(sp, &err.to_string());
             return DummyResult::any(sp)
         }
     };
-    let names = prog.cap_names.iter().cloned().collect();
+    let names = prog.captures.iter().cloned().collect();
     let mut gen = NfaGen {
         cx: &*cx,
         sp: sp,
@@ -98,8 +109,8 @@ impl<'a> NfaGen<'a> {
     fn code(&mut self) -> P<ast::Expr> {
         // Most or all of the following things are used in the quasiquoted
         // expression returned.
-        let num_cap_locs = 2 * self.prog.num_captures();
-        let num_insts = self.prog.insts.len();
+        let num_cap_locs = 2 * self.prog.captures.len();
+        let num_insts = self.prog.len();
         let cap_names = self.vec_expr(self.names.iter(),
             &mut |cx, name| match *name {
                 Some(ref name) => {
@@ -109,21 +120,20 @@ impl<'a> NfaGen<'a> {
                 None => cx.expr_none(self.sp),
             }
         );
-        let named_groups = {
-            let mut named_groups = ::std::collections::BTreeMap::new();
+        let capture_name_idx = {
+            let mut capture_name_idx = BTreeMap::new();
             for (i, name) in self.names.iter().enumerate() {
                 if let Some(ref name) = *name {
-                    named_groups.insert(name.to_owned(), i);
+                    capture_name_idx.insert(name.to_owned(), i);
                 }
             }
-            self.vec_expr(named_groups.iter(),
+            self.vec_expr(capture_name_idx.iter(),
                 &mut |cx, (name, group_idx)|
                     quote_expr!(cx, ($name, $group_idx))
             )
         };
 
-        let prefix_anchor = self.prog.anchored_begin;
-
+        let is_anchored_start = self.prog.is_anchored_start;
         let step_insts = self.step_insts();
         let add_insts = self.add_insts();
         let regex = &*self.original;
@@ -135,9 +145,9 @@ impl<'a> NfaGen<'a> {
 // the user is only warned about *their* unused variable/code, and not the
 // unused code generated by regex!. See #14185 for an example.
 #[allow(dead_code)]
-static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
+static CAPTURES: &'static [Option<&'static str>] = &$cap_names;
 #[allow(dead_code)]
-static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;
+static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx;
 
 #[allow(dead_code)]
 fn exec<'t>(
@@ -175,14 +185,14 @@ fn exec<'t>(
             clist.empty(); nlist.empty();
 'LOOP:      loop {
                 if clist.size == 0 {
-                    if matched || (!at.is_beginning() && $prefix_anchor) {
+                    if matched || (!at.is_start() && $is_anchored_start) {
                         break;
                     }
                     // TODO: Prefix matching... Hmm.
                     // Prefix matching now uses a DFA, so I think this is
                     // going to require encoding that DFA statically.
                 }
-                if clist.size == 0 || (!$prefix_anchor && !matched) {
+                if clist.size == 0 || (!$is_anchored_start && !matched) {
                     self.add(clist, &mut caps, 0, at);
                 }
                 let at_next = self.input.at(at.next_pos());
@@ -322,8 +332,8 @@ fn exec<'t>(
 
 ::regex::Regex::Native(::regex::internal::ExNative {
     original: $regex,
-    names: &CAP_NAMES,
-    groups: &NAMED_GROUPS,
+    names: &CAPTURES,
+    groups: &CAPTURE_NAME_IDX,
     prog: exec,
 })
         })
@@ -332,7 +342,7 @@ fn exec<'t>(
     // Generates code for the `add` method, which is responsible for adding
     // zero-width states to the next queue of states to visit.
     fn add_insts(&self) -> P<ast::Expr> {
-        let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
+        let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
             let body = match *inst {
                 Inst::EmptyLook(ref inst) => {
                     let nextpc = inst.goto;
@@ -422,7 +432,7 @@ fn exec<'t>(
     // Generates the code for the `step` method, which processes all states
     // in the current queue that consume a single character.
     fn step_insts(&self) -> P<ast::Expr> {
-        let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
+        let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
             let body = match *inst {
                 Inst::Match => quote_expr!(self.cx, {
                     for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) {