diff --git a/src/exec.rs b/src/exec.rs index 51350dd675..29d614fddd 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -230,6 +230,7 @@ impl ExecBuilder { .ignore_whitespace(self.options.ignore_whitespace) .unicode(self.options.unicode) .allow_invalid_utf8(!self.only_utf8) + .nest_limit(self.options.nest_limit) .build(); let expr = try!(parser.parse(pat)); bytes = bytes || !expr.is_always_utf8(); diff --git a/src/re_builder.rs b/src/re_builder.rs index 56dac2704e..ae60ecffa4 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -15,6 +15,7 @@ pub struct RegexOptions { pub pats: Vec, pub size_limit: usize, pub dfa_size_limit: usize, + pub nest_limit: u32, pub case_insensitive: bool, pub multi_line: bool, pub dot_matches_new_line: bool, @@ -29,6 +30,7 @@ impl Default for RegexOptions { pats: vec![], size_limit: 10 * (1<<20), dfa_size_limit: 2 * (1<<20), + nest_limit: 250, case_insensitive: false, multi_line: false, dot_matches_new_line: false, @@ -163,6 +165,36 @@ impl RegexBuilder { self.0.dfa_size_limit = limit; self } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// lenth of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.0.nest_limit = limit; + self + } } } } @@ -274,6 +306,37 @@ impl RegexSetBuilder { self.0.dfa_size_limit = limit; self } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// lenth of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { + self.0.nest_limit = limit; + self + } + } } } diff --git a/tests/crazy.rs b/tests/crazy.rs index 207466c23f..02d058f1ae 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -103,3 +103,300 @@ fn dfa_handles_pathological_case() { }; assert!(re.is_match(text!(&*text))); } + +#[test] +fn nest_limit_makes_it_parse() { + use regex::RegexBuilder; + + RegexBuilder::new( + r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? + ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? + )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "# + ) + .nest_limit(1024) + .build() + .unwrap(); +}