fuzz: Add a roundtrip regex fuzz harness

This change adds an optional dependency on 'arbitrary' to regex-syntax, which allows us to generate arbitrary high-level intermediate representations (HIR). The fuzz harness converts each generated HIR back into a regex string and uses that string to exercise the regex matching code under src. This approach yields arbitrary well-formed regex strings, allowing the fuzzer to penetrate deeper into the regex code.
rust-lang · Feb 20, 2023 · b48c57b · b48c57b
1 parent a9b2e02
commit b48c57b
Show file tree

Hide file tree

Showing 5 changed files with 199 additions and 11 deletions.
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -1,5 +1,3 @@
-cargo-features = ['named-profiles']
-
 [package]
 name = "regex-fuzz"
 version = "0.0.0"
@@ -11,11 +9,16 @@ edition = "2018"
 cargo-fuzz = true
 
 [dependencies]
+arbitrary = { version = "1.2.3", features = ["derive"] }
 libfuzzer-sys = "0.4.1"
 
 [dependencies.regex]
 path = ".."
 
+[dependencies.regex-syntax]
+path = "../regex-syntax"
+features = ["arbitrary"]
+
 # Prevent this from interfering with workspaces
 [workspace]
 members = ["."]
@@ -24,6 +27,12 @@ members = ["."]
 name = "fuzz_regex_match"
 path = "fuzz_targets/fuzz_regex_match.rs"
 
+[[bin]]
+name = "fuzz_regex"
+path = "fuzz_targets/fuzz_regex.rs"
+test = false
+doc = false
+
 [profile.release]
 opt-level = 3
 debug = true

diff --git a/fuzz/fuzz_targets/fuzz_regex.rs b/fuzz/fuzz_targets/fuzz_regex.rs
@@ -0,0 +1,161 @@
+#![no_main]
+
+use arbitrary::Arbitrary;
+use libfuzzer_sys::fuzz_target;
+use regex_syntax::hir::print::Printer;
+use regex_syntax::hir::Hir;
+use std::{convert::TryFrom, hint::black_box};
+
+/// The source of a regex pattern string used by this fuzz target.
+#[derive(Arbitrary, Debug, Clone)]
+enum Pattern {
+    /// A generated `Hir` that is printed back into pattern syntax before use.
+    WellFormed(Hir),
+    /// A raw string that is used as the pattern unchanged.
+    Random(String),
+}
+
+/// Renders a `Pattern` as the pattern string handed to the regex
+/// constructors.
+impl TryFrom<Pattern> for String {
+    /// Printing an `Hir` writes through `fmt::Write`, so a formatting error
+    /// is the only way the conversion can fail.
+    type Error = std::fmt::Error;
+
+    /// Converts `pattern` into a pattern string.
+    ///
+    /// A `WellFormed` pattern is printed from its `Hir` with `Printer`; a
+    /// `Random` pattern is returned unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Returns the `fmt::Error` reported by `Printer::print`, if any.
+    fn try_from(pattern: Pattern) -> Result<Self, Self::Error> {
+        match pattern {
+            Pattern::WellFormed(hir) => {
+                let mut printer = Printer::new();
+                let mut dst = String::new();
+                printer.print(&hir, &mut dst)?;
+                Ok(dst)
+            }
+            Pattern::Random(s) => Ok(s),
+        }
+    }
+}
+
+/// One fuzz case: the patterns and inputs consumed by the `fuzz_target!`
+/// body below.
+#[derive(Arbitrary, Debug)]
+struct Data<'a> {
+    /// Pattern compiled by both `fuzz_regex` and `fuzz_regex_bytes`.
+    pattern: Pattern,
+    /// `(limit, replacement)` pair passed to `replacen` on the `&str` regex.
+    replacen: (usize, &'a str),
+    /// `(limit, replacement)` pair passed to `replacen` on the bytes regex.
+    replacen_bytes: (usize, &'a [u8]),
+    /// Input searched by the `&str` regex.
+    input: &'a str,
+    /// Input searched by the bytes regex.
+    input_bytes: &'a [u8],
+    /// Patterns used to build both the `&str` and the bytes regex sets.
+    pattern_set: Vec<Pattern>,
+    /// Input matched against the `&str` regex set.
+    set_input: &'a str,
+    /// Input matched against the bytes regex set.
+    set_input_bytes: &'a [u8],
+}
+
+/// Compiles `pattern` as a `regex::Regex` and runs its search APIs on `input`.
+///
+/// `replacen` is the `(limit, replacement)` pair handed to `Regex::replacen`.
+/// Every result is routed through `black_box` and then discarded; the calls
+/// are made only to exercise the matching code.
+///
+/// # Errors
+///
+/// Returns an error if the pattern cannot be printed to a string or if the
+/// resulting string fails to compile as a regex.
+fn fuzz_regex(
+    pattern: &Pattern,
+    input: &str,
+    replacen: &(usize, &str),
+) -> Result<(), Box<dyn std::error::Error>> {
+    // The conversion consumes its argument, hence the clone of the borrowed
+    // pattern.
+    let re = regex::Regex::new(&String::try_from(pattern.clone())?)?;
+    // `input` is already a `&str`, so it is passed as-is instead of being
+    // borrowed a second time.
+    _ = black_box(re.is_match(input));
+    _ = black_box(re.captures_iter(input).collect::<Vec<regex::Captures>>());
+    _ = black_box(re.split(input).collect::<Vec<&str>>());
+
+    let (limit, replace) = *replacen;
+    _ = black_box(re.replacen(input, limit, replace));
+
+    _ = black_box(re.find(input));
+    _ = black_box(re.shortest_match(input));
+    Ok(())
+}
+
+/// Compiles `pattern` as a `regex::bytes::Regex` and runs its search APIs on
+/// `input`.
+///
+/// `replacen` is the `(limit, replacement)` pair handed to
+/// `regex::bytes::Regex::replacen`. Every result is routed through
+/// `black_box` and then discarded; the calls are made only to exercise the
+/// matching code.
+///
+/// # Errors
+///
+/// Returns an error if the pattern cannot be printed to a string or if the
+/// resulting string fails to compile as a bytes regex.
+fn fuzz_regex_bytes(
+    pattern: &Pattern,
+    input: &[u8],
+    replacen: &(usize, &[u8]),
+) -> Result<(), Box<dyn std::error::Error>> {
+    // The conversion consumes its argument, hence the clone of the borrowed
+    // pattern.
+    let re = regex::bytes::Regex::new(&String::try_from(pattern.clone())?)?;
+    // `input` is already a `&[u8]`, so it is passed as-is instead of being
+    // borrowed a second time.
+    _ = black_box(re.is_match(input));
+    _ = black_box(
+        re.captures_iter(input).collect::<Vec<regex::bytes::Captures>>(),
+    );
+    _ = black_box(re.split(input).collect::<Vec<&[u8]>>());
+
+    let (limit, replace) = *replacen;
+    _ = black_box(re.replacen(input, limit, replace));
+
+    _ = black_box(re.find(input));
+    _ = black_box(re.shortest_match(input));
+    Ok(())
+}
+
+/// Builds a `regex::RegexSet` from `pattern_set` and matches it against
+/// `input`.
+///
+/// Patterns whose conversion to a string fails are left out of the set
+/// rather than aborting the run. The match results are routed through
+/// `black_box` and then discarded.
+///
+/// # Errors
+///
+/// Returns an error if `RegexSet::new` rejects the pattern strings.
+fn fuzz_regex_set(
+    pattern_set: &[Pattern],
+    input: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let set = regex::RegexSet::new(
+        pattern_set
+            .iter()
+            // The conversion consumes its argument, so each borrowed pattern
+            // is cloned first.
+            .filter_map(|x| String::try_from(x.clone()).ok()),
+    )?;
+    _ = black_box(set.is_match(input));
+    _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
+    Ok(())
+}
+
+/// Builds a `regex::bytes::RegexSet` from `pattern_set` and matches it
+/// against `input`.
+///
+/// Patterns whose conversion to a string fails are left out of the set
+/// rather than aborting the run. The match results are routed through
+/// `black_box` and then discarded.
+///
+/// # Errors
+///
+/// Returns an error if `regex::bytes::RegexSet::new` rejects the pattern
+/// strings.
+fn fuzz_regex_set_bytes(
+    pattern_set: &[Pattern],
+    input: &[u8],
+) -> Result<(), Box<dyn std::error::Error>> {
+    let set = regex::bytes::RegexSet::new(
+        pattern_set
+            .iter()
+            // The conversion consumes its argument, so each borrowed pattern
+            // is cloned first.
+            .filter_map(|x| String::try_from(x.clone()).ok()),
+    )?;
+    _ = black_box(set.is_match(input));
+    _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
+    Ok(())
+}
+
+fuzz_target!(|data: Data| {
+    // Skip oversized cases up front: more than 10 set patterns, a
+    // replacement longer than 100, or any input longer than 500.
+    let oversized = data.pattern_set.len() > 10
+        || data.replacen.1.len() > 100
+        || data.replacen_bytes.1.len() > 100
+        || data.set_input.len() > 500
+        || data.set_input_bytes.len() > 500
+        || data.input_bytes.len() > 500
+        || data.input.len() > 500;
+    if oversized {
+        return;
+    }
+
+    // Each harness reports failures as a value; the error is only formatted
+    // and passed to `black_box`, never propagated.
+    let single =
+        black_box(fuzz_regex(&data.pattern, &data.input, &data.replacen));
+    if let Err(err) = single {
+        black_box(format!("{err:?}"));
+    }
+
+    let single_bytes = black_box(fuzz_regex_bytes(
+        &data.pattern,
+        &data.input_bytes,
+        &data.replacen_bytes,
+    ));
+    if let Err(err) = single_bytes {
+        black_box(format!("{err:?}"));
+    }
+
+    let set = black_box(fuzz_regex_set(&data.pattern_set, &data.set_input));
+    if let Err(err) = set {
+        black_box(format!("{err:?}"));
+    }
+
+    let set_bytes = black_box(fuzz_regex_set_bytes(
+        &data.pattern_set,
+        &data.set_input_bytes,
+    ));
+    if let Err(err) = set_bytes {
+        black_box(format!("{err:?}"));
+    }
+});
diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml
@@ -14,20 +14,16 @@ edition = "2018"
 # https://docs.rs/regex-syntax/*/#crate-features
 [features]
 default = ["unicode"]
+arbitrary = ["dep:arbitrary"]
 
-unicode = [
-  "unicode-age",
-  "unicode-bool",
-  "unicode-case",
-  "unicode-gencat",
-  "unicode-perl",
-  "unicode-script",
-  "unicode-segment",
-]
+unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"]
 unicode-age = []
 unicode-bool = []
 unicode-case = []
 unicode-gencat = []
 unicode-perl = []
 unicode-script = []
 unicode-segment = []
+
+[dependencies]
+arbitrary = { version = "1.2.3", features = ["derive"], optional = true }
diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs
@@ -5,6 +5,8 @@ use std::slice;
 use std::u8;
 
 use crate::unicode;
+#[cfg(feature = "arbitrary")]
+use arbitrary::Arbitrary;
 
 // This module contains an *internal* implementation of interval sets.
 //
@@ -33,6 +35,7 @@ use crate::unicode;
 // Tests on this are relegated to the public API of HIR in src/hir.rs.
 
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct IntervalSet<I> {
     ranges: Vec<I>,
 }

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -8,6 +8,9 @@ use std::fmt;
 use std::result;
 use std::u8;
 
+#[cfg(feature = "arbitrary")]
+use arbitrary::Arbitrary;
+
 use crate::ast::Span;
 use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
 use crate::unicode;
@@ -172,6 +175,7 @@ impl fmt::Display for ErrorKind {
 /// expression pattern string, and uses constant stack space and heap space
 /// proportional to the size of the `Hir`.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct Hir {
     /// The underlying HIR kind.
     kind: HirKind,
@@ -181,6 +185,7 @@ pub struct Hir {
 
 /// The kind of an arbitrary `Hir` expression.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum HirKind {
     /// The empty regular expression, which matches everything, including the
     /// empty string.
@@ -744,6 +749,7 @@ impl fmt::Display for Hir {
 /// are preferred whenever possible. In particular, a `Byte` variant is only
 /// ever produced when it could match invalid UTF-8.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum Literal {
     /// A single character represented by a Unicode scalar value.
     Unicode(char),
@@ -780,6 +786,7 @@ impl Literal {
 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
 /// match the same set of strings.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum Class {
     /// A set of characters represented by Unicode scalar values.
     Unicode(ClassUnicode),
@@ -834,6 +841,7 @@ impl Class {
 
 /// A set of characters represented by Unicode scalar values.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct ClassUnicode {
     set: IntervalSet<ClassUnicodeRange>,
 }
@@ -970,6 +978,7 @@ impl<'a> Iterator for ClassUnicodeIter<'a> {
 /// The range is closed. That is, the start and end of the range are included
 /// in the range.
 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct ClassUnicodeRange {
     start: char,
     end: char,
@@ -1077,6 +1086,7 @@ impl ClassUnicodeRange {
 /// A set of characters represented by arbitrary bytes (where one byte
 /// corresponds to one character).
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct ClassBytes {
     set: IntervalSet<ClassBytesRange>,
 }
@@ -1187,6 +1197,7 @@ impl<'a> Iterator for ClassBytesIter<'a> {
 /// The range is closed. That is, the start and end of the range are included
 /// in the range.
 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct ClassBytesRange {
     start: u8,
     end: u8,
@@ -1282,6 +1293,7 @@ impl fmt::Debug for ClassBytesRange {
 ///
 /// A matching anchor assertion is always zero-length.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum Anchor {
     /// Match the beginning of a line or the beginning of text. Specifically,
     /// this matches at the starting position of the input, or at the position
@@ -1303,6 +1315,7 @@ pub enum Anchor {
 ///
 /// A matching word boundary assertion is always zero-length.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum WordBoundary {
     /// Match a Unicode-aware word boundary. That is, this matches a position
     /// where the left adjacent character and right adjacent character
@@ -1336,6 +1349,7 @@ impl WordBoundary {
 /// 2. A capturing group (e.g., `(expr)`).
 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct Group {
     /// The kind of this group. If it is a capturing group, then the kind
     /// contains the capture group index (and the name, if it is a named
@@ -1347,6 +1361,7 @@ pub struct Group {
 
 /// The kind of group.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum GroupKind {
     /// A normal unnamed capturing group.
     ///
@@ -1368,6 +1383,7 @@ pub enum GroupKind {
 /// A repetition operator permits the repetition of an arbitrary
 /// sub-expression.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub struct Repetition {
     /// The kind of this repetition operator.
     pub kind: RepetitionKind,
@@ -1407,6 +1423,7 @@ impl Repetition {
 
 /// The kind of a repetition operator.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum RepetitionKind {
     /// Matches a sub-expression zero or one times.
     ZeroOrOne,
@@ -1420,6 +1437,7 @@ pub enum RepetitionKind {
 
 /// The kind of a counted repetition operator.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 pub enum RepetitionRange {
     /// Matches a sub-expression exactly this many times.
     Exactly(u32),
@@ -1477,6 +1495,7 @@ impl Drop for Hir {
 ///
 /// These attributes are typically defined inductively on the HIR.
 #[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
 struct HirInfo {
     /// Represent yes/no questions by a bitfield to conserve space, since
     /// this is included in every HIR expression.