Skip to content

Commit

Permalink
fuzz: Add a roundtrip regex fuzz harness
Browse files Browse the repository at this point in the history
This change adds an optional dependency on 'arbitrary' for
regex-syntax. This allows us to generate arbitrary high-level
intermediate representations (HIR). Each generated HIR is printed
back to a regex string, which is then used to exercise the regex
matching code under src. Using this approach we can generate
arbitrary well-formed regex strings, allowing the fuzzer to
penetrate deeper into the regex code.
  • Loading branch information
nathaniel-brough committed Feb 20, 2023
1 parent a9b2e02 commit b48c57b
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 11 deletions.
13 changes: 11 additions & 2 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
cargo-features = ['named-profiles']

[package]
name = "regex-fuzz"
version = "0.0.0"
Expand All @@ -11,11 +9,16 @@ edition = "2018"
cargo-fuzz = true

[dependencies]
arbitrary = { version = "1.2.3", features = ["derive"] }
libfuzzer-sys = "0.4.1"

[dependencies.regex]
path = ".."

[dependencies.regex-syntax]
path = "../regex-syntax"
features = ["arbitrary"]

# Prevent this from interfering with workspaces
[workspace]
members = ["."]
Expand All @@ -24,6 +27,12 @@ members = ["."]
name = "fuzz_regex_match"
path = "fuzz_targets/fuzz_regex_match.rs"

[[bin]]
name = "fuzz_regex"
path = "fuzz_targets/fuzz_regex.rs"
test = false
doc = false

[profile.release]
opt-level = 3
debug = true
Expand Down
161 changes: 161 additions & 0 deletions fuzz/fuzz_targets/fuzz_regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#![no_main]

use arbitrary::Arbitrary;
use libfuzzer_sys::fuzz_target;
use regex_syntax::hir::print::Printer;
use regex_syntax::hir::Hir;
use std::{convert::TryFrom, hint::black_box};

/// Source of a regex pattern fed to the fuzz harness.
///
/// A `Pattern` is turned into the pattern string handed to the regex
/// constructors through the `TryFrom<Pattern> for String` impl in this file.
#[derive(Arbitrary, Debug, Clone)]
enum Pattern {
/// A generated `Hir`; it is rendered to a pattern string with the HIR
/// `Printer` before being compiled.
WellFormed(Hir),
/// A generated string that is used verbatim as the pattern.
Random(String),
}

/// Renders a [`Pattern`] into the pattern string given to the regex
/// constructors.
impl TryFrom<Pattern> for String {
    /// Formatting error reported by the HIR printer.
    type Error = std::fmt::Error;

    /// Converts `pattern` into a pattern string.
    ///
    /// A `Pattern::Random` string is returned unchanged. A
    /// `Pattern::WellFormed` HIR is printed into a fresh `String`.
    ///
    /// # Errors
    ///
    /// Returns the `std::fmt::Error` produced by `Printer::print` if printing
    /// the HIR fails.
    fn try_from(pattern: Pattern) -> Result<Self, Self::Error> {
        // Raw strings need no processing; only HIR values go through the
        // printer below.
        let hir = match pattern {
            Pattern::Random(raw) => return Ok(raw),
            Pattern::WellFormed(hir) => hir,
        };

        let mut rendered = String::new();
        Printer::new().print(&hir, &mut rendered)?;
        Ok(rendered)
    }
}

/// Structured fuzz input consumed by the `fuzz_target!` entry point.
///
/// The borrowed fields use the lifetime `'a`.
// NOTE(review): presumably `'a` ties the borrowed fields to the raw fuzzer
// input via `Arbitrary` — confirm against the `arbitrary` crate docs.
#[derive(Arbitrary, Debug)]
struct Data<'a> {
/// Pattern compiled by both `fuzz_regex` and `fuzz_regex_bytes`.
pattern: Pattern,
/// `(limit, replacement)` passed to `replacen` on the string regex.
replacen: (usize, &'a str),
/// `(limit, replacement)` passed to `replacen` on the bytes regex.
replacen_bytes: (usize, &'a [u8]),
/// Haystack searched by the string regex.
input: &'a str,
/// Haystack searched by the bytes regex.
input_bytes: &'a [u8],
/// Patterns used to build both the string and the bytes regex sets.
pattern_set: Vec<Pattern>,
/// Haystack searched by the string regex set.
set_input: &'a str,
/// Haystack searched by the bytes regex set.
set_input_bytes: &'a [u8],
}

/// Compiles `pattern` as a `regex::Regex` and exercises its search APIs on
/// `input`.
///
/// The calls made, in order, are `is_match`, `captures_iter`, `split`,
/// `replacen` (with the `(limit, replacement)` pair in `replacen`), `find`
/// and `shortest_match`. Every result is passed through `black_box` and then
/// discarded.
///
/// # Errors
///
/// Returns an error if `pattern` cannot be rendered to a string or if the
/// rendered string is rejected by `regex::Regex::new`.
fn fuzz_regex(
    pattern: &Pattern,
    input: &str,
    replacen: &(usize, &str),
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, hence the clone.
    let source = String::try_from(pattern.clone())?;
    let re = regex::Regex::new(&source)?;
    let &(limit, replacement) = replacen;

    let _ = black_box(re.is_match(input));
    let _ = black_box(re.captures_iter(input).collect::<Vec<regex::Captures>>());
    let _ = black_box(re.split(input).collect::<Vec<&str>>());
    let _ = black_box(re.replacen(input, limit, replacement));
    let _ = black_box(re.find(input));
    let _ = black_box(re.shortest_match(input));
    Ok(())
}

/// Compiles `pattern` as a `regex::bytes::Regex` and exercises its search
/// APIs on `input`.
///
/// The calls made, in order, are `is_match`, `captures_iter`, `split`,
/// `replacen` (with the `(limit, replacement)` pair in `replacen`), `find`
/// and `shortest_match`. Every result is passed through `black_box` and then
/// discarded.
///
/// # Errors
///
/// Returns an error if `pattern` cannot be rendered to a string or if the
/// rendered string is rejected by `regex::bytes::Regex::new`.
fn fuzz_regex_bytes(
    pattern: &Pattern,
    input: &[u8],
    replacen: &(usize, &[u8]),
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, hence the clone.
    let source = String::try_from(pattern.clone())?;
    let re = regex::bytes::Regex::new(&source)?;
    let &(limit, replacement) = replacen;

    let _ = black_box(re.is_match(input));
    let _ = black_box(
        re.captures_iter(input).collect::<Vec<regex::bytes::Captures>>(),
    );
    let _ = black_box(re.split(input).collect::<Vec<&[u8]>>());
    let _ = black_box(re.replacen(input, limit, replacement));
    let _ = black_box(re.find(input));
    let _ = black_box(re.shortest_match(input));
    Ok(())
}

/// Builds a `regex::RegexSet` from `pattern_set` and exercises it on `input`.
///
/// Patterns that cannot be rendered to a string are skipped; the set is built
/// from the remaining ones. `is_match` and `matches` are then called, and
/// their results are passed through `black_box` and discarded.
///
/// `pattern_set` is taken as a slice; callers holding a `Vec<Pattern>` can
/// still pass `&vec` unchanged.
///
/// # Errors
///
/// Returns an error if `regex::RegexSet::new` rejects the rendered patterns.
fn fuzz_regex_set(
    pattern_set: &[Pattern],
    input: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, so each pattern is cloned first.
    // A failed conversion drops that pattern instead of aborting the run.
    let sources = pattern_set
        .iter()
        .cloned()
        .filter_map(|pattern| String::try_from(pattern).ok());
    let set = regex::RegexSet::new(sources)?;

    let _ = black_box(set.is_match(input));
    let _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
    Ok(())
}

/// Builds a `regex::bytes::RegexSet` from `pattern_set` and exercises it on
/// `input`.
///
/// Patterns that cannot be rendered to a string are skipped; the set is built
/// from the remaining ones. `is_match` and `matches` are then called, and
/// their results are passed through `black_box` and discarded.
///
/// `pattern_set` is taken as a slice; callers holding a `Vec<Pattern>` can
/// still pass `&vec` unchanged.
///
/// # Errors
///
/// Returns an error if `regex::bytes::RegexSet::new` rejects the rendered
/// patterns.
fn fuzz_regex_set_bytes(
    pattern_set: &[Pattern],
    input: &[u8],
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, so each pattern is cloned first.
    // A failed conversion drops that pattern instead of aborting the run.
    let sources = pattern_set
        .iter()
        .cloned()
        .filter_map(|pattern| String::try_from(pattern).ok());
    let set = regex::bytes::RegexSet::new(sources)?;

    let _ = black_box(set.is_match(input));
    let _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
    Ok(())
}

// Fuzz entry point: rejects oversized inputs, then runs the four harnesses
// (string regex, bytes regex, string set, bytes set) in that order.
fuzz_target!(|data: Data| {
    // Size caps applied before any regex work is done.
    // NOTE(review): presumably these keep individual fuzz iterations cheap —
    // confirm the intended rationale.
    const MAX_PATTERNS: usize = 10;
    const MAX_REPLACEMENT_LEN: usize = 100;
    const MAX_INPUT_LEN: usize = 500;

    let replacement_too_long = data.replacen.1.len() > MAX_REPLACEMENT_LEN
        || data.replacen_bytes.1.len() > MAX_REPLACEMENT_LEN;
    let input_too_long = data.set_input.len() > MAX_INPUT_LEN
        || data.set_input_bytes.len() > MAX_INPUT_LEN
        || data.input_bytes.len() > MAX_INPUT_LEN
        || data.input.len() > MAX_INPUT_LEN;
    if data.pattern_set.len() > MAX_PATTERNS
        || replacement_too_long
        || input_too_long
    {
        return;
    }

    // An error from a harness is not a failure of the fuzz run; it is only
    // debug-formatted so that the error's `Debug` output is exercised too.
    let report = |outcome: Result<(), Box<dyn std::error::Error>>| {
        if let Err(e) = black_box(outcome) {
            black_box(format!("{e:?}"));
        }
    };

    report(fuzz_regex(&data.pattern, data.input, &data.replacen));
    report(fuzz_regex_bytes(
        &data.pattern,
        data.input_bytes,
        &data.replacen_bytes,
    ));
    report(fuzz_regex_set(&data.pattern_set, data.set_input));
    report(fuzz_regex_set_bytes(&data.pattern_set, data.set_input_bytes));
});
14 changes: 5 additions & 9 deletions regex-syntax/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,16 @@ edition = "2018"
# https://docs.rs/regex-syntax/*/#crate-features
[features]
default = ["unicode"]
arbitrary = ["dep:arbitrary"]

unicode = [
"unicode-age",
"unicode-bool",
"unicode-case",
"unicode-gencat",
"unicode-perl",
"unicode-script",
"unicode-segment",
]
unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"]
unicode-age = []
unicode-bool = []
unicode-case = []
unicode-gencat = []
unicode-perl = []
unicode-script = []
unicode-segment = []

[dependencies]
arbitrary = { version = "1.2.3", features = ["derive"], optional = true }
3 changes: 3 additions & 0 deletions regex-syntax/src/hir/interval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use std::slice;
use std::u8;

use crate::unicode;
#[cfg(feature = "arbitrary")]
use arbitrary::Arbitrary;

// This module contains an *internal* implementation of interval sets.
//
Expand Down Expand Up @@ -33,6 +35,7 @@ use crate::unicode;
// Tests on this are relegated to the public API of HIR in src/hir.rs.

/// A set of intervals of type `I`, held as a vector of ranges.
///
/// With the `arbitrary` feature enabled, this type also derives `Arbitrary`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct IntervalSet<I> {
/// The intervals that make up this set.
ranges: Vec<I>,
}
Expand Down
19 changes: 19 additions & 0 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ use std::fmt;
use std::result;
use std::u8;

#[cfg(feature = "arbitrary")]
use arbitrary::Arbitrary;

use crate::ast::Span;
use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
use crate::unicode;
Expand Down Expand Up @@ -172,6 +175,7 @@ impl fmt::Display for ErrorKind {
/// expression pattern string, and uses constant stack space and heap space
/// proportional to the size of the `Hir`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Hir {
/// The underlying HIR kind.
kind: HirKind,
Expand All @@ -181,6 +185,7 @@ pub struct Hir {

/// The kind of an arbitrary `Hir` expression.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum HirKind {
/// The empty regular expression, which matches everything, including the
/// empty string.
Expand Down Expand Up @@ -744,6 +749,7 @@ impl fmt::Display for Hir {
/// are preferred whenever possible. In particular, a `Byte` variant is only
/// ever produced when it could match invalid UTF-8.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Literal {
/// A single character represented by a Unicode scalar value.
Unicode(char),
Expand Down Expand Up @@ -780,6 +786,7 @@ impl Literal {
/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
/// match the same set of strings.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Class {
/// A set of characters represented by Unicode scalar values.
Unicode(ClassUnicode),
Expand Down Expand Up @@ -834,6 +841,7 @@ impl Class {

/// A set of characters represented by Unicode scalar values.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassUnicode {
/// The Unicode scalar value ranges in this class, held as an interval set.
set: IntervalSet<ClassUnicodeRange>,
}
Expand Down Expand Up @@ -970,6 +978,7 @@ impl<'a> Iterator for ClassUnicodeIter<'a> {
/// The range is closed. That is, the start and end of the range are included
/// in the range.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassUnicodeRange {
/// Start of the range; the range is closed, so this value is included.
start: char,
/// End of the range; the range is closed, so this value is included.
end: char,
}
Expand Down Expand Up @@ -1077,6 +1086,7 @@ impl ClassUnicodeRange {
/// A set of characters represented by arbitrary bytes (where one byte
/// corresponds to one character).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassBytes {
/// The byte ranges in this class, held as an interval set.
set: IntervalSet<ClassBytesRange>,
}
Expand Down Expand Up @@ -1187,6 +1197,7 @@ impl<'a> Iterator for ClassBytesIter<'a> {
/// The range is closed. That is, the start and end of the range are included
/// in the range.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassBytesRange {
/// Start of the range; the range is closed, so this value is included.
start: u8,
/// End of the range; the range is closed, so this value is included.
end: u8,
}
Expand Down Expand Up @@ -1282,6 +1293,7 @@ impl fmt::Debug for ClassBytesRange {
///
/// A matching anchor assertion is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Anchor {
/// Match the beginning of a line or the beginning of text. Specifically,
/// this matches at the starting position of the input, or at the position
Expand All @@ -1303,6 +1315,7 @@ pub enum Anchor {
///
/// A matching word boundary assertion is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum WordBoundary {
/// Match a Unicode-aware word boundary. That is, this matches a position
/// where the left adjacent character and right adjacent character
Expand Down Expand Up @@ -1336,6 +1349,7 @@ impl WordBoundary {
/// 2. A capturing group (e.g., `(expr)`).
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Group {
/// The kind of this group. If it is a capturing group, then the kind
/// contains the capture group index (and the name, if it is a named
Expand All @@ -1347,6 +1361,7 @@ pub struct Group {

/// The kind of group.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum GroupKind {
/// A normal unnamed capturing group.
///
Expand All @@ -1368,6 +1383,7 @@ pub enum GroupKind {
/// A repetition operator permits the repetition of an arbitrary
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Repetition {
/// The kind of this repetition operator.
pub kind: RepetitionKind,
Expand Down Expand Up @@ -1407,6 +1423,7 @@ impl Repetition {

/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum RepetitionKind {
/// Matches a sub-expression zero or one times.
ZeroOrOne,
Expand All @@ -1420,6 +1437,7 @@ pub enum RepetitionKind {

/// The kind of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum RepetitionRange {
/// Matches a sub-expression exactly this many times.
Exactly(u32),
Expand Down Expand Up @@ -1477,6 +1495,7 @@ impl Drop for Hir {
///
/// These attributes are typically defined inductively on the HIR.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
struct HirInfo {
/// Represent yes/no questions by a bitfield to conserve space, since
/// this is included in every HIR expression.
Expand Down

0 comments on commit b48c57b

Please sign in to comment.