Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fuzz: Add a roundtrip regex fuzz harness #959

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
cargo-features = ['named-profiles']

[package]
name = "regex-fuzz"
version = "0.0.0"
Expand All @@ -11,11 +9,16 @@ edition = "2018"
cargo-fuzz = true

[dependencies]
arbitrary = { version = "1.2.3", features = ["derive"] }
libfuzzer-sys = "0.4.1"

[dependencies.regex]
path = ".."

[dependencies.regex-syntax]
path = "../regex-syntax"
features = ["arbitrary"]

# Prevent this from interfering with workspaces
[workspace]
members = ["."]
Expand All @@ -24,6 +27,12 @@ members = ["."]
name = "fuzz_regex_match"
path = "fuzz_targets/fuzz_regex_match.rs"

[[bin]]
name = "fuzz_regex"
path = "fuzz_targets/fuzz_regex.rs"
test = false
doc = false

[profile.release]
opt-level = 3
debug = true
Expand Down
161 changes: 161 additions & 0 deletions fuzz/fuzz_targets/fuzz_regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#![no_main]

use arbitrary::Arbitrary;
use libfuzzer_sys::fuzz_target;
use regex_syntax::hir::print::Printer;
use regex_syntax::hir::Hir;
use std::{convert::TryFrom, hint::black_box};

/// Source of a regex pattern string for this fuzz target.
///
/// A value is turned into pattern text by the `TryFrom<Pattern> for String`
/// impl in this file.
#[derive(Arbitrary, Debug, Clone)]
enum Pattern {
/// A structured `Hir` value that is rendered to pattern text with the
/// `regex_syntax` HIR `Printer`.
WellFormed(Hir),
/// A string that is used as the pattern text unchanged.
Random(String),
}

/// Converts a [`Pattern`] into the pattern text that is handed to the regex
/// constructors.
impl TryFrom<Pattern> for String {
    /// Formatting error surfaced by the HIR printer.
    type Error = std::fmt::Error;

    /// Produces the pattern text for `pattern`.
    ///
    /// * `Pattern::WellFormed` is printed through `Printer::print` into a
    ///   fresh `String`.
    /// * `Pattern::Random` hands back its string as-is.
    ///
    /// # Errors
    ///
    /// Returns the `std::fmt::Error` reported by `Printer::print`; the
    /// `Random` arm never fails.
    fn try_from(pattern: Pattern) -> Result<Self, Self::Error> {
        // The match is the tail expression, so no explicit `return` is needed.
        match pattern {
            Pattern::WellFormed(hir) => {
                let mut printer = Printer::new();
                let mut dst = String::new();
                printer.print(&hir, &mut dst)?;
                Ok(dst)
            }
            Pattern::Random(s) => Ok(s),
        }
    }
}

/// Structured input consumed by the `fuzz_target!` closure in this file.
///
/// The closure returns early, without running any harness, when one of the
/// size limits mentioned on the fields below is exceeded.
#[derive(Arbitrary, Debug)]
struct Data<'a> {
/// Pattern compiled by both `fuzz_regex` and `fuzz_regex_bytes`.
pattern: Pattern,
/// `(limit, replacement)` forwarded to `replacen` in `fuzz_regex`; inputs
/// whose replacement is longer than 100 bytes are skipped.
replacen: (usize, &'a str),
/// `(limit, replacement)` forwarded to `replacen` in `fuzz_regex_bytes`;
/// inputs whose replacement is longer than 100 bytes are skipped.
replacen_bytes: (usize, &'a [u8]),
/// Haystack for `fuzz_regex`; skipped when longer than 500 bytes.
input: &'a str,
/// Haystack for `fuzz_regex_bytes`; skipped when longer than 500 bytes.
input_bytes: &'a [u8],
/// Patterns compiled together by `fuzz_regex_set` and
/// `fuzz_regex_set_bytes`; skipped when there are more than 10 entries.
pattern_set: Vec<Pattern>,
/// Haystack for `fuzz_regex_set`; skipped when longer than 500 bytes.
set_input: &'a str,
/// Haystack for `fuzz_regex_set_bytes`; skipped when longer than 500 bytes.
set_input_bytes: &'a [u8],
}

/// Compiles `pattern` as a `regex::Regex` and runs its matching APIs on
/// `input`.
///
/// The calls made are `is_match`, `captures_iter`, `split`, `replacen`,
/// `find` and `shortest_match`. Every result is passed through `black_box`
/// and then discarded; the function only reports whether setup succeeded.
///
/// `replacen` is a `(limit, replacement)` pair forwarded to
/// `Regex::replacen`.
///
/// # Errors
///
/// Returns an error when the pattern cannot be rendered to a string or when
/// `regex::Regex::new` rejects it.
fn fuzz_regex(
    pattern: &Pattern,
    input: &str,
    replacen: &(usize, &str),
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, hence the clone of the borrowed pattern.
    let re = regex::Regex::new(&String::try_from(pattern.clone())?)?;

    // `input` is already a `&str`, so it is passed directly instead of being
    // re-borrowed as `&&str`.
    _ = black_box(re.is_match(input));
    _ = black_box(re.captures_iter(input).collect::<Vec<regex::Captures>>());
    _ = black_box(re.split(input).collect::<Vec<&str>>());

    let (limit, replace) = *replacen;
    _ = black_box(re.replacen(input, limit, replace));

    _ = black_box(re.find(input));
    _ = black_box(re.shortest_match(input));
    Ok(())
}

/// Compiles `pattern` as a `regex::bytes::Regex` and runs its matching APIs
/// on `input`.
///
/// The calls made are `is_match`, `captures_iter`, `split`, `replacen`,
/// `find` and `shortest_match`. Every result is passed through `black_box`
/// and then discarded; the function only reports whether setup succeeded.
///
/// `replacen` is a `(limit, replacement)` pair forwarded to
/// `regex::bytes::Regex::replacen`.
///
/// # Errors
///
/// Returns an error when the pattern cannot be rendered to a string or when
/// `regex::bytes::Regex::new` rejects it.
fn fuzz_regex_bytes(
    pattern: &Pattern,
    input: &[u8],
    replacen: &(usize, &[u8]),
) -> Result<(), Box<dyn std::error::Error>> {
    // The conversion consumes its argument, hence the clone of the borrowed pattern.
    let re = regex::bytes::Regex::new(&String::try_from(pattern.clone())?)?;

    // `input` is already a `&[u8]`, so it is passed directly instead of being
    // re-borrowed as `&&[u8]`.
    _ = black_box(re.is_match(input));
    _ = black_box(
        re.captures_iter(input).collect::<Vec<regex::bytes::Captures>>(),
    );
    _ = black_box(re.split(input).collect::<Vec<&[u8]>>());

    let (limit, replace) = *replacen;
    _ = black_box(re.replacen(input, limit, replace));

    _ = black_box(re.find(input));
    _ = black_box(re.shortest_match(input));
    Ok(())
}

/// Builds a `regex::RegexSet` from `pattern_set` and runs `is_match` and
/// `matches` against `input`.
///
/// Patterns that fail to render to a string are left out of the set rather
/// than aborting the run. The parameter is a slice, so callers holding a
/// `&Vec<Pattern>` can keep passing it unchanged.
///
/// # Errors
///
/// Returns the error from `regex::RegexSet::new` when the set cannot be
/// compiled.
fn fuzz_regex_set(
    pattern_set: &[Pattern],
    input: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    let set = regex::RegexSet::new(
        pattern_set
            .iter()
            // The conversion consumes its argument, so each pattern is cloned.
            .filter_map(|x| String::try_from(x.clone()).ok()),
    )?;
    _ = black_box(set.is_match(input));
    _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
    Ok(())
}

/// Builds a `regex::bytes::RegexSet` from `pattern_set` and runs `is_match`
/// and `matches` against `input`.
///
/// Patterns that fail to render to a string are left out of the set rather
/// than aborting the run. The parameter is a slice, so callers holding a
/// `&Vec<Pattern>` can keep passing it unchanged.
///
/// # Errors
///
/// Returns the error from `regex::bytes::RegexSet::new` when the set cannot
/// be compiled.
fn fuzz_regex_set_bytes(
    pattern_set: &[Pattern],
    input: &[u8],
) -> Result<(), Box<dyn std::error::Error>> {
    let set = regex::bytes::RegexSet::new(
        pattern_set
            .iter()
            // The conversion consumes its argument, so each pattern is cloned.
            .filter_map(|x| String::try_from(x.clone()).ok()),
    )?;
    _ = black_box(set.is_match(input));
    _ = black_box(set.matches(input).into_iter().collect::<Vec<_>>());
    Ok(())
}

fuzz_target!(|data: Data| {
    /// Passes one harness outcome through `black_box`; an error is also
    /// rendered with `Debug` so that its formatting code is exercised.
    fn observe(outcome: Result<(), Box<dyn std::error::Error>>) {
        if let Err(e) = black_box(outcome) {
            black_box(format!("{e:?}"));
        }
    }

    // Reject inputs that exceed any of the size caps before doing real work.
    let oversized = data.pattern_set.len() > 10
        || data.replacen.1.len() > 100
        || data.replacen_bytes.1.len() > 100
        || data.set_input.len() > 500
        || data.set_input_bytes.len() > 500
        || data.input_bytes.len() > 500
        || data.input.len() > 500;
    if oversized {
        return;
    }

    // Single-pattern harnesses: text first, then bytes.
    observe(fuzz_regex(&data.pattern, data.input, &data.replacen));
    observe(fuzz_regex_bytes(
        &data.pattern,
        data.input_bytes,
        &data.replacen_bytes,
    ));

    // Pattern-set harnesses: text first, then bytes.
    observe(fuzz_regex_set(&data.pattern_set, data.set_input));
    observe(fuzz_regex_set_bytes(
        &data.pattern_set,
        data.set_input_bytes,
    ));
});
13 changes: 4 additions & 9 deletions regex-syntax/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,14 @@ edition = "2018"
[features]
default = ["unicode"]

unicode = [
"unicode-age",
"unicode-bool",
"unicode-case",
"unicode-gencat",
"unicode-perl",
"unicode-script",
"unicode-segment",
]
unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"]
unicode-age = []
unicode-bool = []
unicode-case = []
unicode-gencat = []
unicode-perl = []
unicode-script = []
unicode-segment = []

[dependencies]
arbitrary = { version = "1.2.3", features = ["derive"], optional = true }
3 changes: 3 additions & 0 deletions regex-syntax/src/hir/interval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use std::slice;
use std::u8;

use crate::unicode;
#[cfg(feature = "arbitrary")]
use arbitrary::Arbitrary;

// This module contains an *internal* implementation of interval sets.
//
Expand Down Expand Up @@ -33,6 +35,7 @@ use crate::unicode;
// Tests on this are relegated to the public API of HIR in src/hir.rs.

#[derive(Clone, Debug, Eq, PartialEq)]
// NOTE(review): with the `arbitrary` feature enabled, the derived impl fills
// `ranges` straight from fuzzer input instead of going through this type's
// own methods — confirm that no ordering/canonical-form assumption made by
// the rest of the module is bypassed.
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct IntervalSet<I> {
/// The intervals held by this set.
ranges: Vec<I>,
}
Expand Down
19 changes: 19 additions & 0 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ use std::fmt;
use std::result;
use std::u8;

#[cfg(feature = "arbitrary")]
use arbitrary::Arbitrary;

use crate::ast::Span;
use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
use crate::unicode;
Expand Down Expand Up @@ -172,6 +175,7 @@ impl fmt::Display for ErrorKind {
/// expression pattern string, and uses constant stack space and heap space
/// proportional to the size of the `Hir`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Hir {
/// The underlying HIR kind.
kind: HirKind,
Expand All @@ -181,6 +185,7 @@ pub struct Hir {

/// The kind of an arbitrary `Hir` expression.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum HirKind {
/// The empty regular expression, which matches everything, including the
/// empty string.
Expand Down Expand Up @@ -744,6 +749,7 @@ impl fmt::Display for Hir {
/// are preferred whenever possible. In particular, a `Byte` variant is only
/// ever produced when it could match invalid UTF-8.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Literal {
/// A single character represented by a Unicode scalar value.
Unicode(char),
Expand Down Expand Up @@ -780,6 +786,7 @@ impl Literal {
/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
/// match the same set of strings.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Class {
/// A set of characters represented by Unicode scalar values.
Unicode(ClassUnicode),
Expand Down Expand Up @@ -834,6 +841,7 @@ impl Class {

/// A set of characters represented by Unicode scalar values.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassUnicode {
/// The Unicode scalar value ranges that make up this class.
set: IntervalSet<ClassUnicodeRange>,
}
Expand Down Expand Up @@ -970,6 +978,7 @@ impl<'a> Iterator for ClassUnicodeIter<'a> {
/// The range is closed. That is, the start and end of the range are included
/// in the range.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
// NOTE(review): the derived `Arbitrary` impl generates `start` and `end`
// independently, so it can yield `start > end` — confirm that consumers of
// this type tolerate such a range.
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassUnicodeRange {
/// First character of the range (inclusive).
start: char,
/// Last character of the range (inclusive).
end: char,
}
Expand Down Expand Up @@ -1077,6 +1086,7 @@ impl ClassUnicodeRange {
/// A set of characters represented by arbitrary bytes (where one byte
/// corresponds to one character).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassBytes {
/// The byte ranges that make up this class.
set: IntervalSet<ClassBytesRange>,
}
Expand Down Expand Up @@ -1187,6 +1197,7 @@ impl<'a> Iterator for ClassBytesIter<'a> {
/// The range is closed. That is, the start and end of the range are included
/// in the range.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
// NOTE(review): the derived `Arbitrary` impl generates `start` and `end`
// independently, so it can yield `start > end` — confirm that consumers of
// this type tolerate such a range.
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct ClassBytesRange {
/// First byte of the range (inclusive).
start: u8,
/// Last byte of the range (inclusive).
end: u8,
}
Expand Down Expand Up @@ -1282,6 +1293,7 @@ impl fmt::Debug for ClassBytesRange {
///
/// A matching anchor assertion is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum Anchor {
/// Match the beginning of a line or the beginning of text. Specifically,
/// this matches at the starting position of the input, or at the position
Expand All @@ -1303,6 +1315,7 @@ pub enum Anchor {
///
/// A matching word boundary assertion is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum WordBoundary {
/// Match a Unicode-aware word boundary. That is, this matches a position
/// where the left adjacent character and right adjacent character
Expand Down Expand Up @@ -1336,6 +1349,7 @@ impl WordBoundary {
/// 2. A capturing group (e.g., `(expr)`).
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Group {
/// The kind of this group. If it is a capturing group, then the kind
/// contains the capture group index (and the name, if it is a named
Expand All @@ -1347,6 +1361,7 @@ pub struct Group {

/// The kind of group.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum GroupKind {
/// A normal unnamed capturing group.
///
Expand All @@ -1368,6 +1383,7 @@ pub enum GroupKind {
/// A repetition operator permits the repetition of an arbitrary
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub struct Repetition {
/// The kind of this repetition operator.
pub kind: RepetitionKind,
Expand Down Expand Up @@ -1407,6 +1423,7 @@ impl Repetition {

/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum RepetitionKind {
/// Matches a sub-expression zero or one times.
ZeroOrOne,
Expand All @@ -1420,6 +1437,7 @@ pub enum RepetitionKind {

/// The kind of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
pub enum RepetitionRange {
/// Matches a sub-expression exactly this many times.
Exactly(u32),
Expand Down Expand Up @@ -1477,6 +1495,7 @@ impl Drop for Hir {
///
/// These attributes are typically defined inductively on the HIR.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
struct HirInfo {
/// Represent yes/no questions by a bitfield to conserve space, since
/// this is included in every HIR expression.
Expand Down