diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 558eab4c53..41040d0833 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -73,6 +73,8 @@ jobs: run: command -v llvm-config-15 && clang-15 -v - name: Add nightly rustfmt and clippy run: rustup toolchain install nightly --component rustfmt --component clippy --component miri --allow-downgrade + - name: Install ucd-generate + run: cargo install -f ucd-generate - uses: actions/checkout@v3 - uses: Swatinem/rust-cache@v2 @@ -135,6 +137,8 @@ jobs: run: command -v llvm-config-15 && clang-15 -v - name: Install cargo-hack run: curl -LsSf https://github.com/taiki-e/cargo-hack/releases/latest/download/cargo-hack-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin + - name: Install ucd-generate + run: cargo install -f ucd-generate - name: Add nightly run: rustup toolchain install nightly --allow-downgrade - uses: actions/checkout@v3 @@ -222,6 +226,8 @@ jobs: - name: Install cxxbridge if: runner.os == 'macOS' run: cargo install cxxbridge-cmd + - name: Install ucd-generate + run: cargo install -f ucd-generate - name: Install python (macOS) # Removing macOS things already installed in CI against failed linking if: runner.os == 'macOS' @@ -384,6 +390,8 @@ jobs: toolchain: stable - name: Add nightly rustfmt and clippy run: rustup toolchain install nightly --component rustfmt --component clippy --allow-downgrade + - name: Install ucd-generate + run: cargo install -f ucd-generate - name: Install deps run: brew install z3 gtk+3 - name: Install cxxbridge @@ -453,6 +461,7 @@ jobs: freebsd-version . 
"$HOME/.cargo/env" rustup toolchain install nightly + cargo install -f ucd-generate export LLVM_CONFIG=/usr/local/bin/llvm-config16 pwd ls -lah diff --git a/fuzzers/baby_fuzzer_unicode/.gitignore b/fuzzers/baby_fuzzer_unicode/.gitignore new file mode 100644 index 0000000000..a977a2ca5b --- /dev/null +++ b/fuzzers/baby_fuzzer_unicode/.gitignore @@ -0,0 +1 @@ +libpng-* \ No newline at end of file diff --git a/fuzzers/baby_fuzzer_unicode/Cargo.toml b/fuzzers/baby_fuzzer_unicode/Cargo.toml new file mode 100644 index 0000000000..ff8bb2d732 --- /dev/null +++ b/fuzzers/baby_fuzzer_unicode/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "baby_fuzzer_unicode" +version = "0.10.0" +authors = ["Andrea Fioraldi ", "Dominik Maier "] +edition = "2021" + +[features] +default = ["std"] +tui = [] +std = [] + +[profile.dev] +panic = "abort" + +[profile.release] +panic = "abort" +lto = true +codegen-units = 1 +opt-level = 3 +debug = true + +[dependencies] +libafl = { path = "../../libafl/", features = ["unicode"] } +libafl_bolts = { path = "../../libafl_bolts/" } diff --git a/fuzzers/baby_fuzzer_unicode/README.md b/fuzzers/baby_fuzzer_unicode/README.md new file mode 100644 index 0000000000..0a01b7ca3f --- /dev/null +++ b/fuzzers/baby_fuzzer_unicode/README.md @@ -0,0 +1,15 @@ +# Baby fuzzer: unicode + +This is a minimalistic example about how to create a libafl based fuzzer. + +It runs on a single core until a crash occurs and then exits. + +The tested program is a simple Rust function without any instrumentation. +For real fuzzing, you will want to add some sort to add coverage or other feedback. + +You can run this example using `cargo run`, and you can enable the TUI feature by running `cargo run --features tui`. + +## Unicode + +This fuzzer uses mutators which preserve unicode properties. For programs which have string-heavy inputs, you may +consider using the same strategy. 
/// Entry point of the unicode baby fuzzer.
///
/// Builds a single-process, in-memory fuzzer around a toy harness, seeds it with one input and
/// runs the fuzzing loop with the unicode-preserving string mutators.
#[allow(clippy::similar_names, clippy::manual_assert)]
pub fn main() {
    // The closure that we want to fuzz
    let mut harness = |input: &BytesInput| {
        let target = input.target_bytes();
        let buf = target.as_slice();
        let goal = b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
        // Count how many leading bytes of the input match `goal`, flagging one slot of the
        // signals map per matching byte so that longer matching prefixes show up as new coverage.
        let mut i = 0;
        for _ in buf.iter().zip(goal).take_while(|(b, c)| b == c) {
            signals_set(i);
            i += 1;
        }
        // The whole goal was matched: simulate a crash.
        if i == goal.len() {
            #[cfg(unix)]
            panic!("Artificial bug triggered =)");

            // On Windows the crash is provoked by a volatile write through a null pointer.
            #[cfg(windows)]
            unsafe {
                write_volatile(0 as *mut u32, 0);
            }
        }
        ExitKind::Ok
    };

    // Create an observation channel using the signals map.
    // `SIGNALS_PTR` is initialised from `SIGNALS.as_mut_ptr()` and the length passed here is the
    // length of that same array.
    let observer = unsafe { StdMapObserver::from_mut_ptr("signals", SIGNALS_PTR, SIGNALS.len()) };

    // Feedback to rate the interestingness of an input
    let mut feedback = MaxMapFeedback::new(&observer);

    // A feedback to choose if an input is a solution or not
    let mut objective = CrashFeedback::new();

    // create a State from scratch
    let mut state = StdState::new(
        // RNG
        StdRand::with_seed(current_nanos()),
        // Corpus that will be evolved, we keep it in memory for performance
        InMemoryCorpus::new(),
        // Corpus in which we store solutions (crashes in this example),
        // on disk so the user can get them after stopping the fuzzer
        OnDiskCorpus::new(PathBuf::from("./crashes")).unwrap(),
        // States of the feedbacks.
        // The feedbacks can report the data that should persist in the State.
        &mut feedback,
        // Same for objective feedbacks
        &mut objective,
    )
    .unwrap();

    // The Monitor trait defines how the fuzzer stats are displayed to the user
    #[cfg(not(feature = "tui"))]
    let mon = SimpleMonitor::new(|s| println!("{s}"));
    #[cfg(feature = "tui")]
    let ui = TuiUI::with_version(String::from("Baby Fuzzer"), String::from("0.0.1"), false);
    #[cfg(feature = "tui")]
    let mon = TuiMonitor::new(ui);

    // The event manager handles the various events generated during the fuzzing loop
    // such as the notification of the addition of a new item to the corpus
    let mut mgr = SimpleEventManager::new(mon);

    // A queue policy to get testcases from the corpus
    let scheduler = QueueScheduler::new();

    // A fuzzer with feedbacks and a corpus scheduler
    let mut fuzzer = StdFuzzer::new(scheduler, feedback, objective);

    // Create the executor for an in-process function with just one observer
    let mut executor = InProcessExecutor::new(
        &mut harness,
        tuple_list!(observer),
        &mut fuzzer,
        &mut state,
        &mut mgr,
    )
    .expect("Failed to create the Executor");

    // Seed the fuzzer by evaluating a single one-byte input (`a`)
    fuzzer
        .evaluate_input(
            &mut state,
            &mut executor,
            &mut mgr,
            BytesInput::new(vec![b'a']),
        )
        .unwrap();

    // Set up a scheduled mutator made of the unicode-preserving string mutators.
    // The subcategory mutator is listed four times and the category mutator once.
    // NOTE(review): presumably the repetition weights the scheduler towards subcategory
    // mutations -- confirm against `StdScheduledMutator`.
    let mutator = StdScheduledMutator::new(tuple_list!(
        StringCategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator
    ));
    // The identification stage is listed before the mutational stage: the mutational stage's
    // input transform reads the `StringIdentificationMetadata` that the first stage attaches
    // to the testcase.
    let mut stages = tuple_list!(
        StringIdentificationStage::new(),
        StdMutationalStage::transforming(mutator)
    );

    fuzzer
        .fuzz_loop(&mut stages, &mut executor, &mut state, &mut mgr)
        .expect("Error in the fuzzing loop");
}
/// Generates `unicode_categories.rs` in `OUT_DIR` from the latest Unicode Character Database.
///
/// The UCD archive is downloaded into `$OUT_DIR/ucd-dir`, unpacked there, and then handed to the
/// external `ucd-generate general-category` tool, whose stdout becomes the generated file.
///
/// # Errors
///
/// Returns an error if `OUT_DIR` is unset, the download fails or answers with a non-success HTTP
/// status, the archive cannot be unpacked, or `ucd-generate` cannot be started or exits with a
/// failure status.
#[cfg(feature = "unicode")]
fn build_unicode_property_map() -> Result<(), Box<dyn Error>> {
    use std::{
        env,
        fs::File,
        io::{BufWriter, Write},
        path::PathBuf,
        process::{Command, Stdio},
    };

    const UCD_URL: &str = "https://www.unicode.org/Public/zipped/latest/UCD.zip";

    let out_dir = PathBuf::from(
        env::var_os("OUT_DIR").ok_or("OUT_DIR is not set; build scripts must be run by cargo")?,
    );
    let ucd_dir = out_dir.join("ucd-dir");
    let generated_file = out_dir.join("unicode_categories.rs");

    std::fs::create_dir_all(&ucd_dir)?;

    let zip_path = ucd_dir.join("ucd.zip");
    {
        let mut ucd_file = BufWriter::new(File::create(&zip_path)?);
        // Reject HTTP error responses up front: otherwise the error page would be written out as
        // `ucd.zip` and only surface later as a confusing "invalid zip archive" failure.
        // `copy_to` streams the body to disk instead of buffering the whole archive in memory.
        reqwest::blocking::get(UCD_URL)?
            .error_for_status()?
            .copy_to(&mut ucd_file)?;
        // Flush explicitly: errors raised while dropping a `BufWriter` are silently discarded.
        ucd_file.flush()?;
    }

    let mut zip_file = zip::ZipArchive::new(File::open(&zip_path)?)?;
    zip_file.extract(&ucd_dir)?;
    drop(zip_file);

    // The archive is no longer needed once its contents are unpacked.
    std::fs::remove_file(zip_path)?;

    let status = Command::new("ucd-generate")
        .arg("general-category")
        .arg(ucd_dir.as_os_str())
        .stdout(Stdio::from(File::create(&generated_file)?))
        .status()
        .map_err(|e| {
            format!(
                "could not run `ucd-generate` (install it with `cargo install ucd-generate`): {e}"
            )
        })?;
    if !status.success() {
        return Err(format!("`ucd-generate general-category` failed: {status}").into());
    }

    Ok(())
}
+use alloc::vec::Vec; +use core::{ + cmp::{Ordering, Reverse}, + ops::Range, +}; + +use libafl_bolts::{rands::Rand, Error, HasLen, Named}; + +use crate::{ + corpus::{CorpusId, HasTestcase, Testcase}, + inputs::{BytesInput, HasBytesVec}, + mutators::{rand_range, MutationResult, Mutator, Tokens}, + stages::{ + extract_metadata, + mutational::{MutatedTransform, MutatedTransformPost}, + StringIdentificationMetadata, + }, + state::{HasCorpus, HasMaxSize, HasMetadata, HasRand}, +}; + +/// Input which contains the context necessary to perform unicode mutations +pub type UnicodeInput = (BytesInput, StringIdentificationMetadata); + +impl MutatedTransform for UnicodeInput +where + S: HasCorpus + HasTestcase, +{ + type Post = StringIdentificationMetadata; + + fn try_transform_from( + base: &mut Testcase, + state: &S, + _corpus_idx: CorpusId, + ) -> Result { + let input = base.load_input(state.corpus())?.clone(); + let metadata = base.metadata::().cloned()?; + Ok((input, metadata)) + } + + fn try_transform_into(self, _state: &S) -> Result<(BytesInput, Self::Post), Error> { + Ok(self) + } +} + +impl MutatedTransformPost for StringIdentificationMetadata +where + S: HasTestcase, +{ + fn post_exec( + self, + state: &mut S, + _stage_idx: i32, + corpus_idx: Option, + ) -> Result<(), Error> { + if let Some(corpus_idx) = corpus_idx { + let mut tc = state.testcase_mut(corpus_idx)?; + tc.add_metadata(self); + } + Ok(()) + } +} + +const MAX_CHARS: usize = 16; + +fn choose_start( + rand: &mut R, + bytes: &[u8], + meta: &StringIdentificationMetadata, +) -> Option<(usize, usize)> { + let idx = rand.below(bytes.len() as u64) as usize; + let mut options = Vec::new(); + for (start, range) in meta.ranges() { + if idx + .checked_sub(*start) // idx adjusted to start + .and_then(|idx| (idx < range.len()).then(|| range[idx])) // idx in range + .map_or(false, |r| r) + { + options.push((*start, range)); + } + } + match options.len() { + 0 => None, + 1 => Some((options[0].0, options[0].1.len())), + _ 
/// Looks up the inclusive `(min, max)` interval of `haystack` that contains `needle`.
///
/// `haystack` is binary-searched, so it must be sorted and its intervals must not overlap.
/// Returns `None` when no interval contains `needle`.
fn get_subcategory<T: Ord + Copy>(needle: T, haystack: &[(T, T)]) -> Option<(T, T)> {
    let located = haystack.binary_search_by(|&(lo, hi)| {
        if needle < lo {
            // the interval lies entirely above the needle
            Ordering::Greater
        } else if needle > hi {
            // the interval lies entirely below the needle
            Ordering::Less
        } else {
            Ordering::Equal
        }
    });

    match located {
        Ok(pos) => Some(haystack[pos]),
        Err(_) => None,
    }
}

/// Grows a byte range around `chars[idx]` over all neighbouring chars accepted by `predicate`.
///
/// `chars` holds `(byte offset, char)` pairs. The char at `idx` is always part of the result and
/// is not tested itself; the range is extended backwards and forwards until the first char for
/// which `predicate` returns `false`. The returned range is expressed in byte offsets.
fn find_range<F: Fn(char) -> bool>(
    chars: &[(usize, char)],
    idx: usize,
    predicate: F,
) -> Range<usize> {
    let (pivot_offset, pivot_char) = chars[idx];

    // walk backwards: every accepted char moves the start to its own offset
    let mut start = pivot_offset;
    for &(offset, c) in chars[..idx].iter().rev() {
        if !predicate(c) {
            break;
        }
        start = offset;
    }

    // walk forwards: every accepted char moves the end just past its last byte
    let mut end = pivot_offset + pivot_char.len_utf8();
    for &(offset, c) in &chars[(idx + 1)..] {
        if !predicate(c) {
            break;
        }
        end = offset + c.len_utf8();
    }

    start..end
}
as usize) + .sum::(), + ) + }); + let options = categories.len() * categories.len(); + let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize; + + let selected = categories[selected_idx]; + + #[cfg(test)] + println!("category for `{c}' ({}): {}", c as u32, names[selected_idx]); + + ( + find_range(&chars, idx, |c| { + get_subcategory(c as u32, selected).is_some() + }), + selected, + ) +} + +fn choose_subcategory_range(rand: &mut R, string: &str) -> (Range, (u32, u32)) { + let chars = string.char_indices().collect::>(); + let idx = rand.below(chars.len() as u64) as usize; + let c = chars[idx].1; + + // figure out the categories for this char + let expanded = c as u32; + #[cfg(test)] + let mut names = Vec::new(); + let mut subcategories = Vec::new(); + for (_name, category) in unicode_categories::BY_NAME { + if let Some(subcategory) = get_subcategory(expanded, category) { + #[cfg(test)] + names.push(_name); + subcategories.push(subcategory); + } + } + + // see reasoning for selection pattern in choose_category_range + + subcategories.sort_by_key(|&(min, max)| Reverse(max - min + 1)); + let options = subcategories.len() * subcategories.len(); + let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize; + let selected = subcategories[selected_idx]; + + #[cfg(test)] + println!( + "subcategory for `{c}' ({}): {} ({:?})", + c as u32, names[selected_idx], selected + ); + + ( + find_range(&chars, idx, |c| { + let expanded = c as u32; + selected.0 <= expanded && expanded <= selected.1 + }), + selected, + ) +} + +fn rand_replace_range char>( + state: &mut S, + input: &mut UnicodeInput, + range: Range, + char_gen: F, +) -> MutationResult { + let temp_range = rand_range(state, range.end - range.start, MAX_CHARS); + let range = (range.start + temp_range.start)..(range.start + temp_range.end); + let range = match core::str::from_utf8(&input.0.bytes()[range.clone()]) { + Ok(_) => range, + Err(e) => 
range.start..(range.start + e.valid_up_to()), + }; + + #[cfg(test)] + println!( + "mutating range: {:?} ({:?})", + range, + core::str::from_utf8(&input.0.bytes()[range.clone()]) + ); + if range.start == range.end { + return MutationResult::Skipped; + } + + let replace_len = state.rand_mut().below(MAX_CHARS as u64) as usize; + let orig_len = range.end - range.start; + if input.0.len() - orig_len + replace_len > state.max_size() { + return MutationResult::Skipped; + } + + let mut replacement = Vec::with_capacity(replace_len); + let mut dest = [0u8; 4]; + + loop { + let new_c = char_gen(state); + if replacement.len() + new_c.len_utf8() > replace_len { + break; + } + new_c.encode_utf8(&mut dest); + replacement.extend_from_slice(&dest[..new_c.len_utf8()]); + if replacement.len() + new_c.len_utf8() == replace_len { + break; // nailed it + } + } + + input.0.bytes_mut().splice(range, replacement); + input.1 = extract_metadata(input.0.bytes()); + + MutationResult::Mutated +} + +/// Unicode category data, as used by string analysis and mutators. 
+pub mod unicode_categories { + #![allow(unused)] + #![allow(missing_docs)] + #![allow(clippy::redundant_static_lifetimes)] + + include!(concat!(env!("OUT_DIR"), "/unicode_categories.rs")); +} + +/// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the +/// range's category +#[derive(Debug, Default)] +pub struct StringCategoryRandMutator; + +impl Named for StringCategoryRandMutator { + fn name(&self) -> &str { + "string-category-rand" + } +} + +impl Mutator for StringCategoryRandMutator +where + S: HasRand + HasMaxSize, +{ + fn mutate( + &mut self, + state: &mut S, + input: &mut UnicodeInput, + _stage_idx: i32, + ) -> Result { + if input.0.bytes().is_empty() { + return Ok(MutationResult::Skipped); + } + + let bytes = input.0.bytes(); + let meta = &input.1; + if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) { + let substring = core::str::from_utf8(&bytes[base..][..len])?; + let (range, category) = choose_category_range(state.rand_mut(), substring); + #[cfg(test)] + println!( + "{:?} => {:?}", + range, + core::str::from_utf8(&bytes[range.clone()]) + ); + + let options: u64 = category + .iter() + .map(|&(start, end)| u64::from(end) - u64::from(start) + 1) + .sum(); + let char_gen = |state: &mut S| loop { + let mut selected = state.rand_mut().below(options); + for &(min, max) in category { + if let Some(next_selected) = + selected.checked_sub(u64::from(max) - u64::from(min) + 1) + { + selected = next_selected; + } else if let Some(new_c) = char::from_u32(selected as u32 + min) { + return new_c; + } else { + break; + } + } + }; + + return Ok(rand_replace_range(state, input, range, char_gen)); + } + + Ok(MutationResult::Skipped) + } +} + +/// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the +/// range's subcategory +#[derive(Debug, Default)] +pub struct StringSubcategoryRandMutator; + +impl Named for StringSubcategoryRandMutator { + fn name(&self) -> &str { + 
"string-subcategory-rand" + } +} + +impl Mutator for StringSubcategoryRandMutator +where + S: HasRand + HasMaxSize, +{ + fn mutate( + &mut self, + state: &mut S, + input: &mut UnicodeInput, + _stage_idx: i32, + ) -> Result { + if input.0.bytes().is_empty() { + return Ok(MutationResult::Skipped); + } + + let bytes = input.0.bytes(); + let meta = &input.1; + if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) { + let substring = core::str::from_utf8(&bytes[base..][..len])?; + let (range, subcategory) = choose_subcategory_range(state.rand_mut(), substring); + #[cfg(test)] + println!( + "{:?} => {:?}", + range, + core::str::from_utf8(&bytes[range.clone()]) + ); + + let options: u64 = u64::from(subcategory.1) - u64::from(subcategory.0) + 1; + let char_gen = |state: &mut S| loop { + let selected = state.rand_mut().below(options); + if let Some(new_c) = char::from_u32(selected as u32 + subcategory.0) { + return new_c; + } + }; + + return Ok(rand_replace_range(state, input, range, char_gen)); + } + + Ok(MutationResult::Skipped) + } +} + +/// Mutator which randomly replaces a full category-contiguous region of chars with a random token +#[derive(Debug, Default)] +pub struct StringCategoryTokenReplaceMutator; + +impl Named for StringCategoryTokenReplaceMutator { + fn name(&self) -> &str { + "string-category-token-replace" + } +} + +impl Mutator for StringCategoryTokenReplaceMutator +where + S: HasRand + HasMaxSize + HasMetadata, +{ + fn mutate( + &mut self, + state: &mut S, + input: &mut UnicodeInput, + _stage_idx: i32, + ) -> Result { + if input.0.bytes().is_empty() { + return Ok(MutationResult::Skipped); + } + + let tokens_len = { + let meta = state.metadata_map().get::(); + if meta.is_none() { + return Ok(MutationResult::Skipped); + } + if meta.unwrap().tokens().is_empty() { + return Ok(MutationResult::Skipped); + } + meta.unwrap().tokens().len() + }; + let token_idx = state.rand_mut().below(tokens_len as u64) as usize; + + let bytes = input.0.bytes(); 
+ let meta = &input.1; + if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) { + let substring = core::str::from_utf8(&bytes[base..][..len])?; + let (range, _) = choose_category_range(state.rand_mut(), substring); + + #[cfg(test)] + println!( + "{:?} => {:?}", + range, + core::str::from_utf8(&bytes[range.clone()]) + ); + + let meta = state.metadata_map().get::().unwrap(); + let token = &meta.tokens()[token_idx]; + + if input.0.len() - (range.end - range.start) + token.len() > state.max_size() { + return Ok(MutationResult::Skipped); + } + + input.0.bytes_mut().splice(range, token.iter().copied()); + input.1 = extract_metadata(input.0.bytes()); + return Ok(MutationResult::Mutated); + } + + Ok(MutationResult::Skipped) + } +} + +/// Mutator which randomly replaces a full subcategory-contiguous region of chars with a random token +#[derive(Debug, Default)] +pub struct StringSubcategoryTokenReplaceMutator; + +impl Named for StringSubcategoryTokenReplaceMutator { + fn name(&self) -> &str { + "string-subcategory-replace" + } +} + +impl Mutator for StringSubcategoryTokenReplaceMutator +where + S: HasRand + HasMaxSize + HasMetadata, +{ + fn mutate( + &mut self, + state: &mut S, + input: &mut UnicodeInput, + _stage_idx: i32, + ) -> Result { + if input.0.bytes().is_empty() { + return Ok(MutationResult::Skipped); + } + + let tokens_len = { + let meta = state.metadata_map().get::(); + if meta.is_none() { + return Ok(MutationResult::Skipped); + } + if meta.unwrap().tokens().is_empty() { + return Ok(MutationResult::Skipped); + } + meta.unwrap().tokens().len() + }; + let token_idx = state.rand_mut().below(tokens_len as u64) as usize; + + let bytes = input.0.bytes(); + let meta = &input.1; + if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) { + let substring = core::str::from_utf8(&bytes[base..][..len])?; + let (range, _) = choose_subcategory_range(state.rand_mut(), substring); + + #[cfg(test)] + println!( + "{:?} => {:?}", + range, + 
core::str::from_utf8(&bytes[range.clone()]) + ); + + let meta = state.metadata_map().get::().unwrap(); + let token = &meta.tokens()[token_idx]; + + if input.0.len() - (range.end - range.start) + token.len() > state.max_size() { + return Ok(MutationResult::Skipped); + } + + input.0.bytes_mut().splice(range, token.iter().copied()); + input.1 = extract_metadata(input.0.bytes()); + return Ok(MutationResult::Mutated); + } + + Ok(MutationResult::Skipped) + } +} + +#[cfg(test)] +mod test { + use libafl_bolts::rands::StdRand; + + use super::*; + use crate::{corpus::NopCorpus, stages::extract_metadata, state::StdState}; + + // a not-so-useful test for this + #[test] + fn mutate_hex() { + let result: Result<(), Error> = (|| { + let hex = "0123456789abcdef0123456789abcdef"; + let mut bytes = BytesInput::from(hex.as_bytes()); + + let mut mutator = StringCategoryRandMutator; + + let mut state = StdState::new( + StdRand::with_seed(0), + NopCorpus::::new(), + NopCorpus::new(), + &mut (), + &mut (), + )?; + + for _ in 0..(1 << 12) { + let metadata = extract_metadata(bytes.bytes()); + let mut input = (bytes, metadata); + let _ = mutator.mutate(&mut state, &mut input, 0); + println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap()); + bytes = input.0; + } + + Ok(()) + })(); + + if let Err(e) = result { + panic!("failed with error: {e}"); + } + } + + #[test] + fn mutate_hex_subcat() { + let result: Result<(), Error> = (|| { + let hex = "0123456789abcdef0123456789abcdef"; + let mut bytes = BytesInput::from(hex.as_bytes()); + + let mut mutator = StringSubcategoryRandMutator; + + let mut state = StdState::new( + StdRand::with_seed(0), + NopCorpus::::new(), + NopCorpus::new(), + &mut (), + &mut (), + )?; + + for _ in 0..(1 << 12) { + let metadata = extract_metadata(bytes.bytes()); + let mut input = (bytes, metadata); + let _ = mutator.mutate(&mut state, &mut input, 0); + println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap()); + bytes = input.0; + } + + Ok(()) + })(); + + 
if let Err(e) = result { + panic!("failed with error: {e}"); + } + } +} diff --git a/libafl/src/stages/mod.rs b/libafl/src/stages/mod.rs index 1df95bdda9..b4e9725890 100644 --- a/libafl/src/stages/mod.rs +++ b/libafl/src/stages/mod.rs @@ -49,6 +49,11 @@ pub use concolic::ConcolicTracingStage; #[cfg(feature = "std")] pub use concolic::SimpleConcolicMutationalStage; +#[cfg(feature = "unicode")] +pub mod string; +#[cfg(feature = "unicode")] +pub use string::*; + #[cfg(feature = "std")] pub mod sync; #[cfg(feature = "std")] @@ -56,6 +61,7 @@ pub use sync::*; #[cfg(feature = "std")] pub mod dump; + use core::{convert::From, marker::PhantomData}; #[cfg(feature = "std")] diff --git a/libafl/src/stages/string.rs b/libafl/src/stages/string.rs new file mode 100644 index 0000000000..a2e441b994 --- /dev/null +++ b/libafl/src/stages/string.rs @@ -0,0 +1,128 @@ +//! Stages which analysis common to Unicode-style mutations + +use alloc::{collections::VecDeque, rc::Rc, vec::Vec}; +use core::marker::PhantomData; + +use bitvec::{bitvec, vec::BitVec}; +use libafl_bolts::{impl_serdeany, Error}; +use serde::{Deserialize, Serialize}; + +use crate::{ + corpus::{CorpusId, HasTestcase}, + inputs::{BytesInput, HasBytesVec, UsesInput}, + stages::Stage, + state::{HasCorpus, HasMetadata, UsesState}, +}; + +/// Metadata which stores the list of pre-computed string-like ranges in the input +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +pub struct StringIdentificationMetadata { + ranges: Rc>, +} + +impl_serdeany!(StringIdentificationMetadata); + +impl StringIdentificationMetadata { + /// The list of pre-computed string-like ranges in the input + #[must_use] + pub fn ranges(&self) -> &Vec<(usize, BitVec)> { + self.ranges.as_ref() + } +} + +pub(crate) fn extract_metadata(bytes: &[u8]) -> StringIdentificationMetadata { + let mut ranges = Vec::new(); + + if !bytes.is_empty() { + let mut queue = VecDeque::new(); + let mut visited = bitvec![0; bytes.len()]; + queue.push_back(0); + + while 
let Some(i) = queue.pop_front() { + if i >= bytes.len() || visited[i] { + // if we've already visited a particular entry, then we already know its range(s) + continue; + } + visited.set(i, true); // we always visit the current entry + let s = core::str::from_utf8(&bytes[i..]).unwrap_or_else(|e| { + queue.push_back(i + e.valid_up_to() + 1); // push to the next region + core::str::from_utf8(&bytes[i..][..e.valid_up_to()]).unwrap() + }); + if !s.is_empty() { + let mut entries = bitvec![0; s.bytes().len()]; + for (c_idx, _) in s.char_indices() { + entries.set(c_idx, true); + visited.set(i + c_idx, true); + } + for unset in entries.iter_zeros() { + // `unset` is an offset relative to `i`; rebase it to an absolute index, as each one potentially represents a new UTF-8 start point + queue.push_back(i + unset); + } + ranges.push((i, entries)); + } + } + } + + StringIdentificationMetadata { + ranges: Rc::new(ranges), + } +} + +/// Stage which identifies potential strings in the provided input +#[derive(Debug)] +pub struct StringIdentificationStage { + phantom: PhantomData, +} + +impl Default for StringIdentificationStage { + fn default() -> Self { + Self::new() + } +} + +impl StringIdentificationStage { + /// Create a new instance of the string identification stage + #[must_use] + pub fn new() -> Self { + Self { + phantom: PhantomData, + } + } +} + +impl UsesState for StringIdentificationStage +where + S: UsesInput, +{ + type State = S; +} + +impl Stage for StringIdentificationStage +where + S: HasTestcase + HasCorpus, + E: UsesState, + EM: UsesState, + Z: UsesState, +{ + fn perform( + &mut self, + _fuzzer: &mut Z, + _executor: &mut E, + state: &mut Self::State, + _manager: &mut EM, + corpus_idx: CorpusId, + ) -> Result<(), Error> { + let mut tc = state.testcase_mut(corpus_idx)?; + if tc.has_metadata::() { + return Ok(()); // skip recompute + } + + let input = tc.load_input(state.corpus())?; + + let bytes = input.bytes(); + let metadata = extract_metadata(bytes); + tc.add_metadata(metadata); + + Ok(()) + } +} diff --git 
a/libafl_bolts/src/lib.rs b/libafl_bolts/src/lib.rs index de5d3b9a9a..d24b8b2de6 100644 --- a/libafl_bolts/src/lib.rs +++ b/libafl_bolts/src/lib.rs @@ -169,11 +169,6 @@ use log::{Metadata, Record}; /// out of `libafl_bolts` into `libafl::events::launcher`. pub mod launcher {} -// Re-export derive(SerdeAny) -#[cfg(feature = "libafl_derive")] -#[allow(unused_imports)] -#[macro_use] -extern crate libafl_derive; use core::{ array::TryFromSliceError, fmt::{self, Display}, @@ -190,6 +185,7 @@ pub use libafl_derive::SerdeAny; use { alloc::string::{FromUtf8Error, String}, core::cell::{BorrowError, BorrowMutError}, + core::str::Utf8Error, }; /// We need fixed names for many parts of this lib. @@ -505,6 +501,14 @@ impl From for Error { } } +#[cfg(feature = "alloc")] +impl From for Error { + #[allow(unused_variables)] + fn from(err: Utf8Error) -> Self { + Self::unknown(format!("Could not convert byte / utf-8: {err:?}")) + } +} + #[cfg(feature = "std")] impl From for Error { #[allow(unused_variables)] diff --git a/libafl_bolts/src/serdeany.rs b/libafl_bolts/src/serdeany.rs index dc378039c0..5d8d90832f 100644 --- a/libafl_bolts/src/serdeany.rs +++ b/libafl_bolts/src/serdeany.rs @@ -85,7 +85,7 @@ macro_rules! create_serde_registry_for_trait { Error, }; - /// Visitor object used internally for the [`SerdeAny`] registry. + /// Visitor object used internally for the [`crate::serdeany::SerdeAny`] registry. #[derive(Debug)] pub struct BoxDynVisitor {} #[allow(unused_qualifications)] @@ -319,7 +319,7 @@ macro_rules! create_serde_registry_for_trait { } } - /// A serializable [`HashMap`] wrapper for [`SerdeAny`] types, addressable by name. + /// A serializable [`HashMap`] wrapper for [`crate::serdeany::SerdeAny`] types, addressable by name. 
#[allow(clippy::unsafe_derive_deserialize)] #[allow(unused_qualifications)] #[derive(Debug, Serialize, Deserialize)] diff --git a/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml b/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml index 18b9eab450..921369cfe0 100644 --- a/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml +++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml @@ -30,7 +30,7 @@ path = "src/lib.rs" crate-type = ["staticlib", "rlib"] [dependencies] -libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor"] } +libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor", "unicode"] } libafl_bolts = { path = "../../libafl_bolts", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "serdeany_autoreg", "errors_backtrace"] } libafl_targets = { path = "../../libafl_targets", features = ["sancov_8bit", "sancov_cmplog", "libfuzzer", "libfuzzer_oom", "libfuzzer_define_run_driver", "libfuzzer_interceptors", "sanitizers_flags", "whole_archive"] } diff --git a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs index 3de1b59cc3..a5de859b98 100644 --- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs +++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs @@ -166,7 +166,8 @@ macro_rules! 
fuzz_with { mutators::{ GrimoireExtensionMutator, GrimoireRecursiveReplacementMutator, GrimoireRandomDeleteMutator, GrimoireStringReplacementMutator, havoc_crossover, havoc_mutations, havoc_mutations_no_crossover, - I2SRandReplace, StdScheduledMutator, Tokens, tokens_mutations + I2SRandReplace, StdScheduledMutator, StringCategoryRandMutator, StringSubcategoryRandMutator, + StringCategoryTokenReplaceMutator, StringSubcategoryTokenReplaceMutator, Tokens, tokens_mutations }, observers::{stacktrace::BacktraceObserver, TimeObserver}, schedulers::{ @@ -174,7 +175,7 @@ macro_rules! fuzz_with { }, stages::{ CalibrationStage, GeneralizationStage, IfStage, StdMutationalStage, - StdPowerMutationalStage, TracingStage, + StdPowerMutationalStage, StringIdentificationStage, TracingStage, }, state::{HasCorpus, StdState}, StdFuzzer, @@ -224,7 +225,7 @@ macro_rules! fuzz_with { // Set up a generalization stage for grimoire let generalization = GeneralizationStage::new(&edges_observer); - let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), (generalization, ())); + let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), tuple_list!(generalization)); let calibration = CalibrationStage::new(&map_feedback); @@ -296,6 +297,32 @@ macro_rules! 
fuzz_with { }); state.metadata_map_mut().insert_boxed(grimoire_metadata); + // Set up a string category analysis stage for unicode mutations + let unicode_used = $options.unicode(); + let string_mutator = StdScheduledMutator::new( + tuple_list!( + StringCategoryRandMutator, + StringSubcategoryRandMutator, + StringSubcategoryRandMutator, + StringSubcategoryRandMutator, + StringSubcategoryRandMutator, + ) + ); + let string_replace_mutator = StdScheduledMutator::new( + tuple_list!( + StringCategoryTokenReplaceMutator, + StringSubcategoryTokenReplaceMutator, + StringSubcategoryTokenReplaceMutator, + StringSubcategoryTokenReplaceMutator, + StringSubcategoryTokenReplaceMutator, + ) + ); + let string_power = StdMutationalStage::transforming(string_mutator); + let string_replace_power = StdMutationalStage::transforming(string_replace_mutator); + + let string_analysis = StringIdentificationStage::new(); + let string_analysis = IfStage::new(|_, _, _, _, _| Ok((unicode_used && mutator_status.std_mutational).into()), tuple_list!(string_analysis, string_power, string_replace_power)); + // Attempt to use tokens from libfuzzer dicts if !state.has_metadata::() { let mut toks = if let Some(tokens) = $options.dict() { @@ -466,6 +493,7 @@ macro_rules! 
fuzz_with { calibration, generalization, tracing, + string_analysis, i2s, cm_i2s, std_power, diff --git a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs index 0e73a80483..58b3b25f08 100644 --- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs +++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs @@ -107,6 +107,7 @@ pub struct LibfuzzerOptions { artifact_prefix: ArtifactPrefix, timeout: Duration, grimoire: Option, + unicode: bool, forks: Option, dict: Option, dirs: Vec, @@ -162,6 +163,10 @@ impl LibfuzzerOptions { self.grimoire } + pub fn unicode(&self) -> bool { + self.unicode + } + pub fn forks(&self) -> Option { self.forks } @@ -230,6 +235,7 @@ struct LibfuzzerOptionsBuilder<'a> { artifact_prefix: Option<&'a str>, timeout: Option, grimoire: Option, + unicode: Option, forks: Option, dict: Option<&'a str>, dirs: Vec<&'a str>, @@ -292,6 +298,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> { } } "grimoire" => self.grimoire = Some(parse_or_bail!(name, value, u64) > 0), + "unicode" => self.unicode = Some(parse_or_bail!(name, value, u64) > 0), "artifact_prefix" => { self.artifact_prefix = Some(value); } @@ -349,6 +356,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> { .unwrap_or_default(), timeout: self.timeout.unwrap_or(Duration::from_secs(1200)), grimoire: self.grimoire, + unicode: self.unicode.unwrap_or(true), forks: self.forks, dict: self.dict.map(|path| { Tokens::from_file(path).expect("Couldn't load tokens from specified dictionary")