diff --git a/.travis.yml b/.travis.yml index 6bbfccfc2f..cb4da896ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,10 +14,12 @@ script: - cargo doc --verbose --manifest-path=regex-syntax/Cargo.toml - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then travis_wait cargo test --verbose --features pattern; - travis_wait cargo bench --verbose --bench dynamic; - travis_wait cargo bench --manifest-path=regex-pcre-benchmark/Cargo.toml --verbose + travis_wait ./run-bench rust; + travis_wait ./run-bench rust-bytes --no-run; + travis_wait ./run-bench rust-plugin --no-run; + travis_wait ./run-bench pcre --no-run; + travis_wait ./run-bench onig --no-run; travis_wait cargo test --verbose --manifest-path=regex_macros/Cargo.toml; - travis_wait cargo bench --manifest-path=regex_macros/Cargo.toml --verbose --bench native bench::; fi after_success: | [ $TRAVIS_BRANCH = master ] && diff --git a/Cargo.toml b/Cargo.toml index 56af712a5f..c68ae37a55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,10 +23,11 @@ regex-syntax = { path = "regex-syntax", version = "0.2.5" } utf8-ranges = "0.1" [dev-dependencies] -# To prevent the benchmarking harness from running setup code more than once. -# Why? Because it takes too long. +# For examples. lazy_static = "0.1" -# For generating random text to test/benchmark with. +# For property based tests. +quickcheck = "0.2" +# For generating random test data. rand = "0.3" [features] @@ -41,7 +42,7 @@ bench = false # Generally these tests specific pieces of the regex implementation. [[test]] path = "src/lib.rs" -name = "regex" +name = "regex-inline" # Run the test suite on the default behavior of Regex::new. # This includes a mish mash of NFAs and DFAs, which are chosen automatically @@ -49,49 +50,45 @@ name = "regex" # usage with the test definitions below. (We can't test the DFA implementations # in the same way since they can't be used for every regex tested.) [[test]] -path = "tests/test_dynamic.rs" -name = "dynamic" +path = "tests/test_default.rs" +name = "default" +test = false -# Run the test suite on the NFA algorithm over Unicode codepoints. +# The same as the default tests, but run on bytes::Regex. [[test]] -path = "tests/test_dynamic_nfa.rs" -name = "dynamic-nfa" +path = "tests/test_default_bytes.rs" +name = "default-bytes" -# Run the test suite on the NFA algorithm over bytes. +# Run the test suite on the NFA algorithm over Unicode codepoints. [[test]] -path = "tests/test_dynamic_nfa_bytes.rs" -name = "dynamic-nfa-bytes" +path = "tests/test_nfa.rs" +name = "nfa" -# Run the test suite on the backtracking engine over Unicode codepoints. +# Run the test suite on the NFA algorithm over bytes that match UTF-8 only. [[test]] -path = "tests/test_dynamic_backtrack.rs" -name = "dynamic-backtrack" +path = "tests/test_nfa_utf8bytes.rs" +name = "nfa-utf8bytes" -# Run the test suite on the backtracking engine over bytes. +# Run the test suite on the NFA algorithm over arbitrary bytes. [[test]] -path = "tests/test_dynamic_backtrack_bytes.rs" -name = "dynamic-backtrack-bytes" +path = "tests/test_nfa_bytes.rs" +name = "nfa-bytes" -# Run the benchmarks on the default behavior of Regex::new. -# -# N.B. These benchmarks were originally taken from Russ Cox. -[[bench]] -name = "dynamic" -path = "benches/bench_dynamic.rs" -test = false -bench = true +# Run the test suite on the backtracking engine over Unicode codepoints. +[[test]] +path = "tests/test_backtrack.rs" +name = "backtrack" -# Run the benchmarks on the NFA algorithm. We avoid chasing other permutations. -# -# N.B. 
These can take a *loong* time to run. -[[bench]] -name = "dynamic-nfa" -path = "benches/bench_dynamic_nfa.rs" -test = false -bench = true +# Run the test suite on the backtracking engine over bytes that match UTF-8 +# only. +[[test]] +path = "tests/test_backtrack_utf8bytes.rs" +name = "backtrack-utf8bytes" -[profile.bench] -debug = true +# Run the test suite on the backtracking engine over arbitrary bytes. +[[test]] +path = "tests/test_backtrack_bytes.rs" +name = "backtrack-bytes" [profile.test] debug = true diff --git a/benches/Cargo.toml b/benches/Cargo.toml new file mode 100644 index 0000000000..15ebe5f8f8 --- /dev/null +++ b/benches/Cargo.toml @@ -0,0 +1,70 @@ +[package] +publish = false +name = "regex-benchmark" +version = "0.1.0" +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "http://doc.rust-lang.org/regex/regex_syntax/index.html" +homepage = "https://github.com/rust-lang/regex" +description = "Regex benchmarks for Rust's and other engines." + +[dependencies] +enum-set = "0.0.6" +lazy_static = "0.1" +onig = { version = "0.4", optional = true } +pcre = { version = "0.2", optional = true } +rand = "0.3" +regex = { version = "0.1", path = ".." } +regex_macros = { version = "0.1", path = "../regex_macros", optional = true } +regex-syntax = { version = "0.2", path = "../regex-syntax" } + +# Use features to conditionally compile benchmarked regexes, since not every +# regex works on every engine. :-( +[features] +re-pcre = ["pcre"] +re-onig = ["onig"] +re-rust = [] +re-rust-bytes = [] +re-rust-plugin = ["regex_macros"] + +# Run the benchmarks on the default behavior of Regex::new. +[[bench]] +name = "rust" +path = "src/bench_rust.rs" +test = false +bench = true + +# Run the benchmarks on the default behavior of bytes::Regex::new. +[[bench]] +name = "rust-bytes" +path = "src/bench_rust_bytes.rs" +test = false +bench = true + +# Run the benchmarks on the default behavior of the `regex!` compiler plugin. +[[bench]] +name = "rust-plugin" +path = "src/bench_rust_plugin.rs" +test = false +bench = true + +# Run the benchmarks on PCRE. +[[bench]] +name = "pcre" +path = "src/bench_pcre.rs" +test = false +bench = true + +# Run the benchmarks on Oniguruma. +[[bench]] +name = "onig" +path = "src/bench_onig.rs" +test = false +bench = true + +[profile.bench] +debug = true + +[profile.test] +debug = true diff --git a/benches/bench_pcre.rs b/benches/bench_pcre.rs deleted file mode 100644 index 4b8dea54f9..0000000000 --- a/benches/bench_pcre.rs +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// These benchmarks use PCRE to reproduce some of the benchmarks used to track -// performance of regexes in this crate. I'm not an experienced user of PCRE, -// so it's possible that usage here is not optimal. If it isn't, then -// improvements are welcome. (I'm aware that PCRE has a DFA, but it doesn't -// seem to actually preserve leftmost-first semantics, like the DFA in this -// crate does.) -// -// Note that for these benchmarks, all we need is to detect whether there is -// a match or not. 
- -#![feature(test)] -#![allow(non_snake_case)] - -extern crate enum_set; -#[macro_use] extern crate lazy_static; -extern crate pcre; -extern crate rand; -extern crate regex; -extern crate test; - -use std::iter::repeat; - -use rand::{Rng, thread_rng}; -use test::Bencher; - -/// A nominal wrapper around pcre::Pcre to expose an interface similar to -/// regex::Regex. -struct RegexPcre(pcre::Pcre); - -/// lazy_static wants this. No reason not to provide it. -unsafe impl Send for RegexPcre {} -unsafe impl Sync for RegexPcre {} - -impl RegexPcre { - fn is_match(&mut self, text: &str) -> bool { - self.0.exec(text).is_some() - } - - fn count_matches(&mut self, text: &str) -> usize { - self.0.matches(text).count() - } -} - -macro_rules! regex( - ($re:expr) => {{ - use enum_set::EnumSet; - use pcre::{Pcre, CompileOption, StudyOption}; - - let mut comp_opts = EnumSet::new(); - // Rust's regex library exclusively uses Unicode-aware character - // classes. - comp_opts.insert(CompileOption::Ucp); - let mut re = Pcre::compile_with_options($re, &comp_opts).unwrap(); - - let mut study_opts = EnumSet::new(); - study_opts.insert(StudyOption::StudyJitCompile); - re.study_with_options(&study_opts); - - ::RegexPcre(re) - }} -); - -macro_rules! bench_match { - ($name:ident, $re:expr, $text:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - use std::sync::Mutex; - - lazy_static! { - static ref RE: Mutex = Mutex::new($re); - static ref TEXT: String = $text; - }; - let mut re = RE.lock().unwrap(); - b.bytes = TEXT.len() as u64; - b.iter(|| { - if !re.is_match(&TEXT) { - panic!("expected match, got not match"); - } - }); - } - } -} - -macro_rules! bench_nomatch { - ($name:ident, $re:expr, $text:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - use std::sync::Mutex; - - lazy_static! 
{ - static ref RE: Mutex = Mutex::new($re); - static ref TEXT: String = $text; - }; - let mut re = RE.lock().unwrap(); - b.bytes = TEXT.len() as u64; - b.iter(|| { - if re.is_match(&TEXT) { - panic!("match not expected"); - } - }); - } - } -} - -bench_match!(literal, regex!("y"), { - format!("{}y", repeat("x").take(50).collect::()) -}); - -bench_match!(not_literal, regex!(".y"), { - format!("{}y", repeat("x").take(50).collect::()) -}); - -bench_match!(match_class, regex!("[abcdw]"), { - format!("{}w", repeat("xxxx").take(20).collect::()) -}); - -bench_match!(match_class_in_range, regex!("[ac]"), { - format!("{}c", repeat("bbbb").take(20).collect::()) -}); - -bench_match!(match_class_unicode, regex!(r"\pL"), { - format!("{}a", repeat("☃5☃5").take(20).collect::()) -}); - -bench_nomatch!(anchored_literal_short_non_match, regex!("^zbc(d|e)"), { - "abcdefghijklmnopqrstuvwxyz".to_owned() -}); - -bench_nomatch!(anchored_literal_long_non_match, regex!("^zbc(d|e)"), { - repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::() -}); - -bench_match!(anchored_literal_short_match, regex!("^.bc(d|e)"), { - "abcdefghijklmnopqrstuvwxyz".to_owned() -}); - -bench_match!(anchored_literal_long_match, regex!("^.bc(d|e)"), { - repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::() -}); - -bench_match!(one_pass_short, regex!("^.bc(d|e)*$"), { - "abcddddddeeeededd".to_owned() -}); - -bench_match!(one_pass_short_not, regex!(".bc(d|e)*$"), { - "abcddddddeeeededd".to_owned() -}); - -bench_match!(one_pass_long_prefix, regex!("^abcdefghijklmnopqrstuvwxyz.*$"), { - "abcdefghijklmnopqrstuvwxyz".to_owned() -}); - -bench_match!(one_pass_long_prefix_not, regex!("^.bcdefghijklmnopqrstuvwxyz.*$"), { - "abcdefghijklmnopqrstuvwxyz".to_owned() -}); - -fn gen_text(n: usize) -> String { - let mut rng = thread_rng(); - let mut bytes = rng.gen_ascii_chars().map(|n| n as u8).take(n) - .collect::>(); - for (i, b) in bytes.iter_mut().enumerate() { - if i % 20 == 0 { - *b = b'\n' - } - } - String::from_utf8(bytes).unwrap() -} - -fn easy0() -> RegexPcre { - regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") -} - -bench_nomatch!(easy0_32, easy0(), gen_text(32)); -bench_nomatch!(easy0_1K, easy0(), gen_text(1<<10)); -bench_nomatch!(easy0_32K, easy0(), gen_text(32<<10)); -bench_nomatch!(easy0_1MB, easy0(), gen_text(1<<20)); - -fn easy1() -> RegexPcre { - regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") -} - -bench_nomatch!(easy1_32, easy1(), gen_text(32)); -bench_nomatch!(easy1_1K, easy1(), gen_text(1<<10)); -bench_nomatch!(easy1_32K, easy1(), gen_text(32<<10)); -bench_nomatch!(easy1_1MB, easy1(), gen_text(1<<20)); - -fn medium() -> RegexPcre { - regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") -} - -bench_nomatch!(medium_32, medium(), gen_text(32)); -bench_nomatch!(medium_1K, medium(), gen_text(1<<10)); -bench_nomatch!(medium_32K, medium(), gen_text(32<<10)); -bench_nomatch!(medium_1MB, medium(), gen_text(1<<20)); - -fn hard() -> RegexPcre { - regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") -} - -bench_nomatch!(hard_32, hard(), gen_text(32)); -bench_nomatch!(hard_1K, hard(), gen_text(1<<10)); -bench_nomatch!(hard_32K, hard(), gen_text(32<<10)); -bench_nomatch!(hard_1MB, hard(), gen_text(1<<20)); - - -// These are the Sherlock Holmes benchmarks. Not all of them are present -// since the syntax isn't exactly the same for things like case insensitive -// matching. We could add them back by twiddling the flags using PCRE though. -// -// Other benchmarks are removed purely because PCRE is too darn slow on them. 
- -mod sherlock { - use super::RegexPcre; - use test::Bencher; - - lazy_static! { - static ref SHERLOCK: String = { - include_str!("the-adventures-of-sherlock-holmes.txt").to_owned() - }; - } - - macro_rules! bench_find { - ($name:ident, $re:expr, $count:expr) => { - #[bench] - fn $name(b: &mut Bencher) { - use std::sync::Mutex; - - lazy_static! { - static ref RE: Mutex = Mutex::new($re); - }; - let mut re = RE.lock().unwrap(); - b.bytes = SHERLOCK.len() as u64; - b.iter(|| { - let count = re.count_matches(&SHERLOCK); - assert_eq!($count, count) - }); - } - } - } - - bench_find!(name_sherlock, regex!("Sherlock"), 97); - bench_find!(name_holmes, regex!("Holmes"), 461); - bench_find!(name_sherlock_holmes, regex!("Sherlock Holmes"), 91); - - bench_find!(name_sherlock_nocase, regex!("(?i)Sherlock"), 102); - bench_find!(name_holmes_nocase, regex!("(?i)Holmes"), 467); - bench_find!( - name_sherlock_holmes_nocase, regex!("(?i)Sherlock Holmes"), 96); - - bench_find!(name_whitespace, regex!(r"Sherlock\s+Holmes"), 97); - - bench_find!(name_alt1, regex!("Sherlock|Street"), 158); - bench_find!(name_alt2, regex!("Sherlock|Holmes"), 558); - bench_find!( - name_alt3, - regex!("Sherlock|Holmes|Watson|Irene|Adler|John|Baker"), 740); - bench_find!( - name_alt3_nocase, - regex!("(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"), 753); - bench_find!(name_alt4, regex!("Sher[a-z]+|Hol[a-z]+"), 582); - bench_find!(name_alt4_nocase, regex!("(?i)Sher[a-z]+|Hol[a-z]+"), 697); - - bench_find!(no_match_uncommon, regex!("zyx"), 0); - bench_find!(no_match_common, regex!("ayx"), 0); - - bench_find!(the_lower, regex!("the"), 7218); - bench_find!(the_upper, regex!("The"), 741); - bench_find!(the_nocase, regex!("(?i)the"), 7987); - - // bench_find!(everything_greedy, regex!(".*"), 13053); - // bench_find!(everything_greedy_nl, regex!("(?s).*"), 1); - - bench_find!(letters, regex!(r"\pL"), 447160); - bench_find!(letters_upper, regex!(r"\p{Lu}"), 14180); - bench_find!(letters_lower, regex!(r"\p{Ll}"), 432980); - - bench_find!(words, regex!(r"\w+"), 109214); - - bench_find!(the_whitespace, regex!(r"the\s+\w+"), 5410); - - bench_find!(before_holmes, regex!(r"\w+\s+Holmes"), 319); - - bench_find!(holmes_cochar_watson, - regex!(r"Holmes.{0,25}Watson|Watson.{0,25}Holmes"), 7); - - // bench_find!( - // holmes_coword_watson, - // regex!(r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes"), - // 64); - - bench_find!(quotes, regex!(r#"["'][^"']{0,30}[?!.]["']"#), 767); - - bench_find!(line_boundary_sherlock_holmes, - regex!(r"(?m)^Sherlock Holmes|Sherlock Holmes$"), 34); - - bench_find!(word_ending_n, regex!(r"\b\w+n\b"), 8366); -} diff --git a/benches/log/02-set/dynamic b/benches/log/02-set/dynamic index b9b67a084a..69c9f71754 100644 --- a/benches/log/02-set/dynamic +++ b/benches/log/02-set/dynamic @@ -2,77 +2,77 @@ Running target/release/dynamic-a76738dddf3bdc6b running 71 tests -test bench::anchored_literal_long_match ... bench: 74 ns/iter (+/- 8) = 5270 MB/s -test bench::anchored_literal_long_non_match ... bench: 58 ns/iter (+/- 0) = 6724 MB/s -test bench::anchored_literal_short_match ... bench: 73 ns/iter (+/- 0) = 356 MB/s -test bench::anchored_literal_short_non_match ... bench: 58 ns/iter (+/- 0) = 448 MB/s -test bench::easy0_1K ... bench: 214 ns/iter (+/- 2) = 4785 MB/s -test bench::easy0_1MB ... bench: 247,056 ns/iter (+/- 1,777) = 4244 MB/s -test bench::easy0_32 ... bench: 64 ns/iter (+/- 0) = 500 MB/s -test bench::easy0_32K ... bench: 5,281 ns/iter (+/- 29) = 6204 MB/s -test bench::easy1_1K ... 
bench: 278 ns/iter (+/- 5) = 3683 MB/s -test bench::easy1_1MB ... bench: 320,041 ns/iter (+/- 4,243) = 3276 MB/s -test bench::easy1_32 ... bench: 65 ns/iter (+/- 0) = 492 MB/s -test bench::easy1_32K ... bench: 5,885 ns/iter (+/- 83) = 5568 MB/s -test bench::hard_1K ... bench: 4,685 ns/iter (+/- 20) = 218 MB/s -test bench::hard_1MB ... bench: 4,745,020 ns/iter (+/- 19,440) = 220 MB/s -test bench::hard_32 ... bench: 197 ns/iter (+/- 1) = 162 MB/s -test bench::hard_32K ... bench: 147,409 ns/iter (+/- 656) = 222 MB/s -test bench::literal ... bench: 20 ns/iter (+/- 1) = 2550 MB/s -test bench::match_class ... bench: 86 ns/iter (+/- 3) = 941 MB/s -test bench::match_class_in_range ... bench: 32 ns/iter (+/- 2) = 2531 MB/s -test bench::match_class_unicode ... bench: 801 ns/iter (+/- 36) = 200 MB/s -test bench::medium_1K ... bench: 1,213 ns/iter (+/- 237) = 844 MB/s -test bench::medium_1MB ... bench: 1,991,418 ns/iter (+/- 239,612) = 526 MB/s -test bench::medium_32 ... bench: 100 ns/iter (+/- 8) = 320 MB/s -test bench::medium_32K ... bench: 57,080 ns/iter (+/- 709) = 574 MB/s -test bench::no_exponential ... bench: 522 ns/iter (+/- 17) = 191 MB/s -test bench::not_literal ... bench: 290 ns/iter (+/- 6) = 175 MB/s -test bench::one_pass_long_prefix ... bench: 176 ns/iter (+/- 15) = 147 MB/s -test bench::one_pass_long_prefix_not ... bench: 183 ns/iter (+/- 28) = 142 MB/s -test bench::one_pass_short ... bench: 136 ns/iter (+/- 8) = 125 MB/s -test bench::one_pass_short_not ... bench: 135 ns/iter (+/- 14) = 125 MB/s -test bench::replace_all ... bench: 149 ns/iter (+/- 34) -test bench_dynamic_compile::compile_huge ... bench: 158,759 ns/iter (+/- 4,546) -test bench_dynamic_compile::compile_huge_bytes ... bench: 17,538,290 ns/iter (+/- 1,735,383) -test bench_dynamic_compile::compile_simple ... bench: 5,935 ns/iter (+/- 429) -test bench_dynamic_compile::compile_simple_bytes ... bench: 6,682 ns/iter (+/- 293) -test bench_dynamic_compile::compile_small ... bench: 7,664 ns/iter (+/- 473) -test bench_dynamic_compile::compile_small_bytes ... bench: 175,272 ns/iter (+/- 4,492) -test bench_dynamic_parse::parse_huge ... bench: 1,199 ns/iter (+/- 38) -test bench_dynamic_parse::parse_simple ... bench: 1,849 ns/iter (+/- 28) -test bench_dynamic_parse::parse_small ... bench: 2,470 ns/iter (+/- 35) -test bench_sherlock::before_holmes ... bench: 2,750,028 ns/iter (+/- 21,847) = 216 MB/s -test bench_sherlock::everything_greedy ... bench: 7,896,337 ns/iter (+/- 68,883) = 75 MB/s -test bench_sherlock::everything_greedy_nl ... bench: 5,498,247 ns/iter (+/- 65,952) = 108 MB/s -test bench_sherlock::holmes_cochar_watson ... bench: 260,499 ns/iter (+/- 4,984) = 2283 MB/s -test bench_sherlock::holmes_coword_watson ... bench: 1,331,443 ns/iter (+/- 34,716) = 446 MB/s -test bench_sherlock::letters ... bench: 60,985,848 ns/iter (+/- 592,838) = 9 MB/s -test bench_sherlock::letters_lower ... bench: 59,041,695 ns/iter (+/- 186,034) = 10 MB/s -test bench_sherlock::letters_upper ... bench: 4,714,214 ns/iter (+/- 35,672) = 126 MB/s -test bench_sherlock::line_boundary_sherlock_holmes ... bench: 2,730,524 ns/iter (+/- 69,565) = 217 MB/s -test bench_sherlock::name_alt1 ... bench: 41,866 ns/iter (+/- 682) = 14210 MB/s -test bench_sherlock::name_alt2 ... bench: 194,322 ns/iter (+/- 6,628) = 3061 MB/s -test bench_sherlock::name_alt3 ... bench: 1,252,965 ns/iter (+/- 18,828) = 474 MB/s -test bench_sherlock::name_alt3_nocase ... bench: 1,476,169 ns/iter (+/- 14,557) = 403 MB/s -test bench_sherlock::name_alt4 ... 
bench: 298,639 ns/iter (+/- 3,905) = 1992 MB/s -test bench_sherlock::name_alt4_nocase ... bench: 1,426,191 ns/iter (+/- 23,584) = 417 MB/s -test bench_sherlock::name_holmes ... bench: 49,719 ns/iter (+/- 811) = 11965 MB/s -test bench_sherlock::name_holmes_nocase ... bench: 1,191,400 ns/iter (+/- 19,175) = 499 MB/s -test bench_sherlock::name_sherlock ... bench: 34,091 ns/iter (+/- 877) = 17451 MB/s -test bench_sherlock::name_sherlock_holmes ... bench: 33,785 ns/iter (+/- 1,207) = 17609 MB/s -test bench_sherlock::name_sherlock_holmes_nocase ... bench: 1,235,442 ns/iter (+/- 18,023) = 481 MB/s -test bench_sherlock::name_sherlock_nocase ... bench: 1,236,252 ns/iter (+/- 26,934) = 481 MB/s -test bench_sherlock::name_whitespace ... bench: 60,200 ns/iter (+/- 1,873) = 9882 MB/s -test bench_sherlock::no_match_common ... bench: 559,886 ns/iter (+/- 20,306) = 1062 MB/s -test bench_sherlock::no_match_uncommon ... bench: 23,631 ns/iter (+/- 497) = 25175 MB/s -test bench_sherlock::quotes ... bench: 967,379 ns/iter (+/- 12,856) = 614 MB/s -test bench_sherlock::the_lower ... bench: 766,950 ns/iter (+/- 21,944) = 775 MB/s -test bench_sherlock::the_nocase ... bench: 1,706,539 ns/iter (+/- 26,003) = 348 MB/s -test bench_sherlock::the_upper ... bench: 52,529 ns/iter (+/- 1,208) = 11325 MB/s -test bench_sherlock::the_whitespace ... bench: 2,012,952 ns/iter (+/- 26,968) = 295 MB/s -test bench_sherlock::word_ending_n ... bench: 55,578,841 ns/iter (+/- 537,463) = 10 MB/s -test bench_sherlock::words ... bench: 19,103,327 ns/iter (+/- 102,828) = 31 MB/s +test misc::anchored_literal_long_match ... bench: 74 ns/iter (+/- 8) = 5270 MB/s +test misc::anchored_literal_long_non_match ... bench: 58 ns/iter (+/- 0) = 6724 MB/s +test misc::anchored_literal_short_match ... bench: 73 ns/iter (+/- 0) = 356 MB/s +test misc::anchored_literal_short_non_match ... bench: 58 ns/iter (+/- 0) = 448 MB/s +test misc::easy0_1K ... bench: 214 ns/iter (+/- 2) = 4785 MB/s +test misc::easy0_1MB ... bench: 247,056 ns/iter (+/- 1,777) = 4244 MB/s +test misc::easy0_32 ... bench: 64 ns/iter (+/- 0) = 500 MB/s +test misc::easy0_32K ... bench: 5,281 ns/iter (+/- 29) = 6204 MB/s +test misc::easy1_1K ... bench: 278 ns/iter (+/- 5) = 3683 MB/s +test misc::easy1_1MB ... bench: 320,041 ns/iter (+/- 4,243) = 3276 MB/s +test misc::easy1_32 ... bench: 65 ns/iter (+/- 0) = 492 MB/s +test misc::easy1_32K ... bench: 5,885 ns/iter (+/- 83) = 5568 MB/s +test misc::hard_1K ... bench: 4,685 ns/iter (+/- 20) = 218 MB/s +test misc::hard_1MB ... bench: 4,745,020 ns/iter (+/- 19,440) = 220 MB/s +test misc::hard_32 ... bench: 197 ns/iter (+/- 1) = 162 MB/s +test misc::hard_32K ... bench: 147,409 ns/iter (+/- 656) = 222 MB/s +test misc::literal ... bench: 20 ns/iter (+/- 1) = 2550 MB/s +test misc::match_class ... bench: 86 ns/iter (+/- 3) = 941 MB/s +test misc::match_class_in_range ... bench: 32 ns/iter (+/- 2) = 2531 MB/s +test misc::match_class_unicode ... bench: 801 ns/iter (+/- 36) = 200 MB/s +test misc::medium_1K ... bench: 1,213 ns/iter (+/- 237) = 844 MB/s +test misc::medium_1MB ... bench: 1,991,418 ns/iter (+/- 239,612) = 526 MB/s +test misc::medium_32 ... bench: 100 ns/iter (+/- 8) = 320 MB/s +test misc::medium_32K ... bench: 57,080 ns/iter (+/- 709) = 574 MB/s +test misc::no_exponential ... bench: 522 ns/iter (+/- 17) = 191 MB/s +test misc::not_literal ... bench: 290 ns/iter (+/- 6) = 175 MB/s +test misc::one_pass_long_prefix ... bench: 176 ns/iter (+/- 15) = 147 MB/s +test misc::one_pass_long_prefix_not ... 
bench: 183 ns/iter (+/- 28) = 142 MB/s +test misc::one_pass_short ... bench: 136 ns/iter (+/- 8) = 125 MB/s +test misc::one_pass_short_not ... bench: 135 ns/iter (+/- 14) = 125 MB/s +test misc::replace_all ... bench: 149 ns/iter (+/- 34) +test rust_compile::compile_huge ... bench: 158,759 ns/iter (+/- 4,546) +test rust_compile::compile_huge_bytes ... bench: 17,538,290 ns/iter (+/- 1,735,383) +test rust_compile::compile_simple ... bench: 5,935 ns/iter (+/- 429) +test rust_compile::compile_simple_bytes ... bench: 6,682 ns/iter (+/- 293) +test rust_compile::compile_small ... bench: 7,664 ns/iter (+/- 473) +test rust_compile::compile_small_bytes ... bench: 175,272 ns/iter (+/- 4,492) +test rust_parse::parse_huge ... bench: 1,199 ns/iter (+/- 38) +test rust_parse::parse_simple ... bench: 1,849 ns/iter (+/- 28) +test rust_parse::parse_small ... bench: 2,470 ns/iter (+/- 35) +test sherlock::before_holmes ... bench: 2,750,028 ns/iter (+/- 21,847) = 216 MB/s +test sherlock::everything_greedy ... bench: 7,896,337 ns/iter (+/- 68,883) = 75 MB/s +test sherlock::everything_greedy_nl ... bench: 5,498,247 ns/iter (+/- 65,952) = 108 MB/s +test sherlock::holmes_cochar_watson ... bench: 260,499 ns/iter (+/- 4,984) = 2283 MB/s +test sherlock::holmes_coword_watson ... bench: 1,331,443 ns/iter (+/- 34,716) = 446 MB/s +test sherlock::letters ... bench: 60,985,848 ns/iter (+/- 592,838) = 9 MB/s +test sherlock::letters_lower ... bench: 59,041,695 ns/iter (+/- 186,034) = 10 MB/s +test sherlock::letters_upper ... bench: 4,714,214 ns/iter (+/- 35,672) = 126 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 2,730,524 ns/iter (+/- 69,565) = 217 MB/s +test sherlock::name_alt1 ... bench: 41,866 ns/iter (+/- 682) = 14210 MB/s +test sherlock::name_alt2 ... bench: 194,322 ns/iter (+/- 6,628) = 3061 MB/s +test sherlock::name_alt3 ... bench: 1,252,965 ns/iter (+/- 18,828) = 474 MB/s +test sherlock::name_alt3_nocase ... bench: 1,476,169 ns/iter (+/- 14,557) = 403 MB/s +test sherlock::name_alt4 ... bench: 298,639 ns/iter (+/- 3,905) = 1992 MB/s +test sherlock::name_alt4_nocase ... bench: 1,426,191 ns/iter (+/- 23,584) = 417 MB/s +test sherlock::name_holmes ... bench: 49,719 ns/iter (+/- 811) = 11965 MB/s +test sherlock::name_holmes_nocase ... bench: 1,191,400 ns/iter (+/- 19,175) = 499 MB/s +test sherlock::name_sherlock ... bench: 34,091 ns/iter (+/- 877) = 17451 MB/s +test sherlock::name_sherlock_holmes ... bench: 33,785 ns/iter (+/- 1,207) = 17609 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 1,235,442 ns/iter (+/- 18,023) = 481 MB/s +test sherlock::name_sherlock_nocase ... bench: 1,236,252 ns/iter (+/- 26,934) = 481 MB/s +test sherlock::name_whitespace ... bench: 60,200 ns/iter (+/- 1,873) = 9882 MB/s +test sherlock::no_match_common ... bench: 559,886 ns/iter (+/- 20,306) = 1062 MB/s +test sherlock::no_match_uncommon ... bench: 23,631 ns/iter (+/- 497) = 25175 MB/s +test sherlock::quotes ... bench: 967,379 ns/iter (+/- 12,856) = 614 MB/s +test sherlock::the_lower ... bench: 766,950 ns/iter (+/- 21,944) = 775 MB/s +test sherlock::the_nocase ... bench: 1,706,539 ns/iter (+/- 26,003) = 348 MB/s +test sherlock::the_upper ... bench: 52,529 ns/iter (+/- 1,208) = 11325 MB/s +test sherlock::the_whitespace ... bench: 2,012,952 ns/iter (+/- 26,968) = 295 MB/s +test sherlock::word_ending_n ... bench: 55,578,841 ns/iter (+/- 537,463) = 10 MB/s +test sherlock::words ... bench: 19,103,327 ns/iter (+/- 102,828) = 31 MB/s test result: ok. 
0 passed; 0 failed; 0 ignored; 71 measured diff --git a/benches/log/03-bytes/onig b/benches/log/03-bytes/onig new file mode 100644 index 0000000000..aaf666b431 --- /dev/null +++ b/benches/log/03-bytes/onig @@ -0,0 +1,68 @@ + Compiling regex-benchmark v0.1.0 (file:///home/andrew/data/projects/rust/regex/benches) + Running benches/target/release/onig-e3bc363aa56fb408 + +running 61 tests +test misc::anchored_literal_long_match ... bench: 70 ns/iter (+/- 1) = 5571 MB/s +test misc::anchored_literal_long_non_match ... bench: 424 ns/iter (+/- 4) = 919 MB/s +test misc::anchored_literal_short_match ... bench: 70 ns/iter (+/- 1) = 371 MB/s +test misc::anchored_literal_short_non_match ... bench: 38 ns/iter (+/- 0) = 684 MB/s +test misc::easy0_1K ... bench: 176 ns/iter (+/- 2) = 5818 MB/s +test misc::easy0_1MB ... bench: 163,547 ns/iter (+/- 1,451) = 6411 MB/s +test misc::easy0_32 ... bench: 20 ns/iter (+/- 1) = 1600 MB/s +test misc::easy0_32K ... bench: 5,056 ns/iter (+/- 64) = 6481 MB/s +test misc::easy1_1K ... bench: 4,103 ns/iter (+/- 11) = 249 MB/s +test misc::easy1_1MB ... bench: 4,198,406 ns/iter (+/- 62,171) = 249 MB/s +test misc::easy1_32 ... bench: 139 ns/iter (+/- 1) = 230 MB/s +test misc::easy1_32K ... bench: 131,083 ns/iter (+/- 1,310) = 249 MB/s +test misc::hard_1K ... bench: 163 ns/iter (+/- 3) = 6282 MB/s +test misc::hard_1MB ... bench: 163,910 ns/iter (+/- 2,368) = 6397 MB/s +test misc::hard_32 ... bench: 20 ns/iter (+/- 1) = 1600 MB/s +test misc::hard_32K ... bench: 5,002 ns/iter (+/- 306) = 6550 MB/s +test misc::literal ... bench: 226 ns/iter (+/- 0) = 225 MB/s +test misc::match_class ... bench: 337 ns/iter (+/- 2) = 240 MB/s +test misc::match_class_in_range ... bench: 337 ns/iter (+/- 1) = 240 MB/s +test misc::match_class_unicode ... bench: 2,004 ns/iter (+/- 26) = 80 MB/s +test misc::medium_1K ... bench: 191 ns/iter (+/- 2) = 5361 MB/s +test misc::medium_1MB ... bench: 164,027 ns/iter (+/- 2,494) = 6392 MB/s +test misc::medium_32 ... bench: 22 ns/iter (+/- 1) = 1454 MB/s +test misc::medium_32K ... bench: 4,962 ns/iter (+/- 60) = 6603 MB/s +test misc::not_literal ... bench: 359 ns/iter (+/- 5) = 142 MB/s +test misc::one_pass_long_prefix ... bench: 94 ns/iter (+/- 3) = 276 MB/s +test misc::one_pass_long_prefix_not ... bench: 101 ns/iter (+/- 1) = 257 MB/s +test misc::one_pass_short ... bench: 332 ns/iter (+/- 6) = 51 MB/s +test misc::one_pass_short_not ... bench: 318 ns/iter (+/- 4) = 53 MB/s +test sherlock::before_holmes ... bench: 70,859,542 ns/iter (+/- 594,306) = 8 MB/s +test sherlock::everything_greedy ... bench: 5,129,894 ns/iter (+/- 33,792) = 115 MB/s +test sherlock::holmes_cochar_watson ... bench: 2,388,047 ns/iter (+/- 19,666) = 249 MB/s +test sherlock::ing_suffix ... bench: 28,413,935 ns/iter (+/- 800,513) = 20 MB/s +test sherlock::ing_suffix_limited_space ... bench: 2,636,327 ns/iter (+/- 66,410) = 225 MB/s +test sherlock::letters ... bench: 26,471,724 ns/iter (+/- 872,994) = 22 MB/s +test sherlock::letters_lower ... bench: 26,124,489 ns/iter (+/- 556,750) = 22 MB/s +test sherlock::letters_upper ... bench: 11,268,144 ns/iter (+/- 338,510) = 52 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 195,797 ns/iter (+/- 1,621) = 3038 MB/s +test sherlock::name_alt1 ... bench: 2,100,763 ns/iter (+/- 16,823) = 283 MB/s +test sherlock::name_alt2 ... bench: 2,212,816 ns/iter (+/- 17,997) = 268 MB/s +test sherlock::name_alt3 ... bench: 3,031,567 ns/iter (+/- 35,631) = 196 MB/s +test sherlock::name_alt3_nocase ... 
bench: 39,737,911 ns/iter (+/- 166,863) = 14 MB/s +test sherlock::name_alt4 ... bench: 2,230,681 ns/iter (+/- 18,856) = 266 MB/s +test sherlock::name_alt4_nocase ... bench: 8,294,698 ns/iter (+/- 36,887) = 71 MB/s +test sherlock::name_holmes ... bench: 402,600 ns/iter (+/- 6,232) = 1477 MB/s +test sherlock::name_holmes_nocase ... bench: 4,074,155 ns/iter (+/- 23,317) = 146 MB/s +test sherlock::name_sherlock ... bench: 270,225 ns/iter (+/- 2,815) = 2201 MB/s +test sherlock::name_sherlock_holmes ... bench: 196,502 ns/iter (+/- 2,168) = 3027 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 4,397,347 ns/iter (+/- 28,567) = 135 MB/s +test sherlock::name_sherlock_nocase ... bench: 4,400,574 ns/iter (+/- 25,127) = 135 MB/s +test sherlock::name_whitespace ... bench: 274,462 ns/iter (+/- 3,180) = 2167 MB/s +test sherlock::no_match_common ... bench: 596,601 ns/iter (+/- 9,285) = 997 MB/s +test sherlock::no_match_uncommon ... bench: 586,258 ns/iter (+/- 7,702) = 1014 MB/s +test sherlock::quotes ... bench: 4,069,570 ns/iter (+/- 20,372) = 146 MB/s +test sherlock::repeated_class_negation ... bench: 44,936,445 ns/iter (+/- 103,467) = 13 MB/s +test sherlock::the_lower ... bench: 1,300,513 ns/iter (+/- 12,884) = 457 MB/s +test sherlock::the_nocase ... bench: 5,141,237 ns/iter (+/- 25,487) = 115 MB/s +test sherlock::the_upper ... bench: 821,454 ns/iter (+/- 13,420) = 724 MB/s +test sherlock::the_whitespace ... bench: 2,009,530 ns/iter (+/- 14,082) = 296 MB/s +test sherlock::word_ending_n ... bench: 27,847,316 ns/iter (+/- 47,618) = 21 MB/s +test sherlock::words ... bench: 21,105,627 ns/iter (+/- 33,436) = 28 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 61 measured + diff --git a/benches/log/03-bytes/pcre b/benches/log/03-bytes/pcre new file mode 100644 index 0000000000..236613ae0f --- /dev/null +++ b/benches/log/03-bytes/pcre @@ -0,0 +1,66 @@ + Running benches/target/release/pcre-855c18fb35cdf072 + +running 60 tests +test misc::anchored_literal_long_match ... bench: 88 ns/iter (+/- 12) = 4431 MB/s +test misc::anchored_literal_long_non_match ... bench: 58 ns/iter (+/- 1) = 6724 MB/s +test misc::anchored_literal_short_match ... bench: 88 ns/iter (+/- 1) = 295 MB/s +test misc::anchored_literal_short_non_match ... bench: 60 ns/iter (+/- 3) = 433 MB/s +test misc::easy0_1K ... bench: 266 ns/iter (+/- 1) = 3849 MB/s +test misc::easy0_1MB ... bench: 227,366 ns/iter (+/- 794) = 4611 MB/s +test misc::easy0_32 ... bench: 62 ns/iter (+/- 2) = 516 MB/s +test misc::easy0_32K ... bench: 7,061 ns/iter (+/- 109) = 4640 MB/s +test misc::easy1_1K ... bench: 805 ns/iter (+/- 10) = 1272 MB/s +test misc::easy1_1MB ... bench: 751,948 ns/iter (+/- 6,995) = 1394 MB/s +test misc::easy1_32 ... bench: 71 ns/iter (+/- 1) = 450 MB/s +test misc::easy1_32K ... bench: 23,635 ns/iter (+/- 213) = 1386 MB/s +test misc::hard_1K ... bench: 31,008 ns/iter (+/- 299) = 33 MB/s +test misc::hard_1MB ... bench: 35,078,241 ns/iter (+/- 94,197) = 29 MB/s +test misc::hard_32 ... bench: 313 ns/iter (+/- 1) = 102 MB/s +test misc::hard_32K ... bench: 995,958 ns/iter (+/- 10,945) = 32 MB/s +test misc::literal ... bench: 130 ns/iter (+/- 1) = 392 MB/s +test misc::match_class ... bench: 176 ns/iter (+/- 2) = 460 MB/s +test misc::match_class_in_range ... bench: 178 ns/iter (+/- 1) = 455 MB/s +test misc::match_class_unicode ... bench: 511 ns/iter (+/- 6) = 315 MB/s +test misc::medium_1K ... bench: 275 ns/iter (+/- 4) = 3723 MB/s +test misc::medium_1MB ... bench: 239,603 ns/iter (+/- 1,808) = 4376 MB/s +test misc::medium_32 ... 
bench: 62 ns/iter (+/- 1) = 516 MB/s +test misc::medium_32K ... bench: 7,385 ns/iter (+/- 43) = 4437 MB/s +test misc::not_literal ... bench: 274 ns/iter (+/- 3) = 186 MB/s +test misc::one_pass_long_prefix ... bench: 87 ns/iter (+/- 1) = 298 MB/s +test misc::one_pass_long_prefix_not ... bench: 88 ns/iter (+/- 0) = 295 MB/s +test misc::one_pass_short ... bench: 115 ns/iter (+/- 0) = 147 MB/s +test misc::one_pass_short_not ... bench: 118 ns/iter (+/- 0) = 144 MB/s +test sherlock::before_holmes ... bench: 14,338,348 ns/iter (+/- 23,734) = 41 MB/s +test sherlock::holmes_cochar_watson ... bench: 547,196 ns/iter (+/- 4,100) = 1087 MB/s +test sherlock::ing_suffix ... bench: 6,012,620 ns/iter (+/- 51,777) = 98 MB/s +test sherlock::ing_suffix_limited_space ... bench: 6,374,577 ns/iter (+/- 46,486) = 93 MB/s +test sherlock::letters ... bench: 28,575,184 ns/iter (+/- 65,051) = 20 MB/s +test sherlock::letters_lower ... bench: 25,819,606 ns/iter (+/- 180,823) = 23 MB/s +test sherlock::letters_upper ... bench: 3,227,381 ns/iter (+/- 11,443) = 184 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 193,512 ns/iter (+/- 1,316) = 3074 MB/s +test sherlock::name_alt1 ... bench: 454,510 ns/iter (+/- 2,721) = 1308 MB/s +test sherlock::name_alt2 ... bench: 499,453 ns/iter (+/- 4,692) = 1191 MB/s +test sherlock::name_alt3 ... bench: 1,085,732 ns/iter (+/- 6,841) = 547 MB/s +test sherlock::name_alt3_nocase ... bench: 3,194,995 ns/iter (+/- 12,655) = 186 MB/s +test sherlock::name_alt4 ... bench: 944,353 ns/iter (+/- 12,661) = 629 MB/s +test sherlock::name_alt4_nocase ... bench: 1,646,368 ns/iter (+/- 12,376) = 361 MB/s +test sherlock::name_holmes ... bench: 395,019 ns/iter (+/- 3,929) = 1506 MB/s +test sherlock::name_holmes_nocase ... bench: 493,327 ns/iter (+/- 7,213) = 1205 MB/s +test sherlock::name_sherlock ... bench: 266,400 ns/iter (+/- 1,591) = 2233 MB/s +test sherlock::name_sherlock_holmes ... bench: 196,357 ns/iter (+/- 1,770) = 3029 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 1,259,747 ns/iter (+/- 4,939) = 472 MB/s +test sherlock::name_sherlock_nocase ... bench: 1,128,970 ns/iter (+/- 6,730) = 526 MB/s +test sherlock::name_whitespace ... bench: 267,323 ns/iter (+/- 1,296) = 2225 MB/s +test sherlock::no_match_common ... bench: 595,372 ns/iter (+/- 5,690) = 999 MB/s +test sherlock::no_match_uncommon ... bench: 585,406 ns/iter (+/- 5,719) = 1016 MB/s +test sherlock::quotes ... bench: 1,223,528 ns/iter (+/- 6,579) = 486 MB/s +test sherlock::repeated_class_negation ... bench: 6,440,584 ns/iter (+/- 20,444) = 92 MB/s +test sherlock::the_lower ... bench: 1,220,999 ns/iter (+/- 7,595) = 487 MB/s +test sherlock::the_nocase ... bench: 1,263,078 ns/iter (+/- 15,321) = 471 MB/s +test sherlock::the_upper ... bench: 781,141 ns/iter (+/- 15,408) = 761 MB/s +test sherlock::the_whitespace ... bench: 1,383,414 ns/iter (+/- 548,289) = 430 MB/s +test sherlock::word_ending_n ... bench: 12,709,045 ns/iter (+/- 51,420) = 46 MB/s +test sherlock::words ... bench: 10,798,918 ns/iter (+/- 40,027) = 55 MB/s + +test result: ok. 
0 passed; 0 failed; 0 ignored; 60 measured + diff --git a/benches/log/03-bytes/rust b/benches/log/03-bytes/rust new file mode 100644 index 0000000000..6dec097c5c --- /dev/null +++ b/benches/log/03-bytes/rust @@ -0,0 +1,83 @@ + Compiling regex-syntax v0.2.5 (file:///home/andrew/data/projects/rust/regex/benches) + Compiling regex v0.1.55 (file:///home/andrew/data/projects/rust/regex/benches) + Compiling regex-benchmark v0.1.0 (file:///home/andrew/data/projects/rust/regex/benches) + Running benches/target/release/rust-50db306d093e5666 + +running 74 tests +test misc::anchored_literal_long_match ... bench: 75 ns/iter (+/- 5) = 5200 MB/s +test misc::anchored_literal_long_non_match ... bench: 56 ns/iter (+/- 0) = 6964 MB/s +test misc::anchored_literal_short_match ... bench: 79 ns/iter (+/- 0) = 329 MB/s +test misc::anchored_literal_short_non_match ... bench: 56 ns/iter (+/- 1) = 464 MB/s +test misc::easy0_1K ... bench: 138 ns/iter (+/- 0) = 7420 MB/s +test misc::easy0_1MB ... bench: 247,159 ns/iter (+/- 724) = 4242 MB/s +test misc::easy0_32 ... bench: 71 ns/iter (+/- 0) = 450 MB/s +test misc::easy0_32K ... bench: 5,474 ns/iter (+/- 34) = 5986 MB/s +test misc::easy1_1K ... bench: 273 ns/iter (+/- 1) = 3750 MB/s +test misc::easy1_1MB ... bench: 317,946 ns/iter (+/- 2,512) = 3297 MB/s +test misc::easy1_32 ... bench: 67 ns/iter (+/- 0) = 477 MB/s +test misc::easy1_32K ... bench: 5,882 ns/iter (+/- 32) = 5570 MB/s +test misc::hard_1K ... bench: 4,713 ns/iter (+/- 13) = 217 MB/s +test misc::hard_1MB ... bench: 4,732,901 ns/iter (+/- 6,948) = 221 MB/s +test misc::hard_32 ... bench: 201 ns/iter (+/- 0) = 159 MB/s +test misc::hard_32K ... bench: 147,994 ns/iter (+/- 900) = 221 MB/s +test misc::literal ... bench: 19 ns/iter (+/- 0) = 2684 MB/s +test misc::match_class ... bench: 85 ns/iter (+/- 0) = 952 MB/s +test misc::match_class_in_range ... bench: 30 ns/iter (+/- 1) = 2700 MB/s +test misc::match_class_unicode ... bench: 806 ns/iter (+/- 2) = 199 MB/s +test misc::medium_1K ... bench: 1,384 ns/iter (+/- 10) = 739 MB/s +test misc::medium_1MB ... bench: 1,974,381 ns/iter (+/- 7,383) = 531 MB/s +test misc::medium_32 ... bench: 130 ns/iter (+/- 0) = 246 MB/s +test misc::medium_32K ... bench: 52,783 ns/iter (+/- 465) = 620 MB/s +test misc::no_exponential ... bench: 536 ns/iter (+/- 13) = 186 MB/s +test misc::not_literal ... bench: 293 ns/iter (+/- 1) = 174 MB/s +test misc::one_pass_long_prefix ... bench: 179 ns/iter (+/- 1) = 145 MB/s +test misc::one_pass_long_prefix_not ... bench: 180 ns/iter (+/- 2) = 144 MB/s +test misc::one_pass_short ... bench: 139 ns/iter (+/- 1) = 122 MB/s +test misc::one_pass_short_not ... bench: 142 ns/iter (+/- 1) = 119 MB/s +test misc::replace_all ... bench: 171 ns/iter (+/- 1) +test rust_compile::compile_huge ... bench: 126,158 ns/iter (+/- 1,790) +test rust_compile::compile_huge_bytes ... bench: 18,088,719 ns/iter (+/- 518,980) +test rust_compile::compile_simple ... bench: 6,141 ns/iter (+/- 394) +test rust_compile::compile_simple_bytes ... bench: 6,669 ns/iter (+/- 306) +test rust_compile::compile_small ... bench: 7,431 ns/iter (+/- 275) +test rust_compile::compile_small_bytes ... bench: 191,002 ns/iter (+/- 1,297) +test rust_parse::parse_huge ... bench: 1,204 ns/iter (+/- 9) +test rust_parse::parse_simple ... bench: 1,905 ns/iter (+/- 16) +test rust_parse::parse_small ... bench: 2,454 ns/iter (+/- 24) +test sherlock::before_holmes ... bench: 2,748,082 ns/iter (+/- 11,406) = 216 MB/s +test sherlock::everything_greedy ... 
bench: 7,833,414 ns/iter (+/- 42,538) = 75 MB/s +test sherlock::everything_greedy_nl ... bench: 5,426,141 ns/iter (+/- 31,378) = 109 MB/s +test sherlock::holmes_cochar_watson ... bench: 262,322 ns/iter (+/- 5,243) = 2267 MB/s +test sherlock::holmes_coword_watson ... bench: 1,324,677 ns/iter (+/- 21,666) = 449 MB/s +test sherlock::ing_suffix ... bench: 3,179,928 ns/iter (+/- 40,246) = 187 MB/s +test sherlock::ing_suffix_limited_space ... bench: 3,525,004 ns/iter (+/- 37,262) = 168 MB/s +test sherlock::letters ... bench: 60,268,445 ns/iter (+/- 1,958,610) = 9 MB/s +test sherlock::letters_lower ... bench: 57,743,679 ns/iter (+/- 84,675) = 10 MB/s +test sherlock::letters_upper ... bench: 4,549,709 ns/iter (+/- 9,312) = 130 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 2,690,794 ns/iter (+/- 2,796) = 221 MB/s +test sherlock::name_alt1 ... bench: 42,476 ns/iter (+/- 346) = 14006 MB/s +test sherlock::name_alt2 ... bench: 199,058 ns/iter (+/- 1,498) = 2988 MB/s +test sherlock::name_alt3 ... bench: 1,248,439 ns/iter (+/- 3,051) = 476 MB/s +test sherlock::name_alt3_nocase ... bench: 1,463,628 ns/iter (+/- 2,799) = 406 MB/s +test sherlock::name_alt4 ... bench: 296,390 ns/iter (+/- 798) = 2007 MB/s +test sherlock::name_alt4_nocase ... bench: 1,415,770 ns/iter (+/- 3,400) = 420 MB/s +test sherlock::name_holmes ... bench: 49,713 ns/iter (+/- 317) = 11967 MB/s +test sherlock::name_holmes_nocase ... bench: 1,181,147 ns/iter (+/- 2,842) = 503 MB/s +test sherlock::name_sherlock ... bench: 34,263 ns/iter (+/- 136) = 17363 MB/s +test sherlock::name_sherlock_holmes ... bench: 34,179 ns/iter (+/- 188) = 17406 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 1,236,384 ns/iter (+/- 5,012) = 481 MB/s +test sherlock::name_sherlock_nocase ... bench: 1,232,613 ns/iter (+/- 5,009) = 482 MB/s +test sherlock::name_whitespace ... bench: 60,024 ns/iter (+/- 187) = 9911 MB/s +test sherlock::no_match_common ... bench: 558,607 ns/iter (+/- 2,595) = 1065 MB/s +test sherlock::no_match_uncommon ... bench: 24,049 ns/iter (+/- 54) = 24738 MB/s +test sherlock::quotes ... bench: 966,792 ns/iter (+/- 2,982) = 615 MB/s +test sherlock::repeated_class_negation ... bench: 84,186,484 ns/iter (+/- 66,800) = 7 MB/s +test sherlock::the_lower ... bench: 773,759 ns/iter (+/- 2,759) = 768 MB/s +test sherlock::the_nocase ... bench: 1,705,648 ns/iter (+/- 4,604) = 348 MB/s +test sherlock::the_upper ... bench: 52,729 ns/iter (+/- 209) = 11282 MB/s +test sherlock::the_whitespace ... bench: 1,981,215 ns/iter (+/- 8,080) = 300 MB/s +test sherlock::word_ending_n ... bench: 53,482,650 ns/iter (+/- 73,844) = 11 MB/s +test sherlock::words ... bench: 18,961,987 ns/iter (+/- 27,794) = 31 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 74 measured + diff --git a/benches/log/03-bytes/rust-bytes b/benches/log/03-bytes/rust-bytes new file mode 100644 index 0000000000..735d259348 --- /dev/null +++ b/benches/log/03-bytes/rust-bytes @@ -0,0 +1,66 @@ + Compiling regex-benchmark v0.1.0 (file:///home/andrew/data/projects/rust/regex/benches) + Running benches/target/release/rust_bytes-9f3b188bc741e04b + +running 59 tests +test misc::anchored_literal_long_match ... bench: 75 ns/iter (+/- 6) = 5200 MB/s +test misc::anchored_literal_long_non_match ... bench: 55 ns/iter (+/- 0) = 7090 MB/s +test misc::anchored_literal_short_match ... bench: 75 ns/iter (+/- 0) = 346 MB/s +test misc::anchored_literal_short_non_match ... bench: 55 ns/iter (+/- 0) = 472 MB/s +test misc::easy0_1K ... 
bench: 245 ns/iter (+/- 0) = 4179 MB/s +test misc::easy0_1MB ... bench: 251,614 ns/iter (+/- 1,143) = 4167 MB/s +test misc::easy0_32 ... bench: 62 ns/iter (+/- 1) = 516 MB/s +test misc::easy0_32K ... bench: 5,281 ns/iter (+/- 66) = 6204 MB/s +test misc::easy1_1K ... bench: 266 ns/iter (+/- 1) = 3849 MB/s +test misc::easy1_1MB ... bench: 325,060 ns/iter (+/- 2,011) = 3225 MB/s +test misc::easy1_32 ... bench: 73 ns/iter (+/- 0) = 438 MB/s +test misc::easy1_32K ... bench: 5,609 ns/iter (+/- 41) = 5842 MB/s +test misc::hard_1K ... bench: 4,678 ns/iter (+/- 38) = 218 MB/s +test misc::hard_1MB ... bench: 4,736,631 ns/iter (+/- 26,227) = 221 MB/s +test misc::hard_32 ... bench: 199 ns/iter (+/- 0) = 160 MB/s +test misc::hard_32K ... bench: 148,282 ns/iter (+/- 1,353) = 220 MB/s +test misc::literal ... bench: 18 ns/iter (+/- 0) = 2833 MB/s +test misc::match_class ... bench: 83 ns/iter (+/- 0) = 975 MB/s +test misc::match_class_in_range ... bench: 30 ns/iter (+/- 0) = 2700 MB/s +test misc::medium_1K ... bench: 1,147 ns/iter (+/- 10) = 892 MB/s +test misc::medium_1MB ... bench: 1,953,230 ns/iter (+/- 10,530) = 536 MB/s +test misc::medium_32 ... bench: 99 ns/iter (+/- 0) = 323 MB/s +test misc::medium_32K ... bench: 54,705 ns/iter (+/- 349) = 598 MB/s +test misc::no_exponential ... bench: 534 ns/iter (+/- 4) = 187 MB/s +test misc::not_literal ... bench: 292 ns/iter (+/- 3) = 174 MB/s +test misc::one_pass_long_prefix ... bench: 179 ns/iter (+/- 1) = 145 MB/s +test misc::one_pass_long_prefix_not ... bench: 180 ns/iter (+/- 2) = 144 MB/s +test misc::one_pass_short ... bench: 139 ns/iter (+/- 0) = 122 MB/s +test misc::one_pass_short_not ... bench: 139 ns/iter (+/- 0) = 122 MB/s +test sherlock::before_holmes ... bench: 2,778,686 ns/iter (+/- 8,735) = 214 MB/s +test sherlock::everything_greedy ... bench: 7,884,691 ns/iter (+/- 37,268) = 75 MB/s +test sherlock::everything_greedy_nl ... bench: 5,406,627 ns/iter (+/- 24,707) = 110 MB/s +test sherlock::holmes_cochar_watson ... bench: 262,175 ns/iter (+/- 1,995) = 2269 MB/s +test sherlock::holmes_coword_watson ... bench: 1,299,904 ns/iter (+/- 5,090) = 457 MB/s +test sherlock::ing_suffix ... bench: 3,202,899 ns/iter (+/- 20,810) = 185 MB/s +test sherlock::ing_suffix_limited_space ... bench: 3,367,381 ns/iter (+/- 14,143) = 176 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 2,725,593 ns/iter (+/- 10,736) = 218 MB/s +test sherlock::name_alt1 ... bench: 42,161 ns/iter (+/- 355) = 14110 MB/s +test sherlock::name_alt2 ... bench: 195,390 ns/iter (+/- 1,112) = 3044 MB/s +test sherlock::name_alt3 ... bench: 1,248,432 ns/iter (+/- 3,244) = 476 MB/s +test sherlock::name_alt3_nocase ... bench: 3,371,906 ns/iter (+/- 42,421) = 176 MB/s +test sherlock::name_alt4 ... bench: 296,423 ns/iter (+/- 1,812) = 2007 MB/s +test sherlock::name_alt4_nocase ... bench: 1,753,178 ns/iter (+/- 23,269) = 339 MB/s +test sherlock::name_holmes ... bench: 49,554 ns/iter (+/- 261) = 12005 MB/s +test sherlock::name_holmes_nocase ... bench: 1,347,682 ns/iter (+/- 5,678) = 441 MB/s +test sherlock::name_sherlock ... bench: 33,937 ns/iter (+/- 208) = 17530 MB/s +test sherlock::name_sherlock_holmes ... bench: 33,870 ns/iter (+/- 225) = 17565 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 1,212,233 ns/iter (+/- 5,452) = 490 MB/s +test sherlock::name_sherlock_nocase ... bench: 1,190,590 ns/iter (+/- 3,248) = 499 MB/s +test sherlock::name_whitespace ... bench: 59,434 ns/iter (+/- 253) = 10009 MB/s +test sherlock::no_match_common ... 
bench: 565,962 ns/iter (+/- 4,601) = 1051 MB/s +test sherlock::no_match_uncommon ... bench: 23,729 ns/iter (+/- 218) = 25071 MB/s +test sherlock::quotes ... bench: 966,904 ns/iter (+/- 7,115) = 615 MB/s +test sherlock::repeated_class_negation ... bench: 121,271,073 ns/iter (+/- 242,789) = 4 MB/s +test sherlock::the_lower ... bench: 778,850 ns/iter (+/- 6,781) = 763 MB/s +test sherlock::the_nocase ... bench: 2,876,190 ns/iter (+/- 8,611) = 206 MB/s +test sherlock::the_upper ... bench: 52,617 ns/iter (+/- 315) = 11306 MB/s +test sherlock::the_whitespace ... bench: 1,982,270 ns/iter (+/- 11,079) = 300 MB/s +test sherlock::word_ending_n ... bench: 76,442,330 ns/iter (+/- 236,690) = 7 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 59 measured + diff --git a/benches/src/bench_onig.rs b/benches/src/bench_onig.rs new file mode 100644 index 0000000000..577fb955d5 --- /dev/null +++ b/benches/src/bench_onig.rs @@ -0,0 +1,52 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![feature(test)] + +#[macro_use] extern crate lazy_static; +extern crate onig; +extern crate rand; +extern crate test; + +use std::ops::Deref; + +pub struct Regex(onig::Regex); + +unsafe impl Send for Regex {} +unsafe impl Sync for Regex {} + +impl Deref for Regex { + type Target = onig::Regex; + + fn deref(&self) -> &onig::Regex { + &self.0 + } +} + +impl Regex { + fn new(pattern: &str) -> Result { + onig::Regex::new(pattern).map(Regex) + } + + // Gah. onig's match function is anchored, but find is not. + fn is_match(&self, text: &str) -> bool { + self.search_with_options( + text, 0, text.len(), onig::SEARCH_OPTION_NONE, None).is_some() + } +} + +macro_rules! regex( + ($re:expr) => {{ + ::Regex::new($re).unwrap() + }} +); + +mod misc; +mod sherlock; diff --git a/benches/src/bench_pcre.rs b/benches/src/bench_pcre.rs new file mode 100644 index 0000000000..4441c1d0ff --- /dev/null +++ b/benches/src/bench_pcre.rs @@ -0,0 +1,75 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// These benchmarks use PCRE to reproduce some of the benchmarks used to track +// performance of regexes in this crate. I'm not an experienced user of PCRE, +// so it's possible that usage here is not optimal. If it isn't, then +// improvements are welcome. (I'm aware that PCRE has a DFA, but it doesn't +// seem to actually preserve leftmost-first semantics, like the DFA in this +// crate does.) +// +// Note that for these benchmarks, all we need is to detect whether there is +// a match or not. + +#![feature(test)] +#![allow(non_snake_case)] + +extern crate enum_set; +#[macro_use] extern crate lazy_static; +extern crate pcre; +extern crate rand; +extern crate test; + +/// A nominal wrapper around pcre::Pcre to expose an interface similar to +/// regex::Regex. +struct Regex(pcre::Pcre); + +/// lazy_static wants this. No reason not to provide it. +/// It's unsafe, but we don't really care when benchmarking. 
+unsafe impl Send for Regex {} +unsafe impl Sync for Regex {} + +impl Regex { + fn new(pattern: &str) -> Result { + use enum_set::EnumSet; + use pcre::{Pcre, CompileOption, StudyOption}; + + let mut comp_opts = EnumSet::new(); + // Rust's regex library exclusively uses Unicode-aware character + // classes. + comp_opts.insert(CompileOption::Ucp); + let mut re = try!(Pcre::compile_with_options(pattern, &comp_opts)); + + // Make it go as fast as possible? + let mut study_opts = EnumSet::new(); + study_opts.insert(StudyOption::StudyJitCompile); + re.study_with_options(&study_opts); + + Ok(Regex(re)) + } + + fn is_match(&mut self, text: &str) -> bool { + self.0.exec(text).is_some() + } + + fn find_iter<'a, 'p>( + &'p mut self, + text: &'a str, + ) -> pcre::MatchIterator<'a, 'p> { + self.0.matches(text) + } +} + +macro_rules! regex( + ($re:expr) => { ::Regex::new($re).unwrap() } +); + +mod misc; +mod sherlock; diff --git a/benches/bench_dynamic.rs b/benches/src/bench_rust.rs similarity index 85% rename from benches/bench_dynamic.rs rename to benches/src/bench_rust.rs index 70abf5dce7..e19d453cea 100644 --- a/benches/bench_dynamic.rs +++ b/benches/src/bench_rust.rs @@ -16,14 +16,16 @@ extern crate regex; extern crate regex_syntax; extern crate test; +pub use regex::Regex; + // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. macro_rules! regex( - ($re:expr) => {{ ::regex::Regex::new($re).unwrap() }} + ($re:expr) => {{ ::Regex::new($re).unwrap() }} ); -mod bench; -mod bench_dynamic_compile; -mod bench_dynamic_parse; -mod bench_sherlock; +mod misc; +mod rust_compile; +mod rust_parse; +mod sherlock; diff --git a/benches/bench_dynamic_nfa.rs b/benches/src/bench_rust_bytes.rs similarity index 78% rename from benches/bench_dynamic_nfa.rs rename to benches/src/bench_rust_bytes.rs index 8c026ec778..05a5592722 100644 --- a/benches/bench_dynamic_nfa.rs +++ b/benches/src/bench_rust_bytes.rs @@ -16,17 +16,14 @@ extern crate regex; extern crate regex_syntax; extern crate test; +pub use regex::bytes::Regex; + // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. macro_rules! regex( - ($re:expr) => {{ - use regex::internal::ExecBuilder; - ExecBuilder::new($re).nfa().build().unwrap().into_regex() - }} + ($re:expr) => {{ ::Regex::new($re).unwrap() }} ); -mod bench; -mod bench_dynamic_compile; -mod bench_dynamic_parse; -mod bench_sherlock; +mod misc; +mod sherlock; diff --git a/benches/bench_native.rs b/benches/src/bench_rust_plugin.rs similarity index 93% rename from benches/bench_native.rs rename to benches/src/bench_rust_plugin.rs index c1766991e7..11a85e634b 100644 --- a/benches/bench_native.rs +++ b/benches/src/bench_rust_plugin.rs @@ -17,5 +17,7 @@ extern crate regex; extern crate regex_syntax; extern crate test; -mod bench; -mod bench_sherlock; +pub use regex::Regex; + +mod misc; +mod sherlock; diff --git a/benches/bench.rs b/benches/src/misc.rs similarity index 84% rename from benches/bench.rs rename to benches/src/misc.rs index 7261477518..9a4a23db5f 100644 --- a/benches/bench.rs +++ b/benches/src/misc.rs @@ -13,20 +13,30 @@ use std::iter::repeat; use rand::{Rng, thread_rng}; -use regex::{Regex, NoExpand}; use test::Bencher; +use Regex; + +#[cfg(not(feature = "re-rust-bytes"))] +macro_rules! 
text { ($text:expr) => { $text } } +#[cfg(feature = "re-rust-bytes")] +macro_rules! text { ($text:expr) => { $text.as_bytes() } } + macro_rules! bench_match { ($name:ident, $re:expr, $text:expr) => { #[bench] fn $name(b: &mut Bencher) { + #![allow(unused_mut)] + use std::sync::Mutex; + lazy_static! { - static ref RE: Regex = $re; + static ref RE: Mutex = Mutex::new($re); static ref TEXT: String = $text; }; + let mut re = RE.lock().unwrap(); b.bytes = TEXT.len() as u64; b.iter(|| { - if !RE.is_match(&TEXT) { + if !re.is_match(text!(&TEXT)) { panic!("expected match, got not match"); } }); @@ -38,13 +48,17 @@ macro_rules! bench_nomatch { ($name:ident, $re:expr, $text:expr) => { #[bench] fn $name(b: &mut Bencher) { + #![allow(unused_mut)] + use std::sync::Mutex; + lazy_static! { - static ref RE: Regex = $re; + static ref RE: Mutex = Mutex::new($re); static ref TEXT: String = $text; }; + let mut re = RE.lock().unwrap(); b.bytes = TEXT.len() as u64; b.iter(|| { - if RE.is_match(&TEXT) { + if re.is_match(text!(&TEXT)) { panic!("match not expected"); } }); @@ -52,6 +66,8 @@ macro_rules! bench_nomatch { } } +#[cfg(not(feature = "re-onig"))] +#[cfg(not(feature = "re-pcre"))] bench_match!(no_exponential, { let re = format!( "{}{}", @@ -77,7 +93,8 @@ bench_match!(match_class_in_range, regex!("[ac]"), { format!("{}c", repeat("bbbb").take(20).collect::()) }); -bench_match!(match_class_unicode, regex!(r"\pL"), { +#[cfg(not(feature = "re-rust-bytes"))] +bench_match!(match_class_unicode, regex!(r"\p{L}"), { format!("{}a", repeat("☃5☃5").take(20).collect::()) }); @@ -161,9 +178,10 @@ bench_nomatch!(hard_1K, hard(), gen_text(1<<10)); bench_nomatch!(hard_32K, hard(), gen_text(32<<10)); bench_nomatch!(hard_1MB, hard(), gen_text(1<<20)); +#[cfg(feature = "re-rust")] #[bench] fn replace_all(b: &mut Bencher) { let re = regex!("[cjrw]"); let text = "abcdefghijklmnopqrstuvwxyz"; - b.iter(|| re.replace_all(text, NoExpand(""))); + b.iter(|| re.replace_all(text, "")); } diff --git a/benches/bench_dynamic_compile.rs b/benches/src/rust_compile.rs similarity index 100% rename from benches/bench_dynamic_compile.rs rename to benches/src/rust_compile.rs diff --git a/benches/bench_dynamic_parse.rs b/benches/src/rust_parse.rs similarity index 100% rename from benches/bench_dynamic_parse.rs rename to benches/src/rust_parse.rs diff --git a/benches/bench_sherlock.rs b/benches/src/sherlock.rs similarity index 73% rename from benches/bench_sherlock.rs rename to benches/src/sherlock.rs index 308b4c82d0..5da7b2c6c6 100644 --- a/benches/bench_sherlock.rs +++ b/benches/src/sherlock.rs @@ -8,25 +8,39 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use regex::Regex; use test::Bencher; +use Regex; + +#[cfg(not(feature = "re-rust-bytes"))] lazy_static! { static ref SHERLOCK: String = { include_str!("the-adventures-of-sherlock-holmes.txt").to_owned() }; } +#[cfg(feature = "re-rust-bytes")] +lazy_static! { + static ref SHERLOCK: Vec = { + include_bytes!("the-adventures-of-sherlock-holmes.txt")[..].to_owned() + }; +} + macro_rules! bench_find { ($name:ident, $re:expr, $count:expr) => { #[bench] fn $name(b: &mut Bencher) { + #![allow(unused_mut)] + + use std::sync::Mutex; + lazy_static! 
{ - static ref RE: Regex = $re; + static ref RE: Mutex = Mutex::new($re); }; + let mut re = RE.lock().unwrap(); b.bytes = SHERLOCK.len() as u64; b.iter(|| { - let count = RE.find_iter(&SHERLOCK).count(); + let count = re.find_iter(&SHERLOCK).count(); assert_eq!($count, count) }); } @@ -84,16 +98,23 @@ bench_find!(the_nocase, regex!("(?i)the"), 7987); // How fast can we match everything? This essentially defeats any clever prefix // tricks and just executes the DFA across the entire input. +#[cfg(not(feature = "re-pcre"))] bench_find!(everything_greedy, regex!(".*"), 13053); +#[cfg(not(feature = "re-onig"))] +#[cfg(not(feature = "re-pcre"))] bench_find!(everything_greedy_nl, regex!("(?s).*"), 1); // How fast can we match every letter? This also defeats any clever prefix // tricks. -bench_find!(letters, regex!(r"\pL"), 447160); +#[cfg(not(feature = "re-rust-bytes"))] +bench_find!(letters, regex!(r"\p{L}"), 447160); +#[cfg(not(feature = "re-rust-bytes"))] bench_find!(letters_upper, regex!(r"\p{Lu}"), 14180); +#[cfg(not(feature = "re-rust-bytes"))] bench_find!(letters_lower, regex!(r"\p{Ll}"), 432980); // Similarly, for words. +#[cfg(not(feature = "re-rust-bytes"))] bench_find!(words, regex!(r"\w+"), 109214); // Process whitespace after a very common word. @@ -114,6 +135,8 @@ bench_find!( // Find Holmes co-occuring with Watson in a particular window of words. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. +#[cfg(not(feature = "re-onig"))] +#[cfg(not(feature = "re-pcre"))] bench_find!( holmes_coword_watson, regex!(r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes"), @@ -134,5 +157,27 @@ bench_find!( // All words ending in `n`. // This uses word boundaries, which the lazy DFA cannot handle. Since the word // boundary also defeats finding any literal prefixes, we have to use the -// NFA algorithm the whole way. +// NFA algorithm the whole way, which is quite slow. bench_find!(word_ending_n, regex!(r"\b\w+n\b"), 8366); + +// This is a real bad one for Rust's engine. This particular expression +// fills the state cache quite frequently, which results in a lot of churn. +// This can be made to go roughly the speed of PCRE by increasing the DFA cache +// size. +// +// Its only salvation is that the DFA realizes it's executing slowly, gives up +// quickly and falls back to the NFA algorithm. +bench_find!(repeated_class_negation, regex!(r"[a-q][^u-z]{13}x"), 142); + +// This defeats any prefix optimizations and just chugs along in the DFA. +// +// (This is a potential candidate for a suffix literal optimization, but +// requires quite a bit more sophistication in the implementation.) +bench_find!(ing_suffix, regex!(r"[a-zA-Z]+ing"), 2824); + +// Similar to ing_suffix, but a little more complex by limiting the length +// of the word and making sure it's surrounded by whitespace. +// +// Onig does surprisingly well on this benchmark and yet does quite poorly on +// the ing_suffix benchmark. That one has me stumped. 
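// To make what bench_find! measures a little more concrete, here is a rough,
// self-contained sketch (an illustration only, not one of the files touched by
// this patch): compile a pattern once, then count non-overlapping matches with
// find_iter, which is what each benchmark body does over the Sherlock text.
// The short literal haystack below is just a placeholder.

extern crate regex;

use regex::Regex;

fn main() {
    // Compile once and reuse; the benchmarks get the same effect via
    // lazy_static!, with the regex behind a Mutex so that engines whose
    // is_match/find_iter take &mut self (e.g. the PCRE wrapper) fit the
    // same macro.
    let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
    let count = re.find_iter("singing and dancing in the morning").count();
    println!("matches: {}", count);
}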
+bench_find!(ing_suffix_limited_space, regex!(r"\s[a-zA-Z]{0,12}ing\s"), 2081); diff --git a/benches/the-adventures-of-sherlock-holmes.txt b/benches/src/the-adventures-of-sherlock-holmes.txt similarity index 100% rename from benches/the-adventures-of-sherlock-holmes.txt rename to benches/src/the-adventures-of-sherlock-holmes.txt diff --git a/examples/large-dfa.rs b/examples/large-dfa.rs deleted file mode 100644 index adc41944d4..0000000000 --- a/examples/large-dfa.rs +++ /dev/null @@ -1,42 +0,0 @@ -#![allow(dead_code)] - -extern crate rand; -extern crate regex; - -use rand::{Rng, thread_rng}; - -macro_rules! regex { - ($re:expr) => {{ - use regex::internal::ExecBuilder; - ExecBuilder::new($re).build().unwrap().into_regex() - }} -} - -fn main() { - println!("making input..."); - let input = make_input(10000000); - println!("compiling regex..."); - let re = regex!("(a|b)*a(a|b){20}"); - println!("searching..."); - for m in re.find_iter(&input) { - println!("{:?}", m); - } - - // let input = " Wed Nov 4 16:26:23 EST 2015"; - // let re = regex!(DATETIME); - // for m in re.find_iter(&input) { - // println!("{:?}", m); - // } -} - -fn make_input(size: usize) -> String { - let mut rng = thread_rng(); - let mut s = String::with_capacity(size); - for _ in 0..size { - s.push(if rng.gen() { 'a' } else { 'b'}); - } - s -} - -static DATETIME: &'static str = "\ -(?i)(?:(?:(?:(?:(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)|(?:mon|tue[s]?|wed|thu[r]?|fri|sat|sun))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4})))?(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:[ ]{1,4})(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:[ ]{1,4})(?:(?:(?:(?:0?[0-9]|1[0-2]):[0-5][0-9](?::[0-5][0-9])?(?:(?:[ ]{1,4})?)[AP][M])(?:(?:[ ]{1,4})(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst))?)|(?:(?:(?:[0-1]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9](?:[.][0-9][0-9])?)?)(?:(?:(?:(?:[ 
]{1,4})?)(?:(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst)|(?:[-+](?:0[0-9]|1[0-2])(?:(?:[:])?)(?:00|15|30|45))))|z)?))(?:[ ]{1,4})(?:[1-2][0-9][0-9][0-9]))|(?:(?:(?:(?:(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)|(?:mon|tue[s]?|wed|thu[r]?|fri|sat|sun))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4})))?(?:(?:(?:(?:0?[1-9])|(?:1[0-2]))[-/](?:[0-2]?[0-9]|3[0-1])[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:[0-2]?[0-9]|3[0-1])[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th)))|(?:(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:[ ]{1,4})(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4}))(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:[ ]{1,4})(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:(?:[,])?)(?:(?:[ ]{1,4})?)(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))))(?:(?:[ ]{1,4})|(?:(?:[ ]{1,4})?)[,@T](?:(?:[ ]{1,4})?))(?:(?:(?:(?:0?[0-9]|1[0-2]):[0-5][0-9](?::[0-5][0-9])?(?:(?:[ ]{1,4})?)[AP][M])(?:(?:[ ]{1,4})(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst))?)|(?:(?:(?:[0-1]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9](?:[.][0-9][0-9])?)?)(?:(?:(?:(?:[ 
]{1,4})?)(?:(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst)|(?:[-+](?:0[0-9]|1[0-2])(?:(?:[:])?)(?:00|15|30|45))))|z)?)))|(?:(?:(?:(?:(?:0?[0-9]|1[0-2]):[0-5][0-9](?::[0-5][0-9])?(?:(?:[ ]{1,4})?)[AP][M])(?:(?:[ ]{1,4})(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst))?)|(?:(?:(?:[0-1]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9](?:[.][0-9][0-9])?)?)(?:(?:(?:(?:[ ]{1,4})?)(?:(?:acdt|acst|aedt|aest|akdt|akst|amst|awdt|awst|biot|brst|cedt|cest|chot|chut|cist|clst|cost|cwst|davt|ddut|east|eedt|eest|egst|fkst|galt|gamt|gilt|hadt|haec|hast|hovt|ibst|irdt|irkt|irst|kost|krat|lhst|lint|magt|mart|mawt|mest|mist|nzdt|nzst|omst|orat|pett|phot|pmdt|pmst|pont|pyst|rott|sakt|samt|sast|slst|sret|syot|taht|ulat|uyst|vlat|volt|vost|wakt|wast|wedt|west|yakt|yekt|act|adt|aft|amt|art|ast|azt|bdt|bit|bot|brt|bst|btt|cat|cct|cdt|cet|cit|ckt|clt|cot|cst|cvt|cxt|dft|eat|ect|edt|eet|egt|eit|est|fet|fjt|fkt|fnt|get|gft|git|gmt|gst|gyt|hkt|hmt|hst|ict|idt|iot|ist|jst|kgt|kst|mdt|met|mht|mit|mmt|msk|mst|mut|mvt|myt|nct|ndt|nft|npt|nst|nut|pdt|pet|pgt|pkt|pst|pyt|ret|sbt|sct|sgt|srt|sst|tft|tha|tjt|tkt|tlt|tmt|tot|tvt|uct|utc|uyt|uzt|vet|vut|wat|wet|wit|wst)|(?:[-+](?:0[0-9]|1[0-2])(?:(?:[:])?)(?:00|15|30|45))))|z)?))(?:[ ]{1,4})(?:(?:(?:(?:0?[1-9])|(?:1[0-2]))[-/](?:[0-2]?[0-9]|3[0-1])[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:[0-2]?[0-9]|3[0-1])[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th)))|(?:(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:[ ]{1,4})(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4}))(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:[ 
]{1,4})(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:(?:[,])?)(?:(?:[ ]{1,4})?)(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))))))|(?:(?:(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)|(?:mon|tue[s]?|wed|thu[r]?|fri|sat|sun))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4})))?(?:(?:(?:(?:0?[1-9])|(?:1[0-2]))[-/](?:[0-2]?[0-9]|3[0-1])[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:[0-2]?[0-9]|3[0-1])[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))[-/](?:(?:0?[1-9])|(?:1[0-2]))[-/](?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th)))|(?:(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:[ ]{1,4})(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4}))(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9])))|(?:(?:(?:[0-2]?[0-9]|3[0-1])|(?:[2-3]?1st|2?2nd|2?3rd|(?:[4-9]|1[0-9]|2[04-9]|30)th))(?:[ ]{1,4})(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:(?:[,])?)(?:(?:[ ]{1,4})?)(?:(?:[1-2][0-9][0-9][0-9])|(?:[0-9][0-9]))))|(?:(?:(?:(?:january|february|march|april|may|june|july|august|september|october|november|december)|(?:jan|feb|mar|apr|may|jun|jul|aug|sep[t]?|oct|nov|dec))(?:(?:[,])(?:(?:[ ]{1,4})?)|(?:[ ]{1,4}))(?:[1-2][0-9][0-9][0-9]))|(?:(?:[1-2][0-9][0-9][0-9])[-/](?:(?:0?[1-9])|(?:1[0-2])))))"; diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs new file mode 100644 index 0000000000..7bd0bf1bfe --- /dev/null +++ b/examples/shootout-regex-dna-bytes.rs @@ -0,0 +1,66 @@ +// The Computer Language Benchmarks Game +// http://benchmarksgame.alioth.debian.org/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +extern crate regex; + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! 
regex { ($re:expr) => { ::regex::bytes::Regex::new($re).unwrap() } } + +fn main() { + let mut seq = Vec::with_capacity(50 * (1 << 20)); + io::stdin().read_to_end(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), &b"(c|g|t)"[..]), + (regex!("D"), &b"(a|g|t)"[..]), + (regex!("H"), &b"(a|c|t)"[..]), + (regex!("K"), &b"(g|t)"[..]), + (regex!("M"), &b"(a|c)"[..]), + (regex!("N"), &b"(a|c|g|t)"[..]), + (regex!("R"), &b"(a|g)"[..]), + (regex!("S"), &b"(c|g)"[..]), + (regex!("V"), &b"(a|c|g)"[..]), + (regex!("W"), &b"(a|t)"[..]), + (regex!("Y"), &b"(c|t)"[..]), + ]; + let mut seq = seq; + for (re, replacement) in substs.into_iter() { + seq = re.replace_all(&seq, replacement); + } + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/regex-pcre-benchmark/Cargo.toml b/regex-pcre-benchmark/Cargo.toml deleted file mode 100644 index 466ffa524f..0000000000 --- a/regex-pcre-benchmark/Cargo.toml +++ /dev/null @@ -1,31 +0,0 @@ -[package] -publish = false -name = "regex-pcre-benchmark" -version = "0.1.0" -authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" -repository = "https://github.com/rust-lang/regex" -documentation = "http://doc.rust-lang.org/regex/regex_syntax/index.html" -homepage = "https://github.com/rust-lang/regex" -description = "Regex benchmarks for PCRE." - -[dependencies] -enum-set = "0.0.6" -lazy_static = "0.1" -pcre = "0.2" -rand = "0.3" -regex = { version = "0.1", path = ".." } - -# Run the benchmarks on PCRE. -[[bench]] -name = "pcre" -path = "../benches/bench_pcre.rs" -test = false -bench = true - -[profile.bench] -debug = true - -[profile.test] -debug = true -codegen-units = 1 diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 09d304e32a..1895236070 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -68,6 +68,7 @@ assert_eq!(err.kind(), &ErrorKind::UnclosedParen); mod parser; mod unicode; +use std::ascii; use std::char; use std::cmp::{Ordering, max, min}; use std::fmt; @@ -75,6 +76,7 @@ use std::iter::IntoIterator; use std::ops::Deref; use std::result; use std::slice; +use std::u8; use std::vec; use unicode::case_folding; @@ -98,12 +100,30 @@ pub enum Expr { /// Whether to match case insensitively. casei: bool, }, + /// A sequence of one or more literal bytes to be matched. + LiteralBytes { + /// The bytes. + bytes: Vec, + /// Whether to match case insensitively. + /// + /// The interpretation of "case insensitive" in this context is + /// ambiguous since `bytes` can be arbitrary. However, a good heuristic + /// is to assume that the bytes are ASCII-compatible and do simple + /// ASCII case folding. + casei: bool, + }, /// Match any character. 
AnyChar, - /// Match any character, excluding new line. + /// Match any character, excluding new line (`0xA`). AnyCharNoNL, + /// Match any byte. + AnyByte, + /// Match any byte, excluding new line (`0xA`). + AnyByteNoNL, /// A character class. Class(CharClass), + /// A character class with byte ranges only. + ClassBytes(ByteClass), /// Match the start of a line or beginning of input. StartLine, /// Match the end of a line or end of input. @@ -118,6 +138,10 @@ pub enum Expr { /// Match a position that is not a word boundary (word or non-word /// characters on both sides). NotWordBoundary, + /// Match an ASCII word boundary. + WordBoundaryAscii, + /// Match a position that is not an ASCII word boundary. + NotWordBoundaryAscii, /// A group, possibly non-capturing. Group { /// The expression inside the group. @@ -233,6 +257,47 @@ pub struct ClassRange { pub end: char, } +/// A byte class for byte ranges only. +/// +/// A byte class has a canonical format that the parser guarantees. Its +/// canonical format is defined by the following invariants: +/// +/// 1. Given any byte, it is matched by *at most* one byte range in a canonical +/// character class. +/// 2. Every adjacent byte range is separated by at least one byte. +/// 3. Given any pair of byte ranges `r1` and `r2`, if +/// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical +/// character class. +/// +/// In sum, any `ByteClass` produced by this crate's parser is a sorted +/// sequence of non-overlapping ranges. This makes it possible to test whether +/// a byte is matched by a class with a binary search. +/// +/// If the case insensitive flag was set when parsing a character class, +/// then simple ASCII-only case folding is done automatically. For example, +/// `(?i)[a-c]` is automatically translated to `[a-cA-C]`. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ByteClass { + ranges: Vec, +} + +/// A single inclusive range in a byte class. +/// +/// Note that this has a few convenient impls on `PartialEq` and `PartialOrd` +/// for testing whether a byte is contained inside a given range. +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] +pub struct ByteRange { + /// The start byte of the range. + /// + /// This must be less than or equal to `end`. + pub start: u8, + + /// The end byte of the range. + /// + /// This must be greater than or equal to `end`. + pub end: u8, +} + /// A builder for configuring regular expression parsing. /// /// This allows setting the default values of flags and other options, such @@ -249,13 +314,7 @@ impl ExprBuilder { /// Note that all flags are disabled by default. pub fn new() -> ExprBuilder { ExprBuilder { - flags: Flags { - casei: false, - multi: false, - dotnl: false, - swap_greed: false, - ignore_space: false, - }, + flags: Flags::default(), nest_limit: 200, } } @@ -290,6 +349,28 @@ impl ExprBuilder { self } + /// Set the default value for the Unicode (`u`) flag. + /// + /// If `yes` is false, then `allow_bytes` is set to true. + pub fn unicode(mut self, yes: bool) -> ExprBuilder { + self.flags.unicode = yes; + if !yes { + self.allow_bytes(true) + } else { + self + } + } + + /// Whether the Unicode flag can be used or not. By default, the flag is + /// enabled but it cannot be toggled. + /// + /// When disabled, use of the `u` flag will cause the parser to return an + /// error. + pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder { + self.flags.allow_bytes = yes; + self + } + /// Set the nesting limit for regular expression parsing. 
/// /// Regular expressions that nest more than this limit will result in a @@ -318,12 +399,12 @@ impl Expr { /// Returns true iff the expression can be repeated by a quantifier. fn can_repeat(&self) -> bool { match *self { - Literal{..} - | AnyChar - | AnyCharNoNL - | Class(_) + Literal{..} | LiteralBytes{..} + | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL + | Class(_) | ClassBytes(_) | StartLine | EndLine | StartText | EndText | WordBoundary | NotWordBoundary + | WordBoundaryAscii | NotWordBoundaryAscii | Group{..} => true, _ => false, @@ -335,7 +416,7 @@ impl Expr { match (es.pop(), e) { (None, e) => es.push(e), (Some(Literal { chars: mut chars1, casei: casei1 }), - Literal { chars: chars2, casei: casei2 }) => { + Literal { chars: chars2, casei: casei2 }) => { if casei1 == casei2 { chars1.extend(chars2); es.push(Literal { chars: chars1, casei: casei1 }); @@ -344,6 +425,16 @@ impl Expr { es.push(Literal { chars: chars2, casei: casei2 }); } } + (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }), + LiteralBytes { bytes: bytes2, casei: casei2 }) => { + if casei1 == casei2 { + bytes1.extend(bytes2); + es.push(LiteralBytes { bytes: bytes1, casei: casei1 }); + } else { + es.push(LiteralBytes { bytes: bytes1, casei: casei1 }); + es.push(LiteralBytes { bytes: bytes2, casei: casei2 }); + } + } (Some(e1), e2) => { es.push(e1); es.push(e2); @@ -426,6 +517,22 @@ impl Expr { _ => false, } } + + /// Returns true if and only if the expression contains sub-expressions + /// that can match arbitrary bytes. + pub fn has_bytes(&self) -> bool { + match *self { + Repeat { ref e, .. } => e.has_bytes(), + Group { ref e, .. } => e.has_bytes(), + Concat(ref es) => es.iter().any(|e| e.has_bytes()), + Alternate(ref es) => es.iter().any(|e| e.has_bytes()), + LiteralBytes{..} => true, + AnyByte | AnyByteNoNL => true, + ClassBytes(_) => true, + WordBoundaryAscii | NotWordBoundaryAscii => true, + _ => false, + } + } } impl Deref for CharClass { @@ -466,6 +573,16 @@ impl CharClass { CharClass { ranges: Vec::with_capacity(self.len()) } } + /// Create a byte class from this character class. + /// + /// Codepoints above 0xFF are removed. + fn to_byte_class(self) -> ByteClass { + ByteClass::new( + self.ranges.into_iter() + .filter_map(|r| r.to_byte_range()) + .collect()).canonicalize() + } + /// Merge two classes and canonicalize them. #[cfg(test)] fn merge(mut self, other: CharClass) -> CharClass { @@ -560,6 +677,21 @@ impl ClassRange { } } + /// Translate this to a byte class. + /// + /// If the start codepoint exceeds 0xFF, then this returns `None`. + /// + /// If the end codepoint exceeds 0xFF, then it is set to 0xFF. + fn to_byte_range(self) -> Option { + if self.start > '\u{FF}' { + None + } else { + let s = self.start as u8; + let e = min('\u{FF}', self.end) as u8; + Some(ByteRange::new(s, e)) + } + } + /// Create a range of one character. fn one(c: char) -> ClassRange { ClassRange { start: c, end: c } @@ -671,6 +803,199 @@ impl PartialOrd for char { } } +impl ByteClass { + /// Create a new class from an existing set of ranges. + pub fn new(ranges: Vec) -> ByteClass { + ByteClass { ranges: ranges } + } + + /// Returns true if `b` is matched by this byte class. + pub fn matches(&self, b: u8) -> bool { + self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok() + } + + /// Create a new empty class from this one. + fn to_empty(&self) -> ByteClass { + ByteClass { ranges: Vec::with_capacity(self.len()) } + } + + /// Canonicalze any sequence of ranges. 
+ /// + /// This is responsible for enforcing the canonical format invariants + /// as described on the docs for the `ByteClass` type. + fn canonicalize(mut self) -> ByteClass { + // TODO: Save some cycles here by checking if already canonicalized. + self.ranges.sort(); + let mut ordered = self.to_empty(); // TODO: Do this in place? + for candidate in self { + // If the candidate overlaps with an existing range, then it must + // be the most recent range added because we process the candidates + // in order. + if let Some(or) = ordered.ranges.last_mut() { + if or.overlapping(candidate) { + *or = or.merge(candidate); + continue; + } + } + ordered.ranges.push(candidate); + } + ordered + } + + /// Negates the byte class. + /// + /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b` + /// does not match `self.negate()`. + pub fn negate(mut self) -> ByteClass { + fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) } + + if self.is_empty() { + // Inverting an empty range yields all bytes. + return ByteClass { + ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }], + }; + } + self = self.canonicalize(); + let mut inv = self.to_empty(); + if self[0].start > b'\x00' { + inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1))); + } + for win in self.windows(2) { + inv.ranges.push(range(win[0].end.saturating_add(1), + win[1].start.saturating_sub(1))); + } + if self[self.len() - 1].end < u8::MAX { + inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1), + u8::MAX)); + } + inv + } + + /// Apply case folding to this byte class. + /// + /// This assumes that the bytes in the ranges are ASCII compatible. + /// + /// N.B. Applying case folding to a negated character class probably + /// won't produce the expected result. e.g., `(?i)[^x]` really should + /// match any character sans `x` and `X`, but if `[^x]` is negated + /// before being case folded, you'll end up matching any character. + pub fn case_fold(self) -> ByteClass { + let mut folded = self.to_empty(); + for r in self { + folded.ranges.extend(r.case_fold()); + } + folded.canonicalize() + } +} + +impl ByteRange { + /// Create a new class range. + /// + /// If `end < start`, then the two values are swapped so that + /// the invariant `start <= end` is preserved. + fn new(start: u8, end: u8) -> ByteRange { + if start <= end { + ByteRange { start: start, end: end } + } else { + ByteRange { start: end, end: start } + } + } + + /// Returns true if and only if the two ranges are overlapping. Note that + /// since ranges are inclusive, `a-c` and `d-f` are overlapping! + fn overlapping(self, other: ByteRange) -> bool { + max(self.start, other.start) + <= min(self.end, other.end).saturating_add(1) + } + + /// Returns true if and only if the intersection of self and other is non + /// empty. + fn is_intersect_empty(self, other: ByteRange) -> bool { + max(self.start, other.start) > min(self.end, other.end) + } + + /// Creates a new range representing the union of `self` and `other. + fn merge(self, other: ByteRange) -> ByteRange { + ByteRange { + start: min(self.start, other.start), + end: max(self.end, other.end), + } + } + + /// Apply case folding to this range. + /// + /// Since case folding might add bytes such that the range is no + /// longer contiguous, this returns multiple byte ranges. + /// + /// This assumes that the bytes in this range are ASCII compatible. + fn case_fold(self) -> Vec { + // So much easier than Unicode case folding! 
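        // For the single range b'a'-b'c', this returns the original range
        // plus its uppercase twin 32 positions lower, and
        // ByteClass::case_fold then canonicalizes the result. Mirroring the
        // bclass-based unit tests added further down:
        //
        //     bclass(&[(b'a', b'c')]).case_fold()
        //         == bclass(&[(b'A', b'C'), (b'a', b'c')])
        //
        // Bytes outside the two ASCII letter ranges are left untouched.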
+ let mut ranges = vec![self]; + if !ByteRange::new(b'a', b'z').is_intersect_empty(self) { + let lower = max(self.start, b'a'); + let upper = min(self.end, b'z'); + ranges.push(ByteRange::new(lower - 32, upper - 32)); + } + if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) { + let lower = max(self.start, b'A'); + let upper = min(self.end, b'Z'); + ranges.push(ByteRange::new(lower + 32, upper + 32)); + } + ranges + } +} + +impl Deref for ByteClass { + type Target = Vec; + fn deref(&self) -> &Vec { &self.ranges } +} + +impl IntoIterator for ByteClass { + type Item = ByteRange; + type IntoIter = vec::IntoIter; + fn into_iter(self) -> vec::IntoIter { self.ranges.into_iter() } +} + +impl<'a> IntoIterator for &'a ByteClass { + type Item = &'a ByteRange; + type IntoIter = slice::Iter<'a, ByteRange>; + fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() } +} + +impl PartialEq for ByteRange { + #[inline] + fn eq(&self, other: &u8) -> bool { + self.start <= *other && *other <= self.end + } +} + +impl PartialEq for u8 { + #[inline] + fn eq(&self, other: &ByteRange) -> bool { + other.eq(self) + } +} + +impl PartialOrd for ByteRange { + #[inline] + fn partial_cmp(&self, other: &u8) -> Option { + Some(if self == other { + Ordering::Equal + } else if *other > self.end { + Ordering::Greater + } else { + Ordering::Less + }) + } +} + +impl PartialOrd for u8 { + #[inline] + fn partial_cmp(&self, other: &ByteRange) -> Option { + other.partial_cmp(self).map(|o| o.reverse()) + } +} + /// This implementation of `Display` will write a regular expression from the /// syntax tree. It does not write the original string parsed. impl fmt::Display for Expr { @@ -685,15 +1010,32 @@ impl fmt::Display for Expr { if casei { try!(write!(f, ")")); } Ok(()) } + LiteralBytes { ref bytes, casei } => { + if casei { + try!(write!(f, "(?i-u:")); + } else { + try!(write!(f, "(?-u:")); + } + for &b in bytes { + try!(write!(f, "{}", quote_byte(b))); + } + try!(write!(f, ")")); + Ok(()) + } AnyChar => write!(f, "(?s:.)"), AnyCharNoNL => write!(f, "."), + AnyByte => write!(f, "(?s-u:.)"), + AnyByteNoNL => write!(f, "(?-u:.)"), Class(ref cls) => write!(f, "{}", cls), + ClassBytes(ref cls) => write!(f, "{}", cls), StartLine => write!(f, "(?m:^)"), EndLine => write!(f, "(?m:$)"), StartText => write!(f, r"^"), EndText => write!(f, r"$"), WordBoundary => write!(f, r"\b"), NotWordBoundary => write!(f, r"\B"), + WordBoundaryAscii => write!(f, r"(?-u:\b)"), + NotWordBoundaryAscii => write!(f, r"(?-u:\B)"), Group { ref e, i: None, name: None } => write!(f, "(?:{})", e), Group { ref e, name: None, .. } => write!(f, "({})", e), Group { ref e, name: Some(ref n), .. } => { @@ -756,6 +1098,23 @@ impl fmt::Display for ClassRange { } } +impl fmt::Display for ByteClass { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + try!(write!(f, "(?-u:[")); + for range in self.iter() { + try!(write!(f, "{}", range)); + } + try!(write!(f, "])")); + Ok(()) + } +} + +impl fmt::Display for ByteRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end)) + } +} + /// An alias for computations that can return a `Error`. pub type Result = ::std::result::Result; @@ -854,6 +1213,10 @@ pub enum ErrorKind { /// an explicit heap allocated stack is not (yet?) used. Regardless, some /// sort of limit must be applied to avoid unbounded memory growth. StackExhausted, + /// A disallowed flag was found (e.g., `b`). 
+ FlagNotAllowed(char), + /// A Unicode class was used when the bytes (`b`) flag was enabled. + UnicodeNotAllowed, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients @@ -913,6 +1276,8 @@ impl ErrorKind { UnrecognizedFlag(_) => "unrecognized flag", UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name", StackExhausted => "stack exhausted, too much nesting", + FlagNotAllowed(_) => "flag not allowed", + UnicodeNotAllowed => "Unicode features not allowed", __Nonexhaustive => unreachable!(), } } @@ -1015,6 +1380,11 @@ impl fmt::Display for ErrorKind { StackExhausted => write!(f, "Exhausted space required to parse regex with too \ much nesting."), + FlagNotAllowed(flag) => + write!(f, "Use of the flag '{}' is not allowed.", flag), + UnicodeNotAllowed => + write!(f, "Unicode features are not allowed when the byte \ + (b) flag is set."), __Nonexhaustive => unreachable!(), } } @@ -1079,6 +1449,15 @@ fn quote_char(c: char) -> String { s } +fn quote_byte(b: u8) -> String { + if parser::is_punct(b as char) || b == b'\'' || b == b'"' { + quote_char(b as char) + } else { + let escaped: Vec = ascii::escape_default(b).collect(); + String::from_utf8(escaped).unwrap() + } +} + fn inc_char(c: char) -> char { match c { char::MAX => char::MAX, @@ -1112,12 +1491,21 @@ pub fn is_word_char(c: char) -> bool { } } +/// Returns true if and only if `c` is an ASCII word byte. +#[doc(hidden)] +pub fn is_word_byte(b: u8) -> bool { + match b { + b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' => true, + _ => false, + } +} + #[cfg(test)] mod properties; #[cfg(test)] mod tests { - use {CharClass, ClassRange, Expr}; + use {CharClass, ClassRange, ByteClass, ByteRange, Expr}; fn class(ranges: &[(char, char)]) -> CharClass { let ranges = ranges.iter().cloned() @@ -1125,7 +1513,11 @@ mod tests { CharClass::new(ranges) } - fn classi(ranges: &[(char, char)]) -> CharClass { class(ranges) } + fn bclass(ranges: &[(u8, u8)]) -> ByteClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ByteRange::new(c1, c2)).collect(); + ByteClass::new(ranges) + } fn e(re: &str) -> Expr { Expr::parse(re).unwrap() } @@ -1199,19 +1591,6 @@ mod tests { ])); } - #[test] - fn class_canon_overlap_many_case_fold() { - let cls = class(&[ - ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), - ('M', 'P'), ('L', 'S'), ('c', 'f'), - ]); - assert_eq!(cls.case_fold(), classi(&[ - ('A', 'J'), ('L', 'S'), - ('a', 'j'), ('l', 's'), - ('\u{17F}', '\u{17F}'), - ])); - } - #[test] fn class_canon_overlap_boundary() { let cls = class(&[('x', 'z'), ('u', 'w')]); @@ -1322,20 +1701,51 @@ mod tests { ])); } + #[test] + fn class_canon_overlap_many_case_fold() { + let cls = class(&[ + ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), + ('M', 'P'), ('L', 'S'), ('c', 'f'), + ]); + assert_eq!(cls.case_fold(), class(&[ + ('A', 'J'), ('L', 'S'), + ('a', 'j'), ('l', 's'), + ('\u{17F}', '\u{17F}'), + ])); + + let cls = bclass(&[ + (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), + (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), + ]); + assert_eq!(cls.case_fold(), bclass(&[ + (b'A', b'J'), (b'L', b'S'), + (b'a', b'j'), (b'l', b's'), + ])); + } + #[test] fn class_fold_az() { let cls = class(&[('A', 'Z')]); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('A', 'Z'), ('a', 'z'), ('\u{17F}', '\u{17F}'), ('\u{212A}', '\u{212A}'), ])); let cls = class(&[('a', 'z')]); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('A', 'Z'), ('a', 
'z'), ('\u{17F}', '\u{17F}'), ('\u{212A}', '\u{212A}'), ])); + + let cls = bclass(&[(b'A', b'Z')]); + assert_eq!(cls.case_fold(), bclass(&[ + (b'A', b'Z'), (b'a', b'z'), + ])); + let cls = bclass(&[(b'a', b'z')]); + assert_eq!(cls.case_fold(), bclass(&[ + (b'A', b'Z'), (b'a', b'z'), + ])); } #[test] @@ -1344,9 +1754,17 @@ mod tests { assert_eq!(cls.clone().canonicalize(), class(&[ ('A', 'A'), ('_', '_'), ])); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('A', 'A'), ('_', '_'), ('a', 'a'), ])); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + assert_eq!(cls.clone().canonicalize(), bclass(&[ + (b'A', b'A'), (b'_', b'_'), + ])); + assert_eq!(cls.case_fold(), bclass(&[ + (b'A', b'A'), (b'_', b'_'), (b'a', b'a'), + ])); } #[test] @@ -1355,35 +1773,72 @@ mod tests { assert_eq!(cls.clone().canonicalize(), class(&[ ('=', '='), ('A', 'A'), ])); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('=', '='), ('A', 'A'), ('a', 'a'), ])); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + assert_eq!(cls.clone().canonicalize(), bclass(&[ + (b'=', b'='), (b'A', b'A'), + ])); + assert_eq!(cls.case_fold(), bclass(&[ + (b'=', b'='), (b'A', b'A'), (b'a', b'a'), + ])); } #[test] fn class_fold_no_folding_needed() { let cls = class(&[('\x00', '\x10')]); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('\x00', '\x10'), ])); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls.case_fold(), bclass(&[ + (b'\x00', b'\x10'), + ])); } #[test] fn class_fold_negated() { let cls = class(&[('x', 'x')]); - assert_eq!(cls.clone().case_fold(), classi(&[ + assert_eq!(cls.clone().case_fold(), class(&[ ('X', 'X'), ('x', 'x'), ])); - assert_eq!(cls.case_fold().negate(), classi(&[ + assert_eq!(cls.case_fold().negate(), class(&[ ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'), ])); + + let cls = bclass(&[(b'x', b'x')]); + assert_eq!(cls.clone().case_fold(), bclass(&[ + (b'X', b'X'), (b'x', b'x'), + ])); + assert_eq!(cls.case_fold().negate(), bclass(&[ + (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'), + ])); } #[test] fn class_fold_single_to_multiple() { let cls = class(&[('k', 'k')]); - assert_eq!(cls.case_fold(), classi(&[ + assert_eq!(cls.case_fold(), class(&[ ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), ])); + + let cls = bclass(&[(b'k', b'k')]); + assert_eq!(cls.case_fold(), bclass(&[ + (b'K', b'K'), (b'k', b'k'), + ])); + } + + #[test] + fn class_fold_at() { + let cls = class(&[('@', '@')]); + assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')])); + assert_eq!(cls.case_fold(), class(&[('@', '@')])); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')])); + assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')])); } } diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index c7e0c78315..70f840770e 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -9,11 +9,13 @@ // except according to those terms. use std::cmp::{max, min}; +use std::u8; use unicode::regex::UNICODE_CLASSES; use { - Expr, Repeater, CharClass, ClassRange, CaptureIndex, CaptureName, + Expr, Repeater, CharClass, ClassRange, + CaptureIndex, CaptureName, Error, ErrorKind, Result, }; @@ -34,7 +36,7 @@ pub struct Parser { } /// Flag state used in the parser. 
-#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug)] pub struct Flags { /// i pub casei: bool, @@ -46,6 +48,25 @@ pub struct Flags { pub swap_greed: bool, /// x pub ignore_space: bool, + /// u + pub unicode: bool, + /// Not actually a flag, but when false, the `u` flag is forbidden and + /// setting `unicode` to `false` will result in an error. + pub allow_bytes: bool, +} + +impl Default for Flags { + fn default() -> Self { + Flags { + casei: false, + multi: false, + dotnl: false, + swap_greed: false, + ignore_space: false, + unicode: true, + allow_bytes: false, + } + } } /// An ephemeral type for representing the expression stack. @@ -83,6 +104,9 @@ impl Parser { // Starts at the beginning of the input and consumes until either the end // of input or an error. fn parse_expr(mut self) -> Result { + if !self.flags.unicode && !self.flags.allow_bytes { + return Err(self.err(ErrorKind::FlagNotAllowed('u'))); + } while !self.eof() { let build_expr = match self.cur() { '\\' => try!(self.parse_escape()), @@ -93,7 +117,13 @@ impl Parser { '{' => try!(self.parse_counted_repeat()), '[' => match self.maybe_parse_ascii() { None => try!(self.parse_class()), - Some(cls) => Build::Expr(Expr::Class(cls)), + Some(cls) => { + Build::Expr(if self.flags.unicode { + Expr::Class(cls) + } else { + Expr::ClassBytes(cls.to_byte_class()) + }) + } }, '^' => { if self.flags.multi { @@ -111,9 +141,17 @@ impl Parser { } '.' => { if self.flags.dotnl { - self.parse_one(Expr::AnyChar) + if self.flags.unicode { + self.parse_one(Expr::AnyChar) + } else { + self.parse_one(Expr::AnyByte) + } } else { - self.parse_one(Expr::AnyCharNoNL) + if self.flags.unicode { + self.parse_one(Expr::AnyCharNoNL) + } else { + self.parse_one(Expr::AnyByteNoNL) + } } } '(' => try!(self.parse_group()), @@ -123,10 +161,10 @@ impl Parser { self.flags = old_flags; e } - _ => Build::Expr(Expr::Literal { - chars: vec![self.bump()], - casei: self.flags.casei, - }), + _ => { + let c = self.bump(); + try!(self.lit(c)) + } }; if !build_expr.is_empty() { self.stack.push(build_expr); @@ -146,26 +184,34 @@ impl Parser { } let c = self.cur(); if is_punct(c) { - return Ok(Build::Expr(Expr::Literal { - chars: vec![self.bump()], - casei: self.flags.casei, - })); - } - - fn lit(c: char) -> Build { - Build::Expr(Expr::Literal { chars: vec![c], casei: false }) + let c = self.bump(); + return Ok(try!(self.lit(c))); } match c { - 'a' => { self.bump(); Ok(lit('\x07')) } - 'f' => { self.bump(); Ok(lit('\x0C')) } - 't' => { self.bump(); Ok(lit('\t')) } - 'n' => { self.bump(); Ok(lit('\n')) } - 'r' => { self.bump(); Ok(lit('\r')) } - 'v' => { self.bump(); Ok(lit('\x0B')) } + 'a' => { self.bump(); Ok(try!(self.lit('\x07'))) } + 'f' => { self.bump(); Ok(try!(self.lit('\x0C'))) } + 't' => { self.bump(); Ok(try!(self.lit('\t'))) } + 'n' => { self.bump(); Ok(try!(self.lit('\n'))) } + 'r' => { self.bump(); Ok(try!(self.lit('\r'))) } + 'v' => { self.bump(); Ok(try!(self.lit('\x0B'))) } 'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) } 'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) } - 'b' => { self.bump(); Ok(Build::Expr(Expr::WordBoundary)) } - 'B' => { self.bump(); Ok(Build::Expr(Expr::NotWordBoundary)) } + 'b' => { + self.bump(); + Ok(Build::Expr(if self.flags.unicode { + Expr::WordBoundary + } else { + Expr::WordBoundaryAscii + })) + } + 'B' => { + self.bump(); + Ok(Build::Expr(if self.flags.unicode { + Expr::NotWordBoundary + } else { + Expr::NotWordBoundaryAscii + })) + } '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(), 'x' => { 
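                // Hex escape: either exactly two digits (e.g. \x53) or a
                // braced form with any number of digits (e.g. \x{2603});
                // both are handled by parse_hex below.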
self.bump(); self.parse_hex() } 'p'|'P' => { @@ -175,7 +221,11 @@ impl Parser { } 'd'|'s'|'w'|'D'|'S'|'W' => { self.bump(); - Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) + Ok(Build::Expr(if self.flags.unicode { + Expr::Class(self.parse_perl_class(c)) + } else { + Expr::ClassBytes(self.parse_perl_class(c).to_byte_class()) + })) } c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), } @@ -242,6 +292,13 @@ impl Parser { 's' => { self.flags.dotnl = sign; saw_flag = true } 'U' => { self.flags.swap_greed = sign; saw_flag = true } 'x' => { self.flags.ignore_space = sign; saw_flag = true } + 'u' => { + if !self.flags.allow_bytes { + return Err(self.err(ErrorKind::FlagNotAllowed('u'))); + } + self.flags.unicode = sign; + saw_flag = true; + } '-' => { if !sign { // e.g., (?-i-s) @@ -414,10 +471,11 @@ impl Parser { // // Hence, we `unwrap` with reckless abandon. let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number"); - Ok(Build::Expr(Expr::Literal { - chars: vec![char::from_u32(n).expect("Unicode scalar value")], - casei: self.flags.casei, - })) + if !self.flags.unicode { + return Ok(try!(self.u32_to_one_byte(n))); + } + let c = char::from_u32(n).expect("Unicode scalar value"); + Ok(try!(self.lit(c))) } // Parses a hex number, e.g., `a\x5ab`. @@ -447,16 +505,16 @@ impl Parser { let s = self.bump_get(|c| c != '}').unwrap_or("".into()); let n = try!(u32::from_str_radix(&s, 16) .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); - let c = try!(char::from_u32(n) - .ok_or(self.err(ErrorKind::InvalidScalarValue(n)))); if !self.bump_if('}') { // e.g., a\x{d return Err(self.err(ErrorKind::UnclosedHex)); } - Ok(Build::Expr(Expr::Literal { - chars: vec![c], - casei: self.flags.casei, - })) + if !self.flags.unicode { + return Ok(try!(self.u32_to_one_byte(n))); + } + let c = try!(char::from_u32(n) + .ok_or(self.err(ErrorKind::InvalidScalarValue(n)))); + Ok(try!(self.lit(c))) } // Parses a two-digit hex number, e.g., `a\x5ab`. @@ -475,11 +533,11 @@ impl Parser { } let n = try!(u32::from_str_radix(&s, 16) .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); - Ok(Build::Expr(Expr::Literal { - // Because 0...255 are all valid Unicode scalar values. - chars: vec![char::from_u32(n).expect("Unicode scalar value")], - casei: self.flags.casei, - })) + if !self.flags.unicode { + return Ok(try!(self.u32_to_one_byte(n))); + } + let c = char::from_u32(n).expect("Unicode scalar value"); + Ok(try!(self.lit(c))) } // Parses a character class, e.g., `[^a-zA-Z0-9]+`. @@ -514,9 +572,20 @@ impl Parser { Build::Expr(Expr::Class(class2)) => { class.ranges.extend(class2); } + Build::Expr(Expr::ClassBytes(class2)) => { + for byte_range in class2 { + let s = byte_range.start as char; + let e = byte_range.end as char; + class.ranges.push(ClassRange::new(s, e)); + } + } Build::Expr(Expr::Literal { chars, .. }) => { try!(self.parse_class_range(&mut class, chars[0])); } + Build::Expr(Expr::LiteralBytes { bytes, .. }) => { + let start = bytes[0] as char; + try!(self.parse_class_range(&mut class, start)); + } Build::Expr(e) => { let err = ErrorKind::InvalidClassEscape(e); return Err(self.err(err)); @@ -531,7 +600,11 @@ impl Parser { } } class = self.class_transform(negated, class).canonicalize(); - Ok(Build::Expr(Expr::Class(class))) + Ok(Build::Expr(if self.flags.unicode { + Expr::Class(class) + } else { + Expr::ClassBytes(class.to_byte_class()) + })) } // Parses a single range in a character class. @@ -567,7 +640,12 @@ impl Parser { // make sure it's a valid range. 
let end = match self.cur() { '\\' => match try!(self.parse_escape()) { - Build::Expr(Expr::Literal { chars, .. }) => chars[0], + Build::Expr(Expr::Literal { chars, .. }) => { + chars[0] + } + Build::Expr(Expr::LiteralBytes { bytes, .. }) => { + bytes[0] as char + } Build::Expr(e) => { return Err(self.err(ErrorKind::InvalidClassEscape(e))); } @@ -648,7 +726,13 @@ impl Parser { }; match unicode_class(&name) { None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))), - Some(cls) => Ok(self.class_transform(neg, cls)), + Some(cls) => { + if self.flags.unicode { + Ok(self.class_transform(neg, cls)) + } else { + Err(self.err(ErrorKind::UnicodeNotAllowed)) + } + } } } @@ -659,10 +743,19 @@ impl Parser { // No parser state is changed. fn parse_perl_class(&mut self, name: char) -> CharClass { use unicode::regex::{PERLD, PERLS, PERLW}; - let (cls, negate) = match name { - 'd' | 'D' => (raw_class_to_expr(PERLD), name == 'D'), - 's' | 'S' => (raw_class_to_expr(PERLS), name == 'S'), - 'w' | 'W' => (raw_class_to_expr(PERLW), name == 'W'), + let (cls, negate) = match (self.flags.unicode, name) { + (true, 'd') => (raw_class_to_expr(PERLD), false), + (true, 'D') => (raw_class_to_expr(PERLD), true), + (true, 's') => (raw_class_to_expr(PERLS), false), + (true, 'S') => (raw_class_to_expr(PERLS), true), + (true, 'w') => (raw_class_to_expr(PERLW), false), + (true, 'W') => (raw_class_to_expr(PERLW), true), + (false, 'd') => (ascii_class("digit").unwrap(), false), + (false, 'D') => (ascii_class("digit").unwrap(), true), + (false, 's') => (ascii_class("space").unwrap(), false), + (false, 'S') => (ascii_class("space").unwrap(), true), + (false, 'w') => (ascii_class("word").unwrap(), false), + (false, 'W') => (ascii_class("word").unwrap(), true), _ => unreachable!(), }; self.class_transform(negate, cls) @@ -755,6 +848,54 @@ impl Parser { } cls } + + // Translates a Unicode codepoint into a single UTF-8 byte, and returns an + // error if it's not possible. + // + // This will panic if self.flags.unicode == true. + fn codepoint_to_one_byte(&self, c: char) -> Result { + assert!(!self.flags.unicode); + let bytes = c.to_string().as_bytes().to_owned(); + if bytes.len() > 1 { + return Err(self.err(ErrorKind::UnicodeNotAllowed)); + } + Ok(bytes[0]) + } + + // Creates a new byte literal from a single byte. + // + // If the given number can't fit into a single byte, then it is assumed + // to be a Unicode codepoint and an error is returned. + // + // This should only be called when the bytes flag is enabled. + fn u32_to_one_byte(&self, b: u32) -> Result { + assert!(!self.flags.unicode); + if b > u8::MAX as u32 { + Err(self.err(ErrorKind::UnicodeNotAllowed)) + } else { + Ok(Build::Expr(Expr::LiteralBytes { + bytes: vec![b as u8], + casei: self.flags.casei, + })) + } + } + + // Creates a new literal expr from a Unicode codepoint. + // + // Creates a byte literal if the `bytes` flag is set. 
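    // For example (mirroring the `literal` test in the test module below):
    // with default flags, `a` parses to Expr::Literal { chars: vec!['a'], .. },
    // while `(?-u)a` under a parser that allows bytes parses to
    // Expr::LiteralBytes { bytes: vec![b'a'], .. }.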
+ fn lit(&self, c: char) -> Result { + Ok(Build::Expr(if self.flags.unicode { + Expr::Literal { + chars: vec![c], + casei: self.flags.casei, + } + } else { + Expr::LiteralBytes { + bytes: vec![try!(self.codepoint_to_one_byte(c))], + casei: self.flags.casei, + } + })) + } } struct Chars<'a> { @@ -1130,9 +1271,13 @@ const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; #[cfg(test)] mod tests { - use { CharClass, ClassRange, Expr, Repeater, ErrorKind }; + use { + CharClass, ClassRange, ByteClass, ByteRange, + Expr, Repeater, + ErrorKind, + }; use unicode::regex::{PERLD, PERLS, PERLW}; - use super::{LOWER, UPPER, Flags, Parser}; + use super::{LOWER, UPPER, Flags, Parser, ascii_class}; static YI: &'static [(char, char)] = &[ ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'), @@ -1145,6 +1290,25 @@ mod tests { fn b(v: T) -> Box { Box::new(v) } fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) } + fn pb(s: &str) -> Expr { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + Parser::parse(s, flags).unwrap() + } + + fn blit(b: u8) -> Expr { + Expr::LiteralBytes { + bytes: vec![b], + casei: false, + } + } + + fn bliti(b: u8) -> Expr { + Expr::LiteralBytes { + bytes: vec![b], + casei: true, + } + } + fn class(ranges: &[(char, char)]) -> CharClass { let ranges = ranges.iter().cloned() .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); @@ -1159,6 +1323,24 @@ mod tests { cls.canonicalize() } + fn bclass(ranges: &[(u8, u8)]) -> ByteClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ByteRange::new(c1, c2)).collect(); + ByteClass::new(ranges) + } + + fn asciid() -> ByteClass { + ascii_class("digit").unwrap().to_byte_class() + } + + fn asciis() -> ByteClass { + ascii_class("space").unwrap().to_byte_class() + } + + fn asciiw() -> ByteClass { + ascii_class("word").unwrap().to_byte_class() + } + #[test] fn empty() { assert_eq!(p(""), Expr::Empty); @@ -1167,11 +1349,13 @@ mod tests { #[test] fn literal() { assert_eq!(p("a"), lit('a')); + assert_eq!(pb("(?-u)a"), blit(b'a')); } #[test] fn literal_string() { assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')])); + assert_eq!(pb("(?-u)ab"), Expr::Concat(vec![blit(b'a'), blit(b'b')])); } #[test] @@ -1434,6 +1618,11 @@ mod tests { i: None, name: None, }); + assert_eq!(pb("(?i-u:a)"), Expr::Group { + e: b(bliti(b'a')), + i: None, + name: None, + }); } #[test] @@ -1446,6 +1635,14 @@ mod tests { }, lit('a'), ])); + assert_eq!(pb("(?i-u:a)a"), c(&[ + Expr::Group { + e: b(bliti(b'a')), + i: None, + name: None, + }, + lit('a'), + ])); } #[test] @@ -1458,6 +1655,14 @@ mod tests { }, liti('a'), ])); + assert_eq!(pb("(?i-u)(?u-i:a)a"), c(&[ + Expr::Group { + e: b(lit('a')), + i: None, + name: None, + }, + bliti(b'a'), + ])); } #[test] @@ -1508,6 +1713,14 @@ mod tests { ])); } + #[test] + fn any_byte() { + assert_eq!( + pb("(?-u).(?u)."), c(&[Expr::AnyByteNoNL, Expr::AnyCharNoNL])); + assert_eq!( + pb("(?s)(?-u).(?u)."), c(&[Expr::AnyByte, Expr::AnyChar])); + } + #[test] fn flags_inline_negate() { assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')])); @@ -1585,6 +1798,9 @@ mod tests { Expr::StartText, Expr::EndText, Expr::WordBoundary, Expr::NotWordBoundary, ])); + assert_eq!(pb(r"(?-u)\b\B"), c(&[ + Expr::WordBoundaryAscii, Expr::NotWordBoundaryAscii, + ])); } #[test] @@ -1600,12 +1816,21 @@ mod tests { fn escape_octal() { assert_eq!(p(r"\123"), lit('S')); assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')])); + + assert_eq!(pb(r"(?-u)\377"), blit(0xFF)); } #[test] fn escape_hex2() { assert_eq!(p(r"\x53"), lit('S')); 
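        // 0x53 is the scalar value of 'S'. The assertions that follow check
        // that a third hex digit is parsed as a separate literal and that
        // `(?-u)` hex escapes produce byte literals (up to \xff).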
assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')])); + + assert_eq!(pb(r"(?-u)\xff"), blit(0xFF)); + assert_eq!(pb(r"(?-u)\x00"), blit(0x0)); + assert_eq!(pb(r"(?-u)[\x00]"), + Expr::ClassBytes(bclass(&[(b'\x00', b'\x00')]))); + assert_eq!(pb(r"(?-u)[^\x00]"), + Expr::ClassBytes(bclass(&[(b'\x01', b'\xFF')]))); } #[test] @@ -1613,6 +1838,8 @@ mod tests { assert_eq!(p(r"\x{53}"), lit('S')); assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')])); assert_eq!(p(r"\x{2603}"), lit('\u{2603}')); + + assert_eq!(pb(r"(?-u)\x{00FF}"), blit(0xFF)); } #[test] @@ -1679,64 +1906,79 @@ mod tests { #[test] fn escape_perl_d() { assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); + assert_eq!(pb(r"(?-u)\d"), Expr::ClassBytes(asciid())); } #[test] fn escape_perl_s() { assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); + assert_eq!(pb(r"(?-u)\s"), Expr::ClassBytes(asciis())); } #[test] fn escape_perl_w() { assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); + assert_eq!(pb(r"(?-u)\w"), Expr::ClassBytes(asciiw())); } #[test] fn escape_perl_d_negate() { assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); + assert_eq!(pb(r"(?-u)\D"), Expr::ClassBytes(asciid().negate())); } #[test] fn escape_perl_s_negate() { assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); + assert_eq!(pb(r"(?-u)\S"), Expr::ClassBytes(asciis().negate())); } #[test] fn escape_perl_w_negate() { assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); + assert_eq!(pb(r"(?-u)\W"), Expr::ClassBytes(asciiw().negate())); } #[test] fn escape_perl_d_case_fold() { assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(pb(r"(?i-u)\d"), Expr::ClassBytes(asciid().case_fold())); } #[test] fn escape_perl_s_case_fold() { assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); + assert_eq!(pb(r"(?i-u)\s"), Expr::ClassBytes(asciis().case_fold())); } #[test] fn escape_perl_w_case_fold() { assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); + assert_eq!(pb(r"(?i-u)\w"), Expr::ClassBytes(asciiw().case_fold())); } #[test] fn escape_perl_d_case_fold_negate() { assert_eq!(p(r"(?i)\D"), - Expr::Class(class(PERLD).negate().case_fold())); + Expr::Class(class(PERLD).case_fold().negate())); + let bytes = asciid().case_fold().negate(); + assert_eq!(pb(r"(?i-u)\D"), Expr::ClassBytes(bytes)); } #[test] fn escape_perl_s_case_fold_negate() { assert_eq!(p(r"(?i)\S"), - Expr::Class(class(PERLS).negate().case_fold())); + Expr::Class(class(PERLS).case_fold().negate())); + let bytes = asciis().case_fold().negate(); + assert_eq!(pb(r"(?i-u)\S"), Expr::ClassBytes(bytes)); } #[test] fn escape_perl_w_case_fold_negate() { assert_eq!(p(r"(?i)\W"), - Expr::Class(class(PERLW).negate().case_fold())); + Expr::Class(class(PERLW).case_fold().negate())); + let bytes = asciiw().case_fold().negate(); + assert_eq!(pb(r"(?i-u)\W"), Expr::ClassBytes(bytes)); } #[test] @@ -1745,6 +1987,13 @@ mod tests { assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')]))); assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')]))); assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')]))); + + assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')]))); + assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)]))); + assert_eq!(pb("(?-u)[\n]"), + Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); + assert_eq!(pb(r"(?-u)[\n]"), + Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); } #[test] @@ -1761,12 +2010,28 @@ mod tests { assert_eq!(p("[^\n]"), Expr::Class(class(&[ ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), ]))); + + assert_eq!(pb(r"(?-u)[^a]"), 
Expr::ClassBytes(bclass(&[ + (0x00, 0x60), (0x62, 0xFF), + ]))); + assert_eq!(pb(r"(?-u)[^\x00]"), Expr::ClassBytes(bclass(&[ + (0x01, 0xFF), + ]))); + assert_eq!(pb(r"(?-u)[^\n]"), Expr::ClassBytes(bclass(&[ + (0x00, 0x09), (0x0B, 0xFF), + ]))); + assert_eq!(pb("(?-u)[^\n]"), Expr::ClassBytes(bclass(&[ + (0x00, 0x09), (0x0B, 0xFF), + ]))); } #[test] fn class_singleton_class() { assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD))); assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI))); + + let bytes = class(PERLD).to_byte_class(); + assert_eq!(pb(r"(?-u)[\d]"), Expr::ClassBytes(bytes)); } #[test] @@ -1774,6 +2039,13 @@ mod tests { assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate())); assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); + + let bytes = asciid().negate(); + assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes)); + let bytes = asciiw().negate(); + assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes)); + let bytes = asciis().negate(); + assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes)); } #[test] @@ -1781,22 +2053,35 @@ mod tests { assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD))); assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); + + assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid())); + assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw())); + assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis())); } #[test] fn class_singleton_class_casei() { assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); + + assert_eq!(pb(r"(?i-u)[\d]"), Expr::ClassBytes(asciid().case_fold())); } #[test] fn class_singleton_class_negate_casei() { assert_eq!(p(r"(?i)[^\d]"), - Expr::Class(class(PERLD).negate().case_fold())); + Expr::Class(class(PERLD).case_fold().negate())); assert_eq!(p(r"(?i)[^\w]"), - Expr::Class(class(PERLW).negate().case_fold())); + Expr::Class(class(PERLW).case_fold().negate())); assert_eq!(p(r"(?i)[^\s]"), - Expr::Class(class(PERLS).negate().case_fold())); + Expr::Class(class(PERLS).case_fold().negate())); + + let bytes = asciid().case_fold().negate(); + assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes)); + let bytes = asciiw().case_fold().negate(); + assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes)); + let bytes = asciis().case_fold().negate(); + assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes)); } #[test] @@ -1804,6 +2089,10 @@ mod tests { assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold())); assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); + + assert_eq!(pb(r"(?i-u)[^\D]"), Expr::ClassBytes(asciid().case_fold())); + assert_eq!(pb(r"(?i-u)[^\W]"), Expr::ClassBytes(asciiw().case_fold())); + assert_eq!(pb(r"(?i-u)[^\S]"), Expr::ClassBytes(asciis().case_fold())); } #[test] @@ -1839,7 +2128,7 @@ mod tests { fn class_multiple_class_negate_casei() { assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[ PERLD, YI, - ]).negate().case_fold())); + ]).case_fold().negate())); } #[test] @@ -1848,7 +2137,7 @@ mod tests { let nyi = class(YI).negate(); let class = CharClass::empty().merge(nperld).merge(nyi); assert_eq!(p(r"(?i)[^\D\P{Yi}]"), - Expr::Class(class.negate().case_fold())); + Expr::Class(class.case_fold().negate())); } #[test] @@ -1884,34 +2173,54 @@ mod tests { fn class_overlapping() { assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 
'h')]))); assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')]))); + + assert_eq!(pb("(?-u)[a-fd-h]"), + Expr::ClassBytes(bclass(&[(b'a', b'h')]))); + assert_eq!(pb("(?-u)[a-fg-m]"), + Expr::ClassBytes(bclass(&[(b'a', b'm')]))); } #[test] - fn ascii_class() { + fn ascii_classes() { assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); + + assert_eq!(pb("(?-u)[:upper:]"), + Expr::ClassBytes(class(UPPER).to_byte_class())); + assert_eq!(pb("(?-u)[[:upper:]]"), + Expr::ClassBytes(class(UPPER).to_byte_class())); } #[test] - fn ascii_class_not() { + fn ascii_classes_not() { assert_eq!(p("[:abc:]"), Expr::Class(class(&[(':', ':'), ('a', 'c')]))); + assert_eq!(pb("(?-u)[:abc:]"), + Expr::ClassBytes(bclass(&[(b':', b':'), (b'a', b'c')]))); } #[test] - fn ascii_class_multiple() { + fn ascii_classes_multiple() { assert_eq!(p("[[:lower:][:upper:]]"), Expr::Class(classes(&[UPPER, LOWER]))); + + assert_eq!(pb("(?-u)[[:lower:][:upper:]]"), + Expr::ClassBytes(classes(&[UPPER, LOWER]).to_byte_class())); } #[test] - fn ascii_class_negate() { + fn ascii_classes_negate() { assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate())); assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER))); + + assert_eq!(pb("(?-u)[[:^upper:]]"), + Expr::ClassBytes(class(UPPER).to_byte_class().negate())); + assert_eq!(pb("(?-u)[^[:^upper:]]"), + Expr::ClassBytes(class(UPPER).to_byte_class())); } #[test] - fn ascii_class_negate_multiple() { + fn ascii_classes_negate_multiple() { let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate()); let cls = CharClass::empty().merge(nlower).merge(nupper); assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone())); @@ -1919,24 +2228,40 @@ mod tests { } #[test] - fn ascii_class_case_fold() { + fn ascii_classes_case_fold() { assert_eq!(p("(?i)[:upper:]"), Expr::Class(class(UPPER).case_fold())); assert_eq!(p("(?i)[[:upper:]]"), Expr::Class(class(UPPER).case_fold())); + + assert_eq!(pb("(?i-u)[:upper:]"), + Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); + assert_eq!(pb("(?i-u)[[:upper:]]"), + Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); } #[test] - fn ascii_class_negate_case_fold() { + fn ascii_classes_negate_case_fold() { assert_eq!(p("(?i)[[:^upper:]]"), Expr::Class(class(UPPER).case_fold().negate())); assert_eq!(p("(?i)[^[:^upper:]]"), Expr::Class(class(UPPER).case_fold())); + + assert_eq!(pb("(?i-u)[[:^upper:]]"), + Expr::ClassBytes( + class(UPPER).to_byte_class().case_fold().negate())); + assert_eq!(pb("(?i-u)[^[:^upper:]]"), + Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); } #[test] fn single_class_negate_case_fold() { assert_eq!(p("(?i)[^x]"), Expr::Class(class(&[('x', 'x')]).case_fold().negate())); + + assert_eq!(pb("(?i-u)[^x]"), + Expr::ClassBytes( + class(&[('x', 'x')]) + .to_byte_class().case_fold().negate())); } #[test] @@ -2028,14 +2353,53 @@ mod tests { // Test every single possible error case. macro_rules! test_err { - ($re:expr, $pos:expr, $kind:expr) => {{ - let err = Parser::parse($re, Flags::default()).unwrap_err(); + ($re:expr, $pos:expr, $kind:expr) => { + test_err!($re, $pos, $kind, Flags::default()); + }; + ($re:expr, $pos:expr, $kind:expr, $flags:expr) => {{ + let err = Parser::parse($re, $flags).unwrap_err(); assert_eq!($pos, err.pos); assert_eq!($kind, err.kind); assert!($re.contains(&err.surround)); }} } + #[test] + fn flags_default_byte_flag_not_allowed() { + let flags = Flags { unicode: false, .. 
Flags::default() }; + test_err!("a", 0, ErrorKind::FlagNotAllowed('u'), flags); + } + + #[test] + fn flags_byte_flag_not_allowed() { + test_err!("(?-u)a", 3, ErrorKind::FlagNotAllowed('u')); + } + + #[test] + fn unicode_char_not_allowed() { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!("☃(?-u:☃)", 7, ErrorKind::UnicodeNotAllowed, flags); + } + + #[test] + fn unicode_class_not_allowed() { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags); + } + + #[test] + fn unicode_hex_not_allowed() { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"(?-u)\x{FFFF}", 13, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)\x{100}", 12, ErrorKind::UnicodeNotAllowed, flags); + } + + #[test] + fn unicode_octal_not_allowed() { + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"(?-u)\400", 9, ErrorKind::UnicodeNotAllowed, flags); + } + #[test] fn error_repeat_no_expr_simple() { test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr); @@ -2253,12 +2617,12 @@ mod tests { #[test] fn error_escape_hex_invalid_scalar_value_surrogate() { - test_err!(r"\x{D800}", 7, ErrorKind::InvalidScalarValue(0xD800)); + test_err!(r"\x{D800}", 8, ErrorKind::InvalidScalarValue(0xD800)); } #[test] fn error_escape_hex_invalid_scalar_value_high() { - test_err!(r"\x{110000}", 9, ErrorKind::InvalidScalarValue(0x110000)); + test_err!(r"\x{110000}", 10, ErrorKind::InvalidScalarValue(0x110000)); } #[test] diff --git a/regex-syntax/src/properties.rs b/regex-syntax/src/properties.rs index 0eba042043..76a1441037 100644 --- a/regex-syntax/src/properties.rs +++ b/regex-syntax/src/properties.rs @@ -11,7 +11,10 @@ use quickcheck::{Arbitrary, Gen, Testable, QuickCheck, StdGen}; use rand::Rng; -use {Expr, CharClass, ClassRange, Repeater, dec_char}; +use { + Expr, ExprBuilder, + CharClass, ClassRange, ByteClass, ByteRange, Repeater, dec_char, +}; fn qc(t: T) { QuickCheck::new() @@ -137,9 +140,10 @@ impl Arbitrary for Expr { let nada = || Box::new(None.into_iter()); let es: Box> = match *self { - Empty | AnyChar | AnyCharNoNL + Empty | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL | StartLine | EndLine | StartText | EndText - | WordBoundary | NotWordBoundary => nada(), + | WordBoundary | NotWordBoundary + | WordBoundaryAscii | NotWordBoundaryAscii => nada(), Literal { ref chars, .. } if chars.len() == 1 => nada(), Literal { ref chars, casei } => { Box::new((chars.clone(), casei) @@ -149,7 +153,17 @@ impl Arbitrary for Expr { Literal { chars: chars, casei: casei } })) } + LiteralBytes { ref bytes, .. 
} if bytes.len() == 1 => nada(), + LiteralBytes { ref bytes, casei } => { + Box::new((bytes.clone(), casei) + .shrink() + .filter(|&(ref bytes, _)| bytes.len() > 0) + .map(|(bytes, casei)| { + LiteralBytes { bytes: bytes, casei: casei } + })) + } Class(ref cls) => Box::new(cls.shrink().map(Class)), + ClassBytes(ref cls) => Box::new(cls.shrink().map(ClassBytes)), Group { ref e, ref i, ref name } => { let (i, name) = (i.clone(), name.clone()); Box::new(e.clone().shrink() @@ -205,9 +219,9 @@ enum ExprType { fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { use Expr::*; let ub = match (depth as usize >= g.size(), ty) { - (true, _) => 11, - (false, ExprType::NoSequences) => 13, - (false, ExprType::Anything) => 15, + (true, _) => 16, + (false, ExprType::NoSequences) => 18, + (false, ExprType::Anything) => 20, }; match g.gen_range(1, ub) { 0 => Empty, @@ -215,22 +229,30 @@ fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { chars: SmallAscii::arbitrary(g).0.chars().collect(), casei: g.gen(), }, - 2 => AnyChar, - 3 => AnyCharNoNL, - 4 => Class(CharClass::arbitrary(g)), - 5 => StartLine, - 6 => EndLine, - 7 => StartText, - 8 => EndText, - 9 => WordBoundary, - 10 => NotWordBoundary, - 11 => gen_group_expr(g, depth + 1), - 12 => Repeat { + 2 => LiteralBytes { + bytes: SmallAscii::arbitrary(g).0.as_bytes().to_owned(), + casei: g.gen(), + }, + 3 => AnyChar, + 4 => AnyCharNoNL, + 5 => AnyByte, + 6 => AnyByteNoNL, + 7 => Class(CharClass::arbitrary(g)), + 8 => StartLine, + 9 => EndLine, + 10 => StartText, + 11 => EndText, + 12 => WordBoundary, + 13 => NotWordBoundary, + 14 => WordBoundaryAscii, + 15 => NotWordBoundaryAscii, + 16 => gen_group_expr(g, depth + 1), + 17 => Repeat { e: Box::new(gen_repeatable_expr(g, depth + 1)), r: Repeater::arbitrary(g), greedy: bool::arbitrary(g), }, - 13 => { + 18 => { let size = { let s = g.size(); g.gen_range(2, s) }; Concat((0..size) .map(|_| { @@ -238,7 +260,7 @@ fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { }) .collect()) } - 14 => { + 19 => { let size = { let s = g.size(); g.gen_range(2, s) }; Alternate((0..size) .map(|_| { @@ -252,16 +274,23 @@ fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { fn gen_repeatable_expr(g: &mut G, depth: u32) -> Expr { use Expr::*; - match g.gen_range(1, 6) { + match g.gen_range(1, 10) { 0 => Empty, 1 => Literal { chars: vec![Arbitrary::arbitrary(g)], casei: g.gen(), }, - 2 => AnyChar, - 3 => AnyCharNoNL, - 4 => Class(CharClass::arbitrary(g)), - 5 => gen_group_expr(g, depth + 1), + 2 => LiteralBytes { + bytes: vec![Arbitrary::arbitrary(g)], + casei: g.gen(), + }, + 3 => AnyChar, + 4 => AnyCharNoNL, + 5 => AnyByte, + 6 => AnyByteNoNL, + 7 => Class(CharClass::arbitrary(g)), + 8 => ClassBytes(ByteClass::arbitrary(g)), + 9 => gen_group_expr(g, depth + 1), _ => unreachable!(), } } @@ -384,6 +413,35 @@ impl Arbitrary for ClassRange { } } +impl Arbitrary for ByteClass { + fn arbitrary(g: &mut G) -> ByteClass { + let mut ranges: Vec = Arbitrary::arbitrary(g); + if ranges.is_empty() { + ranges.push(Arbitrary::arbitrary(g)); + } + let cls = ByteClass { ranges: ranges }.canonicalize(); + if g.gen() { cls.case_fold() } else { cls } + } + + fn shrink(&self) -> Box> { + Box::new(self.ranges.clone() + .shrink() + .filter(|ranges| ranges.len() > 0) + .map(|ranges| ByteClass { ranges: ranges }.canonicalize())) + } +} + +impl Arbitrary for ByteRange { + fn arbitrary(g: &mut G) -> ByteRange { + ByteRange::new(g.gen_range(97, 123), g.gen_range(97, 123)) + } + + fn shrink(&self) -> Box> { + Box::new((self.start, 
self.end) + .shrink().map(|(s, e)| ByteRange::new(s, e))) + } +} + #[test] fn display_regex_roundtrips() { // Given an AST, if we print it as a regex and then re-parse it, do we @@ -391,7 +449,8 @@ fn display_regex_roundtrips() { // A lot of this relies crucially on regex simplification. So this is // testing `Expr::simplify` as much as it is testing the `Display` impl. fn prop(e: Expr) -> bool { - e == Expr::parse(&e.to_string()).unwrap() + let parser = ExprBuilder::new().allow_bytes(true); + e == parser.parse(&e.to_string()).unwrap() } QuickCheck::new() .tests(10_000) diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml index b039b611f9..2d48e430ab 100644 --- a/regex_macros/Cargo.toml +++ b/regex_macros/Cargo.toml @@ -32,11 +32,5 @@ rand = "0.3" regex-syntax = { path = "../regex-syntax", version = "0.2" } [[test]] -path = "../tests/test_native.rs" -name = "native" - -[[bench]] -name = "native" -path = "../benches/bench_native.rs" -test = false -bench = true +path = "../tests/test_plugin.rs" +name = "plugin" diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index 92a8c1b3c0..ca1899f327 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -160,7 +160,7 @@ fn exec<'t>( use regex::internal::{Char, CharInput, InputAt, Input, Inst}; - let input = CharInput::new(input); + let input = CharInput::new(input.as_bytes()); let at = input.at(start); return Nfa { input: input, @@ -330,12 +330,12 @@ fn exec<'t>( } } -::regex::Regex::Native(::regex::internal::ExNative { +::regex::Regex(::regex::internal::_Regex::Native(::regex::internal::ExNative { original: $regex, names: &CAPTURES, groups: &CAPTURE_NAME_IDX, prog: exec, -}) +})) }) } @@ -393,6 +393,10 @@ fn exec<'t>( } }) } + EmptyLook::WordBoundaryAscii + | EmptyLook::NotWordBoundaryAscii => { + unreachable!() + } } } Inst::Save(ref inst) => { diff --git a/run-bench b/run-bench new file mode 100755 index 0000000000..8a7873d693 --- /dev/null +++ b/run-bench @@ -0,0 +1,46 @@ +#!/bin/bash + +if [ $# = 0 ] || [ $1 = '-h' ]; then + echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre | onig]" >&2 + exit 1 +fi + +which="$1" +shift +case $which in + rust) + exec cargo bench \ + --manifest-path benches/Cargo.toml \ + --bench rust \ + --features re-rust \ + "$@" + ;; + rust-bytes) + exec cargo bench \ + --manifest-path benches/Cargo.toml \ + --bench rust-bytes \ + --features re-rust-bytes \ + "$@" + ;; + rust-plugin) + exec cargo bench \ + --manifest-path benches/Cargo.toml \ + --bench rust-plugin \ + --features re-rust-plugin \ + "$@" + ;; + pcre) + exec cargo bench \ + --manifest-path benches/Cargo.toml \ + --bench pcre \ + --features re-pcre \ + "$@" + ;; + onig|oniguruma) + exec cargo bench \ + --manifest-path benches/Cargo.toml \ + --bench onig \ + --features re-onig \ + "$@" + ;; +esac diff --git a/run-kcov b/run-kcov new file mode 100755 index 0000000000..a39ecd37fe --- /dev/null +++ b/run-kcov @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e + +tests=( + default + default_bytes + backtrack + backtrack_utf8bytes + backtrack_bytes + nfa + nfa_utf8bytes + nfa_bytes + regex + regex_inline +) +tmpdir=$(mktemp -d) + +if [ "$1" = "--with-plugin" ]; then + echo "compiling regex_macros test, this can take a while..." 
+ cargo test --manifest-path regex_macros/Cargo.toml --no-run + kcov \ + --include-pattern '/regex/src/' \ + "$tmpdir/plugin" \ + $(ls -t ./regex_macros/target/debug/plugin-* | head -n1) +fi + +cargo test --no-run +for t in ${tests[@]}; do + kcov \ + --include-pattern '/regex/src/' \ + "$tmpdir/$t" \ + $(ls -t ./target/debug/"$t"-* | head -n1) +done + +kcov --merge target/cov "$tmpdir"/* diff --git a/src/backtrack.rs b/src/backtrack.rs index b80ff9cf60..88d829313d 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -186,13 +186,13 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { // Only quit if we're matching one regex. // If we're matching a regex set, then mush on and // try to find other matches. - if self.search.matches.len() <= 1 { + if !self.search.find_many_matches() { return true; } } } Job::SaveRestore { slot, old_pos } => { - self.search.captures[slot] = old_pos; + self.search.set_capture(slot, old_pos); } } } @@ -212,17 +212,16 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { return true; } Save(ref inst) => { - if inst.slot < self.search.captures.len() { + if let Some(old_pos) = self.search.capture(inst.slot) { // If this path doesn't work out, then we save the old // capture index (if one exists) in an alternate // job. If the next path fails, then the alternate // job is popped and the old capture index is restored. - let old_pos = self.search.captures[inst.slot]; self.m.jobs.push(Job::SaveRestore { slot: inst.slot, old_pos: old_pos, }); - self.search.captures[inst.slot] = Some(at.pos()); + self.search.set_capture(inst.slot, Some(at.pos())); } ip = inst.goto; } diff --git a/src/char.rs b/src/char.rs deleted file mode 100644 index a053fe2fd0..0000000000 --- a/src/char.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use std::char; -use std::cmp::Ordering; -use std::fmt; -use std::u32; - -use syntax; - -/// An inline representation of `Option`. -/// -/// This eliminates the need to do case analysis on `Option` to determine -/// ordinality with other characters. -/// -/// (The `Option` is not related to encoding. Instead, it is used in the -/// matching engines to represent the beginning and ending boundaries of the -/// search text.) -#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct Char(u32); - -impl fmt::Debug for Char { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match char::from_u32(self.0) { - None => write!(f, "Empty"), - Some(c) => write!(f, "{:?}", c), - } - } -} - -impl Char { - /// Returns true iff the character is absent. - #[inline] - pub fn is_none(self) -> bool { self.0 == u32::MAX } - - /// Returns the length of the character's UTF-8 encoding. - /// - /// If the character is absent, then `0` is returned. - #[inline] - pub fn len_utf8(self) -> usize { - char::from_u32(self.0).map_or(0, |c| c.len_utf8()) - } - - /// Returns true iff the character is a word character. - /// - /// If the character is absent, then false is returned. - pub fn is_word_char(self) -> bool { - char::from_u32(self.0).map_or(false, syntax::is_word_char) - } - - /// Converts the character to a real primitive `char`. 
- /// - /// If the character is absent, then `None` is returned. - pub fn as_char(self) -> Option { - // This is only used in the `regex!` macro because it expands char - // classes into `match` expressions (instead of binary search). - char::from_u32(self.0) - } -} - -impl From for Char { - fn from(c: char) -> Char { Char(c as u32) } -} - -impl From> for Char { - fn from(c: Option) -> Char { - c.map_or(Char(u32::MAX), |c| c.into()) - } -} - -impl PartialEq for Char { - #[inline] - fn eq(&self, other: &char) -> bool { self.0 == *other as u32 } -} - -impl PartialEq for char { - #[inline] - fn eq(&self, other: &Char) -> bool { *self as u32 == other.0 } -} - -impl PartialOrd for Char { - #[inline] - fn partial_cmp(&self, other: &char) -> Option { - self.0.partial_cmp(&(*other as u32)) - } -} - -impl PartialOrd for char { - #[inline] - fn partial_cmp(&self, other: &Char) -> Option { - (*self as u32).partial_cmp(&other.0) - } -} diff --git a/src/char_utf8.rs b/src/char_utf8.rs deleted file mode 100644 index ac6be54908..0000000000 --- a/src/char_utf8.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Pulled from std::char until encode_utf8 stabilizes. ---AG - -// UTF-8 ranges and tags for encoding characters -const TAG_CONT: u8 = 0b1000_0000; -const TAG_TWO_B: u8 = 0b1100_0000; -const TAG_THREE_B: u8 = 0b1110_0000; -const TAG_FOUR_B: u8 = 0b1111_0000; -const MAX_ONE_B: u32 = 0x80; -const MAX_TWO_B: u32 = 0x800; -const MAX_THREE_B: u32 = 0x10000; - -#[inline] -pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option { - let code = character as u32; - if code < MAX_ONE_B && !dst.is_empty() { - dst[0] = code as u8; - Some(1) - } else if code < MAX_TWO_B && dst.len() >= 2 { - dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - dst[1] = (code & 0x3F) as u8 | TAG_CONT; - Some(2) - } else if code < MAX_THREE_B && dst.len() >= 3 { - dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - dst[2] = (code & 0x3F) as u8 | TAG_CONT; - Some(3) - } else if dst.len() >= 4 { - dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; - dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - dst[3] = (code & 0x3F) as u8 | TAG_CONT; - Some(4) - } else { - None - } -} diff --git a/src/compile.rs b/src/compile.rs index a3471d80b1..8f1efdf2b2 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -13,7 +13,7 @@ use std::iter; use std::result; use std::sync::Arc; -use syntax::{Expr, Repeater, CharClass, ClassRange}; +use syntax::{Expr, Repeater, CharClass, ClassRange, ByteClass, ByteRange}; use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences}; use prog::{ @@ -33,6 +33,8 @@ struct Patch { entry: InstPtr, } +/// A compiler translates a regular expression AST to a sequence of +/// instructions. The sequence of instructions represents an NFA. pub struct Compiler { insts: Vec, compiled: Program, @@ -85,6 +87,15 @@ impl Compiler { self } + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.compiled.only_utf8 = yes; + self + } + /// When set, the machine returned is suitable for use in the DFA matching /// engine. /// @@ -94,7 +105,6 @@ impl Compiler { /// or impossible in the DFA engine.) 
pub fn dfa(mut self, yes: bool) -> Self { self.compiled.is_dfa = yes; - self.compiled.is_bytes = yes; self } @@ -131,12 +141,7 @@ impl Compiler { self.compiled.is_anchored_start = expr.is_anchored_start(); self.compiled.is_anchored_end = expr.is_anchored_end(); if self.compiled.needs_dotstar() { - let patch = try!(self.c(&Expr::Repeat { - e: Box::new(Expr::AnyChar), - r: Repeater::ZeroOrMore, - greedy: false, - })); - self.fill_to_next(patch.hole); + try!(self.c_dotstar()); } self.compiled.captures = vec![None]; self.compiled.start = self.insts.len(); @@ -158,12 +163,7 @@ impl Compiler { self.compiled.is_anchored_end = exprs.iter().all(|e| e.is_anchored_end()); if self.compiled.needs_dotstar() { - let patch = try!(self.c(&Expr::Repeat { - e: Box::new(Expr::AnyChar), - r: Repeater::ZeroOrMore, - greedy: false, - })); - self.fill_to_next(patch.hole); + try!(self.c_dotstar()); } self.compiled.start = self.insts.len(); @@ -202,6 +202,7 @@ impl Compiler { match *expr { Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }), Literal { ref chars, casei } => self.c_literal(chars, casei), + LiteralBytes { ref bytes, casei } => self.c_bytes(bytes, casei), AnyChar => self.c_class(&[ClassRange { start: '\x00', end: '\u{10ffff}', @@ -212,9 +213,23 @@ impl Compiler { ClassRange { start: '\x0b', end: '\u{10ffff}' }, ]) } + AnyByte => { + assert!(!self.compiled.only_utf8()); + self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }]) + } + AnyByteNoNL => { + assert!(!self.compiled.only_utf8()); + self.c_class_bytes(&[ + ByteRange { start: 0, end: 0x9 }, + ByteRange { start: 0xB, end: 0xFF }, + ]) + } Class(ref cls) => { self.c_class(cls) } + ClassBytes(ref cls) => { + self.c_class_bytes(cls) + } StartLine if self.compiled.is_reverse => { self.byte_classes.set_range(b'\n', b'\n'); self.c_empty_look(prog::EmptyLook::EndLine) @@ -247,6 +262,12 @@ impl Compiler { NotWordBoundary => { self.c_empty_look(prog::EmptyLook::NotWordBoundary) } + WordBoundaryAscii => { + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + NotWordBoundaryAscii => { + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } Group { ref e, i: None, name: None } => self.c(e), Group { ref e, i, ref name } => { // it's impossible to have a named capture without an index @@ -288,6 +309,24 @@ impl Compiler { } } + fn c_dotstar(&mut self) -> result::Result<(), Error> { + let patch = if !self.compiled.only_utf8() { + try!(self.c(&Expr::Repeat { + e: Box::new(Expr::AnyByte), + r: Repeater::ZeroOrMore, + greedy: false, + })) + } else { + try!(self.c(&Expr::Repeat { + e: Box::new(Expr::AnyChar), + r: Repeater::ZeroOrMore, + greedy: false, + })) + }; + self.fill_to_next(patch.hole); + Ok(()) + } + fn c_literal(&mut self, chars: &[char], casei: bool) -> Result { assert!(!chars.is_empty()); let mut chars: Box> = @@ -317,7 +356,7 @@ impl Compiler { } fn c_class(&mut self, ranges: &[ClassRange]) -> Result { - if self.compiled.is_bytes { + if self.compiled.uses_bytes() { CompileClass { c: self, ranges: ranges, @@ -334,6 +373,60 @@ impl Compiler { } } + fn c_bytes(&mut self, bytes: &[u8], casei: bool) -> Result { + assert!(!bytes.is_empty()); + let mut bytes: Box> = + if self.compiled.is_reverse { + Box::new(bytes.iter().rev()) + } else { + Box::new(bytes.iter()) + }; + let first = *bytes.next().expect("non-empty literal"); + let Patch { mut hole, entry } = try!(self.c_byte(first, casei)); + for &b in bytes { + let p = try!(self.c_byte(b, casei)); + self.fill(hole, p.entry); + hole = p.hole; + } + Ok(Patch { hole: hole, entry: entry 
}) + } + + fn c_byte(&mut self, b: u8, casei: bool) -> Result { + if casei { + self.c_class_bytes(&ByteClass::new(vec![ + ByteRange { start: b, end: b }, + ]).case_fold()) + } else { + self.c_class_bytes(&[ByteRange { start: b, end: b }]) + } + } + + fn c_class_bytes(&mut self, ranges: &[ByteRange]) -> Result { + assert!(!ranges.is_empty()); + + let first_split_entry = self.insts.len(); + let mut holes = vec![]; + let mut prev_hole = Hole::None; + for r in &ranges[0..ranges.len() - 1] { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let next = self.insts.len(); + self.byte_classes.set_range(r.start, r.end); + holes.push(self.push_hole(InstHole::Bytes { + start: r.start, end: r.end, + })); + prev_hole = self.fill_split(split, Some(next), None); + } + let next = self.insts.len(); + let r = &ranges[ranges.len() - 1]; + self.byte_classes.set_range(r.start, r.end); + holes.push(self.push_hole(InstHole::Bytes { + start: r.start, end: r.end, + })); + self.fill(prev_hole, next); + Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }) + } + fn c_empty_look(&mut self, look: EmptyLook) -> Result { let hole = self.push_hole(InstHole::EmptyLook { look: look }); Ok(Patch { hole: hole, entry: self.insts.len() - 1 }) @@ -910,7 +1003,7 @@ impl ByteClassSet { let mut class = 0u8; for i in 0..256 { byte_classes[i] = class; - if i > 0 && self.0[i] { + if self.0[i] { class = class.checked_add(1).unwrap(); } } diff --git a/src/dfa.rs b/src/dfa.rs index 239042a320..7958e7372c 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -85,6 +85,8 @@ pub fn can_exec(insts: &Program) -> bool { EmptyLook(ref inst) => { match inst.look { WordBoundary | NotWordBoundary => return false, + // We have a hope of supporting at least these some day. + WordBoundaryAscii | NotWordBoundaryAscii => return false, StartLine | EndLine | StartText | EndText => {} } } @@ -126,6 +128,9 @@ pub struct DfaCache { /// /// The maximum stack size is the number of NFA states. stack: Vec, + /// The total number of times this cache has been flushed by the DFA + /// because of space constraints. + cache_flush_count: u64, /// qcur and qnext are ordered sets with constant time /// addition/membership/clearing-whole-set and linear time iteration. They /// are used to manage the sets of NFA states in DFA states when computing @@ -165,11 +170,38 @@ pub struct Dfa<'a, 'b, 'c: 'b, 'm: 'b> { /// includes space for indicating which regex matched if executing a /// regex set. search: &'b mut Search<'c, 'm>, + /// The current position in the input. + at: usize, + /// The input position of the last cache flush. We use this to determine + /// if we're thrashing in the cache too often. If so, the DFA quits so + /// that we can fall back to the NFA algorithm. + last_cache_flush: usize, /// These are all from DfaCache. (Only {qcur,qnext} are missing.) compiled: &'a mut HashMap, states: &'a mut Vec, start_states: &'a mut Vec, stack: &'a mut Vec, + cache_flush_count: &'a mut u64, +} + +/// The result of running the DFA. +/// +/// Generally, the result is either a match or not a match, but sometimes the +/// DFA runs too slow because the cache size is too small. In that case, it +/// gives up with the intent of falling back to the NFA algorithm. +pub enum DfaResult { + Match, + NoMatch, + Quit, +} + +impl DfaResult { + pub fn is_match(&self) -> bool { + match *self { + DfaResult::Match => true, + DfaResult::NoMatch | DfaResult::Quit => false, + } + } } /// State is a DFA state. 
It contains transitions to next states (given an @@ -269,6 +301,7 @@ impl DfaCache { states: vec![State::invalid(), State::invalid()], start_states: vec![STATE_UNKNOWN; 256], stack: vec![], + cache_flush_count: 0, qcur: SparseSet::new(0), qnext: SparseSet::new(0), } @@ -306,7 +339,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { search: &'b mut Search<'c, 'm>, text: &[u8], at: usize, - ) -> bool { + ) -> DfaResult { // Retrieve our DFA cache from the program. If another thread tries to // execute this DFA *simultaneously*, then a new independent cache is // created. @@ -318,25 +351,29 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { prog: prog, start: 0, // filled in below search: search, + at: at, + last_cache_flush: at, compiled: &mut cache.compiled, states: &mut cache.states, start_states: &mut cache.start_states, stack: &mut cache.stack, + cache_flush_count: &mut cache.cache_flush_count, }; - dfa.start = match dfa.start_state(&mut cache.qcur, text, at) { - STATE_DEAD => return false, - si => si, + dfa.start = match dfa.start_state(&mut cache.qcur, text) { + None => return DfaResult::Quit, + Some(STATE_DEAD) => return DfaResult::NoMatch, + Some(si) => si, }; debug_assert!(dfa.start != STATE_UNKNOWN); - let matched = if prog.is_reverse { - dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text, at) + let result = if prog.is_reverse { + dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) } else { - dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text, at) + dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) }; - if matched && dfa.search.matches.len() <= 1 { + if result.is_match() && !dfa.search.find_many_matches() { dfa.search.set_match(0); } - matched + result } /// Executes the DFA on a forward NFA. @@ -347,8 +384,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { qcur: &mut SparseSet, qnext: &mut SparseSet, text: &[u8], - at: usize, - ) -> bool { + ) -> DfaResult { // For the most part, the DFA is basically: // // last_match = null @@ -385,14 +421,14 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // when searching forwards. Its maximum value is `text.len()`, // which can only happen after the special EOF sentinel value is fed // to the DFA. - let (mut si, mut i, mut matched) = (self.start, at, false); - while i < text.len() { + let (mut si, mut result) = (self.start, DfaResult::NoMatch); + while self.at < text.len() { // Our set of literal prefixes can itself be a DFA, but it is // offline and can generally be quite a bit faster. (For instance, // memchr is used if possible.) if !self.prog.prefixes.is_empty() && si == self.start { - i = match self.prefix_at(text, i) { - None => return false, + self.at = match self.prefix_at(text, self.at) { + None => return DfaResult::NoMatch, Some(i) => i, }; } @@ -401,51 +437,58 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // but we inline it manually here to avoid the extra branch and // also because we know we have a real `u8` (not a `Byte`, which // may be the special EOF sentinel value). - let cls = self.prog.byte_classes[text[i] as usize]; + let cls = self.prog.byte_classes[text[self.at] as usize]; let mut next_si = self.states[si as usize].next[cls as usize]; if next_si <= STATE_DEAD { if next_si == STATE_DEAD { - return matched; + return result; } // The next state may not have been cached, so re-compute it // (i.e., follow epsilon transitions). 
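The pseudocode comment near the top of `exec_at` (remember the last match seen, step one byte at a time through the transition table, stop only at the dead state) is the heart of the forward DFA. A minimal sketch of that loop, using a hand-built transition function for the toy pattern `ab+` rather than the crate's lazily cached states (`step` and `find_end` are made-up names, not this code base's API):

    // Toy DFA for `ab+`, anchored at `start`.
    // States: 0 = start, 1 = saw 'a', 2 = saw "ab+" (a match state), DEAD = no
    // possible match from here on.
    const DEAD: usize = 3;

    fn step(state: usize, b: u8) -> usize {
        match (state, b) {
            (0, b'a') => 1,
            (1, b'b') | (2, b'b') => 2,
            _ => DEAD,
        }
    }

    /// Returns where the leftmost match starting at `start` ends, mirroring
    /// how the real loop records `set_end` each time it passes through a
    /// matching state and only gives up when it hits the dead state.
    fn find_end(text: &[u8], start: usize) -> Option<usize> {
        let (mut si, mut last_match) = (0, None);
        for (i, &b) in text.iter().enumerate().skip(start) {
            si = step(si, b);
            if si == DEAD {
                return last_match;
            }
            if si == 2 {
                last_match = Some(i + 1);
            }
        }
        last_match
    }

    fn main() {
        assert_eq!(find_end(b"abbbc", 0), Some(4));
        assert_eq!(find_end(b"ac", 0), None);
    }

The real engine differs mainly in that its states are built lazily from the NFA (hence `exec_byte` and the cache) and that an implicit `.*?` prefix makes the search unanchored.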
- next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i])); + let b = Byte::byte(text[self.at]); + next_si = match self.exec_byte(qcur, qnext, si, b) { + None => return DfaResult::Quit, + Some(next_si) => next_si, + }; debug_assert!(next_si != STATE_UNKNOWN); if next_si == STATE_DEAD { - return matched; + return result; } } si = next_si; if self.states[si as usize].is_match { if self.search.quit_after_first_match() { - return true; + return DfaResult::Match; } - matched = true; - self.search.set_end(Some(i)); + result = DfaResult::Match; + self.search.set_end(Some(self.at)); } - i += 1; + self.at += 1; } // Run the DFA once more on the special EOF senitnel value. - si = self.next_state(qcur, qnext, si, Byte::eof()); + si = match self.next_state(qcur, qnext, si, Byte::eof()) { + None => return DfaResult::Quit, + Some(si) => si, + }; debug_assert!(si != STATE_UNKNOWN); if si == STATE_DEAD { - return matched; + return result; } if self.states[si as usize].is_match { if self.search.quit_after_first_match() { - return true; + return DfaResult::Match; } - matched = true; + result = DfaResult::Match; self.search.set_end(Some(text.len())); } - if matched && self.search.matches.len() != 1 { + if result.is_match() && !self.search.find_one_match() { for &ip in &self.states[si as usize].insts { if let Inst::Match(slot) = self.prog[ip as usize] { self.search.set_match(slot); } } } - matched + result } /// Executes the DFA on a reverse NFA. @@ -454,8 +497,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { qcur: &mut SparseSet, qnext: &mut SparseSet, text: &[u8], - at: usize, - ) -> bool { + ) -> DfaResult { // The comments in `exec_at` above mostly apply here too. The main // difference is that we move backwards over the input and we look for // the longest possible match instead of the leftmost-first match. @@ -463,46 +505,54 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // N.B. The code duplication here is regrettable. Efforts to improve // it without sacrificing performance are welcome. ---AG debug_assert!(self.prog.is_reverse); - let (mut si, mut i, mut matched) = (self.start, at, false); - while i > 0 { - i -= 1; + let (mut si, mut result) = (self.start, DfaResult::NoMatch); + while self.at > 0 { + self.at -= 1; - let cls = self.prog.byte_classes[text[i] as usize]; + let cls = self.prog.byte_classes[text[self.at] as usize]; let mut next_si = self.states[si as usize].next[cls as usize]; if next_si <= STATE_DEAD { if next_si == STATE_DEAD { - return matched; + return result; } // The next state may not have been cached, so re-compute it // (i.e., follow epsilon transitions). - next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i])); + let b = Byte::byte(text[self.at]); + next_si = match self.exec_byte(qcur, qnext, si, b) { + None => return DfaResult::Quit, + Some(next_si) => next_si, + }; debug_assert!(next_si != STATE_UNKNOWN); if next_si == STATE_DEAD { - return matched; + return result; } } si = next_si; if self.states[si as usize].is_match { if self.search.quit_after_first_match() { - return true; + return DfaResult::NoMatch; } - matched = true; - self.search.set_start(Some(i+1)); + result = DfaResult::Match; + self.search.set_start(Some(self.at+1)); } } - si = self.next_state(qcur, qnext, si, Byte::eof()); + // Run the DFA once more on the special EOF senitnel value. 
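The reverse pass described here is what turns "we know where the match ends" into "we know where it starts": the same text is walked backwards from the end position with a DFA for the reversed pattern, and the longest match wins. A hedged sketch, reusing the toy pattern `ab+` from the sketch above (its reversal is `b+a`); `rev_step` and `find_start` are made-up names, not the crate's API:

    // Toy reverse DFA for `ab+` reversed, i.e. `b+a`.
    // States: 0 = start, 1 = saw one or more 'b', 2 = saw "b+a" (match),
    // 3 = dead.
    fn rev_step(state: usize, b: u8) -> usize {
        match (state, b) {
            (0, b'b') | (1, b'b') => 1,
            (1, b'a') => 2,
            _ => 3,
        }
    }

    /// Walks backwards from `end` and keeps the leftmost position at which a
    /// match state was seen, i.e. the longest reverse match; that position is
    /// the start of the overall match.
    fn find_start(text: &[u8], end: usize) -> Option<usize> {
        let (mut si, mut start) = (0, None);
        for i in (0..end).rev() {
            si = rev_step(si, text[i]);
            if si == 3 {
                return start;
            }
            if si == 2 {
                start = Some(i);
            }
        }
        start
    }

    fn main() {
        // Given that the forward pass reported a match ending at 5 in
        // "xabbbc", the reverse pass recovers start position 1, so the match
        // is &text[1..5] == "abbb".
        assert_eq!(find_start(b"xabbbc", 5), Some(1));
    }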
+ si = match self.next_state(qcur, qnext, si, Byte::eof()) { + None => return DfaResult::Quit, + Some(si) => si, + }; debug_assert!(si != STATE_UNKNOWN); if si == STATE_DEAD { - return matched; + return result; } if self.states[si as usize].is_match { if self.search.quit_after_first_match() { - return true; + return DfaResult::Match; } - matched = true; + result = DfaResult::Match; self.search.set_start(Some(0)); } - matched + result } /// Computes the next state given the current state and the current input @@ -519,7 +569,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { qnext: &mut SparseSet, mut si: StatePtr, b: Byte, - ) -> StatePtr { + ) -> Option { use prog::Inst::*; // Initialize a queue with the current DFA state's NFA states. @@ -565,6 +615,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // byte. Namely, if this DFA state containing a matching NFA state, // then it is the *next* DFA state that is marked as a match. let mut flags = Flags::new(); + let mut is_match = false; if b.as_byte().map_or(false, |b| b == b'\n') { flags.set_start_line(true); } @@ -578,10 +629,10 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // These states are handled when following epsilon transitions. Save(_) | Split(_) | EmptyLook(_) => {} Match(_) => { - flags.set_match(true); + is_match = true; if !self.continue_past_first_match() { break; - } else if self.search.matches.len() != 1 { + } else if !self.search.find_one_match() { // If we are continuing on to find other matches, // then keep a record of the match states we've seen. if !qnext.contains_ip(ip as usize) { @@ -598,7 +649,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { } } let mut cache = true; - if b.is_eof() && self.search.matches.len() != 1 { + if b.is_eof() && !self.search.find_one_match() { // If we're processing the last byte of the input and we're // matching a regex set, then make the next state contain the // previous states transitions. We do this so that the main @@ -614,14 +665,17 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // N.B. We pass `&mut si` here because the cache may clear itself if // it has gotten too full. When that happens, the location of the // current state may change. - let next = self.cached_state(qnext, flags.is_match(), Some(&mut si)); + let next = match self.cached_state(qnext, is_match, Some(&mut si)) { + None => return None, + Some(next) => next, + }; debug_assert!(next != STATE_UNKNOWN); // And now store our state in the current state's next list. let cls = self.byte_class(b); if cache { self.states[si as usize].next[cls] = next; } - next + Some(next) } /// Follows the epsilon transitions starting at (and including) `ip`. The @@ -686,7 +740,10 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { } StartLine | EndLine | StartText | EndText => {} // The DFA doesn't support word boundaries. :-( - WordBoundary | NotWordBoundary => unreachable!(), + WordBoundary + | NotWordBoundary + | WordBoundaryAscii + | NotWordBoundaryAscii => unreachable!(), } } Save(ref inst) => self.stack.push(inst.goto as InstPtr), @@ -721,7 +778,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { q: &SparseSet, is_match: bool, current_state: Option<&mut StatePtr>, - ) -> StatePtr { + ) -> Option { // If we couldn't come up with a non-empty key to represent this state, // then it is dead and can never lead to a match. // @@ -730,17 +787,20 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // we should follow epsilon transitions at the empty string preceding // the current byte. 
let (key, inst_flags) = match self.cached_state_key(q, is_match) { - None => return STATE_DEAD, + None => return Some(STATE_DEAD), Some(v) => v, }; // In the cache? Cool. Done. if let Some(&si) = self.compiled.get(&key) { - return si; + return Some(si); } // If the cache has gotten too big, wipe it. if self.approximate_size() > CACHE_LIMIT { - self.clear_cache_and_save(current_state); + if !self.clear_cache_and_save(current_state) { + // Ooops. DFA is giving up. + return None; + } } // OK, now there's enough room to push our new state. @@ -754,7 +814,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { }); let si = usize_to_u32(self.states.len().checked_sub(1).unwrap()); self.compiled.insert(key, si); - si + Some(si) } /// Produces a key suitable for describing a state in the DFA cache. @@ -810,7 +870,10 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { inst_flags.set_end(true); insts.push(ip); } - WordBoundary | NotWordBoundary => unreachable!(), + WordBoundary + | NotWordBoundary + | WordBoundaryAscii + | NotWordBoundaryAscii => unreachable!(), } } Match(_) => { @@ -838,27 +901,54 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// /// The current state must be provided here in case its location in the /// cache changes. - fn clear_cache_and_save(&mut self, current_state: Option<&mut StatePtr>) { + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache_and_save( + &mut self, + current_state: Option<&mut StatePtr>, + ) -> bool { if self.states.len() <= 2 { // Why <= 2? Well, the states list always has its first two // positions filled by marker states for STATE_UNKNOWN and // STATE_DEAD. These states aren't actually used, but exist to // make sure no other state lives in those locations. Therefore, // a state vec with length <= 2 is actually "empty." - return; + return true; } match current_state { None => self.clear_cache(), Some(si) => { let cur = self.copy_state(*si); - self.clear_cache(); + if !self.clear_cache() { + return false; + } *si = self.restore_state(cur); + true } } } /// Wipes the state cache, but saves and restores the current start state. - fn clear_cache(&mut self) { + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache(&mut self) -> bool { + // Bail out of the DFA if we're moving too "slowly." + // A heuristic from RE2: assume the DFA is too slow if it is processing + // 10 or fewer bytes per state. + // Additionally, we permit the cache to be flushed a few times before + // caling it quits. + if *self.cache_flush_count >= 3 + && self.at >= self.last_cache_flush + && (self.at - self.last_cache_flush) <= 10 * self.states.len() { + return false; + } + // Update statistics tracking cache flushes. + self.last_cache_flush = self.at; + *self.cache_flush_count += 1; + + // OK, actually flush the cache. let start = self.copy_state(self.start); self.states.clear(); self.compiled.clear(); @@ -868,6 +958,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { self.states.push(State::invalid()); self.states.push(State::invalid()); self.start = self.restore_state(start); + true } /// Returns a fresh copy of state si with all of its next pointers set to @@ -905,18 +996,21 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// /// The pointer can be to a real state, or it can be STATE_DEAD. /// STATE_UNKNOWN cannot be returned. + /// + /// None is returned if a new state could not be allocated (i.e., the DFA + /// ran out of space and thinks it's running too slowly). 
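The bail-out condition added to `clear_cache` above is worth isolating, since it is what makes `DfaResult::Quit` possible: after the first few flushes, the DFA gives up whenever it has consumed no more than about ten bytes of input per cached state since the previous flush. A standalone restatement with made-up function and parameter names, using the same RE2-inspired constants as the code above:

    /// True if the lazy DFA should stop and let a slower engine take over.
    /// Mirrors the check in `clear_cache`: tolerate the first few cache
    /// flushes, then quit once progress drops to <= 10 bytes per cached state.
    fn dfa_should_quit(
        cache_flush_count: u64,  // times the state cache has been wiped
        at: usize,               // current position in the input
        last_cache_flush: usize, // input position at the previous wipe
        num_states: usize,       // states currently in the cache
    ) -> bool {
        cache_flush_count >= 3
            && at >= last_cache_flush
            && (at - last_cache_flush) <= 10 * num_states
    }

    fn main() {
        // Plenty of progress since the last flush: keep using the DFA.
        assert!(!dfa_should_quit(5, 100_000, 0, 256));
        // Thrashing: a third flush after only ~2 bytes of input per state.
        assert!(dfa_should_quit(3, 1_000, 500, 256));
    }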
fn next_state( &mut self, qcur: &mut SparseSet, qnext: &mut SparseSet, si: StatePtr, b: Byte, - ) -> StatePtr { + ) -> Option { let cls = self.byte_class(b); match self.states[si as usize].next[cls] { STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), - STATE_DEAD => return STATE_DEAD, - nsi => return nsi, + STATE_DEAD => return Some(STATE_DEAD), + nsi => return Some(nsi), } } @@ -930,18 +1024,17 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { &mut self, q: &mut SparseSet, text: &[u8], - at: usize, - ) -> StatePtr { + ) -> Option { let start_flags = if self.prog.is_reverse { - self.start_flags_reverse(text, at) + self.start_flags_reverse(text, self.at) } else { - self.start_flags(text, at) + self.start_flags(text, self.at) }; let flagi = start_flags.0 as usize; match self.start_states[flagi] { STATE_UNKNOWN => {} - STATE_DEAD => return STATE_DEAD, - si => return si, + STATE_DEAD => return Some(STATE_DEAD), + si => return Some(si), } q.clear(); self.follow_epsilons(0, q, start_flags); @@ -949,9 +1042,12 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { // by one byte. Given an empty string and an empty match, the match // won't actually occur until the DFA processes the special EOF // sentinel byte. - let sp = self.cached_state(q, false, None); + let sp = match self.cached_state(q, false, None) { + None => return None, + Some(sp) => sp, + }; self.start_states[flagi] = sp; - sp + Some(sp) } /// Computes the set of starting flags for the given position in text. @@ -1016,7 +1112,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// the longest match (for reverse search) or all possible matches (for /// regex sets). fn continue_past_first_match(&self) -> bool { - self.prog.is_reverse || self.search.matches.len() != 1 + self.prog.is_reverse || !self.search.find_one_match() } /// Approximate size returns the approximate heap space currently used by @@ -1079,15 +1175,6 @@ impl Flags { } } - #[inline] - fn is_match(&self) -> bool { self.0 & 0b1_0000000 > 0 } - - #[inline] - fn set_match(&mut self, yes: bool) -> &mut Self { - self.set(yes, 0b1_0000000); - self - } - #[inline] fn is_start(&self) -> bool { self.0 & 0b0_1_000000 > 0 } @@ -1161,7 +1248,6 @@ impl fmt::Debug for State { impl fmt::Debug for Flags { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Flags") - .field("match", &if self.is_match() { 1 } else { 0 }) .field("start", &if self.is_start() { 1 } else { 0 }) .field("end", &if self.is_end() { 1 } else { 0 }) .field("start_line", &if self.is_start_line() { 1 } else { 0 }) diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000000..8e1cbaaa2d --- /dev/null +++ b/src/error.rs @@ -0,0 +1,74 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::fmt; + +use syntax; + +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Debug)] +pub enum Error { + /// A syntax error. + Syntax(syntax::Error), + /// The compiled program exceeded the set size limit. + /// The argument is the size limit imposed. + CompiledTooBig(usize), + /// An invalid set is a regex set with fewer than 2 regular expressions. + InvalidSet, + /// Hints that destructuring should not be exhaustive. 
+ /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ::std::error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err.description(), + Error::CompiledTooBig(_) => "compiled program too big", + Error::InvalidSet => { + "sets must contain 2 or more regular expressions" + } + Error::__Nonexhaustive => unreachable!(), + } + } + + fn cause(&self) -> Option<&::std::error::Error> { + match *self { + Error::Syntax(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => { + write!(f, "Compiled regex exceeds size limit of {} bytes.", + limit) + } + Error::InvalidSet => { + write!(f, "Sets must contain 2 or more regular expressions.") + } + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl From for Error { + fn from(err: syntax::Error) -> Error { + Error::Syntax(err) + } +} diff --git a/src/exec.rs b/src/exec.rs index bd80b2ac18..b436a9f92a 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -11,22 +11,24 @@ use std::collections::HashMap; use std::sync::Arc; +use syntax; + use backtrack::{self, Backtrack}; use compile::Compiler; -use dfa::{self, Dfa}; +use dfa::{self, Dfa, DfaResult}; +use error::Error; use input::{ByteInput, CharInput}; -use literals::BuildPrefixes; +use literals::Literals; use nfa::Nfa; use prog::{Program, InstPtr}; -use syntax; - -use {Regex, Error}; +use re_bytes; +use re_unicode; pub type CaptureSlots<'a> = &'a mut [CaptureSlot]; pub type CaptureSlot = Option; -/// The parameters to running one of the four match engines. +/// The parameters to running a regex search over some text. #[derive(Debug)] pub struct Search<'caps, 'matches> { /// The matching engine writes capture locations to this slice. @@ -41,41 +43,76 @@ pub struct Search<'caps, 'matches> { /// In standard searches, there is exactly one value in this slice and it /// should be initialized to `false`. When executing sets of regexes, /// there should be a location for each regex. - pub matches: &'matches mut [bool], + matches: &'matches mut [bool], + /// Whether the matching engine has recorded any match. 
+ matched_any: bool, } impl<'caps, 'matches> Search<'caps, 'matches> { + pub fn new( + captures: CaptureSlots<'caps>, + matches: &'matches mut [bool], + ) -> Search<'caps, 'matches> { + Search { + captures: captures, + matches: matches, + matched_any: false, + } + } + pub fn quit_after_first_match(&self) -> bool { - self.captures.is_empty() && self.matches.len() == 1 + self.captures.is_empty() && self.matches.len() <= 1 } - pub fn all_matched(&self) -> bool { - self.matches.iter().all(|m| *m) + pub fn find_many_matches(&self) -> bool { + self.matches.len() > 1 } - pub fn copy_captures_from(&mut self, caps: &[Option]) { - for (slot, val) in self.captures.iter_mut().zip(caps.iter()) { - *slot = *val; - } + pub fn find_one_match(&self) -> bool { + self.matches.len() == 1 + } + + pub fn matched_all(&self) -> bool { + self.matches.iter().all(|m| *m) } pub fn set_match(&mut self, match_slot: usize) { + self.matched_any = true; if let Some(old) = self.matches.get_mut(match_slot) { *old = true; } } - pub fn set_start(&mut self, pos: Option) { - self.set_capture(0, pos); + pub fn capture(&self, i: usize) -> Option { + self.captures.get(i).map(|&slot| slot) + } + + pub fn set_start(&mut self, slot: CaptureSlot) { + self.set_capture(0, slot); + } + + pub fn set_end(&mut self, slot: CaptureSlot) { + self.set_capture(1, slot); } - pub fn set_end(&mut self, pos: Option) { - self.set_capture(1, pos); + pub fn set_capture(&mut self, i: usize, slot: CaptureSlot) { + if let Some(old_slot) = self.captures.get_mut(i) { + *old_slot = slot; + } } - fn set_capture(&mut self, i: usize, pos: Option) { - if let Some(old_pos) = self.captures.get_mut(i) { - *old_pos = pos; + pub fn copy_captures_from(&mut self, caps: &[CaptureSlot]) { + for (slot, val) in self.captures.iter_mut().zip(caps.iter()) { + *slot = *val; + } + } + + pub fn reset(&mut self) { + for slot in self.captures.iter_mut() { + *slot = None; + } + for m in self.matches.iter_mut() { + *m = false; } } } @@ -127,6 +164,7 @@ pub struct ExecBuilder { match_engine: MatchEngine, size_limit: usize, bytes: bool, + only_utf8: bool, } impl ExecBuilder { @@ -151,6 +189,7 @@ impl ExecBuilder { match_engine: MatchEngine::Automatic, size_limit: 10 * (1 << 20), bytes: false, + only_utf8: true, } } @@ -205,8 +244,7 @@ impl ExecBuilder { /// By default, the NFA engines match on Unicode scalar values. They can /// be made to use byte based programs instead. In general, the byte based /// programs are slower because of a less efficient encoding of character - /// classes. However, it may be useful (some day) for matching on raw - /// bytes that may not be UTF-8. + /// classes. /// /// Note that this does not impact DFA matching engines, which always /// execute on bytes. @@ -215,6 +253,15 @@ impl ExecBuilder { self } + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.only_utf8 = yes; + self + } + /// Build an executor that can run a regular expression. 
pub fn build(self) -> Result { if self.res.is_empty() { @@ -222,29 +269,36 @@ impl ExecBuilder { } let mut exprs = vec![]; for re in &self.res { - exprs.push(try!(syntax::Expr::parse(re))); + let parser = + syntax::ExprBuilder::new() + .allow_bytes(!self.only_utf8) + .unicode(self.only_utf8); + exprs.push(try!(parser.parse(re))); } let mut prog = try!( Compiler::new() .size_limit(self.size_limit) .bytes(self.bytes) + .only_utf8(self.only_utf8) .compile(&exprs)); let mut dfa = try!( Compiler::new() .size_limit(self.size_limit) .dfa(true) + .only_utf8(self.only_utf8) .compile(&exprs)); let dfa_reverse = try!( Compiler::new() .size_limit(self.size_limit) .dfa(true) + .only_utf8(self.only_utf8) .reverse(true) .compile(&exprs)); // Compute literal prefixes for only `prog`, which is likely a Unicode // based program. Literal prefix extract currently works better on // Unicode programs. - prog.prefixes = BuildPrefixes::new(&prog).literals().into_matcher(); + prog.prefixes = Literals::prefixes(&prog); // And give it to the DFA too, which can use Unicode prefixes even // though the program itself is byte based. dfa.prefixes = prog.prefixes.clone(); @@ -283,7 +337,7 @@ impl Exec { pub fn exec<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { // Why isn't the DFA or literal engine checked for here? Well, it's @@ -300,7 +354,7 @@ impl Exec { fn exec_auto<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { if search.captures.len() <= 2 && self.prog.prefixes.at_match() { @@ -322,13 +376,17 @@ impl Exec { fn exec_dfa<'a, 'c, 'm>( &self, search: &'a mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { debug_assert!(self.can_dfa); - let btext = text.as_bytes(); - if !Dfa::exec(&self.dfa, search, btext, start) { - return false; + match Dfa::exec(&self.dfa, search, text, start) { + DfaResult::Match => {} // fallthrough + DfaResult::NoMatch => return false, + DfaResult::Quit => { + search.reset(); + return self.exec_auto_nfa(search, text, start); + } } let match_end = match search.captures.get(1) { Some(&Some(i)) => i, @@ -354,10 +412,17 @@ impl Exec { } // OK, now we find the start of the match by running the DFA backwards // on the text. We *start* the search at the end of the match. 
- let matched = Dfa::exec( - &self.dfa_reverse, search, &btext[start..], match_end - start); - if !matched { - panic!("BUG: forward match implies backward match"); + let result = Dfa::exec( + &self.dfa_reverse, search, &text[start..], match_end - start); + match result { + DfaResult::Match => {} // fallthrough + DfaResult::NoMatch => { + panic!("BUG: forward match implies backward match"); + } + DfaResult::Quit => { + search.reset(); + return self.exec_auto_nfa(search, text, start); + } } let match_start = match search.captures.get(0) { Some(&Some(i)) => start + i, @@ -378,7 +443,7 @@ impl Exec { fn exec_auto_nfa<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { if backtrack::should_exec(self.prog.len(), text.len()) { @@ -392,10 +457,10 @@ impl Exec { fn exec_nfa<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { - if self.prog.is_bytes { + if self.prog.uses_bytes() { Nfa::exec(&self.prog, search, ByteInput::new(text), start) } else { Nfa::exec(&self.prog, search, CharInput::new(text), start) @@ -406,10 +471,10 @@ impl Exec { fn exec_backtrack<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { - if self.prog.is_bytes { + if self.prog.uses_bytes() { Backtrack::exec(&self.prog, search, ByteInput::new(text), start) } else { Backtrack::exec(&self.prog, search, CharInput::new(text), start) @@ -426,11 +491,11 @@ impl Exec { fn exec_literals<'c, 'm>( &self, search: &mut Search<'c, 'm>, - text: &str, + text: &[u8], start: usize, ) -> bool { debug_assert!(self.prog.prefixes.at_match()); - match self.prog.prefixes.find(&text.as_bytes()[start..]) { + match self.prog.prefixes.find(&text[start..]) { None => false, Some((s, e)) => { if search.captures.len() == 2 { @@ -443,8 +508,14 @@ impl Exec { } /// Build a dynamic Regex from this executor. - pub fn into_regex(self) -> Regex { - Regex::Dynamic(self) + pub fn into_regex(self) -> re_unicode::Regex { + re_unicode::Regex::from(self) + } + + /// Build a dynamic Regex from this executor that can match arbitrary + /// bytes. 
+ pub fn into_byte_regex(self) -> re_bytes::Regex { + re_bytes::Regex::from(self) } /// The original regular expressions given by the caller that were diff --git a/src/expand.rs b/src/expand.rs new file mode 100644 index 0000000000..9bea703881 --- /dev/null +++ b/src/expand.rs @@ -0,0 +1,91 @@ +use std::str; + +use memchr::memchr; + +use bytes::Captures; + +pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec) { + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = cap_ref.rest; + match cap_ref.cap { + Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), + Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), + } + } + dst.extend(replacement); +} + +struct CaptureRef<'a> { + rest: &'a [u8], + cap: Ref<'a>, +} + +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +fn find_cap_ref(mut replacement: &[u8]) -> Option { + if replacement.len() <= 1 || replacement[0] != b'$' { + return None; + } + let mut brace = false; + replacement = &replacement[1..]; + if replacement[0] == b'{' { + brace = true; + replacement = &replacement[1..]; + } + let mut cap_end = 0; + while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == 0 { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check with either unsafe or by parsing the number straight from &[u8]. + let cap = str::from_utf8(&replacement[..cap_end]) + .ok().expect("valid UTF-8 capture name"); + if brace { + if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + return None; + } + cap_end += 1; + } + Some(CaptureRef { + rest: &replacement[cap_end..], + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + }) +} + +fn is_valid_cap_letter(b: &u8) -> bool { + match *b { + b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, + _ => false, + } +} diff --git a/src/input.rs b/src/input.rs index 04442bf93f..912646d4a7 100644 --- a/src/input.rs +++ b/src/input.rs @@ -8,9 +8,15 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +use std::char; +use std::cmp::Ordering; +use std::fmt; use std::ops; +use std::u32; -use char::Char; +use syntax; + +use utf8::{decode_utf8, decode_last_utf8}; use literals::Literals; /// Represents a location in the input. @@ -100,19 +106,19 @@ impl<'a, T: Input> Input for &'a T { /// An input reader over characters. #[derive(Clone, Copy, Debug)] -pub struct CharInput<'t>(&'t str); +pub struct CharInput<'t>(&'t [u8]); impl<'t> CharInput<'t> { /// Return a new character input reader for the given string. - pub fn new(s: &'t str) -> CharInput<'t> { + pub fn new(s: &'t [u8]) -> CharInput<'t> { CharInput(s) } } impl<'t> ops::Deref for CharInput<'t> { - type Target = str; + type Target = [u8]; - fn deref(&self) -> &str { + fn deref(&self) -> &[u8] { self.0 } } @@ -125,7 +131,7 @@ impl<'t> Input for CharInput<'t> { // used *a lot* in the guts of the matching engines. 
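The new `expand` routine implements the replacement-string grammar for the byte API: `$1` or `$name` substitutes a capture group (empty if the group is absent), `${name}` delimits the group name explicitly, and `$$` is a literal `$`. A hedged usage sketch through the public interface, assuming `bytes::Regex::replace` accepts a `&[u8]` replacement and routes it through this expansion, as the `Replacer` plumbing suggests:

    extern crate regex;

    use regex::bytes::Regex;

    fn main() {
        // Named groups referenced as `$first` / `$last`.
        let re = Regex::new(r"(?P<last>\w+),\s+(?P<first>\w+)").unwrap();
        let swapped = re.replace(&b"Springsteen, Bruce"[..],
                                 &b"$first $last"[..]);
        assert_eq!(&swapped[..], &b"Bruce Springsteen"[..]);

        // `$$` always becomes a literal dollar sign, and `${...}` makes the
        // group-name boundary explicit.
        let re = Regex::new(r"(?P<amount>[0-9]+)").unwrap();
        let priced = re.replace(&b"price: 5"[..], &b"$$${amount}.00"[..]);
        assert_eq!(&priced[..], &b"price: $5.00"[..]);
    }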
#[inline(always)] fn at(&self, i: usize) -> InputAt { - let c = self[i..].chars().next().into(); + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); InputAt { pos: i, c: c, @@ -139,13 +145,11 @@ impl<'t> Input for CharInput<'t> { } fn previous_char(&self, at: InputAt) -> Char { - self[..at.pos()].chars().rev().next().into() + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option { - prefixes - .find(&self.as_bytes()[at.pos()..]) - .map(|(s, _)| self.at(at.pos() + s)) + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) } fn len(&self) -> usize { @@ -153,7 +157,7 @@ impl<'t> Input for CharInput<'t> { } fn as_bytes(&self) -> &[u8] { - self.0.as_bytes() + self.0 } } @@ -163,19 +167,19 @@ impl<'t> Input for CharInput<'t> { /// easy access to necessary Unicode decoding (used for word boundary look /// ahead/look behind). #[derive(Clone, Copy, Debug)] -pub struct ByteInput<'t>(&'t str); +pub struct ByteInput<'t>(&'t [u8]); impl<'t> ByteInput<'t> { /// Return a new byte-based input reader for the given string. - pub fn new(s: &'t str) -> ByteInput<'t> { + pub fn new(s: &'t [u8]) -> ByteInput<'t> { ByteInput(s) } } impl<'t> ops::Deref for ByteInput<'t> { - type Target = str; + type Target = [u8]; - fn deref(&self) -> &str { + fn deref(&self) -> &[u8] { self.0 } } @@ -186,23 +190,21 @@ impl<'t> Input for ByteInput<'t> { InputAt { pos: i, c: None.into(), - byte: self.as_bytes().get(i).map(|&b| b), + byte: self.get(i).map(|&b| b), len: 1, } } fn next_char(&self, at: InputAt) -> Char { - self[at.pos()..].chars().next().into() + decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() } fn previous_char(&self, at: InputAt) -> Char { - self[..at.pos()].chars().rev().next().into() + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option { - prefixes - .find(&self.as_bytes()[at.pos()..]) - .map(|(s, _)| self.at(at.pos() + s)) + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) } fn len(&self) -> usize { @@ -210,6 +212,101 @@ impl<'t> Input for ByteInput<'t> { } fn as_bytes(&self) -> &[u8] { - self.0.as_bytes() + self.0 + } +} + +/// An inline representation of `Option`. +/// +/// This eliminates the need to do case analysis on `Option` to determine +/// ordinality with other characters. +/// +/// (The `Option` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. + #[inline] + pub fn is_none(self) -> bool { self.0 == u32::MAX } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `0` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map_or(0, |c| c.len_utf8()) + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + char::from_u32(self.0).map_or(false, syntax::is_word_char) + } + + /// Returns true iff the byte is a word byte. + /// + /// If the byte is absent, then false is returned. 
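`CharInput` and `ByteInput` now operate on `&[u8]`, so the next and previous characters are recovered with `decode_utf8`/`decode_last_utf8` rather than `str::chars`. A rough stand-in for what those helpers provide, built on `std::str::from_utf8` instead of a hand-rolled decoder; `decode_first` and `decode_last` are illustrative names, not the crate's functions.

```rust
use std::str;

/// Decode the first char of `bytes` and its encoded length, if any.
fn decode_first(bytes: &[u8]) -> Option<(char, usize)> {
    // A UTF-8 encoded char is at most 4 bytes; grow the prefix until it parses.
    let prefix = &bytes[..bytes.len().min(4)];
    for len in 1..=prefix.len() {
        if let Ok(s) = str::from_utf8(&prefix[..len]) {
            if let Some(c) = s.chars().next() {
                return Some((c, len));
            }
        }
    }
    None
}

/// Decode the last char of `bytes` and its encoded length, if any.
fn decode_last(bytes: &[u8]) -> Option<(char, usize)> {
    for len in 1..=bytes.len().min(4) {
        let start = bytes.len() - len;
        if let Ok(s) = str::from_utf8(&bytes[start..]) {
            if let Some(c) = s.chars().next() {
                return Some((c, len));
            }
        }
    }
    None
}

fn main() {
    assert_eq!(decode_first("☃ abc".as_bytes()), Some(('☃', 3)));
    assert_eq!(decode_last("abc☃".as_bytes()), Some(('☃', 3)));
    assert_eq!(decode_first(&[0xFF]), None); // invalid UTF-8 yields no char
}
```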
+ pub fn is_word_byte(self) -> bool { + match char::from_u32(self.0) { + None => false, + Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8), + Some(_) => false, + } + } + + /// Converts the character to a real primitive `char`. + /// + /// If the character is absent, then `None` is returned. + pub fn as_char(self) -> Option { + // This is only used in the `regex!` macro because it expands char + // classes into `match` expressions (instead of binary search). + char::from_u32(self.0) + } +} + +impl From for Char { + fn from(c: char) -> Char { Char(c as u32) } +} + +impl From> for Char { + fn from(c: Option) -> Char { + c.map_or(Char(u32::MAX), |c| c.into()) + } +} + +impl PartialEq for Char { + #[inline] + fn eq(&self, other: &char) -> bool { self.0 == *other as u32 } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &Char) -> bool { *self as u32 == other.0 } +} + +impl PartialOrd for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option { + (*self as u32).partial_cmp(&other.0) } } diff --git a/src/lib.rs b/src/lib.rs index 86acba164c..0cd810d3b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -290,6 +290,17 @@ //! # } //! ``` //! +//! # Opt out of Unicode support +//! +//! The `bytes` sub-module provides a `Regex` type that can be used to match +//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with +//! all Unicode support disabled (e.g., `.` matches any byte instead of any +//! Unicode codepoint). Unicode support can be selectively enabled with the +//! `u` flag. See the `bytes` module documentation for more details. +//! +//! Note that Unicode support *cannot* be selectively disabled on the main +//! `Regex` type that matches on `&str`. +//! //! # Syntax //! //! The syntax supported in this crate is almost in an exact correspondence @@ -466,9 +477,7 @@ //! allowed to store a fixed number of states. (When the limit is reached, its //! states are wiped and continues on, possibly duplicating previous work.) -#![allow(dead_code, unused_imports, unused_variables)] - -// #![deny(missing_docs)] +#![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", @@ -477,30 +486,123 @@ extern crate aho_corasick; extern crate memchr; +#[cfg(test)] extern crate quickcheck; extern crate regex_syntax as syntax; extern crate utf8_ranges; -// The re module is essentially our public interface. -pub use re::{ - Regex, Error, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, +pub use error::Error; +pub use set::{RegexSet, SetMatches, SetMatchesIntoIter, SetMatchesIter}; +pub use re_unicode::{ + Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, CaptureNames, FindCaptures, FindMatches, Replacer, NoExpand, RegexSplits, RegexSplitsN, quote, is_match, }; -pub use set::{RegexSet, SetMatches, SetMatchesIntoIter, SetMatchesIter}; + +/** +Match regular expressions on arbitrary bytes. + +This module provides a nearly identical API to the one found in the +top-level of this crate. There are two important differences: + +1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec` +is used where `String` would have been used. +2. Regular expressions are compiled with Unicode support *disabled* by +default. 
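The `Char` type above packs `Option<char>` into a single `u32`, using `u32::MAX` (not a valid scalar value) as the "absent" sentinel, so the matching engines can compare positions against real characters without `Option` case analysis. A standalone sketch of the same trick under a hypothetical `PackedChar` name:

```rust
use std::char;

#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct PackedChar(u32);

impl PackedChar {
    // u32::MAX can never be a Unicode scalar value, so it marks "no char"
    // (e.g. the position before the start or after the end of the text).
    const NONE: PackedChar = PackedChar(u32::MAX);

    fn from_opt(c: Option<char>) -> PackedChar {
        c.map_or(PackedChar::NONE, |c| PackedChar(c as u32))
    }

    fn is_none(self) -> bool {
        self.0 == u32::MAX
    }

    fn as_char(self) -> Option<char> {
        char::from_u32(self.0)
    }
}

fn main() {
    let a = PackedChar::from_opt(Some('a'));
    let none = PackedChar::from_opt(None);
    assert!(!a.is_none());
    assert!(none.is_none());
    assert_eq!(a.as_char(), Some('a'));
    assert_eq!(none.as_char(), None); // the sentinel decodes to no char
    // Ordering works directly on the packed representation.
    assert!(PackedChar::from_opt(Some('a')) < PackedChar::from_opt(Some('b')));
}
```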
This means that while Unicode regular expressions can only match valid +UTF-8, regular expressions in this module can match arbitrary bytes. Unicode +support can be selectively enabled via the `u` flag in regular expressions +provided by this sub-module. + +# Example: match null terminated string + +This shows how to find all null-terminated strings in a slice of bytes: + +```rust +# use regex::bytes::Regex; +let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +# Example: selectively enable Unicode support + +This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded +string (e.g., to extract a title from a Matroska file): + +```rust +# use std::str; +# use regex::bytes::Regex; +let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap(); +let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; +let caps = re.captures(text).unwrap(); + +// Notice that despite the `.*` at the end, it will only match valid UTF-8 +// because Unicode mode was enabled with the `u` flag. Without the `u` flag, +// the `.*` would match the rest of the bytes. +assert_eq!((7, 10), caps.pos(1).unwrap()); + +// If there was a match, Unicode mode guarantees that `title` is valid UTF-8. +let title = str::from_utf8(caps.at(1).unwrap()).unwrap(); +assert_eq!("☃", title); +``` + +In general, if the Unicode flag is enabled in a capture group and that capture +is part of the overall match, then the capture is *guaranteed* to be valid +UTF-8. + +# Syntax + +The supported syntax is pretty much the same as the syntax for Unicode +regular expressions with a few changes that make sense for matching arbitrary +bytes: + +1. A new flag, `u`, is available for switching to Unicode mode. +2. By default, `u` is disabled, which roughly corresponds to "ASCII compatible" +mode. +3. In ASCII compatible mode, neither Unicode codepoints nor Unicode character +classes are allowed. +4. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) +revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps +to `[[:digit:]]` and `\s` maps to `[[:space:]]`. +5. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to +determine whether a byte is a word byte or not. +6. Hexadecimal notation can be used to specify arbitrary bytes instead of +Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the +literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that +matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation. +7. `.` matches any *byte* except for `\n` instead of any codepoint. When the +`s` flag is enabled, `.` matches any byte. + +# Performance + +In general, one should expect performance on `&[u8]` to be roughly similar to +performance on `&str`. +*/ +pub mod bytes { + pub use re_bytes::*; +} mod backtrack; -mod char; -mod char_utf8; +mod utf8; mod compile; mod dfa; +mod error; mod exec; +mod expand; mod input; mod literals; mod nfa; mod pool; mod prog; -mod re; +mod re_bytes; +mod re_unicode; mod set; mod sparse; @@ -508,11 +610,10 @@ mod sparse; /// suspicious activity, such as testing different matching engines. 
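Point 6 of the syntax list above is worth a concrete illustration. Assuming the `bytes::Regex` API introduced in this diff, `\xFF` denotes the raw byte `0xFF` in the default ASCII-compatible mode, but the codepoint U+00FF (UTF-8 encoded as `C3 BF`) once the `u` flag is enabled:

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // Default (Unicode disabled): `\xFF` is the single raw byte 0xFF.
    let ascii_mode = Regex::new(r"\xFF").unwrap();
    assert!(ascii_mode.is_match(b"\xFF"));
    assert!(!ascii_mode.is_match(b"\xC3\xBF"));

    // With the `u` flag: `\xFF` is U+00FF, i.e. the byte sequence C3 BF.
    let unicode_mode = Regex::new(r"(?u)\xFF").unwrap();
    assert!(unicode_mode.is_match(b"\xC3\xBF"));
    // Unicode mode can only match valid UTF-8, so the lone 0xFF byte fails.
    assert!(!unicode_mode.is_match(b"\xFF"));
}
```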
#[doc(hidden)] pub mod internal { - pub use char::Char; pub use compile::Compiler; pub use exec::{Exec, ExecBuilder}; - pub use input::{Input, CharInput, InputAt}; - pub use literals::{BuildPrefixes, Literals}; + pub use input::{Char, Input, CharInput, InputAt}; + pub use literals::Literals; pub use prog::{Program, Inst, EmptyLook, InstRanges}; - pub use re::ExNative; + pub use re_unicode::{_Regex, ExNative}; } diff --git a/src/literals.rs b/src/literals.rs index cd3d6d7ffd..5dbd6ef1bc 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -16,7 +16,7 @@ use std::mem; use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; use memchr::{memchr, memchr2, memchr3}; -use char_utf8::encode_utf8; +use utf8::encode_utf8; use prog::{Program, Inst, InstBytes, InstRanges}; #[derive(Clone, Eq, PartialEq)] @@ -199,14 +199,14 @@ impl AlternateLiterals { } } -pub struct BuildPrefixes<'a> { +struct BuildPrefixes<'a> { insts: &'a Program, limit: usize, alts: AlternateLiterals, } impl<'a> BuildPrefixes<'a> { - pub fn new(insts: &'a Program) -> Self { + fn new(insts: &'a Program) -> Self { BuildPrefixes { insts: insts, limit: 250, @@ -214,7 +214,7 @@ impl<'a> BuildPrefixes<'a> { } } - pub fn literals(mut self) -> AlternateLiterals { + fn literals(mut self) -> AlternateLiterals { let mut stack = vec![self.insts.skip(self.insts.start)]; let mut seen = HashSet::new(); while let Some(mut pc) = stack.pop() { @@ -357,7 +357,7 @@ impl<'a> BuildRequiredLiterals<'a> { // Compute roughly how many bytes will be in our literals following // the addition of the given range. If we blow our limit, then we // can't add anything. - let nbytes = (inst.end - inst.start + 1) as usize; + let nbytes = (inst.end as usize) - (inst.start as usize) + 1; let new_byte_count = (self.alts.num_bytes() * nbytes) + (self.alts.literals.len() * nbytes); if new_byte_count > self.limit { @@ -408,57 +408,17 @@ enum LiteralMatcher { AC(FullAcAutomaton>), } -impl LiteralMatcher { - /// Create a new prefix matching machine. - fn new(mut alts: AlternateLiterals) -> Self { - use self::LiteralMatcher::*; - - if alts.is_empty() { - Empty - } else if alts.distinct_single_bytes() >= 26 { - // Why do we do this? Well, it's a heuristic to prevent thrashing. - // Basically, if our literal matcher has lots of literals that are - // a single byte, then we lose a lot of the benefits of fast - // literal searching. In particular, single bytes have a high - // probability of matching. In a regex that rarely matches, we end - // up ping-ponging between the literal matcher and the regex engine - // for every byte of input. That's bad juju. - // - // Note that we only count distinct starting bytes from literals of - // length 1. For literals longer than that, we assume they have - // a lower probability of matching. - // - // This particular heuristic would be triggered on, e.g., - // `[a-z].+`. The prefix here is a single byte that is very likely - // to match on any given byte in the input, so it's quicker just - // to let the matching engine process it. - // - // TODO(burntsushi): Consider lowering the threshold! 
- Empty - } else if alts.is_single_byte() { - Byte(alts.literals[0][0]) - } else if alts.all_single_bytes() { - let mut set = vec![false; 256]; - let mut bytes = vec![]; - for lit in alts.literals { - bytes.push(lit[0]); - set[lit[0] as usize] = true; - } - Bytes { chars: bytes, sparse: set } - } else if alts.is_one_literal() { - Single(SingleSearch::new(alts.literals.pop().unwrap())) - } else { - AC(AcAutomaton::new(alts.literals).into_full()) - } - } -} - impl Literals { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { Literals { at_match: false, matcher: LiteralMatcher::Empty } } + /// Returns a matcher for literal prefixes in the given program. + pub fn prefixes(prog: &Program) -> Self { + BuildPrefixes::new(prog).literals().into_matcher() + } + /// Returns true if and only if a literal match corresponds to a match /// in the regex from which the literal was extracted. pub fn at_match(&self) -> bool { @@ -530,19 +490,19 @@ impl Literals { } } - /// Returns all of the prefixes participating in this machine. + /// Returns all of the literal participating in this machine. /// /// For debug/testing only! (It allocates.) #[allow(dead_code)] - fn prefixes(&self) -> Vec { - self.byte_prefixes() + fn strings(&self) -> Vec { + self.byte_strings() .into_iter() .map(|p| String::from_utf8(p).unwrap()) .collect() } #[allow(dead_code)] - fn byte_prefixes(&self) -> Vec> { + fn byte_strings(&self) -> Vec> { use self::LiteralMatcher::*; match self.matcher { Empty => vec![], @@ -556,6 +516,50 @@ impl Literals { } } +impl LiteralMatcher { + fn new(mut alts: AlternateLiterals) -> Self { + use self::LiteralMatcher::*; + + if alts.is_empty() { + Empty + } else if alts.distinct_single_bytes() >= 26 { + // Why do we do this? Well, it's a heuristic to prevent thrashing. + // Basically, if our literal matcher has lots of literals that are + // a single byte, then we lose a lot of the benefits of fast + // literal searching. In particular, single bytes have a high + // probability of matching. In a regex that rarely matches, we end + // up ping-ponging between the literal matcher and the regex engine + // for every byte of input. That's bad juju. + // + // Note that we only count distinct starting bytes from literals of + // length 1. For literals longer than that, we assume they have + // a lower probability of matching. + // + // This particular heuristic would be triggered on, e.g., + // `[a-z].+`. The prefix here is a single byte that is very likely + // to match on any given byte in the input, so it's quicker just + // to let the matching engine process it. + // + // TODO(burntsushi): Consider lowering the threshold! + Empty + } else if alts.is_single_byte() { + Byte(alts.literals[0][0]) + } else if alts.all_single_bytes() { + let mut set = vec![false; 256]; + let mut bytes = vec![]; + for lit in alts.literals { + bytes.push(lit[0]); + set[lit[0] as usize] = true; + } + Bytes { chars: bytes, sparse: set } + } else if alts.is_one_literal() { + Single(SingleSearch::new(alts.literals.pop().unwrap())) + } else { + AC(AcAutomaton::new(alts.literals).into_full()) + } + } +} + /// Provides an implementation of fast subtring search. /// /// In particular, this uses Boyer-Moore-Horspool with Tim Raita's twist: @@ -679,7 +683,7 @@ mod tests { let p = prog!($re); let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); assert!(!prefixes.at_match()); - prefixes.prefixes() + prefixes.strings() }} } macro_rules! 
prefixes_complete { @@ -687,7 +691,7 @@ mod tests { let p = prog!($re); let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); assert!(prefixes.at_match()); - prefixes.prefixes() + prefixes.strings() }} } diff --git a/src/nfa.rs b/src/nfa.rs index b78b768d5d..508985d1b9 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -174,14 +174,14 @@ impl<'r, I: Input> Nfa<'r, I> { ); if step { if !matched { - matched = search.all_matched(); + matched = search.matched_all(); } if search.quit_after_first_match() { // If we only care if a match occurs (not its // position), then we can quit right now. break 'LOOP; } - if search.matches.len() <= 1 { + if !search.find_many_matches() { // We don't need to check the rest of the threads // in this set because we've matched something // ("leftmost-first"). However, we still need to check diff --git a/src/prog.rs b/src/prog.rs index 99ae2e420d..f58acc417b 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -7,8 +7,8 @@ use std::slice; use std::sync::Arc; use backtrack::BacktrackCache; -use char::Char; use dfa::DfaCache; +use input::Char; use literals::Literals; use nfa::NfaCache; use pool::{Pool, PoolGuard}; @@ -20,18 +20,45 @@ pub type InstPtr = usize; /// instructions. #[derive(Clone)] pub struct Program { + /// A sequence of instructions that represents an NFA. pub insts: Vec, + /// Pointers to each Match instruction in the sequence. + /// + /// This is always length 1 unless this program represents a regex set. pub matches: Vec, + /// The ordered sequence of all capture groups extracted from the AST. + /// Unnamed groups are `None`. pub captures: Vec>, + /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc>, + /// A pointer to the start instruction. This can vary depending on how + /// the program was compiled. For example, programs for use with the DFA + /// engine have a `.*?` inserted at the beginning of unanchored regular + /// expressions. The actual starting point of the program is after the + /// `.*?`. pub start: InstPtr, + /// A set of equivalence classes for discriminating bytes in the compiled + /// program. pub byte_classes: Vec, + /// When true, this program can only match valid UTF-8. + pub only_utf8: bool, + /// When true, this program uses byte range instructions instead of Unicode + /// range instructions. pub is_bytes: bool, + /// When true, the program is compiled for DFA matching. For example, this + /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored + /// regexes. pub is_dfa: bool, + /// When true, the program matches text in reverse (for use only in the + /// DFA). pub is_reverse: bool, + /// Whether the regex must match from the start of the input. pub is_anchored_start: bool, + /// Whether the regex must match at the end of the input. pub is_anchored_end: bool, + /// A possibly empty machine for very quickly matching prefix literals. pub prefixes: Literals, + /// Caches for use by the matching engines. pub cache: EngineCache, } @@ -46,6 +73,7 @@ impl Program { capture_name_idx: Arc::new(HashMap::new()), start: 0, byte_classes: vec![], + only_utf8: true, is_bytes: false, is_dfa: false, is_reverse: false, @@ -88,6 +116,19 @@ impl Program { self.is_dfa && !self.is_reverse && !self.is_anchored_start } + /// Returns true if this program uses Byte instructions instead of + /// Char/Range instructions. + pub fn uses_bytes(&self) -> bool { + self.is_bytes || self.is_dfa || !self.only_utf8 + } + + /// Returns true if this program exclusively matches valid UTF-8 bytes. 
+ /// + /// That is, if an invalid UTF-8 byte is seen, then no match is possible. + pub fn only_utf8(&self) -> bool { + self.only_utf8 + } + /// Retrieve cached state for NFA execution. pub fn cache_nfa(&self) -> PoolGuard> { self.cache.nfa.get() @@ -321,6 +362,10 @@ pub enum EmptyLook { WordBoundary, /// Word character on both sides or non-word character on both sides. NotWordBoundary, + /// ASCII word boundary. + WordBoundaryAscii, + /// Not ASCII word boundary. + NotWordBoundaryAscii, } impl InstEmptyLook { @@ -333,10 +378,21 @@ impl InstEmptyLook { EndLine => c2.is_none() || c2 == '\n', StartText => c1.is_none(), EndText => c2.is_none(), - ref wbty => { + WordBoundary => { + let (w1, w2) = (c1.is_word_char(), c2.is_word_char()); + w1 ^ w2 + } + NotWordBoundary => { let (w1, w2) = (c1.is_word_char(), c2.is_word_char()); - (*wbty == WordBoundary && w1 ^ w2) - || (*wbty == NotWordBoundary && !(w1 ^ w2)) + !(w1 ^ w2) + } + WordBoundaryAscii => { + let (w1, w2) = (c1.is_word_byte(), c2.is_word_byte()); + w1 ^ w2 + } + NotWordBoundaryAscii => { + let (w1, w2) = (c1.is_word_byte(), c2.is_word_byte()); + !(w1 ^ w2) } } } diff --git a/src/re_bytes.rs b/src/re_bytes.rs new file mode 100644 index 0000000000..427cf77011 --- /dev/null +++ b/src/re_bytes.rs @@ -0,0 +1,963 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::borrow::Cow; +use std::collections::HashMap; +use std::collections::hash_map; +use std::fmt; +use std::ops::Index; +use std::str::FromStr; +use std::sync::Arc; + +use memchr::memchr; + +use exec::{Exec, ExecBuilder, Search, CaptureSlots}; +use expand::expand; +use error::Error; + +pub use set::RegexSetBytes as RegexSet; +pub use set::SetMatchesBytes as SetMatches; +pub use set::SetMatchesIterBytes as SetMatchesIterBytes; +pub use set::SetMatchesIntoIterBytes as SetMatchesIntoIterBytes; + +/// A compiled regular expression for matching arbitrary bytes. +/// +/// It can be used to search, split or replace text. All searching is done with +/// an implicit `.*?` at the beginning and end of an expression. To force an +/// expression to match the whole string (or a prefix or a suffix), you must +/// use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// Like the `Regex` type in the parent module, matches with this regex return +/// byte offsets into the search text. **Unlike** the parent `Regex` type, +/// these byte offsets may not correspond to UTF-8 sequence boundaries since +/// the regexes in this module can match arbitrary bytes. +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// A constructor for Regex from an Exec. +/// +/// This is hidden because Exec isn't actually part of the public API. 
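The rewritten `matches` above reduces every word-boundary assertion to the same test: the two sides of the position must disagree about being word characters, with an absent character counting as non-word, and `\B` is simply the negation. A simplified, ASCII-only sketch of that check; `is_word` stands in for the crate's `is_word_char`/`is_word_byte` helpers.

```rust
fn is_word(c: Option<char>) -> bool {
    c.map_or(false, |c| c.is_alphanumeric() || c == '_')
}

fn is_word_boundary(before: Option<char>, after: Option<char>) -> bool {
    // A boundary exists exactly when the two sides disagree.
    is_word(before) ^ is_word(after)
}

fn main() {
    // In "ab": a boundary before 'a' (start of text) and after 'b' (end).
    assert!(is_word_boundary(None, Some('a')));
    assert!(is_word_boundary(Some('b'), None));
    // No boundary between two word characters or two non-word positions;
    // `\B` (not-word-boundary) is just the negation of this test.
    assert!(!is_word_boundary(Some('a'), Some('b')));
    assert!(!is_word_boundary(Some(' '), None));
}
```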
+#[doc(hidden)] +impl From for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result { + Regex::new(s) + } +} + +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result { + Regex::with_size_limit(10 * (1 << 20), re) + } + + /// Compiles a regular expression with the given size limit. + /// + /// The size limit is applied to the size of the *compiled* data structure. + /// If the data structure exceeds the size given, then an error is + /// returned. + pub fn with_size_limit(size: usize, re: &str) -> Result { + ExecBuilder::new(re) + .only_utf8(false) + .size_limit(size) + .build() + .map(Regex) + } + + /// Returns true if and only if the regex matches the string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 ASCII word + /// bytes: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &[u8]) -> bool { + self.exec(&mut [], text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// ASCII word bytes: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); + /// assert_eq!(pos, Some((2, 15))); + /// # } + /// ``` + pub fn find(&self, text: &[u8]) -> Option<(usize, usize)> { + let mut caps = [None, None]; + if !self.exec(&mut caps, text, 0) { + None + } else { + Some((caps[0].unwrap(), caps[1].unwrap())) + } + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 ASCII + /// word bytes: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let text = b"Retroactively relinquishing remunerations is reprehensible."; + /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", pos); + /// } + /// // Output: + /// // (0, 13) + /// // (14, 27) + /// // (28, 41) + /// // (45, 58) + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindMatches<'r, 't> { + FindMatches { + re: self, + text: text, + last_end: 0, + last_match: None, + } + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. 
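`with_size_limit` above bounds the size of the *compiled* program rather than the pattern text. A hedged usage sketch against the API introduced in this diff; the particular pattern and the 10-byte budget are only meant to illustrate a program that comfortably exceeds a tiny limit while fitting the 10MB default.

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // The default constructor is equivalent to a 10MB limit.
    assert!(Regex::new(r"\w{100}").is_ok());

    // A 10-byte limit is far smaller than the compiled program for the same
    // pattern, so compilation fails (with `Error::CompiledTooBig`).
    assert!(Regex::with_size_limit(10, r"\w{100}").is_err());
}
```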
If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to submatches. + /// Otherwise, `find` is faster for discovering the location of the overall + /// match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.at(1), Some(&b"Citizen Kane"[..])); + /// assert_eq!(caps.at(2), Some(&b"1941"[..])); + /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title"), Some(&b"Citizen Kane"[..])); + /// assert_eq!(caps.name("year"), Some(&b"1941"[..])); + /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `at` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `at(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { + let mut caps = self.alloc_captures(); + if !self.exec(&mut caps, text, 0) { + None + } else { + Some(Captures { + text: text, + caps: caps, + named_groups: self.0.capture_name_idx().clone(), + }) + } + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about submatches. 
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # extern crate regex; use std::str; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// let title = str::from_utf8(&caps["title"]).unwrap(); + /// let year = str::from_utf8(&caps["year"]).unwrap(); + /// println!("Movie: {:?}, Released: {:?}", title, year); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t [u8], + ) -> FindCaptures<'r, 't> { + FindCaptures { + re: self, + text: text, + last_end: 0, + last_match: None, + } + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Splits<'r, 't> { + Splits { + finder: self.find_iter(text), + last: 0, + } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t [u8], + limit: usize, + ) -> SplitsN<'r, 't> { + SplitsN { + splits: self.split(text), + n: limit, + } + } + + /// Replaces the leftmost-first match with the replacement provided. The + /// replacement can be a regular byte string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced byte string. + /// + /// If no match is found, then a copy of the byte string is returned + /// unchanged. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. 
+ /// In typical usage, this can just be a normal byte string: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace(b"1078910", &b""[..]), b"1010"); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, a + /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the + /// captures corresponding to a match. This allows one to access submatches + /// easily: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # use regex::bytes::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { + /// let mut replacement = caps[2].to_owned(); + /// replacement.push(b' '); + /// replacement.extend(&caps[1]); + /// replacement + /// }); + /// assert_eq!(result, b"Bruce Springsteen"); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); + /// assert_eq!(result, b"Bruce Springsteen"); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// If `$name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// submatch expansion. This can be done by wrapping a byte string with + /// `NoExpand`: + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// use regex::bytes::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); + /// assert_eq!(result, b"$2 $last"); + /// # } + /// ``` + pub fn replace<R: Replacer>(&self, text: &[u8], rep: R) -> Vec<u8> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// submatches in the replacement text. + pub fn replace_all<R: Replacer>(&self, text: &[u8], rep: R) -> Vec<u8> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// submatches in the replacement text. 
+ pub fn replacen<R: Replacer>( + &self, + text: &[u8], + limit: usize, + mut rep: R, + ) -> Vec<u8> { + if let Some(rep) = rep.no_expansion() { + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, (s, e)) in self.find_iter(text).enumerate() { + if limit > 0 && i >= limit { + break + } + new.extend(&text[last_match..s]); + new.extend(&*rep); + last_match = e; + } + new.extend(&text[last_match..]); + return new; + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in self.captures_iter(text).enumerate() { + if limit > 0 && i >= limit { + break + } + // unwrap on 0 is OK because captures only reports matches + let (s, e) = cap.pos(0).unwrap(); + new.extend(&text[last_match..s]); + rep.replace_append(&cap, &mut new); + last_match = e; + } + new.extend(&text[last_match..]); + new + } + + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames { + CaptureNames(self.0.captures().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.captures().len() + } + + fn exec(&self, caps: CaptureSlots, text: &[u8], start: usize) -> bool { + let mut _matches = [false]; + let mut search = Search::new(caps, &mut _matches); + self.0.exec(&mut search, text, start) + } + + fn alloc_captures(&self) -> Vec<Option<usize>> { + vec![None; 2 * self.0.captures().len()] + } +} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a tuple of integers corresponding to the start and end +/// of the match. The indices are byte offsets. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +pub struct FindMatches<'r, 't> { + re: &'r Regex, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 't> Iterator for FindMatches<'r, 't> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.last_end > self.text.len() { + return None + } + + let mut caps = [None, None]; + if !self.re.exec(&mut caps, self.text, self.last_end) { + return None; + } + let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); + // Don't accept empty matches immediately following a match. + // i.e., no infinite loops please. + if e == s && Some(self.last_end) == self.last_match { + if self.last_end >= self.text.len() { + return None; + } + self.last_end += 1; + return self.next(); + } + self.last_end = e; + self.last_match = Some(self.last_end); + Some((s, e)) + } +} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. 
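`FindMatches::next` above refuses an empty match that starts exactly where the previous match ended, bumping the position one byte forward instead; that is the rule that keeps patterns like `a*` from yielding the same empty match forever. A standalone sketch of the same loop, with a toy `a*` matcher (`find_a_star`) standing in for the real engine:

```rust
/// Leftmost match of `a*` at or after `start` (always matches, maybe empty).
fn find_a_star(text: &[u8], start: usize) -> Option<(usize, usize)> {
    if start > text.len() {
        return None;
    }
    let mut end = start;
    while end < text.len() && text[end] == b'a' {
        end += 1;
    }
    Some((start, end))
}

fn find_iter(text: &[u8]) -> Vec<(usize, usize)> {
    let mut out = vec![];
    let mut last_end = 0;
    let mut last_match: Option<usize> = None;
    while last_end <= text.len() {
        let (s, e) = match find_a_star(text, last_end) {
            None => break,
            Some(m) => m,
        };
        if s == e && Some(last_end) == last_match {
            // Empty match immediately after a previous match: advance by one
            // byte instead of reporting the same position again.
            if last_end >= text.len() {
                break;
            }
            last_end += 1;
            continue;
        }
        last_end = e;
        last_match = Some(last_end);
        out.push((s, e));
    }
    out
}

fn main() {
    // The empty match at offset 1 (right after the first "a") is skipped and
    // the final empty match at the end of the text terminates the iteration.
    assert_eq!(find_iter(b"aba"), vec![(0, 1), (2, 3)]);
}
```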
+pub struct FindCaptures<'r, 't> { + re: &'r Regex, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 't> Iterator for FindCaptures<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + if self.last_end > self.text.len() { + return None + } + + let mut caps = self.re.alloc_captures(); + if !self.re.exec(&mut caps, self.text, self.last_end) { + return None; + } + let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); + + // Don't accept empty matches immediately following a match. + // i.e., no infinite loops please. + if e == s && Some(self.last_end) == self.last_match { + if self.last_end >= self.text.len() { + return None; + } + self.last_end += 1; + return self.next(); + } + self.last_end = e; + self.last_match = Some(self.last_end); + Some(Captures { + text: self.text, + caps: caps, + named_groups: self.re.0.capture_name_idx().clone(), + }) + } +} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +pub struct Splits<'r, 't> { + finder: FindMatches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Splits<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + let text = self.finder.text; + match self.finder.next() { + None => { + if self.last >= text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len(); + Some(s) + } + } + Some((s, e)) => { + let matched = &text[self.last..s]; + self.last = e; + Some(matched) + } + } + } +} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +pub struct SplitsN<'r, 't> { + splits: Splits<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitsN<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + if self.n == 0 { + return None + } + self.n -= 1; + if self.n == 0 { + let text = self.splits.finder.text; + Some(&text[self.splits.last..]) + } else { + self.splits.next() + } + } +} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0.next().as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +/// Captures represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `at` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. 
+pub struct Captures<'t> { + text: &'t [u8], + caps: Vec<Option<usize>>, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original byte string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.caps.get(s), self.caps.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Returns the matched string for the capture group `i`. If `i` isn't + /// a valid capture group or didn't match anything, then `None` is + /// returned. + pub fn at(&self, i: usize) -> Option<&'t [u8]> { + match self.pos(i) { + None => None, + Some((s, e)) => Some(&self.text[s..e]) + } + } + + /// Returns the matched string for the capture group named `name`. If + /// `name` isn't a valid capture group or didn't match anything, then + /// `None` is returned. + pub fn name(&self, name: &str) -> Option<&'t [u8]> { + self.named_groups.get(name).and_then(|&i| self.at(i)) + } + + /// Creates an iterator of all the capture groups in order of appearance + /// in the regular expression. + pub fn iter<'a>(&'a self) -> SubCaptures<'a, 't> { + SubCaptures { idx: 0, caps: self, } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { + SubCapturesPos { idx: 0, caps: &self.caps } + } + + /// Creates an iterator of all named groups as an tuple with the group + /// name and the value. The iterator returns these values in arbitrary + /// order. + pub fn iter_named<'a>(&'a self) -> SubCapturesNamed<'a, 't> { + SubCapturesNamed { + caps: self, + names: self.named_groups.iter() + } + } + + /// Expands all instances of `$name` in `text` to the corresponding capture + /// group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { + expand(self, replacement, dst) + } + + /// Returns the number of captured groups. + #[inline] + pub fn len(&self) -> usize { + self.caps.len() / 2 + } + + /// Returns true if and only if there are no captured groups. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `at()` instead. 
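A short usage sketch of the byte-oriented `Captures` API defined above (`pos`, `at`, `name`, and `expand`), written against the method signatures introduced in this diff; the pattern and input are arbitrary examples.

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    let re = Regex::new(r"(?P<key>[a-z]+)=(?P<val>[0-9]+)").unwrap();
    let caps = re.captures(b"retries=5").unwrap();

    // Group 0 is the whole match; positions are byte offsets.
    assert_eq!(caps.pos(0), Some((0, 9)));
    assert_eq!(caps.name("key"), Some(&b"retries"[..]));
    assert_eq!(caps.at(2), Some(&b"5"[..]));

    // `expand` writes the replacement with `$name` references substituted.
    let mut dst = Vec::new();
    caps.expand(b"$val <- $key", &mut dst);
    assert_eq!(dst, &b"5 <- retries"[..]);
}
```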
+/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = [u8]; + + fn index(&self, i: usize) -> &[u8] { + self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = [u8]; + + fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { + self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator over capture groups for a particular match of a regular +/// expression. +/// +/// `'c` is the lifetime of the captures and `'t` is the lifetime of the +/// matched text. +pub struct SubCaptures<'c, 't: 'c> { + idx: usize, + caps: &'c Captures<'t>, +} + +impl<'c, 't> Iterator for SubCaptures<'c, 't> { + type Item = Option<&'t [u8]>; + + fn next(&mut self) -> Option<Option<&'t [u8]>> { + if self.idx < self.caps.len() { + self.idx += 1; + Some(self.caps.at(self.idx - 1)) + } else { + None + } + } +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original byte string matched. +/// +/// `'c` is the lifetime of the captures. +pub struct SubCapturesPos<'c> { + idx: usize, + caps: &'c [Option<usize>] +} + +impl<'c> Iterator for SubCapturesPos<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option<Option<(usize, usize)>> { + if self.idx >= self.caps.len() { + return None + } + let r = match (self.caps[self.idx], self.caps[self.idx + 1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + }; + self.idx += 2; + Some(r) + } +} + +/// An Iterator over named capture groups as a tuple with the group name and +/// the value. +/// +/// `'c` is the lifetime of the captures and `'t` is the lifetime of the +/// matched text. +pub struct SubCapturesNamed<'c, 't: 'c> { + caps: &'c Captures<'t>, + names: hash_map::Iter<'c, String, usize>, +} + +impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { + type Item = (&'c str, Option<&'t [u8]>); + + fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { + self.names.next().map(|(name, &pos)| (&**name, self.caps.at(pos))) + } +} + +/// Replacer describes types that can be used to replace matches in a byte +/// string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` and +/// `FnMut(&Captures) -> Vec<u8>`, which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(caps.at(0).unwrap())`. + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. 
+ /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + None + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + match memchr(b'$', *self) { + Some(_) => None, + None => Some(Cow::Borrowed(*self)), + } + } +} + +impl<F> Replacer for F where F: FnMut(&Captures) -> Vec<u8> { + fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) { + dst.extend((*self)(caps)) + } +} + +/// NoExpand indicates literal byte string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal byte string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +pub struct NoExpand<'r>(pub &'r [u8]); + +impl<'a> Replacer for NoExpand<'a> { + fn replace_append(&mut self, _: &Captures, dst: &mut Vec<u8>) { + dst.extend(self.0) + } + + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/src/re.rs b/src/re_unicode.rs similarity index 81% rename from src/re.rs rename to src/re_unicode.rs index 1f3d25c1c0..40003618f2 100644 --- a/src/re.rs +++ b/src/re_unicode.rs @@ -17,18 +17,10 @@ use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; use std::sync::Arc; -use exec::{CaptureSlots, Exec, ExecBuilder, Search}; use syntax; -const REPLACE_EXPAND: &'static str = r"(?x) - (?P<before>^|\b|[^$]) # Ignore `$$name`. - \$ - (?P<name> # Match the actual capture name. Can be... - [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) -"; +use exec::{CaptureSlots, Exec, ExecBuilder, Search}; +use error::Error; /// Escapes all regular expression meta characters in `text`. /// @@ -49,68 +41,7 @@ pub fn is_match(regex: &str, text: &str) -> Result<bool, Error> { Regex::new(regex).map(|r| r.is_match(text)) } -/// An error that occurred during parsing or compiling a regular expression. -#[derive(Debug)] -pub enum Error { - /// A syntax error. - Syntax(syntax::Error), - /// The compiled program exceeded the set size limit. - /// The argument is the size limit imposed. - CompiledTooBig(usize), - /// An invalid set is a regex set with fewer than 2 regular expressions. - InvalidSet, - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) 
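Besides `&[u8]` and closures, the `Replacer` trait above can be implemented for custom types. A sketch of one such implementation: a replacer that blanks out each match, relying on the default `no_expansion` (which returns `None` and therefore keeps the capture-aware path). The `Redact` type is a made-up example, not part of the crate.

```rust
extern crate regex;

use regex::bytes::{Captures, Regex, Replacer};

struct Redact;

impl Replacer for Redact {
    fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
        // Group 0 is always present for a reported match; replace it with an
        // equally long run of '*' bytes.
        let len = caps.at(0).map_or(0, |m| m.len());
        dst.extend(vec![b'*'; len]);
    }
}

fn main() {
    let re = Regex::new(r"[0-9]{4}").unwrap();
    let out = re.replace_all(b"pin 1234, backup 9876", Redact);
    assert_eq!(out, &b"pin ****, backup ****"[..]);
}
```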
- #[doc(hidden)] - __Nonexhaustive, -} - -impl ::std::error::Error for Error { - fn description(&self) -> &str { - match *self { - Error::Syntax(ref err) => err.description(), - Error::CompiledTooBig(_) => "compiled program too big", - Error::InvalidSet => { - "sets must contain 2 or more regular expressions" - } - Error::__Nonexhaustive => unreachable!(), - } - } - - fn cause(&self) -> Option<&::std::error::Error> { - match *self { - Error::Syntax(ref err) => Some(err), - _ => None, - } - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Error::Syntax(ref err) => err.fmt(f), - Error::CompiledTooBig(limit) => { - write!(f, "Compiled regex exceeds size limit of {} bytes.", - limit) - } - Error::InvalidSet => { - write!(f, "Sets must contain 2 or more regular expressions.") - } - Error::__Nonexhaustive => unreachable!(), - } - } -} - -impl From<syntax::Error> for Error { - fn from(err: syntax::Error) -> Error { - Error::Syntax(err) - } -} - -/// A compiled regular expression +/// A compiled regular expression for matching Unicode strings. /// /// It is represented as either a sequence of bytecode instructions (dynamic) /// or as a specialized Rust function (native). It can be used to search, split @@ -165,7 +96,11 @@ impl From<syntax::Error> for Error { /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); /// ``` #[derive(Clone)] -pub enum Regex { +pub struct Regex(#[doc(hidden)] pub _Regex); + +#[derive(Clone)] +#[doc(hidden)] +pub enum _Regex { // The representation of `Regex` is exported to support the `regex!` // syntax extension. Do not rely on it. // @@ -211,6 +146,13 @@ impl fmt::Debug for Regex { } } +#[doc(hidden)] +impl From<Exec> for Regex { + fn from(exec: Exec) -> Regex { + Regex(_Regex::Dynamic(exec)) + } +} + /// Equality comparison is based on the original string. It is possible that /// different regular expressions have the same matching behavior, but are /// still compared unequal. For example, `\d+` and `\d\d*` match the same set @@ -233,32 +175,33 @@ impl FromStr for Regex { } impl Regex { - /// Compiles a dynamic regular expression. Once compiled, it can be - /// used repeatedly to search, split or replace text in a string. + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result<Regex, Error> { Regex::with_size_limit(10 * (1 << 20), re) } - /// Compiles a dynamic regular expression with the given size limit. + /// Compiles a regular expression with the given size limit. /// /// The size limit is applied to the size of the *compiled* data structure. /// If the data structure exceeds the size given, then an error is /// returned. - /// - /// The default size limit used in `new` is 10MB. pub fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> { - ExecBuilder::new(re).size_limit(size).build().map(Regex::Dynamic) + ExecBuilder::new(re).size_limit(size).build().map(Regex::from) } - /// Returns true if and only if the regex matches the string given. /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. 
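The `Error` enum removed here (and relocated to the new `error` module) keeps the same variants, so downstream code matches on it as before; the hidden variant forces a catch-all arm. A small handling sketch against the re-exported `regex::Error`:

```rust
extern crate regex;

use regex::{Error, Regex};

fn describe(pattern: &str) -> String {
    match Regex::new(pattern) {
        Ok(_) => "compiled fine".to_string(),
        Err(Error::Syntax(err)) => format!("syntax error: {}", err),
        Err(Error::CompiledTooBig(limit)) => {
            format!("program larger than {} bytes", limit)
        }
        // `InvalidSet` and the hidden variant require a catch-all arm.
        Err(_) => "some other error".to_string(),
    }
}

fn main() {
    assert_eq!(describe(r"\w+"), "compiled fine");
    assert!(describe(r"(unclosed").starts_with("syntax error"));
}
```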
+ /// /// # Example /// /// Test if some text contains at least one word with exactly 13 - /// characters: + /// Unicode word characters: /// /// ```rust /// # extern crate regex; use regex::Regex; @@ -268,7 +211,7 @@ impl Regex { /// # } /// ``` pub fn is_match(&self, text: &str) -> bool { - exec(self, &mut [], text, 0) + self.exec(&mut [], text, 0) } /// Returns the start and end byte range of the leftmost-first match in @@ -281,7 +224,7 @@ impl Regex { /// # Example /// /// Find the start and end location of the first word with exactly 13 - /// characters: + /// Unicode word characters: /// /// ```rust /// # extern crate regex; use regex::Regex; @@ -293,10 +236,10 @@ impl Regex { /// ``` pub fn find(&self, text: &str) -> Option<(usize, usize)> { let mut caps = [None, None]; - if exec(self, &mut caps, text, 0) { - Some((caps[0].unwrap(), caps[1].unwrap())) - } else { + if !self.exec(&mut caps, text, 0) { None + } else { + Some((caps[0].unwrap(), caps[1].unwrap())) } } @@ -306,8 +249,8 @@ impl Regex { /// /// # Example /// - /// Find the start and end location of every word with exactly 13 - /// characters: + /// Find the start and end location of every word with exactly 13 Unicode + /// word characters: /// /// ```rust /// # extern crate regex; use regex::Regex; @@ -326,7 +269,7 @@ impl Regex { pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { FindMatches { re: self, - search: text, + text: text, last_end: 0, last_match: None, } @@ -389,27 +332,28 @@ impl Regex { /// ``` /// /// Here we name the capture groups, which we can access with the `name` - /// method or the `Index` notation with a `&str`. Note that the named capture groups - /// are still accessible with `at` or the `Index` notation with a `usize`. + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `at` or the `Index` notation + /// with a `usize`. /// /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { - let mut locs = self.alloc_captures(); - if exec(self, &mut locs, text, 0) { + let mut caps = self.alloc_captures(); + if !self.exec(&mut caps, text, 0) { + None + } else { Some(Captures { text: text, - locs: locs, + caps: caps, named_groups: NamedGroups::from_regex(self) }) - } else { - None } } /// Returns an iterator over all the non-overlapping capture groups matched - /// in `text`. This is operationally the same as `find_iter` (except it - /// yields information about submatches). + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about submatches. 
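The accessors mentioned in the `captures` documentation above (`at`, `name`, the two `Index` impls) compose with `pos` like this; a small sketch, assuming the API exactly as declared in this file:

extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    let caps = re.captures("on 2016-03-28 at noon").unwrap();

    // Group 0 is the whole match and is always unnamed.
    assert_eq!(caps.at(0), Some("2016-03-28"));
    assert_eq!(caps.pos(0), Some((3, 13)));

    // Named groups are also reachable by their index, and vice versa.
    assert_eq!(caps.name("y"), Some("2016"));
    assert_eq!(caps.at(2), Some("03"));
    assert_eq!(&caps["d"], "28");
}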
/// /// # Example /// @@ -423,7 +367,8 @@ impl Regex { /// .unwrap(); /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for caps in re.captures_iter(text) { - /// println!("Movie: {:?}, Released: {:?}", caps.name("title"), caps.name("year")); + /// println!("Movie: {:?}, Released: {:?}", + /// caps.name("title"), caps.name("year")); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 @@ -431,20 +376,21 @@ impl Regex { /// // Movie: M, Released: 1931 /// # } /// ``` - pub fn captures_iter<'r, 't>(&'r self, text: &'t str) - -> FindCaptures<'r, 't> { + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t str, + ) -> FindCaptures<'r, 't> { FindCaptures { re: self, - search: text, + text: text, last_end: 0, last_match: None, } } - /// Returns an iterator of substrings of `text` delimited by a match - /// of the regular expression. - /// Namely, each element of the iterator corresponds to text that *isn't* - /// matched by the regular expression. + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. /// /// This method will *not* copy the text given. /// @@ -457,7 +403,7 @@ impl Regex { /// # fn main() { /// let re = Regex::new(r"[ \t]+").unwrap(); /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); - /// assert_eq!(fields, vec!("a", "b", "c", "d", "e")); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { @@ -469,11 +415,9 @@ impl Regex { /// Returns an iterator of at most `limit` substrings of `text` delimited /// by a match of the regular expression. (A `limit` of `0` will return no - /// substrings.) - /// Namely, each element of the iterator corresponds to text that *isn't* - /// matched by the regular expression. - /// The remainder of the string that is not split will be the last element - /// in the iterator. + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. /// /// This method will *not* copy the text given. /// @@ -568,9 +512,9 @@ impl Regex { self.replacen(text, 1, rep) } - /// Replaces all non-overlapping matches in `text` with the - /// replacement provided. This is the same as calling `replacen` with - /// `limit` set to `0`. + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. @@ -584,8 +528,12 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. - pub fn replacen<R: Replacer> - (&self, text: &str, limit: usize, mut rep: R) -> String { + pub fn replacen<R: Replacer>( + &self, + text: &str, + limit: usize, + mut rep: R, + ) -> String { // If we know that the replacement doesn't have any capture expansions, // then we can fast path. The fast path can make a tremendous @@ -631,44 +579,60 @@ impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { - match *self { - Regex::Dynamic(ref exec) => &exec.regex_strings()[0], - Regex::Native(ExNative { ref original, .. 
}) => original, + match self.0 { + _Regex::Dynamic(ref exec) => &exec.regex_strings()[0], + _Regex::Native(ExNative { ref original, .. }) => original, } } /// Returns an iterator over the capture names. pub fn capture_names(&self) -> CaptureNames { - match *self { - Regex::Native(ref n) => CaptureNames::Native(n.names.iter()), - Regex::Dynamic(ref d) => { - CaptureNames::Dynamic(d.captures().iter()) + CaptureNames(match self.0 { + _Regex::Native(ref n) => _CaptureNames::Native(n.names.iter()), + _Regex::Dynamic(ref d) => { + _CaptureNames::Dynamic(d.captures().iter()) } - } + }) } /// Returns the number of captures. pub fn captures_len(&self) -> usize { - match *self { - Regex::Native(ref n) => n.names.len(), - Regex::Dynamic(ref d) => d.captures().len() + match self.0 { + _Regex::Native(ref n) => n.names.len(), + _Regex::Dynamic(ref d) => d.captures().len() } } fn alloc_captures(&self) -> Vec<Option<usize>> { - match *self { - Regex::Native(ref n) => vec![None; 2 * n.names.len()], - Regex::Dynamic(ref d) => vec![None; 2 * d.captures().len()], + match self.0 { + _Regex::Native(ref n) => vec![None; 2 * n.names.len()], + _Regex::Dynamic(ref d) => vec![None; 2 * d.captures().len()], + } + } + + fn exec(&self, caps: CaptureSlots, text: &str, start: usize) -> bool { + match self.0 { + _Regex::Native(ExNative { ref prog, .. }) => { + (*prog)(caps, text, start) + } + _Regex::Dynamic(ref exec) => { + let mut _matches = [false]; + let mut search = Search::new(caps, &mut _matches); + exec.exec(&mut search, text.as_bytes(), start) + } } } } -/// Yields the names of all possible captures. -/// `None` indicates an unnamed capture; the first element -/// (capture 0, the whole matched region) is always unnamed. +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. /// -/// `'r` is the lifetime of the compiled expression. -pub enum CaptureNames<'r> { +/// `'r` is the lifetime of the compiled regular expression. +pub struct CaptureNames<'r>(_CaptureNames<'r>); + +enum _CaptureNames<'r> { #[doc(hidden)] Native(::std::slice::Iter<'r, Option<&'static str>>), #[doc(hidden)] @@ -676,21 +640,21 @@ pub enum CaptureNames<'r> { } impl<'r> Iterator for CaptureNames<'r> { - type Item=Option<&'r str>; + type Item = Option<&'r str>; fn next(&mut self) -> Option<Option<&'r str>> { - match *self { - CaptureNames::Native(ref mut i) => - i.next().cloned(), - CaptureNames::Dynamic(ref mut i) => - i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())), + match self.0 { + _CaptureNames::Native(ref mut i) => i.next().cloned(), + _CaptureNames::Dynamic(ref mut i) => { + i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())) + } } } fn size_hint(&self) -> (usize, Option<usize>) { - match *self { - CaptureNames::Native(ref i) => i.size_hint(), - CaptureNames::Dynamic(ref i) => i.size_hint(), + match self.0 { + _CaptureNames::Native(ref i) => i.size_hint(), + _CaptureNames::Dynamic(ref i) => i.size_hint(), } } } @@ -701,7 +665,7 @@ impl<'r> Iterator for CaptureNames<'r> { /// string replacement without expanding `$name` to their corresponding /// capture groups. /// -/// `'r` is the lifetime of the literal text. +/// `'t` is the lifetime of the literal text. pub struct NoExpand<'t>(pub &'t str); /// Replacer describes types that can be used to replace matches in a string. 
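For the introspection methods reworked in this hunk (`as_str`, `capture_names`, `captures_len`), a brief sketch of the expected behaviour, mirroring the `capture_names` test added in `tests/api.rs`:

extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(\d{4})-(?P<month>\d{2})").unwrap();

    // `captures_len` counts the implicit group 0 plus every explicit group.
    assert_eq!(re.captures_len(), 3);

    // `capture_names` yields `None` for unnamed groups (group 0 is always
    // unnamed) and `Some(name)` for named ones, in index order.
    let names: Vec<Option<&str>> = re.capture_names().collect();
    assert_eq!(names, vec![None, None, Some("month")]);

    // The original pattern string is preserved verbatim.
    assert_eq!(re.as_str(), r"(\d{4})-(?P<month>\d{2})");
}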
@@ -749,8 +713,8 @@ impl<F> Replacer for F where F: FnMut(&Captures) -> String { /// Yields all substrings delimited by a regular expression match. /// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the string being split. +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. pub struct RegexSplits<'r, 't> { finder: FindMatches<'r, 't>, last: usize, @@ -760,7 +724,7 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { - let text = self.finder.search; + let text = self.finder.text; match self.finder.next() { None => { if self.last >= text.len() { @@ -784,8 +748,8 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { /// /// The last substring will be whatever remains after splitting. /// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the string being split. +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. pub struct RegexSplitsN<'r, 't> { splits: RegexSplits<'r, 't>, n: usize, @@ -800,7 +764,7 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { } self.n -= 1; if self.n == 0 { - let text = self.splits.finder.search; + let text = self.splits.finder.text; Some(&text[self.splits.last..]) } else { self.splits.next() @@ -815,11 +779,11 @@ enum NamedGroups { impl NamedGroups { fn from_regex(regex: &Regex) -> NamedGroups { - match *regex { - Regex::Native(ExNative { ref groups, .. }) => { + match regex.0 { + _Regex::Native(ExNative { ref groups, .. }) => { NamedGroups::Native(groups) } - Regex::Dynamic(ref exec) => { + _Regex::Dynamic(ref exec) => { NamedGroups::Dynamic(exec.capture_name_idx().clone()) } } @@ -846,7 +810,6 @@ impl NamedGroups { } enum NamedGroupsIter<'n> { - Empty, Native(::std::slice::Iter<'static, (&'static str, usize)>), Dynamic(::std::collections::hash_map::Iter<'n, String, usize>), } @@ -856,12 +819,10 @@ impl<'n> Iterator for NamedGroupsIter<'n> { fn next(&mut self) -> Option<Self::Item> { match *self { - NamedGroupsIter::Empty => - None, - NamedGroupsIter::Native(ref mut it) => - it.next().map(|&v| v), - NamedGroupsIter::Dynamic(ref mut it) => + NamedGroupsIter::Native(ref mut it) => it.next().map(|&v| v), + NamedGroupsIter::Dynamic(ref mut it) => { it.next().map(|(s, i)| (s.as_ref(), *i)) + } } } } @@ -869,33 +830,31 @@ impl<'n> Iterator for NamedGroupsIter<'n> { /// Captures represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. -/// If a capture group is named, then the matched string is *also* available -/// via the `name` method. (Note that the 0th capture is always unnamed and so -/// must be accessed with the `at` method.) +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the `name` +/// method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `at` method.) /// /// Positions returned from a capture group are always byte indices. /// /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - locs: Vec<Option<usize>>, + caps: Vec<Option<usize>>, named_groups: NamedGroups, } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. 
- /// Returns `None` if `i` is not a valid capture group or if the capture - /// group did not match anything. - /// The positions returned are *always* byte indices with respect to the - /// original string matched. + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { let (s, e) = (i * 2, i * 2 + 1); - if e >= self.locs.len() || self.locs[s].is_none() { - // VM guarantees that each pair of locations are both Some or None. - return None + match (self.caps.get(s), self.caps.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, } - Some((self.locs[s].unwrap(), self.locs[e].unwrap())) } /// Returns the matched string for the capture group `i`. If `i` isn't @@ -925,7 +884,7 @@ impl<'t> Captures<'t> { /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { - SubCapturesPos { idx: 0, locs: &self.locs } + SubCapturesPos { idx: 0, caps: &self.caps } } /// Creates an iterator of all named groups as an tuple with the group @@ -951,6 +910,15 @@ impl<'t> Captures<'t> { /// /// To write a literal `$` use `$$`. pub fn expand(&self, text: &str) -> String { + const REPLACE_EXPAND: &'static str = r"(?x) + (?P<before>^|\b|[^$]) # Ignore `$$name`. + \$ + (?P<name> # Match the actual capture name. Can be... + [0-9]+ # A sequence of digits (for indexed captures), or... + | + [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. + ) + "; // How evil can you get? let re = Regex::new(REPLACE_EXPAND).unwrap(); let text = re.replace_all(text, |refs: &Captures| -> String { @@ -967,11 +935,15 @@ impl<'t> Captures<'t> { /// Returns the number of captured groups. #[inline] - pub fn len(&self) -> usize { self.locs.len() / 2 } + pub fn len(&self) -> usize { + self.caps.len() / 2 + } /// Returns true if and only if there are no captured groups. #[inline] - pub fn is_empty(&self) -> bool { self.len() == 0 } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } /// Get a group by index. @@ -983,15 +955,14 @@ impl<'t> Captures<'t> { /// of `a` and can't outlive it); to do that, use `at()` instead. /// /// # Panics +/// /// If there is no group at the given index. impl<'t> Index<usize> for Captures<'t> { - type Output = str; fn index(&self, i: usize) -> &str { self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) } - } /// Get a group by name. @@ -1004,18 +975,14 @@ impl<'t> Index<usize> for Captures<'t> { /// of `a` and can't outlive it); to do that, use `name` instead. /// /// # Panics +/// /// If there is no group named by the given value. impl<'t, 'i> Index<&'i str> for Captures<'t> { - type Output = str; fn index<'a>(&'a self, name: &'i str) -> &'a str { - match self.name(name) { - None => panic!("no group named '{}'", name), - Some(ref s) => s, - } + self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) } - } /// An iterator over capture groups for a particular match of a regular @@ -1048,17 +1015,17 @@ impl<'c> Iterator for SubCaptures<'c> { /// `'c` is the lifetime of the captures. 
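Since `expand` now carries its `REPLACE_EXPAND` grammar inline, here is a short sketch of the substitution rules it implements (named and numbered references, with `$$` as the escape), consistent with the `expand!` tests added later in this patch:

extern crate regex;

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<user>\w+)@(?P<host>[\w.]+)").unwrap();
    let caps = re.captures("send to admin@example.org today").unwrap();

    // `$name` and `$1`-style references are replaced by the corresponding
    // capture text.
    assert_eq!(caps.expand("$user at $host"), "admin at example.org");
    assert_eq!(caps.expand("user: $1, host: $2"),
               "user: admin, host: example.org");

    // `$$` escapes a literal dollar sign.
    assert_eq!(caps.expand("$$user is literal"), "$user is literal");
}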
pub struct SubCapturesPos<'c> { idx: usize, - locs: &'c [Option<usize>] + caps: &'c [Option<usize>] } impl<'c> Iterator for SubCapturesPos<'c> { type Item = Option<(usize, usize)>; fn next(&mut self) -> Option<Option<(usize, usize)>> { - if self.idx >= self.locs.len() { + if self.idx >= self.caps.len() { return None } - let r = match (self.locs[self.idx], self.locs[self.idx + 1]) { + let r = match (self.caps[self.idx], self.caps[self.idx + 1]) { (Some(s), Some(e)) => Some((s, e)), (None, None) => None, _ => unreachable!() @@ -1090,11 +1057,11 @@ impl<'c> Iterator for SubCapturesNamed<'c> { /// /// The iterator stops when no more matches can be found. /// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the matched string. +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. pub struct FindCaptures<'r, 't> { re: &'r Regex, - search: &'t str, + text: &'t str, last_end: usize, last_match: Option<usize>, } @@ -1103,12 +1070,12 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option<Captures<'t>> { - if self.last_end > self.search.len() { + if self.last_end > self.text.len() { return None } let mut caps = self.re.alloc_captures(); - if !exec(self.re, &mut caps, self.search, self.last_end) { + if !self.re.exec(&mut caps, self.text, self.last_end) { return None } let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); @@ -1116,18 +1083,18 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. if e == s && Some(self.last_end) == self.last_match { - if self.last_end >= self.search.len() { + if self.last_end >= self.text.len() { return None; } - self.last_end += self.search[self.last_end..].chars() + self.last_end += self.text[self.last_end..].chars() .next().unwrap().len_utf8(); return self.next(); } self.last_end = e; self.last_match = Some(self.last_end); Some(Captures { - text: self.search, - locs: caps, + text: self.text, + caps: caps, named_groups: NamedGroups::from_regex(self.re), }) } @@ -1139,11 +1106,11 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// of the match. The indices are byte offsets. The iterator stops when no more /// matches can be found. /// -/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime -/// of the matched string. +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. pub struct FindMatches<'r, 't> { re: &'r Regex, - search: &'t str, + text: &'t str, last_end: usize, last_match: Option<usize>, } @@ -1152,12 +1119,12 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { - if self.last_end > self.search.len() { + if self.last_end > self.text.len() { return None } let mut caps = [None, None]; - if !exec(self.re, &mut caps, self.search, self.last_end) { + if !self.re.exec(&mut caps, self.text, self.last_end) { return None; } let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); @@ -1165,10 +1132,10 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. 
if e == s && Some(self.last_end) == self.last_match { - if self.last_end >= self.search.len() { + if self.last_end >= self.text.len() { return None; } - self.last_end += self.search[self.last_end..].chars() + self.last_end += self.text[self.last_end..].chars() .next().unwrap().len_utf8(); return self.next(); } @@ -1202,7 +1169,7 @@ impl<'r, 't> Pattern<'t> for &'r Regex { unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { #[inline] fn haystack(&self) -> &'t str { - self.it.search + self.it.text } #[inline] @@ -1237,19 +1204,6 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { } } -fn exec(re: &Regex, caps: CaptureSlots, text: &str, start: usize) -> bool { - match *re { - Regex::Native(ExNative { ref prog, .. }) => (*prog)(caps, text, start), - Regex::Dynamic(ref prog) => { - let mut search = Search { - captures: caps, - matches: &mut [false], - }; - prog.exec(&mut search, text, start) - } - } -} - #[cfg(test)] mod test { use super::{NoExpand, Regex}; diff --git a/src/set.rs b/src/set.rs index fefe009eca..5d3bdfb7b9 100644 --- a/src/set.rs +++ b/src/set.rs @@ -13,11 +13,20 @@ use std::iter; use std::slice; use std::vec; -use syntax::Expr; - use exec::{Exec, ExecBuilder, Search}; use Error; +macro_rules! define_set { + ( + $ty:ident, + $ty_set_matches:ident, + $ty_set_matches_iter:ident, + $ty_set_matches_into_iter:ident, + $exec_build:expr, + $text_ty:ty, + $as_bytes:expr + ) => { + /// Match multiple (possibly overlapping) regular expressions in a single scan. /// /// A regex set corresponds to the union of two or more regular expressions. @@ -45,8 +54,7 @@ use Error; /// domains) might work: /// /// ```rust -/// use regex::RegexSet; -/// +/// # use regex::RegexSet; /// let set = RegexSet::new(&[ /// r"[a-z]+@[a-z]+\.(com|org|net)", /// r"[a-z]+\.(com|org|net)", @@ -101,9 +109,9 @@ use Error; /// search takes `O(mn)` time, where `m` is proportional to the size of the /// regex set and `n` is proportional to the length of the search text. #[derive(Clone)] -pub struct RegexSet(Exec); +pub struct $ty(Exec); -impl RegexSet { +impl $ty { /// Create a new regex set with the given regular expressions. 
/// /// This takes an iterator of `S`, where `S` is something that can produce @@ -115,18 +123,17 @@ impl RegexSet { /// Create a new regex set from an iterator of strings: /// /// ```rust - /// use regex::RegexSet; - /// + /// # use regex::RegexSet; /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); /// assert!(set.is_match("foo")); /// ``` - pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + pub fn new<I, S>(exprs: I) -> Result<$ty, Error> where S: AsRef<str>, I: IntoIterator<Item=S> { - let exec = try!(ExecBuilder::new_many(exprs).build()); + let exec = try!($exec_build(exprs)); if exec.regex_strings().len() < 2 { return Err(Error::InvalidSet); } - Ok(RegexSet(exec)) + Ok($ty(exec)) } /// Returns true if and only if one of the regexes in this set matches @@ -148,15 +155,14 @@ impl RegexSet { /// Tests whether a set matches some text: /// /// ```rust - /// use regex::RegexSet; - /// + /// # use regex::RegexSet; /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); /// assert!(set.is_match("foo")); /// assert!(!set.is_match("☃")); /// ``` - pub fn is_match(&self, text: &str) -> bool { - let mut search = Search { captures: &mut [], matches: &mut [] }; - self.0.exec(&mut search, text, 0) + pub fn is_match(&self, text: $text_ty) -> bool { + let mut search = Search::new(&mut [], &mut []); + self.0.exec(&mut search, $as_bytes(text), 0) } /// Returns the set of regular expressions that match in the given text. @@ -177,8 +183,7 @@ impl RegexSet { /// Tests which regular expressions match the given text: /// /// ```rust - /// use regex::RegexSet; - /// + /// # use regex::RegexSet; /// let set = RegexSet::new(&[ /// r"\w+", /// r"\d+", @@ -196,14 +201,11 @@ impl RegexSet { /// assert!(!matches.matched(5)); /// assert!(matches.matched(6)); /// ``` - pub fn matches(&self, text: &str) -> SetMatches { + pub fn matches(&self, text: $text_ty) -> SetMatches { let mut matches = vec![false; self.0.matches().len()]; let matched_any = { - let mut search = Search { - captures: &mut [], - matches: &mut matches - }; - self.0.exec(&mut search, text, 0) + let mut search = Search::new(&mut [], &mut matches); + self.0.exec(&mut search, $as_bytes(text), 0) }; SetMatches { matched_any: matched_any, @@ -217,20 +219,14 @@ impl RegexSet { } } -impl fmt::Debug for RegexSet { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "RegexSet({:?})", self.0.regex_strings()) - } -} - /// A set of matches returned by a regex set. #[derive(Clone, Debug)] -pub struct SetMatches { +pub struct $ty_set_matches { matched_any: bool, matches: Vec<bool>, } -impl SetMatches { +impl $ty_set_matches { /// Whether this set contains any matches. pub fn matched_any(&self) -> bool { self.matched_any @@ -254,22 +250,22 @@ impl SetMatches { } /// Returns an iterator over indexes in the regex that matched. 
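As a quick illustration of the set API generated by this macro (see also the `matset!` tests added in `tests/set.rs`), a sketch of `matches`, `matched` and the index iterator working together:

extern crate regex;

use regex::RegexSet;

fn main() {
    let set = RegexSet::new(&[
        r"^foo",
        r"bar$",
        r"\d+",
    ]).unwrap();

    // A single scan reports which of the patterns matched...
    let matches = set.matches("foo bar");
    assert!(matches.matched_any());
    assert!(matches.matched(0));
    assert!(matches.matched(1));
    assert!(!matches.matched(2));

    // ...and the matching indices can be collected from the iterator.
    let matched: Vec<usize> = matches.iter().collect();
    assert_eq!(matched, vec![0, 1]);
}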
- pub fn iter(&self) -> SetMatchesIter { - SetMatchesIter((&*self.matches).into_iter().enumerate()) + pub fn iter(&self) -> $ty_set_matches_iter { + $ty_set_matches_iter((&*self.matches).into_iter().enumerate()) } } -impl IntoIterator for SetMatches { - type IntoIter = SetMatchesIntoIter; +impl IntoIterator for $ty_set_matches { + type IntoIter = $ty_set_matches_into_iter; type Item = usize; fn into_iter(self) -> Self::IntoIter { - SetMatchesIntoIter(self.matches.into_iter().enumerate()) + $ty_set_matches_into_iter(self.matches.into_iter().enumerate()) } } -impl<'a> IntoIterator for &'a SetMatches { - type IntoIter = SetMatchesIter<'a>; +impl<'a> IntoIterator for &'a $ty_set_matches { + type IntoIter = $ty_set_matches_iter<'a>; type Item = usize; fn into_iter(self) -> Self::IntoIter { @@ -278,9 +274,9 @@ impl<'a> IntoIterator for &'a SetMatches { } /// An owned iterator over the set of matches from a regex set. -pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); +pub struct $ty_set_matches_into_iter(iter::Enumerate<vec::IntoIter<bool>>); -impl Iterator for SetMatchesIntoIter { +impl Iterator for $ty_set_matches_into_iter { type Item = usize; fn next(&mut self) -> Option<usize> { @@ -298,9 +294,9 @@ impl Iterator for SetMatchesIntoIter { /// /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. #[derive(Clone)] -pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); +pub struct $ty_set_matches_iter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); -impl<'a> Iterator for SetMatchesIter<'a> { +impl<'a> Iterator for $ty_set_matches_iter<'a> { type Item = usize; fn next(&mut self) -> Option<usize> { @@ -313,3 +309,41 @@ impl<'a> Iterator for SetMatchesIter<'a> { } } } + + } +} + +fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } +fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } + +define_set! { + RegexSet, + SetMatches, + SetMatchesIter, + SetMatchesIntoIter, + |exprs| ExecBuilder::new_many(exprs).build(), + &str, + as_bytes_str +} + +define_set! { + RegexSetBytes, + SetMatchesBytes, + SetMatchesIterBytes, + SetMatchesIntoIterBytes, + |exprs| ExecBuilder::new_many(exprs).only_utf8(false).build(), + &[u8], + as_bytes_bytes +} + +impl fmt::Debug for RegexSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} + +impl fmt::Debug for RegexSetBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} diff --git a/src/utf8.rs b/src/utf8.rs new file mode 100644 index 0000000000..c9b532779f --- /dev/null +++ b/src/utf8.rs @@ -0,0 +1,250 @@ +/// A few elementary UTF-8 encoding and decoding functions used by the matching +/// engines. +/// +/// In an ideal world, the matching engines operate on `&str` and we can just +/// lean on the standard library for all our UTF-8 needs. However, to support +/// byte based regexes (that can match on arbitrary bytes which may contain +/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. +/// The standard library doesn't really recognize this use case, so we have +/// to build it out ourselves. +/// +/// Should this be factored out into a separate crate? It seems independently +/// useful. There are other crates that already exist (e.g., `utf-8`) that have +/// overlapping use cases. Not sure what to do. 
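The point of decoding UTF-8 straight off a `&[u8]` is easiest to see from the byte-regex side. A hedged sketch (assuming the new byte API is exposed as `regex::bytes::Regex`), mirroring the `mixed1` test in `tests/bytes.rs`:

extern crate regex;

use regex::bytes::Regex;

fn main() {
    // "ΓΔ" encoded as UTF-8, followed by a byte that is not valid UTF-8.
    let haystack = &b"\xCE\x93\xCE\x94\xFF"[..];

    // The Unicode-aware `.+` must decode codepoints directly from the byte
    // slice; the `(?-u)` group then matches the raw, invalid byte.
    let re = Regex::new(r"(?u)(.+)(?-u)(.+)").unwrap();
    let caps = re.captures(haystack).unwrap();
    let positions: Vec<Option<(usize, usize)>> = caps.iter_pos().collect();
    assert_eq!(positions, vec![Some((0, 5)), Some((0, 4)), Some((4, 5))]);
}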
+ +use std::char; + +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO: u8 = 0b1100_0000; +const TAG_THREE: u8 = 0b1110_0000; +const TAG_FOUR: u8 = 0b1111_0000; + +/// Encode the given Unicode character to `dst` as a single UTF-8 sequence. +/// +/// If `dst` is not long enough, then `None` is returned. Otherwise, the number +/// of bytes written is returned. +#[inline] +pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> { + let code = character as u32; + if code <= 0x7F && !dst.is_empty() { + dst[0] = code as u8; + Some(1) + } else if code <= 0x7FF && dst.len() >= 2 { + dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO; + dst[1] = (code & 0x3F) as u8 | TAG_CONT; + Some(2) + } else if code <= 0xFFFF && dst.len() >= 3 { + dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE; + dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code & 0x3F) as u8 | TAG_CONT; + Some(3) + } else if dst.len() >= 4 { + dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR; + dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[3] = (code & 0x3F) as u8 | TAG_CONT; + Some(4) + } else { + None + } +} + +/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. +/// +/// If no valid UTF-8 sequence could be found, then `None` is returned. +/// Otherwise, the decoded codepoint and the number of bytes read is returned. +/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be +/// 1, 2, 3 or 4. +/// +/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a +/// codepoint that is out of range (surrogate codepoints are out of range) or +/// is not the shortest possible UTF-8 sequence for that codepoint. +#[inline] +pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { + let b0 = match src.get(0) { + None => return None, + Some(&b) if b <= 0x7F => return Some((b as char, 1)), + Some(&b) => b, + }; + match b0 { + 0b110_00000 ... 0b110_11111 => { + if src.len() < 2 { + return None; + } + let b1 = src[1]; + let cp = ((b0 & !TAG_TWO) as u32) << 6 + | ((b1 & !TAG_CONT) as u32); + match cp { + 0x80 ... 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), + _ => None, + } + } + 0b1110_0000 ... 0b1110_1111 => { + if src.len() < 3 { + return None; + } + let (b1, b2) = (src[1], src[2]); + let cp = ((b0 & !TAG_THREE) as u32) << 12 + | ((b1 & !TAG_CONT) as u32) << 6 + | ((b2 & !TAG_CONT) as u32); + match cp { + // char::from_u32 will disallow surrogate codepoints. + 0x800 ... 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), + _ => None, + } + } + 0b11110_000 ... 0b11110_111 => { + if src.len() < 4 { + return None; + } + let (b1, b2, b3) = (src[1], src[2], src[3]); + let cp = ((b0 & !TAG_FOUR) as u32) << 18 + | ((b1 & !TAG_CONT) as u32) << 12 + | ((b2 & !TAG_CONT) as u32) << 6 + | ((b3 & !TAG_CONT) as u32); + match cp { + 0x10000 ... 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)), + _ => None, + } + } + _ => None, + } +} + +/// Like decode_utf8, but decodes the last UTF-8 sequence in `src` instead of +/// the first. 
+pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { + if src.is_empty() { + return None; + } + let mut start = src.len() - 1; + if src[start] <= 0x7F { + return Some((src[start] as char, 1)); + } + while start > src.len().saturating_sub(4) { + start -= 1; + if is_start_byte(src[start]) { + break; + } + } + match decode_utf8(&src[start..]) { + None => None, + Some((_, n)) if n < src.len() - start => None, + Some((cp, n)) => Some((cp, n)), + } +} + +fn is_start_byte(b: u8) -> bool { + b & 0b11_000000 != 0b1_0000000 +} + +#[cfg(test)] +mod tests { + use std::str; + + use quickcheck::quickcheck; + + use super::{ + TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR, + decode_utf8, decode_last_utf8, encode_utf8, + }; + + #[test] + fn prop_roundtrip() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap(); + let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_roundtrip_last() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap(); + let (got_cp, got_len) = + decode_last_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_encode_matches_std() { + fn p(cp: char) -> bool { + let mut got = [0; 4]; + let n = encode_utf8(cp, &mut got).unwrap(); + let expected = cp.to_string(); + &got[..n] == expected.as_bytes() + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = encode_utf8(given_cp, &mut tmp).unwrap(); + let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); + let expected_cp = + str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_last_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = encode_utf8(given_cp, &mut tmp).unwrap(); + let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); + let expected_cp = + str::from_utf8(&tmp[..n]).unwrap() + .chars().rev().next().unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn reject_invalid() { + // Invalid start byte + assert_eq!(decode_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); + // Bad lengths + assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); + assert_eq!(decode_utf8(&[ + TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a', + ]), None); + } + + #[test] + fn reject_invalid_last() { + // Invalid start byte + assert_eq!(decode_last_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); + // Bad lengths + assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!(decode_last_utf8(&[ + TAG_THREE, TAG_CONT, TAG_CONT | 
b'a', + ]), None); + assert_eq!(decode_last_utf8(&[ + TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a', + ]), None); + } +} diff --git a/tests/api.rs b/tests/api.rs new file mode 100644 index 0000000000..275157bf07 --- /dev/null +++ b/tests/api.rs @@ -0,0 +1,236 @@ +#[test] +fn empty_regex_empty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0)], findall!(re, "")); +} + +#[test] +fn empty_regex_nonempty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn one_zero_length_match() { + let re = regex!(r"\d*"); + assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2")); +} + +#[test] +fn many_zero_length_match() { + let re = regex!(r"\d*"); + assert_eq!(vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)], + findall!(re, "a1bbb2")); +} + +#[test] +fn many_sequential_zero_length_match() { + let re = regex!(r"\d?"); + assert_eq!(vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)], + findall!(re, "a12b3c")); +} + +#[test] +fn quoted_bracket_set() { + let re = regex!(r"([\x{5b}\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); + let re = regex!(r"([\[\]])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn first_range_starts_with_left_bracket() { + let re = regex!(r"([[-z])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn range_ends_with_escape() { + let re = regex!(r"([\[-\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn empty_match_find_iter() { + let re = regex!(r".*?"); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn empty_match_captures_iter() { + let re = regex!(r".*?"); + let ms: Vec<_> = re.captures_iter(text!("abc")) + .map(|c| c.pos(0).unwrap()) + .collect(); + assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); +} + +#[test] +fn capture_names() { + let re = regex!(r"(.)(?P<a>.)"); + assert_eq!(3, re.captures_len()); + assert_eq!((3, Some(3)), re.capture_names().size_hint()); + assert_eq!(vec![None, None, Some("a")], + re.capture_names().collect::<Vec<_>>()); +} + +#[test] +fn regex_string() { + assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str()); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+"))); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+"))); +} + +#[test] +fn capture_index() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + assert_eq!(&cap[0], t!("abc")); + assert_eq!(&cap[1], t!("abc")); + assert_eq!(&cap["name"], t!("abc")); +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_usize() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap[2]; +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_name() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap["bad name"]; +} + +#[test] +fn capture_index_lifetime() { + // This is a test of whether the types on `caps["..."]` are general + // enough. If not, this will fail to typecheck. 
+ fn inner(s: &str) -> usize { + let re = regex!(r"(?P<number>\d+)"); + let caps = re.captures(t!(s)).unwrap(); + caps["number"].len() + } + assert_eq!(3, inner("123")); +} + +#[test] +fn capture_misc() { + let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + + assert_eq!(5, cap.len()); + + assert_eq!(Some((0, 3)), cap.pos(0)); + assert_eq!(None, cap.pos(2)); + assert_eq!(Some((2, 3)), cap.pos(4)); + + assert_eq!(Some(t!("abc")), cap.at(0)); + assert_eq!(None, cap.at(2)); + assert_eq!(Some(t!("c")), cap.at(4)); + + assert_eq!(None, cap.name("a")); + assert_eq!(Some(t!("c")), cap.name("b")); +} + +#[test] +fn capture_iter() { + let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)"); + let cap = re.captures(t!("abcd")).unwrap(); + assert_eq!(5, cap.len()); + + let expected = vec![ + t!("abcd"), t!("a"), t!("b"), t!("c"), t!("d"), + ].into_iter().map(Some).collect::<Vec<_>>(); + let got = cap.iter().collect::<Vec<_>>(); + assert_eq!(expected, got); +} + +#[test] +fn capture_iter_missing() { + let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + assert_eq!(5, cap.len()); + + let expected = vec![ + Some(t!("abc")), Some(t!("a")), None, Some(t!("b")), Some(t!("c")), + ]; + let got = cap.iter().collect::<Vec<_>>(); + assert_eq!(expected, got); +} + +#[test] +fn capture_iter_pos() { + let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)"); + let cap = re.captures(t!("abcd")).unwrap(); + + let expected = vec![ + (0, 4), (0, 1), (1, 2), (2, 3), (3, 4), + ].into_iter().map(Some).collect::<Vec<_>>(); + let got = cap.iter_pos().collect::<Vec<_>>(); + assert_eq!(expected, got); +} + +#[test] +fn capture_iter_pos_missing() { + let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + + let expected = vec![ + Some((0, 3)), Some((0, 1)), None, Some((1, 2)), Some((2, 3)), + ]; + let got = cap.iter_pos().collect::<Vec<_>>(); + assert_eq!(expected, got); +} + +#[test] +fn capture_iter_named() { + let re = regex!(r"(.)(?P<a>.)(.)(?P<b>.)"); + let cap = re.captures(t!("abcd")).unwrap(); + + let expected1 = vec![ + ("a", Some(t!("b"))), ("b", Some(t!("d"))), + ]; + let expected2 = vec![ + ("b", Some(t!("d"))), ("a", Some(t!("b"))), + ]; + let got = cap.iter_named().collect::<Vec<_>>(); + assert!(got == expected1 || got == expected2); +} + +#[test] +fn capture_iter_named_missing() { + let re = regex!(r"(.)(?P<a>.)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + + let expected1 = vec![ + ("a", None), ("b", Some(t!("c"))), + ]; + let expected2 = vec![ + ("b", Some(t!("c"))), ("a", None), + ]; + let got = cap.iter_named().collect::<Vec<_>>(); + assert!(got == expected1 || got == expected2); +} + +expand!(expand1, r"(?P<foo>\w+)", "abc", "$foo", "abc"); +expand!(expand2, r"(?P<foo>\w+)", "abc", "$0", "abc"); +expand!(expand3, r"(?P<foo>\w+)", "abc", "$1", "abc"); +expand!(expand4, r"(?P<foo>\w+)", "abc", "$$1", "$1"); +expand!(expand5, r"(?P<foo>\w+)", "abc", "$$foo", "$foo"); +expand!(expand6, r"(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", "$b$a", "123abc"); +expand!(expand7, r"(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", "z$bz$az", "z"); +expand!(expand8, r"(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", ".$b.$a.", ".123.abc."); +expand!(expand9, r"(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", " $b $a ", " 123 abc "); +expand!(expand10, r"(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", "$bz$az", ""); diff --git a/tests/api_str.rs b/tests/api_str.rs new file mode 100644 index 0000000000..266b6455b2 --- /dev/null +++ b/tests/api_str.rs @@ -0,0 +1,27 @@ 
+// These tests don't really make sense with the bytes API, so we only test them +// on the Unicode API. + +#[test] +fn empty_match_unicode_find_iter() { + // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries + // even when we're susceptible to empty width matches. + let re = regex!(u!(r".*?")); + assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], + findall!(re, "Ⅰ1Ⅱ2")); +} + +#[test] +fn empty_match_unicode_captures_iter() { + // Same as empty_match_unicode_find_iter, but tests capture iteration. + let re = regex!(u!(r".*?")); + let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2")) + .map(|c| c.pos(0).unwrap()) + .collect(); + assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); +} + +#[test] +fn eq() { + use regex::Regex; + assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); +} diff --git a/tests/bytes.rs b/tests/bytes.rs new file mode 100644 index 0000000000..a290630d8d --- /dev/null +++ b/tests/bytes.rs @@ -0,0 +1,40 @@ +// These are tests specifically crafted for regexes that can match arbitrary +// bytes. + +// A silly wrapper to make it possible to write and match raw bytes. +struct R<'a>(&'a [u8]); +impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } } + +mat!(word_boundary, r" \b", " δ", None); +mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1))); +mat!(word_not_boundary, r" \B", " δ", Some((0, 1))); +mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None); + +mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1))); +mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3))); +mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1))); +mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8))); +mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1))); +mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4))); + +// The first `(.+)` matches two Unicode codepoints, but can't match the 5th +// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and +// matches. +mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"), + Some((0, 5)), Some((0, 4)), Some((4, 5))); + +mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1))); +mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5))); +mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7))); +mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2))); + +mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2))); +mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1))); + +// This doesn't match in a normal Unicode regex because the implicit preceding +// `.*?` is Unicode aware. +mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2))); + +// Have fun with null bytes. +mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00", + R(b"foo\x00"), Some((0, 4)), Some((0, 3))); diff --git a/tests/crazy.rs b/tests/crazy.rs new file mode 100644 index 0000000000..03ddb2dd95 --- /dev/null +++ b/tests/crazy.rs @@ -0,0 +1,76 @@ +// Some crazy expressions from regular-expressions.info. 
+mat!(match_ranges, + r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 255", Some((5, 8))); +mat!(match_ranges_not, + r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 256", None); +mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))); +mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))); +mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))); +mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None); +mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail.com ", Some((8, 26))); +mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail ", None); +mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", + "mine is jam.slam@gmail.com ", Some((8, 26))); +mat!(match_date1, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-01-01", Some((0, 10))); +mat!(match_date2, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-00-01", None); +mat!(match_date3, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-13-01", None); + +// Do some crazy dancing with the start/end assertions. +matiter!(match_start_end_empty, r"^$", "", (0, 0)); +matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0)); +matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0)); +matiter!(match_start_end_empty_rev, r"$^", "", (0, 0)); +matiter!(match_start_end_empty_rep, r"(?:^$)*", "a\nb\nc", + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +matiter!(match_start_end_empty_rep_rev, r"(?:$^)*", "a\nb\nc", + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +// Test negated character classes. +mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3))); +mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3))); +mat!(negclass_letter_space, r"[^a\s]", "a x", Some((2, 3))); +mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3))); +mat!(negclass_space, r"[^\s]", " a", Some((1, 2))); +mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3))); +mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3))); +mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); + +// Test that the DFA can handle pathological cases. +// (This should result in the DFA's cache being flushed too frequently, which +// should cause it to quit and fall back to the NFA algorithm.) 
+#[test] +fn dfa_handles_pathological_case() { + fn ones_and_zeroes(count: usize) -> String { + use rand::{Rng, thread_rng}; + + let mut rng = thread_rng(); + let mut s = String::new(); + for _ in 0..count { + if rng.gen() { + s.push('1'); + } else { + s.push('0'); + } + } + s + } + + let re = regex!(r"[01]*1[01]{20}$"); + let text = { + let mut pieces = ones_and_zeroes(100_000); + pieces.push('1'); + pieces.push_str(&ones_and_zeroes(20)); + pieces + }; + assert!(re.is_match(text!(&*text))); +} diff --git a/tests/flags.rs b/tests/flags.rs new file mode 100644 index 0000000000..0c9b36c91c --- /dev/null +++ b/tests/flags.rs @@ -0,0 +1,11 @@ +mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))); +mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))); +mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None); +mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))); +mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))); +mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None); +mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))); +mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))); +mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))); +mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))); +mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))); diff --git a/tests/matches.rs b/tests/fowler.rs similarity index 100% rename from tests/matches.rs rename to tests/fowler.rs diff --git a/tests/macros.rs b/tests/macros.rs new file mode 100644 index 0000000000..7e73e4fa9e --- /dev/null +++ b/tests/macros.rs @@ -0,0 +1,115 @@ +// Convenience macros. + +macro_rules! findall { + ($re:expr, $text:expr) => {{ + $re.find_iter(text!($text)).collect::<Vec<_>>() + }} +} + +// Macros for automatically producing tests. + +macro_rules! mat( + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<Option<_>> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<Option<_>> = match r.captures(text) { + Some(c) => { + assert!(r.is_match(text)); + c.iter_pos().collect() + } + None => vec![None], + }; + // The test set sometimes leave out capture groups, so truncate + // actual capture groups to match test set. + let mut sgot = &got[..]; + if sgot.len() > expected.len() { + sgot = &sgot[0..expected.len()] + } + if expected != sgot { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, sgot); + } + } + ); +); + +macro_rules! 
matiter( + ($name:ident, $re:expr, $text:expr) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<(usize, usize)> = vec![]; + let r = regex!($re); + let got: Vec<_> = r.find_iter(text).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<_> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<_> = r.find_iter(text).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); +); + +macro_rules! matset { + ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(set.is_match(text)); + let expected = vec![$($match_index),*]; + let matches = set.matches(text); + assert!(matches.matched_any()); + let got: Vec<_> = matches.into_iter().collect(); + assert_eq!(expected, got); + } + } +} + +macro_rules! nomatset { + ($name:ident, $res:expr, $text:expr) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(!set.is_match(text)); + let matches = set.matches(text); + assert!(!matches.matched_any()); + assert_eq!(0, matches.into_iter().count()); + } + } +} diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs new file mode 100644 index 0000000000..a68fada744 --- /dev/null +++ b/tests/macros_bytes.rs @@ -0,0 +1,43 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! text { ($text:expr) => { $text.as_bytes() } } +macro_rules! t { ($re:expr) => { text!($re) } } + +macro_rules! bytes { ($text:expr) => { $text } } +macro_rules! b { ($text:expr) => { bytes!($text) } } + +macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::bytes::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { + ($text:expr) => {{ + use std::ascii::escape_default; + let mut s = vec![]; + for &b in bytes!($text) { + s.extend(escape_default(b)); + } + String::from_utf8(s).unwrap() + }} +} + +// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, +// but they should be unified in 1.0. Then we can move this macro back into +// tests/api.rs where it is used. ---AG +macro_rules! expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let mut got = vec![]; + cap.expand(t!($expand), &mut got); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} diff --git a/tests/macros_str.rs b/tests/macros_str.rs new file mode 100644 index 0000000000..7ea29335de --- /dev/null +++ b/tests/macros_str.rs @@ -0,0 +1,33 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! 
text { ($text:expr) => { $text } } +macro_rules! t { ($text:expr) => { text!($text) } } + +macro_rules! bytes { ($text:expr) => { $text.as_bytes() } } +macro_rules! b { ($text:expr) => { bytes!($text) } } + +macro_rules! u { ($re:expr) => { $re } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { ($text:expr) => { $text } } + +// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, +// but they should be unified in 1.0. Then we can move this macro back into +// tests/api.rs where it is used. ---AG +macro_rules! expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let got = cap.expand(t!($expand)); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} diff --git a/tests/test_native.rs b/tests/misc.rs similarity index 72% rename from tests/test_native.rs rename to tests/misc.rs index 4bbb6c98dc..efd488deef 100644 --- a/tests/test_native.rs +++ b/tests/misc.rs @@ -8,13 +8,9 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(pattern, plugin, test)] -#![plugin(regex_macros)] +use regex::Regex; -extern crate regex; -extern crate test; - -macro_rules! searcher_expr { ($e:expr) => ($e) } - -mod tests; -mod native_static; +#[test] +fn eq() { + assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); +} diff --git a/tests/multiline.rs b/tests/multiline.rs new file mode 100644 index 0000000000..191504aa75 --- /dev/null +++ b/tests/multiline.rs @@ -0,0 +1,49 @@ +matiter!(match_multi_1, r"(?m)^[a-z]+$", "abc\ndef\nxyz", + (0, 3), (4, 7), (8, 11)); +matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz"); +matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", + (0, 0), (4, 4), (8, 8)); +matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", + (3, 3), (7, 7), (11, 11)); +matiter!(match_multi_5, r"(?m)^[a-z]", "abc\ndef\nxyz", + (0, 1), (4, 5), (8, 9)); +matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz"); +matiter!(match_multi_7, r"(?m)[a-z]$", "abc\ndef\nxyz", + (2, 3), (6, 7), (10, 11)); +matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz"); +matiter!(match_multi_9, r"(?m)^$", "", (0, 0)); + +matiter!(match_multi_rep_1, r"(?m)(?:^$)*", "a\nb\nc", + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +matiter!(match_multi_rep_2, r"(?m)(?:^|a)+", "a\naaa\n", + (0, 0), (2, 2), (3, 5), (6, 6)); +matiter!(match_multi_rep_3, r"(?m)(?:^|a)*", "a\naaa\n", + (0, 1), (2, 5), (6, 6)); +matiter!(match_multi_rep_4, r"(?m)(?:^[a-z])+", "abc\ndef\nxyz", + (0, 1), (4, 5), (8, 9)); +matiter!(match_multi_rep_5, r"(?m)(?:^[a-z]{3}\n?)+", "abc\ndef\nxyz", + (0, 11)); +matiter!(match_multi_rep_6, r"(?m)(?:^[a-z]{3}\n?)*", "abc\ndef\nxyz", + (0, 11)); +matiter!(match_multi_rep_7, r"(?m)(?:\n?[a-z]{3}$)+", "abc\ndef\nxyz", + (0, 11)); +matiter!(match_multi_rep_8, r"(?m)(?:\n?[a-z]{3}$)*", "abc\ndef\nxyz", + (0, 11)); +matiter!(match_multi_rep_9, r"(?m)^*", "\naa\n", + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4)); +matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", + (0, 0), (1, 1), (4, 4)); +matiter!(match_multi_rep_11, r"(?m)$*", "\naa\n", + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4)); +matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", + (0, 0), (3, 3), (4, 4)); +matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", + (0, 2), (5, 7)); +matiter!(match_multi_rep_14, r"(?m)(?:$\n)*", "\n\naaa\n\n", + (0, 2), (3, 3), (4, 4), (5, 7)); 
+matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", + (0, 2), (5, 7)); +matiter!(match_multi_rep_16, r"(?m)(?:^|$)+", "\n\naaa\n\n", + (0, 0), (1, 1), (2, 2), (5, 5), (6, 6), (7, 7)); +matiter!(match_multi_rep_17, r"(?m)(?:$\n)*", "\n\naaa\n\n", + (0, 2), (3, 3), (4, 4), (5, 7)); diff --git a/tests/noparse.rs b/tests/noparse.rs new file mode 100644 index 0000000000..ce2d369e20 --- /dev/null +++ b/tests/noparse.rs @@ -0,0 +1,48 @@ +macro_rules! noparse( + ($name:ident, $re:expr) => ( + #[test] + fn $name() { + let re = $re; + match regex_new!(re) { + Err(_) => {}, + Ok(_) => panic!("Regex '{}' should cause a parse error.", re), + } + } + ); +); + +noparse!(fail_double_repeat, "a**"); +noparse!(fail_no_repeat_arg, "*"); +noparse!(fail_incomplete_escape, "\\"); +noparse!(fail_class_incomplete, "[A-"); +noparse!(fail_class_not_closed, "[A"); +noparse!(fail_class_no_begin, r"[\A]"); +noparse!(fail_class_no_end, r"[\z]"); +noparse!(fail_class_no_boundary, r"[\b]"); +noparse!(fail_open_paren, "("); +noparse!(fail_close_paren, ")"); +noparse!(fail_invalid_range, "[a-Z]"); +noparse!(fail_empty_capture_name, "(?P<>a)"); +noparse!(fail_empty_capture_exp, "(?P<name>)"); +noparse!(fail_bad_capture_name, "(?P<na-me>)"); +noparse!(fail_bad_flag, "(?a)a"); +noparse!(fail_empty_alt_before, "|a"); +noparse!(fail_empty_alt_after, "a|"); +noparse!(fail_too_big, "a{10000000}"); +noparse!(fail_counted_no_close, "a{1001"); +noparse!(fail_unfinished_cap, "(?"); +noparse!(fail_unfinished_escape, "\\"); +noparse!(fail_octal_digit, r"\8"); +noparse!(fail_hex_digit, r"\xG0"); +noparse!(fail_hex_short, r"\xF"); +noparse!(fail_hex_long_digits, r"\x{fffg}"); +noparse!(fail_flag_bad, "(?a)"); +noparse!(fail_flag_empty, "(?)"); +noparse!(fail_double_neg, "(?-i-i)"); +noparse!(fail_neg_empty, "(?i-)"); +noparse!(fail_empty_group, "()"); +noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)"); +noparse!(fail_range_end_no_class, "[a-[:lower:]]"); +noparse!(fail_range_end_no_begin, r"[a-\A]"); +noparse!(fail_range_end_no_end, r"[a-\z]"); +noparse!(fail_range_end_no_boundary, r"[a-\b]"); diff --git a/tests/native_static.rs b/tests/plugin.rs similarity index 95% rename from tests/native_static.rs rename to tests/plugin.rs index 62e14731c2..b22b5d2e9c 100644 --- a/tests/native_static.rs +++ b/tests/plugin.rs @@ -12,14 +12,14 @@ use regex::Regex; static RE: Regex = regex!(r"\d+"); #[test] -fn static_splitn() { +fn splitn() { let text = "cauchy123plato456tyler789binx"; let subs: Vec<&str> = RE.splitn(text, 2).collect(); assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); } #[test] -fn static_split() { +fn split() { let text = "cauchy123plato456tyler789binx"; let subs: Vec<&str> = RE.split(text).collect(); assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); diff --git a/tests/regression.rs b/tests/regression.rs new file mode 100644 index 0000000000..8b65140ce2 --- /dev/null +++ b/tests/regression.rs @@ -0,0 +1,44 @@ +// See: https://github.com/rust-lang/regex/issues/48 +#[test] +fn invalid_regexes_no_crash() { + assert!(regex_new!("(*)").is_err()); + assert!(regex_new!("(?:?)").is_err()); + assert!(regex_new!("(?)").is_err()); + assert!(regex_new!("*").is_err()); +} + +// See: https://github.com/rust-lang/regex/issues/98 +#[test] +fn regression_many_repeat_stack_overflow() { + let re = regex!("^.{1,2500}"); + assert_eq!(vec![(0, 1)], findall!(re, "a")); +} + +// See: https://github.com/rust-lang/regex/issues/75 +mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2))); 
+mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2))); + +// See: https://github.com/rust-lang/regex/issues/99 +mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None); +mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None); + +// See: https://github.com/rust-lang/regex/issues/101 +mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1))); + +// See: https://github.com/rust-lang-nursery/regex/issues/129 +#[test] +fn regression_captures_rep() { + let re = regex!(r"([a-f]){2}(?P<foo>[x-z])"); + let caps = re.captures(text!("abx")).unwrap(); + assert_eq!(caps.name("foo").unwrap(), text!("x")); +} + +// See: https://github.com/rust-lang-nursery/regex/issues/153 +mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1))); +mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); + +// See: https://github.com/rust-lang-nursery/regex/issues/169 +mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); + +// See: https://github.com/rust-lang/regex/issues/76 +mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10))); diff --git a/tests/replace.rs b/tests/replace.rs new file mode 100644 index 0000000000..c4cc5d1a4d --- /dev/null +++ b/tests/replace.rs @@ -0,0 +1,35 @@ +macro_rules! replace( + ($name:ident, $which:ident, $re:expr, + $search:expr, $replace:expr, $result:expr) => ( + #[test] + fn $name() { + let re = regex!($re); + assert_eq!(re.$which(text!($search), $replace), text!($result)); + } + ); +); + +replace!(first, replace, r"\d", "age: 26", t!("Z"), "age: Z6"); +replace!(plus, replace, r"\d+", "age: 26", t!("Z"), "age: Z"); +replace!(all, replace_all, r"\d", "age: 26", t!("Z"), "age: ZZ"); +replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1"); +replace!(double_dollar, replace, + r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $$1"), "w2 $1"); +// replace!(adjacent_index, replace, + // r"([^aeiouy])ies$", "skies", t!("$1y"), "sky"); +replace!(named, replace_all, + r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", + "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3"); +replace!(trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", + t!(""), "trim me"); +replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); +// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); +replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a"); +replace!(literal_dollar1, replace_all, + r"(\w+) (\w+)", "a b", t!("$$1"), "$1"); +replace!(literal_dollar2, replace_all, + r"(\w+) (\w+)", "a b", t!("$2 $$c $1"), "b $c a"); +replace!(no_expand1, replace, + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$2 $1"), "$2 $1"); +replace!(no_expand2, replace, + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$$1"), "$$1"); diff --git a/tests/searcher.rs b/tests/searcher.rs new file mode 100644 index 0000000000..073fa5f96c --- /dev/null +++ b/tests/searcher.rs @@ -0,0 +1,66 @@ +macro_rules! searcher { + ($name:ident, $re:expr, $haystack:expr) => ( + searcher!($name, $re, $haystack, vec vec![]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => ( + #[test] + #[allow(unused_imports)] + fn $name() { + searcher_expr! 
{{ + use std::str::pattern::{Pattern, Searcher}; + use std::str::pattern::SearchStep::{Match, Reject, Done}; + let re = regex!($re); + let mut se = re.into_searcher($haystack); + let mut got_steps = vec![]; + loop { + match se.next() { + Done => break, + step => { got_steps.push(step); } + } + } + assert_eq!(got_steps, $expect_steps); + }} + } + ); +} + +searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0)); +searcher!(searcher_empty_regex, r"", "ab", + Match(0, 0), Reject(0, 1), Match(1, 1), Reject(1, 2), Match(2, 2)); +searcher!(searcher_empty_haystack, r"\d", ""); +searcher!(searcher_one_match, r"\d", "5", + Match(0, 1)); +searcher!(searcher_no_match, r"\d", "a", + Reject(0, 1)); +searcher!(searcher_two_adjacent_matches, r"\d", "56", + Match(0, 1), Match(1, 2)); +searcher!(searcher_two_non_adjacent_matches, r"\d", "5a6", + Match(0, 1), Reject(1, 2), Match(2, 3)); +searcher!(searcher_reject_first, r"\d", "a6", + Reject(0, 1), Match(1, 2)); +searcher!(searcher_one_zero_length_matches, r"\d*", "a1b2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 4), // a1b2 +); +searcher!(searcher_many_zero_length_matches, r"\d*", "a1bbb2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 3), // a1bb + Reject(3, 4), // a1bb + Match(4, 4), // a1bbb + Reject(4, 5), // a1bbb + Match(5, 6), // a1bbba +); +searcher!(searcher_unicode, r".+?", "Ⅰ1Ⅱ2", + Match(0, 3), Match(3, 4), Match(4, 7), Match(7, 8)); diff --git a/tests/set.rs b/tests/set.rs new file mode 100644 index 0000000000..4bd50fb2c8 --- /dev/null +++ b/tests/set.rs @@ -0,0 +1,17 @@ +matset!(set1, &["a", "a"], "a", 0, 1); +matset!(set2, &["a", "a"], "ba", 0, 1); +matset!(set3, &["a", "b"], "a", 0); +matset!(set4, &["a", "b"], "b", 1); +matset!(set5, &["a|b", "b|a"], "b", 0, 1); +matset!(set6, &["foo", "oo"], "foo", 0, 1); +matset!(set7, &["^foo", "bar$"], "foo", 0); +matset!(set8, &["^foo", "bar$"], "foo bar", 0, 1); +matset!(set9, &["^foo", "bar$"], "bar", 1); +matset!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1); +matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); +matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); +matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1); +matset!(set14, &[r".*", "a"], "zzzzzz", 0); + +nomatset!(nset1, &["a", "a"], "b"); +nomatset!(nset2, &["^foo", "bar$"], "bar foo"); diff --git a/tests/test_dynamic_nfa.rs b/tests/test_backtrack.rs similarity index 60% rename from tests/test_dynamic_nfa.rs rename to tests/test_backtrack.rs index 959daa9eb3..880233096f 100644 --- a/tests/test_dynamic_nfa.rs +++ b/tests/test_backtrack.rs @@ -10,18 +10,34 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; extern crate regex; -macro_rules! regex { +macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; - ExecBuilder::new($re).nfa().build().unwrap().into_regex() + ExecBuilder::new($re) + .bounded_backtracking().build().map(|e| e.into_regex()) }} } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); -mod tests; +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs new file mode 100644 index 0000000000..8b15f79ef0 --- /dev/null +++ b/tests/test_backtrack_bytes.rs @@ -0,0 +1,44 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate rand; +extern crate regex; + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex()) + }} +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_dynamic_nfa_bytes.rs b/tests/test_backtrack_utf8bytes.rs similarity index 59% rename from tests/test_dynamic_nfa_bytes.rs rename to tests/test_backtrack_utf8bytes.rs index 3e81c1d20a..12ed55d364 100644 --- a/tests/test_dynamic_nfa_bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -10,18 +10,34 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; extern crate regex; -macro_rules! regex { +macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; - ExecBuilder::new($re).nfa().bytes(true).build().unwrap().into_regex() + ExecBuilder::new($re) + .bounded_backtracking().bytes(true).build().map(|e| e.into_regex()) }} } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); -mod tests; +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_dynamic.rs b/tests/test_default.rs similarity index 74% rename from tests/test_dynamic.rs rename to tests/test_default.rs index 18596ad30b..40dbe9e42e 100644 --- a/tests/test_dynamic.rs +++ b/tests/test_default.rs @@ -10,6 +10,7 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; extern crate regex; // Due to macro scoping rules, this definition only applies for the modules @@ -19,37 +20,53 @@ extern crate regex; // This is also used to test the various matching engines. This one exercises // the normal code path which automatically chooses the engine based on the // regex and the input. Other dynamic tests explicitly set the engine to use. -macro_rules! regex { +macro_rules! regex_new { ($re:expr) => {{ use regex::Regex; - Regex::new($re).unwrap() + Regex::new($re) }} } -macro_rules! regex_set { - ($res:expr) => {{ +macro_rules! 
regex_set_new { + ($re:expr) => {{ use regex::RegexSet; - RegexSet::new($res).unwrap() + RegexSet::new($re) }} } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + #[cfg(feature = "pattern")] macro_rules! searcher_expr { ($e:expr) => ($e) } #[cfg(not(feature = "pattern"))] macro_rules! searcher_expr { ($e:expr) => ({}) } -mod tests; -mod tests_set; +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); -// Regression test for https://github.com/rust-lang/regex/issues/98 -// -// N.B. This test is here because it wreaks havoc with code generation via -// the `regex!` plugin. -#[test] -fn regression_many_repeat_stack_overflow() { - let re = regex!("^.{1,2500}"); - assert_eq!(re.find("a"), Some((0, 1))); -} +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod unicode; #[test] fn set_empty() { @@ -70,3 +87,8 @@ fn set_one() { err => panic!("expected Error::InvalidSet but got {:?}", err), } } + +#[test] +fn disallow_unicode_flag() { + assert!(regex::Regex::new("(?-u)a").is_err()); +} diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs new file mode 100644 index 0000000000..9f8dc5701b --- /dev/null +++ b/tests/test_default_bytes.rs @@ -0,0 +1,54 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate rand; +extern crate regex; + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::bytes::Regex; + Regex::new($re) + }} +} + +macro_rules! regex_set_new { + ($res:expr) => {{ + use regex::bytes::RegexSet; + RegexSet::new($res) + }} +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod unicode; diff --git a/tests/test_dynamic_backtrack_bytes.rs b/tests/test_nfa.rs similarity index 61% rename from tests/test_dynamic_backtrack_bytes.rs rename to tests/test_nfa.rs index effbe55012..15686cd6fb 100644 --- a/tests/test_dynamic_backtrack_bytes.rs +++ b/tests/test_nfa.rs @@ -10,19 +10,33 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; extern crate regex; -macro_rules! regex { +macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; - ExecBuilder::new($re).bounded_backtracking().bytes(true) - .build().unwrap().into_regex() + ExecBuilder::new($re).nfa().build().map(|e| e.into_regex()) }} } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); -mod tests; +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs new file mode 100644 index 0000000000..2f0cb52f0f --- /dev/null +++ b/tests/test_nfa_bytes.rs @@ -0,0 +1,42 @@ + +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate rand; +extern crate regex; + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .nfa().only_utf8(false).build().map(|e| e.into_byte_regex()) + }} +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_dynamic_backtrack.rs b/tests/test_nfa_utf8bytes.rs similarity index 61% rename from tests/test_dynamic_backtrack.rs rename to tests/test_nfa_utf8bytes.rs index 76b14829c3..d8c45dc7a9 100644 --- a/tests/test_dynamic_backtrack.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -10,19 +10,33 @@ #![cfg_attr(feature = "pattern", feature(pattern))] +extern crate rand; extern crate regex; -macro_rules! regex { +macro_rules! regex_new { ($re:expr) => {{ use regex::internal::ExecBuilder; - ExecBuilder::new($re).bounded_backtracking() - .build().unwrap().into_regex() + ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex()) }} } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); -mod tests; +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod unicode; diff --git a/tests/test_plugin.rs b/tests/test_plugin.rs new file mode 100644 index 0000000000..c58c5a38e6 --- /dev/null +++ b/tests/test_plugin.rs @@ -0,0 +1,30 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![feature(plugin, test)] +#![plugin(regex_macros)] + +extern crate rand; +extern crate regex; +extern crate test; + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod plugin; +mod replace; +mod unicode; diff --git a/tests/tests.rs b/tests/tests.rs deleted file mode 100644 index 9deb0f3d7d..0000000000 --- a/tests/tests.rs +++ /dev/null @@ -1,509 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use regex::{Regex, NoExpand}; - -#[test] -fn eq() { - assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); -} - -#[test] -fn splitn() { - let re = regex!(r"\d+"); - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = re.splitn(text, 2).collect(); - assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); -} - -#[test] -fn split() { - let re = regex!(r"\d+"); - let text = "cauchy123plato456tyler789binx"; - let subs: Vec<&str> = re.split(text).collect(); - assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); -} - -#[test] -fn empty_regex_empty_match() { - let re = regex!(""); - let ms = re.find_iter("").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 0)]); -} - -#[test] -fn empty_regex_nonempty_match() { - let re = regex!(""); - let ms = re.find_iter("abc").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); -} - -#[test] -fn one_zero_length_match() { - let re = regex!(r"\d*"); - let ms = re.find_iter("a1b2").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 0), (1, 2), (3, 4)]); -} - -#[test] -fn many_zero_length_match() { - let re = regex!(r"\d*"); - let ms = re.find_iter("a1bbb2").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]); -} - -#[test] -fn many_sequential_zero_length_match() { - let re = regex!(r"\d?"); - let ms = re.find_iter("a12b3c").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)]); -} - -#[test] -fn quoted_bracket_set() { - let re = regex!(r"([\x{5b}\x{5d}])"); - let ms = re.find_iter("[]").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); - let re = regex!(r"([\[\]])"); - let ms = re.find_iter("[]").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -#[test] -fn first_range_starts_with_left_bracket() { - let re = regex!(r"([[-z])"); - let ms = re.find_iter("[]").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -#[test] -fn range_ends_with_escape() { - let re = regex!(r"([\[-\x{5d}])"); - let ms = re.find_iter("[]").collect::<Vec<_>>(); - assert_eq!(ms, vec![(0, 1), (1, 2)]); -} - -#[test] -fn empty_match_find_iter() { - let re = regex!(r".*?"); - let ms: Vec<_> = re.find_iter("abc").collect(); - assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); -} - -#[test] -fn empty_match_captures_iter() { - let re = regex!(r".*?"); - let ms: Vec<_> = re.captures_iter("abc") - .map(|c| c.pos(0).unwrap()) - .collect(); - assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); -} - -#[test] -fn empty_match_unicode_find_iter() { - let re = regex!(r".*?"); - let ms: Vec<_> = re.find_iter("Ⅰ1Ⅱ2").collect(); - assert_eq!(ms, vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)]); -} - -#[test] -fn empty_match_unicode_captures_iter() { - let re = 
regex!(r".*?"); - let ms: Vec<_> = re.captures_iter("Ⅰ1Ⅱ2") - .map(|c| c.pos(0).unwrap()) - .collect(); - assert_eq!(ms, vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)]); -} - -#[test] -fn invalid_regexes_no_crash() { - // See: https://github.com/rust-lang/regex/issues/48 - assert!(Regex::new("(*)").is_err()); - assert!(Regex::new("(?:?)").is_err()); - assert!(Regex::new("(?)").is_err()); - assert!(Regex::new("*").is_err()); -} - -macro_rules! searcher { - ($name:ident, $re:expr, $haystack:expr) => ( - searcher!($name, $re, $haystack, vec vec![]); - ); - ($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => ( - searcher!($name, $re, $haystack, vec vec![$($steps),*]); - ); - ($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => ( - searcher!($name, $re, $haystack, vec vec![$($steps),*]); - ); - ($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => ( - #[test] - #[allow(unused_imports)] - fn $name() { - searcher_expr! {{ - use std::str::pattern::{Pattern, Searcher}; - use std::str::pattern::SearchStep::{Match, Reject, Done}; - let re = regex!($re); - let mut se = re.into_searcher($haystack); - let mut got_steps = vec![]; - loop { - match se.next() { - Done => break, - step => { got_steps.push(step); } - } - } - assert_eq!(got_steps, $expect_steps); - }} - } - ); -} - -searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0)); -searcher!(searcher_empty_regex, r"", "ab", - Match(0, 0), Reject(0, 1), Match(1, 1), Reject(1, 2), Match(2, 2)); -searcher!(searcher_empty_haystack, r"\d", ""); -searcher!(searcher_one_match, r"\d", "5", - Match(0, 1)); -searcher!(searcher_no_match, r"\d", "a", - Reject(0, 1)); -searcher!(searcher_two_adjacent_matches, r"\d", "56", - Match(0, 1), Match(1, 2)); -searcher!(searcher_two_non_adjacent_matches, r"\d", "5a6", - Match(0, 1), Reject(1, 2), Match(2, 3)); -searcher!(searcher_reject_first, r"\d", "a6", - Reject(0, 1), Match(1, 2)); -searcher!(searcher_one_zero_length_matches, r"\d*", "a1b2", - Match(0, 0), // ^ - Reject(0, 1), // a - Match(1, 2), // a1 - Reject(2, 3), // a1b - Match(3, 4), // a1b2 -); -searcher!(searcher_many_zero_length_matches, r"\d*", "a1bbb2", - Match(0, 0), // ^ - Reject(0, 1), // a - Match(1, 2), // a1 - Reject(2, 3), // a1b - Match(3, 3), // a1bb - Reject(3, 4), // a1bb - Match(4, 4), // a1bbb - Reject(4, 5), // a1bbb - Match(5, 6), // a1bbba -); -searcher!(searcher_unicode, r".+?", "Ⅰ1Ⅱ2", - Match(0, 3), Match(3, 4), Match(4, 7), Match(7, 8)); - -macro_rules! 
replace( - ($name:ident, $which:ident, $re:expr, - $search:expr, $replace:expr, $result:expr) => ( - #[test] - fn $name() { - let re = regex!($re); - assert_eq!(re.$which($search, $replace), $result); - } - ); -); - -replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6"); -replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z"); -replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ"); -replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1"); -replace!(rep_double_dollar, replace, - r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1"); -replace!(rep_adjacent_index, replace, - r"([^aeiouy])ies$", "skies", "$1y", "sky"); -replace!(rep_no_expand, replace, - r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1"); -replace!(rep_named, replace_all, - r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", - "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3"); -replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", - "", "trim me"); -replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b"); -replace!(rep_number_underscore, replace, r"(.)(.)", "ab", "$1_$2", "a_b"); - -macro_rules! noparse( - ($name:ident, $re:expr) => ( - #[test] - fn $name() { - let re = $re; - match Regex::new(re) { - Err(_) => {}, - Ok(_) => panic!("Regex '{}' should cause a parse error.", re), - } - } - ); -); - -noparse!(fail_double_repeat, "a**"); -noparse!(fail_no_repeat_arg, "*"); -noparse!(fail_incomplete_escape, "\\"); -noparse!(fail_class_incomplete, "[A-"); -noparse!(fail_class_not_closed, "[A"); -noparse!(fail_class_no_begin, r"[\A]"); -noparse!(fail_class_no_end, r"[\z]"); -noparse!(fail_class_no_boundary, r"[\b]"); -noparse!(fail_open_paren, "("); -noparse!(fail_close_paren, ")"); -noparse!(fail_invalid_range, "[a-Z]"); -noparse!(fail_empty_capture_name, "(?P<>a)"); -noparse!(fail_empty_capture_exp, "(?P<name>)"); -noparse!(fail_bad_capture_name, "(?P<na-me>)"); -noparse!(fail_bad_flag, "(?a)a"); -noparse!(fail_empty_alt_before, "|a"); -noparse!(fail_empty_alt_after, "a|"); -noparse!(fail_too_big, "a{10000000}"); -noparse!(fail_counted_no_close, "a{1001"); -noparse!(fail_unfinished_cap, "(?"); -noparse!(fail_unfinished_escape, "\\"); -noparse!(fail_octal_digit, r"\8"); -noparse!(fail_hex_digit, r"\xG0"); -noparse!(fail_hex_short, r"\xF"); -noparse!(fail_hex_long_digits, r"\x{fffg}"); -noparse!(fail_flag_bad, "(?a)"); -noparse!(fail_flag_empty, "(?)"); -noparse!(fail_double_neg, "(?-i-i)"); -noparse!(fail_neg_empty, "(?i-)"); -noparse!(fail_empty_group, "()"); -noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)"); -noparse!(fail_range_end_no_class, "[a-[:lower:]]"); -noparse!(fail_range_end_no_begin, r"[a-\A]"); -noparse!(fail_range_end_no_end, r"[a-\z]"); -noparse!(fail_range_end_no_boundary, r"[a-\b]"); - -macro_rules! matiter( - ($name:ident, $re:expr, $text:expr) => ( - #[test] - fn $name() { - let text = $text; - let expected: Vec<(usize, usize)> = vec![]; - let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); - if expected != got { - panic!("For RE '{}' against '{:?}', \ - expected '{:?}' but got '{:?}'", - $re, text, expected, got); - } - } - ); - ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( - #[test] - fn $name() { - let text = $text; - let expected: Vec<_> = vec!($($loc)+); - let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); - if expected != got { - panic!("For RE '{}' against '{:?}', \ - expected '{:?}' but got '{:?}'", - $re, text, expected, got); - } - } - ); -); - -macro_rules! 
mat( - ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( - #[test] - fn $name() { - let text = $text; - let expected: Vec<Option<_>> = vec!($($loc)+); - let r = regex!($re); - let got = match r.captures(text) { - Some(c) => c.iter_pos().collect::<Vec<Option<_>>>(), - None => vec!(None), - }; - // The test set sometimes leave out capture groups, so truncate - // actual capture groups to match test set. - let mut sgot = &got[..]; - if sgot.len() > expected.len() { - sgot = &sgot[0..expected.len()] - } - if expected != sgot { - panic!("For RE '{}' against '{:?}', \ - expected '{:?}' but got '{:?}'", - $re, text, expected, sgot); - } - } - ); -); - -// Some crazy expressions from regular-expressions.info. -mat!(match_ranges, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", - "num: 255", Some((5, 8))); -mat!(match_ranges_not, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", - "num: 256", None); -mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))); -mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))); -mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))); -mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None); -mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", - "mine is jam.slam@gmail.com ", Some((8, 26))); -mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", - "mine is jam.slam@gmail ", None); -mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", - "mine is jam.slam@gmail.com ", Some((8, 26))); -mat!(match_date1, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-01-01", Some((0, 10))); -mat!(match_date2, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-00-01", None); -mat!(match_date3, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", - "1900-13-01", None); - -// Exercise the flags. -mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))); -mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))); -mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None); -mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))); -mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))); -mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None); -mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))); -mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))); -mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))); -mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))); -mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))); - -// More exercising of multi-line flag. 
-matiter!(match_multi_1, r"(?m)^[a-z]+$", "abc\ndef\nxyz", - (0, 3), (4, 7), (8, 11)); -matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz"); -matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", - (0, 0), (4, 4), (8, 8)); -matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", - (3, 3), (7, 7), (11, 11)); -matiter!(match_multi_5, r"(?m)^[a-z]", "abc\ndef\nxyz", - (0, 1), (4, 5), (8, 9)); -matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz"); -matiter!(match_multi_7, r"(?m)[a-z]$", "abc\ndef\nxyz", - (2, 3), (6, 7), (10, 11)); -matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz"); -matiter!(match_multi_9, r"(?m)^$", "", (0, 0)); - -matiter!(match_multi_rep_1, r"(?m)(?:^$)*", "a\nb\nc", - (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); -matiter!(match_multi_rep_2, r"(?m)(?:^|a)+", "a\naaa\n", - (0, 0), (2, 2), (3, 5), (6, 6)); -matiter!(match_multi_rep_3, r"(?m)(?:^|a)*", "a\naaa\n", - (0, 1), (2, 5), (6, 6)); -matiter!(match_multi_rep_4, r"(?m)(?:^[a-z])+", "abc\ndef\nxyz", - (0, 1), (4, 5), (8, 9)); -matiter!(match_multi_rep_5, r"(?m)(?:^[a-z]{3}\n?)+", "abc\ndef\nxyz", - (0, 11)); -matiter!(match_multi_rep_6, r"(?m)(?:^[a-z]{3}\n?)*", "abc\ndef\nxyz", - (0, 11)); -matiter!(match_multi_rep_7, r"(?m)(?:\n?[a-z]{3}$)+", "abc\ndef\nxyz", - (0, 11)); -matiter!(match_multi_rep_8, r"(?m)(?:\n?[a-z]{3}$)*", "abc\ndef\nxyz", - (0, 11)); -matiter!(match_multi_rep_9, r"(?m)^*", "\naa\n", - (0, 0), (1, 1), (2, 2), (3, 3), (4, 4)); -matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", - (0, 0), (1, 1), (4, 4)); -matiter!(match_multi_rep_11, r"(?m)$*", "\naa\n", - (0, 0), (1, 1), (2, 2), (3, 3), (4, 4)); -matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", - (0, 0), (3, 3), (4, 4)); -matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", - (0, 2), (5, 7)); -matiter!(match_multi_rep_14, r"(?m)(?:$\n)*", "\n\naaa\n\n", - (0, 2), (3, 3), (4, 4), (5, 7)); -matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", - (0, 2), (5, 7)); -matiter!(match_multi_rep_16, r"(?m)(?:^|$)+", "\n\naaa\n\n", - (0, 0), (1, 1), (2, 2), (5, 5), (6, 6), (7, 7)); -// matiter!(match_multi_rep_14, r"(?m)(?:$\n)*", "\n\naaa\n\n", - // (0, 2), (3, 3), (4, 4), (5, 7)); - -matiter!(match_start_end_empty, r"^$", "", (0, 0)); -matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0)); -matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0)); -matiter!(match_start_end_empty_rev, r"$^", "", (0, 0)); -matiter!(match_start_end_empty_rep, r"(?:^$)*", "a\nb\nc", - (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); -matiter!(match_start_end_empty_rep_rev, r"(?:$^)*", "a\nb\nc", - (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); - -// Some Unicode tests. -// A couple of these are commented out because something in the guts of macro -// expansion is creating invalid byte strings. -mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); -mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); -mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); -mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); -mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); -mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); -mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); -mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); - -// https://github.com/rust-lang/regex/issues/76 -mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); - -// Test the Unicode friendliness of Perl character classes. 
-mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); -mat!(uni_perl_w_not, r"\w+", "⥡", None); -mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); -mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); -mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); -mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); -mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); -mat!(uni_perl_s_not, r"\s+", "☃", None); -mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); - -// And do the same for word boundaries. -mat!(uni_boundary_none, r"\d\b", "6δ", None); -mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); - -// Test negated character classes. -mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3))); -mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3))); -mat!(negclass_letter_space, r"[^a\s]", "a x", Some((2, 3))); -mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3))); -mat!(negclass_space, r"[^\s]", " a", Some((1, 2))); -mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3))); -mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3))); -mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); - -// Regression test for https://github.com/rust-lang/regex/issues/75 -mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2))); -mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2))); - -// Regression tests for https://github.com/rust-lang/regex/issues/99 -mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None); -mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None); - -// Regression test for https://github.com/rust-lang/regex/issues/101 -mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1))); - -// Regression test for https://github.com/rust-lang-nursery/regex/issues/129 -#[test] -fn regression_captures_rep() { - let re = regex!(r"([a-f]){2}(?P<foo>[x-z])"); - let caps = re.captures("abx").unwrap(); - assert_eq!(caps.name("foo").unwrap(), "x"); -} - -// Regression test for https://github.com/rust-lang-nursery/regex/issues/153 -mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1))); -mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); - -mat!(one_unicode, r"☃", "☃", Some((0, 3))); - -// Regression test for https://github.com/rust-lang-nursery/regex/issues/169 -mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); - -// A whole mess of tests from Glenn Fowler's regex test suite. -// Generated by the 'src/etc/regex-match-tests' program. -#[path = "matches.rs"] -mod matches; diff --git a/tests/tests_set.rs b/tests/tests_set.rs deleted file mode 100644 index a0712c89ee..0000000000 --- a/tests/tests_set.rs +++ /dev/null @@ -1,45 +0,0 @@ -macro_rules! 
mat { - ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => { - #[test] - fn $name() { - let set = regex_set!($res); - assert!(set.is_match($text)); - let expected = vec![$($match_index),*]; - let matches = set.matches($text); - assert!(matches.matched_any()); - let got: Vec<_> = matches.into_iter().collect(); - assert_eq!(expected, got); - } - } -} - -mat!(set1, &["a", "a"], "a", 0, 1); -mat!(set2, &["a", "a"], "ba", 0, 1); -mat!(set3, &["a", "b"], "a", 0); -mat!(set4, &["a", "b"], "b", 1); -mat!(set5, &["a|b", "b|a"], "b", 0, 1); -mat!(set6, &["foo", "oo"], "foo", 0, 1); -mat!(set7, &["^foo", "bar$"], "foo", 0); -mat!(set8, &["^foo", "bar$"], "foo bar", 0, 1); -mat!(set9, &["^foo", "bar$"], "bar", 1); -mat!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1); -mat!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); -mat!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); -mat!(set13, &[r".*", "a"], "zzzzzza", 0, 1); -mat!(set14, &[r".*", "a"], "zzzzzz", 0); - -macro_rules! nomat { - ($name:ident, $res:expr, $text:expr) => { - #[test] - fn $name() { - let set = regex_set!($res); - assert!(!set.is_match($text)); - let matches = set.matches($text); - assert!(!matches.matched_any()); - assert_eq!(0, matches.into_iter().count()); - } - } -} - -nomat!(nset1, &["a", "a"], "b"); -nomat!(nset2, &["^foo", "bar$"], "bar foo"); diff --git a/tests/unicode.rs b/tests/unicode.rs new file mode 100644 index 0000000000..bac42ed5fa --- /dev/null +++ b/tests/unicode.rs @@ -0,0 +1,28 @@ +mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3))); +mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3))); +mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8))); +mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2))); +mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2))); +mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5))); +mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2))); +mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8))); +mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10))); + +// Test the Unicode friendliness of Perl character classes. +mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4))); +mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None); +mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3))); +mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8))); +mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None); +mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3))); +mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3))); +mat!(uni_perl_s_not, u!(r"\s+"), "☃", None); +mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3))); + +// And do the same for word boundaries. +mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None); +mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1))); +mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1))); +mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
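
As an illustrative aside (nothing below is part of the patch itself): the restructuring above gives every engine its own tests/test_*.rs entry point that defines a regex_new!/regex! macro pair for that engine and then includes the shared macros and test modules, so one test corpus runs against the default engine, the NFAs, the backtrackers, and the bytes variants alike. A condensed, single-file sketch of that pattern, written against the 0.1-era public API (Regex::new, with find returning byte offsets) and using a hypothetical simple_mat! macro as a stand-in for the shared mat! defined in tests/macros.rs, could look like this:

    // Sketch only: mirrors, in one file, the shape of tests/test_default.rs.
    // The real per-engine entry points differ only in the regex_new! body,
    // swapping in regex::internal::ExecBuilder to force a specific engine.
    extern crate regex;

    // Build a regex without unwrapping, so parse failures can be asserted on.
    macro_rules! regex_new {
        ($re:expr) => {{
            use regex::Regex;
            Regex::new($re)
        }};
    }

    // The happy-path constructor used by the bulk of the shared tests.
    macro_rules! regex {
        ($re:expr) => {
            regex_new!($re).unwrap()
        };
    }

    // Simplified stand-in for the shared mat! macro (assumed name: simple_mat!).
    // Asserts the byte offsets of the first match, using the 0.1-era find API.
    macro_rules! simple_mat {
        ($name:ident, $re:expr, $text:expr, $expected:expr) => {
            #[test]
            fn $name() {
                let re = regex!($re);
                assert_eq!(re.find($text), $expected);
            }
        };
    }

    simple_mat!(finds_digits, r"\d+", "abc123", Some((3, 6)));

    // Because regex_new! returns a Result, error cases need no unwrapping,
    // which is exactly how the shared noparse! macro checks parse failures.
    #[test]
    fn rejects_open_paren() {
        assert!(regex_new!("(").is_err());
    }

The reason everything routes through regex_new! is that assertion macros such as noparse! can inspect build errors directly, while regex! keeps the success path terse; since the shared modules (api, crazy, fowler, multiline, noparse, regression, replace, unicode, and friends) only ever construct regexes through these two macros, adding a new engine to the matrix amounts to adding one small entry file.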