From e4a99a011fde850083c0f1394e8f38fcb4e60ca7 Mon Sep 17 00:00:00 2001 From: Ambareesh Balaji Date: Fri, 30 Aug 2024 04:51:44 +0000 Subject: [PATCH] disable unicode in network filter regex --- src/engine.rs | 7 ++++--- src/filters/network.rs | 17 ++++++++++------- src/optimizer.rs | 8 +++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 1dfecaee..7dd3bd93 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -768,14 +768,15 @@ mod tests { "script").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ - { + // fails - unicode not supported in network filter + /*{ let engine = Engine::from_rules_debug([r#"/tesT߶/$domain=example.com"#], Default::default()); let request = Request::new("https://example.com/tesT߶", "https://example.com", "script").unwrap(); assert!(engine.check_network_request(&request).matched); - } - // fails - punycoded domain + }*/ + // fails - unicode not supported in network filter /*{ let engine = Engine::from_rules_debug([r#"/tesT߶/$domain=example.com"#], Default::default()); let request = Request::new("https://example-tesT߶.com/tesT", diff --git a/src/filters/network.rs b/src/filters/network.rs index c0c6d096..44e81ce3 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -3,7 +3,10 @@ use memchr::{memchr as find_char, memmem, memrchr as find_char_reverse}; use once_cell::sync::Lazy; -use regex::{Regex, RegexSet}; +use regex::{ + bytes::Regex as BytesRegex, bytes::RegexBuilder as BytesRegexBuilder, + bytes::RegexSet as BytesRegexSet, bytes::RegexSetBuilder as BytesRegexSetBuilder, Regex, +}; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -180,8 +183,8 @@ impl From<&request::RequestType> for NetworkFilterMask { #[derive(Debug, Clone)] pub enum CompiledRegex { - Compiled(Regex), - CompiledSet(RegexSet), + Compiled(BytesRegex), + CompiledSet(BytesRegexSet), MatchAll, RegexParsingError(regex::Error), } @@ -191,11 +194,11 @@ impl CompiledRegex { match &self { CompiledRegex::MatchAll => true, // simple case for matching everything, e.g. for empty filter CompiledRegex::RegexParsingError(_e) => false, // no match if regex didn't even compile - CompiledRegex::Compiled(r) => r.is_match(pattern), + CompiledRegex::Compiled(r) => r.is_match(pattern.as_bytes()), CompiledRegex::CompiledSet(r) => { // let matches: Vec<_> = r.matches(pattern).into_iter().collect(); // println!("Matching {} against RegexSet: {:?}", pattern, matches); - r.is_match(pattern) + r.is_match(pattern.as_bytes()) } } } @@ -1225,7 +1228,7 @@ pub fn compile_regex( CompiledRegex::MatchAll } else if escaped_patterns.len() == 1 { let pattern = &escaped_patterns[0]; - match Regex::new(pattern) { + match BytesRegexBuilder::new(pattern).unicode(false).build() { Ok(compiled) => CompiledRegex::Compiled(compiled), Err(e) => { // println!("Regex parsing failed ({:?})", e); @@ -1233,7 +1236,7 @@ pub fn compile_regex( } } } else { - match RegexSet::new(escaped_patterns) { + match BytesRegexSetBuilder::new(escaped_patterns).unicode(false).build() { Ok(compiled) => CompiledRegex::CompiledSet(compiled), Err(e) => CompiledRegex::RegexParsingError(e), } diff --git a/src/optimizer.rs b/src/optimizer.rs index 986e29fe..8efdf6b0 100644 --- a/src/optimizer.rs +++ b/src/optimizer.rs @@ -209,7 +209,7 @@ mod optimization_tests_pattern_group { use crate::lists; use crate::regex_manager::RegexManager; use crate::request::Request; - use regex::RegexSet; + use regex::bytes::RegexSetBuilder as BytesRegexSetBuilder; fn check_regex_match(regex: &CompiledRegex, pattern: &str, matches: bool) { let is_match = regex.is_match(pattern); @@ -244,13 +244,15 @@ mod optimization_tests_pattern_group { #[test] fn regex_set_works() { - let regex_set = RegexSet::new(&[ + let regex_set = BytesRegexSetBuilder::new(&[ r"/static/ad\.", "/static/ad-", "/static/ad/.*", "/static/ads/.*", "/static/adv/.*", - ]); + ]) + .unicode(false) + .build(); let fused_regex = CompiledRegex::CompiledSet(regex_set.unwrap()); assert!(matches!(fused_regex, CompiledRegex::CompiledSet(_)));