From 7a52acb6031cc532a2d97e5fc3ddf460dff8e7d8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 28 Apr 2018 13:20:09 -0400 Subject: [PATCH] regex: ban (?-u:\B) for Unicode regexes The issue with the ASCII version of \B is that it can match between code units of UTF-8, which means it can cause reported match indices to fall on invalid UTF-8 boundaries. Therefore, similar to things like `(?-u:\xFF)`, we ban negated ASCII word boundaries from Unicode regular expressions. Normal ASCII word boundaries remain accessible from Unicode regular expressions. See #457 --- regex-syntax/src/hir/translate.rs | 20 ++++++++------------ tests/bytes.rs | 10 ++++++++++ tests/regression.rs | 5 ----- tests/word_boundary_unicode.rs | 2 -- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 0cb60acfd8..8427fb2408 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -724,13 +724,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // It is possible for negated ASCII word boundaries to // match at invalid UTF-8 boundaries, even when searching // valid UTF-8. - // - // TODO(ag): Enable this error when regex goes to 1.0. - // Otherwise, it is too steep of a breaking change. - // if !self.trans().allow_invalid_utf8 { - // return Err(self.error( - // asst.span, ErrorKind::InvalidUtf8)); - // } + if !self.trans().allow_invalid_utf8 { + return Err(self.error( + asst.span, ErrorKind::InvalidUtf8)); + } hir::WordBoundary::AsciiNegate }) } @@ -1511,11 +1508,10 @@ mod tests { t_bytes(r"(?-u)\B"), hir_word(hir::WordBoundary::AsciiNegate)); - // TODO(ag): Enable this tests when regex goes to 1.0. 
- // assert_eq!(t_err(r"(?-u)\B"), TestError { - // kind: hir::ErrorKind::InvalidUtf8, - // span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)), - // }); + assert_eq!(t_err(r"(?-u)\B"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)), + }); } #[test] diff --git a/tests/bytes.rs b/tests/bytes.rs index 0285950a1a..0b0f008d6e 100644 --- a/tests/bytes.rs +++ b/tests/bytes.rs @@ -60,3 +60,13 @@ matiter!(invalidutf8_anchor3, fn negated_full_byte_range() { assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err()); } + +matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(word_boundary_ascii2, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); + +// See: https://github.com/rust-lang/regex/issues/264 +mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); + +// See: https://github.com/rust-lang/regex/issues/271 +mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); diff --git a/tests/regression.rs b/tests/regression.rs index 1bc79ac7e7..a09333e2c3 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -61,10 +61,6 @@ matiter!(word_boundary_dfa, r"\b", "a b c", // See: https://github.com/rust-lang/regex/issues/268 matiter!(partial_anchor, r"^a|b", "ba", (0, 1)); -// See: https://github.com/rust-lang/regex/issues/264 -mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); -mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); - // See: https://github.com/rust-lang/regex/issues/280 ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false); ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false); @@ -77,7 +73,6 @@ mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P[0-9]+)$", "CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8))); // See: 
https://github.com/rust-lang/regex/issues/271 -mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4))); mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4))); mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4))); diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs index 43612a91ac..c41355ffc4 100644 --- a/tests/word_boundary_unicode.rs +++ b/tests/word_boundary_unicode.rs @@ -4,5 +4,3 @@ matiter!(unicode1, r"\bx\b", "áxβ"); matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); -matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); -matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));