From 7a52acb6031cc532a2d97e5fc3ddf460dff8e7d8 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 28 Apr 2018 13:20:09 -0400
Subject: [PATCH] regex: ban (?-u:\B) for Unicode regexes

The issue with the ASCII version of \B is that it can match between code
units of UTF-8, which means reported match indices can fall on invalid
UTF-8 boundaries. Therefore, similar to things like `(?-u:\xFF)`,
we ban negated ASCII word boundaries from Unicode regular expressions.
Normal ASCII word boundaries remain accessible from Unicode regular
expressions.

See #457
---
 regex-syntax/src/hir/translate.rs | 20 ++++++++------------
 tests/bytes.rs                    | 10 ++++++++++
 tests/regression.rs               |  5 -----
 tests/word_boundary_unicode.rs    |  2 --
 4 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 0cb60acfd8..8427fb2408 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -724,13 +724,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                     // It is possible for negated ASCII word boundaries to
                     // match at invalid UTF-8 boundaries, even when searching
                     // valid UTF-8.
-                    //
-                    // TODO(ag): Enable this error when regex goes to 1.0.
-                    // Otherwise, it is too steep of a breaking change.
-                    // if !self.trans().allow_invalid_utf8 {
-                        // return Err(self.error(
-                            // asst.span, ErrorKind::InvalidUtf8));
-                    // }
+                    if !self.trans().allow_invalid_utf8 {
+                        return Err(self.error(
+                            asst.span, ErrorKind::InvalidUtf8));
+                    }
                     hir::WordBoundary::AsciiNegate
                 })
             }
@@ -1511,11 +1508,10 @@ mod tests {
             t_bytes(r"(?-u)\B"),
             hir_word(hir::WordBoundary::AsciiNegate));
 
-        // TODO(ag): Enable this tests when regex goes to 1.0.
-        // assert_eq!(t_err(r"(?-u)\B"), TestError {
-            // kind: hir::ErrorKind::InvalidUtf8,
-            // span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
-        // });
+        assert_eq!(t_err(r"(?-u)\B"), TestError {
+            kind: hir::ErrorKind::InvalidUtf8,
+            span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
+        });
     }
 
     #[test]
diff --git a/tests/bytes.rs b/tests/bytes.rs
index 0285950a1a..0b0f008d6e 100644
--- a/tests/bytes.rs
+++ b/tests/bytes.rs
@@ -60,3 +60,13 @@ matiter!(invalidutf8_anchor3,
 fn negated_full_byte_range() {
      assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
 }
+
+matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(word_boundary_ascii2, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
+
+// See: https://github.com/rust-lang/regex/issues/264
+mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
+mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
+
+// See: https://github.com/rust-lang/regex/issues/271
+mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
diff --git a/tests/regression.rs b/tests/regression.rs
index 1bc79ac7e7..a09333e2c3 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -61,10 +61,6 @@ matiter!(word_boundary_dfa, r"\b", "a b c",
 // See: https://github.com/rust-lang/regex/issues/268
 matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
 
-// See: https://github.com/rust-lang/regex/issues/264
-mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
-mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
-
 // See: https://github.com/rust-lang/regex/issues/280
 ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
 ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
@@ -77,7 +73,6 @@ mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
      "CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));
 
 // See: https://github.com/rust-lang/regex/issues/271
-mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
 mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
 mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
 mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));
diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs
index 43612a91ac..c41355ffc4 100644
--- a/tests/word_boundary_unicode.rs
+++ b/tests/word_boundary_unicode.rs
@@ -4,5 +4,3 @@ matiter!(unicode1, r"\bx\b", "áxβ");
 matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
 
 matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
-matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
-matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));