Skip to content

Commit

Permalink
regex: ban (?-u:\B) for Unicode regexes
Browse files Browse the repository at this point in the history
The issue with the ASCII version of \B is that it can match between the
code units of a UTF-8 encoded codepoint, which means that reported match
indices can fall on invalid UTF-8 boundaries. Therefore, similar to things
like `(?-u:\xFF)`, we ban negated ASCII word boundaries from Unicode
regular expressions. Normal ASCII word boundaries remain accessible from
Unicode regular expressions.

See rust-lang#457
  • Loading branch information
BurntSushi committed May 1, 2018
1 parent 8ad256b commit 7a52acb
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 19 deletions.
20 changes: 8 additions & 12 deletions regex-syntax/src/hir/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,13 +724,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
// It is possible for negated ASCII word boundaries to
// match at invalid UTF-8 boundaries, even when searching
// valid UTF-8.
//
// TODO(ag): Enable this error when regex goes to 1.0.
// Otherwise, it is too steep of a breaking change.
// if !self.trans().allow_invalid_utf8 {
// return Err(self.error(
// asst.span, ErrorKind::InvalidUtf8));
// }
if !self.trans().allow_invalid_utf8 {
return Err(self.error(
asst.span, ErrorKind::InvalidUtf8));
}
hir::WordBoundary::AsciiNegate
})
}
Expand Down Expand Up @@ -1511,11 +1508,10 @@ mod tests {
t_bytes(r"(?-u)\B"),
hir_word(hir::WordBoundary::AsciiNegate));

// TODO(ag): Enable this tests when regex goes to 1.0.
// assert_eq!(t_err(r"(?-u)\B"), TestError {
// kind: hir::ErrorKind::InvalidUtf8,
// span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
// });
assert_eq!(t_err(r"(?-u)\B"), TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
});
}

#[test]
Expand Down
10 changes: 10 additions & 0 deletions tests/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,13 @@ matiter!(invalidutf8_anchor3,
// Checks that building a `bytes::Regex` from a class that negates the whole
// byte range `\x00-\xff` (with Unicode mode disabled) returns an error.
fn negated_full_byte_range() {
assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
}

// Negated ASCII word boundary (`(?-u:\B)`) cases. The haystacks contain
// multi-byte UTF-8 sequences, and the offsets expected by the second case
// include positions inside the 4-byte encoding of U+7EF5E.
// NOTE(review): presumably `matiter!` compiles these with the bytes regex API
// in this file - confirm against the macro definition.
matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(word_boundary_ascii2, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));

// See: https://github.com/rust-lang/regex/issues/264
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));

// See: https://github.com/rust-lang/regex/issues/271
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
5 changes: 0 additions & 5 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,6 @@ matiter!(word_boundary_dfa, r"\b", "a b c",
// See: https://github.com/rust-lang/regex/issues/268
matiter!(partial_anchor, r"^a|b", "ba", (0, 1));

// See: https://github.com/rust-lang/regex/issues/264
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));

// See: https://github.com/rust-lang/regex/issues/280
ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
Expand All @@ -77,7 +73,6 @@ mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
"CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));

// See: https://github.com/rust-lang/regex/issues/271
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));
Expand Down
2 changes: 0 additions & 2 deletions tests/word_boundary_unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,3 @@ matiter!(unicode1, r"\bx\b", "áxβ");
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));

0 comments on commit 7a52acb

Please sign in to comment.