From f6b68ba42714418e9e9cecad39d34fb212e28e75 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 3 Mar 2023 09:32:22 -0500 Subject: [PATCH] syntax: permit most no-op escape sequences This resolves a long-standing (but somewhat minor) complaint that folks have with the regex crate: it does not permit escaping punctuation characters in cases where those characters do not need to be escaped. So things like \/, \" and \! would result in parse errors. Most other regex engines permit these, even in cases where they aren't needed. I had been against doing this for future evolution purposes, but it's incredibly unlikely that we're ever going to add a new meta character to the syntax. I literally cannot think of any conceivable future in which that might happen. However, we do continue to ban escapes for [0-9A-Za-z<>], because it is conceivable that we might add new escape sequences for those characters. (And 0-9 are already banned by virtue of them looking too much like backreferences, which aren't supported.) For example, we could add \Q...\E literal syntax. Or \< and \> as start and end word boundaries, as found in POSIX regex engines. Fixes #501 --- regex-syntax/src/ast/mod.rs | 9 ++- regex-syntax/src/ast/parse.rs | 133 +++++++++++++++++++++------------- regex-syntax/src/ast/print.rs | 2 +- regex-syntax/src/lib.rs | 102 ++++++++++++++++++++++++-- 4 files changed, 185 insertions(+), 61 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index a79133272..8f73d5a62 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -588,9 +588,12 @@ impl Literal { pub enum LiteralKind { /// The literal is written verbatim, e.g., `a` or `☃`. Verbatim, - /// The literal is written as an escape because it is punctuation, e.g., - /// `\*` or `\[`. - Punctuation, + /// The literal is written as an escape because it is otherwise a special + /// regex meta character, e.g., `\*` or `\[`. + Meta, + /// The literal is written as an escape despite the fact that the escape is + /// unnecessary, e.g., `\%` or `\/`. + Superfluous, /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 93452cb18..901250f61 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -18,7 +18,7 @@ use alloc::{ use crate::{ ast::{self, Ast, Position, Span}, either::Either, - is_meta_character, + is_escapeable_character, is_meta_character, }; type Result = core::result::Result; @@ -1495,7 +1495,14 @@ impl<'s, P: Borrow> ParserI<'s, P> { if is_meta_character(c) { return Ok(Primitive::Literal(ast::Literal { span, - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, + c, + })); + } + if is_escapeable_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Superfluous, c, })); } @@ -1513,9 +1520,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), - ' ' if self.ignore_whitespace() => { - special(ast::SpecialLiteralKind::Space, ' ') - } 'A' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::StartText, @@ -2420,13 +2424,9 @@ mod tests { lit_with(c, span(start..start + c.len_utf8())) } - /// Create a punctuation literal starting at the given position. - fn punct_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Punctuation, - c, - }) + /// Create a meta literal starting at the given position. + fn meta_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. @@ -2710,24 +2710,24 @@ bar Ok(concat( 0..36, vec![ - punct_lit('\\', span(0..2)), - punct_lit('.', span(2..4)), - punct_lit('+', span(4..6)), - punct_lit('*', span(6..8)), - punct_lit('?', span(8..10)), - punct_lit('(', span(10..12)), - punct_lit(')', span(12..14)), - punct_lit('|', span(14..16)), - punct_lit('[', span(16..18)), - punct_lit(']', span(18..20)), - punct_lit('{', span(20..22)), - punct_lit('}', span(22..24)), - punct_lit('^', span(24..26)), - punct_lit('$', span(26..28)), - punct_lit('#', span(28..30)), - punct_lit('&', span(30..32)), - punct_lit('-', span(32..34)), - punct_lit('~', span(34..36)), + meta_lit('\\', span(0..2)), + meta_lit('.', span(2..4)), + meta_lit('+', span(4..6)), + meta_lit('*', span(6..8)), + meta_lit('?', span(8..10)), + meta_lit('(', span(10..12)), + meta_lit(')', span(12..14)), + meta_lit('|', span(14..16)), + meta_lit('[', span(16..18)), + meta_lit(']', span(18..20)), + meta_lit('{', span(20..22)), + meta_lit('}', span(22..24)), + meta_lit('^', span(24..26)), + meta_lit('$', span(26..28)), + meta_lit('#', span(28..30)), + meta_lit('&', span(30..32)), + meta_lit('-', span(32..34)), + meta_lit('~', span(34..36)), ] )) ); @@ -2879,23 +2879,12 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Literal(ast::Literal { span: span_range(pat, 4..6), - kind: ast::LiteralKind::Special( - ast::SpecialLiteralKind::Space - ), + kind: ast::LiteralKind::Superfluous, c: ' ', }), ] )) ); - // ... but only when `x` mode is enabled. - let pat = r"\ "; - assert_eq!( - parser(pat).parse().unwrap_err(), - TestError { - span: span_range(pat, 0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); } #[test] @@ -4246,7 +4235,7 @@ bar parser(r"\|").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..2), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '|', })) ); @@ -4297,11 +4286,26 @@ bar })) ); + // We also support superfluous escapes in most cases now too. + for c in ['!', '@', '%', '"', '\'', '/', ' '] { + let pat = format!(r"\{}", c); + assert_eq!( + parser(&pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Superfluous, + c, + })) + ); + } + + // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This + // gives flexibility for future evolution. assert_eq!( - parser(r"\").parse_escape().unwrap_err(), + parser(r"\e").parse_escape().unwrap_err(), TestError { - span: span(0..1), - kind: ast::ErrorKind::EscapeUnexpectedEof, + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, } ); assert_eq!( @@ -4311,6 +4315,31 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); + // But also, < and > are banned, so that we may evolve them into + // start/end word boundary assertions. (Not sure if we will...) + assert_eq!( + parser(r"\<").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + assert_eq!( + parser(r"\>").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + + // An unfinished escape is illegal. + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); } #[test] @@ -4907,7 +4936,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: ']', }), ] @@ -4925,7 +4954,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '-', }), lit(span(4..5), 'z'), @@ -5117,7 +5146,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '^', })), itemset(lit(span(5..6), '^')), @@ -5133,7 +5162,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '&', })), itemset(lit(span(5..6), '&')), @@ -5198,7 +5227,7 @@ bar lit(span(1..2), ']'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '[', }), ] @@ -5216,7 +5245,7 @@ bar kind: itemset(ast::ClassSetItem::Literal( ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '[', } )), diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 40f967cfa..86a87e143 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -216,7 +216,7 @@ impl Writer { match ast.kind { Verbatim => self.wtr.write_char(ast.c), - Punctuation => write!(self.wtr, r"\{}", ast.c), + Meta | Superfluous => write!(self.wtr, r"\{}", ast.c), Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), HexFixed(ast::HexLiteralKind::X) => { write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 10540cab5..4953641d7 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -215,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) { /// Returns true if the given character has significance in a regex. /// -/// These are the only characters that are allowed to be escaped, with one -/// exception: an ASCII space character may be escaped when extended mode (with -/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns -/// `false`. +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For +/// example, `-` is treated as a meta character because of its significance +/// for writing ranges inside of character classes, but the regex `-` will +/// match a literal `-` because `-` has no special meaning outside of character +/// classes. +/// +/// In order to determine whether a character may be escaped at all, the +/// [`is_escapeable_character`] routine should be used. The difference between +/// `is_meta_character` and `is_escapeable_character` is that the latter will +/// return true for some characters that are _not_ meta characters. For +/// example, `%` and `\%` both match a literal `%` in all contexts. In other +/// words, `is_escapeable_character` includes "superfluous" escapes. /// /// Note that the set of characters for which this function returns `true` or -/// `false` is fixed and won't change in a semver compatible release. +/// `false` is fixed and won't change in a semver compatible release. (In this +/// case, "semver compatible release" actually refers to the `regex` crate +/// itself, since reducing or expanding the set of meta characters would be a +/// breaking change for not just `regex-syntax` but also `regex` itself.) +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_meta_character; +/// +/// assert!(is_meta_character('?')); +/// assert!(is_meta_character('-')); +/// assert!(is_meta_character('&')); +/// assert!(is_meta_character('#')); +/// +/// assert!(!is_meta_character('%')); +/// assert!(!is_meta_character('/')); +/// assert!(!is_meta_character('!')); +/// assert!(!is_meta_character('"')); +/// assert!(!is_meta_character('e')); +/// ``` pub fn is_meta_character(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' @@ -230,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool { } } +/// Returns true if the given character can be escaped in a regex. +/// +/// This returns true in all cases that `is_meta_character` returns true, but +/// also returns true in some cases where `is_meta_character` returns false. +/// For example, `%` is not a meta character, but it is escapeable. That is, +/// `%` and `\%` both match a literal `%` in all contexts. +/// +/// The purpose of this routine is to provide knowledge about what characters +/// may be escaped. Namely, most regex engines permit "superfluous" escapes +/// where characters without any special significance may be escaped even +/// though there is no actual _need_ to do so. +/// +/// This will return false for some characters. For example, `e` is not +/// escapeable. Therefore, `\e` will either result in a parse error (which is +/// true today), or it could backwards compatibly evolve into a new construct +/// with its own meaning. Indeed, that is the purpose of banning _some_ +/// superfluous escapes: it provides a way to evolve the syntax in a compatible +/// manner. +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_escapeable_character; +/// +/// assert!(is_escapeable_character('?')); +/// assert!(is_escapeable_character('-')); +/// assert!(is_escapeable_character('&')); +/// assert!(is_escapeable_character('#')); +/// assert!(is_escapeable_character('%')); +/// assert!(is_escapeable_character('/')); +/// assert!(is_escapeable_character('!')); +/// assert!(is_escapeable_character('"')); +/// +/// assert!(!is_escapeable_character('e')); +/// ``` +pub fn is_escapeable_character(c: char) -> bool { + // Certainly escapeable if it's a meta character. + if is_meta_character(c) { + return true; + } + // Any character that isn't ASCII is definitely not escapeable. There's + // no real need to allow things like \☃ right? + if !c.is_ascii() { + return false; + } + // Otherwise, we basically say that everything is escapeable unless it's a + // letter or digit. Things like \3 are either octal (when enabled) or an + // error, and we should keep it that way. Otherwise, letters are reserved + // for adding new syntax in a backwards compatible way. + match c { + '0'..='9' | 'A'..='Z' | 'a'..='z' => false, + // While not currently supported, we keep these as not escapeable to + // give us some flexibility with respect to supporting the \< and + // \> word boundary assertions in the future. By rejecting them as + // escapeable, \< and \> will result in a parse error. Thus, we can + // turn them into something else in the future without it being a + // backwards incompatible change. + '<' | '>' => false, + _ => true, + } +} + /// Returns true if and only if the given character is a Unicode word /// character. ///