From b6085a96c246c61b8ad41ec5e2bfb95b52618943 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Mar 2023 22:28:42 -0500 Subject: [PATCH] syntax: add support for CRLF-aware line anchors This adds Look::StartCRLF and Look::EndCRLF. And also adds a new flag, 'R', for making ^/$ be CRLF aware in multi-line mode. The 'R' flag also causes '.' to *not* match \r in addition to \n (unless the 's' flag is enabled of course). The intended semantics are that CRLF mode makes \r\n, \r and \n line terminators but with one key property: \r\n is treated as a single line terminator. That is, ^/$ do not match between \r and \n. This partially addresses #244 by adding syntax support. Currently, if you try to use this new flag, the regex compiler will report an error. We intend to finish support for this once #656 is complete. (Indeed, at time of writing, CRLF matching works in regex-automata.) --- regex-syntax/src/ast/mod.rs | 2 + regex-syntax/src/ast/parse.rs | 30 ++++++++ regex-syntax/src/ast/print.rs | 1 + regex-syntax/src/hir/mod.rs | 83 +++++++++++++++++------ regex-syntax/src/hir/print.rs | 6 ++ regex-syntax/src/hir/translate.rs | 109 ++++++++++++++++++++++++++++-- regex-syntax/src/parser.rs | 17 +++++ src/compile.rs | 6 ++ 8 files changed, 226 insertions(+), 28 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index f36f27791..9be867c56 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -1314,6 +1314,8 @@ pub enum Flag { SwapGreed, /// `u` Unicode, + /// `R` + CRLF, /// `x` IgnoreWhitespace, } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 1d6d4d046..93452cb18 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1381,6 +1381,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { 's' => Ok(ast::Flag::DotMatchesNewLine), 'U' => Ok(ast::Flag::SwapGreed), 'u' => Ok(ast::Flag::Unicode), + 'R' => Ok(ast::Flag::CRLF), 'x' => Ok(ast::Flag::IgnoreWhitespace), _ => { 
Err(self @@ -4084,6 +4085,34 @@ bar ], }) ); + assert_eq!( + parser("i-sR:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), + }, + ], + }) + ); assert_eq!( parser("isU").parse_flags().unwrap_err(), @@ -4145,6 +4174,7 @@ bar assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); assert_eq!( diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 0922ea0e3..40f967cfa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -289,6 +289,7 @@ impl Writer { Flag::DotMatchesNewLine => self.wtr.write_str("s"), Flag::SwapGreed => self.wtr.write_str("U"), Flag::Unicode => self.wtr.write_str("u"), + Flag::CRLF => self.wtr.write_str("R"), Flag::IgnoreWhitespace => self.wtr.write_str("x"), }, }?; diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index a2465e98a..ae361f48a 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -471,10 +471,12 @@ impl Hir { /// Returns an HIR expression for `.`. /// - /// * [`Dot::AnyChar`] maps to `(?su:.)`. - /// * [`Dot::AnyByte`] maps to `(?s-u:.)`. - /// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`. - /// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`. + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. 
+ /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// /// Note that this is a convenience routine for constructing the correct /// character class based on the value of `Dot`. There is no explicit "dot" @@ -492,18 +494,32 @@ impl Hir { cls.push(ClassBytesRange::new(b'\0', b'\xFF')); Hir::class(Class::Bytes(cls)) } - Dot::AnyCharExceptNL => { + Dot::AnyCharExceptLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } - Dot::AnyByteExceptNL => { + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExceptLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); Hir::class(Class::Bytes(cls)) } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } } } } @@ -1365,6 +1381,16 @@ pub enum Look { /// at the end position of the input, or at the position immediately /// preceding a `\n` character. EndLF, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF, + /// Match the end of a line or the end of text. 
Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF, /// Match an ASCII-only word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. @@ -1380,30 +1406,34 @@ pub enum Look { } impl Look { - fn from_repr(repr: u8) -> Option { + fn from_repr(repr: u16) -> Option { match repr { 0 => Some(Look::Start), 1 => Some(Look::End), 2 => Some(Look::StartLF), 3 => Some(Look::EndLF), - 4 => Some(Look::WordAscii), - 5 => Some(Look::WordAsciiNegate), - 6 => Some(Look::WordUnicode), - 7 => Some(Look::WordUnicodeNegate), + 4 => Some(Look::StartCRLF), + 5 => Some(Look::EndCRLF), + 6 => Some(Look::WordAscii), + 7 => Some(Look::WordAsciiNegate), + 8 => Some(Look::WordUnicode), + 9 => Some(Look::WordUnicodeNegate), _ => None, } } - fn as_repr(&self) -> u8 { + fn as_repr(&self) -> u16 { match *self { Look::Start => 0, Look::End => 1, Look::StartLF => 2, Look::EndLF => 3, - Look::WordAscii => 4, - Look::WordAsciiNegate => 5, - Look::WordUnicode => 6, - Look::WordUnicodeNegate => 7, + Look::StartCRLF => 4, + Look::EndCRLF => 5, + Look::WordAscii => 6, + Look::WordAsciiNegate => 7, + Look::WordUnicode => 8, + Look::WordUnicodeNegate => 9, } } @@ -1413,6 +1443,8 @@ impl Look { Look::End => 'z', Look::StartLF => '^', Look::EndLF => '$', + Look::StartCRLF => '^', + Look::EndCRLF => '$', Look::WordAscii => 'b', Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', @@ -1505,11 +1537,20 @@ pub enum Dot { /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. /// /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. - AnyCharExceptNL, + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`.
+ /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, /// Matches any byte value except for `\n`. /// /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. - AnyByteExceptNL, + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, } /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack @@ -2038,7 +2079,7 @@ impl Properties { /// example, an [`Hir`] provides properties that return `LookSet`s. #[derive(Clone, Copy, Default, Eq, PartialEq)] pub struct LookSet { - bits: u8, + bits: u16, } impl LookSet { @@ -2170,8 +2211,8 @@ impl Iterator for LookSetIter { #[inline] fn next(&mut self) -> Option { // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a usize. - let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap(); + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); let look = Look::from_repr(repr)?; self.set.remove(look); Some(look) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 40f8905b7..fcb7cd252 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -177,6 +177,12 @@ impl Visitor for Writer { hir::Look::EndLF => { self.wtr.write_str("(?m:$)")?; } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; + } + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; + } hir::Look::WordAscii => { self.wtr.write_str(r"(?-u:\b)")?; } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index b1ebf7b17..c1ebf85c2 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -85,6 +85,12 @@ impl TranslatorBuilder { self } + /// Enable or disable the CRLF mode flag (`R`) by default. 
+ pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + /// Enable or disable the "swap greed" flag (`U`) by default. pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.swap_greed = if yes { Some(true) } else { None }; @@ -866,14 +872,23 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_assertion(&self, asst: &ast::Assertion) -> Result { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); Ok(match asst.kind { ast::AssertionKind::StartLine => Hir::look(if multi_line { - hir::Look::StartLF + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } } else { hir::Look::Start }), ast::AssertionKind::EndLine => Hir::look(if multi_line { - hir::Look::EndLF + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } } else { hir::Look::End }), @@ -1146,6 +1161,7 @@ struct Flags { dot_matches_new_line: Option, swap_greed: Option, unicode: Option, + crlf: Option, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. 
} @@ -1174,6 +1190,9 @@ impl Flags { ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } @@ -1196,6 +1215,9 @@ impl Flags { if self.unicode.is_none() { self.unicode = previous.unicode; } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } } fn dot(&self) -> hir::Dot { @@ -1207,9 +1229,17 @@ impl Flags { } } else { if self.unicode() { - hir::Dot::AnyCharExceptNL + if self.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + hir::Dot::AnyCharExceptLF + } } else { - hir::Dot::AnyByteExceptNL + if self.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExceptLF + } } } } @@ -1233,6 +1263,10 @@ impl Flags { fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { @@ -1678,14 +1712,32 @@ mod tests { fn dot() { assert_eq!( t("."), - hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) ); - assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), - hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) ); assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. 
assert_eq!( @@ -1698,6 +1750,16 @@ mod tests { ), } ); + assert_eq!( + t_err("(?R-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); assert_eq!( t_err("(?s-u)."), TestError { @@ -1708,6 +1770,16 @@ mod tests { ), } ); + assert_eq!( + t_err("(?Rs-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(8, 1, 9) + ), + } + ); } #[test] @@ -1795,6 +1867,29 @@ mod tests { ); } + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); + } + #[test] fn flags() { #[cfg(feature = "unicode-case")] diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 8eb88e042..2851cda33 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -134,6 +134,23 @@ impl ParserBuilder { self } + /// Enable or disable the CRLF mode flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. 
+ /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.crlf(yes); + self + } + /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively diff --git a/src/compile.rs b/src/compile.rs index 9ee52354d..20eebf0ed 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -326,6 +326,12 @@ impl Compiler { self.byte_classes.set_range(b'\n', b'\n'); self.c_empty_look(prog::EmptyLook::EndLine) } + hir::Look::StartCRLF | hir::Look::EndCRLF => { + return Err(Error::Syntax( + "CRLF-aware line anchors are not supported yet" + .to_string(), + )); + } hir::Look::WordAscii => { self.byte_classes.set_word_boundary(); self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)