From c436bfdf2232deab2dd4027ec9756ecc2ea23555 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Wed, 22 Feb 2017 16:41:03 +1100 Subject: [PATCH] Support nested character classes and intersection with `&&` This implements parts of UTS#18 RL1.3, namely: * Nested character classes, e.g.: `[a[b-c]]` * Intersections in classes, e.g.: `[\w&&\p{Greek}]` They can be combined to do things like `[\w&&[^a]]` to get all word characters except `a`. Fixes #341 --- regex-syntax/src/lib.rs | 159 ++++++++++++- regex-syntax/src/parser.rs | 462 ++++++++++++++++++++++++++++++++----- 2 files changed, 567 insertions(+), 54 deletions(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index d10b2a50ec..0e16d04e51 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -680,7 +680,7 @@ impl CharClass { self.canonicalize() } - /// Canonicalze any sequence of ranges. + /// Canonicalize any sequence of ranges. /// /// This is responsible for enforcing the canonical format invariants /// as described on the docs for the `CharClass` type. @@ -703,6 +703,43 @@ impl CharClass { ordered } + /// Calculate the intersection of two canonical character classes. + /// + /// The returned intersection is canonical. + fn intersection(&self, other: &CharClass) -> CharClass { + if self.ranges.is_empty() || other.ranges.is_empty() { + return CharClass::empty(); + } + + let mut intersection = CharClass::empty(); + + let mut iter_a = self.ranges.iter(); + let mut iter_b = other.ranges.iter(); + let mut a = iter_a.next().unwrap(); + let mut b = iter_b.next().unwrap(); + loop { + if let Some(i) = a.intersection(&b) { + intersection.ranges.push(i); + } + + // If the range with the smaller end didn't match this time, + // it won't ever match, so move on to the next one. + let (iter, item) = if a.end < b.end { + (&mut iter_a, &mut a) + } else { + (&mut iter_b, &mut b) + }; + if let Some(v) = iter.next() { + *item = v; + } else { + // No more ranges to check, done. 
+ break; + } + } + + intersection.canonicalize() + } + /// Negates the character class. /// /// For all `c` where `c` is a Unicode scalar value, `c` matches `self` @@ -801,6 +838,18 @@ impl ClassRange { max(self.start, other.start) <= inc_char(min(self.end, other.end)) } + /// Returns the intersection of the two ranges if they have common + /// characters, `None` otherwise. + fn intersection(&self, other: &ClassRange) -> Option<ClassRange> { + let start = max(self.start, other.start); + let end = min(self.end, other.end); + if start <= end { + Some(ClassRange::new(start, end)) + } else { + None + } + } + /// Creates a new range representing the union of `self` and `other. fn merge(self, other: ClassRange) -> ClassRange { ClassRange { @@ -1907,6 +1956,108 @@ mod tests { ])); } + #[test] + fn class_intersection_empty() { + let cls1 = class(&[]); + let cls2 = class(&[('a', 'a')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_single_equal() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('a', 'a')]); + assert_intersection(cls1, cls2, class(&[('a', 'a')])); + } + + #[test] + fn class_intersection_single_unequal() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('b', 'b')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_single_in_other() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('a', 'c')]); + assert_intersection(cls1, cls2, class(&[('a', 'a')])); + } + + #[test] + fn class_intersection_range_in_other() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('a', 'c')]); + assert_intersection(cls1, cls2, class(&[('a', 'b')])); + } + + #[test] + fn class_intersection_range_intersection() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('b', 'c')]); + assert_intersection(cls1, cls2, class(&[('b', 'b')])); + } + + #[test] + fn class_intersection_only_adjacent() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('c', 'd')]);
assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_range_subset() { + let cls1 = class(&[('b', 'c')]); + let cls2 = class(&[('a', 'd')]); + assert_intersection(cls1, cls2, class(&[('b', 'c')])); + } + + #[test] + fn class_intersection_many_ranges_in_one_big() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('a', 'h')]); + assert_intersection(cls1, cls2, class(&[ + ('a', 'b'), ('d', 'e'), ('g', 'h') + ])); + } + + #[test] + fn class_intersection_many_ranges_same() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_intersection(cls1, cls2, class(&[ + ('a', 'b'), ('d', 'e'), ('g', 'h') + ])); + } + + #[test] + fn class_intersection_multiple_non_intersecting() { + let cls1 = class(&[('a', 'b'), ('g', 'h')]); + let cls2 = class(&[('d', 'e'), ('k', 'l')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_non_intersecting_then_intersecting() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('h', 'h')]); + assert_intersection(cls1, cls2, class(&[('h', 'h')])); + } + + #[test] + fn class_intersection_adjacent_alternating() { + let cls1 = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_overlapping_alternating() { + let cls1 = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + assert_intersection(cls1, cls2, class(&[('b', 'f')])); + } + #[test] fn class_canon_overlap_many_case_fold() { let cls = class(&[ @@ -2056,4 +2207,10 @@ mod tests { let expr = e("(?-u)[-./]"); assert_eq!("(?-u:[-\\.-/])", expr.to_string()); } + + fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) { + // intersection operation should be commutative + 
+ assert_eq!(cls1.intersection(&cls2), expected); + assert_eq!(cls2.intersection(&cls1), expected); + } } diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index c2aca269bc..32b03072f9 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -526,14 +526,98 @@ impl Parser { // Parses a character class, e.g., `[^a-zA-Z0-9]+`. // + // If the Unicode flag is enabled, the class is returned as a `CharClass`, + // otherwise it is converted to a `ByteClass`. + // // Start: `[` // End: `+` fn parse_class(&mut self) -> Result<Build> { + let class = try!(self.parse_class_as_chars()); + Ok(Build::Expr(if self.flags.unicode { + Expr::Class(class) + } else { + let byte_class = class.to_byte_class(); + + // If `class` was only non-empty due to multibyte characters, the + // corresponding byte class will now be empty. + // + // See https://github.com/rust-lang/regex/issues/303 + if byte_class.is_empty() { + // e.g., (?-u)[^\x00-\xFF] + return Err(self.err(ErrorKind::EmptyClass)); + } + + Expr::ClassBytes(byte_class) + })) + } + + // Parses a character class as a `CharClass`, e.g., `[^a-zA-Z0-9]+`. + // + // This does not convert to a `ByteClass` yet, so that it can be used for + // nested character classes.
+ // + // Start: `[` + // End: `+` + fn parse_class_as_chars(&mut self) -> Result<CharClass> { self.bump(); let negated = self.bump_if('^'); + + let mut class = try!(self.parse_class_set(true)); + loop { + match self.cur() { + ']' => { + // end of the class + self.bump(); + break; + } + '&' => { + // intersection with `&&` + self.bump(); + self.bump(); + // parse next set and calculate intersection (left to right order) + let class2 = try!(self.parse_class_set(false)); + // intersection returns canonicalized `CharClass` + class = class.intersection(&class2); + } + _ => unreachable!() + } + } + + // negate after combining all sets (`^` has lower precedence than `&&`) + if negated { + class = class.negate(); + } + if class.is_empty() { + // e.g., [^\d\D] + return Err(self.err(ErrorKind::EmptyClass)); + } + Ok(class) + } + + // Parses a set in a character class. A set is the union of multiple + // consecutive single characters, ranges or nested sets. + // + // Terminates either when encountering a closing `]` or a `&&`. + // + // The returned `CharClass` is canonical. + // + // e.g., `[a-cd&&x-z]` + // + // Start: `a` (with first_set == true) + // End: `&` (the first one) + // + // or + // + // Start: `x` (with first_set == false) + // End: `]` + fn parse_class_set(&mut self, first_set: bool) -> Result<CharClass> { let mut class = CharClass::empty(); - while self.bump_if('-') { - class.ranges.push(ClassRange::one('-')); + // `-` at the start of the first set in a class is allowed. + // If it occurs after a `&&`, we need to check for the `--` operator.
+ if first_set { + while self.bump_if('-') { + class.ranges.push(ClassRange::one('-')); + } } loop { if self.eof() { @@ -541,41 +625,24 @@ impl Parser { return Err(self.err(ErrorKind::UnexpectedClassEof)); } match self.cur() { - // If no ranges have been added, then `]` is the first - // character (sans, perhaps, the `^` symbol), so it should + // If we're at the start of the class and `]` is the first + // character (sans, perhaps, the `^` symbol), it should // be interpreted as a `]` instead of a closing class bracket. - ']' if class.len() > 0 => { self.bump(); break } + // If we're after `&&` or there were other characters already, + // it should be interpreted as the end of the class. + ']' if !(first_set && class.len() == 0) => { break } + // `&&` means intersection with the next set, so stop parsing. + '&' if self.peek_is("&&") => { break } '[' => match self.maybe_parse_ascii() { Some(class2) => class.ranges.extend(class2), None => { - return Err(self.err( - ErrorKind::UnsupportedClassChar('['))); - } - }, - '\\' => match try!(self.parse_escape()) { - Build::Expr(Expr::Class(class2)) => { + // Nested set, e.g. `[c-d]` in `[a-b[c-d]]` + let class2 = try!(self.parse_class_as_chars()); class.ranges.extend(class2); } - Build::Expr(Expr::ClassBytes(class2)) => { - for byte_range in class2 { - let s = byte_range.start as char; - let e = byte_range.end as char; - class.ranges.push(ClassRange::new(s, e)); - } - } - Build::Expr(Expr::Literal { chars, .. }) => { - try!(self.parse_class_range(&mut class, chars[0])); - } - Build::Expr(Expr::LiteralBytes { bytes, .. }) => { - let start = bytes[0] as char; - try!(self.parse_class_range(&mut class, start)); - } - Build::Expr(e) => { - let err = ErrorKind::InvalidClassEscape(e); - return Err(self.err(err)); - } - // Because `parse_escape` can never return `LeftParen`. 
- _ => unreachable!(), + }, + '\\' => { + try!(self.parse_class_escape(&mut class)); }, start => { if !self.flags.unicode { @@ -583,8 +650,8 @@ impl Parser { } self.bump(); match start { - '&'|'~'|'-' => { - // Only report an error if we see && or ~~ or --. + '~'|'-' => { + // Only report an error if we see ~~ or --. if self.peek_is(start) { return Err(self.err( ErrorKind::UnsupportedClassChar(start))); @@ -596,27 +663,50 @@ impl Parser { } } } - class = self.class_transform(negated, class).canonicalize(); - if class.is_empty() { - // e.g., [^\d\D] - return Err(self.err(ErrorKind::EmptyClass)); - } - Ok(Build::Expr(if self.flags.unicode { - Expr::Class(class) + Ok(if self.flags.casei { + // Case folding canonicalizes too + class.case_fold() } else { - let byte_class = class.to_byte_class(); + class.canonicalize() + }) + } - // If `class` was only non-empty due to multibyte characters, the - // corresponding byte class will now be empty. - // - // See https://github.com/rust-lang/regex/issues/303 - if byte_class.is_empty() { - // e.g., (?-u)[^\x00-\xFF] - return Err(self.err(ErrorKind::EmptyClass)); + // Parses an escape in a character class. + // + // This is a helper for `parse_class`. Instead of returning an `Ok` value, + // it either mutates the char class or returns an error. + // + // e.g., `\wx` + // + // Start: `\` + // End: `x` + fn parse_class_escape(&mut self, class: &mut CharClass) -> Result<()> { + match try!(self.parse_escape()) { + Build::Expr(Expr::Class(class2)) => { + class.ranges.extend(class2); } - - Expr::ClassBytes(byte_class) - })) + Build::Expr(Expr::ClassBytes(class2)) => { + for byte_range in class2 { + let s = byte_range.start as char; + let e = byte_range.end as char; + class.ranges.push(ClassRange::new(s, e)); + } + } + Build::Expr(Expr::Literal { chars, .. }) => { + try!(self.parse_class_range(class, chars[0])); + } + Build::Expr(Expr::LiteralBytes { bytes, .. 
}) => { + let start = bytes[0] as char; + try!(self.parse_class_range(class, start)); + } + Build::Expr(e) => { + let err = ErrorKind::InvalidClassEscape(e); + return Err(self.err(err)); + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + } + Ok(()) } // Parses a single range in a character class. @@ -2221,6 +2311,250 @@ mod tests { ])); } + #[test] + fn class_nested_class_union() { + assert_eq!(p(r"[c[a-b]]"), Expr::Class(class(&[('a', 'c')]))); + assert_eq!(p(r"[[a-b]]"), Expr::Class(class(&[('a', 'b')]))); + assert_eq!(p(r"[[c][a-b]]"), Expr::Class(class(&[('a', 'c')]))); + + assert_eq!(pb(r"(?-u)[c[a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'c')]))); + assert_eq!(pb(r"(?-u)[[a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'b')]))); + assert_eq!(pb(r"(?-u)[[c][a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'c')]))); + } + + #[test] + fn class_nested_class_union_casei() { + assert_eq!(p(r"(?i)[c[a-b]]"), + Expr::Class(class(&[('a', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[[a-b]]"), + Expr::Class(class(&[('a', 'b')]).case_fold())); + assert_eq!(p(r"(?i)[[c][a-b]]"), + Expr::Class(class(&[('a', 'c')]).case_fold())); + + assert_eq!(pb(r"(?i-u)[[\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + } + + #[test] + fn class_nested_class_negate() { + assert_eq!(p(r"[^[\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[[^\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[^\d]]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[^[\w]]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[[^\w]]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[^[^\w]]"), Expr::Class(class(PERLW))); + assert_eq!(p(r"[a-b[^c]]"), + Expr::Class(class(&[('\u{0}', 'b'), ('d', '\u{10FFFF}')]))); + + assert_eq!(pb(r"(?-u)[^[\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[[^\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[^\d]]"), + 
Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[^[\w]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + assert_eq!(pb(r"(?-u)[[^\w]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[^\w]]"), + Expr::ClassBytes(asciiw_bytes())); + assert_eq!(pb(r"(?-u)[a-b[^c]]"), + Expr::ClassBytes(bclass(&[(b'\x00', b'b'), (b'd', b'\xFF')]))) + } + + #[test] + fn class_nested_class_negate_casei() { + assert_eq!(p(r"(?i)[^[\d]]"), + Expr::Class(class(PERLD).case_fold().negate())); + assert_eq!(p(r"(?i)[[^\d]]"), + Expr::Class(class(PERLD).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\d]]"), + Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[^[\w]]"), + Expr::Class(class(PERLW).case_fold().negate())); + assert_eq!(p(r"(?i)[[^\w]]"), + Expr::Class(class(PERLW).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\w]]"), + Expr::Class(class(PERLW).case_fold())); + let mut cls = CharClass::empty().negate(); + cls.remove('c'); + cls.remove('C'); + assert_eq!(p(r"(?i)[a-b[^c]]"), Expr::Class(cls)); + + assert_eq!(pb(r"(?i-u)[^[\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[[^\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^[\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[[^\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold())); + let mut bytes = ByteClass::new(vec![]).negate(); + bytes.remove(b'c'); + bytes.remove(b'C'); + assert_eq!(pb(r"(?i-u)[a-b[^c]]"), Expr::ClassBytes(bytes)); + } + + #[test] + fn class_nested_class_brackets_hyphen() { + // This is really confusing, but `]` is allowed if first character within a class + // It parses as a nested class with the `]` and `-` characters + assert_eq!(p(r"[[]-]]"), 
Expr::Class(class(&[('-', '-'), (']', ']')]))); + assert_eq!(p(r"[[\[]]"), Expr::Class(class(&[('[', '[')]))); + assert_eq!(p(r"[[\]]]"), Expr::Class(class(&[(']', ']')]))); + } + + #[test] + fn class_intersection_ranges() { + assert_eq!(p(r"[abc&&b-c]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[abc&&[b-c]]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[[abc]&&[b-c]]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[a-z&&b-y&&c-x]"), Expr::Class(class(&[('c', 'x')]))); + assert_eq!(p(r"[c-da-b&&a-d]"), Expr::Class(class(&[('a', 'd')]))); + assert_eq!(p(r"[a-d&&c-da-b]"), Expr::Class(class(&[('a', 'd')]))); + + assert_eq!(pb(r"(?-u)[abc&&b-c]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[abc&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[[abc]&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[a-z&&b-y&&c-x]"), + Expr::ClassBytes(bclass(&[(b'c', b'x')]))); + assert_eq!(pb(r"(?-u)[c-da-b&&a-d]"), + Expr::ClassBytes(bclass(&[(b'a', b'd')]))); + } + + #[test] + fn class_intersection_ranges_casei() { + assert_eq!(p(r"(?i)[abc&&b-c]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[abc&&[b-c]]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[[abc]&&[b-c]]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[a-z&&b-y&&c-x]"), + Expr::Class(class(&[('c', 'x')]).case_fold())); + assert_eq!(p(r"(?i)[c-da-b&&a-d]"), + Expr::Class(class(&[('a', 'd')]).case_fold())); + + assert_eq!(pb(r"(?i-u)[abc&&b-c]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[abc&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[[abc]&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[a-z&&b-y&&c-x]"), + Expr::ClassBytes(bclass(&[(b'c', b'x')]).case_fold())); + 
assert_eq!(pb(r"(?i-u)[c-da-b&&a-d]"), + Expr::ClassBytes(bclass(&[(b'a', b'd')]).case_fold())); + } + + #[test] + fn class_intersection_classes() { + assert_eq!(p(r"[\w&&\d]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\w&&[[:ascii:]]]"), Expr::Class(asciiw())); + assert_eq!(p(r"[\x00-\xFF&&\pZ]"), + Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')]))); + + assert_eq!(pb(r"(?-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes())); + } + + #[test] + fn class_intersection_classes_casei() { + assert_eq!(p(r"(?i)[\w&&\d]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[\w&&[[:ascii:]]]"), Expr::Class(asciiw().case_fold())); + assert_eq!(p(r"(?i)[\x00-\xFF&&\pZ]"), + Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')]))); + + assert_eq!(pb(r"(?i-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes().case_fold())); + } + + #[test] + fn class_intersection_negate() { + assert_eq!(p(r"[^\w&&\d]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[\w&&\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[^\w&&\d]]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\w&&[^\d]]"), + Expr::Class(class(PERLW).intersection(&class(PERLD).negate()))); + assert_eq!(p(r"[[^\w]&&[^\d]]"), + Expr::Class(class(PERLW).negate())); + + assert_eq!(pb(r"(?-u)[^\w&&\d]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[\w&&\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[^\w&&\d]]"), + Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[\w&&[^\d]]"), + Expr::ClassBytes(asciiw().intersection(&asciid().negate()).to_byte_class())); + assert_eq!(pb(r"(?-u)[[^\w]&&[^\d]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + } + + #[test] + fn class_intersection_negate_casei() { + assert_eq!(p(r"(?i)[^\w&&a-z]"), + 
Expr::Class(class(&[('a', 'z')]).case_fold().negate())); + assert_eq!(p(r"(?i)[^[\w&&a-z]]"), + Expr::Class(class(&[('a', 'z')]).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\w&&a-z]]"), + Expr::Class(class(&[('a', 'z')]).case_fold())); + assert_eq!(p(r"(?i)[\w&&[^a-z]]"), + Expr::Class( + class(PERLW).intersection(&class(&[('a', 'z')]) + .case_fold().negate()))); + assert_eq!(p(r"(?i)[[^\w]&&[^a-z]]"), + Expr::Class(class(PERLW).negate())); + + assert_eq!(pb(r"(?i-u)[^\w&&a-z]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[\w&&a-z]]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\w&&a-z]]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold())); + assert_eq!(pb(r"(?i-u)[\w&&[^a-z]]"), + Expr::ClassBytes(bclass(&[(b'0', b'9'), (b'_', b'_')]))); + assert_eq!(pb(r"(?i-u)[[^\w]&&[^a-z]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + } + + #[test] + fn class_intersection_caret() { + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(p(r"[\^&&^]"), Expr::Class(class(&[('^', '^')]))); + } + + #[test] + fn class_intersection_brackets_hyphen() { + // `]` needs to be escaped after `&&` because it is not at the start of the class. 
+ assert_eq!(p(r"[]&&\]]"), Expr::Class(class(&[(']', ']')]))); + + assert_eq!(p(r"[-&&-]"), Expr::Class(class(&[('-', '-')]))); + } + + #[test] + fn class_intersection_ampersand() { + // Unescaped `&` after `&&` + assert_eq!(p(r"[\&&&&]"), Expr::Class(class(&[('&', '&')]))); + assert_eq!(p(r"[\&&&\&]"), Expr::Class(class(&[('&', '&')]))); + } + + #[test] + fn class_intersection_precedence() { + assert_eq!(p(r"[a-w&&[^c-g]z]"), Expr::Class(class(&[('a', 'b'), ('h', 'w')]))); + } + #[test] fn class_special_escaped_set_chars() { // These tests ensure that some special characters require escaping @@ -2803,11 +3137,33 @@ mod tests { // rejected in character classes. The intention is to use these // characters to implement sets as described in UTS#18 RL1.3. Once // that's done, these tests should be removed and replaced with others. - test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('[')); - test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&')); test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~')); test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-')); test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-')); + test_err!(r"[a&&~~]", 5, ErrorKind::UnsupportedClassChar('~')); + test_err!(r"[a&&--]", 5, ErrorKind::UnsupportedClassChar('-')); + } + + #[test] + fn error_class_nested_class() { + test_err!(r"[[]]", 4, ErrorKind::UnexpectedClassEof); + test_err!(r"[[][]]", 6, ErrorKind::UnexpectedClassEof); + test_err!(r"[[^\d\D]]", 8, ErrorKind::EmptyClass); + test_err!(r"[[]", 3, ErrorKind::UnexpectedClassEof); + test_err!(r"[[^]", 4, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_intersection() { + test_err!(r"[&&]", 4, ErrorKind::EmptyClass); + test_err!(r"[a&&]", 5, ErrorKind::EmptyClass); + test_err!(r"[&&&&]", 6, ErrorKind::EmptyClass); + // `]` after `&&` is not the same as in (`[]]`), because it's also not + // allowed unescaped in `[a]]`. 
+ test_err!(r"[]&&]]", 5, ErrorKind::EmptyClass); + + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"(?-u)[a&&\pZ]", 12, ErrorKind::UnicodeNotAllowed, flags); } #[test]