From c436bfdf2232deab2dd4027ec9756ecc2ea23555 Mon Sep 17 00:00:00 2001
From: Robin Stocker <rstocker@atlassian.com>
Date: Wed, 22 Feb 2017 16:41:03 +1100
Subject: [PATCH] Support nested character classes and intersection with `&&`

This implements parts of UTS#18 RL1.3, namely:

* Nested character classes, e.g.: `[a[b-c]]`
* Intersections in classes, e.g.: `[\w&&\p{Greek}]`

They can be combined to do things like `[\w&&[^a]]` to get all word
characters except `a`.

Fixes #341
---
 regex-syntax/src/lib.rs    | 159 ++++++++++++-
 regex-syntax/src/parser.rs | 462 ++++++++++++++++++++++++++++++++-----
 2 files changed, 567 insertions(+), 54 deletions(-)
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index d10b2a50ec..0e16d04e51 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -680,7 +680,7 @@ impl CharClass {
         self.canonicalize()
     }
 
-    /// Canonicalze any sequence of ranges.
+    /// Canonicalize any sequence of ranges.
     ///
     /// This is responsible for enforcing the canonical format invariants
     /// as described on the docs for the `CharClass` type.
@@ -703,6 +703,43 @@ impl CharClass {
         ordered
     }
 
+    /// Calculate the intersection of two canonical character classes.
+    ///
+    /// The returned intersection is canonical.
+    fn intersection(&self, other: &CharClass) -> CharClass {
+        if self.ranges.is_empty() || other.ranges.is_empty() {
+            return CharClass::empty();
+        }
+
+        let mut intersection = CharClass::empty();
+
+        let mut iter_a = self.ranges.iter();
+        let mut iter_b = other.ranges.iter();
+        let mut a = iter_a.next().unwrap();
+        let mut b = iter_b.next().unwrap();
+        loop {
+            if let Some(i) = a.intersection(&b) {
+                intersection.ranges.push(i);
+            }
+
+            // The range with the smaller end can't overlap any later range in
+            // the other class, so move on to the next one.
+            let (iter, item) = if a.end < b.end {
+                (&mut iter_a, &mut a)
+            } else {
+                (&mut iter_b, &mut b)
+            };
+            if let Some(v) = iter.next() {
+                *item = v;
+            } else {
+                // No more ranges to check, done.
+                break;
+            }
+        }
+
+        intersection.canonicalize()
+    }
+
     /// Negates the character class.
     ///
     /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
@@ -801,6 +838,18 @@ impl ClassRange {
         max(self.start, other.start) <= inc_char(min(self.end, other.end))
     }
 
+    /// Returns the intersection of the two ranges if they have common
+    /// characters, `None` otherwise.
+    fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
+        let start = max(self.start, other.start);
+        let end = min(self.end, other.end);
+        if start <= end {
+            Some(ClassRange::new(start, end))
+        } else {
+            None
+        }
+    }
+
     /// Creates a new range representing the union of `self` and `other.
     fn merge(self, other: ClassRange) -> ClassRange {
         ClassRange {
@@ -1907,6 +1956,108 @@ mod tests {
         ]));
     }
 
+    #[test]
+    fn class_intersection_empty() {
+        let cls1 = class(&[]);
+        let cls2 = class(&[('a', 'a')]);
+        assert_intersection(cls1, cls2, class(&[]));
+    }
+
+    #[test]
+    fn class_intersection_single_equal() {
+        let cls1 = class(&[('a', 'a')]);
+        let cls2 = class(&[('a', 'a')]);
+        assert_intersection(cls1, cls2, class(&[('a', 'a')]));
+    }
+
+    #[test]
+    fn class_intersection_single_unequal() {
+        let cls1 = class(&[('a', 'a')]);
+        let cls2 = class(&[('b', 'b')]);
+        assert_intersection(cls1, cls2, class(&[]));
+    }
+
+    #[test]
+    fn class_intersection_single_in_other() {
+        let cls1 = class(&[('a', 'a')]);
+        let cls2 = class(&[('a', 'c')]);
+        assert_intersection(cls1, cls2, class(&[('a', 'a')]));
+    }
+
+    #[test]
+    fn class_intersection_range_in_other() {
+        let cls1 = class(&[('a', 'b')]);
+        let cls2 = class(&[('a', 'c')]);
+        assert_intersection(cls1, cls2, class(&[('a', 'b')]));
+    }
+
+    #[test]
+    fn class_intersection_range_intersection() {
+        let cls1 = class(&[('a', 'b')]);
+        let cls2 = class(&[('b', 'c')]);
+        assert_intersection(cls1, cls2, class(&[('b', 'b')]));
+    }
+
+    #[test]
+    fn class_intersection_only_adjacent() {
+        let cls1 = class(&[('a', 'b')]);
+        let cls2 = class(&[('c', 'd')]);
+        assert_intersection(cls1, cls2, class(&[]));
+    }
+
+    #[test]
+    fn class_intersection_range_subset() {
+        let cls1 = class(&[('b', 'c')]);
+        let cls2 = class(&[('a', 'd')]);
+        assert_intersection(cls1, cls2, class(&[('b', 'c')]));
+    }
+
+    #[test]
+    fn class_intersection_many_ranges_in_one_big() {
+        let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+        let cls2 = class(&[('a', 'h')]);
+        assert_intersection(cls1, cls2, class(&[
+            ('a', 'b'), ('d', 'e'), ('g', 'h')
+        ]));
+    }
+
+    #[test]
+    fn class_intersection_many_ranges_same() {
+        let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+        let cls2 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+        assert_intersection(cls1, cls2, class(&[
+            ('a', 'b'), ('d', 'e'), ('g', 'h')
+        ]));
+    }
+
+    #[test]
+    fn class_intersection_multiple_non_intersecting() {
+        let cls1 = class(&[('a', 'b'), ('g', 'h')]);
+        let cls2 = class(&[('d', 'e'), ('k', 'l')]);
+        assert_intersection(cls1, cls2, class(&[]));
+    }
+
+    #[test]
+    fn class_intersection_non_intersecting_then_intersecting() {
+        let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
+        let cls2 = class(&[('h', 'h')]);
+        assert_intersection(cls1, cls2, class(&[('h', 'h')]));
+    }
+
+    #[test]
+    fn class_intersection_adjacent_alternating() {
+        let cls1 = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
+        let cls2 = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
+        assert_intersection(cls1, cls2, class(&[]));
+    }
+
+    #[test]
+    fn class_intersection_overlapping_alternating() {
+        let cls1 = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
+        let cls2 = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
+        assert_intersection(cls1, cls2, class(&[('b', 'f')]));
+    }
+
     #[test]
     fn class_canon_overlap_many_case_fold() {
         let cls = class(&[
@@ -2056,4 +2207,10 @@ mod tests {
         let expr = e("(?-u)[-./]");
         assert_eq!("(?-u:[-\\.-/])", expr.to_string());
     }
+
+    fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
+        // intersection operation should be commutative
+        assert_eq!(cls1.intersection(&cls2), expected);
+        assert_eq!(cls2.intersection(&cls1), expected);
+    }
 }
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index c2aca269bc..32b03072f9 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -526,14 +526,98 @@ impl Parser {
 
     // Parses a character class, e.g., `[^a-zA-Z0-9]+`.
     //
+    // If the Unicode flag is enabled, the class is returned as a `CharClass`,
+    // otherwise it is converted to a `ByteClass`.
+    //
     // Start: `[`
     // End:   `+`
     fn parse_class(&mut self) -> Result<Build> {
+        let class = try!(self.parse_class_as_chars());
+        Ok(Build::Expr(if self.flags.unicode {
+            Expr::Class(class)
+        } else {
+            let byte_class = class.to_byte_class();
+
+            // If `class` was only non-empty due to multibyte characters, the
+            // corresponding byte class will now be empty.
+            //
+            // See https://github.com/rust-lang/regex/issues/303
+            if byte_class.is_empty() {
+                // e.g., (?-u)[^\x00-\xFF]
+                return Err(self.err(ErrorKind::EmptyClass));
+            }
+
+            Expr::ClassBytes(byte_class)
+        }))
+    }
+
+    // Parses a character class as a `CharClass`, e.g., `[^a-zA-Z0-9]+`.
+    //
+    // This does not convert to a `ByteClass` yet, so that it can be used for
+    // nested character classes.
+    //
+    // Start: `[`
+    // End:   `+`
+    fn parse_class_as_chars(&mut self) -> Result<CharClass> {
         self.bump();
         let negated = self.bump_if('^');
+
+        let mut class = try!(self.parse_class_set(true));
+        loop {
+            match self.cur() {
+                ']' => {
+                    // end of the class
+                    self.bump();
+                    break;
+                }
+                '&' => {
+                    // intersection with `&&`
+                    self.bump();
+                    self.bump();
+                    // parse next set and calculate intersection (left to right order)
+                    let class2 = try!(self.parse_class_set(false));
+                    // intersection returns canonicalized `CharClass`
+                    class = class.intersection(&class2);
+                }
+                _ => unreachable!()
+            }
+        }
+
+        // negate after combining all sets (`^` has lower precedence than `&&`)
+        if negated {
+            class = class.negate();
+        }
+        if class.is_empty() {
+            // e.g., [^\d\D]
+            return Err(self.err(ErrorKind::EmptyClass));
+        }
+        Ok(class)
+    }
+
+    // Parses a set in a character class. A set is the union of multiple
+    // consecutive single characters, ranges or nested sets.
+    //
+    // Terminates either when encountering a closing `]` or a `&&`.
+    //
+    // The returned `CharClass` is canonical.
+    //
+    // e.g., `[a-cd&&x-z]`
+    //
+    // Start: `a` (with first_set == true)
+    // End:   `&` (the first one)
+    //
+    // or
+    //
+    // Start: `x` (with first_set == false)
+    // End:   `]`
+    fn parse_class_set(&mut self, first_set: bool) -> Result<CharClass> {
         let mut class = CharClass::empty();
-        while self.bump_if('-') {
-            class.ranges.push(ClassRange::one('-'));
+        // `-` at the start of the first set in a class is allowed.
+        // If it occurs after a `&&`, we need to check for the `--` operator.
+        if first_set {
+            while self.bump_if('-') {
+                class.ranges.push(ClassRange::one('-'));
+            }
         }
         loop {
             if self.eof() {
@@ -541,41 +625,24 @@ impl Parser {
                 return Err(self.err(ErrorKind::UnexpectedClassEof));
             }
             match self.cur() {
-                // If no ranges have been added, then `]` is the first
-                // character (sans, perhaps, the `^` symbol), so it should
+                // If we're at the start of the class and `]` is the first
+                // character (sans, perhaps, the `^` symbol), it should
                 // be interpreted as a `]` instead of a closing class bracket.
-                ']' if class.len() > 0 => { self.bump(); break }
+                // If we're after `&&` or there were other characters already,
+                // it should be interpreted as the end of the class.
+                ']' if !(first_set && class.len() == 0) => { break }
+                // `&&` means intersection with the next set, so stop parsing.
+                '&' if self.peek_is("&&") => { break }
                 '[' => match self.maybe_parse_ascii() {
                     Some(class2) => class.ranges.extend(class2),
                     None => {
-                        return Err(self.err(
-                            ErrorKind::UnsupportedClassChar('[')));
-                    }
-                },
-                '\\' => match try!(self.parse_escape()) {
-                    Build::Expr(Expr::Class(class2)) => {
+                        // Nested set, e.g. `[c-d]` in `[a-b[c-d]]`
+                        let class2 = try!(self.parse_class_as_chars());
                         class.ranges.extend(class2);
                     }
-                    Build::Expr(Expr::ClassBytes(class2)) => {
-                        for byte_range in class2 {
-                            let s = byte_range.start as char;
-                            let e = byte_range.end as char;
-                            class.ranges.push(ClassRange::new(s, e));
-                        }
-                    }
-                    Build::Expr(Expr::Literal { chars, .. }) => {
-                        try!(self.parse_class_range(&mut class, chars[0]));
-                    }
-                    Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
-                        let start = bytes[0] as char;
-                        try!(self.parse_class_range(&mut class, start));
-                    }
-                    Build::Expr(e) => {
-                        let err = ErrorKind::InvalidClassEscape(e);
-                        return Err(self.err(err));
-                    }
-                    // Because `parse_escape` can never return `LeftParen`.
-                    _ => unreachable!(),
+                },
+                '\\' => {
+                    try!(self.parse_class_escape(&mut class));
                 },
                 start => {
                     if !self.flags.unicode {
@@ -583,8 +650,8 @@ impl Parser {
                     }
                     self.bump();
                     match start {
-                        '&'|'~'|'-' => {
-                            // Only report an error if we see && or ~~ or --.
+                        '~'|'-' => {
+                            // Only report an error if we see ~~ or --.
                             if self.peek_is(start) {
                                 return Err(self.err(
                                     ErrorKind::UnsupportedClassChar(start)));
@@ -596,27 +663,50 @@ impl Parser {
                 }
             }
         }
-        class = self.class_transform(negated, class).canonicalize();
-        if class.is_empty() {
-            // e.g., [^\d\D]
-            return Err(self.err(ErrorKind::EmptyClass));
-        }
-        Ok(Build::Expr(if self.flags.unicode {
-            Expr::Class(class)
+        Ok(if self.flags.casei {
+            // Case folding canonicalizes too
+            class.case_fold()
         } else {
-            let byte_class = class.to_byte_class();
+            class.canonicalize()
+        })
+    }
 
-            // If `class` was only non-empty due to multibyte characters, the
-            // corresponding byte class will now be empty.
-            //
-            // See https://github.com/rust-lang/regex/issues/303
-            if byte_class.is_empty() {
-                // e.g., (?-u)[^\x00-\xFF]
-                return Err(self.err(ErrorKind::EmptyClass));
+    // Parses an escape in a character class.
+    //
+    // This is a helper for `parse_class_set`. Instead of returning an `Ok`
+    // value, it either mutates the char class or returns an error.
+    //
+    // e.g., `\wx`
+    //
+    // Start: `\`
+    // End:   `x`
+    fn parse_class_escape(&mut self, class: &mut CharClass) -> Result<()> {
+        match try!(self.parse_escape()) {
+            Build::Expr(Expr::Class(class2)) => {
+                class.ranges.extend(class2);
             }
-
-            Expr::ClassBytes(byte_class)
-        }))
+            Build::Expr(Expr::ClassBytes(class2)) => {
+                for byte_range in class2 {
+                    let s = byte_range.start as char;
+                    let e = byte_range.end as char;
+                    class.ranges.push(ClassRange::new(s, e));
+                }
+            }
+            Build::Expr(Expr::Literal { chars, .. }) => {
+                try!(self.parse_class_range(class, chars[0]));
+            }
+            Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
+                let start = bytes[0] as char;
+                try!(self.parse_class_range(class, start));
+            }
+            Build::Expr(e) => {
+                let err = ErrorKind::InvalidClassEscape(e);
+                return Err(self.err(err));
+            }
+            // Because `parse_escape` can never return `LeftParen`.
+            _ => unreachable!(),
+        }
+        Ok(())
     }
 
     // Parses a single range in a character class.
@@ -2221,6 +2311,250 @@ mod tests {
         ]));
     }
 
+    #[test]
+    fn class_nested_class_union() {
+        assert_eq!(p(r"[c[a-b]]"), Expr::Class(class(&[('a', 'c')])));
+        assert_eq!(p(r"[[a-b]]"), Expr::Class(class(&[('a', 'b')])));
+        assert_eq!(p(r"[[c][a-b]]"), Expr::Class(class(&[('a', 'c')])));
+
+        assert_eq!(pb(r"(?-u)[c[a-b]]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'c')])));
+        assert_eq!(pb(r"(?-u)[[a-b]]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'b')])));
+        assert_eq!(pb(r"(?-u)[[c][a-b]]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'c')])));
+    }
+
+    #[test]
+    fn class_nested_class_union_casei() {
+        assert_eq!(p(r"(?i)[c[a-b]]"),
+                   Expr::Class(class(&[('a', 'c')]).case_fold()));
+        assert_eq!(p(r"(?i)[[a-b]]"),
+                   Expr::Class(class(&[('a', 'b')]).case_fold()));
+        assert_eq!(p(r"(?i)[[c][a-b]]"),
+                   Expr::Class(class(&[('a', 'c')]).case_fold()));
+
+        assert_eq!(pb(r"(?i-u)[[\d]]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold()));
+    }
+
+    #[test]
+    fn class_nested_class_negate() {
+        assert_eq!(p(r"[^[\d]]"), Expr::Class(class(PERLD).negate()));
+        assert_eq!(p(r"[[^\d]]"), Expr::Class(class(PERLD).negate()));
+        assert_eq!(p(r"[^[^\d]]"), Expr::Class(class(PERLD)));
+        assert_eq!(p(r"[^[\w]]"), Expr::Class(class(PERLW).negate()));
+        assert_eq!(p(r"[[^\w]]"), Expr::Class(class(PERLW).negate()));
+        assert_eq!(p(r"[^[^\w]]"), Expr::Class(class(PERLW)));
+        assert_eq!(p(r"[a-b[^c]]"),
+                   Expr::Class(class(&[('\u{0}', 'b'), ('d', '\u{10FFFF}')])));
+
+        assert_eq!(pb(r"(?-u)[^[\d]]"),
+                   Expr::ClassBytes(asciid_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[[^\d]]"),
+                   Expr::ClassBytes(asciid_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[^[^\d]]"),
+                   Expr::ClassBytes(asciid_bytes()));
+        assert_eq!(pb(r"(?-u)[^[\w]]"),
+                   Expr::ClassBytes(asciiw_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[[^\w]]"),
+                   Expr::ClassBytes(asciiw_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[^[^\w]]"),
+                   Expr::ClassBytes(asciiw_bytes()));
+        assert_eq!(pb(r"(?-u)[a-b[^c]]"),
+                   Expr::ClassBytes(bclass(&[(b'\x00', b'b'), (b'd', b'\xFF')])))
+    }
+
+    #[test]
+    fn class_nested_class_negate_casei() {
+        assert_eq!(p(r"(?i)[^[\d]]"),
+                   Expr::Class(class(PERLD).case_fold().negate()));
+        assert_eq!(p(r"(?i)[[^\d]]"),
+                   Expr::Class(class(PERLD).case_fold().negate()));
+        assert_eq!(p(r"(?i)[^[^\d]]"),
+                   Expr::Class(class(PERLD).case_fold()));
+        assert_eq!(p(r"(?i)[^[\w]]"),
+                   Expr::Class(class(PERLW).case_fold().negate()));
+        assert_eq!(p(r"(?i)[[^\w]]"),
+                   Expr::Class(class(PERLW).case_fold().negate()));
+        assert_eq!(p(r"(?i)[^[^\w]]"),
+                   Expr::Class(class(PERLW).case_fold()));
+        let mut cls = CharClass::empty().negate();
+        cls.remove('c');
+        cls.remove('C');
+        assert_eq!(p(r"(?i)[a-b[^c]]"), Expr::Class(cls));
+
+        assert_eq!(pb(r"(?i-u)[^[\d]]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[[^\d]]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[^[^\d]]"),
+                   Expr::ClassBytes(asciid_bytes().case_fold()));
+        assert_eq!(pb(r"(?i-u)[^[\w]]"),
+                   Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[[^\w]]"),
+                   Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[^[^\w]]"),
+                   Expr::ClassBytes(asciiw_bytes().case_fold()));
+        let mut bytes = ByteClass::new(vec![]).negate();
+        bytes.remove(b'c');
+        bytes.remove(b'C');
+        assert_eq!(pb(r"(?i-u)[a-b[^c]]"), Expr::ClassBytes(bytes));
+    }
+
+    #[test]
+    fn class_nested_class_brackets_hyphen() {
+        // This is really confusing, but `]` is allowed as the first character
+        // within a class, so this parses as a nested class containing `]` and `-`.
+        assert_eq!(p(r"[[]-]]"), Expr::Class(class(&[('-', '-'), (']', ']')])));
+        assert_eq!(p(r"[[\[]]"), Expr::Class(class(&[('[', '[')])));
+        assert_eq!(p(r"[[\]]]"), Expr::Class(class(&[(']', ']')])));
+    }
+
+    #[test]
+    fn class_intersection_ranges() {
+        assert_eq!(p(r"[abc&&b-c]"), Expr::Class(class(&[('b', 'c')])));
+        assert_eq!(p(r"[abc&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
+        assert_eq!(p(r"[[abc]&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
+        assert_eq!(p(r"[a-z&&b-y&&c-x]"), Expr::Class(class(&[('c', 'x')])));
+        assert_eq!(p(r"[c-da-b&&a-d]"), Expr::Class(class(&[('a', 'd')])));
+        assert_eq!(p(r"[a-d&&c-da-b]"), Expr::Class(class(&[('a', 'd')])));
+
+        assert_eq!(pb(r"(?-u)[abc&&b-c]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')])));
+        assert_eq!(pb(r"(?-u)[abc&&[b-c]]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')])));
+        assert_eq!(pb(r"(?-u)[[abc]&&[b-c]]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')])));
+        assert_eq!(pb(r"(?-u)[a-z&&b-y&&c-x]"),
+                   Expr::ClassBytes(bclass(&[(b'c', b'x')])));
+        assert_eq!(pb(r"(?-u)[c-da-b&&a-d]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'd')])));
+    }
+
+    #[test]
+    fn class_intersection_ranges_casei() {
+        assert_eq!(p(r"(?i)[abc&&b-c]"),
+                   Expr::Class(class(&[('b', 'c')]).case_fold()));
+        assert_eq!(p(r"(?i)[abc&&[b-c]]"),
+                   Expr::Class(class(&[('b', 'c')]).case_fold()));
+        assert_eq!(p(r"(?i)[[abc]&&[b-c]]"),
+                   Expr::Class(class(&[('b', 'c')]).case_fold()));
+        assert_eq!(p(r"(?i)[a-z&&b-y&&c-x]"),
+                   Expr::Class(class(&[('c', 'x')]).case_fold()));
+        assert_eq!(p(r"(?i)[c-da-b&&a-d]"),
+                   Expr::Class(class(&[('a', 'd')]).case_fold()));
+
+        assert_eq!(pb(r"(?i-u)[abc&&b-c]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
+        assert_eq!(pb(r"(?i-u)[abc&&[b-c]]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
+        assert_eq!(pb(r"(?i-u)[[abc]&&[b-c]]"),
+                   Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
+        assert_eq!(pb(r"(?i-u)[a-z&&b-y&&c-x]"),
+                   Expr::ClassBytes(bclass(&[(b'c', b'x')]).case_fold()));
+        assert_eq!(pb(r"(?i-u)[c-da-b&&a-d]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'd')]).case_fold()));
+    }
+
+    #[test]
+    fn class_intersection_classes() {
+        assert_eq!(p(r"[\w&&\d]"), Expr::Class(class(PERLD)));
+        assert_eq!(p(r"[\w&&[[:ascii:]]]"), Expr::Class(asciiw()));
+        assert_eq!(p(r"[\x00-\xFF&&\pZ]"),
+                   Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
+
+        assert_eq!(pb(r"(?-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes()));
+        assert_eq!(pb(r"(?-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes()));
+    }
+
+    #[test]
+    fn class_intersection_classes_casei() {
+        assert_eq!(p(r"(?i)[\w&&\d]"), Expr::Class(class(PERLD).case_fold()));
+        assert_eq!(p(r"(?i)[\w&&[[:ascii:]]]"), Expr::Class(asciiw().case_fold()));
+        assert_eq!(p(r"(?i)[\x00-\xFF&&\pZ]"),
+                   Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
+
+        assert_eq!(pb(r"(?i-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes().case_fold()));
+        assert_eq!(pb(r"(?i-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes().case_fold()));
+    }
+
+    #[test]
+    fn class_intersection_negate() {
+        assert_eq!(p(r"[^\w&&\d]"), Expr::Class(class(PERLD).negate()));
+        assert_eq!(p(r"[^[\w&&\d]]"), Expr::Class(class(PERLD).negate()));
+        assert_eq!(p(r"[^[^\w&&\d]]"), Expr::Class(class(PERLD)));
+        assert_eq!(p(r"[\w&&[^\d]]"),
+                   Expr::Class(class(PERLW).intersection(&class(PERLD).negate())));
+        assert_eq!(p(r"[[^\w]&&[^\d]]"),
+                   Expr::Class(class(PERLW).negate()));
+
+        assert_eq!(pb(r"(?-u)[^\w&&\d]"),
+                   Expr::ClassBytes(asciid_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[^[\w&&\d]]"),
+                   Expr::ClassBytes(asciid_bytes().negate()));
+        assert_eq!(pb(r"(?-u)[^[^\w&&\d]]"),
+                   Expr::ClassBytes(asciid_bytes()));
+        assert_eq!(pb(r"(?-u)[\w&&[^\d]]"),
+                   Expr::ClassBytes(asciiw().intersection(&asciid().negate()).to_byte_class()));
+        assert_eq!(pb(r"(?-u)[[^\w]&&[^\d]]"),
+                   Expr::ClassBytes(asciiw_bytes().negate()));
+    }
+
+    #[test]
+    fn class_intersection_negate_casei() {
+        assert_eq!(p(r"(?i)[^\w&&a-z]"),
+                   Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
+        assert_eq!(p(r"(?i)[^[\w&&a-z]]"),
+                   Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
+        assert_eq!(p(r"(?i)[^[^\w&&a-z]]"),
+                   Expr::Class(class(&[('a', 'z')]).case_fold()));
+        assert_eq!(p(r"(?i)[\w&&[^a-z]]"),
+                   Expr::Class(
+                       class(PERLW).intersection(&class(&[('a', 'z')])
+                       .case_fold().negate())));
+        assert_eq!(p(r"(?i)[[^\w]&&[^a-z]]"),
+                   Expr::Class(class(PERLW).negate()));
+
+        assert_eq!(pb(r"(?i-u)[^\w&&a-z]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[^[\w&&a-z]]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
+        assert_eq!(pb(r"(?i-u)[^[^\w&&a-z]]"),
+                   Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold()));
+        assert_eq!(pb(r"(?i-u)[\w&&[^a-z]]"),
+                   Expr::ClassBytes(bclass(&[(b'0', b'9'), (b'_', b'_')])));
+        assert_eq!(pb(r"(?i-u)[[^\w]&&[^a-z]]"),
+                   Expr::ClassBytes(asciiw_bytes().negate()));
+    }
+
+    #[test]
+    fn class_intersection_caret() {
+        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
+        // `^` is also allowed to be unescaped after `&&`.
+        assert_eq!(p(r"[\^&&^]"), Expr::Class(class(&[('^', '^')])));
+    }
+
+    #[test]
+    fn class_intersection_brackets_hyphen() {
+        // `]` needs to be escaped after `&&` because it is not at the start of the class.
+        assert_eq!(p(r"[]&&\]]"), Expr::Class(class(&[(']', ']')])));
+
+        assert_eq!(p(r"[-&&-]"), Expr::Class(class(&[('-', '-')])));
+    }
+
+    #[test]
+    fn class_intersection_ampersand() {
+        // Unescaped `&` after `&&`
+        assert_eq!(p(r"[\&&&&]"), Expr::Class(class(&[('&', '&')])));
+        assert_eq!(p(r"[\&&&\&]"), Expr::Class(class(&[('&', '&')])));
+    }
+
+    #[test]
+    fn class_intersection_precedence() {
+        assert_eq!(p(r"[a-w&&[^c-g]z]"), Expr::Class(class(&[('a', 'b'), ('h', 'w')])));
+    }
+
     #[test]
     fn class_special_escaped_set_chars() {
         // These tests ensure that some special characters require escaping
@@ -2803,11 +3137,33 @@ mod tests {
         // rejected in character classes. The intention is to use these
         // characters to implement sets as described in UTS#18 RL1.3. Once
         // that's done, these tests should be removed and replaced with others.
-        test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('['));
-        test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&'));
         test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~'));
         test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-'));
         test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-'));
+        test_err!(r"[a&&~~]", 5, ErrorKind::UnsupportedClassChar('~'));
+        test_err!(r"[a&&--]", 5, ErrorKind::UnsupportedClassChar('-'));
+    }
+
+    #[test]
+    fn error_class_nested_class() {
+        test_err!(r"[[]]", 4, ErrorKind::UnexpectedClassEof);
+        test_err!(r"[[][]]", 6, ErrorKind::UnexpectedClassEof);
+        test_err!(r"[[^\d\D]]", 8, ErrorKind::EmptyClass);
+        test_err!(r"[[]", 3, ErrorKind::UnexpectedClassEof);
+        test_err!(r"[[^]", 4, ErrorKind::UnexpectedClassEof);
+    }
+
+    #[test]
+    fn error_class_intersection() {
+        test_err!(r"[&&]", 4, ErrorKind::EmptyClass);
+        test_err!(r"[a&&]", 5, ErrorKind::EmptyClass);
+        test_err!(r"[&&&&]", 6, ErrorKind::EmptyClass);
+        // Unlike at the start of a class (`[]]`), `]` right after `&&` is not a
+        // literal: it closes the class, just as the first `]` does in `[a]]`.
+        test_err!(r"[]&&]]", 5, ErrorKind::EmptyClass);
+
+        let flags = Flags { allow_bytes: true, .. Flags::default() };
+        test_err!(r"(?-u)[a&&\pZ]", 12, ErrorKind::UnicodeNotAllowed, flags);
     }
 
     #[test]