feat(regular_expression)!: Support ES2025 Duplicated named capture gr…

…oups (#6847) Closes #6358 @preyneyv I know you've been working on this problem. This is an implementation that has been dormant on my local for a while. - All tests are passing - However, the approach is simple but not general, so there might be some edge cases that were missed - There's also room for improvement in terms of performance For these reasons, it was marked as WIP for me. I believe the test cases and other parts are usable, so feel free to fork and replace them with your implementation if you'd like.
oxc-project · Oct 25, 2024 · 90c786c · 90c786c
1 parent f49b3e2
commit 90c786c
Show file tree

Hide file tree

Showing 12 changed files with 199 additions and 98 deletions.
diff --git a/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap b/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap
@@ -200,7 +200,7 @@ source: crates/oxc_linter/src/tester.rs
    ╰────
 
   ⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated capturing group names
-   ╭─[no_invalid_regexp.tsx:1:23]
+   ╭─[no_invalid_regexp.tsx:1:16]
  1 │ new RegExp('(?<k>a)(?<k>b)')
-   ·                       ─
+   ·                ─      ─
    ╰────
diff --git a/crates/oxc_regular_expression/README.md b/crates/oxc_regular_expression/README.md
@@ -5,3 +5,8 @@ Implements ECMAScript® 2024 Language Specification
 - https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals
 - https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects
 - https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns
+
+And, Stage 3 proposals
+
+- https://github.com/tc39/proposal-duplicate-named-capturing-groups
+- https://github.com/tc39/proposal-regexp-modifiers
diff --git a/crates/oxc_regular_expression/src/parser/mod.rs b/crates/oxc_regular_expression/src/parser/mod.rs
@@ -101,10 +101,17 @@ mod test {
             (r"\1()", "u"),
             (r"(?<n1>..)(?<n2>..)", ""),
             // ES2025 ---
-            // TODO: Duplicate named capturing groups
-            // (r"(?<n1>..)|(?<n1>..)", ""),
-            // (r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
-            // (r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
+            // Duplicate named capturing groups
+            (r"(?<n1>..)|(?<n1>..)", ""),
+            (r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
+            (r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
+            (r"(?<x>a)|(?<x>b)", ""),
+            (r"(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))", ""),
+            (r"(?:(?<x>a)|(?<x>b))\\k<x>", ""),
+            (r"(?:(?:(?<x>a)|(?<x>b)|c)\\k<x>){2}", ""),
+            (r"(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}", ""),
+            (r"(?:(?:(?<x>a)\\k<x>|(?<x>b)\\k<x>)|(?:))\\k<x>", ""),
+            (r"(?:(?:(?<x>a\\k<x>)|(?<x>b\\k<x>))|(?:))\\k<x>", ""),
             // Modifiers
             (r"(?:.)", ""),
             (r"(?s:.)", ""),
@@ -190,9 +197,15 @@ mod test {
             (r"[\q{]", "v"),
             (r"[\q{\a}]", "v"),
             // ES2025 ---
-            // TODO: Duplicate named capturing groups
-            (r"(?<n>..)|(?<n>..)", ""), // This will be valid
-            // (r"(?<a>|(?<a>))", ""), // Nested, still invalid
+            // Duplicate named capturing groups
+            (r"(?<n>.)(?<n>.)", ""),
+            (r"(?<n>.(?<n>..))", "u"),
+            ("(?<n>)|(?<n>)(?<n>)", ""),
+            ("(((((((?<n>.)))))))(?<n>)", ""),
+            ("(?:(?<x>a)|(?<x>b))(?<x>c)", ""),
+            ("(?<x>a)(?:(?<x>b)|(?<x>c))", ""),
+            ("(?:(?:(?<x>a)|(?<x>b)))(?<x>c)", ""),
+            ("(?:(?:(?<x>a)|(?<x>b))|(?:))(?<x>c)", ""),
             // Modifiers
             (r"(?a:.)", ""),
             (r"(?-S:.)", ""),
@@ -255,6 +268,11 @@ mod test {
             (r"[[[[[^[[[[\q{ng}]]]]]]]]]", "v", true),
             (r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", "v", true),
             // ES2025 ---
+            // Duplicated named capture groups
+            ("(?:(?<x>a)|(?<x>b))(?<x>c)", "", true),
+            ("(?:(?<x>a)|(?<x>b))(?<X>c)", "", false),
+            ("(?<x>a)(?:(?<x>b)|(?<x>c))", "", true),
+            ("(?<x>a)|(?:(?<x>b)|(?<x>c))", "", false),
             // Modifiers
             (r"(?ii:.)", "", true),
             (r"(?-ss:.)", "", true),

diff --git a/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs b/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs
@@ -46,8 +46,14 @@ impl<'a> PatternParser<'a> {
         // - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
         // We may re-consider this if we need more performance rather than simplicity.
         let checkpoint = self.reader.checkpoint();
-        let duplicated_named_capturing_groups =
-            self.state.initialize_with_parsing(&mut self.reader);
+
+        // [SS:EE] Pattern :: Disjunction
+        // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
+        self.state.initialize_with_parsing(&mut self.reader).map_err(|offsets| {
+            diagnostics::duplicated_capturing_group_names(
+                offsets.iter().map(|&(start, end)| self.span_factory.create(start, end)).collect(),
+            )
+        })?;
         self.reader.rewind(checkpoint);
 
         // [SS:EE] Pattern :: Disjunction
@@ -58,16 +64,6 @@ impl<'a> PatternParser<'a> {
         if u32::MAX == self.state.num_of_capturing_groups {
             return Err(diagnostics::too_may_capturing_groups(self.span_factory.create(0, 0)));
         }
-        // [SS:EE] Pattern :: Disjunction
-        // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
-        if !duplicated_named_capturing_groups.is_empty() {
-            return Err(diagnostics::duplicated_capturing_group_names(
-                duplicated_named_capturing_groups
-                    .iter()
-                    .map(|&(start, end)| self.span_factory.create(start, end))
-                    .collect(),
-            ));
-        }
 
         // Let's start parsing!
         let span_start = self.reader.offset();

diff --git a/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs b/crates/oxc_regular_expression/src/parser/pattern_parser/state.rs
@@ -1,5 +1,5 @@
 use oxc_span::Atom;
-use rustc_hash::FxHashSet;
+use rustc_hash::{FxHashMap, FxHashSet};
 
 use crate::parser::reader::Reader;
 
@@ -16,6 +16,8 @@ pub struct State<'a> {
     pub capturing_group_names: FxHashSet<Atom<'a>>,
 }
 
+type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;
+
 impl<'a> State<'a> {
     pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
         Self {
@@ -27,12 +29,11 @@ impl<'a> State<'a> {
         }
     }
 
-    pub fn initialize_with_parsing(&mut self, reader: &mut Reader<'a>) -> Vec<(u32, u32)> {
-        let (
-            num_of_left_capturing_parens,
-            capturing_group_names,
-            duplicated_named_capturing_groups,
-        ) = parse_capturing_groups(reader);
+    pub fn initialize_with_parsing(
+        &mut self,
+        reader: &mut Reader<'a>,
+    ) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
+        let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;
 
         // In Annex B, this is `false` by default.
         // It is `true`
@@ -44,26 +45,43 @@ impl<'a> State<'a> {
         self.num_of_capturing_groups = num_of_left_capturing_parens;
         self.capturing_group_names = capturing_group_names;
 
-        duplicated_named_capturing_groups
+        Ok(())
     }
 }
 
-/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
+enum SimpleUnit<'a> {
+    Open,
+    Close,
+    Pipe,
+    GroupName(Atom<'a>),
+}
+
+/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
 fn parse_capturing_groups<'a>(
     reader: &mut Reader<'a>,
-) -> (u32, FxHashSet<Atom<'a>>, Vec<(u32, u32)>) {
-    let mut num_of_left_capturing_parens = 0;
-    let mut capturing_group_names = FxHashSet::default();
-    let mut duplicated_named_capturing_groups = vec![];
-
-    let mut in_escape = false;
-    let mut in_character_class = false;
-
+) -> Result<(u32, FxHashSet<Atom<'a>>), DuplicatedNamedCapturingGroupOffsets> {
     // Count only normal CapturingGroup(named, unnamed)
     //   (?<name>...), (...)
     // IgnoreGroup, and LookaroundAssertions are ignored
     //   (?:...)
     //   (?=...), (?!...), (?<=...), (?<!...)
+    let mut num_of_left_capturing_parens = 0;
+
+    // Collect capturing group names
+    let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
+    // At the same time, check duplicates
+    // If you want to process this most efficiently:
+    // - define a scope for each Disjunction
+    // - then check for duplicates for each `|` while inheriting the parent-child relationship
+    // ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
+    // However, duplicates are rare in the first place.
+    // And as long as it works simply, this may be enough.
+    let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
+        FxHashMap::default();
+    let mut simplified: Vec<SimpleUnit<'a>> = vec![];
+
+    let mut in_escape = false;
+    let mut in_character_class = false;
     while let Some(cp) = reader.peek() {
         if in_escape {
             in_escape = false;
@@ -73,9 +91,15 @@ fn parse_capturing_groups<'a>(
             in_character_class = true;
         } else if cp == ']' as u32 {
             in_character_class = false;
+        } else if !in_character_class && cp == '|' as u32 {
+            simplified.push(SimpleUnit::Pipe);
+        } else if !in_character_class && cp == ')' as u32 {
+            simplified.push(SimpleUnit::Close);
         } else if !in_character_class && cp == '(' as u32 {
             reader.advance();
 
+            simplified.push(SimpleUnit::Open);
+
             // Skip IgnoreGroup
             if reader.eat2('?', ':')
             // Skip LookAroundAssertion
@@ -102,56 +126,124 @@ fn parse_capturing_groups<'a>(
                 let span_end = reader.offset();
 
                 if reader.eat('>') {
-                    // May be duplicated
-                    if !capturing_group_names.insert(reader.atom(span_start, span_end)) {
-                        // Report them with `Span`
-                        duplicated_named_capturing_groups.push((span_start, span_end));
+                    let group_name = reader.atom(span_start, span_end);
+
+                    simplified.push(SimpleUnit::GroupName(group_name.clone()));
+                    // Check duplicates later
+                    if let Some(last_span) = group_names.get(&group_name) {
+                        let entry = may_duplicates.entry(group_name).or_default();
+                        entry.push(*last_span);
+                        entry.push((span_start, span_end));
+                    } else {
+                        group_names.insert(group_name, (span_start, span_end));
                     }
+
                     continue;
                 }
             }
+
+            // Unnamed
+            continue;
         }
 
         reader.advance();
     }
 
-    (num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
+    // Check duplicates and emit error if exists
+    if !may_duplicates.is_empty() {
+        // Check must be done for each group name
+        for (group_name, spans) in may_duplicates {
+            let iter = simplified.iter().clone();
+
+            let mut alternative_depth = FxHashSet::default();
+            let mut depth = 0_u32;
+            let mut is_first = true;
+
+            'outer: for token in iter {
+                match token {
+                    SimpleUnit::Open => {
+                        depth += 1;
+                    }
+                    SimpleUnit::Close => {
+                        // May panic if the pattern has invalid, unbalanced parens
+                        depth = depth.saturating_sub(1);
+                    }
+                    SimpleUnit::Pipe => {
+                        if !is_first {
+                            alternative_depth.insert(depth);
+                        }
+                    }
+                    SimpleUnit::GroupName(name) => {
+                        // Check target group name only
+                        if *name != group_name {
+                            continue;
+                        }
+                        // Skip the first one, because it is not duplicated
+                        if is_first {
+                            is_first = false;
+                            continue;
+                        }
+
+                        // If left outer `|` is found, both can participate
+                        // `|(?<n>)`
+                        //  ^   ^ depth: 1
+                        //  ^ depth: 0
+                        for i in (0..depth).rev() {
+                            if alternative_depth.contains(&i) {
+                                // Remove it, next duplicates requires another `|`
+                                alternative_depth.remove(&i);
+                                continue 'outer;
+                            }
+                        }
+
+                        return Err(spans);
+                    }
+                }
+            }
+        }
+    }
+
+    Ok((num_of_left_capturing_parens, group_names.keys().cloned().collect()))
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn test_count_capturing_groups() {
+    fn count_capturing_groups() {
         for (source_text, expected) in [
-            ("()", (1, 0, false)),
-            (r"\1()", (1, 0, false)),
-            ("(foo)", (1, 0, false)),
-            ("(foo)(bar)", (2, 0, false)),
-            ("(foo(bar))", (2, 0, false)),
-            ("(foo)[(bar)]", (1, 0, false)),
-            (r"(foo)\(bar\)", (1, 0, false)),
-            ("(foo)(?<n>bar)", (2, 1, false)),
-            ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
-            ("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
-            ("(?<n>.)(?<n>..)", (2, 1, true)),
-            ("(?<n>.(?<n>..))", (2, 1, true)),
+            ("()", (1, 0)),
+            (r"\1()", (1, 0)),
+            ("(foo)", (1, 0)),
+            ("(foo)(bar)", (2, 0)),
+            ("(foo(bar))", (2, 0)),
+            ("(foo)[(bar)]", (1, 0)),
+            (r"(foo)\(bar\)", (1, 0)),
+            ("(foo)(?<n>bar)", (2, 1)),
+            ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
+            ("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
+            ("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
+            ("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
         ] {
             let mut reader = Reader::initialize(source_text, true, false).unwrap();
 
-            let (
-                num_of_left_capturing_parens,
-                capturing_group_names,
-                duplicated_named_capturing_groups,
-            ) = parse_capturing_groups(&mut reader);
-
-            let actual = (
-                num_of_left_capturing_parens,
-                capturing_group_names.len(),
-                !duplicated_named_capturing_groups.is_empty(),
-            );
+            let (num_of_left_capturing_parens, capturing_group_names) =
+                parse_capturing_groups(&mut reader).unwrap();
+
+            let actual = (num_of_left_capturing_parens, capturing_group_names.len());
             assert_eq!(expected, actual, "{source_text}");
         }
     }
+
+    #[test]
+    fn duplicated_named_capturing_groups() {
+        for source_text in
+            ["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
+        {
+            let mut reader = Reader::initialize(source_text, true, false).unwrap();
+
+            assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");
+        }
+    }
 }
diff --git a/tasks/coverage/snapshots/codegen_test262.snap b/tasks/coverage/snapshots/codegen_test262.snap
@@ -1,5 +1,5 @@
 commit: 06454619
 
 codegen_test262 Summary:
-AST Parsed     : 43832/43832 (100.00%)
-Positive Passed: 43832/43832 (100.00%)
+AST Parsed     : 43851/43851 (100.00%)
+Positive Passed: 43851/43851 (100.00%)
diff --git a/tasks/coverage/snapshots/minifier_test262.snap b/tasks/coverage/snapshots/minifier_test262.snap
@@ -1,5 +1,5 @@
 commit: 06454619
 
 minifier_test262 Summary:
-AST Parsed     : 43832/43832 (100.00%)
-Positive Passed: 43832/43832 (100.00%)
+AST Parsed     : 43851/43851 (100.00%)
+Positive Passed: 43851/43851 (100.00%)