Skip to content

Commit

Permalink
feat(regular_expression)!: Support ES2025 Duplicated named capture gr…
Browse files Browse the repository at this point in the history
…oups (#6847)

Closes #6358

@preyneyv I know you've been working on this problem.

This is an implementation that has been dormant on my local for a while.

- All tests are passing
- However, the approach is simple but not general, so there might be some edge cases that were missed
- There's also room for improvement in terms of performance

For these reasons, it was marked as WIP for me.

I believe the test cases and other parts are usable, so feel free to fork and replace them with your implementation if you'd like.
  • Loading branch information
leaysgur committed Oct 25, 2024
1 parent f49b3e2 commit 90c786c
Show file tree
Hide file tree
Showing 12 changed files with 199 additions and 98 deletions.
4 changes: 2 additions & 2 deletions crates/oxc_linter/src/snapshots/no_invalid_regexp.snap
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ source: crates/oxc_linter/src/tester.rs
╰────

eslint(no-invalid-regexp): Invalid regular expression: Duplicated capturing group names
╭─[no_invalid_regexp.tsx:1:23]
╭─[no_invalid_regexp.tsx:1:16]
1new RegExp('(?<k>a)(?<k>b)')
·
·
╰────
5 changes: 5 additions & 0 deletions crates/oxc_regular_expression/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ Implements ECMAScript® 2024 Language Specification
- https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals
- https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects
- https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns

And, Stage 3 proposals

- https://github.com/tc39/proposal-duplicate-named-capturing-groups
- https://github.com/tc39/proposal-regexp-modifiers
32 changes: 25 additions & 7 deletions crates/oxc_regular_expression/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,17 @@ mod test {
(r"\1()", "u"),
(r"(?<n1>..)(?<n2>..)", ""),
// ES2025 ---
// TODO: Duplicate named capturing groups
// (r"(?<n1>..)|(?<n1>..)", ""),
// (r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
// (r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
// Duplicate named capturing groups
(r"(?<n1>..)|(?<n1>..)", ""),
(r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
(r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
(r"(?<x>a)|(?<x>b)", ""),
(r"(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))", ""),
(r"(?:(?<x>a)|(?<x>b))\\k<x>", ""),
(r"(?:(?:(?<x>a)|(?<x>b)|c)\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)\\k<x>|(?<x>b)\\k<x>)|(?:))\\k<x>", ""),
(r"(?:(?:(?<x>a\\k<x>)|(?<x>b\\k<x>))|(?:))\\k<x>", ""),
// Modifiers
(r"(?:.)", ""),
(r"(?s:.)", ""),
Expand Down Expand Up @@ -190,9 +197,15 @@ mod test {
(r"[\q{]", "v"),
(r"[\q{\a}]", "v"),
// ES2025 ---
// TODO: Duplicate named capturing groups
(r"(?<n>..)|(?<n>..)", ""), // This will be valid
// (r"(?<a>|(?<a>))", ""), // Nested, still invalid
// Duplicate named capturing groups
(r"(?<n>.)(?<n>.)", ""),
(r"(?<n>.(?<n>..))", "u"),
("(?<n>)|(?<n>)(?<n>)", ""),
("(((((((?<n>.)))))))(?<n>)", ""),
("(?:(?<x>a)|(?<x>b))(?<x>c)", ""),
("(?<x>a)(?:(?<x>b)|(?<x>c))", ""),
("(?:(?:(?<x>a)|(?<x>b)))(?<x>c)", ""),
("(?:(?:(?<x>a)|(?<x>b))|(?:))(?<x>c)", ""),
// Modifiers
(r"(?a:.)", ""),
(r"(?-S:.)", ""),
Expand Down Expand Up @@ -255,6 +268,11 @@ mod test {
(r"[[[[[^[[[[\q{ng}]]]]]]]]]", "v", true),
(r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", "v", true),
// ES2025 ---
// Duplicated named capture groups
("(?:(?<x>a)|(?<x>b))(?<x>c)", "", true),
("(?:(?<x>a)|(?<x>b))(?<X>c)", "", false),
("(?<x>a)(?:(?<x>b)|(?<x>c))", "", true),
("(?<x>a)|(?:(?<x>b)|(?<x>c))", "", false),
// Modifiers
(r"(?ii:.)", "", true),
(r"(?-ss:.)", "", true),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,14 @@ impl<'a> PatternParser<'a> {
// - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
// We may re-consider this if we need more performance rather than simplicity.
let checkpoint = self.reader.checkpoint();
let duplicated_named_capturing_groups =
self.state.initialize_with_parsing(&mut self.reader);

// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
self.state.initialize_with_parsing(&mut self.reader).map_err(|offsets| {
diagnostics::duplicated_capturing_group_names(
offsets.iter().map(|&(start, end)| self.span_factory.create(start, end)).collect(),
)
})?;
self.reader.rewind(checkpoint);

// [SS:EE] Pattern :: Disjunction
Expand All @@ -58,16 +64,6 @@ impl<'a> PatternParser<'a> {
if u32::MAX == self.state.num_of_capturing_groups {
return Err(diagnostics::too_may_capturing_groups(self.span_factory.create(0, 0)));
}
// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
if !duplicated_named_capturing_groups.is_empty() {
return Err(diagnostics::duplicated_capturing_group_names(
duplicated_named_capturing_groups
.iter()
.map(|&(start, end)| self.span_factory.create(start, end))
.collect(),
));
}

// Let's start parsing!
let span_start = self.reader.offset();
Expand Down
184 changes: 138 additions & 46 deletions crates/oxc_regular_expression/src/parser/pattern_parser/state.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use oxc_span::Atom;
use rustc_hash::FxHashSet;
use rustc_hash::{FxHashMap, FxHashSet};

use crate::parser::reader::Reader;

Expand All @@ -16,6 +16,8 @@ pub struct State<'a> {
pub capturing_group_names: FxHashSet<Atom<'a>>,
}

type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;

impl<'a> State<'a> {
pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
Self {
Expand All @@ -27,12 +29,11 @@ impl<'a> State<'a> {
}
}

pub fn initialize_with_parsing(&mut self, reader: &mut Reader<'a>) -> Vec<(u32, u32)> {
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(reader);
pub fn initialize_with_parsing(
&mut self,
reader: &mut Reader<'a>,
) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;

// In Annex B, this is `false` by default.
// It is `true`
Expand All @@ -44,26 +45,43 @@ impl<'a> State<'a> {
self.num_of_capturing_groups = num_of_left_capturing_parens;
self.capturing_group_names = capturing_group_names;

duplicated_named_capturing_groups
Ok(())
}
}

/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
enum SimpleUnit<'a> {
Open,
Close,
Pipe,
GroupName(Atom<'a>),
}

/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
fn parse_capturing_groups<'a>(
reader: &mut Reader<'a>,
) -> (u32, FxHashSet<Atom<'a>>, Vec<(u32, u32)>) {
let mut num_of_left_capturing_parens = 0;
let mut capturing_group_names = FxHashSet::default();
let mut duplicated_named_capturing_groups = vec![];

let mut in_escape = false;
let mut in_character_class = false;

) -> Result<(u32, FxHashSet<Atom<'a>>), DuplicatedNamedCapturingGroupOffsets> {
// Count only normal CapturingGroup(named, unnamed)
// (?<name>...), (...)
// IgnoreGroup, and LookaroundAssertions are ignored
// (?:...)
// (?=...), (?!...), (?<=...), (?<!...)
let mut num_of_left_capturing_parens = 0;

// Collect capturing group names
let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
// At the same time, check duplicates
// If you want to process this most efficiently:
// - define a scope for each Disjunction
// - then check for duplicates for each `|` while inheriting the parent-child relationship
// ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
// However, duplicates are rare in the first place.
// And as long as it works simply, this may be enough.
let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
FxHashMap::default();
let mut simplified: Vec<SimpleUnit<'a>> = vec![];

let mut in_escape = false;
let mut in_character_class = false;
while let Some(cp) = reader.peek() {
if in_escape {
in_escape = false;
Expand All @@ -73,9 +91,15 @@ fn parse_capturing_groups<'a>(
in_character_class = true;
} else if cp == ']' as u32 {
in_character_class = false;
} else if !in_character_class && cp == '|' as u32 {
simplified.push(SimpleUnit::Pipe);
} else if !in_character_class && cp == ')' as u32 {
simplified.push(SimpleUnit::Close);
} else if !in_character_class && cp == '(' as u32 {
reader.advance();

simplified.push(SimpleUnit::Open);

// Skip IgnoreGroup
if reader.eat2('?', ':')
// Skip LookAroundAssertion
Expand All @@ -102,56 +126,124 @@ fn parse_capturing_groups<'a>(
let span_end = reader.offset();

if reader.eat('>') {
// May be duplicated
if !capturing_group_names.insert(reader.atom(span_start, span_end)) {
// Report them with `Span`
duplicated_named_capturing_groups.push((span_start, span_end));
let group_name = reader.atom(span_start, span_end);

simplified.push(SimpleUnit::GroupName(group_name.clone()));
// Check duplicates later
if let Some(last_span) = group_names.get(&group_name) {
let entry = may_duplicates.entry(group_name).or_default();
entry.push(*last_span);
entry.push((span_start, span_end));
} else {
group_names.insert(group_name, (span_start, span_end));
}

continue;
}
}

// Unnamed
continue;
}

reader.advance();
}

(num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
// Check duplicates and emit error if exists
if !may_duplicates.is_empty() {
// Check must be done for each group name
for (group_name, spans) in may_duplicates {
let iter = simplified.iter().clone();

let mut alternative_depth = FxHashSet::default();
let mut depth = 0_u32;
let mut is_first = true;

'outer: for token in iter {
match token {
SimpleUnit::Open => {
depth += 1;
}
SimpleUnit::Close => {
// May panic if the pattern has invalid, unbalanced parens
depth = depth.saturating_sub(1);
}
SimpleUnit::Pipe => {
if !is_first {
alternative_depth.insert(depth);
}
}
SimpleUnit::GroupName(name) => {
// Check target group name only
if *name != group_name {
continue;
}
// Skip the first one, because it is not duplicated
if is_first {
is_first = false;
continue;
}

// If left outer `|` is found, both can participate
// `|(?<n>)`
// ^ ^ depth: 1
// ^ depth: 0
for i in (0..depth).rev() {
if alternative_depth.contains(&i) {
// Remove it, next duplicates requires another `|`
alternative_depth.remove(&i);
continue 'outer;
}
}

return Err(spans);
}
}
}
}
}

Ok((num_of_left_capturing_parens, group_names.keys().cloned().collect()))
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_count_capturing_groups() {
fn count_capturing_groups() {
for (source_text, expected) in [
("()", (1, 0, false)),
(r"\1()", (1, 0, false)),
("(foo)", (1, 0, false)),
("(foo)(bar)", (2, 0, false)),
("(foo(bar))", (2, 0, false)),
("(foo)[(bar)]", (1, 0, false)),
(r"(foo)\(bar\)", (1, 0, false)),
("(foo)(?<n>bar)", (2, 1, false)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
("(?<n>.)(?<n>..)", (2, 1, true)),
("(?<n>.(?<n>..))", (2, 1, true)),
("()", (1, 0)),
(r"\1()", (1, 0)),
("(foo)", (1, 0)),
("(foo)(bar)", (2, 0)),
("(foo(bar))", (2, 0)),
("(foo)[(bar)]", (1, 0)),
(r"(foo)\(bar\)", (1, 0)),
("(foo)(?<n>bar)", (2, 1)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
] {
let mut reader = Reader::initialize(source_text, true, false).unwrap();

let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(&mut reader);

let actual = (
num_of_left_capturing_parens,
capturing_group_names.len(),
!duplicated_named_capturing_groups.is_empty(),
);
let (num_of_left_capturing_parens, capturing_group_names) =
parse_capturing_groups(&mut reader).unwrap();

let actual = (num_of_left_capturing_parens, capturing_group_names.len());
assert_eq!(expected, actual, "{source_text}");
}
}

#[test]
fn duplicated_named_capturing_groups() {
for source_text in
["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
{
let mut reader = Reader::initialize(source_text, true, false).unwrap();

assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");
}
}
}
4 changes: 2 additions & 2 deletions tasks/coverage/snapshots/codegen_test262.snap
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
commit: 06454619

codegen_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)
4 changes: 2 additions & 2 deletions tasks/coverage/snapshots/minifier_test262.snap
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
commit: 06454619

minifier_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)
Loading

0 comments on commit 90c786c

Please sign in to comment.