From 5e95db8f6e752ac6391d55707f516e7e8e64fea5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 27 Nov 2023 21:07:23 -0500 Subject: [PATCH] searcher: work around NUL line terminator bug As the FIXME comment says, ripgrep is not yet using the new line terminator option in regex-automata exposed for exactly this purpose. Because of that, line anchors like `(?m:^)` and `(?m:$)` will only match `\n` as a line terminator. This means that when --null-data is used in combination with --line-regexp, the anchors inserted by --line-regexp will not match correctly. This is only a big deal in the "fast" path, which requires the regex engine itself to deal with line terminators correctly. The slow path strips line terminators regardless of what they are, and so the line anchors can match (at the beginning/end of the haystack). Fixes #2658 --- CHANGELOG.md | 2 ++ crates/searcher/src/searcher/core.rs | 11 +++++++++++ tests/regression.rs | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 117ab170c0..b8ef0e67f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ Bug fixes: * [BUG #2654](https://github.com/BurntSushi/ripgrep/issues/2654): Fix `deb` release sha256 sum file. +* [BUG #2658](https://github.com/BurntSushi/ripgrep/issues/2658): + Fix partial regression in the behavior of `--null-data --line-regexp`. * [BUG #2659](https://github.com/BurntSushi/ripgrep/issues/2659): Fix Fish shell completions. 
* [BUG #2662](https://github.com/BurntSushi/ripgrep/issues/2662): diff --git a/crates/searcher/src/searcher/core.rs b/crates/searcher/src/searcher/core.rs index e6836e6a1a..7d7e5cdcf1 100644 --- a/crates/searcher/src/searcher/core.rs +++ b/crates/searcher/src/searcher/core.rs @@ -612,6 +612,17 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { return false; } if let Some(line_term) = self.matcher.line_terminator() { + // FIXME: This works around a bug in grep-regex where it does + // not set the line terminator of the regex itself, and thus + // line anchors like `(?m:^)` and `(?m:$)` will not match + // anything except for `\n`. So for now, we just disable the fast + // line-by-line searcher which requires the regex to be able to + // deal with line terminators correctly. The slow line-by-line + // searcher strips line terminators and thus absolves the regex + // engine from needing to care about whether they are `\n` or NUL. + if line_term.as_byte() == b'\x00' { + return false; + } if line_term == self.config.line_term { return true; } diff --git a/tests/regression.rs b/tests/regression.rs index 54490b98ec..dc463aa317 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -1210,3 +1210,10 @@ rgtest!(r2574, |dir: Dir, mut cmd: TestCommand| { .stdout(); eqnice!("some.domain.com\nsome.domain.com\n", got); }); + +// See: https://github.com/BurntSushi/ripgrep/issues/2658 +rgtest!(r2658_null_data_line_regexp, |dir: Dir, mut cmd: TestCommand| { + dir.create("haystack", "foo\0bar\0quux\0"); + let got = cmd.args(&["--null-data", "--line-regexp", r"bar"]).stdout(); + eqnice!("haystack:bar\0", got); +});