From f1f0e3cffbd454f4bb7b419cd66337c6c867a7d1 Mon Sep 17 00:00:00 2001
From: purajit <purajit.malalur@color.com>
Date: Wed, 3 Jul 2024 13:17:51 -0700
Subject: [PATCH] Allow customizing semgrep configurations; correct rule
 matching glob

---
 .../pants/backend/tools/semgrep/rules.py      | 48 +++++++------------
 .../pants/backend/tools/semgrep/rules_test.py |  2 +-
 .../pants/backend/tools/semgrep/subsystem.py  | 17 ++++++-
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py
index e2e68cbe8bf0..d426b166757f 100644
--- a/src/python/pants/backend/tools/semgrep/rules.py
+++ b/src/python/pants/backend/tools/semgrep/rules.py
@@ -44,29 +44,12 @@ class SemgrepLintRequest(LintTargetsRequest):
 @dataclass(frozen=True)
 class PartitionMetadata:
     config_files: frozenset[PurePath]
-    ignore_files: Snapshot
 
     @property
     def description(self) -> str:
         return ", ".join(sorted(str(path) for path in self.config_files))
 
 
-_IGNORE_FILE_NAME = ".semgrepignore"
-
-_RULES_DIR_NAME = ".semgrep"
-_RULES_FILES_GLOBS = (
-    ".semgrep.yml",
-    ".semgrep.yaml",
-    f"{_RULES_DIR_NAME}/*.yml",
-    f"{_RULES_DIR_NAME}/*.yaml",
-)
-
-
-@dataclass
-class SemgrepIgnoreFiles:
-    snapshot: Snapshot
-
-
 @dataclass
 class AllSemgrepConfigs:
     configs_by_dir: dict[PurePath, set[PurePath]]
@@ -85,14 +68,14 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]:
             yield from self.configs_by_dir.get(ancestor, [])
 
 
-def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs:
+def _group_by_semgrep_dir(rules_dir_name: str, all_paths: Paths) -> AllSemgrepConfigs:
     configs_by_dir = defaultdict(set)
     for path_ in all_paths.files:
         path = PurePath(path_)
         # A rule like foo/bar/.semgrep/baz.yaml should behave like it's in in foo/bar, not
         # foo/bar/.semgrep
         parent = path.parent
-        config_directory = parent.parent if parent.name == _RULES_DIR_NAME else parent
+        config_directory = parent.parent if parent.name == rules_dir_name else parent
 
         configs_by_dir[config_directory].add(path)
 
@@ -100,9 +83,17 @@ def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs:
 
 
 @rule
-async def find_all_semgrep_configs() -> AllSemgrepConfigs:
-    all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in _RULES_FILES_GLOBS]))
-    return _group_by_semgrep_dir(all_paths)
+async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs:
+    rules_files_globs = (
+        f"{semgrep.rules_dir_name}/*.yml",
+        f"{semgrep.rules_dir_name}/*.yaml",
+        # TODO: these don't seem to be mentioned in semgrep docs; should they be removed?
+        ".semgrep.yml",
+        ".semgrep.yaml",
+    )
+
+    all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in rules_files_globs]))
+    return _group_by_semgrep_dir(semgrep.rules_dir_name, all_paths)
 
 
 @dataclass(frozen=True)
@@ -121,17 +112,10 @@ async def infer_relevant_semgrep_configs(
     return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address))
 
 
-@rule
-async def all_semgrep_ignore_files() -> SemgrepIgnoreFiles:
-    snapshot = await Get(Snapshot, PathGlobs([f"**/{_IGNORE_FILE_NAME}"]))
-    return SemgrepIgnoreFiles(snapshot)
-
-
 @rule
 async def partition(
     request: SemgrepLintRequest.PartitionRequest[SemgrepFieldSet],
     semgrep: SemgrepSubsystem,
-    ignore_files: SemgrepIgnoreFiles,
 ) -> Partitions:
     if semgrep.skip:
         return Partitions()
@@ -148,7 +132,7 @@ async def partition(
             by_config[configs].append(field_set)
 
     return Partitions(
-        Partition(tuple(field_sets), PartitionMetadata(configs, ignore_files.snapshot))
+        Partition(tuple(field_sets), PartitionMetadata(configs))
         for configs, field_sets in by_config.items()
     )
 
@@ -175,6 +159,8 @@ async def lint(
         Get(Digest, CreateDigest([_DEFAULT_SETTINGS])),
     )
 
+    ignore_files = await Get(Snapshot, PathGlobs([semgrep.ignore_config_path]))
+
     input_digest = await Get(
         Digest,
         MergeDigests(
@@ -182,7 +168,7 @@ async def lint(
                 input_files.snapshot.digest,
                 config_files.digest,
                 settings,
-                request.partition_metadata.ignore_files.digest,
+                ignore_files.digest,
             )
         ),
     )
diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py
index 520560b8c736..318cb4948769 100644
--- a/src/python/pants/backend/tools/semgrep/rules_test.py
+++ b/src/python/pants/backend/tools/semgrep/rules_test.py
@@ -66,7 +66,7 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs:
 )
 def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs):
     input = Paths(files=paths, dirs=())
-    result = rules._group_by_semgrep_dir(input)
+    result = rules._group_by_semgrep_dir(".semgrep", input)
     assert result == expected
 
 
diff --git a/src/python/pants/backend/tools/semgrep/subsystem.py b/src/python/pants/backend/tools/semgrep/subsystem.py
index af73b460304e..dc08864c5ff0 100644
--- a/src/python/pants/backend/tools/semgrep/subsystem.py
+++ b/src/python/pants/backend/tools/semgrep/subsystem.py
@@ -12,7 +12,7 @@
 from pants.engine.rules import Rule, collect_rules
 from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target
 from pants.engine.unions import UnionRule
-from pants.option.option_types import ArgsListOption, BoolOption, SkipOption
+from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption
 from pants.util.strutil import softwrap
 
 
@@ -51,6 +51,21 @@ class SemgrepSubsystem(PythonToolBase):
     register_lockfile = True
     default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock")
 
+    rules_dir_name = StrOption(
+        default=".semgrep",
+        help=softwrap(
+            """
+            The directory name holding semgrep rules: YAML files directly inside it are used (not
+            recursively). It can be present at any level; its rules apply to all levels below it.
+            """
+        ),
+    )
+
+    ignore_config_path = StrOption(
+        default=".semgrepignore",
+        help="The path to the semgrepignore file",
+    )
+
     args = ArgsListOption(
         example="--verbose",
         default=["--quiet"],