- Sponsor
-
Notifications
You must be signed in to change notification settings - Fork 646
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add lint backend to run semgrep #18593
Changes from all commits
c041769
f4bde7f
8955c50
a2fa07b
56dccf9
955538e
68fc026
8be9b21
594d1ea
790fe51
4f33a5d
89c5b4d
25733f3
2bb0672
323f693
d08611f
798ad6c
a381711
987b484
613fc83
d0f5d4e
b86ba09
3efeac6
a405f7b
1fad45e
f71db25
42f9683
ec9c1fd
ec79db0
d7ff26e
b390be2
00d4a97
a208b90
e2ca370
a198538
af1101f
71091e6
e2902ac
cee2487
05c49c4
a3c7e07
5f7fdaa
195d090
9c68431
ace84a6
409e73c
25c4f48
385666a
5db7e97
16f22c4
ed0687a
4593fb2
db6bc2e
7e7135f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Copyright 2023 Pants project contributors (see CONTRIBUTORS.md). | ||
# Licensed under the Apache License, Version 2.0 (see LICENSE). | ||
python_sources() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Copyright 2023 Pants project contributors (see CONTRIBUTORS.md). | ||
# Licensed under the Apache License, Version 2.0 (see LICENSE). | ||
|
||
"""Lightweight static analysis for many languages. Find bug variants with patterns that look like | ||
source code. | ||
|
||
See https://semgrep.dev/ for details. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import Iterable | ||
|
||
from pants.backend.python.goals import lockfile as python_lockfile | ||
from pants.backend.tools.semgrep import rules as semgrep_rules | ||
from pants.backend.tools.semgrep import subsystem as subsystem | ||
from pants.engine.rules import Rule | ||
from pants.engine.unions import UnionRule | ||
|
||
|
||
def rules() -> Iterable[Rule | UnionRule]: | ||
return ( | ||
*semgrep_rules.rules(), | ||
*subsystem.rules(), | ||
*python_lockfile.rules(), | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright 2023 Pants project contributors (see CONTRIBUTORS.md). | ||
# Licensed under the Apache License, Version 2.0 (see LICENSE). | ||
|
||
resource(name="lockfile", source="semgrep.lock") | ||
|
||
python_sources(dependencies=[":lockfile"]) | ||
|
||
python_tests( | ||
name="tests", | ||
overrides={ | ||
"rules_integration_test.py": dict(timeout=240), | ||
}, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
# Copyright 2023 Pants project contributors (see CONTRIBUTORS.md). | ||
# Licensed under the Apache License, Version 2.0 (see LICENSE). | ||
from __future__ import annotations | ||
|
||
import itertools | ||
import logging | ||
from collections import defaultdict | ||
from dataclasses import dataclass | ||
from pathlib import Path | ||
from typing import Iterable | ||
|
||
from pants.backend.python.util_rules import pex | ||
from pants.backend.python.util_rules.pex import PexRequest, VenvPex, VenvPexProcess | ||
from pants.core.goals.lint import LintResult, LintTargetsRequest | ||
from pants.core.util_rules.partitions import Partition, Partitions | ||
from pants.core.util_rules.source_files import SourceFiles, SourceFilesRequest | ||
from pants.engine.addresses import Address | ||
from pants.engine.fs import ( | ||
CreateDigest, | ||
Digest, | ||
FileContent, | ||
MergeDigests, | ||
PathGlobs, | ||
Paths, | ||
Snapshot, | ||
) | ||
from pants.engine.process import FallibleProcessResult, ProcessCacheScope | ||
from pants.engine.rules import Get, MultiGet, Rule, collect_rules, rule | ||
from pants.engine.unions import UnionRule | ||
from pants.option.global_options import GlobalOptions | ||
from pants.util.logging import LogLevel | ||
from pants.util.strutil import pluralize | ||
|
||
from .subsystem import SemgrepFieldSet, SemgrepSubsystem | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class SemgrepLintRequest(LintTargetsRequest): | ||
field_set_type = SemgrepFieldSet | ||
tool_subsystem = SemgrepSubsystem | ||
|
||
|
||
@dataclass(frozen=True) | ||
class PartitionMetadata: | ||
config_files: frozenset[Path] | ||
ignore_files: Snapshot | ||
|
||
@property | ||
def description(self) -> str: | ||
return ", ".join(sorted(str(path) for path in self.config_files)) | ||
|
||
|
||
_IGNORE_FILE_NAME = ".semgrepignore" | ||
|
||
_RULES_DIR_NAME = ".semgrep" | ||
_RULES_FILES_GLOBS = ( | ||
".semgrep.yml", | ||
".semgrep.yaml", | ||
f"{_RULES_DIR_NAME}/*.yml", | ||
f"{_RULES_DIR_NAME}/*.yaml", | ||
) | ||
|
||
|
||
@dataclass | ||
class SemgrepIgnoreFiles: | ||
snapshot: Snapshot | ||
|
||
|
||
@dataclass | ||
class AllSemgrepConfigs: | ||
configs_by_dir: dict[Path, set[Path]] | ||
|
||
def ancestor_configs(self, address: Address) -> Iterable[Path]: | ||
# TODO: introspect the semgrep rules and determine which (if any) apply to the files, e.g. a | ||
# Python file shouldn't depend on a .semgrep.yml that doesn't have any 'python' or 'generic' | ||
# rules, and similarly if there's path inclusions/exclusions. | ||
# TODO: this would be better as actual dependency inference (e.g. allows inspection, manual | ||
# addition/exclusion), but that can only infer 'full' dependencies and it is wrong (e.g. JVM | ||
# things break) for real code files to depend on this sort of non-code linter config; requires | ||
# dependency scopes or similar (https://github.com/pantsbuild/pants/issues/12794) | ||
spec = Path(address.spec_path) | ||
|
||
for ancestor in itertools.chain([spec], spec.parents): | ||
yield from self.configs_by_dir.get(ancestor, []) | ||
|
||
|
||
def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs: | ||
configs_by_dir = defaultdict(set) | ||
for path_ in all_paths.files: | ||
path = Path(path_) | ||
# A rule like foo/bar/.semgrep/baz.yaml should behave like it's in in foo/bar, not | ||
# foo/bar/.semgrep | ||
parent = path.parent | ||
config_directory = parent.parent if parent.name == _RULES_DIR_NAME else parent | ||
|
||
configs_by_dir[config_directory].add(path) | ||
|
||
return AllSemgrepConfigs(configs_by_dir) | ||
|
||
|
||
@rule | ||
async def find_all_semgrep_configs() -> AllSemgrepConfigs: | ||
all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in _RULES_FILES_GLOBS])) | ||
return _group_by_semgrep_dir(all_paths) | ||
|
||
|
||
@dataclass(frozen=True) | ||
class RelevantSemgrepConfigsRequest: | ||
field_set: SemgrepFieldSet | ||
|
||
|
||
class RelevantSemgrepConfigs(frozenset[Path]): | ||
pass | ||
|
||
|
||
@rule | ||
async def infer_relevant_semgrep_configs( | ||
request: RelevantSemgrepConfigsRequest, all_semgrep: AllSemgrepConfigs | ||
) -> RelevantSemgrepConfigs: | ||
return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address)) | ||
|
||
|
||
@rule | ||
async def all_semgrep_ignore_files() -> SemgrepIgnoreFiles: | ||
snapshot = await Get(Snapshot, PathGlobs([f"**/{_IGNORE_FILE_NAME}"])) | ||
return SemgrepIgnoreFiles(snapshot) | ||
|
||
|
||
@rule | ||
async def partition( | ||
request: SemgrepLintRequest.PartitionRequest[SemgrepFieldSet], | ||
semgrep: SemgrepSubsystem, | ||
ignore_files: SemgrepIgnoreFiles, | ||
) -> Partitions: | ||
if semgrep.skip: | ||
return Partitions() | ||
|
||
all_configs = await MultiGet( | ||
Get(RelevantSemgrepConfigs, RelevantSemgrepConfigsRequest(field_set)) | ||
for field_set in request.field_sets | ||
) | ||
|
||
# partition by the sets of configs that apply to each input | ||
by_config = defaultdict(list) | ||
for field_set, configs in zip(request.field_sets, all_configs): | ||
if configs: | ||
by_config[configs].append(field_set) | ||
|
||
return Partitions( | ||
Partition(tuple(field_sets), PartitionMetadata(configs, ignore_files.snapshot)) | ||
for configs, field_sets in by_config.items() | ||
) | ||
|
||
|
||
# We have a hard-coded settings file to side-step | ||
# https://github.com/returntocorp/semgrep/issues/7102, and also provide more cacheability. | ||
_DEFAULT_SETTINGS = FileContent( | ||
path="__semgrep_settings.yaml", | ||
content=b"has_shown_metrics_notification: true", | ||
) | ||
|
||
|
||
@rule(desc="Lint with Semgrep", level=LogLevel.DEBUG) | ||
async def lint( | ||
request: SemgrepLintRequest.Batch[SemgrepFieldSet, PartitionMetadata], | ||
semgrep: SemgrepSubsystem, | ||
global_options: GlobalOptions, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Side observation (not suggesting actionable in this PR): I think it is unfortunate to depend on the entire I see this is the case in quite a few other places as well, which makes me think we might want to consider having a generic There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've been meaning to generate a rule per option from subsystems... Let me file a ticket... |
||
) -> LintResult: | ||
config_files, semgrep_pex, input_files, settings = await MultiGet( | ||
Get(Snapshot, PathGlobs(str(s) for s in request.partition_metadata.config_files)), | ||
Get(VenvPex, PexRequest, semgrep.to_pex_request()), | ||
Get(SourceFiles, SourceFilesRequest(field_set.source for field_set in request.elements)), | ||
Get(Digest, CreateDigest([_DEFAULT_SETTINGS])), | ||
) | ||
|
||
input_digest = await Get( | ||
Digest, | ||
MergeDigests( | ||
( | ||
input_files.snapshot.digest, | ||
config_files.digest, | ||
settings, | ||
request.partition_metadata.ignore_files.digest, | ||
) | ||
), | ||
) | ||
|
||
cache_scope = ProcessCacheScope.PER_SESSION if semgrep.force else ProcessCacheScope.SUCCESSFUL | ||
|
||
# TODO: https://github.com/pantsbuild/pants/issues/18430 support running this with --autofix | ||
# under the fix goal... but not all rules have fixes, so we need to be running with | ||
# --error/checking exit codes, which FixResult doesn't currently support. | ||
result = await Get( | ||
FallibleProcessResult, | ||
VenvPexProcess( | ||
semgrep_pex, | ||
argv=( | ||
"scan", | ||
*(f"--config={f}" for f in config_files.files), | ||
"--jobs={pants_concurrency}", | ||
"--error", | ||
*semgrep.args, | ||
# we don't pass the target files directly because that overrides .semgrepignore | ||
huonw marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# (https://github.com/returntocorp/semgrep/issues/4978), so instead we just tell its | ||
# traversal to include all the source files in this partition. Unfortunately this | ||
# include is implicitly unrooted (i.e. as if it was **/path/to/file), and so may | ||
# pick up other files if the names match. The highest risk of this is within the | ||
# semgrep PEX. | ||
*(f"--include={f}" for f in input_files.files), | ||
f"--exclude={semgrep_pex.pex_filename}", | ||
), | ||
extra_env={ | ||
"SEMGREP_FORCE_COLOR": "true", | ||
# disable various global state/network requests | ||
"SEMGREP_SETTINGS_FILE": _DEFAULT_SETTINGS.path, | ||
"SEMGREP_ENABLE_VERSION_CHECK": "0", | ||
"SEMGREP_SEND_METRICS": "off", | ||
}, | ||
input_digest=input_digest, | ||
concurrency_available=len(input_files.files), | ||
description=f"Run Semgrep on {pluralize(len(input_files.files), 'file')}.", | ||
level=LogLevel.DEBUG, | ||
cache_scope=cache_scope, | ||
), | ||
) | ||
|
||
return LintResult.create(request, result, strip_formatting=not global_options.colors) | ||
|
||
|
||
def rules() -> Iterable[Rule | UnionRule]: | ||
return [*collect_rules(), *SemgrepLintRequest.rules(), *pex.rules()] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be
PurePath
? Generally we prefer thePure
variants since rules shouldn't be doing file I/OThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this got resolved?
It's the only one I feel strongly about.
Admittedly we should issue a lint.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, dear, sorry, I fixed but pushed to the wrong branch! Thanks for flagging.
#19685
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#19686