-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Compare detection strategies per chunk/detector #2922
Open
rgmz
wants to merge
1
commit into
trufflesecurity:main
Choose a base branch
from
rgmz:feat/inline-compare
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,9 +13,10 @@ import ( | |
"github.com/adrg/strutil" | ||
"github.com/adrg/strutil/metrics" | ||
lru "github.com/hashicorp/golang-lru/v2" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache" | ||
"google.golang.org/protobuf/proto" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/common" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/config" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/context" | ||
|
@@ -149,6 +150,8 @@ type Config struct { | |
|
||
VerificationResultCache verificationcache.ResultCache | ||
VerificationCacheMetrics verificationcache.MetricsReporter | ||
|
||
CompareDetectionStrategies bool | ||
} | ||
|
||
// Engine represents the core scanning engine responsible for detecting secrets in input data. | ||
|
@@ -179,6 +182,12 @@ type Engine struct { | |
// By default, the engine will only scan a subset of the chunk if a detector matches the chunk. | ||
// If this flag is set to true, the engine will scan the entire chunk. | ||
scanEntireChunk bool | ||
// If this flag is set to true, the engine will run two scans per chunk: | ||
// 1. the entire chunk (old) | ||
// 2. a subset of the chunk (new) | ||
// | ||
// Any discrepancies between methods will be logged. | ||
compareScanStrategies bool | ||
|
||
// AhoCorasickCore manages the Aho-Corasick trie and related keyword lookups. | |
AhoCorasickCore *ahocorasick.Core | ||
|
@@ -241,6 +250,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) { | |
detectorWorkerMultiplier: cfg.DetectorWorkerMultiplier, | ||
notificationWorkerMultiplier: cfg.NotificationWorkerMultiplier, | ||
verificationOverlapWorkerMultiplier: cfg.VerificationOverlapWorkerMultiplier, | ||
compareScanStrategies: cfg.CompareDetectionStrategies, | ||
} | ||
if engine.sourceManager == nil { | ||
return nil, fmt.Errorf("source manager is required") | ||
|
@@ -517,14 +527,8 @@ func (e *Engine) initialize(ctx context.Context) error { | |
e.dedupeCache = cache | ||
ctx.Logger().V(4).Info("engine initialized") | ||
|
||
// Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk. | ||
var ahoCOptions []ahocorasick.CoreOption | ||
if e.scanEntireChunk { | ||
ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator))) | ||
} | ||
|
||
ctx.Logger().V(4).Info("setting up aho-corasick core") | ||
e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...) | ||
e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors) | ||
ctx.Logger().V(4).Info("set up aho-corasick core") | ||
|
||
return nil | ||
|
@@ -1033,12 +1037,25 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { | |
func (e *Engine) detectorWorker(ctx context.Context) { | ||
for data := range e.detectableChunksChan { | ||
start := time.Now() | ||
e.detectChunk(ctx, data) | ||
|
||
if !e.compareScanStrategies { | ||
// Typical use case: scan the chunk. | ||
_ = e.detectChunk(ctx, data, e.scanEntireChunk) | ||
} else { | ||
// --compare-detection-strategies is enabled, scan with both methods and compare results. | ||
customSpanResultCount := e.detectChunk(ctx, data, false) | ||
entireChunkResultCount := e.detectChunk(ctx, data, true) | ||
|
||
if customSpanResultCount != entireChunkResultCount { | ||
err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount) | ||
ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String()) | ||
} | ||
} | ||
chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds())) | ||
} | ||
} | ||
|
||
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { | ||
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int { | ||
var start time.Time | ||
if e.printAvgDetectorTime { | ||
start = time.Now() | ||
|
@@ -1055,8 +1072,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { | |
// The matches field of the DetectorMatch struct contains the | ||
// relevant portions of the chunk data that were matched. | ||
// This avoids the need for additional regex processing on the entire chunk data. | ||
matches := data.detector.Matches() | ||
for _, matchBytes := range matches { | ||
var matchedBytes [][]byte | ||
if scanEntireChunk { | ||
matchedBytes = [][]byte{data.chunk.Data} | ||
} else { | ||
matchedBytes = data.detector.Matches() | ||
} | ||
resultCount := 0 | ||
for _, matchBytes := range matchedBytes { | ||
matchCount++ | ||
detectBytesPerMatch.Observe(float64(len(matchBytes))) | ||
|
||
|
@@ -1089,13 +1112,23 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { | |
if e.printAvgDetectorTime && len(results) > 0 { | ||
elapsed := time.Since(start) | ||
detectorName := results[0].DetectorType.String() | ||
|
||
avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName) | ||
var avgTime []time.Duration | ||
if ok { | ||
avgTime, ok = avgTimeI.([]time.Duration) | ||
if !ok { | ||
return | ||
} | ||
if !ok { | ||
ctx.Logger().Error( | ||
errors.New("failed to load metric"), | ||
"Unable to track detector time", | ||
"detector", detectorName) | ||
goto HandleResults | ||
} | ||
|
||
avgTime, ok := avgTimeI.([]time.Duration) | ||
if !ok { | ||
ctx.Logger().Error( | ||
errors.New("failed to cast metric as []time.Duration"), | ||
"Unable to track detector time", | ||
"detector", detectorName) | ||
goto HandleResults | ||
} | ||
avgTime = append(avgTime, elapsed) | ||
e.metrics.detectorAvgTime.Store(detectorName, avgTime) | ||
|
@@ -1110,14 +1143,24 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { | |
results = e.filterResults(ctx, data.detector, results) | ||
} | ||
|
||
HandleResults: | ||
results = e.filterResults(ctx, data.detector, results) | ||
|
||
resultCount += len(results) | ||
for _, res := range results { | ||
e.processResult(ctx, data, res, isFalsePositive) | ||
} | ||
} | ||
|
||
matchesPerChunk.Observe(float64(matchCount)) | ||
|
||
data.wgDoneFn() | ||
// If `e.compareScanStrategies` is enabled, two scans will be run. | ||
// Don't decrement the WaitGroup until both have been completed. | ||
if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) { | ||
data.wgDoneFn() | ||
} | ||
Comment on lines
+1159
to
+1161
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a bit brittle. It might be better if moved to |
||
|
||
return resultCount | ||
} | ||
|
||
func (e *Engine) filterResults( | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know if this line is still necessary.