diff --git a/main.go b/main.go index fff23e77e36f..d79d37fe0ba1 100644 --- a/main.go +++ b/main.go @@ -12,7 +12,6 @@ import ( "runtime" "strconv" "strings" - "sync" "syscall" "github.com/alecthomas/kingpin/v2" @@ -20,10 +19,11 @@ import ( "github.com/go-logr/logr" "github.com/jpillora/overseer" "github.com/mattn/go-isatty" + "go.uber.org/automaxprocs/maxprocs" + "github.com/trufflesecurity/trufflehog/v3/pkg/cache/simple" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache" - "go.uber.org/automaxprocs/maxprocs" "github.com/trufflesecurity/trufflehog/v3/pkg/analyzer" "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" @@ -493,37 +493,36 @@ func run(state overseer.State) { // default detectors, which can be further filtered by the // user. The filters are applied by the engine and are only // subtractive. - Detectors: append(defaults.DefaultDetectors(), conf.Detectors...), - Verify: !*noVerification, - IncludeDetectors: *includeDetectors, - ExcludeDetectors: *excludeDetectors, - CustomVerifiersOnly: *customVerifiersOnly, - VerifierEndpoints: *verifiers, - Dispatcher: engine.NewPrinterDispatcher(printer), - FilterUnverified: *filterUnverified, - FilterEntropy: *filterEntropy, - VerificationOverlap: *allowVerificationOverlap, - Results: parsedResults, - PrintAvgDetectorTime: *printAvgDetectorTime, - ShouldScanEntireChunk: *scanEntireChunk, - VerificationCacheMetrics: &verificationCacheMetrics, + Detectors: append(defaults.DefaultDetectors(), conf.Detectors...), + Verify: !*noVerification, + IncludeDetectors: *includeDetectors, + ExcludeDetectors: *excludeDetectors, + CustomVerifiersOnly: *customVerifiersOnly, + VerifierEndpoints: *verifiers, + Dispatcher: engine.NewPrinterDispatcher(printer), + FilterUnverified: *filterUnverified, + FilterEntropy: *filterEntropy, + VerificationOverlap: *allowVerificationOverlap, + Results: parsedResults, + PrintAvgDetectorTime: *printAvgDetectorTime, + ShouldScanEntireChunk: *scanEntireChunk, + CompareDetectionStrategies: *compareDetectionStrategies, + VerificationCacheMetrics: &verificationCacheMetrics, } if !*noVerificationCache { engConf.VerificationResultCache = simple.NewCache[detectors.Result]() } - if *compareDetectionStrategies { - if err := compareScans(ctx, cmd, engConf); err != nil { - logFatal(err, "error comparing detection strategies") + topLevelSubCommand, _, _ := strings.Cut(cmd, " ") + switch topLevelSubCommand { + case analyzeCmd.FullCommand(): + analyzer.Run(cmd) + default: + metrics, err := runSingleScan(ctx, cmd, engConf) + if err != nil { + logFatal(err, "error running scan") } - return - } - - metrics, err := runSingleScan(ctx, cmd, engConf) - if err != nil { - logFatal(err, "error running scan") - } verificationCacheMetricsSnapshot := struct { Hits int32 @@ -556,54 +555,6 @@ func run(state overseer.State) { } } -func compareScans(ctx context.Context, cmd string, cfg engine.Config) error { - var ( - entireMetrics metrics - maxLengthMetrics metrics - err error - ) - - var wg sync.WaitGroup - wg.Add(1) - - go func() { - defer wg.Done() - // Run scan with entire chunk span calculator. - cfg.ShouldScanEntireChunk = true - entireMetrics, err = runSingleScan(ctx, cmd, cfg) - if err != nil { - ctx.Logger().Error(err, "error running scan with entire chunk span calculator") - } - }() - - // Run scan with max-length span calculator. - maxLengthMetrics, err = runSingleScan(ctx, cmd, cfg) - if err != nil { - return fmt.Errorf("error running scan with custom span calculator: %v", err) - } - - wg.Wait() - - return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics) -} - -func compareMetrics(customMetrics, entireMetrics engine.Metrics) error { - fmt.Printf("Comparison of scan results: \n") - fmt.Printf("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - customMetrics.ChunksScanned, customMetrics.BytesScanned, customMetrics.VerifiedSecretsFound, customMetrics.UnverifiedSecretsFound, customMetrics.ScanDuration.String()) - fmt.Printf("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - entireMetrics.ChunksScanned, entireMetrics.BytesScanned, entireMetrics.VerifiedSecretsFound, entireMetrics.UnverifiedSecretsFound, entireMetrics.ScanDuration.String()) - - // Check for differences in scan metrics. - if customMetrics.ChunksScanned != entireMetrics.ChunksScanned || - customMetrics.BytesScanned != entireMetrics.BytesScanned || - customMetrics.VerifiedSecretsFound != entireMetrics.VerifiedSecretsFound { - return fmt.Errorf("scan metrics do not match") - } - - return nil -} - type metrics struct { engine.Metrics hasFoundResults bool diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index c6f97da1476c..357ab04d79b3 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -52,16 +52,6 @@ type spanCalculationParams struct { detector detectors.Detector } -// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data. -// This is used when we want to match against the full length of the provided chunk. -type EntireChunkSpanCalculator struct{} - -// calculateSpan returns the match span as the length of the chunk data, -// effectively using the entire chunk for matching. -func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { - return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))} -} - // adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude // or values provided by specific detectors to adjust the start and end indices of the span, allowing // for more granular control over the match. diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 2bfb2d51b92c..50fbdf0681c7 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -13,9 +13,10 @@ import ( "github.com/adrg/strutil" "github.com/adrg/strutil/metrics" lru "github.com/hashicorp/golang-lru/v2" - "github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache" "google.golang.org/protobuf/proto" + "github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache" + "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/config" "github.com/trufflesecurity/trufflehog/v3/pkg/context" @@ -149,6 +150,8 @@ type Config struct { VerificationResultCache verificationcache.ResultCache VerificationCacheMetrics verificationcache.MetricsReporter + + CompareDetectionStrategies bool } // Engine represents the core scanning engine responsible for detecting secrets in input data. @@ -179,6 +182,12 @@ type Engine struct { // By default, the engine will only scan a subset of the chunk if a detector matches the chunk. // If this flag is set to true, the engine will scan the entire chunk. scanEntireChunk bool + // If this flag is set to true, the engine will run two scans per chunk: + // 1. the entire chunk (old) + // 2. a subset of the chunk (new) + // + // Any discrepancies between methods will be logged. + compareScanStrategies bool // ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups. AhoCorasickCore *ahocorasick.Core @@ -241,6 +250,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) { detectorWorkerMultiplier: cfg.DetectorWorkerMultiplier, notificationWorkerMultiplier: cfg.NotificationWorkerMultiplier, verificationOverlapWorkerMultiplier: cfg.VerificationOverlapWorkerMultiplier, + compareScanStrategies: cfg.CompareDetectionStrategies, } if engine.sourceManager == nil { return nil, fmt.Errorf("source manager is required") @@ -517,14 +527,8 @@ func (e *Engine) initialize(ctx context.Context) error { e.dedupeCache = cache ctx.Logger().V(4).Info("engine initialized") - // Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk. - var ahoCOptions []ahocorasick.CoreOption - if e.scanEntireChunk { - ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator))) - } - ctx.Logger().V(4).Info("setting up aho-corasick core") - e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...) + e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors) ctx.Logger().V(4).Info("set up aho-corasick core") return nil @@ -1033,12 +1037,25 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { func (e *Engine) detectorWorker(ctx context.Context) { for data := range e.detectableChunksChan { start := time.Now() - e.detectChunk(ctx, data) + + if !e.compareScanStrategies { + // Typical use case: scan the chunk. + _ = e.detectChunk(ctx, data, e.scanEntireChunk) + } else { + // --compare-detection-strategies is enabled, scan with both methods and compare results. + customSpanResultCount := e.detectChunk(ctx, data, false) + entireChunkResultCount := e.detectChunk(ctx, data, true) + + if customSpanResultCount != entireChunkResultCount { + err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount) + ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String()) + } + } chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds())) } } -func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { +func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int { var start time.Time if e.printAvgDetectorTime { start = time.Now() @@ -1055,8 +1072,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { // The matches field of the DetectorMatch struct contains the // relevant portions of the chunk data that were matched. // This avoids the need for additional regex processing on the entire chunk data. - matches := data.detector.Matches() - for _, matchBytes := range matches { + var matchedBytes [][]byte + if scanEntireChunk { + matchedBytes = [][]byte{data.chunk.Data} + } else { + matchedBytes = data.detector.Matches() + } + resultCount := 0 + for _, matchBytes := range matchedBytes { matchCount++ detectBytesPerMatch.Observe(float64(len(matchBytes))) @@ -1089,13 +1112,23 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { if e.printAvgDetectorTime && len(results) > 0 { elapsed := time.Since(start) detectorName := results[0].DetectorType.String() + avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName) - var avgTime []time.Duration - if ok { - avgTime, ok = avgTimeI.([]time.Duration) - if !ok { - return - } + if !ok { + ctx.Logger().Error( + errors.New("failed to load metric"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults + } + + avgTime, ok := avgTimeI.([]time.Duration) + if !ok { + ctx.Logger().Error( + errors.New("failed to cast metric as []time.Duration"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults } avgTime = append(avgTime, elapsed) e.metrics.detectorAvgTime.Store(detectorName, avgTime) @@ -1110,6 +1143,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { results = e.filterResults(ctx, data.detector, results) } + HandleResults: + results = e.filterResults(ctx, data.detector, results) + + resultCount += len(results) for _, res := range results { e.processResult(ctx, data, res, isFalsePositive) } @@ -1117,7 +1154,13 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { matchesPerChunk.Observe(float64(matchCount)) - data.wgDoneFn() + // If `e.compareScanStrategies` is enabled, two scans will be run. + // Don't decrement the WaitGroup until both have been completed. + if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) { + data.wgDoneFn() + } + + return resultCount } func (e *Engine) filterResults(