diff --git a/main.go b/main.go index 8d63a236dab8f..0ff5485039d76 100644 --- a/main.go +++ b/main.go @@ -10,7 +10,6 @@ import ( "runtime" "strconv" "strings" - "sync" "syscall" "time" @@ -467,33 +466,26 @@ func run(state overseer.State) { } scanConfig := scanConfig{ - Command: cmd, - Concurrency: *concurrency, - Decoders: decoders.DefaultDecoders(), - Conf: conf, - IncludeFilter: includeFilter, - ExcludeFilter: excludeFilter, - EndpointCustomizer: endpointCustomizer, - NoVerification: *noVerification, - PrintAvgDetectorTime: *printAvgDetectorTime, - FilterUnverified: *filterUnverified, - FilterEntropy: *filterEntropy, - ScanEntireChunk: *scanEntireChunk, - JobReportWriter: jobReportWriter, - AllowVerificationOverlap: *allowVerificationOverlap, - ParsedResults: parsedResults, - Printer: printer, - } - - if *compareDetectionStrategies { - err := compareScans(ctx, scanConfig) - if err != nil { - logFatal(err, "error comparing detection strategies") - } - return - } - - metrics, err := runSingleScan(ctx, scanConfig, *scanEntireChunk) + Command: cmd, + Concurrency: *concurrency, + Decoders: decoders.DefaultDecoders(), + Conf: conf, + IncludeFilter: includeFilter, + ExcludeFilter: excludeFilter, + EndpointCustomizer: endpointCustomizer, + NoVerification: *noVerification, + PrintAvgDetectorTime: *printAvgDetectorTime, + FilterUnverified: *filterUnverified, + FilterEntropy: *filterEntropy, + ScanEntireChunk: *scanEntireChunk, + CompareDetectionStrategies: *compareDetectionStrategies, + JobReportWriter: jobReportWriter, + AllowVerificationOverlap: *allowVerificationOverlap, + ParsedResults: parsedResults, + Printer: printer, + } + + metrics, err := runScan(ctx, scanConfig) if err != nil { logFatal(err, "error running scan") } @@ -515,69 +507,23 @@ func run(state overseer.State) { } type scanConfig struct { - Command string - Concurrency int - Decoders []decoders.Decoder - Conf *config.Config - IncludeFilter func(detectors.Detector) bool - ExcludeFilter func(detectors.Detector) bool - EndpointCustomizer func(detectors.Detector) bool - NoVerification bool - PrintAvgDetectorTime bool - FilterUnverified bool - FilterEntropy float64 - ScanEntireChunk bool - JobReportWriter io.WriteCloser - AllowVerificationOverlap bool - ParsedResults map[string]struct{} - Printer engine.Printer -} - -func compareScans(ctx context.Context, cfg scanConfig) error { - var ( - entireMetrics metrics - maxLengthMetrics metrics - err error - ) - - var wg sync.WaitGroup - wg.Add(1) - - go func() { - defer wg.Done() - // Run scan with entire chunk span calculator. - entireMetrics, err = runSingleScan(ctx, cfg, true) - if err != nil { - ctx.Logger().Error(err, "error running scan with entire chunk span calculator") - } - }() - - // Run scan with max-length span calculator. - maxLengthMetrics, err = runSingleScan(ctx, cfg, false) - if err != nil { - return fmt.Errorf("error running scan with custom span calculator: %v", err) - } - - wg.Wait() - - return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics) -} - -func compareMetrics(customMetrics, entireMetrics engine.Metrics) error { - fmt.Printf("Comparison of scan results: \n") - fmt.Printf("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - customMetrics.ChunksScanned, customMetrics.BytesScanned, customMetrics.VerifiedSecretsFound, customMetrics.UnverifiedSecretsFound, customMetrics.ScanDuration.String()) - fmt.Printf("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - entireMetrics.ChunksScanned, entireMetrics.BytesScanned, entireMetrics.VerifiedSecretsFound, entireMetrics.UnverifiedSecretsFound, entireMetrics.ScanDuration.String()) - - // Check for differences in scan metrics. - if customMetrics.ChunksScanned != entireMetrics.ChunksScanned || - customMetrics.BytesScanned != entireMetrics.BytesScanned || - customMetrics.VerifiedSecretsFound != entireMetrics.VerifiedSecretsFound { - return fmt.Errorf("scan metrics do not match") - } - - return nil + Command string + Concurrency int + Decoders []decoders.Decoder + Conf *config.Config + IncludeFilter func(detectors.Detector) bool + ExcludeFilter func(detectors.Detector) bool + EndpointCustomizer func(detectors.Detector) bool + NoVerification bool + PrintAvgDetectorTime bool + FilterUnverified bool + FilterEntropy float64 + ScanEntireChunk bool + CompareDetectionStrategies bool + JobReportWriter io.WriteCloser + AllowVerificationOverlap bool + ParsedResults map[string]struct{} + Printer engine.Printer } type metrics struct { @@ -585,7 +531,7 @@ type metrics struct { hasFoundResults bool } -func runSingleScan(ctx context.Context, cfg scanConfig, scanEntireChunk bool) (metrics, error) { +func runScan(ctx context.Context, cfg scanConfig) (metrics, error) { eng, err := engine.Start(ctx, engine.WithConcurrency(cfg.Concurrency), engine.WithDecoders(cfg.Decoders...), @@ -601,7 +547,8 @@ func runSingleScan(ctx context.Context, cfg scanConfig, scanEntireChunk bool) (m engine.WithPrinter(cfg.Printer), engine.WithFilterEntropy(cfg.FilterEntropy), engine.WithVerificationOverlap(cfg.AllowVerificationOverlap), - engine.WithEntireChunkScan(scanEntireChunk), + engine.WithEntireChunkScan(cfg.ScanEntireChunk), + engine.WithCompareDetectionStrategies(cfg.CompareDetectionStrategies), ) if err != nil { return metrics{}, fmt.Errorf("error initializing engine: %v", err) diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index d6ce0dd96b18b..7fbdea8f8eb4b 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -41,16 +41,6 @@ type spanCalculationParams struct { detector detectors.Detector } -// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data. -// This is used when we want to match against the full length of the provided chunk. -type EntireChunkSpanCalculator struct{} - -// calculateSpan returns the match span as the length of the chunk data, -// effectively using the entire chunk for matching. -func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { - return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))} -} - // adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude // or values provided by specific detectors to adjust the start and end indices of the span, allowing // for more granular control over the match. diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 74ec912e93f00..8b14d404b8e6e 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -81,6 +81,12 @@ type Engine struct { // By default, the engine will only scan a subset of the chunk if a detector matches the chunk. // If this flag is set to true, the engine will scan the entire chunk. scanEntireChunk bool + // If this flag is set to true, the engine will run two scans per chunk: + // 1. the entire chunk (old) + // 2. a subset of the chunk (new) + // + // Any discrepancies between methods will be logged. + compareScanStrategies bool // ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups. ahoCorasickCore *ahocorasick.Core @@ -264,6 +270,11 @@ func WithEntireChunkScan(enabled bool) Option { return func(e *Engine) { e.scanEntireChunk = enabled } } +// WithCompareDetectionStrategies sets the flag to scan with both strategies and log any discrepancies. +func WithCompareDetectionStrategies(enabled bool) Option { + return func(e *Engine) { e.compareScanStrategies = enabled } +} + // HasFoundResults returns true if any results are found. func (e *Engine) HasFoundResults() bool { return atomic.LoadUint32(&e.numFoundResults) > 0 @@ -407,14 +418,8 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error { } ctx.Logger().V(4).Info("engine initialized") - // Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk. - var ahoCOptions []ahocorasick.CoreOption - if e.scanEntireChunk { - ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator))) - } - ctx.Logger().V(4).Info("setting up aho-corasick core") - e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...) + e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors) ctx.Logger().V(4).Info("set up aho-corasick core") return nil @@ -804,11 +809,22 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { func (e *Engine) detectChunks(ctx context.Context) { for data := range e.detectableChunksChan { - e.detectChunk(ctx, data) + if !e.compareScanStrategies { + // Typical use case: scan the chunk. + e.detectChunk(ctx, data, e.scanEntireChunk) + } else { + // --compare-detection-strategies is enabled, scan with both methods and compare results. + customSpanResultCount := e.detectChunk(ctx, data, false) + entireChunkResultCount := e.detectChunk(ctx, data, true) + if customSpanResultCount != entireChunkResultCount { + err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount) + ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String()) + } + } } } -func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { +func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int { var start time.Time if e.printAvgDetectorTime { start = time.Now() @@ -822,7 +838,13 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { // The matches field of the DetectorMatch struct contains the // relevant portions of the chunk data that were matched. // This avoids the need for additional regex processing on the entire chunk data. - matchedBytes := data.detector.Matches() + var matchedBytes [][]byte + if scanEntireChunk { + matchedBytes = [][]byte{data.chunk.Data} + } else { + matchedBytes = data.detector.Matches() + } + resultCount := 0 for _, match := range matchedBytes { results, err := data.detector.FromData(ctx, data.chunk.Verify, match) if err != nil { @@ -833,25 +855,44 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { if e.printAvgDetectorTime && len(results) > 0 { elapsed := time.Since(start) detectorName := results[0].DetectorType.String() + avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName) + if !ok { + ctx.Logger().Error( + errors.New("failed to load metric"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults + } + var avgTime []time.Duration - if ok { - avgTime, ok = avgTimeI.([]time.Duration) - if !ok { - return - } + avgTime, ok = avgTimeI.([]time.Duration) + if !ok { + ctx.Logger().Error( + errors.New("failed to cast metric as []time.Duration"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults } avgTime = append(avgTime, elapsed) e.metrics.detectorAvgTime.Store(detectorName, avgTime) } + HandleResults: results = e.filterResults(ctx, data.detector, results, e.logFilteredUnverified) + resultCount += len(results) for _, res := range results { e.processResult(ctx, data, res) } } - data.wgDoneFn() + // If `e.compareScanStrategies` is enabled, two scans will be run. + // Don't decrement the WaitGroup until both have been completed. + if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) { + data.wgDoneFn() + } + + return resultCount } // filterResults applies multiple filters to the detection results to reduce false positives