feat(engine): compare results per chunk/detector
rgmz authored and Richard Gomez committed Jun 5, 2024
1 parent d8e7fa9 commit e6c8d7d
Showing 3 changed files with 97 additions and 123 deletions.
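To illustrate the idea behind the commit, here is a minimal, self-contained Go sketch of comparing the two detection strategies per chunk and detector. The chunk, detector, and keywordDetector types and the detectWith helper are illustrative stand-ins, not the engine's real API; the actual comparison happens inside detectChunks/detectChunk in the pkg/engine/engine.go diff below.

package main

import (
    "bytes"
    "log"
)

// chunk and detector are simplified stand-ins for the engine's real types.
type chunk struct{ data []byte }

type detector interface {
    Name() string
    FromData(data []byte) int // number of secrets found in data
}

// keywordDetector is a toy detector that counts occurrences of a keyword.
type keywordDetector struct{ keyword []byte }

func (k keywordDetector) Name() string             { return "keyword" }
func (k keywordDetector) FromData(data []byte) int { return bytes.Count(data, k.keyword) }

// detectWith runs one detector over a chunk using either the entire chunk
// (old behavior) or only the pre-matched spans (new behavior).
func detectWith(d detector, c chunk, matches [][]byte, entireChunk bool) int {
    inputs := matches
    if entireChunk {
        inputs = [][]byte{c.data}
    }
    total := 0
    for _, in := range inputs {
        total += d.FromData(in)
    }
    return total
}

func main() {
    d := keywordDetector{keyword: []byte("SECRET")}
    c := chunk{data: []byte("xx SECRET yy SECRET zz")}
    // Pretend the span calculator only surfaced the first match.
    matches := [][]byte{[]byte("SECRET yy")}

    custom := detectWith(d, c, matches, false)
    entire := detectWith(d, c, matches, true)
    if custom != entire {
        // Mirrors the engine's discrepancy log when comparison is enabled.
        log.Printf("detector %s: mismatch between custom span and entire chunk: %d vs %d",
            d.Name(), custom, entire)
    }
}

Running the sketch reports a mismatch (1 vs 2), which is the kind of per-detector discrepancy the engine now logs when --compare-detection-strategies is enabled.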
133 changes: 40 additions & 93 deletions main.go
@@ -10,7 +10,6 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"

@@ -467,33 +466,26 @@ func run(state overseer.State) {
}

scanConfig := scanConfig{
Command: cmd,
Concurrency: *concurrency,
Decoders: decoders.DefaultDecoders(),
Conf: conf,
IncludeFilter: includeFilter,
ExcludeFilter: excludeFilter,
EndpointCustomizer: endpointCustomizer,
NoVerification: *noVerification,
PrintAvgDetectorTime: *printAvgDetectorTime,
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
ScanEntireChunk: *scanEntireChunk,
JobReportWriter: jobReportWriter,
AllowVerificationOverlap: *allowVerificationOverlap,
ParsedResults: parsedResults,
Printer: printer,
}

if *compareDetectionStrategies {
err := compareScans(ctx, scanConfig)
if err != nil {
logFatal(err, "error comparing detection strategies")
}
return
}

metrics, err := runSingleScan(ctx, scanConfig, *scanEntireChunk)
Command: cmd,
Concurrency: *concurrency,
Decoders: decoders.DefaultDecoders(),
Conf: conf,
IncludeFilter: includeFilter,
ExcludeFilter: excludeFilter,
EndpointCustomizer: endpointCustomizer,
NoVerification: *noVerification,
PrintAvgDetectorTime: *printAvgDetectorTime,
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
ScanEntireChunk: *scanEntireChunk,
CompareDetectionStrategies: *compareDetectionStrategies,
JobReportWriter: jobReportWriter,
AllowVerificationOverlap: *allowVerificationOverlap,
ParsedResults: parsedResults,
Printer: printer,
}

metrics, err := runScan(ctx, scanConfig)
if err != nil {
logFatal(err, "error running scan")
}
@@ -515,77 +507,31 @@ func run(state overseer.State) {
}

type scanConfig struct {
Command string
Concurrency int
Decoders []decoders.Decoder
Conf *config.Config
IncludeFilter func(detectors.Detector) bool
ExcludeFilter func(detectors.Detector) bool
EndpointCustomizer func(detectors.Detector) bool
NoVerification bool
PrintAvgDetectorTime bool
FilterUnverified bool
FilterEntropy float64
ScanEntireChunk bool
JobReportWriter io.WriteCloser
AllowVerificationOverlap bool
ParsedResults map[string]struct{}
Printer engine.Printer
}

func compareScans(ctx context.Context, cfg scanConfig) error {
var (
entireMetrics metrics
maxLengthMetrics metrics
err error
)

var wg sync.WaitGroup
wg.Add(1)

go func() {
defer wg.Done()
// Run scan with entire chunk span calculator.
entireMetrics, err = runSingleScan(ctx, cfg, true)
if err != nil {
ctx.Logger().Error(err, "error running scan with entire chunk span calculator")
}
}()

// Run scan with max-length span calculator.
maxLengthMetrics, err = runSingleScan(ctx, cfg, false)
if err != nil {
return fmt.Errorf("error running scan with custom span calculator: %v", err)
}

wg.Wait()

return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics)
}

func compareMetrics(customMetrics, entireMetrics engine.Metrics) error {
fmt.Printf("Comparison of scan results: \n")
fmt.Printf("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n",
customMetrics.ChunksScanned, customMetrics.BytesScanned, customMetrics.VerifiedSecretsFound, customMetrics.UnverifiedSecretsFound, customMetrics.ScanDuration.String())
fmt.Printf("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n",
entireMetrics.ChunksScanned, entireMetrics.BytesScanned, entireMetrics.VerifiedSecretsFound, entireMetrics.UnverifiedSecretsFound, entireMetrics.ScanDuration.String())

// Check for differences in scan metrics.
if customMetrics.ChunksScanned != entireMetrics.ChunksScanned ||
customMetrics.BytesScanned != entireMetrics.BytesScanned ||
customMetrics.VerifiedSecretsFound != entireMetrics.VerifiedSecretsFound {
return fmt.Errorf("scan metrics do not match")
}

return nil
Command string
Concurrency int
Decoders []decoders.Decoder
Conf *config.Config
IncludeFilter func(detectors.Detector) bool
ExcludeFilter func(detectors.Detector) bool
EndpointCustomizer func(detectors.Detector) bool
NoVerification bool
PrintAvgDetectorTime bool
FilterUnverified bool
FilterEntropy float64
ScanEntireChunk bool
CompareDetectionStrategies bool
JobReportWriter io.WriteCloser
AllowVerificationOverlap bool
ParsedResults map[string]struct{}
Printer engine.Printer
}

type metrics struct {
engine.Metrics
hasFoundResults bool
}

func runSingleScan(ctx context.Context, cfg scanConfig, scanEntireChunk bool) (metrics, error) {
func runScan(ctx context.Context, cfg scanConfig) (metrics, error) {
eng, err := engine.Start(ctx,
engine.WithConcurrency(cfg.Concurrency),
engine.WithDecoders(cfg.Decoders...),
@@ -601,7 +547,8 @@ func runSingleScan(ctx context.Context, cfg scanConfig, scanEntireChunk bool) (m
engine.WithPrinter(cfg.Printer),
engine.WithFilterEntropy(cfg.FilterEntropy),
engine.WithVerificationOverlap(cfg.AllowVerificationOverlap),
engine.WithEntireChunkScan(scanEntireChunk),
engine.WithEntireChunkScan(cfg.ScanEntireChunk),
engine.WithCompareDetectionStrategies(cfg.CompareDetectionStrategies),
)
if err != nil {
return metrics{}, fmt.Errorf("error initializing engine: %v", err)
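With compareScans and runSingleScan gone, main.go now builds a single scanConfig and hands both knobs to the engine. The following is a hedged, self-contained sketch of that wiring, using toy Engine/Option types in place of the real ones; only the two option names are taken from the diff, everything else is illustrative.

package main

import "fmt"

// Engine and Option are toy stand-ins for the real engine's functional options;
// the real WithEntireChunkScan / WithCompareDetectionStrategies appear in the
// pkg/engine/engine.go diff below.
type Engine struct {
    scanEntireChunk       bool
    compareScanStrategies bool
}

type Option func(*Engine)

func WithEntireChunkScan(enabled bool) Option {
    return func(e *Engine) { e.scanEntireChunk = enabled }
}

func WithCompareDetectionStrategies(enabled bool) Option {
    return func(e *Engine) { e.compareScanStrategies = enabled }
}

// scanConfig mirrors (in trimmed-down form) the struct in main.go.
type scanConfig struct {
    ScanEntireChunk            bool
    CompareDetectionStrategies bool
}

// runScan shows the single call path: the config carries both knobs and the
// engine owns the comparison, so main.go no longer runs two scans itself.
func runScan(cfg scanConfig) *Engine {
    e := &Engine{}
    for _, opt := range []Option{
        WithEntireChunkScan(cfg.ScanEntireChunk),
        WithCompareDetectionStrategies(cfg.CompareDetectionStrategies),
    } {
        opt(e)
    }
    return e
}

func main() {
    eng := runScan(scanConfig{CompareDetectionStrategies: true})
    fmt.Printf("entireChunk=%v compare=%v\n", eng.scanEntireChunk, eng.compareScanStrategies)
}

The practical effect is that main.go no longer needs its own sync.WaitGroup or duplicate scan orchestration.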
14 changes: 0 additions & 14 deletions pkg/engine/ahocorasick/ahocorasickcore.go
@@ -33,20 +33,6 @@ type spanCalculator interface {
calculateSpan(startIdx int64, chunkData []byte, detector detectors.Detector) matchSpan
}

// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data.
// This is used when we want to match against the full length of the provided chunk.
type EntireChunkSpanCalculator struct{}

// calculateSpans returns the match span as the length of the chunk data,
// effectively using the entire chunk for matching.
func (e *EntireChunkSpanCalculator) calculateSpan(
startIdx int64,
chunkData []byte,
_ detectors.Detector,
) matchSpan {
return matchSpan{startOffset: startIdx, endOffset: int64(len(chunkData))}
}

// maxMatchLengthSpanCalculator is a strategy that calculates match spans based on a default max
// match length or values provided by detectors. This allows for more granular control over the match span.
type maxMatchLengthSpanCalculator struct{ maxMatchLength int64 }
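The removed EntireChunkSpanCalculator expressed "use the whole chunk" as a span strategy; after this commit that behavior lives in the engine's detectChunk instead, and only the max-length strategy remains here. Below is a simplified, self-contained sketch of the span idea, with toy types that only approximate the real matchSpan/spanCalculator (the detector parameter is omitted).

package main

import "fmt"

// matchSpan and spanCalculator are simplified stand-ins for the types in
// ahocorasickcore.go.
type matchSpan struct{ start, end int64 }

type spanCalculator interface {
    calculateSpan(startIdx int64, chunkData []byte) matchSpan
}

// maxMatchLengthSpanCalculator bounds the span to a fixed window after the
// keyword hit, which is the strategy the engine keeps using by default.
type maxMatchLengthSpanCalculator struct{ maxMatchLength int64 }

func (m maxMatchLengthSpanCalculator) calculateSpan(startIdx int64, chunkData []byte) matchSpan {
    end := startIdx + m.maxMatchLength
    if limit := int64(len(chunkData)); end > limit {
        end = limit
    }
    return matchSpan{start: startIdx, end: end}
}

func main() {
    data := []byte("....AKIAEXAMPLEKEYDATA....padding....")
    calc := maxMatchLengthSpanCalculator{maxMatchLength: 16}

    span := calc.calculateSpan(4, data) // keyword hit at offset 4
    fmt.Printf("bounded span: %q\n", data[span.start:span.end])

    // The removed "entire chunk" strategy is now simply: hand the detector the
    // whole chunk (see the matchedBytes handling in engine.go below).
    fmt.Printf("entire chunk: %q\n", data)
}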
73 changes: 57 additions & 16 deletions pkg/engine/engine.go
@@ -81,6 +81,12 @@ type Engine struct {
// By default, the engine will only scan a subset of the chunk if a detector matches the chunk.
// If this flag is set to true, the engine will scan the entire chunk.
scanEntireChunk bool
// If this flag is set to true, the engine will run two scans per chunk:
// 1. the entire chunk (old)
// 2. a subset of the chunk (new)
//
// Any discrepancies between methods will be logged.
compareScanStrategies bool

// ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups.
ahoCorasickCore *ahocorasick.AhoCorasickCore
@@ -264,6 +270,11 @@ func WithEntireChunkScan(enabled bool) Option {
return func(e *Engine) { e.scanEntireChunk = enabled }
}

// WithCompareDetectionStrategies sets the flag to scan with both strategies and log any discrepancies.
func WithCompareDetectionStrategies(enabled bool) Option {
return func(e *Engine) { e.compareScanStrategies = enabled }
}

// HasFoundResults returns true if any results are found.
func (e *Engine) HasFoundResults() bool {
return atomic.LoadUint32(&e.numFoundResults) > 0
@@ -407,14 +418,8 @@ func (e *Engine) initialize(ctx context.Context, options ...Option) error {
}
ctx.Logger().V(4).Info("engine initialized")

// Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk.
var ahoCOptions []ahocorasick.AhoCorasickCoreOption
if e.scanEntireChunk {
ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator)))
}

ctx.Logger().V(4).Info("setting up aho-corasick core")
e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...)
e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors)
ctx.Logger().V(4).Info("set up aho-corasick core")

return nil
@@ -803,11 +808,22 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {

func (e *Engine) detectChunks(ctx context.Context) {
for data := range e.detectableChunksChan {
e.detectChunk(ctx, data)
if !e.compareScanStrategies {
// Typical use case: scan the chunk.
e.detectChunk(ctx, data, e.scanEntireChunk)
} else {
// --compare-detection-strategies is enabled, scan with both methods and compare results.
customSpanResultCount := e.detectChunk(ctx, data, false)
entireChunkResultCount := e.detectChunk(ctx, data, true)
if customSpanResultCount != entireChunkResultCount {
err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount)
ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String())
}
}
}
}

func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int {
var start time.Time
if e.printAvgDetectorTime {
start = time.Now()
@@ -821,7 +837,13 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
// The matches field of the DetectorMatch struct contains the
// relevant portions of the chunk data that were matched.
// This avoids the need for additional regex processing on the entire chunk data.
matchedBytes := data.detector.Matches()
var matchedBytes [][]byte
if scanEntireChunk {
matchedBytes = [][]byte{data.chunk.Data}
} else {
matchedBytes = data.detector.Matches()
}
resultCount := 0
for _, match := range matchedBytes {
results, err := data.detector.FromData(ctx, data.chunk.Verify, match)
if err != nil {
@@ -831,18 +853,30 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
if e.printAvgDetectorTime && len(results) > 0 {
elapsed := time.Since(start)
detectorName := results[0].DetectorType.String()

avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName)
if !ok {
ctx.Logger().Error(
errors.New("failed to load metric"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}

var avgTime []time.Duration
if ok {
avgTime, ok = avgTimeI.([]time.Duration)
if !ok {
return
}
avgTime, ok = avgTimeI.([]time.Duration)
if !ok {
ctx.Logger().Error(
errors.New("failed to cast metric as []time.Duration"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}
avgTime = append(avgTime, elapsed)
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
}

HandleResults:
if e.filterUnverified {
results = detectors.CleanResults(results)
}
@@ -853,11 +887,18 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
results = detectors.FilterResultsWithEntropy(ctx, results, *e.filterEntropy, e.logFilteredUnverified)
}

resultCount += len(results)
for _, res := range results {
e.processResult(ctx, data, res)
}
}
data.wgDoneFn()
// If `e.compareScanStrategies` is enabled, two scans will be run.
// Don't decrement the WaitGroup until both have been completed.
if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) {
data.wgDoneFn()
}

return resultCount
}

func (e *Engine) processResult(ctx context.Context, data detectableChunk, res detectors.Result) {
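The WaitGroup comment at the end of detectChunk is the subtle part of this change: when comparison is enabled the same chunk is detected twice, but completion must be signalled exactly once. Here is a self-contained sketch of that discipline, with a toy workItem standing in for detectableChunk; the condition mirrors, in simplified form, the one in the diff.

package main

import (
    "fmt"
    "sync"
)

// workItem is a stand-in for detectableChunk: a unit of work plus a callback
// that tells the producer this chunk/detector pair is fully processed.
type workItem struct {
    data     string
    wgDoneFn func()
}

// detect is a stand-in for detectChunk; it returns a fake result count.
// Only the custom-span pass (entireChunk == false) signals completion during a
// comparison run, so Done fires exactly once per item.
func detect(item workItem, entireChunk, compare bool) int {
    // ... real detection would happen here ...
    if !compare || !entireChunk {
        item.wgDoneFn()
    }
    return len(item.data)
}

func main() {
    compare := true

    var wg sync.WaitGroup
    wg.Add(1)
    item := workItem{data: "example chunk", wgDoneFn: wg.Done}

    if compare {
        custom := detect(item, false, compare) // decrements the WaitGroup
        entire := detect(item, true, compare)  // must NOT decrement it again
        if custom != entire {
            fmt.Printf("mismatch: %d vs %d\n", custom, entire)
        }
    } else {
        detect(item, false, compare)
    }

    wg.Wait()
    fmt.Println("done exactly once")
}

Calling Done from both passes would panic with a negative WaitGroup counter, which is why only the custom-span pass decrements it.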
