Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compare detection strategies per chunk/detector #2922

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 25 additions & 74 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"syscall"

"github.com/alecthomas/kingpin/v2"
"github.com/felixge/fgprof"
"github.com/go-logr/logr"
"github.com/jpillora/overseer"
"github.com/mattn/go-isatty"
"go.uber.org/automaxprocs/maxprocs"

"github.com/trufflesecurity/trufflehog/v3/pkg/cache/simple"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
"go.uber.org/automaxprocs/maxprocs"

"github.com/trufflesecurity/trufflehog/v3/pkg/analyzer"
"github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp"
Expand Down Expand Up @@ -493,37 +493,36 @@ func run(state overseer.State) {
// default detectors, which can be further filtered by the
// user. The filters are applied by the engine and are only
// subtractive.
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
VerificationCacheMetrics: &verificationCacheMetrics,
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
CompareDetectionStrategies: *compareDetectionStrategies,
VerificationCacheMetrics: &verificationCacheMetrics,
}

if !*noVerificationCache {
engConf.VerificationResultCache = simple.NewCache[detectors.Result]()
}

if *compareDetectionStrategies {
if err := compareScans(ctx, cmd, engConf); err != nil {
logFatal(err, "error comparing detection strategies")
topLevelSubCommand, _, _ := strings.Cut(cmd, " ")
switch topLevelSubCommand {
case analyzeCmd.FullCommand():
analyzer.Run(cmd)
default:
metrics, err := runSingleScan(ctx, cmd, engConf)
if err != nil {
logFatal(err, "error running scan")
}
return
}

metrics, err := runSingleScan(ctx, cmd, engConf)
if err != nil {
logFatal(err, "error running scan")
}

verificationCacheMetricsSnapshot := struct {
Hits int32
Expand Down Expand Up @@ -556,54 +555,6 @@ func run(state overseer.State) {
}
}

// compareScans runs the same scan twice concurrently: once with the entire
// chunk span calculator and once with the default (max-length) span
// calculator. It then hands both sets of metrics to compareMetrics.
//
// It returns an error if either scan fails or if compareMetrics reports a
// mismatch.
func compareScans(ctx context.Context, cmd string, cfg engine.Config) error {
	// Give the entire-chunk scan its own copy of the config. Flipping the flag
	// on the shared cfg from inside the goroutine would race with the
	// max-length scan below and could silently switch it to entire-chunk mode.
	// NOTE(review): this is a shallow copy; any reference-typed fields are
	// still shared between the two scans.
	entireCfg := cfg
	entireCfg.ShouldScanEntireChunk = true

	var (
		entireMetrics metrics
		entireErr     error
		wg            sync.WaitGroup
	)

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Run scan with entire chunk span calculator. The result and error are
		// written to variables owned by this goroutine until wg.Wait returns.
		entireMetrics, entireErr = runSingleScan(ctx, cmd, entireCfg)
	}()

	// Run scan with max-length span calculator.
	maxLengthMetrics, maxLengthErr := runSingleScan(ctx, cmd, cfg)

	// Always wait for the background scan, even when the foreground scan
	// failed, so the goroutine is never left running after we return.
	wg.Wait()

	if maxLengthErr != nil {
		return fmt.Errorf("error running scan with custom span calculator: %w", maxLengthErr)
	}
	if entireErr != nil {
		// Comparing against zero-valued metrics would hide the real failure.
		return fmt.Errorf("error running scan with entire chunk span calculator: %w", entireErr)
	}

	return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics)
}

// compareMetrics prints a one-line summary of each scan's metrics to stdout
// and then checks that the two scans agree.
//
// customMetrics holds the metrics of the custom-span scan and entireMetrics
// those of the entire-chunk scan. An error is returned when the chunk count,
// byte count, or verified-secret count differ. Unverified-secret counts and
// scan durations are printed but not compared.
func compareMetrics(customMetrics, entireMetrics engine.Metrics) error {
	// summarize prints one scan's counters using the given row format.
	summarize := func(format string, m engine.Metrics) {
		fmt.Printf(
			format,
			m.ChunksScanned,
			m.BytesScanned,
			m.VerifiedSecretsFound,
			m.UnverifiedSecretsFound,
			m.ScanDuration.String(),
		)
	}

	fmt.Printf("Comparison of scan results: \n")
	summarize("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", customMetrics)
	summarize("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", entireMetrics)

	// The scans match only if every compared counter is equal.
	sameChunks := customMetrics.ChunksScanned == entireMetrics.ChunksScanned
	sameBytes := customMetrics.BytesScanned == entireMetrics.BytesScanned
	sameVerified := customMetrics.VerifiedSecretsFound == entireMetrics.VerifiedSecretsFound
	if sameChunks && sameBytes && sameVerified {
		return nil
	}

	return fmt.Errorf("scan metrics do not match")
}

type metrics struct {
engine.Metrics
hasFoundResults bool
Expand Down
10 changes: 0 additions & 10 deletions pkg/engine/ahocorasick/ahocorasickcore.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,6 @@ type spanCalculationParams struct {
detector detectors.Detector
}

// EntireChunkSpanCalculator is a span-calculation strategy whose span always
// covers the whole chunk. Use it when matching should run against the full
// length of the provided chunk data.
type EntireChunkSpanCalculator struct{}

// calculateSpan returns a span that starts at offset 0 and ends at the length
// of params.chunkData. Only the chunk data is consulted.
func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
	chunkLen := int64(len(params.chunkData))
	return matchSpan{
		startOffset: 0,
		endOffset:   chunkLen,
	}
}

// adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude
// or values provided by specific detectors to adjust the start and end indices of the span, allowing
// for more granular control over the match.
Expand Down
81 changes: 62 additions & 19 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ import (
"github.com/adrg/strutil"
"github.com/adrg/strutil/metrics"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
"google.golang.org/protobuf/proto"

"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/config"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
Expand Down Expand Up @@ -149,6 +150,8 @@ type Config struct {

VerificationResultCache verificationcache.ResultCache
VerificationCacheMetrics verificationcache.MetricsReporter

CompareDetectionStrategies bool
}

// Engine represents the core scanning engine responsible for detecting secrets in input data.
Expand Down Expand Up @@ -179,6 +182,12 @@ type Engine struct {
// By default, the engine will only scan a subset of the chunk if a detector matches the chunk.
// If this flag is set to true, the engine will scan the entire chunk.
scanEntireChunk bool
// If this flag is set to true, the engine will run two scans per chunk:
// 1. the entire chunk (old)
// 2. a subset of the chunk (new)
//
// Any discrepancies between methods will be logged.
compareScanStrategies bool

// AhoCorasickCore manages the Aho-Corasick trie and related keyword lookups.
AhoCorasickCore *ahocorasick.Core
Expand Down Expand Up @@ -241,6 +250,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
detectorWorkerMultiplier: cfg.DetectorWorkerMultiplier,
notificationWorkerMultiplier: cfg.NotificationWorkerMultiplier,
verificationOverlapWorkerMultiplier: cfg.VerificationOverlapWorkerMultiplier,
compareScanStrategies: cfg.CompareDetectionStrategies,
}
if engine.sourceManager == nil {
return nil, fmt.Errorf("source manager is required")
Expand Down Expand Up @@ -517,14 +527,8 @@ func (e *Engine) initialize(ctx context.Context) error {
e.dedupeCache = cache
ctx.Logger().V(4).Info("engine initialized")

// Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk.
var ahoCOptions []ahocorasick.CoreOption
if e.scanEntireChunk {
ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator)))
}

ctx.Logger().V(4).Info("setting up aho-corasick core")
e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...)
e.AhoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors)
ctx.Logger().V(4).Info("set up aho-corasick core")

return nil
Expand Down Expand Up @@ -1033,12 +1037,25 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
func (e *Engine) detectorWorker(ctx context.Context) {
for data := range e.detectableChunksChan {
start := time.Now()
e.detectChunk(ctx, data)

if !e.compareScanStrategies {
// Typical use case: scan the chunk.
_ = e.detectChunk(ctx, data, e.scanEntireChunk)
} else {
// --compare-detection-strategies is enabled, scan with both methods and compare results.
customSpanResultCount := e.detectChunk(ctx, data, false)
entireChunkResultCount := e.detectChunk(ctx, data, true)

if customSpanResultCount != entireChunkResultCount {
err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount)
ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String())
}
}
chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds()))
}
}

func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int {
var start time.Time
if e.printAvgDetectorTime {
start = time.Now()
Expand All @@ -1055,8 +1072,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
// The matches field of the DetectorMatch struct contains the
// relevant portions of the chunk data that were matched.
// This avoids the need for additional regex processing on the entire chunk data.
matches := data.detector.Matches()
for _, matchBytes := range matches {
var matchedBytes [][]byte
if scanEntireChunk {
matchedBytes = [][]byte{data.chunk.Data}
} else {
matchedBytes = data.detector.Matches()
}
resultCount := 0
for _, matchBytes := range matchedBytes {
matchCount++
detectBytesPerMatch.Observe(float64(len(matchBytes)))

Expand Down Expand Up @@ -1089,13 +1112,23 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
if e.printAvgDetectorTime && len(results) > 0 {
elapsed := time.Since(start)
detectorName := results[0].DetectorType.String()

avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName)
var avgTime []time.Duration
if ok {
avgTime, ok = avgTimeI.([]time.Duration)
if !ok {
return
}
if !ok {
ctx.Logger().Error(
errors.New("failed to load metric"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}

avgTime, ok := avgTimeI.([]time.Duration)
if !ok {
ctx.Logger().Error(
errors.New("failed to cast metric as []time.Duration"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}
avgTime = append(avgTime, elapsed)
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
Expand All @@ -1110,14 +1143,24 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
results = e.filterResults(ctx, data.detector, results)
}

HandleResults:
results = e.filterResults(ctx, data.detector, results)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

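I'm not sure this `filterResults` call is still necessary: the results are already passed through `filterResults` a few lines above, so this may filter them a second time.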


resultCount += len(results)
for _, res := range results {
e.processResult(ctx, data, res, isFalsePositive)
}
}

matchesPerChunk.Observe(float64(matchCount))

data.wgDoneFn()
// If `e.compareScanStrategies` is enabled, two scans will be run.
// Don't decrement the WaitGroup until both have been completed.
if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) {
data.wgDoneFn()
}
Comment on lines +1159 to +1161
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

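This is a bit brittle: whether `wgDoneFn` fires depends on the order in which `detectorWorker` runs the two scans (as written, it fires after the custom-span pass, which runs first, rather than after both have completed). It might be better to move the `wgDoneFn` call into `detectorWorker`, after both `detectChunk` calls.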


return resultCount
}

func (e *Engine) filterResults(
Expand Down