Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multiple detectors per match #2065

Merged
merged 17 commits into from
Nov 3, 2023
Prev Previous commit
Next Next commit
switch back to map
  • Loading branch information
rosecodym committed Nov 1, 2023
commit f348c813840388ee366278d9ee80729ca6e203dd
22 changes: 11 additions & 11 deletions pkg/engine/ahocorasickcore.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)

// detectorKey is used to identify a detector in the keywordsToDetectors map.
// DetectorKey is used to identify a detector in the keywordsToDetectors map.
// Multiple detectors can have the same detector type but different versions.
// This allows us to identify a detector by its type and version. An
// additional (optional) field is provided to disambiguate multiple custom
// detectors.
type detectorKey struct {
type DetectorKey struct {
detectorType detectorspb.DetectorType
version int
customDetectorName string
Expand All @@ -32,16 +32,16 @@ type AhoCorasickCore struct {
// type and then again from detector type to detector. We could
// go straight from keywords to detectors but doing it this way makes
// some consuming code a little cleaner.)
keywordsToDetectors map[string][]detectorKey
detectorsByKey map[detectorKey]detectors.Detector
keywordsToDetectors map[string][]DetectorKey
detectorsByKey map[DetectorKey]detectors.Detector
}

// NewAhoCorasickCore allocates and initializes a new instance of AhoCorasickCore. It uses the
// provided detector slice to create a map from keywords to detectors and build the Aho-Corasick
// prefilter trie.
func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
keywordsToDetectors := make(map[string][]detectorKey)
detectorsByKey := make(map[detectorKey]detectors.Detector, len(allDetectors))
keywordsToDetectors := make(map[string][]DetectorKey)
detectorsByKey := make(map[DetectorKey]detectors.Detector, len(allDetectors))
var keywords []string
for _, d := range allDetectors {
key := createDetectorKey(d)
Expand All @@ -63,21 +63,21 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector) *AhoCorasickCore {
// populateDetectorsByMatch populates the given detectorMap based on the Aho-Corasick match results.
// This method is designed to reuse the same map for performance optimization,
// reducing the need for repeated allocations within each detector worker in the engine.
func (ac *AhoCorasickCore) populateDetectorsByMatch(match *ahocorasick.Match, detectors *[]detectors.Detector) bool {
func (ac *AhoCorasickCore) populateDetectorsByMatch(match *ahocorasick.Match, detectors map[DetectorKey]detectors.Detector) bool {
matchedDetectorKeys, ok := ac.keywordsToDetectors[match.MatchString()]
if !ok {
return false
}
for _, key := range matchedDetectorKeys {
*detectors = append(*detectors, ac.detectorsByKey[key])
detectors[key] = ac.detectorsByKey[key]
}
return true
}

// PopulateMatchingDetectors populates the given detector map with all the detectors matching the
// provided input. This method populates an existing map rather than allocating a new one because
// it will be called once per chunk and that many allocations have a noticeable performance cost.
func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors *[]detectors.Detector) bool {
func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors map[DetectorKey]detectors.Detector) bool {
seen := make(map[string]struct{})
gotAny := false
for _, m := range ac.prefilter.MatchString(strings.ToLower(chunkData)) {
Expand All @@ -97,7 +97,7 @@ func (ac *AhoCorasickCore) PopulateMatchingDetectors(chunkData string, detectors

// createDetectorKey creates a unique key for each detector. The key is based on the detector's
// type and version (plus the custom detector name, when there is one); it ensures faster lookups
// and reduces redundancy in our main detector store.
func createDetectorKey(d detectors.Detector) detectorKey {
func createDetectorKey(d detectors.Detector) DetectorKey {
detectorType := d.Type()
var version int
if v, ok := d.(detectors.Versioner); ok {
Expand All @@ -107,5 +107,5 @@ func createDetectorKey(d detectors.Detector) detectorKey {
if r, ok := d.(*custom_detectors.CustomRegexWebhook); ok {
customDetectorName = r.GetName()
}
return detectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
return DetectorKey{detectorType: detectorType, version: version, customDetectorName: customDetectorName}
}
31 changes: 19 additions & 12 deletions pkg/engine/ahocorasickcore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,13 @@ func TestAhoCorasickCore_MultipleCustomDetectorsMatchable(t *testing.T) {

ac := NewAhoCorasickCore(allDetectors)

detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
ac.PopulateMatchingDetectors("a", &matchingDetectors)
assert.Equal(t, 2, len(matchingDetectors))
assert.Contains(t, matchingDetectors, customDetector1)
assert.Contains(t, matchingDetectors, customDetector2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}

func TestAhoCorasickCore_MultipleDetectorVersionsMatchable(t *testing.T) {
Expand All @@ -89,11 +91,13 @@ func TestAhoCorasickCore_MultipleDetectorVersionsMatchable(t *testing.T) {

ac := NewAhoCorasickCore(allDetectors)

detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
ac.PopulateMatchingDetectors("a", &matchingDetectors)
assert.Equal(t, 2, len(matchingDetectors))
assert.Contains(t, matchingDetectors, v1)
assert.Contains(t, matchingDetectors, v2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}

func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) {
Expand All @@ -102,8 +106,11 @@ func TestAhoCorasickCore_NoDuplicateDetectorsMatched(t *testing.T) {

ac := NewAhoCorasickCore(allDetectors)

matchingDetectors := make([]detectors.Detector, 0, 1)
ac.PopulateMatchingDetectors("a a b b", &matchingDetectors)
assert.Equal(t, 1, len(matchingDetectors))
assert.Contains(t, matchingDetectors, d)
detectorsMap := make(map[DetectorKey]detectors.Detector, 2)
ac.PopulateMatchingDetectors("a a b b", detectorsMap)
matchingDetectors := make([]detectors.Detector, 0, 2)
for _, d := range detectorsMap {
matchingDetectors = append(matchingDetectors, d)
}
assert.ElementsMatch(t, allDetectors, matchingDetectors)
}
6 changes: 3 additions & 3 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {

// Reuse the same map to avoid allocations.
const avgDetectorsPerChunk = 2
chunkSpecificDetectors := make([]detectors.Detector, 0, avgDetectorsPerChunk)
chunkSpecificDetectors := make(map[DetectorKey]detectors.Detector, avgDetectorsPerChunk)
for originalChunk := range e.ChunksChan() {
for chunk := range sources.Chunker(originalChunk) {
atomic.AddUint64(&e.metrics.BytesScanned, uint64(len(chunk.Data)))
Expand All @@ -469,7 +469,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
continue
}

e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), &chunkSpecificDetectors)
e.ahoCorasickCore.PopulateMatchingDetectors(string(decoded.Chunk.Data), chunkSpecificDetectors)

for _, detector := range chunkSpecificDetectors {
decoded.Chunk.Verify = e.verify
Expand All @@ -481,7 +481,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
wgDoneFn: wgDetect.Done,
}
}
chunkSpecificDetectors = chunkSpecificDetectors[:0]
clear(chunkSpecificDetectors)
}
}
atomic.AddUint64(&e.metrics.ChunksScanned, 1)
Expand Down