Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[chore][pkg/stanza] Speed up file deduplication in finder #34888

Merged
merged 8 commits into from
Sep 3, 2024
14 changes: 5 additions & 9 deletions pkg/stanza/fileconsumer/matcher/internal/finder/finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"

"github.com/bmatcuk/doublestar/v4"
"golang.org/x/exp/maps"
)

func Validate(globs []string) error {
Expand All @@ -23,7 +24,8 @@ func Validate(globs []string) error {
// FindFiles gets a list of paths given an array of glob patterns to include and exclude
func FindFiles(includes []string, excludes []string) ([]string, error) {
var errs error
all := make([]string, 0, len(includes))

allSet := make(map[string]struct{}, len(includes))
for _, include := range includes {
matches, err := doublestar.FilepathGlob(include, doublestar.WithFilesOnly(), doublestar.WithFailOnIOErrors())
if err != nil {
Expand All @@ -40,15 +42,9 @@ func FindFiles(includes []string, excludes []string) ([]string, error) {
}
}

for _, existing := range all {
if existing == match {
continue INCLUDE
}
}

all = append(all, match)
allSet[match] = struct{}{}
}
}

return all, errs
return maps.Keys(allSet), errs
}
36 changes: 34 additions & 2 deletions pkg/stanza/fileconsumer/matcher/internal/finder/finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package finder

import (
"fmt"
"os"
"path/filepath"
"runtime"
Expand Down Expand Up @@ -188,7 +189,7 @@ func TestFindFiles(t *testing.T) {
}
files, err := FindFiles(tc.include, tc.exclude)
assert.NoError(t, err)
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}
Expand Down Expand Up @@ -251,7 +252,38 @@ func TestFindFilesWithIOErrors(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
files, err := FindFiles(tc.include, []string{})
assert.ErrorContains(t, err, tc.failedMsg)
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}

// benchResult is a package-level variable that stores the result of the benchmark.
// It is used to prevent the Go compiler from optimizing out the benchmarked code.
var benchResult []string

func BenchmarkFind10kFiles(b *testing.B) {
numFiles := 10000
tmpDir := b.TempDir()

// Create a bunch of files for benchmarking
for i := range numFiles {
path := filepath.Join(tmpDir, fmt.Sprintf("log-%05d.log", i))
f, err := os.Create(path)
require.NoError(b, err)
require.NoError(b, f.Close())
}

includeGlobs := []string{
filepath.Join(tmpDir, "log-*.log"),
}

excludeGlobs := []string{}

var r []string
b.ResetTimer()
for range b.N {
r, _ = FindFiles(includeGlobs, excludeGlobs)
}

benchResult = r
}
2 changes: 1 addition & 1 deletion pkg/stanza/fileconsumer/matcher/matcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ func TestMatcher(t *testing.T) {
} else {
assert.NoError(t, err)
}
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}
Expand Down
Loading