[chore][pkg/stanza] Speed up file deduplication in finder (open-telem…

…etry#34888) **Description:** <Describe what has changed.> For large numbers of files, the logic that deduplicates the filenames between matches is costly. This is mainly due to the O(n^2) deduping algorithm used. If we instead use a map (as a hashset), we can make this ~O(n). This PR speeds up the deduplication logic, as well as adds a benchmark for a case where the filelog receiver is polling many files at once. **Testing:** <Describe what testing was performed and which tests were added.> Running the added benchmark and comparing with benchstat, we can see a large increase in speed for the large number of files case (10000 monitored files), at the cost of a very slight increase in memory usage: ``` goos: darwin goarch: arm64 pkg: github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/matcher/internal/finder cpu: Apple M3 Pro │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Find10kFiles-12 198.636m ± 6% 8.696m ± 16% -95.62% (p=0.002 n=6) │ old.txt │ new.txt │ │ B/op │ B/op vs base │ Find10kFiles-12 5.416Mi ± 0% 5.581Mi ± 0% +3.04% (p=0.002 n=6) │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ Find10kFiles-12 80.06k ± 0% 80.25k ± 0% +0.23% (p=0.002 n=6) ```
f7o · Sep 12, 2024 · 8ca939c · 8ca939c
1 parent 75bf68f
commit 8ca939c
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 12 deletions.
diff --git a/pkg/stanza/fileconsumer/matcher/internal/finder/finder.go b/pkg/stanza/fileconsumer/matcher/internal/finder/finder.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 
 	"github.com/bmatcuk/doublestar/v4"
+	"golang.org/x/exp/maps"
 )
 
 func Validate(globs []string) error {
@@ -23,7 +24,8 @@ func Validate(globs []string) error {
 // FindFiles gets a list of paths given an array of glob patterns to include and exclude
 func FindFiles(includes []string, excludes []string) ([]string, error) {
 	var errs error
-	all := make([]string, 0, len(includes))
+
+	allSet := make(map[string]struct{}, len(includes))
 	for _, include := range includes {
 		matches, err := doublestar.FilepathGlob(include, doublestar.WithFilesOnly(), doublestar.WithFailOnIOErrors())
 		if err != nil {
@@ -40,15 +42,9 @@ func FindFiles(includes []string, excludes []string) ([]string, error) {
 				}
 			}
 
-			for _, existing := range all {
-				if existing == match {
-					continue INCLUDE
-				}
-			}
-
-			all = append(all, match)
+			allSet[match] = struct{}{}
 		}
 	}
 
-	return all, errs
+	return maps.Keys(allSet), errs
 }
diff --git a/pkg/stanza/fileconsumer/matcher/internal/finder/finder_test.go b/pkg/stanza/fileconsumer/matcher/internal/finder/finder_test.go
@@ -4,6 +4,7 @@
 package finder
 
 import (
+	"fmt"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -188,7 +189,7 @@ func TestFindFiles(t *testing.T) {
 			}
 			files, err := FindFiles(tc.include, tc.exclude)
 			assert.NoError(t, err)
-			assert.Equal(t, tc.expected, files)
+			assert.ElementsMatch(t, tc.expected, files)
 		})
 	}
 }
@@ -251,7 +252,38 @@ func TestFindFilesWithIOErrors(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			files, err := FindFiles(tc.include, []string{})
 			assert.ErrorContains(t, err, tc.failedMsg)
-			assert.Equal(t, tc.expected, files)
+			assert.ElementsMatch(t, tc.expected, files)
 		})
 	}
 }
+
+// benchResult is package level variable that store the result of the benchmark.
+// It is used to prevent go from optimizing out the benchmarked code.
+var benchResult []string
+
+func BenchmarkFind10kFiles(b *testing.B) {
+	numFiles := 10000
+	tmpDir := b.TempDir()
+
+	// Create a bunch of files for benchmarking
+	for i := range numFiles {
+		path := filepath.Join(tmpDir, fmt.Sprintf("log-%05d.log", i))
+		f, err := os.Create(path)
+		require.NoError(b, err)
+		require.NoError(b, f.Close())
+	}
+
+	includeGlobs := []string{
+		filepath.Join(tmpDir, "log-*.log"),
+	}
+
+	excludeGlobs := []string{}
+
+	var r []string
+	b.ResetTimer()
+	for range b.N {
+		r, _ = FindFiles(includeGlobs, excludeGlobs)
+	}
+
+	benchResult = r
+}
diff --git a/pkg/stanza/fileconsumer/matcher/matcher_test.go b/pkg/stanza/fileconsumer/matcher/matcher_test.go
@@ -785,7 +785,7 @@ func TestMatcher(t *testing.T) {
 			} else {
 				assert.NoError(t, err)
 			}
-			assert.Equal(t, tc.expected, files)
+			assert.ElementsMatch(t, tc.expected, files)
 		})
 	}
 }