Skip to content

Commit

Permalink
[chore][pkg/stanza] Speed up file deduplication in finder (open-telem…
Browse files Browse the repository at this point in the history
…etry#34888)

**Description:** <Describe what has changed.>
For large numbers of files, the logic that deduplicates the filenames
between matches is costly. This is mainly due to the O(n^2) deduping
algorithm used. If we instead use a map (as a hashset), we can make this
~O(n).

This PR speeds up the deduplication logic, as well as adds a benchmark
for a case where the filelog receiver is polling many files at once.

**Testing:** <Describe what testing was performed and which tests were
added.>

Running the added benchmark and comparing with benchstat, we can see a
large increase in speed for the large number of files case (10000
monitored files), at the cost of a very slight increase in memory usage:
```
goos: darwin
goarch: arm64
pkg: github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/matcher/internal/finder
cpu: Apple M3 Pro
                │    old.txt    │               new.txt               │
                │    sec/op     │    sec/op     vs base               │
Find10kFiles-12   198.636m ± 6%   8.696m ± 16%  -95.62% (p=0.002 n=6)

                │   old.txt    │              new.txt               │
                │     B/op     │     B/op      vs base              │
Find10kFiles-12   5.416Mi ± 0%   5.581Mi ± 0%  +3.04% (p=0.002 n=6)

                │   old.txt   │              new.txt              │
                │  allocs/op  │  allocs/op   vs base              │
Find10kFiles-12   80.06k ± 0%   80.25k ± 0%  +0.23% (p=0.002 n=6)
```
  • Loading branch information
BinaryFissionGames authored and f7o committed Sep 12, 2024
1 parent 75bf68f commit 8ca939c
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 12 deletions.
14 changes: 5 additions & 9 deletions pkg/stanza/fileconsumer/matcher/internal/finder/finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"

"github.com/bmatcuk/doublestar/v4"
"golang.org/x/exp/maps"
)

func Validate(globs []string) error {
Expand All @@ -23,7 +24,8 @@ func Validate(globs []string) error {
// FindFiles gets a list of paths given an array of glob patterns to include and exclude
func FindFiles(includes []string, excludes []string) ([]string, error) {
var errs error
all := make([]string, 0, len(includes))

allSet := make(map[string]struct{}, len(includes))
for _, include := range includes {
matches, err := doublestar.FilepathGlob(include, doublestar.WithFilesOnly(), doublestar.WithFailOnIOErrors())
if err != nil {
Expand All @@ -40,15 +42,9 @@ func FindFiles(includes []string, excludes []string) ([]string, error) {
}
}

for _, existing := range all {
if existing == match {
continue INCLUDE
}
}

all = append(all, match)
allSet[match] = struct{}{}
}
}

return all, errs
return maps.Keys(allSet), errs
}
36 changes: 34 additions & 2 deletions pkg/stanza/fileconsumer/matcher/internal/finder/finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package finder

import (
"fmt"
"os"
"path/filepath"
"runtime"
Expand Down Expand Up @@ -188,7 +189,7 @@ func TestFindFiles(t *testing.T) {
}
files, err := FindFiles(tc.include, tc.exclude)
assert.NoError(t, err)
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}
Expand Down Expand Up @@ -251,7 +252,38 @@ func TestFindFilesWithIOErrors(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
files, err := FindFiles(tc.include, []string{})
assert.ErrorContains(t, err, tc.failedMsg)
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}

// benchResult is package level variable that store the result of the benchmark.
// It is used to prevent go from optimizing out the benchmarked code.
var benchResult []string

func BenchmarkFind10kFiles(b *testing.B) {
numFiles := 10000
tmpDir := b.TempDir()

// Create a bunch of files for benchmarking
for i := range numFiles {
path := filepath.Join(tmpDir, fmt.Sprintf("log-%05d.log", i))
f, err := os.Create(path)
require.NoError(b, err)
require.NoError(b, f.Close())
}

includeGlobs := []string{
filepath.Join(tmpDir, "log-*.log"),
}

excludeGlobs := []string{}

var r []string
b.ResetTimer()
for range b.N {
r, _ = FindFiles(includeGlobs, excludeGlobs)
}

benchResult = r
}
2 changes: 1 addition & 1 deletion pkg/stanza/fileconsumer/matcher/matcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ func TestMatcher(t *testing.T) {
} else {
assert.NoError(t, err)
}
assert.Equal(t, tc.expected, files)
assert.ElementsMatch(t, tc.expected, files)
})
}
}
Expand Down

0 comments on commit 8ca939c

Please sign in to comment.