Skip to content

Commit

Permalink
Update GitHub enumeration to report unique filtered values
Browse files: browse the repository at this point in the history
The reported values should match the values populated in s.repos.
  • Loading branch information
mcastorina committed Sep 13, 2024
1 parent dc9c9a3 commit 369430d
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 11 deletions.
49 changes: 40 additions & 9 deletions pkg/sources/github/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,38 +330,69 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, tar
githubSecondsSpentRateLimited.WithLabelValues(s.name).Set(0)
githubReposScanned.WithLabelValues(s.name).Set(0)

err := s.enumerate(ctx)
// We don't care about handling enumerated values as they happen during
// the normal Chunks flow because we enumerate and scan in two steps.
noopReporter := sources.VisitorReporter{
VisitUnit: func(context.Context, sources.SourceUnit) error {
return nil
},
}
err := s.enumerate(ctx, noopReporter)
if err != nil {
return fmt.Errorf("error enumerating: %w", err)
}

return s.scan(ctx, chunksChan)
}

func (s *Source) enumerate(ctx context.Context) error {
// Create a reporter that does nothing for now.
noopReporter := sources.VisitorReporter{
// enumerate enumerates the GitHub source based on authentication method and
// user configuration. It populates s.filteredRepoCache, s.repoInfoCache,
// s.memberCache, s.totalRepoSize, s.orgsCache, and s.repos. Additionally,
// repositories and gists that pass the user-configured filter are reported
// to the provided UnitReporter, once per unique unit.
func (s *Source) enumerate(ctx context.Context, reporter sources.UnitReporter) error {
seenUnits := make(map[sources.SourceUnit]struct{})
// Wrapper reporter to deduplicate and filter found units.
dedupeReporter := sources.VisitorReporter{
VisitUnit: func(ctx context.Context, su sources.SourceUnit) error {
return nil
// Only report units that passed the user configured filter.
name := su.Display()
if !s.filteredRepoCache.Exists(name) {
return ctx.Err()
}
// Only report a unit once.
if _, ok := seenUnits[su]; ok {
return ctx.Err()
}
seenUnits[su] = struct{}{}
return reporter.UnitOk(ctx, su)
},
VisitErr: func(ctx context.Context, err error) error {
return reporter.UnitErr(ctx, err)
},
}
// Report any values that were already configured.
for _, name := range s.filteredRepoCache.Keys() {
url, _ := s.filteredRepoCache.Get(name)
_ = dedupeReporter.UnitOk(ctx, RepoUnit{name: name, url: url})
}

// I'm not wild about switching on the connector type here (as opposed to dispatching to the connector itself) but
// this felt like a compromise that allowed me to isolate connection logic without rewriting the entire source.
switch c := s.connector.(type) {
case *appConnector:
if err := s.enumerateWithApp(ctx, c.InstallationClient(), noopReporter); err != nil {
if err := s.enumerateWithApp(ctx, c.InstallationClient(), dedupeReporter); err != nil {
return err
}
case *basicAuthConnector:
if err := s.enumerateBasicAuth(ctx, noopReporter); err != nil {
if err := s.enumerateBasicAuth(ctx, dedupeReporter); err != nil {
return err
}
case *tokenConnector:
if err := s.enumerateWithToken(ctx, c.IsGithubEnterprise(), noopReporter); err != nil {
if err := s.enumerateWithToken(ctx, c.IsGithubEnterprise(), dedupeReporter); err != nil {
return err
}
case *unauthenticatedConnector:
s.enumerateUnauthenticated(ctx, noopReporter)
s.enumerateUnauthenticated(ctx, dedupeReporter)
}
s.repos = make([]string, 0, s.filteredRepoCache.Count())

Expand Down
17 changes: 15 additions & 2 deletions pkg/sources/github/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"net/http"
"net/url"
"reflect"
"slices"
"strconv"
"testing"
"time"
Expand Down Expand Up @@ -566,8 +567,18 @@ func TestEnumerate(t *testing.T) {
s.cacheRepoInfo(repo)
s.filteredRepoCache.Set(repo.GetFullName(), repo.GetCloneURL())

var reportedRepos []string
reporter := sources.VisitorReporter{
VisitUnit: func(ctx context.Context, su sources.SourceUnit) error {
url, _ := su.SourceUnitID()
reportedRepos = append(reportedRepos, url)
return nil
},
}

// Act
err := s.enumerate(context.Background())
err := s.enumerate(context.Background(), reporter)
slices.Sort(reportedRepos)

// Assert
assert.Nil(t, err)
Expand All @@ -576,6 +587,8 @@ func TestEnumerate(t *testing.T) {
assert.True(t, s.filteredRepoCache.Exists("super-secret-user/super-secret-repo"))
assert.True(t, s.filteredRepoCache.Exists("cached-user/cached-repo"))
assert.True(t, s.filteredRepoCache.Exists("2801a2b0523099d0614a951579d99ba9"))
assert.Equal(t, 3, len(s.repos))
assert.Equal(t, s.repos, reportedRepos)
// Enumeration cached all repos.
assert.Equal(t, 3, len(s.repoInfoCache.cache))
_, ok := s.repoInfoCache.get("https://github.com/super-secret-user/super-secret-repo.git")
Expand Down Expand Up @@ -640,7 +653,7 @@ func BenchmarkEnumerate(b *testing.B) {
setupMocks(b)

b.StartTimer()
_ = s.enumerate(context.Background())
_ = s.enumerate(context.Background(), noopReporter())
}
}

Expand Down

0 comments on commit 369430d

Please sign in to comment.