Skip to content

Commit

Permalink
Ranking: handle files with missing ranks (#555)
Browse files Browse the repository at this point in the history
Even when a repo has ranking data, certain files will not have ranks, like
Markdown or YAML files. Currently these have rank 0, which puts them at a big
disadvantage and means they're usually ranked last.

This PR proposes to use the mean repo rank instead of 0. The rules:
* If we have a concrete rank for the file, always use it
* If there's no rank, and it's a low priority file like a test, then use rank 0
* Otherwise use the mean rank for the repository

We don't attempt to handle the case where an entire repo is missing ranks
because that repo doesn't have precise code intel.
  • Loading branch information
jtibshirani authored Mar 10, 2023
1 parent 250c2ef commit 45754a7
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 7 deletions.
24 changes: 21 additions & 3 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -898,7 +898,25 @@ func squashRange(j int) float64 {
return x / (1 + x)
}

var testRe = regexp.MustCompile("test")
// IsLowPriority reports whether the given file name looks like a low-priority
// search result. The heuristics are tried in order and the first hit wins: the
// name matches testRe (a test file), isGenerated reports true (a generated
// file), or isVendored reports true (a vendored file).
//
// These priority criteria affect how documents are ordered within a shard. They
// are also used to help guess a file's rank when ranking information is missing.
func IsLowPriority(file string) bool {
	switch {
	case testRe.MatchString(file):
		return true
	case isGenerated(file):
		return true
	default:
		return isVendored(file)
	}
}

// testRe matches any name containing "test" or "Test". The pattern is
// unanchored, so the substring may appear anywhere in the path, including in
// the middle of a longer word.
var testRe = regexp.MustCompile("[Tt]est")

// isGenerated reports whether file looks like generated JavaScript output: a
// minified bundle ("min.js") or a source map ("js.map").
//
// The suffix must start a new name token, i.e. be the entire name or follow a
// byte that is not an ASCII letter or digit. This keeps names such as
// "jquery.min.js", "yui-min.js" and "dist/min.js" classified as generated
// while no longer misclassifying hand-written files such as "admin.js".
func isGenerated(file string) bool {
	return hasTokenSuffix(file, "min.js") || hasTokenSuffix(file, "js.map")
}

// hasTokenSuffix reports whether s ends with suffix and the byte immediately
// before the suffix, if there is one, is not an ASCII letter or digit.
func hasTokenSuffix(s, suffix string) bool {
	if !strings.HasSuffix(s, suffix) {
		return false
	}
	if len(s) == len(suffix) {
		// The whole name is the suffix; there is no preceding byte to inspect.
		return true
	}
	switch c := s[len(s)-len(suffix)-1]; {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		// The suffix continues a longer word, e.g. "ad" + "min.js".
		return false
	default:
		return true
	}
}

// isVendored reports whether file contains the substring "vendor/" or
// "node_modules/" anywhere in its path.
func isVendored(file string) bool {
	for _, marker := range [...]string{"vendor/", "node_modules/"} {
		if strings.Contains(file, marker) {
			return true
		}
	}
	return false
}

type rankedDoc struct {
*zoekt.Document
Expand All @@ -911,12 +929,12 @@ type rankedDoc struct {
// have a higher chance of being searched before limits kick in.
func rank(d *zoekt.Document, origIdx int) []float64 {
generated := 0.0
if strings.HasSuffix(d.Name, "min.js") || strings.HasSuffix(d.Name, "js.map") {
if isGenerated(d.Name) {
generated = 1.0
}

vendor := 0.0
if strings.Contains(d.Name, "vendor/") || strings.Contains(d.Name, "node_modules/") {
if isVendored(d.Name) {
vendor = 1.0
}

Expand Down
34 changes: 34 additions & 0 deletions build/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,40 @@ func TestFindRepositoryMetadata(t *testing.T) {
}
}

// TestIsLowPriority checks IsLowPriority against file names that should be
// flagged (test, vendored, and generated files) and against ordinary source
// files that should not be. Each file name runs as its own subtest.
func TestIsLowPriority(t *testing.T) {
	// check runs one subtest per file and reports failFormat (which takes the
	// file name) whenever IsLowPriority disagrees with want.
	check := func(files []string, want bool, failFormat string) {
		for _, file := range files {
			t.Run(file, func(t *testing.T) {
				if IsLowPriority(file) != want {
					t.Errorf(failFormat, file)
				}
			})
		}
	}

	check([]string{
		"builder_test.go",
		"TestQuery.java",
		"test/mocks.go",
		"search/vendor/thirdparty.cc",
		"search/node_modules/search/index.js",
		"search.min.js",
		"internal/search.js.map",
	}, true, "expected file '%s' to be low priority")

	check([]string{
		"builder.go",
		"RoutesTrigger.java",
		"search.js",
	}, false, "did not expect file '%s' to be low priority")
}

func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {
t.Helper()

Expand Down
36 changes: 32 additions & 4 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
}

var ranks repoPathRanks
var meanRank float64
if opts.BuildOptions.DocumentRanksPath != "" {
data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
if err != nil {
Expand All @@ -505,6 +506,17 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
if err != nil {
return err
}

// Compute the mean rank for this repository. Note: we overwrite the rank
// mean that's stored in the document ranks file, since that currently
// represents a global mean rank across repos, which is not what we want.
numRanks := len(ranks.Paths)
if numRanks > 0 {
for _, rank := range ranks.Paths {
meanRank += rank
}
ranks.MeanRank = meanRank / float64(numRanks)
}
}

// we don't need to check error, since we either already have an error, or
Expand Down Expand Up @@ -555,17 +567,19 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
return err
}

var pathRank []float64
if rank, ok := ranks.Paths[keyFullPath]; ok {
pathRank = []float64{rank}
var pathRanks []float64
if len(ranks.Paths) > 0 {
// If the repository has ranking data, then store the file's rank.
pathRank := ranks.rank(keyFullPath)
pathRanks = []float64{pathRank}
}

if err := builder.Add(zoekt.Document{
SubRepositoryPath: key.SubRepoPath,
Name: keyFullPath,
Content: contents,
Branches: brs,
Ranks: pathRank,
Ranks: pathRanks,
}); err != nil {
return fmt.Errorf("error adding document with name %s: %w", keyFullPath, err)
}
Expand All @@ -580,6 +594,20 @@ type repoPathRanks struct {
Paths map[string]float64 `json:"paths"`
}

// rank returns the rank to store for path, using the first rule that applies:
//   - If r.Paths holds a concrete rank for the path, that rank is always used,
//     even when it is 0.
//   - If there is no recorded rank and build.IsLowPriority flags the path (for
//     example a test file), the rank is 0.
//   - Otherwise the rank is r.MeanRank, so that a missing rank does not put the
//     file at a big disadvantage.
func (r repoPathRanks) rank(path string) float64 {
	if rank, ok := r.Paths[path]; ok {
		return rank
	}
	if build.IsLowPriority(path) {
		return 0.0
	}
	return r.MeanRank
}

func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
ignoreFile, err := tree.File(ignore.IgnoreFile)
if err == object.ErrFileNotFound {
Expand Down
57 changes: 57 additions & 0 deletions gitindex/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,63 @@ func TestIndexDeltaBasic(t *testing.T) {
}
}

// TestRepoPathRanks exercises repoPathRanks.rank with a fixed set of recorded
// ranks and a fixed mean rank. It covers paths that have a recorded rank
// (including a recorded rank of 0 and a recorded test file) as well as paths
// with no recorded rank, which should fall back to either the mean rank or 0.
func TestRepoPathRanks(t *testing.T) {
	ranks := repoPathRanks{
		MeanRank: 3.3,
		Paths: map[string]float64{
			"search.go":              10.23,
			"internal/index.go":      5.5,
			"internal/scratch.go":    0.0,
			"backend/search_test.go": 2.1,
		},
	}

	tests := []struct {
		desc string
		file string
		want float64
	}{
		{desc: "rank for standard file", file: "search.go", want: 10.23},
		{desc: "file with rank 0", file: "internal/scratch.go", want: 0.0},
		{desc: "rank for test file", file: "backend/search_test.go", want: 2.1},
		{desc: "file with missing rank", file: "internal/docs.md", want: 3.3},
		{desc: "test file with missing rank", file: "backend/index_test.go", want: 0.0},
		{desc: "third-party file with missing rank", file: "node_modules/search/index.js", want: 0.0},
	}

	for _, tc := range tests {
		t.Run(tc.desc, func(t *testing.T) {
			if got := ranks.rank(tc.file); got != tc.want {
				t.Errorf("expected file '%s' to have rank %f, but got %f", tc.file, tc.want, got)
			}
		})
	}
}

func runScript(t *testing.T, cwd string, script string) {
err := os.MkdirAll(cwd, 0755)
if err != nil {
Expand Down

0 comments on commit 45754a7

Please sign in to comment.