Skip to content

Commit 45754a7

Browse files
authored
Ranking: handle files with missing ranks (#555)
Even when a repo has ranking data, certain files will not have ranks, such as Markdown or YAML files. Currently these files get rank 0, which puts them at a big disadvantage and means they're usually ranked last. This PR proposes using the mean repo rank instead of 0. The rules:
* If we have a concrete rank for the file, always use it.
* If there's no rank and it's a low-priority file like a test, use rank 0.
* Otherwise, use the mean rank for the repository.

We don't attempt to handle the case where an entire repo is missing ranks because it doesn't have precise code intel.
1 parent 250c2ef commit 45754a7

File tree

4 files changed

+144
-7
lines changed

4 files changed

+144
-7
lines changed

build/builder.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -898,7 +898,25 @@ func squashRange(j int) float64 {
898898
return x / (1 + x)
899899
}
900900

901-
var testRe = regexp.MustCompile("test")
901+
// IsLowPriority reports whether a file name looks like it belongs to a test,
// a generated artifact, or a vendored dependency. The decision is a heuristic
// based purely on the name; file contents are never inspected.
//
// This 'priority' criterion affects how documents are ordered within a shard.
// It is also used to help guess a file's rank when ranking information is
// missing.
func IsLowPriority(file string) bool {
	switch {
	case testRe.MatchString(file):
		return true
	case isGenerated(file):
		return true
	default:
		return isVendored(file)
	}
}

// testRe matches any name that contains "test" or "Test" anywhere in it.
var testRe = regexp.MustCompile("[Tt]est")

// isGenerated reports whether the file name ends in one of the suffixes
// treated as generated JavaScript output ("min.js" or "js.map").
func isGenerated(file string) bool {
	for _, suffix := range []string{"min.js", "js.map"} {
		if strings.HasSuffix(file, suffix) {
			return true
		}
	}
	return false
}

// isVendored reports whether the path contains a "vendor/" or
// "node_modules/" segment marker anywhere in it.
func isVendored(file string) bool {
	if strings.Contains(file, "vendor/") {
		return true
	}
	return strings.Contains(file, "node_modules/")
}
902920

903921
type rankedDoc struct {
904922
*zoekt.Document
@@ -911,12 +929,12 @@ type rankedDoc struct {
911929
// have a higher chance of being searched before limits kick in.
912930
func rank(d *zoekt.Document, origIdx int) []float64 {
913931
generated := 0.0
914-
if strings.HasSuffix(d.Name, "min.js") || strings.HasSuffix(d.Name, "js.map") {
932+
if isGenerated(d.Name) {
915933
generated = 1.0
916934
}
917935

918936
vendor := 0.0
919-
if strings.Contains(d.Name, "vendor/") || strings.Contains(d.Name, "node_modules/") {
937+
if isVendored(d.Name) {
920938
vendor = 1.0
921939
}
922940

build/builder_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,40 @@ func TestFindRepositoryMetadata(t *testing.T) {
757757
}
758758
}
759759

760+
func TestIsLowPriority(t *testing.T) {
761+
cases := []string{
762+
"builder_test.go",
763+
"TestQuery.java",
764+
"test/mocks.go",
765+
"search/vendor/thirdparty.cc",
766+
"search/node_modules/search/index.js",
767+
"search.min.js",
768+
"internal/search.js.map",
769+
}
770+
771+
for _, tt := range cases {
772+
t.Run(tt, func(t *testing.T) {
773+
if !IsLowPriority(tt) {
774+
t.Errorf("expected file '%s' to be low priority", tt)
775+
}
776+
})
777+
}
778+
779+
negativeCases := []string{
780+
"builder.go",
781+
"RoutesTrigger.java",
782+
"search.js",
783+
}
784+
785+
for _, tt := range negativeCases {
786+
t.Run(tt, func(t *testing.T) {
787+
if IsLowPriority(tt) {
788+
t.Errorf("did not expect file '%s' to be low priority", tt)
789+
}
790+
})
791+
}
792+
}
793+
760794
func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {
761795
t.Helper()
762796

gitindex/index.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
495495
}
496496

497497
var ranks repoPathRanks
498+
var meanRank float64
498499
if opts.BuildOptions.DocumentRanksPath != "" {
499500
data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath)
500501
if err != nil {
@@ -505,6 +506,17 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
505506
if err != nil {
506507
return err
507508
}
509+
510+
// Compute the mean rank for this repository. Note: we overwrite the rank
511+
// mean that's stored in the document ranks file, since that currently
512+
// represents a global mean rank across repos, which is not what we want.
513+
numRanks := len(ranks.Paths)
514+
if numRanks > 0 {
515+
for _, rank := range ranks.Paths {
516+
meanRank += rank
517+
}
518+
ranks.MeanRank = meanRank / float64(numRanks)
519+
}
508520
}
509521

510522
// we don't need to check error, since we either already have an error, or
@@ -555,17 +567,19 @@ func indexGitRepo(opts Options, config gitIndexConfig) error {
555567
return err
556568
}
557569

558-
var pathRank []float64
559-
if rank, ok := ranks.Paths[keyFullPath]; ok {
560-
pathRank = []float64{rank}
570+
var pathRanks []float64
571+
if len(ranks.Paths) > 0 {
572+
// If the repository has ranking data, then store the file's rank.
573+
pathRank := ranks.rank(keyFullPath)
574+
pathRanks = []float64{pathRank}
561575
}
562576

563577
if err := builder.Add(zoekt.Document{
564578
SubRepositoryPath: key.SubRepoPath,
565579
Name: keyFullPath,
566580
Content: contents,
567581
Branches: brs,
568-
Ranks: pathRank,
582+
Ranks: pathRanks,
569583
}); err != nil {
570584
return fmt.Errorf("error adding document with name %s: %w", keyFullPath, err)
571585
}
@@ -580,6 +594,20 @@ type repoPathRanks struct {
580594
Paths map[string]float64 `json:"paths"`
581595
}
582596

597+
// rank returns the rank for a given path. It uses these rules:
598+
// - If we have a concrete rank for this file, always use it
599+
// - If there's no rank, and it's a low priority file like a test, then use rank 0
600+
// - Otherwise use the mean rank of this repository, to avoid giving it a big disadvantage
601+
func (r repoPathRanks) rank(path string) float64 {
602+
if rank, ok := r.Paths[path]; ok {
603+
return rank
604+
} else if build.IsLowPriority(path) {
605+
return 0.0
606+
} else {
607+
return r.MeanRank
608+
}
609+
}
610+
583611
func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {
584612
ignoreFile, err := tree.File(ignore.IgnoreFile)
585613
if err == object.ErrFileNotFound {

gitindex/index_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,63 @@ func TestIndexDeltaBasic(t *testing.T) {
695695
}
696696
}
697697

698+
func TestRepoPathRanks(t *testing.T) {
699+
pathRanks := repoPathRanks{
700+
Paths: map[string]float64{
701+
"search.go": 10.23,
702+
"internal/index.go": 5.5,
703+
"internal/scratch.go": 0.0,
704+
"backend/search_test.go": 2.1,
705+
},
706+
MeanRank: 3.3,
707+
}
708+
cases := []struct {
709+
name string
710+
path string
711+
rank float64
712+
}{
713+
{
714+
name: "rank for standard file",
715+
path: "search.go",
716+
rank: 10.23,
717+
},
718+
{
719+
name: "file with rank 0",
720+
path: "internal/scratch.go",
721+
rank: 0.0,
722+
},
723+
{
724+
name: "rank for test file",
725+
path: "backend/search_test.go",
726+
rank: 2.1,
727+
},
728+
{
729+
name: "file with missing rank",
730+
path: "internal/docs.md",
731+
rank: 3.3,
732+
},
733+
{
734+
name: "test file with missing rank",
735+
path: "backend/index_test.go",
736+
rank: 0.0,
737+
},
738+
{
739+
name: "third-party file with missing rank",
740+
path: "node_modules/search/index.js",
741+
rank: 0.0,
742+
},
743+
}
744+
745+
for _, tt := range cases {
746+
t.Run(tt.name, func(t *testing.T) {
747+
got := pathRanks.rank(tt.path)
748+
if got != tt.rank {
749+
t.Errorf("expected file '%s' to have rank %f, but got %f", tt.path, tt.rank, got)
750+
}
751+
})
752+
}
753+
}
754+
698755
func runScript(t *testing.T, cwd string, script string) {
699756
err := os.MkdirAll(cwd, 0755)
700757
if err != nil {

0 commit comments

Comments
 (0)