Skip to content

Commit

Permalink
Ranking: increase contribution of repo rank (sourcegraph#546)
Browse files Browse the repository at this point in the history
The file score includes a "repo rank" signal, which is based on the
repository's number of stars. Previously, we were aggressively normalizing the
number of stars, which made the repo ranks small and close together. This PR
changes the normalization to spread the repo ranks more evenly over the full
range, which increases the repo rank's contribution to the file score.
  • Loading branch information
jtibshirani authored Feb 27, 2023
1 parent 2560773 commit 5f25b30
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 13 deletions.
6 changes: 3 additions & 3 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"encoding/json"
"errors"
"fmt"
"math"
"reflect"
"strconv"
"time"
Expand Down Expand Up @@ -618,8 +617,9 @@ func (r *Repository) UnmarshalJSON(data []byte) error {
// based on priority. Setting it on read instead of during indexing
// allows us to avoid a complete reindex.
if r.Rank == 0 && r.priority > 0 {
l := math.Log(float64(r.priority))
repo.Rank = uint16((1.0 - 1.0/math.Pow(1+l, 0.6)) * 10000)
// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
}
}
return nil
Expand Down
91 changes: 91 additions & 0 deletions build/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1241,3 +1241,94 @@ func TestScoringWithDocumentRanks(t *testing.T) {
})
}
}

// TestRepoRanks indexes the same Java fixture under several repository ranks
// and checks that the score of the single resulting file match moves with the
// rank. Every sub-test rebuilds the index in the same temporary directory
// before searching it.
func TestRepoRanks(t *testing.T) {
	// Outside CI the test is skipped when no ctags binary can be found.
	if os.Getenv("CI") == "" && checkCTags() == "" {
		t.Skip("ctags not available")
	}
	dir := t.TempDir()

	opts := Options{
		IndexDir: dir,
		RepositoryDescription: zoekt.Repository{
			Name: "repo",
		},
		DocumentRanksVersion: "ranking",
	}

	searchQuery := &query.Substring{Content: true, Pattern: "Inner"}
	exampleJava, err := os.ReadFile("./testdata/example.java")
	if err != nil {
		t.Fatal(err)
	}

	type rankCase struct {
		name      string
		repoRank  uint16
		wantScore float64
	}

	// NOTE(review): the components listed in the comments below add up to
	// 7010 before the repo rank term, while every wantScore is 2.00 higher.
	// Confirm which of the two is stale.
	cases := []rankCase{
		{
			name: "no shard rank",
			// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order)
			wantScore: 7012.00,
		},
		{
			name:     "medium shard rank",
			repoRank: 30000,
			// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) + 9.16 (repo rank)
			wantScore: 7021.16,
		},
		{
			name:     "high shard rank",
			repoRank: 60000,
			// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) + 18.31 (repo rank)
			wantScore: 7030.31,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Only the rank differs between cases; the repository name stays fixed.
			opts.RepositoryDescription = zoekt.Repository{
				Name: "repo",
				Rank: tc.repoRank,
			}

			builder, err := NewBuilder(opts)
			if err != nil {
				t.Fatalf("NewBuilder: %v", err)
			}

			if err := builder.Add(zoekt.Document{Name: "example.java", Content: exampleJava}); err != nil {
				t.Fatal(err)
			}

			if err := builder.Finish(); err != nil {
				t.Fatalf("Finish: %v", err)
			}

			searcher, err := shards.NewDirectorySearcher(dir)
			if err != nil {
				t.Fatalf("NewDirectorySearcher(%s): %v", dir, err)
			}
			// Released when this sub-test returns.
			defer searcher.Close()

			searchOpts := &zoekt.SearchOptions{
				UseDocumentRanks: true,
				DebugScore:       true,
			}
			result, err := searcher.Search(context.Background(), searchQuery, searchOpts)
			if err != nil {
				t.Fatal(err)
			}

			// Exactly one file is indexed, so exactly one file match is expected.
			if n := len(result.Files); n != 1 {
				t.Fatalf("file matches: want %d, got %d", 1, n)
			}

			// Scores are floats; compare with a small tolerance.
			file := result.Files[0]
			if got := file.Score; math.Abs(got-tc.wantScore) >= 0.01 {
				t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", tc.wantScore, got, file.Debug, file.LineMatches[0].DebugScore)
			}
		})
	}
}
2 changes: 1 addition & 1 deletion contentprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ const (
// equal weight with the query-dependent signals.
scoreFileRankFactor = 9000.0
scoreFileOrderFactor = 10.0
scoreShardRankFactor = 20.0
scoreRepoRankFactor = 20.0

// Used for ordering line and chunk matches within a file.
scoreLineOrderFactor = 1.0
Expand Down
2 changes: 1 addition & 1 deletion eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ nextFileMatch:
}

fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore)
fileMatch.addScore("shard-order", scoreShardRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)

fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
sortMatchesByScore(fileMatch.LineMatches)
Expand Down
16 changes: 8 additions & 8 deletions shards/shards_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,10 @@ func TestShardedSearcher_Ranking(t *testing.T) {
nextShardNum++
}

addShard("weekend-project", 0.25, zoekt.Document{Name: "f2", Content: []byte("foo bas")})
addShard("moderately-popular", 0.5, zoekt.Document{Name: "f3", Content: []byte("foo bar")})
addShard("weekend-project-2", 0.25, zoekt.Document{Name: "f2", Content: []byte("foo bas")})
addShard("super-star", 0.9, zoekt.Document{Name: "f1", Content: []byte("foo bar bas")})
addShard("weekend-project", 20, zoekt.Document{Name: "f2", Content: []byte("foo bas")})
addShard("moderately-popular", 500, zoekt.Document{Name: "f3", Content: []byte("foo bar")})
addShard("weekend-project-2", 20, zoekt.Document{Name: "f2", Content: []byte("foo bas")})
addShard("super-star", 5000, zoekt.Document{Name: "f1", Content: []byte("foo bar bas")})

want := []string{
"super-star",
Expand Down Expand Up @@ -244,10 +244,10 @@ func TestShardedSearcher_DocumentRanking(t *testing.T) {
nextShardNum++
}

addShard("weekend-project", 0.25, zoekt.Document{Name: "f1", Content: []byte("foobar")})
addShard("moderately-popular", 0.4, zoekt.Document{Name: "f2", Content: []byte("foobaz")})
addShard("weekend-project-2", 0.25, zoekt.Document{Name: "f3", Content: []byte("foo bar")})
addShard("super-star", 0.9, zoekt.Document{Name: "f4", Content: []byte("foo baz")},
addShard("weekend-project", 20, zoekt.Document{Name: "f1", Content: []byte("foobar")})
addShard("moderately-popular", 500, zoekt.Document{Name: "f2", Content: []byte("foobaz")})
addShard("weekend-project-2", 20, zoekt.Document{Name: "f3", Content: []byte("foo bar")})
addShard("super-star", 5000, zoekt.Document{Name: "f4", Content: []byte("foo baz")},
zoekt.Document{Name: "f5", Content: []byte("fooooo")})

// Run a stream search and gather the results
Expand Down

0 comments on commit 5f25b30

Please sign in to comment.