Skip to content

Commit

Permalink
list: add indextime to MinimalRepoListEntry (sourcegraph#596)
Browse files Browse the repository at this point in the history
We want this information in Sourcegraph. This turned into a bit more of
a hairy change due to our custom marshalling (for perf) and grpc
validation.

Test Plan: lots of unit test coverage
  • Loading branch information
keegancsmith authored Jun 14, 2023
1 parent ae9c94d commit e1876ff
Show file tree
Hide file tree
Showing 12 changed files with 305 additions and 235 deletions.
27 changes: 26 additions & 1 deletion api.go
Original file line number Diff line number Diff line change
Expand Up @@ -742,9 +742,34 @@ type RepoListEntry struct {
Stats RepoStats
}

// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
// HasSymbols is exported since Sourcegraph uses this information at search
// planning time to decide between Zoekt and an unindexed symbol search.
//
// Note: it pretty much is always true in practice.
HasSymbols bool
Branches []RepositoryBranch

// Branches is used by Sourcegraphs query planner to decided if it can use
// zoekt or go via an unindexed code path.
Branches []RepositoryBranch

// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
// since the epoch). This is to make it clear we are not transporting the
// full fidelty timestamp (ie with milliseconds and location). Additionally
// it saves 16 bytes in this struct.
//
// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
// how many repositories need updating after a ranking change/etc.
//
// TODO(keegancsmith) audit updates to IndexTime and document how and when
// it changes. Concerned about things like metadata updates or compound
// shards leading to untrustworthy data here.
IndexTimeUnix int64
}

type ReposMap map[uint32]MinimalRepoListEntry
Expand Down
10 changes: 6 additions & 4 deletions api_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,9 @@ func MinimalRepoListEntryFromProto(p *proto.MinimalRepoListEntry) MinimalRepoLis
}

return MinimalRepoListEntry{
HasSymbols: p.GetHasSymbols(),
Branches: branches,
HasSymbols: p.GetHasSymbols(),
Branches: branches,
IndexTimeUnix: p.GetIndexTimeUnix(),
}
}

Expand All @@ -579,8 +580,9 @@ func (m *MinimalRepoListEntry) ToProto() *proto.MinimalRepoListEntry {
branches[i] = branch.ToProto()
}
return &proto.MinimalRepoListEntry{
HasSymbols: m.HasSymbols,
Branches: branches,
HasSymbols: m.HasSymbols,
Branches: branches,
IndexTimeUnix: m.IndexTimeUnix,
}
}

Expand Down
21 changes: 13 additions & 8 deletions api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"encoding/gob"
"strings"
"testing"
"time"
)

/*
Expand All @@ -29,24 +30,28 @@ func BenchmarkMinimalRepoListEncodings(b *testing.B) {
size := uint32(13000) // 2021-06-24 rough estimate of number of repos on a replica.

type Slice struct {
ID uint32
HasSymbols bool
Branches []RepositoryBranch
ID uint32
HasSymbols bool
Branches []RepositoryBranch
IndexTimeUnix int64
}

branches := []RepositoryBranch{{Name: "HEAD", Version: strings.Repeat("a", 40)}}
mapData := make(map[uint32]*MinimalRepoListEntry, size)
sliceData := make([]Slice, 0, size)
indexTime := time.Now().Unix()

for id := uint32(1); id <= size; id++ {
mapData[id] = &MinimalRepoListEntry{
HasSymbols: true,
Branches: branches,
HasSymbols: true,
Branches: branches,
IndexTimeUnix: indexTime,
}
sliceData = append(sliceData, Slice{
ID: id,
HasSymbols: true,
Branches: branches,
ID: id,
HasSymbols: true,
Branches: branches,
IndexTimeUnix: indexTime,
})
}

Expand Down
10 changes: 6 additions & 4 deletions eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -717,13 +717,15 @@ func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl
l.Repos = append(l.Repos, rle)
case RepoListFieldMinimal:
l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
HasSymbols: rle.Repository.HasSymbols,
Branches: rle.Repository.Branches,
HasSymbols: rle.Repository.HasSymbols,
Branches: rle.Repository.Branches,
IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
}
case RepoListFieldReposMap:
l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
HasSymbols: rle.Repository.HasSymbols,
Branches: rle.Repository.Branches,
HasSymbols: rle.Repository.HasSymbols,
Branches: rle.Repository.Branches,
IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
}
}

Expand Down
428 changes: 219 additions & 209 deletions grpc/v1/webserver.pb.go

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions grpc/v1/webserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ message IndexMetadata {
message MinimalRepoListEntry {
bool has_symbols = 1;
repeated RepositoryBranch branches = 2;
int64 index_time_unix = 3;
}

// RepositoryBranch describes an indexed branch, which is a name
Expand Down
5 changes: 4 additions & 1 deletion index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1782,7 +1782,10 @@ func TestListRepos(t *testing.T) {
},
}

if diff := cmp.Diff(want, res); diff != "" {
ignored := []cmp.Option{
cmpopts.IgnoreFields(MinimalRepoListEntry{}, "IndexTimeUnix"),
}
if diff := cmp.Diff(want, res, ignored...); diff != "" {
t.Fatalf("mismatch (-want +got):\n%s", diff)
}

Expand Down
28 changes: 23 additions & 5 deletions marshal.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,19 @@ import (

// Wire-format of map[uint32]MinimalRepoListEntry is pretty straightforward:
//
// byte(1) version
// byte(2) version
// uvarint(len(minimal))
// uvarint(sum(len(entry.Branches) for entry in minimal))
// for repoID, entry in minimal:
// uvarint(repoID)
// byte(entry.HasSymbols)
// uvarint(entry.IndexTimeUnix)
// uvarint(len(entry.Branches))
// for b in entry.Branches:
// str(b.Name)
// str(b.Version)
//
// Version 1 was the same, except it didn't have the IndexTimeUnix field.

// reposMapEncode implements an efficient encoder for ReposMap.
func reposMapEncode(minimal ReposMap) ([]byte, error) {
Expand Down Expand Up @@ -54,6 +57,7 @@ func reposMapEncode(minimal ReposMap) ([]byte, error) {
for repoID, entry := range minimal {
size += binary.PutUvarint(enc[:], uint64(repoID))
size += 1 // HasSymbols
size += binary.PutUvarint(enc[:], uint64(entry.IndexTimeUnix))
size += binary.PutUvarint(enc[:], uint64(len(entry.Branches)))
for _, b := range entry.Branches {
size += strSize(b.Name)
Expand All @@ -63,7 +67,7 @@ func reposMapEncode(minimal ReposMap) ([]byte, error) {
b.Grow(size)

// Version
b.WriteByte(1)
b.WriteByte(2)

// Length
varint(len(minimal))
Expand All @@ -79,6 +83,8 @@ func reposMapEncode(minimal ReposMap) ([]byte, error) {
}
b.WriteByte(hasSymbols)

varint(int(entry.IndexTimeUnix))

varint(len(entry.Branches))
for _, b := range entry.Branches {
str(b.Name)
Expand All @@ -104,7 +110,14 @@ func reposMapDecode(b []byte) (ReposMap, error) {
}

// Version
if v := r.byt(); v != 1 {
var readIndexTime bool
v := r.byt()
switch v {
case 1:
readIndexTime = false
case 2:
readIndexTime = true
default:
return nil, fmt.Errorf("unsupported stringSet encoding version %d", v)
}

Expand All @@ -119,6 +132,10 @@ func reposMapDecode(b []byte) (ReposMap, error) {
for i := 0; i < l; i++ {
repoID := r.uvarint()
hasSymbols := r.byt() == 1
var indexTimeUnix int64
if readIndexTime {
indexTimeUnix = int64(r.uvarint())
}
lb := r.uvarint()
for i := 0; i < lb; i++ {
allBranches = append(allBranches, RepositoryBranch{
Expand All @@ -128,8 +145,9 @@ func reposMapDecode(b []byte) (ReposMap, error) {
}
branches := allBranches[len(allBranches)-lb:]
m[uint32(repoID)] = MinimalRepoListEntry{
HasSymbols: hasSymbols,
Branches: branches,
HasSymbols: hasSymbols,
Branches: branches,
IndexTimeUnix: indexTimeUnix,
}
}

Expand Down
5 changes: 4 additions & 1 deletion marshal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"encoding/gob"
"testing"
"time"

"github.com/google/go-cmp/cmp"
)
Expand Down Expand Up @@ -72,9 +73,11 @@ func TestRepoList_Marshal(t *testing.T) {

func genRepoList(size int) *RepoList {
m := make(ReposMap, size)
indexTime := time.Now().Unix()
for i := 0; i < size; i++ {
m[uint32(i)] = MinimalRepoListEntry{
HasSymbols: true,
HasSymbols: true,
IndexTimeUnix: indexTime,
Branches: []RepositoryBranch{{
Name: "HEAD",
Version: "c301e5c82b6e1632dce5c39902691c359559852e",
Expand Down
1 change: 1 addition & 0 deletions shards/shards_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ func TestShardedSearcher_List(t *testing.T) {

ignored := []cmp.Option{
cmpopts.EquateEmpty(),
cmpopts.IgnoreFields(zoekt.MinimalRepoListEntry{}, "IndexTimeUnix"),
cmpopts.IgnoreFields(zoekt.RepoListEntry{}, "IndexMetadata"),
cmpopts.IgnoreFields(zoekt.RepoStats{}, "IndexBytes"),
cmpopts.IgnoreFields(zoekt.Repository{}, "SubRepoMap"),
Expand Down
2 changes: 1 addition & 1 deletion web/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func TestBasic(t *testing.T) {
ts := httptest.NewServer(mux)
defer ts.Close()

nowStr := time.Now().Format("Jan 02, 2006 15:04")
nowStr := time.Now().UTC().Format("Jan 02, 2006 15:04")
for req, needles := range map[string][]string{
"/": {"from 1 repositories"},
"/search?q=water": {
Expand Down
2 changes: 1 addition & 1 deletion write.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func (b *IndexBuilder) Write(out io.Writer) error {

indexTime := b.IndexTime
if indexTime.IsZero() {
indexTime = time.Now()
indexTime = time.Now().UTC()
}

if err := b.writeJSON(&IndexMetadata{
Expand Down

0 comments on commit e1876ff

Please sign in to comment.