Skip to content

Commit c03b77f

Browse files
authored
Boost symbol matches in BM25 (sourcegraph#876)
When digging into our Natural Language Search (NLS) eval results, I found that one of the leading causes of failures for flexible search types like "Fuzzy symbol search" and "Find logic" was noisy matches in the top results. Currently, our BM25 ranking rewards any substring match equally. So for queries like 'extract tar', any match on 'tar' (even within unrelated terms like 'start', etc.) counts towards the term frequency. This PR helps reduce noise by boosting symbol matches the same as we do filename matches. Our NLS evals show positive improvement, and context evals are the tiniest bit better.
1 parent c5dd69f commit c03b77f

File tree

5 files changed

+44
-80
lines changed

5 files changed

+44
-80
lines changed

build/scoring_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ func TestBM25(t *testing.T) {
7777
query: &query.Substring{Pattern: "example"},
7878
content: exampleJava,
7979
language: "Java",
80-
// bm25-score: 0.57 <- sum-termFrequencyScore: 10.00, length-ratio: 1.00
81-
wantScore: 0.57,
80+
// bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00
81+
wantScore: 0.58,
8282
}, {
8383
// Matches only on content
8484
fileName: "example.java",
@@ -89,8 +89,8 @@ func TestBM25(t *testing.T) {
8989
}},
9090
content: exampleJava,
9191
language: "Java",
92-
// bm25-score: 1.75 <- sum-termFrequencyScore: 56.00, length-ratio: 1.00
93-
wantScore: 1.75,
92+
// bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00
93+
wantScore: 1.81,
9494
},
9595
{
9696
// Matches only on filename

contentprovider.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,22 @@ func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32,
588588
return uint32(j), ol1 > 0
589589
}
590590

591+
func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
592+
if cm.fileName {
593+
return false
594+
}
595+
596+
// Check if this candidate came from a symbol matchTree
597+
if cm.symbol {
598+
return true
599+
}
600+
601+
// Check if it overlaps with a symbol.
602+
secs := p.docSections()
603+
_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
604+
return ok
605+
}
606+
591607
func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
592608
if cm.fileName {
593609
return DocumentSection{}, nil, false
@@ -619,6 +635,29 @@ func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symb
619635
return sec, si, true
620636
}
621637

638+
// calculateTermFrequency computes the term frequency for the file match.
639+
// Notes:
640+
// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
641+
// * Symbol matches also count more than content matches, to reward matches on symbol definitions.
642+
func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
643+
// Treat each candidate match as a term and compute the frequencies. For now, ignore case
644+
// sensitivity. Filename and symbol matches are weighted 5x; other content matches count once.
645+
termFreqs := map[string]int{}
646+
for _, m := range cands {
647+
term := string(m.substrLowered)
648+
if m.fileName || p.matchesSymbol(m) {
649+
termFreqs[term] += 5
650+
} else {
651+
termFreqs[term]++
652+
}
653+
}
654+
655+
for term := range termFreqs {
656+
df[term] += 1
657+
}
658+
return termFreqs
659+
}
660+
622661
func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
623662
type debugScore struct {
624663
what string

eval.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ nextFileMatch:
339339
// document frequencies. Since we don't store document frequencies in the index,
340340
// we have to defer the calculation of the final BM25 score to after the whole
341341
// shard has been processed.
342-
tf = calculateTermFrequency(finalCands, df)
342+
tf = cp.calculateTermFrequency(finalCands, df)
343343
} else {
344344
// Use the standard, non-experimental scoring method by default
345345
d.scoreFile(&fileMatch, nextDoc, mt, known, opts)

score.go

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -110,30 +110,6 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
110110
}
111111
}
112112

113-
// calculateTermFrequency computes the term frequency for the file match.
114-
//
115-
// Filename matches count more than content matches. This mimics a common text
116-
// search strategy where you 'boost' matches on document titles.
117-
func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
118-
// Treat each candidate match as a term and compute the frequencies. For now, ignore case
119-
// sensitivity and treat filenames and symbols the same as content.
120-
termFreqs := map[string]int{}
121-
for _, cand := range cands {
122-
term := string(cand.substrLowered)
123-
if cand.fileName {
124-
termFreqs[term] += 5
125-
} else {
126-
termFreqs[term]++
127-
}
128-
}
129-
130-
for term := range termFreqs {
131-
df[term] += 1
132-
}
133-
134-
return termFreqs
135-
}
136-
137113
// idf computes the inverse document frequency for a term. nq is the number of
138114
// documents that contain the term and documentCount is the total number of
139115
// documents in the corpus.

score_test.go

Lines changed: 0 additions & 51 deletions
This file was deleted.

0 commit comments

Comments
 (0)