
Commit fad5576

Merge pull request #12065 from github/repo-sync
repo sync
2 parents b0831ed + 5760b85 commit fad5576

File tree: 3 files changed (+101, −22 lines changed)


lib/failbot.js

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ async function retryingGot(url, args) {
   )
 }
 
-export function report(error, metadata) {
+export async function report(error, metadata) {
   // If there's no HAYSTACK_URL set, bail early
   if (!process.env.HAYSTACK_URL) return
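With `report` now declared `async`, it always returns a Promise, so callers can await it; when `HAYSTACK_URL` is unset it simply resolves to `undefined`. A minimal caller sketch under that assumption (the import path and the `handleError` wrapper are illustrative, not code from this repository):

// Sketch only: `report` is a named export per the diff above.
import { report } from './lib/failbot.js'

async function handleError(error) {
  // Resolves to undefined right away when HAYSTACK_URL is not set,
  // otherwise resolves once the report request has been issued.
  return await report(error, { path: '/example' })
}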

lib/search/lunr-search.js

Lines changed: 93 additions & 18 deletions
@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
   // want to make sure this number accounts for that.
   const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
 
+  // Multiplication bonus given to matches that were made on the
+  // search where ALL tokens are required.
+  // E.g. you search for 'foo bar' and we have three records:
+  //
+  //   A) "This foo is very special"
+  //   B) "With bar and foo you can't go wrong"
+  //   C) "Only bar can save you"
+  //
+  // What will happen is that only record (B) is found when it's
+  // required to match both 'foo' *and* 'bar'. So you get these scores:
+  //
+  //   A) score = result.score + popularity
+  //   B) score = MATCH_PHRASE * (result.score + popularity)
+  //   C) score = result.score + popularity
+  //
+  // So it's a very powerful multiplier. But that's fine because a
+  // "phrase match" is a very accurate thing.
+  const MATCH_PHRASE = 5
+
   // Imagine that we have 1,000 documents. 100 of them contain the word
   // 'foobar'. Of those 100, we want to display the top 10 "best".
   // But if we only do `lunrindex.search('foobar').slice(0, 10)` we
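To make the comment above concrete, here is a small worked sketch of the scoring arithmetic it describes; the raw scores, popularity values, and `POPULARITY_FACTOR` value are invented for illustration and are not taken from the repository:

const MATCH_PHRASE = 5
const POPULARITY_FACTOR = 1 // illustrative value only

// Invented raw lunr scores and popularity values for the three records above.
const sample = {
  A: { score: 1.4, popularity: 0.2 }, // matches only 'foo'
  B: { score: 1.1, popularity: 0.1 }, // matches 'foo' AND 'bar'
  C: { score: 1.2, popularity: 0.3 }, // matches only 'bar'
}

const plain = ({ score, popularity }) => score + POPULARITY_FACTOR * popularity
console.log(plain(sample.A)) // 1.6
console.log(MATCH_PHRASE * plain(sample.B)) // 6.0, the record matching every token wins
console.log(plain(sample.C)) // 1.5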
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
   // records that we finally return.
   const PRE_LIMIT = 500
 
-  let titleQuery = query.trim()
-  if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
-    // When the search input is really short, force it to search with
-    // the "forward wild card". I.e. you typed `go` we turn it into a
-    // search for `go*` which means it can find things like `Google`.
-    titleQuery += '*'
-  }
+  const titleQuery = query.trim()
 
   let highestTitleScore = 0.0
+
+  const andTitleResults = []
+
+  // This will turn something like 'foo and bar' into:
+  // [
+  //   { str: 'foo', metadata: { position: [Array], index: 0 } },
+  //   { str: 'bar', metadata: { position: [Array], index: 1 } }
+  // ]
+  // Note how the stopword gets omitted.
+  // It's important to omit the stopwords because even if the record
+  // actually contains the stopword, it won't match then.
+  // E.g. you have a record called "Foo And Bar" and you search for
+  // {foo AND and AND bar} it will actually not find anything.
+  // But if you change it to {foo AND bar} it will match "Foo And Bar".
+  // Same goes if any other stopwords were used like "Foo the Bar with for a".
+  // That also needs to become an AND-search of {foo AND bar} ...only.
+  const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
+
+  if (titleQueryTokenized.length > 1) {
+    andTitleResults.push(
+      ...index
+        .query((q) => {
+          for (const { str } of titleQueryTokenized) {
+            q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
+          }
+        })
+        .slice(0, PRE_LIMIT)
+        .map((result) => {
+          const { popularity } = records[result.ref]
+          if (result.score > highestTitleScore) {
+            highestTitleScore = result.score
+          }
+          const score = result.score / highestTitleScore
+          return {
+            result,
+            _score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
+          }
+        })
+    )
+  }
 
   const titleResults = index
     .query((q) => {
-      if (/['"]/.test(titleQuery)) {
-        // If the query contains a quotation marks, you can't easily
-        // enough break it up into individual words.
-        q.term(titleQuery, { fields: ['title'] })
-      } else {
-        // This is the structured way of doing turning 'foo bar'
-        // into `title:foo title:bar'.
-        titleQuery.split(/ /g).forEach((part) => {
-          q.term(part, { fields: ['title'] })
+      // The objective is to create an OR-query specifically for the 'title'
+      // because *we* value matches on that much higher than any other
+      // field in our records.
+      // But we want to make sure that the last word is always treated
+      // like a forward-tokenized token. I.e. you typed "google ku"
+      // becomes a search for "google ku*".
+      // Note that it's important to use the `lunr.tokenizer()` function when
+      // using the `index.query()` function because, for starters, it will
+      // normalize the input.
+      // `index.search()`, by contrast, is the higher abstraction that
+      // basically does this:
+      // (pseudo code)
+      //
+      //    Index.prototype.search = function(input) {
+      //      lunr.tokenize(input).forEach(token => {
+      //        Index.query(callback => {
+      //          callback(token)
+      //        })
+      //      })
+      //    }
+      //
+      // If we didn't use the tokenized form, we'd get different results
+      // for searching for "SSH agent" and "ssh AgenT" for example.
+      titleQueryTokenized.forEach(({ str }, i) => {
+        const isLastToken = i === titleQueryTokenized.length - 1
+        const isShort = str.length <= 3
+        q.term(str, {
+          fields: ['title'],
+          wildcard:
+            isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
         })
-      }
+      })
     })
     .slice(0, PRE_LIMIT)
    .map((result) => {
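The tokenizer and the two query shapes above can be exercised in isolation. A self-contained sketch against a throwaway index (the sample documents are invented for illustration; this is not the repository's index or records):

// Sketch: build a tiny lunr index and run the AND-query and the
// trailing-wildcard OR-query that the diff above constructs for titles.
import lunr from 'lunr'

const docs = [
  { id: 'A', title: 'This foo is very special' },
  { id: 'B', title: 'With bar and foo you can not go wrong' },
  { id: 'C', title: 'Only bar can save you' },
]

const index = lunr(function () {
  this.ref('id')
  this.field('title')
  docs.forEach((doc) => this.add(doc))
})

// 'and' is a lunr stopword, so it is dropped before querying.
const tokens = lunr.tokenizer('foo and bar').filter(lunr.stopWordFilter)

// AND-query: every remaining token is REQUIRED, so only record B
// (which contains both 'foo' and 'bar' in its title) matches.
const andResults = index.query((q) => {
  for (const { str } of tokens) {
    q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
  }
})
console.log(andResults.map((r) => r.ref)) // => ['B']

// OR-query with a trailing wildcard on a short last token, so a partial
// word like 'ba' still matches 'bar' while the user is typing.
const orResults = index.query((q) => {
  q.term('foo', { fields: ['title'] })
  q.term('ba', { fields: ['title'], wildcard: lunr.Query.wildcard.TRAILING })
})
console.log(orResults.map((r) => r.ref)) // => all three records, best scores first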
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
   const _unique = new Set()
   const combinedMatchData = {}
   const results = []
-  for (const matches of [titleResults, allResults]) {
+  for (const matches of [andTitleResults, titleResults, allResults]) {
     for (const match of matches) {
       const { result } = match
       // We need to loop over all results (both from title searches and
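The loop above now folds three result lists into one, with the AND-title results considered first. A hedged sketch of that merge shape, using illustrative names since the rest of the loop is not shown in this hunk (the real code also combines match data, which is omitted here):

// Sketch: earlier lists take precedence, and a Set keyed on the lunr
// result ref keeps the same record from being emitted twice.
function mergeResultLists(...lists) {
  const seen = new Set()
  const merged = []
  for (const matches of lists) {
    for (const match of matches) {
      const { result } = match
      if (seen.has(result.ref)) continue
      seen.add(result.ref)
      merged.push(match)
    }
  }
  return merged
}

// e.g. mergeResultLists(andTitleResults, titleResults, allResults)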

tests/unit/failbot.js

Lines changed: 7 additions & 3 deletions
@@ -5,6 +5,12 @@ describe('FailBot', () => {
   const requestBodiesSent = []
 
   beforeEach(() => {
+    delete process.env.HAYSTACK_URL
+
+    // Always reset the array to an empty one between tests
+    // so it doesn't interfere across tests.
+    requestBodiesSent.length = 0
+
     nock('https://haystack.example.com')
       .post('/')
       .reply(200, (uri, requestBody) => {
@@ -15,15 +21,13 @@ describe('FailBot', () => {
 
   afterEach(() => {
     delete process.env.HAYSTACK_URL
-    // Reset the array to an empty one between tests
-    // so it doesn't intefere across tests.
-    requestBodiesSent.length = 0
   })
 
   describe('.report', () => {
     it('returns early if `HAYSTACK_URL` is not set', async () => {
       const result = await FailBot.report()
       expect(result).toBeUndefined()
+      expect(requestBodiesSent.length).toBe(0)
     })
 
     it('sends the expected report', async () => {
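Resetting shared state in `beforeEach` rather than only in `afterEach` guards against leakage from a previous test file, or from a test that failed before its cleanup ran, and it is what makes the new `requestBodiesSent.length` assertion trustworthy. A minimal Jest-style sketch of the pattern; the import path and namespace-import style are assumptions, since the test file's header is not shown in this diff:

// Sketch only: assumed import path/style for the module changed above.
import * as FailBot from '../../lib/failbot.js'

const requestBodiesSent = []

beforeEach(() => {
  delete process.env.HAYSTACK_URL
  requestBodiesSent.length = 0 // empty the same array instance between tests
})

test('does nothing when HAYSTACK_URL is unset', async () => {
  const result = await FailBot.report(new Error('boom'))
  expect(result).toBeUndefined()
  expect(requestBodiesSent.length).toBe(0)
})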

0 commit comments
