@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
88
88
// want to make sure this number accounts for that.
89
89
const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
90
90
91
+ // Multiplication bonus given to matches that were made on the
92
+ // search where ALL tokens are required.
93
+ // E.g. you search for 'foo bar' and we have three records:
94
+ //
95
+ // A) "This foo is very special"
96
+ // B) "With bar and foo you can't go wrong"
97
+ // C) "Only bar can save you"
98
+ //
99
+ // What will happen is that it only finds record (B) when it's
100
+ // required to match both 'foo' *and* 'bar'. So you get these scores:
101
+ //
102
+ // A) score = result.score + popularity
103
+ // B) score = MATCH_PHRASE * (result.score + popularity)
104
+ // C) score = result.score + popularity
105
+ //
106
+ // So it's a very powerful multiplier. But that's fine because a
107
+ // "phrase match" is a very accurate thing.
108
+ const MATCH_PHRASE = 5
109
+
91
110
// Imagine that we have 1,000 documents. 100 of them contain the word
92
111
// 'foobar'. Of those 100, we want to display the top 10 "best".
93
112
// But if we only do `lunrindex.search('foobar').slice(0, 10)` we
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
101
120
// records that we finally return.
102
121
const PRE_LIMIT = 500
103
122
104
- let titleQuery = query . trim ( )
105
- if ( titleQuery . length <= 3 && ! titleQuery . endsWith ( '*s' ) ) {
106
- // When the search input is really short, force it to search with
107
- // the "forward wild card". I.e. you typed `go` we turn it into a
108
- // search for `go*` which means it can find things like `Google`.
109
- titleQuery += '*'
110
- }
123
+ const titleQuery = query . trim ( )
111
124
112
125
let highestTitleScore = 0.0
126
+
127
+ const andTitleResults = [ ]
128
+
129
+ // This will turn something like 'foo and bar' into:
130
+ // [
131
+ // { str: 'foo', metadata: { position: [Array], index: 0 } },
132
+ // { str: 'bar', metadata: { position: [Array], index: 1 } }
133
+ // ]
134
+ // Note how the stopword gets omitted.
135
+ // It's important to omit the stopwords because even if the record
136
+ // actually contains the stopword, it won't match then.
137
+ // E.g. you have a record called "Foo And Bar" and you search for
138
+ // {foo AND and AND bar} it will actually not find anything.
139
+ // But if you change it to {foo AND bar} it will match "Foo And Bar"
140
+ // Same goes if any other stopwords were used like "Foo the Bar with for a".
141
+ // That also needs to become an AND-search of {foo AND bar} ...only.
142
+ const titleQueryTokenized = lunr . tokenizer ( titleQuery ) . filter ( lunr . stopWordFilter )
143
+
144
+ if ( titleQueryTokenized . length > 1 ) {
145
+ andTitleResults . push (
146
+ ...index
147
+ . query ( ( q ) => {
148
+ for ( const { str } of titleQueryTokenized ) {
149
+ q . term ( str , { fields : [ 'title' ] , presence : lunr . Query . presence . REQUIRED } )
150
+ }
151
+ } )
152
+ . slice ( 0 , PRE_LIMIT )
153
+ . map ( ( result ) => {
154
+ const { popularity } = records [ result . ref ]
155
+ if ( result . score > highestTitleScore ) {
156
+ highestTitleScore = result . score
157
+ }
158
+ const score = result . score / highestTitleScore
159
+ return {
160
+ result,
161
+ _score : MATCH_PHRASE * TITLE_FIRST * ( score + POPULARITY_FACTOR * ( popularity || 0.0 ) ) ,
162
+ }
163
+ } )
164
+ )
165
+ }
166
+
113
167
const titleResults = index
114
168
. query ( ( q ) => {
115
- if ( / [ ' " ] / . test ( titleQuery ) ) {
116
- // If the query contains a quotation marks, you can't easily
117
- // enough break it up into individual words.
118
- q . term ( titleQuery , { fields : [ 'title' ] } )
119
- } else {
120
- // This is the structured way of doing turning 'foo bar'
121
- // into `title:foo title:bar'.
122
- titleQuery . split ( / / g) . forEach ( ( part ) => {
123
- q . term ( part , { fields : [ 'title' ] } )
169
+ // The objective is to create an OR-query specifically for the 'title'
170
+ // because *we* value matches on that much higher than any other
171
+ // field in our records.
172
+ // But we want to make sure that the last word is always treated
173
+ // like a forward-tokenized token. I.e. you typed "google ku"
174
+ // becomes a search for "google ku*".
175
+ // Note that it's important that we use the `lunr.tokenizer()` function when
176
+ // using the `index.query()` function because, for starters, it will
177
+ // normalize the input.
178
+ // The `index.search()` function is the higher abstraction of basically
179
+ // doing this:
180
+ // (pseudo code)
181
+ //
182
+ // Index.prototype.search = function(input) {
183
+ // lunr.tokenize(input).forEach(token => {
184
+ // Index.query(callback => {
185
+ // callback(token)
186
+ // })
187
+ // })
188
+ // }
189
+ //
190
+ // If we didn't use the tokenized form, we'd get different results
191
+ // for searching for "SSH agent" and "ssh AgenT" for example.
192
+ titleQueryTokenized . forEach ( ( { str } , i ) => {
193
+ const isLastToken = i === titleQueryTokenized . length - 1
194
+ const isShort = str . length <= 3
195
+ q . term ( str , {
196
+ fields : [ 'title' ] ,
197
+ wildcard :
198
+ isLastToken && isShort ? lunr . Query . wildcard . TRAILING : lunr . Query . wildcard . NONE ,
124
199
} )
125
- }
200
+ } )
126
201
} )
127
202
. slice ( 0 , PRE_LIMIT )
128
203
. map ( ( result ) => {
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
170
245
const _unique = new Set ( )
171
246
const combinedMatchData = { }
172
247
const results = [ ]
173
- for ( const matches of [ titleResults , allResults ] ) {
248
+ for ( const matches of [ andTitleResults , titleResults , allResults ] ) {
174
249
for ( const match of matches ) {
175
250
const { result } = match
176
251
// We need to loop over all results (both from title searches and
0 commit comments