
Commit fad5576

Merge pull request #12065 from github/repo-sync
repo sync
2 parents b0831ed + 5760b85 commit fad5576

File tree: 3 files changed (+101, −22 lines changed)


lib/failbot.js

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ async function retryingGot(url, args) {
   )
 }
 
-export function report(error, metadata) {
+export async function report(error, metadata) {
   // If there's no HAYSTACK_URL set, bail early
   if (!process.env.HAYSTACK_URL) return
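With `report` now declared `async`, it always returns a Promise, so callers can await it; when `HAYSTACK_URL` is unset it simply resolves to `undefined`. A minimal caller sketch under that assumption (the import path and the `handleError` wrapper are illustrative, not code from this repository):

// Sketch only: `report` is a named export per the diff above.
import { report } from './lib/failbot.js'

async function handleError(error) {
  // Resolves to undefined right away when HAYSTACK_URL is not set,
  // otherwise resolves once the report request has been issued.
  return await report(error, { path: '/example' })
}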

lib/search/lunr-search.js

Lines changed: 93 additions & 18 deletions
@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
   // want to make sure this number accounts for that.
   const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
 
+  // Multiplication bonus given to matches that were made on the
+  // search where ALL tokens are required.
+  // E.g. you search for 'foo bar' and we have three records:
+  //
+  //   A) "This foo is very special"
+  //   B) "With bar and foo you can't go wrong"
+  //   C) "Only bar can save you"
+  //
+  // What will happen is that only record (B) is found when it's
+  // required to match both 'foo' *and* 'bar'. So you get these scores:
+  //
+  //   A) score = result.score + popularity
+  //   B) score = MATCH_PHRASE * (result.score + popularity)
+  //   C) score = result.score + popularity
+  //
+  // So it's a very powerful multiplier. But that's fine because a
+  // "phrase match" is a very accurate thing.
+  const MATCH_PHRASE = 5
+
   // Imagine that we have 1,000 documents. 100 of them contain the word
   // 'foobar'. Of those 100, we want to display the top 10 "best".
   // But if we only do `lunrindex.search('foobar').slice(0, 10)` we
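To make the comment above concrete, here is a small worked sketch of the scoring arithmetic it describes; the raw scores, popularity values, and `POPULARITY_FACTOR` value are invented for illustration and are not taken from the repository:

const MATCH_PHRASE = 5
const POPULARITY_FACTOR = 1 // illustrative value only

// Invented raw lunr scores and popularity values for the three records above.
const sample = {
  A: { score: 1.4, popularity: 0.2 }, // matches only 'foo'
  B: { score: 1.1, popularity: 0.1 }, // matches 'foo' AND 'bar'
  C: { score: 1.2, popularity: 0.3 }, // matches only 'bar'
}

const plain = ({ score, popularity }) => score + POPULARITY_FACTOR * popularity
console.log(plain(sample.A)) // 1.6
console.log(MATCH_PHRASE * plain(sample.B)) // 6.0, the record matching every token wins
console.log(plain(sample.C)) // 1.5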
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
   // records that we finally return.
   const PRE_LIMIT = 500
 
-  let titleQuery = query.trim()
-  if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
-    // When the search input is really short, force it to search with
-    // the "forward wild card". I.e. you typed `go` we turn it into a
-    // search for `go*` which means it can find things like `Google`.
-    titleQuery += '*'
-  }
+  const titleQuery = query.trim()
 
   let highestTitleScore = 0.0
+
+  const andTitleResults = []
+
+  // This will turn something like 'foo and bar' into:
+  // [
+  //   { str: 'foo', metadata: { position: [Array], index: 0 } },
+  //   { str: 'bar', metadata: { position: [Array], index: 1 } }
+  // ]
+  // Note how the stopword gets omitted.
+  // It's important to omit the stopwords because even if the record
+  // actually contains the stopword, it won't match then.
+  // E.g. you have a record called "Foo And Bar" and you search for
+  // {foo AND and AND bar} it will actually not find anything.
+  // But if you change it to {foo AND bar} it will match "Foo And Bar".
+  // Same goes if any other stopwords were used like "Foo the Bar with for a".
+  // That also needs to become an AND-search of {foo AND bar} ...only.
+  const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
+
+  if (titleQueryTokenized.length > 1) {
+    andTitleResults.push(
+      ...index
+        .query((q) => {
+          for (const { str } of titleQueryTokenized) {
+            q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
+          }
+        })
+        .slice(0, PRE_LIMIT)
+        .map((result) => {
+          const { popularity } = records[result.ref]
+          if (result.score > highestTitleScore) {
+            highestTitleScore = result.score
+          }
+          const score = result.score / highestTitleScore
+          return {
+            result,
+            _score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
+          }
+        })
+    )
+  }
 
   const titleResults = index
     .query((q) => {
-      if (/['"]/.test(titleQuery)) {
-        // If the query contains a quotation marks, you can't easily
-        // enough break it up into individual words.
-        q.term(titleQuery, { fields: ['title'] })
-      } else {
-        // This is the structured way of doing turning 'foo bar'
-        // into `title:foo title:bar'.
-        titleQuery.split(/ /g).forEach((part) => {
-          q.term(part, { fields: ['title'] })
+      // The objective is to create an OR-query specifically for the 'title'
+      // because *we* value matches on that much higher than any other
+      // field in our records.
+      // But we want to make sure that the last word is always treated
+      // like a forward-tokenized token. I.e. you typed "google ku"
+      // becomes a search for "google ku*".
+      // Note that it's important to use the `lunr.tokenizer()` function when
+      // using the `index.query()` function because, for starters, it will
+      // normalize the input.
+      // `index.search()`, by contrast, is the higher abstraction that
+      // basically does this:
+      // (pseudo code)
+      //
+      //    Index.prototype.search = function(input) {
+      //      lunr.tokenize(input).forEach(token => {
+      //        Index.query(callback => {
+      //          callback(token)
+      //        })
+      //      })
+      //    }
+      //
+      // If we didn't use the tokenized form, we'd get different results
+      // for searching for "SSH agent" and "ssh AgenT" for example.
+      titleQueryTokenized.forEach(({ str }, i) => {
+        const isLastToken = i === titleQueryTokenized.length - 1
+        const isShort = str.length <= 3
+        q.term(str, {
+          fields: ['title'],
+          wildcard:
+            isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
         })
-      }
+      })
     })
     .slice(0, PRE_LIMIT)
    .map((result) => {
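The tokenizer and the two query shapes above can be exercised in isolation. A self-contained sketch against a throwaway index (the sample documents are invented for illustration; this is not the repository's index or records):

// Sketch: build a tiny lunr index and run the AND-query and the
// trailing-wildcard OR-query that the diff above constructs for titles.
import lunr from 'lunr'

const docs = [
  { id: 'A', title: 'This foo is very special' },
  { id: 'B', title: 'With bar and foo you can not go wrong' },
  { id: 'C', title: 'Only bar can save you' },
]

const index = lunr(function () {
  this.ref('id')
  this.field('title')
  docs.forEach((doc) => this.add(doc))
})

// 'and' is a lunr stopword, so it is dropped before querying.
const tokens = lunr.tokenizer('foo and bar').filter(lunr.stopWordFilter)

// AND-query: every remaining token is REQUIRED, so only record B
// (which contains both 'foo' and 'bar' in its title) matches.
const andResults = index.query((q) => {
  for (const { str } of tokens) {
    q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
  }
})
console.log(andResults.map((r) => r.ref)) // => ['B']

// OR-query with a trailing wildcard on a short last token, so a partial
// word like 'ba' still matches 'bar' while the user is typing.
const orResults = index.query((q) => {
  q.term('foo', { fields: ['title'] })
  q.term('ba', { fields: ['title'], wildcard: lunr.Query.wildcard.TRAILING })
})
console.log(orResults.map((r) => r.ref)) // => all three records, best scores first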
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
   const _unique = new Set()
   const combinedMatchData = {}
   const results = []
-  for (const matches of [titleResults, allResults]) {
+  for (const matches of [andTitleResults, titleResults, allResults]) {
     for (const match of matches) {
       const { result } = match
       // We need to loop over all results (both from title searches and
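The loop above now folds three result lists into one, with the AND-title results considered first. A hedged sketch of that merge shape, using illustrative names since the rest of the loop is not shown in this hunk (the real code also combines match data, which is omitted here):

// Sketch: earlier lists take precedence, and a Set keyed on the lunr
// result ref keeps the same record from being emitted twice.
function mergeResultLists(...lists) {
  const seen = new Set()
  const merged = []
  for (const matches of lists) {
    for (const match of matches) {
      const { result } = match
      if (seen.has(result.ref)) continue
      seen.add(result.ref)
      merged.push(match)
    }
  }
  return merged
}

// e.g. mergeResultLists(andTitleResults, titleResults, allResults)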

tests/unit/failbot.js

Lines changed: 7 additions & 3 deletions
@@ -5,6 +5,12 @@ describe('FailBot', () => {
   const requestBodiesSent = []
 
   beforeEach(() => {
+    delete process.env.HAYSTACK_URL
+
+    // Always reset the array to an empty one between tests
+    // so it doesn't interfere across tests.
+    requestBodiesSent.length = 0
+
     nock('https://haystack.example.com')
       .post('/')
       .reply(200, (uri, requestBody) => {
@@ -15,15 +21,13 @@ describe('FailBot', () => {
 
   afterEach(() => {
     delete process.env.HAYSTACK_URL
-    // Reset the array to an empty one between tests
-    // so it doesn't intefere across tests.
-    requestBodiesSent.length = 0
   })
 
   describe('.report', () => {
     it('returns early if `HAYSTACK_URL` is not set', async () => {
       const result = await FailBot.report()
       expect(result).toBeUndefined()
+      expect(requestBodiesSent.length).toBe(0)
     })
 
     it('sends the expected report', async () => {
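Resetting shared state in `beforeEach` rather than only in `afterEach` guards against leakage from a previous test file, or from a test that failed before its cleanup ran, and it is what makes the new `requestBodiesSent.length` assertion trustworthy. A minimal Jest-style sketch of the pattern; the import path and namespace-import style are assumptions, since the test file's header is not shown in this diff:

// Sketch only: assumed import path/style for the module changed above.
import * as FailBot from '../../lib/failbot.js'

const requestBodiesSent = []

beforeEach(() => {
  delete process.env.HAYSTACK_URL
  requestBodiesSent.length = 0 // empty the same array instance between tests
})

test('does nothing when HAYSTACK_URL is unset', async () => {
  const result = await FailBot.report(new Error('boom'))
  expect(result).toBeUndefined()
  expect(requestBodiesSent.length).toBe(0)
})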

0 commit comments
