use cheerio-to-text (github#31479)

gouwestadmm · Oct 10, 2022 · 994c09d · 994c09d
1 parent cc35486
commit 994c09d
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 349 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -20,6 +20,7 @@
     "ajv-formats": "^2.1.1",
     "bottleneck": "2.19.5",
     "cheerio": "^1.0.0-rc.11",
+    "cheerio-to-text": "0.1.0",
     "classnames": "^2.3.1",
     "connect-datadog": "0.0.9",
     "cookie-parser": "^1.4.6",

diff --git a/script/search/find-indexable-pages.js b/script/search/find-indexable-pages.js
@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import { loadPages } from '../../lib/page-data.js'
 
-export default async function findIndexablePages() {
+export default async function findIndexablePages(match = '') {
   const allPages = await loadPages()
   const indexablePages = allPages
     // exclude hidden pages
@@ -10,6 +10,7 @@ export default async function findIndexablePages() {
     .filter((page) => !page.parentProduct || !page.parentProduct.wip || page.parentProduct.hidden)
     // exclude absolute home page (e.g. /en or /ja)
     .filter((page) => page.relativePath !== 'index.md')
+    .filter((page) => !match || page.relativePath.includes(match))
 
   console.log('total pages', allPages.length)
   console.log('indexable pages', indexablePages.length)

diff --git a/script/search/parse-page-sections-into-records.js b/script/search/parse-page-sections-into-records.js
@@ -1,4 +1,6 @@
 #!/usr/bin/env node
+import { render } from 'cheerio-to-text'
+
 import { maxContentLength } from '../../lib/search/config.js'
 
 // This module takes cheerio page object and divides it into sections
@@ -67,7 +69,7 @@ export default function parsePageSectionsIntoRecords(page) {
   // pages that yields some decent content to be searched on, because
   // when you view these pages in a browser, there's clearly text there.
   if ($root.length > 0) {
-    body = getAllText($root)
+    body = render($root)
   }
 
   if (!body && !intro) {
@@ -95,43 +97,3 @@ export default function parsePageSectionsIntoRecords(page) {
     topics,
   }
 }
-
-function getAllText($root) {
-  const inlineElements = new Set(
-    `a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data,
-    datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark,
-    meter,noscript,object,output,picture,progress,q,ruby,s,samp,script,
-    select,slot,small,span,strong,sub,sup,svg,template,textarea,time,
-    tt,u,var,video,wbr`
-      .split(',')
-      .map((s) => s.trim())
-  )
-
-  const walkTree = (node, callback, index = 0, level = 0) => {
-    callback(node, index, level)
-    for (let i = 0; i < (node.children || []).length; i++) {
-      walkTree(node.children[i], callback, i, ++level)
-      level--
-    }
-  }
-
-  const fragments = []
-
-  walkTree($root[0], (element) => {
-    if (element.name === 'body') return
-
-    if (element.type === 'text') {
-      const parentElement = element.parent || {}
-      const previousElement = element.prev || {}
-      let { data } = element
-      if (data.trim()) {
-        if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) {
-          data = `\n${data}`
-        }
-        fragments.push(data)
-      }
-    }
-  })
-
-  return fragments.join('').trim()
-}
diff --git a/script/search/sync-search-indices.js b/script/search/sync-search-indices.js
@@ -50,6 +50,7 @@ program
   // any Lunr indexes.
   .option('--no-lunr-index', `Do not generate a Lunr index, just the records file (default false)`)
   .option('--no-markers', 'Do not print a marker for each parsed document')
+  .option('--filter <MATCH>', 'Filter to only do pages that match this string')
   .parse(process.argv)
 
 main(program.opts())
@@ -131,6 +132,7 @@ async function main(opts) {
 
   const config = {
     noMarkers: !opts.markers,
+    filter: opts.filter,
   }
 
   const options = {

diff --git a/script/search/sync.js b/script/search/sync.js
@@ -40,7 +40,7 @@ export default async function syncSearchIndexes({
   )
 
   // Exclude WIP pages, hidden pages, index pages, etc
-  const indexablePages = await findIndexablePages()
+  const indexablePages = await findIndexablePages(config.filter)
   const redirects = {}
   indexablePages.forEach((page) => {
     const href = page.relativePath.replace('index.md', '').replace('.md', '')

diff --git a/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html b/tests/unit/search/fixtures/page-with-heading-and-paragraph-no-whitespace.html
@@ -20,4 +20,6 @@ <h1>Heading</h1>
     <li><div><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
    </ul></ul></div>
 
+  <p>Para<strong>gra</strong><em>ph</em>.</p>
+
 </div>
diff --git a/tests/unit/search/parse-page-sections-into-records.js b/tests/unit/search/parse-page-sections-into-records.js
@@ -119,5 +119,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
     // But note that it would also concatenate the text of the heading
     // with the text of the paragraph without a whitespace in between.
     expect(record.content.includes('email addressYou can set')).toBeFalsy()
+    // Make sure that inline elements are still together.
+    expect(record.content).toMatch(/Paragraph\./)
   })
 })