Skip to content

Commit

Permalink
use cheerio-to-text (github#31479)
Browse files Browse the repository at this point in the history
  • Loading branch information
peterbe authored Oct 10, 2022
1 parent cc35486 commit 994c09d
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 349 deletions.
333 changes: 27 additions & 306 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"ajv-formats": "^2.1.1",
"bottleneck": "2.19.5",
"cheerio": "^1.0.0-rc.11",
"cheerio-to-text": "0.1.0",
"classnames": "^2.3.1",
"connect-datadog": "0.0.9",
"cookie-parser": "^1.4.6",
Expand Down
3 changes: 2 additions & 1 deletion script/search/find-indexable-pages.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env node
import { loadPages } from '../../lib/page-data.js'

export default async function findIndexablePages() {
export default async function findIndexablePages(match = '') {
const allPages = await loadPages()
const indexablePages = allPages
// exclude hidden pages
Expand All @@ -10,6 +10,7 @@ export default async function findIndexablePages() {
.filter((page) => !page.parentProduct || !page.parentProduct.wip || page.parentProduct.hidden)
// exclude absolute home page (e.g. /en or /ja)
.filter((page) => page.relativePath !== 'index.md')
.filter((page) => !match || page.relativePath.includes(match))

console.log('total pages', allPages.length)
console.log('indexable pages', indexablePages.length)
Expand Down
44 changes: 3 additions & 41 deletions script/search/parse-page-sections-into-records.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env node
import { render } from 'cheerio-to-text'

import { maxContentLength } from '../../lib/search/config.js'

// This module takes cheerio page object and divides it into sections
Expand Down Expand Up @@ -67,7 +69,7 @@ export default function parsePageSectionsIntoRecords(page) {
// pages that yields some decent content to be searched on, because
// when you view these pages in a browser, there's clearly text there.
if ($root.length > 0) {
body = getAllText($root)
body = render($root)
}

if (!body && !intro) {
Expand Down Expand Up @@ -95,43 +97,3 @@ export default function parsePageSectionsIntoRecords(page) {
topics,
}
}

function getAllText($root) {
const inlineElements = new Set(
`a,abbr,acronym,audio,b,bdi,bdo,big,br,button,canvas,cite,code,data,
datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,label,map,mark,
meter,noscript,object,output,picture,progress,q,ruby,s,samp,script,
select,slot,small,span,strong,sub,sup,svg,template,textarea,time,
tt,u,var,video,wbr`
.split(',')
.map((s) => s.trim())
)

const walkTree = (node, callback, index = 0, level = 0) => {
callback(node, index, level)
for (let i = 0; i < (node.children || []).length; i++) {
walkTree(node.children[i], callback, i, ++level)
level--
}
}

const fragments = []

walkTree($root[0], (element) => {
if (element.name === 'body') return

if (element.type === 'text') {
const parentElement = element.parent || {}
const previousElement = element.prev || {}
let { data } = element
if (data.trim()) {
if (!inlineElements.has(parentElement.name) && !inlineElements.has(previousElement.name)) {
data = `\n${data}`
}
fragments.push(data)
}
}
})

return fragments.join('').trim()
}
2 changes: 2 additions & 0 deletions script/search/sync-search-indices.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ program
// any Lunr indexes.
.option('--no-lunr-index', `Do not generate a Lunr index, just the records file (default false)`)
.option('--no-markers', 'Do not print a marker for each parsed document')
.option('--filter <MATCH>', 'Filter to only do pages that match this string')
.parse(process.argv)

main(program.opts())
Expand Down Expand Up @@ -131,6 +132,7 @@ async function main(opts) {

const config = {
noMarkers: !opts.markers,
filter: opts.filter,
}

const options = {
Expand Down
2 changes: 1 addition & 1 deletion script/search/sync.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ export default async function syncSearchIndexes({
)

// Exclude WIP pages, hidden pages, index pages, etc
const indexablePages = await findIndexablePages()
const indexablePages = await findIndexablePages(config.filter)
const redirects = {}
indexablePages.forEach((page) => {
const href = page.relativePath.replace('index.md', '').replace('.md', '')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ <h1>Heading</h1>
<li><div><div><a href="/"><h2>Changing your primary email address</h2><p>You can change the email address associated with your personal account at any time.</p></a></div></span></div></li>
</ul></ul></div>

<p>Para<strong>gra</strong><em>ph</em>.</p>

</div>
2 changes: 2 additions & 0 deletions tests/unit/search/parse-page-sections-into-records.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
// But note that it would also concatenate the text of the heading
// with the text of the paragraph without any whitespace in between.
expect(record.content.includes('email addressYou can set')).toBeFalsy()
// Make sure that inline elements are still together.
expect(record.content).toMatch(/Paragraph\./)
})
})

0 comments on commit 994c09d

Please sign in to comment.