Skip to content

Commit

Permalink
make it easier to spot pages with no scrapable content (github#31421)
Browse files (browse the repository at this point in the history)
  • Loading branch information
peterbe authored Oct 4, 2022
1 parent 6468a95 commit 642cb07
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 4 deletions.
8 changes: 5 additions & 3 deletions script/search/build-records.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ export default async function buildRecords(
indexablePages,
pageVersion,
languageCode,
redirects
redirects,
config = {}
) {
const { noMarkers } = config
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
const records = []
const pages = indexablePages
Expand Down Expand Up @@ -61,12 +63,12 @@ export default async function buildRecords(

const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME })
.on('page', (page) => {
process.stdout.write(pageMarker)
if (!noMarkers) process.stdout.write(pageMarker)
const newRecord = parsePageSectionsIntoRecords(page)
const pathArticle = page.relativePath.replace('/index.md', '').replace('.md', '')
const popularity = (hasPopularPages && popularPages[pathArticle]) || 0.0
newRecord.popularity = popularity
process.stdout.write(recordMarker)
if (!noMarkers) process.stdout.write(recordMarker)
records.push(newRecord)
})
.on('error', (err) => {
Expand Down
5 changes: 5 additions & 0 deletions script/search/parse-page-sections-into-records.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ export default function parsePageSectionsIntoRecords(page) {

const rootSelector = '[data-search=article-body]'
const $root = $(rootSelector)
if ($root.length === 0) {
console.warn(`${href} has no '${rootSelector}'`)
} else if ($root.length > 1) {
console.warn(`${href} has more than one '${rootSelector}' (${$root.length})`)
}

const $sections = $('h2', $root)
.filter('[id]')
Expand Down
6 changes: 6 additions & 0 deletions script/search/sync-search-indices.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ program
// and change where it's used to be that the default is to not generate
// any Lunr indexes.
.option('--no-lunr-index', `Do not generate a Lunr index, just the records file (default false)`)
.option('--no-markers', 'Do not print a marker for each parsed document')
.parse(process.argv)

main(program.opts())
Expand Down Expand Up @@ -128,6 +129,10 @@ async function main(opts) {

const generateLunrIndex = !!opts.lunrIndex

const config = {
noMarkers: !opts.markers,
}

const options = {
dryRun,
language,
Expand All @@ -136,6 +141,7 @@ async function main(opts) {
outDirectory,
compressFiles,
generateLunrIndex,
config,
}
await searchSync(options)
}
4 changes: 3 additions & 1 deletion script/search/sync.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export default async function syncSearchIndexes({
outDirectory,
compressFiles,
generateLunrIndex,
config = {},
}) {
const t0 = new Date()

Expand Down Expand Up @@ -72,7 +73,8 @@ export default async function syncSearchIndexes({
indexablePages,
pageVersion,
languageCode,
redirects
redirects,
config
)
if (generateLunrIndex) {
const index = new LunrIndex(indexName, records)
Expand Down

0 comments on commit 642cb07

Please sign in to comment.