Pre-computed pageinfos (#40414)
Co-authored-by: Robert Sese <734194+rsese@users.noreply.github.com>
peterbe and rsese authored Jan 5, 2024
1 parent deb114a commit 173abd1
Showing 10 changed files with 247 additions and 49 deletions.
46 changes: 46 additions & 0 deletions .github/actions/precompute-pageinfo/action.yml
@@ -0,0 +1,46 @@
name: Warmup pageinfo cache

description: Run this to create a .pageinfo-cache.json.br file

inputs:
restore-only:
description: Only attempt to restore, don't warm up
required: false

runs:
using: 'composite'
steps:
# The caching technique here is to "unboundedly" add to the cache.
# By "unboundedly", we mean the cache just keeps growing: every save adds a new entry.
# The general idea is that we A) restore from cache, B) replace the
# file by running the script, and C) save the file back to cache.
# Optionally, you can have it just do A (and not B and C).

- name: Cache .pageinfo-cache.json.br (restore)
# You can't use a SHA on these. That's only possible with `actions/cache@SHA...`
uses: actions/cache/restore@v3
with:
path: .pageinfo-cache.json.br
key: pageinfo-cache-
restore-keys: pageinfo-cache-

# When we use this composite action from workflows like
# Azure Preview Deploy and Azure Production Deploy, we don't have
# Node installed or any of its packages, i.e. we never
# run `npm ci` in those workflows, for security's sake.
# So we can't do things that require running Node code.
# Tests and other workflows omit the `restore-only` input, but
# workflows that prep for a Docker build and push set it to a
# non-empty string, which basically means "If you can restore it,
# great. If not, that's fine, don't bother."
- name: Run script
if: ${{ inputs.restore-only == '' }}
shell: bash
run: npm run precompute-pageinfo

- name: Cache .pageinfo-cache.json.br (save)
if: ${{ inputs.restore-only == '' }}
uses: actions/cache/save@v3
with:
path: .pageinfo-cache.json.br
key: pageinfo-cache-${{ github.sha }}
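
Concretely, every run saves a fresh `pageinfo-cache-<sha>` entry, and the restore step's `pageinfo-cache-` prefix in `restore-keys` is resolved by actions/cache to the most recently created matching entry. A toy sketch of that prefix resolution (an illustration only, not the actions/cache implementation):

// Illustration only: given saved keys ordered newest-first, a restore-key
// prefix resolves to the most recently created cache entry that matches.
function resolveRestoreKey(savedKeysNewestFirst, prefix) {
  return savedKeysNewestFirst.find((key) => key.startsWith(prefix)) ?? null
}

// resolveRestoreKey(['pageinfo-cache-bbb', 'pageinfo-cache-aaa'], 'pageinfo-cache-')
// → 'pageinfo-cache-bbb' (the newest save wins)
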
4 changes: 4 additions & 0 deletions .github/workflows/azure-preview-env-deploy.yml
@@ -198,6 +198,10 @@ jobs:
with:
restore-only: true

- uses: ./.github/actions/precompute-pageinfo
with:
restore-only: true

# In addition to making the final image smaller, we also save time by not sending unnecessary files to the docker build context
- name: 'Prune for preview env'
run: src/workflows/prune-for-preview-env.sh
4 changes: 4 additions & 0 deletions .github/workflows/azure-prod-build-deploy.yml
@@ -78,6 +78,10 @@ jobs:
with:
restore-only: true

- uses: ./.github/actions/precompute-pageinfo
with:
restore-only: true

- uses: ./.github/actions/clone-translations
with:
token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
20 changes: 16 additions & 4 deletions .github/workflows/keep-caches-warm.yml
@@ -1,9 +1,17 @@
name: Keep caches warm

# **What it does**: Makes sure the caching of ./node_modules and ./.next
# is kept warm for making pull requests more rapid.
# **Why we have it**: A PR workflow that depends on caching can't reuse a
# cached artifact across PRs unless it also runs on `main`.
# **What it does**:
# Makes sure the caching of ./node_modules and ./.next is kept warm
# for making other pull requests faster.
# We also use this workflow to precompute other things, so that the
# actions cache is warmed up with data available during deployment
# actions. When you use actions/cache within a run on `main`,
# what gets saved can be used by other pull requests. It also
# means that when we make preview or production deployments,
# we can rely on the cache already being warmed up.
# **Why we have it**:
# A PR workflow that depends on caching can't reuse a
# cached artifact across PRs unless it also runs on `main`.
# **Who does it impact**: Docs engineering, open-source engineering contributors.

on:
@@ -31,6 +39,10 @@ jobs:
run: npm run build

- uses: ./.github/actions/warmup-remotejson-cache
if: github.repository == 'github/docs-internal'

- uses: ./.github/actions/precompute-pageinfo
if: github.repository == 'github/docs-internal'

- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
6 changes: 6 additions & 0 deletions .github/workflows/test.yml
@@ -170,6 +170,12 @@ jobs:
# archived enterprise server URLs.
if: ${{ matrix.name == 'redirects' }}

- uses: ./.github/actions/precompute-pageinfo
# Only the 'pageinfo' tests include end-to-end tests about this.
if: ${{ matrix.name == 'pageinfo' }}
env:
ROOT: src/fixtures/fixtures

- name: Index fixtures into the local Elasticsearch
# For the sake of saving time, only run this step if the group
# is one that will run tests against an Elasticsearch on localhost.
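
The `ROOT: src/fixtures/fixtures` override above points the page-loading code at the fixture content tree, so the precomputed cache matches the fixture pages that the 'pageinfo' end-to-end tests request. A sketch of that pattern (names assumed for illustration, not the repo's exact code):

// Assumed sketch of a ROOT override: loaders resolve the content directory
// relative to process.env.ROOT when it is set, falling back to the repo root.
import path from 'path'

const ROOT = process.env.ROOT || '.'
const contentDir = path.join(ROOT, 'content')
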
3 changes: 3 additions & 0 deletions .gitignore
@@ -48,3 +48,6 @@ assets/images/help/writing/unordered-list-rendered (1).png

# Used by getRemoteJSON()
.remotejson-cache/

# Used by precompute-pageinfo
.pageinfo-cache.json.br
3 changes: 3 additions & 0 deletions Dockerfile
@@ -47,6 +47,8 @@ FROM all_deps as builder
COPY src ./src
# The star is because it's an optional directory
COPY .remotejson-cache* ./.remotejson-cache
# The star is because it's an optional file
COPY .pageinfo-cache.json.br* ./.pageinfo-cache.json.br
# Certain content is necessary for being able to build
COPY content/index.md ./content/index.md
COPY content/rest ./content/rest
@@ -88,6 +90,7 @@ COPY --chown=node:node assets ./assets
COPY --chown=node:node content ./content
COPY --chown=node:node src ./src
COPY --chown=node:node .remotejson-cache* ./.remotejson-cache
COPY --chown=node:node .pageinfo-cache.json.br* ./.pageinfo-cache.json.br
COPY --chown=node:node data ./data
COPY --chown=node:node next.config.js ./

1 change: 1 addition & 0 deletions package.json
@@ -39,6 +39,7 @@
"playwright-test": "playwright test --config src/fixtures/playwright.config.ts --project=\"Google Chrome\"",
"post-lints": "node src/content-linter/scripts/post-lints.js",
"postinstall": "cp package-lock.json .installed.package-lock.json && echo \"Updated .installed.package-lock.json\" # see husky/post-checkout and husky/post-merge",
"precompute-pageinfo": "node src/pageinfo/scripts/precompute-pageinfo.js",
"prepare": "husky install src/workflows/husky",
"prettier": "prettier -w \"**/*.{ts,tsx,js,mjs,scss,yml,yaml}\"",
"prettier-check": "prettier -c \"**/*.{ts,tsx,js,mjs,scss,yml,yaml}\"",
139 changes: 94 additions & 45 deletions src/pageinfo/middleware.js
@@ -13,9 +13,18 @@ import contextualize from '#src/frame/middleware/context/context.js'
import features from '#src/versions/middleware/features.js'
import getRedirect from '#src/redirects/lib/get-redirect.js'
import { isArchivedVersionByPath } from '#src/archives/lib/is-archived-version.js'
import { readCompressedJsonFile } from '#src/frame/lib/read-json-file.js'

const router = express.Router()

// If you have pre-computed page info into a JSON file on disk, this is
// where it's expected to be found.
// Note that if the file does not exist, it's ignored and
// every pageinfo is computed on every request.
// Note! The only reason this variable is exported is so that
// it can be imported by src/pageinfo/scripts/precompute-pageinfo.js
export const CACHE_FILE_PATH = '.pageinfo-cache.json.br'

const validationMiddleware = (req, res, next) => {
const { pathname } = req.query
if (!pathname) {
@@ -83,6 +92,90 @@ const pageinfoMiddleware = (req, res, next) => {
return next()
}

export async function getPageInfo(page, pathname) {
const renderingReq = {
path: pathname,
language: page.languageCode,
pagePath: pathname,
cookies: {},
}
const next = () => {}
const res = {}
await contextualize(renderingReq, res, next)
await shortVersions(renderingReq, res, next)
renderingReq.context.page = page
await features(renderingReq, res, next)
const context = renderingReq.context

const title = await page.renderProp('title', context, { textOnly: true })
const intro = await page.renderProp('intro', context, { textOnly: true })

let productPage = null
for (const permalink of page.permalinks) {
const rootHref = permalink.href
.split('/')
.slice(0, permalink.pageVersion === 'free-pro-team@latest' ? 3 : 4)
.join('/')
const rootPage = context.pages[rootHref]
if (rootPage) {
productPage = rootPage
break
}
}
const product = productPage ? await getProductPageInfo(productPage, context) : ''

return { title, intro, product }
}

const _productPageCache = {}
// The product's title is well worth caching because it's often repeated:
// it's determined only by the product page, the current version, and the
// current language, so many pages share the same product title.
async function getProductPageInfo(page, context) {
const cacheKey = `${page.relativePath}:${context.currentVersion}:${context.currentLanguage}`
if (!(cacheKey in _productPageCache)) {
const title =
(await page.renderProp('shortTitle', context, {
textOnly: true,
})) ||
(await page.renderProp('title', context, {
textOnly: true,
}))
_productPageCache[cacheKey] = title
}
return _productPageCache[cacheKey]
}

let _cache = null
async function getPageInfoFromCache(page, pathname) {
if (_cache === null) {
try {
_cache = readCompressedJsonFile(CACHE_FILE_PATH)
} catch (error) {
if (error.code !== 'ENOENT') {
throw error
}
_cache = {}
}
}

let info = _cache[pathname]
if (!info) {
info = await getPageInfo(page, pathname)
// You might wonder: why don't we store this computed information
// in the `_cache` from here?
// The short answer is: it won't be used again.
// In production, which is the only place where performance matters,
// an HTTP GET request will only happen once per deployment. That's
// because the CDN will cache it until the next deployment (which is
// followed by a CDN purge).
// In development (local preview), performance doesn't really matter.
// In CI, we do use the cache, because CI runs
// `npm run precompute-pageinfo` right before it runs the jest tests.
}
return info
}

router.get(
'/v1',
validationMiddleware,
@@ -113,51 +206,7 @@
throw new Error(`pathname '${pathname}' not one of the page's permalinks`)
}

const renderingReq = {
path: pathname,
language: page.languageCode,
pagePath: pathname,
cookies: {},
}
const next = () => {}
await contextualize(renderingReq, res, next)
await shortVersions(renderingReq, res, next)
renderingReq.context.page = page
await features(renderingReq, res, next)
const context = renderingReq.context

const title = await page.renderProp('title', context, { textOnly: true })
const intro = await page.renderProp('intro', context, { textOnly: true })

let productPage = null
for (const permalink of page.permalinks) {
const rootHref = permalink.href
.split('/')
.slice(0, permalink.pageVersion === 'free-pro-team@latest' ? 3 : 4)
.join('/')
const rootPage = context.pages[rootHref]
if (rootPage) {
productPage = rootPage
break
}
}
let product = ''
if (productPage) {
product = await productPage.renderProp('shortTitle', context, {
textOnly: true,
})
if (!product) {
product = await productPage.renderProp('title', context, {
textOnly: true,
})
}
}

const info = {
product,
title,
intro,
}
const info = await getPageInfoFromCache(page, pathname)

const tags = [
// According to https://docs.datadoghq.com/getting_started/tagging/#define-tags
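
The middleware reads the cache with `readCompressedJsonFile` from `#src/frame/lib/read-json-file.js`, which this diff doesn't show. A minimal sketch of what a brotli-aware reader could look like (an assumption for illustration; the real helper may differ):

import fs from 'fs'
import { brotliDecompressSync } from 'zlib'

// Assumed sketch: read a brotli-compressed JSON file and parse it. If the
// file is missing, fs.readFileSync throws ENOENT, which getPageInfoFromCache
// catches in order to fall back to computing pageinfo on the fly.
function readCompressedJsonFile(filePath) {
  const compressed = fs.readFileSync(filePath)
  return JSON.parse(brotliDecompressSync(compressed).toString('utf-8'))
}
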
70 changes: 70 additions & 0 deletions src/pageinfo/scripts/precompute-pageinfo.js
@@ -0,0 +1,70 @@
#!/usr/bin/env node

/**
* This script gathers all English pages and computes each page's
* 'title', 'intro' and 'product' properties. These are then stored
* in a brotli-compressed JSON file on disk. The pageinfo middleware
* can load that JSON file to get a cache of pageinfo for all English
* pages.
* Then, when someone requests `/api/pageinfo?pathname=/en/foo/bar`,
* the backend just reads from the precomputed cache file instead of
* doing the computation on every request. Time saved, up front.
*
* Why cache? Each computation is fast (3 Liquid + Markdown renders),
* but it still adds up. And it's safe and cheap to precompute in advance.
*
* Why only English? To keep the file from getting too large.
* Given how well this data compresses, we might consider doing
* all languages in the future.
*
* Why brotli? Because the file gets included in the Docker container,
* where every byte counts.
*
* When is this script run? On every push to `main`, the result is
* computed and stored with actions/cache. That means it's not run
* during deployment. (A deploy only *downloads* from actions/cache.)
*/

import fs from 'fs'
import { brotliCompressSync } from 'zlib'

import { loadPages, loadUnversionedTree } from '#src/frame/lib/page-data.js'
import { CACHE_FILE_PATH, getPageInfo } from '../middleware.js'

const CI = Boolean(JSON.parse(process.env.CI || 'false'))

main()

async function main() {
const unversionedTree = await loadUnversionedTree(['en'])
const pageList = await loadPages(unversionedTree, ['en'])

let label = `Compute pageinfos for ${pageList.length.toLocaleString()} pages`
console.time(label)
const pageinfos = {}
for (const page of pageList) {
const pathname = page.permalinks[0].href
try {
const computed = await getPageInfo(page, pathname)
if (computed) {
pageinfos[pathname] = computed
}
} catch (error) {
console.error(`Error computing pageinfo for ${page.fullPath} (${pathname})`)
throw error
}
}
console.timeEnd(label)

label = `Serialize, compress, and write to ${CACHE_FILE_PATH}`
console.time(label)
const payload = CI ? JSON.stringify(pageinfos) : JSON.stringify(pageinfos, null, 2)
const payloadBuffer = Buffer.from(payload, 'utf-8')
const payloadCompressed = brotliCompressSync(payloadBuffer)
fs.writeFileSync(CACHE_FILE_PATH, payloadCompressed)
console.timeEnd(label)
console.log(
`Wrote ${Object.keys(pageinfos).length.toLocaleString()} pageinfos to ${CACHE_FILE_PATH}`,
)
}
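
For reference, the payload the script writes is a flat object keyed by pathname, where each value has the `{ title, intro, product }` shape returned by `getPageInfo`. A hypothetical entry (path and values invented for illustration) from the decompressed cache:

// Hypothetical example of the decompressed .pageinfo-cache.json.br contents:
const pageinfos = {
  '/en/get-started/quickstart': {
    title: 'Quickstart',
    intro: '...',
    product: 'Get started',
  },
}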
