Pre-computed pageinfos (#40414)
Co-authored-by: Robert Sese <734194+rsese@users.noreply.github.com>
peterbe and rsese authored Jan 5, 2024
1 parent deb114a commit 173abd1
Showing 10 changed files with 247 additions and 49 deletions.
46 changes: 46 additions & 0 deletions .github/actions/precompute-pageinfo/action.yml
@@ -0,0 +1,46 @@
name: Warmup pageinfo cache

description: Run this to create a .pageinfo-cache.json.br file

inputs:
restore-only:
description: Only attempt to restore, don't warm up
required: false

runs:
using: 'composite'
steps:
# The caching technique here is to "unboundedly" add to the cache.
# By "unboundedly", we mean the cache just keeps growing: every save adds a new entry.
# The general idea is that we A) restore from cache, B) replace the
# file by running the script, and C) save the file back to cache.
# Optionally, you can have it just do A (and not B and C).

- name: Cache .pageinfo-cache.json.br (restore)
# You can't use a SHA on these. That's only possible with `actions/cache@SHA...`
uses: actions/cache/restore@v3
with:
path: .pageinfo-cache.json.br
key: pageinfo-cache-
restore-keys: pageinfo-cache-

# When we use this composite action from workflows like
# Azure Preview Deploy and Azure Production Deploy, we don't have
# Node installed or any of its packages, i.e. we never
# run `npm ci` in those workflows, for security's sake.
# So we can't do things that require running Node code.
# Tests and other workflows omit the `restore-only` input, but
# workflows that prep for a Docker build and push set it to a
# non-empty string, which basically means "If you can restore it,
# great. If not, that's fine, don't bother."
- name: Run script
if: ${{ inputs.restore-only == '' }}
shell: bash
run: npm run precompute-pageinfo

- name: Cache .pageinfo-cache.json.br (save)
if: ${{ inputs.restore-only == '' }}
uses: actions/cache/save@v3
with:
path: .pageinfo-cache.json.br
key: pageinfo-cache-${{ github.sha }}
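
Concretely, every run saves a fresh `pageinfo-cache-<sha>` entry, and the restore step's `pageinfo-cache-` prefix in `restore-keys` is resolved by actions/cache to the most recently created matching entry. A toy sketch of that prefix resolution (an illustration only, not the actions/cache implementation):

// Illustration only: given saved keys ordered newest-first, a restore-key
// prefix resolves to the most recently created cache entry that matches.
function resolveRestoreKey(savedKeysNewestFirst, prefix) {
  return savedKeysNewestFirst.find((key) => key.startsWith(prefix)) ?? null
}

// resolveRestoreKey(['pageinfo-cache-bbb', 'pageinfo-cache-aaa'], 'pageinfo-cache-')
// → 'pageinfo-cache-bbb' (the newest save wins)
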
4 changes: 4 additions & 0 deletions .github/workflows/azure-preview-env-deploy.yml
@@ -198,6 +198,10 @@ jobs:
with:
restore-only: true

- uses: ./.github/actions/precompute-pageinfo
with:
restore-only: true

# In addition to making the final image smaller, we also save time by not sending unnecessary files to the docker build context
- name: 'Prune for preview env'
run: src/workflows/prune-for-preview-env.sh
4 changes: 4 additions & 0 deletions .github/workflows/azure-prod-build-deploy.yml
@@ -78,6 +78,10 @@ jobs:
with:
restore-only: true

- uses: ./.github/actions/precompute-pageinfo
with:
restore-only: true

- uses: ./.github/actions/clone-translations
with:
token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}
20 changes: 16 additions & 4 deletions .github/workflows/keep-caches-warm.yml
@@ -1,9 +1,17 @@
name: Keep caches warm

# **What it does**: Makes sure the caching of ./node_modules and ./.next
# is kept warm for making pull requests more rapid.
# **Why we have it**: A PR workflow that depends on caching can't reuse a
# cached artifact across PRs unless it also runs on `main`.
# **What it does**:
# Makes sure the caching of ./node_modules and ./.next is kept warm
# for making other pull requests faster.
# We also use this workflow to precompute other things, so that the
# actions cache is warmed up with data available during deployment
# actions. When you use actions/cache within a run on `main`,
# what gets saved can be used by other pull requests. It also
# means that when we make preview or production deployments,
# we can rely on the cache already being warmed up.
# **Why we have it**:
# A PR workflow that depends on caching can't reuse a
# cached artifact across PRs unless it also runs on `main`.
# **Who does it impact**: Docs engineering, open-source engineering contributors.

on:
@@ -31,6 +39,10 @@ jobs:
run: npm run build

- uses: ./.github/actions/warmup-remotejson-cache
if: github.repository == 'github/docs-internal'

- uses: ./.github/actions/precompute-pageinfo
if: github.repository == 'github/docs-internal'

- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
6 changes: 6 additions & 0 deletions .github/workflows/test.yml
@@ -170,6 +170,12 @@ jobs:
# archived enterprise server URLs.
if: ${{ matrix.name == 'redirects' }}

- uses: ./.github/actions/precompute-pageinfo
# Only the 'pageinfo' tests include end-to-end tests about this.
if: ${{ matrix.name == 'pageinfo' }}
env:
ROOT: src/fixtures/fixtures

- name: Index fixtures into the local Elasticsearch
# For the sake of saving time, only run this step if the group
# is one that will run tests against an Elasticsearch on localhost.
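
The `ROOT: src/fixtures/fixtures` override above points the page-loading code at the fixture content tree, so the precomputed cache matches the fixture pages that the 'pageinfo' end-to-end tests request. A sketch of that pattern (names assumed for illustration, not the repo's exact code):

// Assumed sketch of a ROOT override: loaders resolve the content directory
// relative to process.env.ROOT when it is set, falling back to the repo root.
import path from 'path'

const ROOT = process.env.ROOT || '.'
const contentDir = path.join(ROOT, 'content')
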
3 changes: 3 additions & 0 deletions .gitignore
@@ -48,3 +48,6 @@ assets/images/help/writing/unordered-list-rendered (1).png

# Used by getRemoteJSON()
.remotejson-cache/

# Used by precompute-pageinfo
.pageinfo-cache.json.br
3 changes: 3 additions & 0 deletions Dockerfile
@@ -47,6 +47,8 @@ FROM all_deps as builder
COPY src ./src
# The star is because it's an optional directory
COPY .remotejson-cache* ./.remotejson-cache
# The star is because it's an optional file
COPY .pageinfo-cache.json.br* ./.pageinfo-cache.json.br
# Certain content is necessary for being able to build
COPY content/index.md ./content/index.md
COPY content/rest ./content/rest
@@ -88,6 +90,7 @@ COPY --chown=node:node assets ./assets
COPY --chown=node:node content ./content
COPY --chown=node:node src ./src
COPY --chown=node:node .remotejson-cache* ./.remotejson-cache
COPY --chown=node:node .pageinfo-cache.json.br* ./.pageinfo-cache.json.br
COPY --chown=node:node data ./data
COPY --chown=node:node next.config.js ./

1 change: 1 addition & 0 deletions package.json
@@ -39,6 +39,7 @@
"playwright-test": "playwright test --config src/fixtures/playwright.config.ts --project=\"Google Chrome\"",
"post-lints": "node src/content-linter/scripts/post-lints.js",
"postinstall": "cp package-lock.json .installed.package-lock.json && echo \"Updated .installed.package-lock.json\" # see husky/post-checkout and husky/post-merge",
"precompute-pageinfo": "node src/pageinfo/scripts/precompute-pageinfo.js",
"prepare": "husky install src/workflows/husky",
"prettier": "prettier -w \"**/*.{ts,tsx,js,mjs,scss,yml,yaml}\"",
"prettier-check": "prettier -c \"**/*.{ts,tsx,js,mjs,scss,yml,yaml}\"",
139 changes: 94 additions & 45 deletions src/pageinfo/middleware.js
@@ -13,9 +13,18 @@ import contextualize from '#src/frame/middleware/context/context.js'
import features from '#src/versions/middleware/features.js'
import getRedirect from '#src/redirects/lib/get-redirect.js'
import { isArchivedVersionByPath } from '#src/archives/lib/is-archived-version.js'
import { readCompressedJsonFile } from '#src/frame/lib/read-json-file.js'

const router = express.Router()

// If you have pre-computed page info into a JSON file on disk, this is
// where it's expected to be found.
// Note that if the file does not exist, it's ignored and
// every pageinfo is computed on every request.
// Note! The only reason this variable is exported is so that
// it can be imported by src/pageinfo/scripts/precompute-pageinfo.js
export const CACHE_FILE_PATH = '.pageinfo-cache.json.br'

const validationMiddleware = (req, res, next) => {
const { pathname } = req.query
if (!pathname) {
@@ -83,6 +92,90 @@ const pageinfoMiddleware = (req, res, next) => {
return next()
}

export async function getPageInfo(page, pathname) {
const renderingReq = {
path: pathname,
language: page.languageCode,
pagePath: pathname,
cookies: {},
}
const next = () => {}
const res = {}
await contextualize(renderingReq, res, next)
await shortVersions(renderingReq, res, next)
renderingReq.context.page = page
await features(renderingReq, res, next)
const context = renderingReq.context

const title = await page.renderProp('title', context, { textOnly: true })
const intro = await page.renderProp('intro', context, { textOnly: true })

let productPage = null
for (const permalink of page.permalinks) {
const rootHref = permalink.href
.split('/')
.slice(0, permalink.pageVersion === 'free-pro-team@latest' ? 3 : 4)
.join('/')
const rootPage = context.pages[rootHref]
if (rootPage) {
productPage = rootPage
break
}
}
const product = productPage ? await getProductPageInfo(productPage, context) : ''

return { title, intro, product }
}

const _productPageCache = {}
// The product's title is well worth caching because it's often repeated:
// it's determined only by the product page, the current version, and the
// current language, so many pages share the same product title.
async function getProductPageInfo(page, context) {
const cacheKey = `${page.relativePath}:${context.currentVersion}:${context.currentLanguage}`
if (!(cacheKey in _productPageCache)) {
const title =
(await page.renderProp('shortTitle', context, {
textOnly: true,
})) ||
(await page.renderProp('title', context, {
textOnly: true,
}))
_productPageCache[cacheKey] = title
}
return _productPageCache[cacheKey]
}

let _cache = null
async function getPageInfoFromCache(page, pathname) {
if (_cache === null) {
try {
_cache = readCompressedJsonFile(CACHE_FILE_PATH)
} catch (error) {
if (error.code !== 'ENOENT') {
throw error
}
_cache = {}
}
}

let info = _cache[pathname]
if (!info) {
info = await getPageInfo(page, pathname)
// You might wonder: why don't we store this computed information
// in the `_cache` from here?
// The short answer is: it won't be used again.
// In production, which is the only place where performance matters,
// an HTTP GET request will only happen once per deployment. That's
// because the CDN will cache it until the next deployment (which is
// followed by a CDN purge).
// In development (local preview), performance doesn't really matter.
// In CI, we do use the cache, because CI runs
// `npm run precompute-pageinfo` right before it runs the jest tests.
}
return info
}

router.get(
'/v1',
validationMiddleware,
@@ -113,51 +206,7 @@
throw new Error(`pathname '${pathname}' not one of the page's permalinks`)
}

const renderingReq = {
path: pathname,
language: page.languageCode,
pagePath: pathname,
cookies: {},
}
const next = () => {}
await contextualize(renderingReq, res, next)
await shortVersions(renderingReq, res, next)
renderingReq.context.page = page
await features(renderingReq, res, next)
const context = renderingReq.context

const title = await page.renderProp('title', context, { textOnly: true })
const intro = await page.renderProp('intro', context, { textOnly: true })

let productPage = null
for (const permalink of page.permalinks) {
const rootHref = permalink.href
.split('/')
.slice(0, permalink.pageVersion === 'free-pro-team@latest' ? 3 : 4)
.join('/')
const rootPage = context.pages[rootHref]
if (rootPage) {
productPage = rootPage
break
}
}
let product = ''
if (productPage) {
product = await productPage.renderProp('shortTitle', context, {
textOnly: true,
})
if (!product) {
product = await productPage.renderProp('title', context, {
textOnly: true,
})
}
}

const info = {
product,
title,
intro,
}
const info = await getPageInfoFromCache(page, pathname)

const tags = [
// According to https://docs.datadoghq.com/getting_started/tagging/#define-tags
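
The middleware reads the cache with `readCompressedJsonFile` from `#src/frame/lib/read-json-file.js`, which this diff doesn't show. A minimal sketch of what a brotli-aware reader could look like (an assumption for illustration; the real helper may differ):

import fs from 'fs'
import { brotliDecompressSync } from 'zlib'

// Assumed sketch: read a brotli-compressed JSON file and parse it. If the
// file is missing, fs.readFileSync throws ENOENT, which getPageInfoFromCache
// catches in order to fall back to computing pageinfo on the fly.
function readCompressedJsonFile(filePath) {
  const compressed = fs.readFileSync(filePath)
  return JSON.parse(brotliDecompressSync(compressed).toString('utf-8'))
}
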
70 changes: 70 additions & 0 deletions src/pageinfo/scripts/precompute-pageinfo.js
@@ -0,0 +1,70 @@
#!/usr/bin/env node

/**
* This script gathers all English pages and computes each page's
* 'title', 'intro' and 'product' properties. These are then stored
* in a brotli-compressed JSON file on disk. The pageinfo middleware
* can load that JSON file to get a cache of pageinfo for all English
* pages.
* Then, when someone requests `/api/pageinfo?pathname=/en/foo/bar`,
* the backend just reads from the precomputed cache file instead of
* doing the computation on every request. Time saved, up front.
*
* Why cache? Each computation is fast (3 Liquid + Markdown renders),
* but it still adds up. And it's safe and cheap to precompute in advance.
*
* Why only English? To keep the file from getting too large.
* Given how well this data compresses, we might consider doing
* all languages in the future.
*
* Why brotli? Because the file gets included in the Docker container,
* where every byte counts.
*
* When is this script run? On every push to `main`, the result is
* computed and stored with actions/cache. That means it's not run
* during deployment. (A deploy only *downloads* from actions/cache.)
*/

import fs from 'fs'
import { brotliCompressSync } from 'zlib'

import { loadPages, loadUnversionedTree } from '#src/frame/lib/page-data.js'
import { CACHE_FILE_PATH, getPageInfo } from '../middleware.js'

const CI = Boolean(JSON.parse(process.env.CI || 'false'))

main()

async function main() {
const unversionedTree = await loadUnversionedTree(['en'])
const pageList = await loadPages(unversionedTree, ['en'])

let label = `Compute pageinfos for ${pageList.length.toLocaleString()} pages`
console.time(label)
const pageinfos = {}
for (const page of pageList) {
const pathname = page.permalinks[0].href
try {
const computed = await getPageInfo(page, pathname)
if (computed) {
pageinfos[pathname] = computed
}
} catch (error) {
console.error(`Error computing pageinfo for ${page.fullPath} (${pathname})`)
throw error
}
}
console.timeEnd(label)

label = `Serialize, compress, and write to ${CACHE_FILE_PATH}`
console.time(label)
const payload = CI ? JSON.stringify(pageinfos) : JSON.stringify(pageinfos, null, 2)
const payloadBuffer = Buffer.from(payload, 'utf-8')
const payloadCompressed = brotliCompressSync(payloadBuffer)
fs.writeFileSync(CACHE_FILE_PATH, payloadCompressed)
console.timeEnd(label)
console.log(
`Wrote ${Object.keys(pageinfos).length.toLocaleString()} pageinfos to ${CACHE_FILE_PATH}`,
)
}
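
For reference, the payload the script writes is a flat object keyed by pathname, where each value has the `{ title, intro, product }` shape returned by `getPageInfo`. A hypothetical entry (path and values invented for illustration) from the decompressed cache:

// Hypothetical example of the decompressed .pageinfo-cache.json.br contents:
const pageinfos = {
  '/en/get-started/quickstart': {
    title: 'Quickstart',
    intro: '...',
    product: 'Get started',
  },
}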
