src/search refactor + new endpoint: AI Search Autocomplete (#52822)
Ebonsignori authored Nov 7, 2024
1 parent c0c5b6a commit 1a99ce6
Showing 92 changed files with 3,697 additions and 2,454 deletions.
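The headline change is a rename of the general search tooling under src/search plus a second Elasticsearch index that backs autocomplete for AI search. As a purely illustrative sketch — the actual route and query parameters live in src/search/middleware/search-routes.js and are not shown in this excerpt, so the path below is a placeholder — the new autocomplete data would be served by the docs API along these lines:

# Hypothetical request; the real endpoint path and parameters are defined in search-routes.js, not in this diff.
curl 'https://docs.github.com/api/search/ai-search-autocomplete?query=how+do+i+fork+a+repo'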
.github/workflows/index-autocomplete-elasticsearch.yml → .github/workflows/index-autocomplete-search.yml
@@ -1,7 +1,7 @@
name: Index autocomplete Elasticsearch
name: Index autocomplete search in Elasticsearch

# **What it does**: Indexes autocomplete data into Elasticsearch.
# **Why we have it**: So we can power the API for autocomplete.
# **What it does**: Indexes autocomplete data (general and AI search) into Elasticsearch.
# **Why we have it**: So we can power the APIs for autocomplete.
# **Who does it impact**: docs-engineering

on:
@@ -10,7 +10,7 @@ on:
- cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
pull_request:
paths:
- .github/workflows/index-autocomplete-elasticsearch.yml
- .github/workflows/index-autocomplete-search.yml
- 'src/search/scripts/index/**'
- 'package*.json'

@@ -40,10 +40,15 @@ jobs:
if: ${{ github.event_name == 'pull_request' }}
run: curl --fail --retry-connrefused --retry 5 -I http://localhost:9200

- name: Run indexing
- name: Run general auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index -- autocomplete docs-internal-data
run: npm run index-general-autocomplete -- docs-internal-data

- name: Run AI search auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index-ai-search-autocomplete -- docs-internal-data

- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name == 'schedule' }}
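Assuming a local Elasticsearch on port 9200 and a checkout of docs-internal-data (the setup the job performs above), the two indexing steps amount to roughly:

# Local sketch of the two workflow steps above; values mirror the pull_request branch of the env expression.
export ELASTICSEARCH_URL=http://localhost:9200
npm run index-general-autocomplete -- docs-internal-data
npm run index-ai-search-autocomplete -- docs-internal-data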
.github/workflows/sync-search-pr.yml → .github/workflows/index-general-search-pr.yml
@@ -1,6 +1,6 @@
name: Sync search - PR
name: Index general search in Elasticsearch on PR

# **What it does**: This does what `sync-sarch-elasticsearch.yml` does but
# **What it does**: This does what `index-general-search-elasticsearch.yml` does but
# with a localhost Elasticsearch and only for English.
# **Why we have it**: To test that the script works and the popular pages json is valid.
# **Who does it impact**: Docs engineering
@@ -11,8 +11,8 @@ on:
paths:
- 'src/search/**'
- 'package*.json'
# Ultimately, for debugging this workflow itself
- .github/workflows/sync-search-pr.yml
# For debugging this workflow
- .github/workflows/index-general-search-pr.yml
# Make sure we run this if the composite action changes
- .github/actions/setup-elasticsearch/action.yml

@@ -25,9 +25,6 @@ concurrency:
cancel-in-progress: true

env:
# Yes, it's hardcoded but it makes all the steps look exactly the same
# as they do in `sync-search-elasticsearch.yml` where it uses
# that `${{ env.ELASTICSEARCH_URL }}`
ELASTICSEARCH_URL: http://localhost:9200
# Since we'll run in NODE_ENV=production, we need to be explicit that
# we don't want Hydro configured.
@@ -63,7 +60,7 @@ jobs:
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
@@ -88,15 +85,13 @@ jobs:
# let's just accept an empty string instead.
THROW_ON_EMPTY: false

# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language en \
--version dotcom
--version fpt
ls -lh /tmp/records
@@ -106,9 +101,9 @@
- name: Index into Elasticsearch
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language en \
--version dotcom
--version fpt
- name: Check created indexes and aliases
run: |
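For reference, the scrape-then-index sequence this PR workflow smoke-tests can be approximated locally (assuming the production-mode server on port 4002 and a local Elasticsearch, as configured above):

# Rough local equivalent of the PR job above: English content, fpt (free-pro-team) version only.
export ELASTICSEARCH_URL=http://localhost:9200
npm run general-search-scrape-server &    # serves the site on port 4002 in production mode
mkdir -p /tmp/records
npm run general-search-scrape -- /tmp/records --language en --version fpt
npm run index-general-search -- /tmp/records --language en --version fpt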
.github/workflows/sync-search-elasticsearch.yml → .github/workflows/index-general-search.yml
@@ -1,4 +1,4 @@
name: Sync search Elasticsearch
name: Index general search in Elasticsearch

# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
@@ -140,7 +140,7 @@ jobs:
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
@@ -169,13 +169,11 @@
# the same as not set within the script.
VERSION: ${{ inputs.version }}

# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}
ls -lh /tmp/records
@@ -186,12 +184,12 @@
- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run sync-search-indices)
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5
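In the scheduled workflow the same two commands run once per language from the matrix, with the index step throttled; a single matrix leg, assuming the env values set above, looks roughly like:

# Sketch of one matrix leg (the language here is illustrative); flags match the workflow above.
export VERSION=ghes-3.12                      # example value; the workflow passes inputs.version
export DOCS_INTERNAL_DATA=docs-internal-data
mkdir -p /tmp/records
npm run general-search-scrape -- /tmp/records --language ja
npm run index-general-search -- /tmp/records --language ja --stagger-seconds 5 --retries 5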
6 changes: 6 additions & 0 deletions .gitignore
@@ -51,3 +51,9 @@ assets/images/help/writing/unordered-list-rendered (1).png

# Used by precompute-pageinfo
.pageinfo-cache.json.br

# Cloned and used for indexing Elasticsearch data
docs-internal-data/

# For intermediate data (like scraping for Elasticsearch indexing)
tmp/
27 changes: 27 additions & 0 deletions package-lock.json


20 changes: 13 additions & 7 deletions package.json
@@ -17,7 +17,7 @@
"exports": "./src/frame/server.ts",
"scripts": {
"all-documents": "tsx src/content-render/scripts/all-documents/cli.ts",
"analyze-text": "node src/search/scripts/analyze-text.js",
"analyze-text": "tsx src/search/scripts/analyze-text.ts",
"analyze-comment": "tsx src/events/scripts/analyze-comment-cli.ts",
"archive-version": "tsx --max-old-space-size=16384 src/ghes-releases/scripts/archive-version.ts",
"audit-log-sync": "tsx src/audit-logs/scripts/sync.ts",
@@ -39,8 +39,14 @@
"find-unused-variables": "tsx src/content-linter/scripts/find-unsed-variables.ts",
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
"index": "tsx src/search/scripts/index/index.ts",
"index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
"general-search-scrape": "tsx src/search/scripts/scrape/scrape-cli.ts",
"general-search-scrape-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
"ghes-release-scrape-with-server": "cross-env GHES_RELEASE=1 start-server-and-test general-search-scrape-server 4002 general-search-scrape",
"general-search-scrape-with-server": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test general-search-scrape-server 4002 general-search-scrape",
"index": "tsx src/search/scripts/index/index-cli autocomplete docs-internal-data",
"index-ai-search-autocomplete": "tsx src/search/scripts/index/index-cli ai-search-autocomplete",
"index-general-autocomplete": "tsx src/search/scripts/index/index-cli general-autocomplete",
"index-general-search": "tsx src/search/scripts/index/index-cli general-search",
"index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
"lint-content": "node src/content-linter/scripts/lint-content.js",
@@ -70,10 +76,6 @@
"start-for-playwright": "cross-env ROOT=src/fixtures/fixtures TRANSLATIONS_FIXTURE_ROOT=src/fixtures/fixtures/translations ENABLED_LANGUAGES=en,ja NODE_ENV=test tsx src/frame/server.ts",
"symlink-from-local-repo": "node src/early-access/scripts/symlink-from-local-repo.js",
"sync-rest": "tsx src/rest/scripts/update-files.ts",
"sync-search": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test sync-search-server 4002 sync-search-indices",
"sync-search-ghes-release": "cross-env GHES_RELEASE=1 start-server-and-test sync-search-server 4002 sync-search-indices",
"sync-search-indices": "node src/search/scripts/sync-search-indices.js",
"sync-search-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
"sync-secret-scanning": "tsx src/secret-scanning/scripts/sync.ts",
"sync-webhooks": "npx tsx src/rest/scripts/update-files.ts -o webhooks",
"test": "vitest",
@@ -222,6 +224,7 @@
"src/open-source/scripts/add-pr-links.js",
"src/open-source/scripts/pr-link-source.js",
"rest-api-description/",
"docs-internal-data/",
"src/code-scanning/scripts/generate-code-scanning-query-list.ts"
]
},
@@ -327,10 +330,13 @@
"@octokit/rest": "21.0.2",
"@playwright/test": "^1.48.1",
"@types/accept-language-parser": "1.5.6",
"@types/cheerio": "^0.22.35",
"@types/connect-datadog": "0.0.10",
"@types/connect-timeout": "0.0.39",
"@types/cookie": "0.6.0",
"@types/cookie-parser": "1.4.7",
"@types/elasticsearch": "^5.0.43",
"@types/event-to-promise": "^0.7.5",
"@types/express": "4.17.21",
"@types/imurmurhash": "^0.1.4",
"@types/js-cookie": "^3.0.6",
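The removed sync-search* entries map onto the renamed scripts; assuming the new CLIs cover the same ground as the old ones (the workflow changes above make the same substitutions), the correspondence is approximately:

# Old script                              New script
# npm run sync-search-indices        ->   npm run general-search-scrape
# npm run sync-search-server         ->   npm run general-search-scrape-server
# npm run sync-search                ->   npm run general-search-scrape-with-server
# npm run sync-search-ghes-release   ->   npm run ghes-release-scrape-with-server
# npm run index-elasticsearch        ->   npm run index-general-search
# npm run index -- autocomplete ...  ->   npm run index-general-autocomplete -- ...
npm run index-ai-search-autocomplete -- docs-internal-data    # new in this commit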
4 changes: 2 additions & 2 deletions src/fixtures/tests/breadcrumbs.ts
@@ -68,7 +68,7 @@ describe('breadcrumbs', () => {

expect($breadcrumbTitles.length).toBe(0)
expect($breadcrumbLinks.length).toBe(2)
expect($breadcrumbLinks[0].attribs.title).toBe('Deeper secrets')
expect($breadcrumbLinks[1].attribs.title).toBe('Mariana Trench')
expect(($breadcrumbLinks[0] as cheerio.TagElement).attribs.title).toBe('Deeper secrets')
expect(($breadcrumbLinks[1] as cheerio.TagElement).attribs.title).toBe('Mariana Trench')
})
})
2 changes: 1 addition & 1 deletion src/frame/middleware/api.ts
@@ -3,7 +3,7 @@ import { createProxyMiddleware } from 'http-proxy-middleware'

import events from '@/events/middleware.js'
import anchorRedirect from '@/rest/api/anchor-redirect.js'
import search from '@/search/middleware/search.js'
import search from '@/search/middleware/search-routes.js'
import pageInfo from '@/pageinfo/middleware'
import pageList from '@/pagelist/middleware'
import webhooks from '@/webhooks/middleware/webhooks.js'
4 changes: 2 additions & 2 deletions src/frame/middleware/index.ts
@@ -61,7 +61,7 @@ import fastlyCacheTest from './fastly-cache-test'
import trailingSlashes from './trailing-slashes'
import mockVaPortal from './mock-va-portal'
import dynamicAssets from '@/assets/middleware/dynamic-assets'
import contextualizeSearch from '@/search/middleware/contextualize.js'
import generalSearchMiddleware from '@/search/middleware/general-search-middleware'
import shielding from '@/shielding/middleware'
import tracking from '@/tracking/middleware'
import { MAX_REQUEST_TIMEOUT } from '@/frame/lib/constants.js'
@@ -275,7 +275,7 @@ export default function (app: Express) {
app.use(asyncMiddleware(productExamples))
app.use(asyncMiddleware(productGroups))
app.use(asyncMiddleware(glossaries))
app.use(asyncMiddleware(contextualizeSearch))
app.use(asyncMiddleware(generalSearchMiddleware))
app.use(asyncMiddleware(featuredLinks))
app.use(asyncMiddleware(learningTrack))

12 changes: 9 additions & 3 deletions src/frame/tests/favicons.ts
@@ -15,7 +15,10 @@ describe('favicon assets', () => {
expect(res.headers['cache-control']).toContain('public')
expect(res.headers['cache-control']).toContain('immutable')
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
const maxAgeSeconds = parseInt(
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
10,
)
// Let's not be too specific in the tests, just as long as it's testing
// that it's a reasonably large number of seconds.
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
@@ -25,13 +28,16 @@
test('should serve a valid and aggressively caching /apple-touch-icon.png', async () => {
const res = await get('/apple-touch-icon.png')
expect(res.statusCode).toBe(200)
expect(parseInt(res.headers['content-length'], 10)).toBeGreaterThan(0)
expect(parseInt(res.headers['content-length'] || '', 10)).toBeGreaterThan(0)
expect(res.headers['content-type']).toBe('image/png')
expect(res.headers['set-cookie']).toBeUndefined()
expect(res.headers['cache-control']).toContain('public')
expect(res.headers['cache-control']).toContain('immutable')
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
const maxAgeSeconds = parseInt(
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
10,
)
// Let's not be too specific in the tests, just as long as it's testing
// that it's a reasonably large number of seconds.
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
3 changes: 3 additions & 0 deletions src/frame/tests/manifest.ts
@@ -20,6 +20,9 @@ describe('manifest', () => {
test('download manifest from HTML and check content', async () => {
const $ = await getDOM('/')
const url = $('link[rel="manifest"]').attr('href')
if (!url) {
throw new Error('No manifest URL found')
}
const res = await get(url)
expect(res.statusCode).toBe(200)

8 changes: 4 additions & 4 deletions src/ghes-releases/lib/release-templates/release-steps-1.md
@@ -17,7 +17,7 @@ labels:
- [Prerequisites](#prerequisites)
- [Create publication branch for a new version of GHES](#creation)
- [Resolve check failures](#check-failures)
- [Sync the search indices](#sync-search-indices)
- [Scrape the search indices](#scrape-search-indices)
- [Maintain the publication branch](#maintenance)
- [Complete preparation for the RC and publish the docset](#publication)

@@ -110,11 +110,11 @@ For content from the OpenAPI schema, note the affected content with broken links

<br/>

<a name="sync-search-indices">
<a name="scrape-search-indices">

### [🔎](#sync-search-indices) Sync the search indices
### [🔎](#scrape-search-indices) Scrape the search indices

1. Go to the [`sync-search-elasticsearch` workflow](https://github.com/github/docs-internal/actions/workflows/sync-search-elasticsearch.yml) ([permalink](https://github.com/github/docs-internal/blob/f8ca45703c48c7d1976a278337bc3391fb14fe9e/.github/workflows/sync-search-elasticsearch.yml) in case it moves)
1. Go to the [`index-general-search.yml` workflow](https://github.com/github/docs-internal/actions/workflows/index-general-search.yml)
1. Click on the **Run workflow** drop down and set the following parameters:
- `Branch:` set to the name of the publication branch
- `Version` set to the version you're publishing (e.g., `ghes-3.12` if you're publishing GHES 3.12)
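The same run can be dispatched from the command line; assuming the workflow input is named version (the VERSION: ${{ inputs.version }} lines above suggest it is), something like:

# Hedged example: the branch name is a placeholder for your publication branch.
gh workflow run index-general-search.yml \
  --repo github/docs-internal \
  --ref publication-branch-name \
  -f version=ghes-3.12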