Skip to content

Commit 14ca55f

Browse files
authored
cope with possible translation drift (#24842)
* cope with possible translation drift * fix test * don't shallow clone * fix unit tests * update code comments * more code comment corrections * more code comment * feedbacked
1 parent 45cd562 commit 14ca55f

File tree

5 files changed

+218
-24
lines changed

5 files changed

+218
-24
lines changed

crowdin.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
files:
22
- source: /content/**/*.md
33
translation: /translations/%locale%/%original_path%/%original_file_name%
4+
# See lib/page-data.js for a matching list of prefix exceptions
5+
# Try to keep these in sync when editing in either location.
46
ignore:
57
- '/content/README.md'
68
- '/content/early-access'

lib/create-tree.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url))
88
const _basePaths = new Map()
99
// Return a full directory based on __dirname from a specific language directory.
1010
// This function is memoized with a simple global cache object.
11-
function getBasePath(directory) {
11+
export function getBasePath(directory) {
1212
if (!_basePaths.has(directory)) {
1313
_basePaths.set(directory, path.posix.join(__dirname, '..', directory, 'content'))
1414
}

lib/page-data.js

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,29 @@ import { fileURLToPath } from 'url'
22
import path from 'path'
33
import languages from './languages.js'
44
import { allVersions } from './all-versions.js'
5-
import createTree from './create-tree.js'
5+
import createTree, { getBasePath } from './create-tree.js'
66
import renderContent from './render-content/index.js'
77
import loadSiteData from './site-data.js'
88
import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
9+
import Page from './page.js'
910
const __dirname = path.dirname(fileURLToPath(import.meta.url))
1011
const versions = Object.keys(allVersions)
1112
const enterpriseServerVersions = versions.filter((v) => v.startsWith('enterprise-server@'))
1213
const renderOpts = { textOnly: true, encodeEntities: true }
1314

15+
// These are the exceptions to the rule that a missing translation is backfilled from English.
16+
// These URI prefixes should match what you'll find in crowdin.yml.
17+
// If a URI starts with one of these prefixes, it basically means we don't
18+
// bother to "backfill" a translation in its spot.
19+
// For example, `/en/github/site-policy-deprecated/foo` works
20+
// only in English and we don't bother making `/ja/github/site-policy-deprecated/foo`
21+
// work too.
22+
const TRANSLATION_DRIFT_EXCEPTIONS = [
23+
'github/site-policy-deprecated',
24+
// Early access stuff never has translations.
25+
'early-access',
26+
]
27+
1428
/**
1529
* We only need to initialize pages _once per language_ since pages don't change per version. So we do that
1630
* first since it's the most expensive work. This gets us a nested object with pages attached that we can use
@@ -152,8 +166,112 @@ export function createMapFromArray(pageList) {
152166
}
153167

154168
export async function loadPageMap(pageList) {
155-
const pages = pageList || (await loadPageList())
156-
return createMapFromArray(pages)
169+
const pages = await correctTranslationOrphans(pageList || (await loadPageList()))
170+
const pageMap = createMapFromArray(pages)
171+
return pageMap
172+
}
173+
174+
/**
 * Reconcile the translated pages in `pageList` with the English pages.
 *
 * - If a translation page exists that doesn't have an English equivalent,
 *   it is removed (e.g. the English doc was renamed from `content/foo.md` to
 *   `content/bar.md` but `translations/<lang>/content/foo.md` still lingers
 *   on disk).
 * - If an English page exists that doesn't have a translation equivalent,
 *   it is added ("backfilled") for that language by reading the English file,
 *   unless its relative path starts with one of the
 *   `TRANSLATION_DRIFT_EXCEPTIONS` prefixes.
 *
 * Note, this function is exported purely for the benefit of the unit tests.
 *
 * @param {Array} pageList - Pages of every language; each page must expose
 *   `languageCode` and `relativePath`.
 * @param {string|null} [basePath] - Directory that backfilled pages are read
 *   from. Defaults to the English content directory. Unit tests pass their
 *   "tests/fixtures" directory instead.
 * @returns {Promise<Array>} A new array: the kept pages in their original
 *   order, followed by the backfilled pages. `pageList` is not modified.
 */
export async function correctTranslationOrphans(pageList, basePath = null) {
  const isDevelopment = process.env.NODE_ENV === 'development'

  const englishRelativePaths = new Set()
  for (const page of pageList) {
    if (page.languageCode === 'en') {
      englishRelativePaths.add(page.relativePath)
    }
  }

  // Prime the Map with an empty set for *every* non-English language rather
  // than only for the languages that have pages in `pageList`. Otherwise a
  // language with no pages at all would never be considered for backfilling.
  const nonEnglish = new Map()
  for (const languageCode of Object.keys(languages)) {
    if (languageCode !== 'en') {
      nonEnglish.set(languageCode, new Set())
    }
  }

  // By default, backfilled pages are read from the English content directory.
  // The optional `basePath` argument exists so unit tests can point at their
  // fixtures directory instead.
  const englishBasePath = basePath || getBasePath(languages.en.dir)

  // Pass 1: keep every English page, keep every translated page that has an
  // English counterpart, and drop the rest (the "excess" pages).
  const newPageList = []
  for (const page of pageList) {
    if (page.languageCode === 'en') {
      // English pages are never considered "excess".
      newPageList.push(page)
      continue
    }

    if (englishRelativePaths.has(page.relativePath)) {
      // Remember that this language already covers this relative path so
      // it is not backfilled in pass 2.
      nonEnglish.get(page.languageCode).add(page.relativePath)
      newPageList.push(page)
      continue
    }

    // Everything else is excess and is simply not carried over.
    if (isDevelopment) {
      console.log(
        `For ${page.languageCode}, omit the page ${page.relativePath} even though it's present on disk.`
      )
    }
  }

  // Pass 2: for every English path that a language lacks, create a page for
  // that language from the English file.
  const pageLoadPromises = []
  for (const relativePath of englishRelativePaths) {
    for (const [languageCode, relativePaths] of nonEnglish) {
      if (relativePaths.has(relativePath)) continue

      // The exception check is deliberately done this late: once the
      // translation pipeline has caught up, missing translations should be
      // rare, so there is no need to run it for every English path.
      if (TRANSLATION_DRIFT_EXCEPTIONS.some((exception) => relativePath.startsWith(exception))) {
        continue
      }

      // Instances of class Page can't be cloned, so a new one is created for
      // this language. It is given the English base path so there is a file
      // to read. For example, if `translations/ja-JP/content/foo.md` doesn't
      // exist, we pretend it does by loading `foo.md` from `content/`.
      pageLoadPromises.push(
        Page.init({
          basePath: englishBasePath,
          relativePath,
          languageCode,
        })
      )
      if (isDevelopment) {
        console.log(`Backfill ${relativePath} for ${languageCode} from English`)
      }
    }
  }

  const additionalPages = await Promise.all(pageLoadPromises)

  // Use `concat` rather than `push(...additionalPages)`: spreading passes
  // every element as a separate call argument, which can throw a RangeError
  // when a large number of pages has to be backfilled.
  return newPageList.concat(additionalPages)
}
158276

159277
export default {
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import path from 'path'
2+
import { fileURLToPath } from 'url'
3+
4+
import { expect } from '@jest/globals'
5+
6+
import languages from '../../lib/languages.js'
7+
import Page from '../../lib/page.js'
8+
import { loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js'
9+
const __dirname = path.dirname(fileURLToPath(import.meta.url))
10+
11+
describe('loading page map with translation orphans', () => {
12+
test('inject missing translations from English', async () => {
13+
const basePath = path.join(__dirname, '../fixtures')
14+
const page = await Page.init({
15+
relativePath: 'page-that-does-not-exist-in-translations-dir.md',
16+
basePath,
17+
languageCode: 'en',
18+
})
19+
console.assert(page, 'page could not be loaded')
20+
21+
const pageList = [page]
22+
const pageMap = await loadPageMap(await correctTranslationOrphans(pageList, basePath))
23+
// It should make a copy of the English into each language
24+
expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length)
25+
26+
// +1 for the assertion just above, plus 2 assertions per language (assumes the fixture page has exactly one permalink).
27+
expect.assertions(1 + Object.keys(languages).length * 2)
28+
29+
for (const languageCode of Object.keys(languages)) {
30+
for (const permalink of page.permalinks) {
31+
const translationHref = permalink.href.replace('/en', `/${languageCode}`)
32+
const translationPage = pageMap[translationHref]
33+
expect(translationPage).toBeTruthy()
34+
expect(translationPage.languageCode).toBe(languageCode)
35+
}
36+
}
37+
})
38+
39+
test('remove translation pages that were not in English', async () => {
40+
const basePath = path.join(__dirname, '../fixtures')
41+
const page = await Page.init({
42+
relativePath: 'page-that-does-not-exist-in-translations-dir.md',
43+
basePath,
44+
languageCode: 'en',
45+
})
46+
console.assert(page, 'page could not be loaded')
47+
const orphan = await Page.init({
48+
relativePath: 'article-with-videos.md',
49+
basePath,
50+
languageCode: 'ja',
51+
})
52+
console.assert(orphan, 'page could not be loaded')
53+
const orphanPermalinks = orphan.permalinks.map((p) => p.href)
54+
55+
const pageList = await correctTranslationOrphans([page, orphan], basePath)
56+
const pageMap = await loadPageMap(pageList)
57+
expect(pageMap[orphanPermalinks[0]]).toBeFalsy()
58+
expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length)
59+
})
60+
})

tests/unit/pages.js

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { jest } from '@jest/globals'
22
import path from 'path'
3-
import { loadPages, loadPageMap } from '../../lib/page-data.js'
3+
import { loadPages, loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js'
44
import libLanguages from '../../lib/languages.js'
55
import { liquid } from '../../lib/render-content/index.js'
66
import patterns from '../../lib/patterns.js'
@@ -13,13 +13,24 @@ import removeFPTFromPath from '../../lib/remove-fpt-from-path.js'
1313
const languageCodes = Object.keys(libLanguages)
1414
const slugger = new GithubSlugger()
1515

16+
// By default, the tests don't check that each translation has an
17+
// equivalent English page (e.g. `translations/*/content/foo.md`
18+
// expects `content/foo.md`)
19+
// Set the environment variable `TEST_TRANSLATION_MATCHING=true`
20+
// to enable that test.
21+
const testIfRequireTranslationMatching = JSON.parse(
22+
process.env.TEST_TRANSLATION_MATCHING || 'false'
23+
)
24+
? test
25+
: test.skip
26+
1627
describe('pages module', () => {
1728
jest.setTimeout(60 * 1000)
1829

1930
let pages
2031

2132
beforeAll(async () => {
22-
pages = await loadPages()
33+
pages = await correctTranslationOrphans(await loadPages())
2334
})
2435

2536
describe('loadPages', () => {
@@ -76,8 +87,8 @@ describe('pages module', () => {
7687

7788
const message = `Found ${duplicates.length} duplicate redirect_from ${
7889
duplicates.length === 1 ? 'path' : 'paths'
79-
}.
80-
Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files.
90+
}.
91+
Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files.
8192
You may also receive this error if you have defined the same children property more than once.\n
8293
${duplicates.join('\n')}`
8394
expect(duplicates.length, message).toBe(0)
@@ -152,26 +163,29 @@ describe('pages module', () => {
152163
expect(liquidErrors.length, failureMessage).toBe(0)
153164
})
154165

155-
test('every non-English page has a matching English page', async () => {
156-
const englishPaths = chain(walk('content', { directories: false }))
157-
.uniq()
158-
.value()
159-
160-
const nonEnglishPaths = chain(Object.values(libLanguages))
161-
.filter((language) => language.code !== 'en')
162-
.map((language) => walk(`${language.dir}/content`, { directories: false }))
163-
.flatten()
164-
.uniq()
165-
.value()
166-
167-
const diff = difference(nonEnglishPaths, englishPaths)
168-
const failureMessage = `
166+
testIfRequireTranslationMatching(
167+
'every non-English page has a matching English page',
168+
async () => {
169+
const englishPaths = chain(walk('content', { directories: false }))
170+
.uniq()
171+
.value()
172+
173+
const nonEnglishPaths = chain(Object.values(libLanguages))
174+
.filter((language) => language.code !== 'en')
175+
.map((language) => walk(`${language.dir}/content`, { directories: false }))
176+
.flatten()
177+
.uniq()
178+
.value()
179+
180+
const diff = difference(nonEnglishPaths, englishPaths)
181+
const failureMessage = `
169182
Found ${diff.length} non-English pages without a matching English page:\n - ${diff.join('\n - ')}
170183
171184
Remove them with script/i18n/prune-stale-files.js and commit your changes using "git commit --no-verify".
172185
`
173-
expect(diff.length, failureMessage).toBe(0)
174-
})
186+
expect(diff.length, failureMessage).toBe(0)
187+
}
188+
)
175189
})
176190

177191
describe('loadPageMap', () => {

0 commit comments

Comments
 (0)