Skip to content

Commit

Permalink
cope with possible translation drift (#24842)
Browse files Browse the repository at this point in the history
* cope with possible translation drift

* fix test

* don't shallow clone

* fix unit tests

* update code comments

* more code comment corrections

* more code comment

* feedbacked
  • Loading branch information
peterbe authored Feb 4, 2022
1 parent 45cd562 commit 14ca55f
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 24 deletions.
2 changes: 2 additions & 0 deletions crowdin.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
files:
- source: /content/**/*.md
translation: /translations/%locale%/%original_path%/%original_file_name%
# See lib/page-data.js for a matching list of prefix exceptions
# Try to keep these in sync when editing in either location.
ignore:
- '/content/README.md'
- '/content/early-access'
Expand Down
2 changes: 1 addition & 1 deletion lib/create-tree.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url))
const _basePaths = new Map()
// Return a full directory based on __dirname from a specific language directory.
// This function is memoized with a simple global cache object.
function getBasePath(directory) {
export function getBasePath(directory) {
if (!_basePaths.has(directory)) {
_basePaths.set(directory, path.posix.join(__dirname, '..', directory, 'content'))
}
Expand Down
124 changes: 121 additions & 3 deletions lib/page-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,29 @@ import { fileURLToPath } from 'url'
import path from 'path'
import languages from './languages.js'
import { allVersions } from './all-versions.js'
import createTree from './create-tree.js'
import createTree, { getBasePath } from './create-tree.js'
import renderContent from './render-content/index.js'
import loadSiteData from './site-data.js'
import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
import Page from './page.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const versions = Object.keys(allVersions)
const enterpriseServerVersions = versions.filter((v) => v.startsWith('enterprise-server@'))
const renderOpts = { textOnly: true, encodeEntities: true }

// These are the exceptions to the rule.
// These URI prefixes should match what you'll find in crowdin.yml.
// If a URI starts with one of these prefixes, it basically means we don't
// bother to "backfill" a translation in its spot.
// For example, `/en/github/site-policy-deprecated/foo` works
// only in English and we don't bother making `/ja/github/site-policy-deprecated/foo`
// work too.
const TRANSLATION_DRIFT_EXCEPTIONS = [
'github/site-policy-deprecated',
// Early access stuff never has translations.
'early-access',
]

/**
* We only need to initialize pages _once per language_ since pages don't change per version. So we do that
* first since it's the most expensive work. This gets us a nested object with pages attached that we can use
Expand Down Expand Up @@ -152,8 +166,112 @@ export function createMapFromArray(pageList) {
}

export async function loadPageMap(pageList) {
const pages = pageList || (await loadPageList())
return createMapFromArray(pages)
const pages = await correctTranslationOrphans(pageList || (await loadPageList()))
const pageMap = createMapFromArray(pages)
return pageMap
}

// If a translation page exists, that doesn't have an English equivalent,
// remove it.
// If an English page exists, that doesn't have an translation equivalent,
// add it.
// Note, this function is exported purely for the benefit of the unit tests.
export async function correctTranslationOrphans(pageList, basePath = null) {
const englishRelativePaths = new Set()
for (const page of pageList) {
if (page.languageCode === 'en') {
englishRelativePaths.add(page.relativePath)
}
}

// Prime the Map with an empty set for each language prefix.
// It's important that we do this for *every* language rather than
// just populating `nonEnglish` based on those pages that *are* present.
// Otherwise, we won't have an index of all the languages
// that *might* be missing.
const nonEnglish = new Map()
Object.keys(languages)
.filter((lang) => lang !== 'en')
.forEach((languageCode) => {
nonEnglish.set(languageCode, new Set())
})

// By default, when backfilling, we set the `basePath` to be that of
// English. But for the benefit of being able to do unit tests,
// we make this an optional argument. Then, unit tests can use
// its "tests/fixtures" directory.
const englishBasePath = basePath || getBasePath(languages.en.dir)

// Filter out all non-English pages that appear to be excess.
// E.g. if an English doc was renamed from `content/foo.md` to
// `content/bar.md` what will happen is that `translations/*/content/foo.md`
// will still linger around and we want to remove that even if it was
// scooped up from disk.
const newPageList = []
for (const page of pageList) {
if (page.languageCode === 'en') {
// English pages are never considered "excess"
newPageList.push(page)
continue
}

// If this translation page exists in English, keep it but also
// add it to the set of relative paths that is known.
if (englishRelativePaths.has(page.relativePath)) {
nonEnglish.get(page.languageCode).add(page.relativePath)
newPageList.push(page)
continue
}

// All else is considered "excess" and should be excluded from
// the new list of pages. So do nothing.
if (process.env.NODE_ENV === 'development') {
console.log(
`For ${page.languageCode}, omit the page ${page.relativePath} even though it's present on disk.`
)
}
}

const pageLoadPromises = []
for (const relativePath of englishRelativePaths) {
for (const [languageCode, relativePaths] of nonEnglish) {
if (!relativePaths.has(relativePath)) {
// At this point, we've found an English `relativePath` that is
// not used by this language.
// But before we decide to "backfill" it from the English equivalent
// we first need to figure out if it should be excluded.
// The reason for doing this check this late is for the benefit
// of optimization. In general, when the translation pipeline has
// done its magic, this should be very rare, so it's unnecessary
// to do this exception check on every single English relativePath.
if (TRANSLATION_DRIFT_EXCEPTIONS.find((exception) => relativePath.startsWith(exception))) {
continue
}

// The magic right here!
// The trick is that we can't clone instances of class Page. We need
// to create them for this language. But the trick is that we
// use the English relative path so it can have something to read.
// For example, if we have figured out that
// `translations/ja-JP/content/foo.md` doesn't exist, we pretend
// that we can use `foo.md` and the base path of `content/`.
pageLoadPromises.push(
Page.init({
basePath: englishBasePath,
relativePath: relativePath,
languageCode: languageCode,
})
)
if (process.env.NODE_ENV === 'development') {
console.log(`Backfill ${relativePath} for ${languageCode} from English`)
}
}
}
}
const additionalPages = await Promise.all(pageLoadPromises)
newPageList.push(...additionalPages)

return newPageList
}

export default {
Expand Down
60 changes: 60 additions & 0 deletions tests/unit/load-translation-orphans.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import path from 'path'
import { fileURLToPath } from 'url'

import { expect } from '@jest/globals'

import languages from '../../lib/languages.js'
import Page from '../../lib/page.js'
import { loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))

describe('loading page map with translation orphans', () => {
test('inject missing translations from English', async () => {
const basePath = path.join(__dirname, '../fixtures')
const page = await Page.init({
relativePath: 'page-that-does-not-exist-in-translations-dir.md',
basePath,
languageCode: 'en',
})
console.assert(page, 'page could not be loaded')

const pageList = [page]
const pageMap = await loadPageMap(await correctTranslationOrphans(pageList, basePath))
// It should make a copy of the English into each language
expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length)

// +1 for the test just above, 2 tests per language.
expect.assertions(1 + Object.keys(languages).length * 2)

for (const languageCode of Object.keys(languages)) {
for (const permalink of page.permalinks) {
const translationHref = permalink.href.replace('/en', `/${languageCode}`)
const translationPage = pageMap[translationHref]
expect(translationPage).toBeTruthy()
expect(translationPage.languageCode).toBe(languageCode)
}
}
})

test('remove translation pages that were not in English', async () => {
const basePath = path.join(__dirname, '../fixtures')
const page = await Page.init({
relativePath: 'page-that-does-not-exist-in-translations-dir.md',
basePath,
languageCode: 'en',
})
console.assert(page, 'page could not be loaded')
const orphan = await Page.init({
relativePath: 'article-with-videos.md',
basePath,
languageCode: 'ja',
})
console.assert(orphan, 'page could not be loaded')
const orphanPermalinks = orphan.permalinks.map((p) => p.href)

const pageList = await correctTranslationOrphans([page, orphan], basePath)
const pageMap = await loadPageMap(pageList)
expect(pageMap[orphanPermalinks[0]]).toBeFalsy()
expect(Object.keys(pageMap).length).toBe(Object.keys(languages).length)
})
})
54 changes: 34 additions & 20 deletions tests/unit/pages.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { jest } from '@jest/globals'
import path from 'path'
import { loadPages, loadPageMap } from '../../lib/page-data.js'
import { loadPages, loadPageMap, correctTranslationOrphans } from '../../lib/page-data.js'
import libLanguages from '../../lib/languages.js'
import { liquid } from '../../lib/render-content/index.js'
import patterns from '../../lib/patterns.js'
Expand All @@ -13,13 +13,24 @@ import removeFPTFromPath from '../../lib/remove-fpt-from-path.js'
const languageCodes = Object.keys(libLanguages)
const slugger = new GithubSlugger()

// By default, the tests don't change that each translation has an
// equivalent English page (e.g. `translations/*/content/foo.md`
// expects `content/foo.md`)
// Set the environment variable `TEST_TRANSLATION_MATCHING=true`
// to enable that test.
const testIfRequireTranslationMatching = JSON.parse(
process.env.TEST_TRANSLATION_MATCHING || 'false'
)
? test
: test.skip

describe('pages module', () => {
jest.setTimeout(60 * 1000)

let pages

beforeAll(async () => {
pages = await loadPages()
pages = await correctTranslationOrphans(await loadPages())
})

describe('loadPages', () => {
Expand Down Expand Up @@ -76,8 +87,8 @@ describe('pages module', () => {

const message = `Found ${duplicates.length} duplicate redirect_from ${
duplicates.length === 1 ? 'path' : 'paths'
}.
Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files.
}.
Ensure that you don't define the same path more than once in the redirect_from property in a single file and across all English files.
You may also receive this error if you have defined the same children property more than once.\n
${duplicates.join('\n')}`
expect(duplicates.length, message).toBe(0)
Expand Down Expand Up @@ -152,26 +163,29 @@ describe('pages module', () => {
expect(liquidErrors.length, failureMessage).toBe(0)
})

test('every non-English page has a matching English page', async () => {
const englishPaths = chain(walk('content', { directories: false }))
.uniq()
.value()

const nonEnglishPaths = chain(Object.values(libLanguages))
.filter((language) => language.code !== 'en')
.map((language) => walk(`${language.dir}/content`, { directories: false }))
.flatten()
.uniq()
.value()

const diff = difference(nonEnglishPaths, englishPaths)
const failureMessage = `
testIfRequireTranslationMatching(
'every non-English page has a matching English page',
async () => {
const englishPaths = chain(walk('content', { directories: false }))
.uniq()
.value()

const nonEnglishPaths = chain(Object.values(libLanguages))
.filter((language) => language.code !== 'en')
.map((language) => walk(`${language.dir}/content`, { directories: false }))
.flatten()
.uniq()
.value()

const diff = difference(nonEnglishPaths, englishPaths)
const failureMessage = `
Found ${diff.length} non-English pages without a matching English page:\n - ${diff.join('\n - ')}
Remove them with script/i18n/prune-stale-files.js and commit your changes using "git commit --no-verify".
`
expect(diff.length, failureMessage).toBe(0)
})
expect(diff.length, failureMessage).toBe(0)
}
)
})

describe('loadPageMap', () => {
Expand Down

0 comments on commit 14ca55f

Please sign in to comment.