From 46d5e1abf6e3ff004db4e0a9cab61e304fc8ea74 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Tue, 13 Jun 2023 07:49:18 -0400 Subject: [PATCH] Correct translation content from reusables too (#37722) --- lib/correct-translation-content.js | 90 ++++++++++++++++++ lib/get-data.js | 29 +++++- lib/page-data.js | 92 +------------------ middleware/contextualizers/glossaries.js | 2 +- .../get-started/quickstart/hello-world.md | 4 + .../get-started/quickstart/hello-world.md | 11 ++- .../reusables/gated-features/more-info.md | 1 + tests/rendering-fixtures/translations.js | 23 +++++ 8 files changed, 158 insertions(+), 94 deletions(-) create mode 100644 lib/correct-translation-content.js create mode 100644 tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md diff --git a/lib/correct-translation-content.js b/lib/correct-translation-content.js new file mode 100644 index 000000000000..05ae6fdd8c31 --- /dev/null +++ b/lib/correct-translation-content.js @@ -0,0 +1,90 @@ +/** + * A lot of translations have minor corruptions that will lead to rendering + * failing (and having to rely on English fallback). Many of these are + * easy to manually correct for. + * + * This function is a temporary solution to correct for these corruptions. + * It looks for easy "low hanging fruit" that we can correct for. + * + */ +export function correctTranslatedContentStrings(content, englishContent, debug = false) { + // A lot of translations have corruptions around the AUTOTITLE links. + // We've requested that these are corrected back but as a temporary + // solution we'll manually recover now. + // See internal issue #2762 + // In late 2023, search in the translations repos if these things are + // still happening and if not, the following lines can be removed. + content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]') + content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]') + content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]') + content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](') + + // A lot of Liquid tags lose their linebreak after the `}` which can + // result in formatting problems, especially around Markdown tables. + // This code here, compares each Liquid statement, in the translation, + // and tests if it appears like that but with a newline in the English. + // English example: + // + // {%- ifversion ghes %} + // | Thing | ✔️ | + // {%- endif %} + // + // Translation example: + // + // {%- ifversion ghes %} | Thing | ✔️ | {%- endif %} + // + // There exists the risk that different Liquid statements gets compared + // different Liquid statements in the English, but the risk is worth + // taking because even if this accidentally introduces a newline, it's + // unlikely to cause a problem. At worst that a sentence displays on its + // own paragraph. + content = content.replace(/\{%(.+?)%\} /g, (match) => { + if (match.lastIndexOf('{%') > 0) { + // For example: + // + // `{% bla bla %}, and {% foo bar %} ` + // + // Our regex is not greedy, but technically, if you look closely + // you'll see this is the first match that starts with `{%` and + // ends with `%} `. Let's skip these. + return match + } + + const withLinebreak = match.slice(0, -1) + '\n' + if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) { + return withLinebreak + } + return match + }) + // The above corrections deepend on looking for `{% foo %} ` and replacing + // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English + // content and `{% foo %} ` was *not*. + // However we see a lot of cases of this: + // + // ... {% endif %} | First Column ... + // + // Which needs to become this: + // + // ... {% endif %} + // | First Column ... + // + // And since `{% endif %}` is such a common Liquid tag we can't reply + // on lookig for it with `{% endif %}\n` in the English content. + content = content.replace(/\{% endif %\} \| /g, (match) => { + const potentiallyBetter = '{% endif %}\n| ' + if (englishContent.includes(potentiallyBetter)) { + return potentiallyBetter + } + return match + }) + + // All too often we see translations that look like this: + // + // | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %} + // + // Yes, that's one very long line. Notice how all the necessary linebreaks + // are suddenly gone. + content = content.replaceAll(' | | ', ' |\n| ') + + return content +} diff --git a/lib/get-data.js b/lib/get-data.js index bc0f61588e4c..f2ee1964323d 100644 --- a/lib/get-data.js +++ b/lib/get-data.js @@ -6,6 +6,7 @@ import matter from 'gray-matter' import { merge, get } from 'lodash-es' import languages from './languages.js' +import { correctTranslatedContentStrings } from './correct-translation-content.js' // If you run `export DEBUG_JIT_DATA_READS=true` in your terminal, // next time it will mention every file it reads from disk. @@ -175,7 +176,33 @@ function getDataByDir(dottedPath, dir, englishRoot) { fullPath.push(...split) fullPath.push(`${nakedname}.md`) const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot) - return matter(markdown).content + let { content } = matter(markdown) + if (dir !== englishRoot) { + // If we're reading a translation, we need to replace the possible + // corruptions. For example `[AUTOTITLE"을](/foo/bar)`. + // To do this we'll need the English equivalent + let englishContent = content + try { + englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot) + } catch (error) { + // In some real but rare cases a reusable doesn't exist in English. + // At all. + // This can happen when the translation is really out of date. + // You might have an old `docs-internal.locale/content/**/*.md` + // file that mentions `{% data reusables.foo.bar %}`. And it's + // working fine, except none of that exists in English. + // If this is the case, we still want to executed the + // correctTranslatedContentStrings() function, but we can't + // genuinely give it the English equivalent content, which it + // sometimes uses to correct some Liquid tags. At least other + // good corrections might happen. + if (error.code !== 'ENOENT') { + throw error + } + } + content = correctTranslatedContentStrings(content, englishContent) + } + return content } // E.g. {% data ui.pages.foo.bar %} diff --git a/lib/page-data.js b/lib/page-data.js index f41ee783724c..34daf75d5b1b 100644 --- a/lib/page-data.js +++ b/lib/page-data.js @@ -7,6 +7,7 @@ import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js' import readFileContents from './read-file-contents.js' import Page from './page.js' import frontmatterSchema from './frontmatter.js' +import { correctTranslatedContentStrings } from './correct-translation-content.js' // If you run `export DEBUG_TRANSLATION_FALLBACKS=true` in your terminal, // every time a translation file fails to initialize we fall back to English @@ -181,97 +182,6 @@ async function translateTree(dir, langObj, enTree) { return item } -/** - * A lot of translations have minor corruptions that will lead to rendering - * failing (and having to rely on English fallback). Many of these are - * easy to manually correct for. - * - * This function is a temporary solution to correct for these corruptions. - * It looks for easy "low hanging fruit" that we can correct for. - * - */ -export function correctTranslatedContentStrings(content, englishContent, debug = false) { - // A lot of translations have corruptions around the AUTOTITLE links. - // We've requested that these are corrected back but as a temporary - // solution we'll manually recover now. - // See internal issue #2762 - // In late 2023, search in the translations repos if these things are - // still happening and if not, the following lines can be removed. - content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]') - content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]') - content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]') - content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](') - - // A lot of Liquid tags lose their linebreak after the `}` which can - // result in formatting problems, especially around Markdown tables. - // This code here, compares each Liquid statement, in the translation, - // and tests if it appears like that but with a newline in the English. - // English example: - // - // {%- ifversion ghes %} - // | Thing | ✔️ | - // {%- endif %} - // - // Translation example: - // - // {%- ifversion ghes %} | Thing | ✔️ | {%- endif %} - // - // There exists the risk that different Liquid statements gets compared - // different Liquid statements in the English, but the risk is worth - // taking because even if this accidentally introduces a newline, it's - // unlikely to cause a problem. At worst that a sentence displays on its - // own paragraph. - content = content.replace(/\{%(.+?)%\} /g, (match) => { - if (match.lastIndexOf('{%') > 0) { - // For example: - // - // `{% bla bla %}, and {% foo bar %} ` - // - // Our regex is not greedy, but technically, if you look closely - // you'll see this is the first match that starts with `{%` and - // ends with `%} `. Let's skip these. - return match - } - - const withLinebreak = match.slice(0, -1) + '\n' - if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) { - return withLinebreak - } - return match - }) - // The above corrections deepend on looking for `{% foo %} ` and replacing - // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English - // content and `{% foo %} ` was *not*. - // However we see a lot of cases of this: - // - // ... {% endif %} | First Column ... - // - // Which needs to become this: - // - // ... {% endif %} - // | First Column ... - // - // And since `{% endif %}` is such a common Liquid tag we can't reply - // on lookig for it with `{% endif %}\n` in the English content. - content = content.replace(/\{% endif %\} \| /g, (match) => { - const potentiallyBetter = '{% endif %}\n| ' - if (englishContent.includes(potentiallyBetter)) { - return potentiallyBetter - } - return match - }) - - // All too often we see translations that look like this: - // - // | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %} - // - // Yes, that's one very long line. Notice how all the necessary linebreaks - // are suddenly gone. - content = content.replaceAll(' | | ', ' |\n| ') - - return content -} - /** * The siteTree is a nested object with pages for every language and version, useful for nav because it * contains parent, child, and sibling relationships: diff --git a/middleware/contextualizers/glossaries.js b/middleware/contextualizers/glossaries.js index b6ba9434aafb..c55ff244099a 100644 --- a/middleware/contextualizers/glossaries.js +++ b/middleware/contextualizers/glossaries.js @@ -1,7 +1,7 @@ import { getDataByLanguage } from '../../lib/get-data.js' import liquid from '../../lib/render-content/liquid.js' import { executeWithFallback } from '../../lib/render-with-fallback.js' -import { correctTranslatedContentStrings } from '../../lib/page-data.js' +import { correctTranslatedContentStrings } from '../../lib/correct-translation-content.js' export default async function glossaries(req, res, next) { if (!req.pagePath.endsWith('get-started/quickstart/github-glossary')) return next() diff --git a/tests/fixtures/content/get-started/quickstart/hello-world.md b/tests/fixtures/content/get-started/quickstart/hello-world.md index 6b9dcb671246..78a26d2a2468 100644 --- a/tests/fixtures/content/get-started/quickstart/hello-world.md +++ b/tests/fixtures/content/get-started/quickstart/hello-world.md @@ -30,3 +30,7 @@ like "Enterprise Server X.Y". It should change the above sentence. "[AUTOTITLE](/get-started/quickstart/dynamic-title)" "[AUTOTITLE](/get-started/foo/cross-version-linking)" + +## Use of a reusable that might have auto-title links + +{% data reusables.gated-features.more-info %} diff --git a/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md b/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md index 7765709879fb..47e8afda1db9 100644 --- a/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md +++ b/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md @@ -11,6 +11,15 @@ intro: 'この Hello World 演習に従って、{% data variables.product.produc ## 可変タイトルのページへのリンク -"[AUTOTITLE](/get-started/quickstart/dynamic-title)" +Hi, if you look carefully at this next line, the way the auto-title keyword +is used, is wrong. The quotation is inside the square bracket! Naughty. +But this is a common occurrence in translations and we need to smartly +recover from it. + +["AUTOTITLE](/get-started/quickstart/dynamic-title)" "[AUTOTITLE](/get-started/foo/cross-version-linking)" + +## Use of a reusable that might have auto-title links + +{% data reusables.gated-features.more-info %} diff --git a/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md b/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md new file mode 100644 index 000000000000..a77b2f14d55e --- /dev/null +++ b/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md @@ -0,0 +1 @@ +{% ifversion fpt or ghec %}詳細については, see ["AUTOTITLE](/get-started/quickstart/hello-world)."{% endif %} diff --git a/tests/rendering-fixtures/translations.js b/tests/rendering-fixtures/translations.js index f12f4ce3fde0..e5d35a8c327a 100644 --- a/tests/rendering-fixtures/translations.js +++ b/tests/rendering-fixtures/translations.js @@ -88,4 +88,27 @@ describe('translations', () => { expect(tds[1]).toBe('Present') } }) + + test('automatic correction of bad AUTOTITLE in reusables', async () => { + const $ = await getDOM('/ja/get-started/quickstart/hello-world') + const links = $('#article-contents a[href]') + const texts = links.map((i, element) => $(element).text()).get() + // That Japanese page uses AUTOTITLE links. Both in the main `.md` file + // but also inside a reusable. + // E.g. `["AUTOTITLE](/get-started/quickstart/hello-world)."` + // If we didn't do the necessary string corrections on translations' + // content and reusables what *would* remain is a HTML link that + // would look like this: + // + // "AUTOTITLE + // + // This test makes sure no such string is left in any of the article + // content links. + // Note that, in English, it's not acceptable to have such a piece of + // Markdown. It would not be let into `main` by our CI checks. But + // by their nature, translations are not checked by CI in the same way. + // Its "flaws" have to be corrected at runtime. + const stillAutotitle = texts.filter((text) => /autotitle/i.test(text)) + expect(stillAutotitle.length).toBe(0) + }) })