Skip to content

Commit

Permalink
Correct translation content from reusables too (#37722)
Browse files Browse the repository at this point in the history
  • Loading branch information
peterbe authored Jun 13, 2023
1 parent dc54233 commit 46d5e1a
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 94 deletions.
90 changes: 90 additions & 0 deletions lib/correct-translation-content.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/**
* A lot of translations have minor corruptions that will lead to rendering
* failing (and having to rely on English fallback). Many of these are
* easy to manually correct for.
*
* This function is a temporary solution to correct for these corruptions.
* It looks for easy "low hanging fruit" that we can correct for.
*
*/
export function correctTranslatedContentStrings(content, englishContent, debug = false) {
// A lot of translations have corruptions around the AUTOTITLE links.
// We've requested that these are corrected back but as a temporary
// solution we'll manually recover now.
// See internal issue #2762
// In late 2023, search in the translations repos if these things are
// still happening and if not, the following lines can be removed.
content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')

// A lot of Liquid tags lose their linebreak after the `}` which can
// result in formatting problems, especially around Markdown tables.
// This code here, compares each Liquid statement, in the translation,
// and tests if it appears like that but with a newline in the English.
// English example:
//
// {%- ifversion ghes %}
// | Thing | ✔️ |
// {%- endif %}
//
// Translation example:
//
// {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
//
// There exists the risk that different Liquid statements gets compared
// different Liquid statements in the English, but the risk is worth
// taking because even if this accidentally introduces a newline, it's
// unlikely to cause a problem. At worst that a sentence displays on its
// own paragraph.
content = content.replace(/\{%(.+?)%\} /g, (match) => {
if (match.lastIndexOf('{%') > 0) {
// For example:
//
// `{% bla bla %}, and {% foo bar %} `
//
// Our regex is not greedy, but technically, if you look closely
// you'll see this is the first match that starts with `{%` and
// ends with `%} `. Let's skip these.
return match
}

const withLinebreak = match.slice(0, -1) + '\n'
if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
return withLinebreak
}
return match
})
// The above corrections deepend on looking for `{% foo %} ` and replacing
// it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
// content and `{% foo %} ` was *not*.
// However we see a lot of cases of this:
//
// ... {% endif %} | First Column ...
//
// Which needs to become this:
//
// ... {% endif %}
// | First Column ...
//
// And since `{% endif %}` is such a common Liquid tag we can't reply
// on lookig for it with `{% endif %}\n` in the English content.
content = content.replace(/\{% endif %\} \| /g, (match) => {
const potentiallyBetter = '{% endif %}\n| '
if (englishContent.includes(potentiallyBetter)) {
return potentiallyBetter
}
return match
})

// All too often we see translations that look like this:
//
// | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
//
// Yes, that's one very long line. Notice how all the necessary linebreaks
// are suddenly gone.
content = content.replaceAll(' | | ', ' |\n| ')

return content
}
29 changes: 28 additions & 1 deletion lib/get-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import matter from 'gray-matter'
import { merge, get } from 'lodash-es'

import languages from './languages.js'
import { correctTranslatedContentStrings } from './correct-translation-content.js'

// If you run `export DEBUG_JIT_DATA_READS=true` in your terminal,
// next time it will mention every file it reads from disk.
Expand Down Expand Up @@ -175,7 +176,33 @@ function getDataByDir(dottedPath, dir, englishRoot) {
fullPath.push(...split)
fullPath.push(`${nakedname}.md`)
const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot)
return matter(markdown).content
let { content } = matter(markdown)
if (dir !== englishRoot) {
// If we're reading a translation, we need to replace the possible
// corruptions. For example `[AUTOTITLE"을](/foo/bar)`.
// To do this we'll need the English equivalent
let englishContent = content
try {
englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot)
} catch (error) {
// In some real but rare cases a reusable doesn't exist in English.
// At all.
// This can happen when the translation is really out of date.
// You might have an old `docs-internal.locale/content/**/*.md`
// file that mentions `{% data reusables.foo.bar %}`. And it's
// working fine, except none of that exists in English.
// If this is the case, we still want to executed the
// correctTranslatedContentStrings() function, but we can't
// genuinely give it the English equivalent content, which it
// sometimes uses to correct some Liquid tags. At least other
// good corrections might happen.
if (error.code !== 'ENOENT') {
throw error
}
}
content = correctTranslatedContentStrings(content, englishContent)
}
return content
}

// E.g. {% data ui.pages.foo.bar %}
Expand Down
92 changes: 1 addition & 91 deletions lib/page-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
import readFileContents from './read-file-contents.js'
import Page from './page.js'
import frontmatterSchema from './frontmatter.js'
import { correctTranslatedContentStrings } from './correct-translation-content.js'

// If you run `export DEBUG_TRANSLATION_FALLBACKS=true` in your terminal,
// every time a translation file fails to initialize we fall back to English
Expand Down Expand Up @@ -181,97 +182,6 @@ async function translateTree(dir, langObj, enTree) {
return item
}

/**
* A lot of translations have minor corruptions that will lead to rendering
* failing (and having to rely on English fallback). Many of these are
* easy to manually correct for.
*
* This function is a temporary solution to correct for these corruptions.
* It looks for easy "low hanging fruit" that we can correct for.
*
*/
export function correctTranslatedContentStrings(content, englishContent, debug = false) {
// A lot of translations have corruptions around the AUTOTITLE links.
// We've requested that these are corrected back but as a temporary
// solution we'll manually recover now.
// See internal issue #2762
// In late 2023, search in the translations repos if these things are
// still happening and if not, the following lines can be removed.
content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')

// A lot of Liquid tags lose their linebreak after the `}` which can
// result in formatting problems, especially around Markdown tables.
// This code here, compares each Liquid statement, in the translation,
// and tests if it appears like that but with a newline in the English.
// English example:
//
// {%- ifversion ghes %}
// | Thing | ✔️ |
// {%- endif %}
//
// Translation example:
//
// {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
//
// There exists the risk that different Liquid statements gets compared
// different Liquid statements in the English, but the risk is worth
// taking because even if this accidentally introduces a newline, it's
// unlikely to cause a problem. At worst that a sentence displays on its
// own paragraph.
content = content.replace(/\{%(.+?)%\} /g, (match) => {
if (match.lastIndexOf('{%') > 0) {
// For example:
//
// `{% bla bla %}, and {% foo bar %} `
//
// Our regex is not greedy, but technically, if you look closely
// you'll see this is the first match that starts with `{%` and
// ends with `%} `. Let's skip these.
return match
}

const withLinebreak = match.slice(0, -1) + '\n'
if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
return withLinebreak
}
return match
})
// The above corrections deepend on looking for `{% foo %} ` and replacing
// it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
// content and `{% foo %} ` was *not*.
// However we see a lot of cases of this:
//
// ... {% endif %} | First Column ...
//
// Which needs to become this:
//
// ... {% endif %}
// | First Column ...
//
// And since `{% endif %}` is such a common Liquid tag we can't reply
// on lookig for it with `{% endif %}\n` in the English content.
content = content.replace(/\{% endif %\} \| /g, (match) => {
const potentiallyBetter = '{% endif %}\n| '
if (englishContent.includes(potentiallyBetter)) {
return potentiallyBetter
}
return match
})

// All too often we see translations that look like this:
//
// | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
//
// Yes, that's one very long line. Notice how all the necessary linebreaks
// are suddenly gone.
content = content.replaceAll(' | | ', ' |\n| ')

return content
}

/**
* The siteTree is a nested object with pages for every language and version, useful for nav because it
* contains parent, child, and sibling relationships:
Expand Down
2 changes: 1 addition & 1 deletion middleware/contextualizers/glossaries.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { getDataByLanguage } from '../../lib/get-data.js'
import liquid from '../../lib/render-content/liquid.js'
import { executeWithFallback } from '../../lib/render-with-fallback.js'
import { correctTranslatedContentStrings } from '../../lib/page-data.js'
import { correctTranslatedContentStrings } from '../../lib/correct-translation-content.js'

export default async function glossaries(req, res, next) {
if (!req.pagePath.endsWith('get-started/quickstart/github-glossary')) return next()
Expand Down
4 changes: 4 additions & 0 deletions tests/fixtures/content/get-started/quickstart/hello-world.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,7 @@ like "Enterprise Server X.Y". It should change the above sentence.
"[AUTOTITLE](/get-started/quickstart/dynamic-title)"

"[AUTOTITLE](/get-started/foo/cross-version-linking)"

## Use of a reusable that might have auto-title links

{% data reusables.gated-features.more-info %}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@ intro: 'この Hello World 演習に従って、{% data variables.product.produc
## 可変タイトルのページへのリンク

"[AUTOTITLE](/get-started/quickstart/dynamic-title)"
Hi, if you look carefully at this next line, the way the auto-title keyword
is used, is wrong. The quotation is inside the square bracket! Naughty.
But this is a common occurrence in translations and we need to smartly
recover from it.

["AUTOTITLE](/get-started/quickstart/dynamic-title)"

"[AUTOTITLE](/get-started/foo/cross-version-linking)"

## Use of a reusable that might have auto-title links

{% data reusables.gated-features.more-info %}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{% ifversion fpt or ghec %}詳細については, see ["AUTOTITLE](/get-started/quickstart/hello-world)."{% endif %}
23 changes: 23 additions & 0 deletions tests/rendering-fixtures/translations.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,27 @@ describe('translations', () => {
expect(tds[1]).toBe('Present')
}
})

test('automatic correction of bad AUTOTITLE in reusables', async () => {
const $ = await getDOM('/ja/get-started/quickstart/hello-world')
const links = $('#article-contents a[href]')
const texts = links.map((i, element) => $(element).text()).get()
// That Japanese page uses AUTOTITLE links. Both in the main `.md` file
// but also inside a reusable.
// E.g. `["AUTOTITLE](/get-started/quickstart/hello-world)."`
// If we didn't do the necessary string corrections on translations'
// content and reusables what *would* remain is a HTML link that
// would look like this:
//
// <a href="/ja/get-started/quickstart/hello-world">&quot;AUTOTITLE</a>
//
// This test makes sure no such string is left in any of the article
// content links.
// Note that, in English, it's not acceptable to have such a piece of
// Markdown. It would not be let into `main` by our CI checks. But
// by their nature, translations are not checked by CI in the same way.
// Its "flaws" have to be corrected at runtime.
const stillAutotitle = texts.filter((text) => /autotitle/i.test(text))
expect(stillAutotitle.length).toBe(0)
})
})

0 comments on commit 46d5e1a

Please sign in to comment.