From 46d5e1abf6e3ff004db4e0a9cab61e304fc8ea74 Mon Sep 17 00:00:00 2001
From: Peter Bengtsson <peterbe@github.com>
Date: Tue, 13 Jun 2023 07:49:18 -0400
Subject: [PATCH] Correct translation content from reusables too (#37722)

---
 lib/correct-translation-content.js            | 90 ++++++++++++++++++
 lib/get-data.js                               | 29 +++++-
 lib/page-data.js                              | 92 +------------------
 middleware/contextualizers/glossaries.js      |  2 +-
 .../get-started/quickstart/hello-world.md     |  4 +
 .../get-started/quickstart/hello-world.md     | 11 ++-
 .../reusables/gated-features/more-info.md     |  1 +
 tests/rendering-fixtures/translations.js      | 23 +++++
 8 files changed, 158 insertions(+), 94 deletions(-)
 create mode 100644 lib/correct-translation-content.js
 create mode 100644 tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md

diff --git a/lib/correct-translation-content.js b/lib/correct-translation-content.js
new file mode 100644
index 000000000000..05ae6fdd8c31
--- /dev/null
+++ b/lib/correct-translation-content.js
@@ -0,0 +1,90 @@
+/**
+ * A lot of translations have minor corruptions that will lead to rendering
+ * failing (and having to rely on English fallback). Many of these are
+ * easy to manually correct for.
+ *
+ * This function is a temporary solution to correct for these corruptions.
+ * It looks for easy "low hanging fruit" that we can correct for.
+ *
+ */
+export function correctTranslatedContentStrings(content, englishContent, debug = false) {
+  // A lot of translations have corruptions around the AUTOTITLE links.
+  // We've requested that these are corrected back but as a temporary
+  // solution we'll manually recover now.
+  // See internal issue #2762
+  // In late 2023, search in the translations repos if these things are
+  // still happening and if not, the following lines can be removed.
+  content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
+  content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
+  content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
+  content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
+
+  // A lot of Liquid tags lose their linebreak after the `}` which can
+  // result in formatting problems, especially around Markdown tables.
+  // This code here, compares each Liquid statement, in the translation,
+  // and tests if it appears like that but with a newline in the English.
+  // English example:
+  //
+  //    {%- ifversion ghes %}
+  //    | Thing | ✔️ |
+  //    {%- endif %}
+  //
+  // Translation example:
+  //
+  //    {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
+  //
+  // There exists the risk that different Liquid statements gets compared
+  // different Liquid statements in the English, but the risk is worth
+  // taking because even if this accidentally introduces a newline, it's
+  // unlikely to cause a problem. At worst that a sentence displays on its
+  // own paragraph.
+  content = content.replace(/\{%(.+?)%\} /g, (match) => {
+    if (match.lastIndexOf('{%') > 0) {
+      // For example:
+      //
+      //    `{% bla bla %}, and {% foo bar %} `
+      //
+      // Our regex is not greedy, but technically, if you look closely
+      // you'll see this is the first match that starts with `{%` and
+      // ends with `%} `. Let's skip these.
+      return match
+    }
+
+    const withLinebreak = match.slice(0, -1) + '\n'
+    if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
+      return withLinebreak
+    }
+    return match
+  })
+  // The above corrections deepend on looking for `{% foo %} ` and replacing
+  // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
+  // content and `{% foo %} ` was *not*.
+  // However we see a lot of cases of this:
+  //
+  //    ... {% endif %} | First Column ...
+  //
+  // Which needs to become this:
+  //
+  //    ... {% endif %}
+  //    | First Column ...
+  //
+  // And since `{% endif %}` is such a common Liquid tag we can't reply
+  // on lookig for it with `{% endif %}\n` in the English content.
+  content = content.replace(/\{% endif %\} \| /g, (match) => {
+    const potentiallyBetter = '{% endif %}\n| '
+    if (englishContent.includes(potentiallyBetter)) {
+      return potentiallyBetter
+    }
+    return match
+  })
+
+  // All too often we see translations that look like this:
+  //
+  //   | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden  | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
+  //
+  // Yes, that's one very long line. Notice how all the necessary linebreaks
+  // are suddenly gone.
+  content = content.replaceAll(' | | ', ' |\n| ')
+
+  return content
+}
diff --git a/lib/get-data.js b/lib/get-data.js
index bc0f61588e4c..f2ee1964323d 100644
--- a/lib/get-data.js
+++ b/lib/get-data.js
@@ -6,6 +6,7 @@ import matter from 'gray-matter'
 import { merge, get } from 'lodash-es'
 
 import languages from './languages.js'
+import { correctTranslatedContentStrings } from './correct-translation-content.js'
 
 // If you run `export DEBUG_JIT_DATA_READS=true` in your terminal,
 // next time it will mention every file it reads from disk.
@@ -175,7 +176,33 @@ function getDataByDir(dottedPath, dir, englishRoot) {
     fullPath.push(...split)
     fullPath.push(`${nakedname}.md`)
     const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot)
-    return matter(markdown).content
+    let { content } = matter(markdown)
+    if (dir !== englishRoot) {
+      // If we're reading a translation, we need to replace the possible
+      // corruptions. For example `[AUTOTITLE"을](/foo/bar)`.
+      // To do this we'll need the English equivalent
+      let englishContent = content
+      try {
+        englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot)
+      } catch (error) {
+        // In some real but rare cases a reusable doesn't exist in English.
+        // At all.
+        // This can happen when the translation is really out of date.
+        // You might have an old `docs-internal.locale/content/**/*.md`
+        // file that mentions `{% data reusables.foo.bar %}`. And it's
+        // working fine, except none of that exists in English.
+        // If this is the case, we still want to executed the
+        // correctTranslatedContentStrings() function, but we can't
+        // genuinely give it the English equivalent content, which it
+        // sometimes uses to correct some Liquid tags. At least other
+        // good corrections might happen.
+        if (error.code !== 'ENOENT') {
+          throw error
+        }
+      }
+      content = correctTranslatedContentStrings(content, englishContent)
+    }
+    return content
   }
 
   // E.g. {% data ui.pages.foo.bar %}
diff --git a/lib/page-data.js b/lib/page-data.js
index f41ee783724c..34daf75d5b1b 100644
--- a/lib/page-data.js
+++ b/lib/page-data.js
@@ -7,6 +7,7 @@ import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
 import readFileContents from './read-file-contents.js'
 import Page from './page.js'
 import frontmatterSchema from './frontmatter.js'
+import { correctTranslatedContentStrings } from './correct-translation-content.js'
 
 // If you run `export DEBUG_TRANSLATION_FALLBACKS=true` in your terminal,
 // every time a translation file fails to initialize we fall back to English
@@ -181,97 +182,6 @@ async function translateTree(dir, langObj, enTree) {
   return item
 }
 
-/**
- * A lot of translations have minor corruptions that will lead to rendering
- * failing (and having to rely on English fallback). Many of these are
- * easy to manually correct for.
- *
- * This function is a temporary solution to correct for these corruptions.
- * It looks for easy "low hanging fruit" that we can correct for.
- *
- */
-export function correctTranslatedContentStrings(content, englishContent, debug = false) {
-  // A lot of translations have corruptions around the AUTOTITLE links.
-  // We've requested that these are corrected back but as a temporary
-  // solution we'll manually recover now.
-  // See internal issue #2762
-  // In late 2023, search in the translations repos if these things are
-  // still happening and if not, the following lines can be removed.
-  content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
-  content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
-  content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
-  content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
-
-  // A lot of Liquid tags lose their linebreak after the `}` which can
-  // result in formatting problems, especially around Markdown tables.
-  // This code here, compares each Liquid statement, in the translation,
-  // and tests if it appears like that but with a newline in the English.
-  // English example:
-  //
-  //    {%- ifversion ghes %}
-  //    | Thing | ✔️ |
-  //    {%- endif %}
-  //
-  // Translation example:
-  //
-  //    {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
-  //
-  // There exists the risk that different Liquid statements gets compared
-  // different Liquid statements in the English, but the risk is worth
-  // taking because even if this accidentally introduces a newline, it's
-  // unlikely to cause a problem. At worst that a sentence displays on its
-  // own paragraph.
-  content = content.replace(/\{%(.+?)%\} /g, (match) => {
-    if (match.lastIndexOf('{%') > 0) {
-      // For example:
-      //
-      //    `{% bla bla %}, and {% foo bar %} `
-      //
-      // Our regex is not greedy, but technically, if you look closely
-      // you'll see this is the first match that starts with `{%` and
-      // ends with `%} `. Let's skip these.
-      return match
-    }
-
-    const withLinebreak = match.slice(0, -1) + '\n'
-    if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
-      return withLinebreak
-    }
-    return match
-  })
-  // The above corrections deepend on looking for `{% foo %} ` and replacing
-  // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
-  // content and `{% foo %} ` was *not*.
-  // However we see a lot of cases of this:
-  //
-  //    ... {% endif %} | First Column ...
-  //
-  // Which needs to become this:
-  //
-  //    ... {% endif %}
-  //    | First Column ...
-  //
-  // And since `{% endif %}` is such a common Liquid tag we can't reply
-  // on lookig for it with `{% endif %}\n` in the English content.
-  content = content.replace(/\{% endif %\} \| /g, (match) => {
-    const potentiallyBetter = '{% endif %}\n| '
-    if (englishContent.includes(potentiallyBetter)) {
-      return potentiallyBetter
-    }
-    return match
-  })
-
-  // All too often we see translations that look like this:
-  //
-  //   | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden  | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
-  //
-  // Yes, that's one very long line. Notice how all the necessary linebreaks
-  // are suddenly gone.
-  content = content.replaceAll(' | | ', ' |\n| ')
-
-  return content
-}
-
 /**
  * The siteTree is a nested object with pages for every language and version, useful for nav because it
  * contains parent, child, and sibling relationships:
diff --git a/middleware/contextualizers/glossaries.js b/middleware/contextualizers/glossaries.js
index b6ba9434aafb..c55ff244099a 100644
--- a/middleware/contextualizers/glossaries.js
+++ b/middleware/contextualizers/glossaries.js
@@ -1,7 +1,7 @@
 import { getDataByLanguage } from '../../lib/get-data.js'
 import liquid from '../../lib/render-content/liquid.js'
 import { executeWithFallback } from '../../lib/render-with-fallback.js'
-import { correctTranslatedContentStrings } from '../../lib/page-data.js'
+import { correctTranslatedContentStrings } from '../../lib/correct-translation-content.js'
 
 export default async function glossaries(req, res, next) {
   if (!req.pagePath.endsWith('get-started/quickstart/github-glossary')) return next()
diff --git a/tests/fixtures/content/get-started/quickstart/hello-world.md b/tests/fixtures/content/get-started/quickstart/hello-world.md
index 6b9dcb671246..78a26d2a2468 100644
--- a/tests/fixtures/content/get-started/quickstart/hello-world.md
+++ b/tests/fixtures/content/get-started/quickstart/hello-world.md
@@ -30,3 +30,7 @@ like "Enterprise Server X.Y". It should change the above sentence.
 "[AUTOTITLE](/get-started/quickstart/dynamic-title)"
 
 "[AUTOTITLE](/get-started/foo/cross-version-linking)"
+
+## Use of a reusable that might have auto-title links
+
+{% data reusables.gated-features.more-info %}
diff --git a/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md b/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md
index 7765709879fb..47e8afda1db9 100644
--- a/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md
+++ b/tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md
@@ -11,6 +11,15 @@ intro: 'この Hello World 演習に従って、{% data variables.product.produc
 
 ## 可変タイトルのページへのリンク
 
-"[AUTOTITLE](/get-started/quickstart/dynamic-title)"
+Hi, if you look carefully at this next line, the way the auto-title keyword
+is used, is wrong. The quotation is inside the square bracket! Naughty.
+But this is a common occurrence in translations and we need to smartly
+recover from it.
+
+["AUTOTITLE](/get-started/quickstart/dynamic-title)"
 
 "[AUTOTITLE](/get-started/foo/cross-version-linking)"
+
+## Use of a reusable that might have auto-title links
+
+{% data reusables.gated-features.more-info %}
diff --git a/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md b/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md
new file mode 100644
index 000000000000..a77b2f14d55e
--- /dev/null
+++ b/tests/fixtures/translations/ja-jp/data/reusables/gated-features/more-info.md
@@ -0,0 +1 @@
+{% ifversion fpt or ghec %}詳細については, see ["AUTOTITLE](/get-started/quickstart/hello-world)."{% endif %}
diff --git a/tests/rendering-fixtures/translations.js b/tests/rendering-fixtures/translations.js
index f12f4ce3fde0..e5d35a8c327a 100644
--- a/tests/rendering-fixtures/translations.js
+++ b/tests/rendering-fixtures/translations.js
@@ -88,4 +88,27 @@ describe('translations', () => {
       expect(tds[1]).toBe('Present')
     }
   })
+
+  test('automatic correction of bad AUTOTITLE in reusables', async () => {
+    const $ = await getDOM('/ja/get-started/quickstart/hello-world')
+    const links = $('#article-contents a[href]')
+    const texts = links.map((i, element) => $(element).text()).get()
+    // That Japanese page uses AUTOTITLE links. Both in the main `.md` file
+    // but also inside a reusable.
+    // E.g. `["AUTOTITLE](/get-started/quickstart/hello-world)."`
+    // If we didn't do the necessary string corrections on translations'
+    // content and reusables what *would* remain is a HTML link that
+    // would look like this:
+    //
+    //    <a href="/ja/get-started/quickstart/hello-world">&quot;AUTOTITLE</a>
+    //
+    // This test makes sure no such string is left in any of the article
+    // content links.
+    // Note that, in English, it's not acceptable to have such a piece of
+    // Markdown. It would not be let into `main` by our CI checks. But
+    // by their nature, translations are not checked by CI in the same way.
+    // Its "flaws" have to be corrected at runtime.
+    const stillAutotitle = texts.filter((text) => /autotitle/i.test(text))
+    expect(stillAutotitle.length).toBe(0)
+  })
 })