From 309786479689ceb785bfb104caacdcc08e39c630 Mon Sep 17 00:00:00 2001
From: Devin Mullins <twifkak@google.com>
Date: Thu, 28 Mar 2019 17:10:49 -0700
Subject: [PATCH] UTF-8 fixes (disallow U+??FFFE noncharacters, and clarify
 comments).

PiperOrigin-RevId: 240885452
---
 transformer/utf8.go      | 17 ++++++++++-------
 transformer/utf8_test.go |  7 ++++---
 2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/transformer/utf8.go b/transformer/utf8.go
index 064d75112..06bc7b3a3 100644
--- a/transformer/utf8.go
+++ b/transformer/utf8.go
@@ -6,16 +6,17 @@ import (
 	"github.com/pkg/errors"
 )
 
-// False if the rune is known to cause parse errors during preprocessing, per
+// False if the code point is known to cause parse errors during HTML
+// preprocessing, per
 // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
 //
 // Also false for U+0000 NULL, as that causes parse errors everywhere except
 // CDATA, and for defense in depth we don't assume that all parsers interpret
 // this properly.
 func isHTMLValidInternal(r rune) bool {
-	// In order to reduce the average number of comparisons per rune, test
-	// for validity (OR of ANDs) rather than invalidity (AND of ORs), and
-	// check popular ranges first.
+	// In order to reduce the average number of comparisons per code point,
+	// test for validity (OR of ANDs) rather than invalidity (AND of ORs),
+	// and check popular ranges first.
 	return (
 		// Invalid chars:
 		// U+0000 NULL, per above logic.
@@ -27,9 +28,9 @@ func isHTMLValidInternal(r rune) bool {
 		(r > 0xDFFF && r < 0xFDD0) ||
 		// U+FDD0 through U+FDEF, per https://infra.spec.whatwg.org/#noncharacter.
 		(r > 0xFDEF && r < 0xFFFE) ||
-		// U+FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter.
-		(r > 0xFFFF && r < 0x10FFFF && r & 0xFFFF != 0xFFFF))
-		// Maybe U+110000 and higher? These codepoints are currently undefined, so best not assume.
+		// U+??FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter.
+		(r > 0xFFFF && r < 0x10FFFE && r & 0xFFFE != 0xFFFE))
+		// There are no code points greater than U+10FFFF.
 }
 
 // Overrideable for test.
@@ -42,6 +43,8 @@ func validateUTF8ForHTML(html string) error {
 	pos := 0
 	for pos < len(html) {
 		r, width := utf8.DecodeRuneInString(html[pos:])
+		// Detect ill-formed bytes: utf8.RuneError == '\uFFFD', so a valid
+		// U+FFFD (width 3) must be told apart from a decode error (width 1).
 		if r == utf8.RuneError && width < 2 {
 			return errors.Errorf("invalid UTF-8 at byte position %d", pos)
 		}
diff --git a/transformer/utf8_test.go b/transformer/utf8_test.go
index 92f2bce31..83edc67c2 100644
--- a/transformer/utf8_test.go
+++ b/transformer/utf8_test.go
@@ -13,7 +13,8 @@ var minimumValidAMP = tt.Concat(
 	tt.NoscriptAMPBoilerplate, "</head><body></body></html>",
 )
 
-// True if the rune is known to cause parse errors during preprocessing, per
+// True if the code point is known to cause parse errors during HTML
+// preprocessing, per
 // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream,
 // or if it is U+0000 NULL.
 //
@@ -27,8 +28,8 @@ func isHTMLInvalid(r rune) bool {
 		// https://infra.spec.whatwg.org/#surrogate
 		(r >= 0xD800 && r <= 0xDFFF) ||
 		// https://infra.spec.whatwg.org/#noncharacter
-		(r >= 0xFDD0 && r <= 0xFDEF) || r == 0xFFFE ||
-		(r >= 0xFFFF && r <= 0x10FFFF && r & 0xFFFF == 0xFFFF) ||
+		(r >= 0xFDD0 && r <= 0xFDEF) ||
+		(r >= 0xFFFE && r <= 0x10FFFF && r & 0xFFFE == 0xFFFE) ||
 		// http://unicode.org/glossary/#codespace
 		(r >= 0x110000))
 }