From 309786479689ceb785bfb104caacdcc08e39c630 Mon Sep 17 00:00:00 2001 From: Devin Mullins Date: Thu, 28 Mar 2019 17:10:49 -0700 Subject: [PATCH] UTF-8 fixes (disallow U+??FFFE noncharacters, and clarify comments). PiperOrigin-RevId: 240885452 --- transformer/utf8.go | 17 ++++++++++------- transformer/utf8_test.go | 7 ++++--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/transformer/utf8.go b/transformer/utf8.go index 064d75112..06bc7b3a3 100644 --- a/transformer/utf8.go +++ b/transformer/utf8.go @@ -6,16 +6,17 @@ import ( "github.com/pkg/errors" ) -// False if the rune is known to cause parse errors during preprocessing, per +// False if the code point is known to cause parse errors during HTML +// preprocessing, per // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream // // Also false for U+0000 NULL, as that causes parse errors everywhere except // CDATA, and for defense in depth we don't assume that all parsers interpret // this properly. func isHTMLValidInternal(r rune) bool { - // In order to reduce the average number of comparisons per rune, test - // for validity (OR of ANDs) rather than invalidity (AND of ORs), and - // check popular ranges first. + // In order to reduce the average number of comparisons per code point, + // test for validity (OR of ANDs) rather than invalidity (AND of ORs), + // and check popular ranges first. return ( // Invalid chars: // U+0000 NULL, per above logic. @@ -27,9 +28,9 @@ func isHTMLValidInternal(r rune) bool { (r > 0xDFFF && r < 0xFDD0) || // U+FDD0 through U+FDEF, per https://infra.spec.whatwg.org/#noncharacter. (r > 0xFDEF && r < 0xFFFE) || - // U+FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter. - (r > 0xFFFF && r < 0x10FFFF && r & 0xFFFF != 0xFFFF)) - // Maybe U+110000 and higher? These codepoints are currently undefined, so best not assume. + // U+??FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter. 
+ (r > 0xFFFF && r < 0x10FFFE && r & 0xFFFE != 0xFFFE)) + // There are no code points greater than U+10FFFF. } // Overrideable for test. @@ -42,6 +43,8 @@ func validateUTF8ForHTML(html string) error { pos := 0 for pos < len(html) { r, width := utf8.DecodeRuneInString(html[pos:]) + // Check that the byte sequence wasn't ill-formed. utf8.RuneError == '\uFFFD' + // is also a valid code point (width 3), so a decode failure needs width < 2, too. if r == utf8.RuneError && width < 2 { return errors.Errorf("invalid UTF-8 at byte position %d", pos) } diff --git a/transformer/utf8_test.go b/transformer/utf8_test.go index 92f2bce31..83edc67c2 100644 --- a/transformer/utf8_test.go +++ b/transformer/utf8_test.go @@ -13,7 +13,8 @@ var minimumValidAMP = tt.Concat( tt.NoscriptAMPBoilerplate, " ", ) -// True if the rune is known to cause parse errors during preprocessing, per +// True if the code point is known to cause parse errors during HTML +// preprocessing, per // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream, // or if it is U+0000 NULL. // @@ -27,8 +28,8 @@ func isHTMLInvalid(r rune) bool { // https://infra.spec.whatwg.org/#surrogate (r >= 0xD800 && r <= 0xDFFF) || // https://infra.spec.whatwg.org/#noncharacter - (r >= 0xFDD0 && r <= 0xFDEF) || r == 0xFFFE || - (r >= 0xFFFF && r <= 0x10FFFF && r & 0xFFFF == 0xFFFF) || + (r >= 0xFDD0 && r <= 0xFDEF) || + (r >= 0xFFFE && r <= 0x10FFFF && r & 0xFFFE == 0xFFFE) || // http://unicode.org/glossary/#codespace (r >= 0x110000)) }