Skip to content

Commit

Permalink
UTF-8 fixes (disallow U+??FFFE noncharacters, and clarify comments).
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 240885452
  • Loading branch information
twifkak committed Mar 29, 2019
1 parent 0b348d8 commit 3097864
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
17 changes: 10 additions & 7 deletions transformer/utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@ import (
"github.com/pkg/errors"
)

// False if the rune is known to cause parse errors during preprocessing, per
// False if the code point is known to cause parse errors during HTML
// preprocessing, per
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
//
// Also false for U+0000 NULL, as that causes parse errors everywhere except
// CDATA, and for defense in depth we don't assume that all parsers interpret
// this properly.
func isHTMLValidInternal(r rune) bool {
// In order to reduce the average number of comparisons per rune, test
// for validity (OR of ANDs) rather than invalidity (AND of ORs), and
// check popular ranges first.
// In order to reduce the average number of comparisons per code point,
// test for validity (OR of ANDs) rather than invalidity (AND of ORs),
// and check popular ranges first.
return (
// Invalid chars:
// U+0000 NULL, per above logic.
Expand All @@ -27,9 +28,9 @@ func isHTMLValidInternal(r rune) bool {
(r > 0xDFFF && r < 0xFDD0) ||
// U+FDD0 through U+FDEF, per https://infra.spec.whatwg.org/#noncharacter.
(r > 0xFDEF && r < 0xFFFE) ||
// U+FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter.
(r > 0xFFFF && r < 0x10FFFF && r & 0xFFFF != 0xFFFF))
// Maybe U+110000 and higher? These codepoints are currently undefined, so best not assume.
// U+??FFFE and U+??FFFF, per https://infra.spec.whatwg.org/#noncharacter.
(r > 0xFFFF && r < 0x10FFFE && r & 0xFFFE != 0xFFFE))
// There are no code points greater than U+10FFFF.
}

// Overrideable for test.
Expand All @@ -42,6 +43,8 @@ func validateUTF8ForHTML(html string) error {
pos := 0
for pos < len(html) {
r, width := utf8.DecodeRuneInString(html[pos:])
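// Check that the code point wasn't ill-formed. utf8.RuneError == '\uFFFD',
// which is itself a valid code point that may legitimately appear in the
// input (encoded as 3 bytes), so the rune alone is ambiguous. An invalid
// encoding is reported as RuneError with a width of 1, so we need to
// check for a mismatched width, too.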
if r == utf8.RuneError && width < 2 {
return errors.Errorf("invalid UTF-8 at byte position %d", pos)
}
Expand Down
7 changes: 4 additions & 3 deletions transformer/utf8_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ var minimumValidAMP = tt.Concat(
tt.NoscriptAMPBoilerplate, "</head><body></body></html>",
)

// True if the rune is known to cause parse errors during preprocessing, per
// True if the code point is known to cause parse errors during HTML
// preprocessing, per
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream,
// or if it is U+0000 NULL.
//
Expand All @@ -27,8 +28,8 @@ func isHTMLInvalid(r rune) bool {
// https://infra.spec.whatwg.org/#surrogate
(r >= 0xD800 && r <= 0xDFFF) ||
// https://infra.spec.whatwg.org/#noncharacter
(r >= 0xFDD0 && r <= 0xFDEF) || r == 0xFFFE ||
(r >= 0xFFFF && r <= 0x10FFFF && r & 0xFFFF == 0xFFFF) ||
(r >= 0xFDD0 && r <= 0xFDEF) ||
(r >= 0xFFFE && r <= 0x10FFFF && r & 0xFFFE == 0xFFFE) ||
// http://unicode.org/glossary/#codespace
(r >= 0x110000))
}
Expand Down

0 comments on commit 3097864

Please sign in to comment.