Tweak diagnostics with invalid UTF-8 so they can pass over the wire (#237)

apparentlymart · web-flow · commit 0488e0874634 · 2022-11-22T10:01:08.000-05:00
A correct provider should only ever return valid UTF-8 strings as the
diagnostic Summary or Detail, but since diagnostics tend to be describing
unexpected situations and are often derived from errors in downstream
libraries it's possible that a provider might incorrectly return incorrect
garbage as part of a diagnostic message.

The protobuf serializer rejects non-UTF8 strings with a generic message
that is unhelpful to end-users:
    string field contains invalid UTF-8

Here we make the compromise that it's better to make a best effort to
return a diagnostic that is probably only partially invalid so that the
end user has a chance of still getting some clue about what problem
occurred. The new helper functions here achieve that by replacing any
invalid bytes with a correctly-encoded version of the Unicode Replacement
Character, which will then allow the string to pass over the wire protocol
successfully and hopefully end up as an obviously-invalid character in
the CLI output or web UI that's rendering the diagnostics.

This does introduce some slight additional overhead when returning
responses, but it should be immaterial for any response that doesn't
include any diagnostics, relatively minor for responses that include
valid diagnostics, and only markedly expensive for a diagnostic string
with invalid bytes that will therefore need to be re-encoded on a
rune-by-rune basis.
diff --git a/.changelog/237.txt b/.changelog/237.txt
@@ -0,0 +1,7 @@
+```release-note:bug
+tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
+```
+
+```release-note:bug
+tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
+```
diff --git a/tfprotov5/internal/toproto/diagnostic.go b/tfprotov5/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
 package toproto
 
 import (
+	"unicode/utf8"
+
 	"github.com/hashicorp/terraform-plugin-go/tfprotov5"
 	"github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5"
 )
 
 func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) {
 	diag := &tfplugin5.Diagnostic{
 		Severity: Diagnostic_Severity(in.Severity),
-		Summary:  in.Summary,
-		Detail:   in.Detail,
+		Summary:  forceValidUTF8(in.Summary),
+		Detail:   forceValidUTF8(in.Detail),
 	}
 	if in.Attribute != nil {
 		attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) {
 	return diagnostics, nil
 }
 
+// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
+// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
+// the Unicode Replacement Character (\uFFFD).
+//
+// The protobuf serialization library will reject invalid UTF-8 with an
+// unhelpful error message:
+//
+//	string field contains invalid UTF-8
+//
+// Passing a string result through this function makes invalid UTF-8 instead
+// emerge as placeholder characters on the other side of the wire protocol,
+// giving a better chance of still returning a partially-legible message
+// instead of a generic character encoding error.
+//
+// This is intended for user-facing messages such as diagnostic summary and
+// detail messages, where Terraform will just treat the value as opaque and
+// it's ultimately up to the user and their terminal or web browser to
+// interpret the result. Don't use this for strings that have machine-readable
+// meaning.
+func forceValidUTF8(s string) string {
+	// Most strings that pass through here will already be valid UTF-8 and
+	// utf8.ValidString has a fast path which will beat our rune-by-rune
+	// analysis below, so it's worth the cost of walking the string twice
+	// in the rarer invalid case.
+	if utf8.ValidString(s) {
+		return s
+	}
+
+	// If we get down here then we know there's at least one invalid UTF-8
+	// sequence in the string, so in this slow path we'll reconstruct the
+	// string one rune at a time, guaranteeing that we'll only write valid
+	// UTF-8 sequences into the resulting buffer.
+	//
+	// Any invalid string will grow at least a little larger as a result of
+	// this operation because we'll be replacing each invalid byte with
+	// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
+	// the replacement character \uFFFD. 9 is a magic number giving room for
+	// three such expansions without any further allocation.
+	ret := make([]byte, 0, len(s)+9)
+	for {
+		// If the first byte in s is not the start of a valid UTF-8 sequence
+		// then the following will return utf8.RuneError, 1, where
+		// utf8.RuneError is the unicode replacement character.
+		r, advance := utf8.DecodeRuneInString(s)
+		if advance == 0 {
+			break
+		}
+		s = s[advance:]
+		ret = utf8.AppendRune(ret, r)
+	}
+	return string(ret)
+}
+
 // we have to say this next thing to get golint to stop yelling at us about the
 // underscores in the function names. We want the function names to match
 // actually-generated code, so it feels like fair play. It's just a shame we
diff --git a/tfprotov5/internal/toproto/diagnostic_test.go b/tfprotov5/internal/toproto/diagnostic_test.go
@@ -0,0 +1,58 @@
+package toproto
+
+import (
+	"testing"
+)
+
+func TestForceValidUTF8(t *testing.T) {
+	tests := []struct {
+		Input string
+		Want  string
+	}{
+		{
+			"hello",
+			"hello",
+		},
+		{
+			"こんにちは",
+			"こんにちは",
+		},
+		{
+			"baﬄe", // NOTE: "ﬄ" is a single-character ligature
+			"baﬄe", // ligature is preserved exactly
+		},
+		{
+			"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
+			"wé́́é́́é́́!", // diacritics are preserved exactly
+		},
+		{
+			"😸😾", // Astral-plane characters
+			"😸😾", // preserved exactly
+		},
+		{
+			"\xff\xff",     // neither byte is valid UTF-8
+			"\ufffd\ufffd", // both are replaced by replacement character
+		},
+		{
+			"\xff\xff\xff\xff\xff",           // more than three invalid bytes
+			"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
+		},
+		{
+			"t\xffe\xffst",     // invalid bytes interleaved with other content
+			"t\ufffde\ufffdst", // the valid content is preserved
+		},
+		{
+			"\xffこんにちは\xffこんにちは",     // invalid bytes interacting with multibyte sequences
+			"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.Input, func(t *testing.T) {
+			got := forceValidUTF8(test.Input)
+			if got != test.Want {
+				t.Errorf("wrong result\ngot:  %q\nwant: %q", got, test.Want)
+			}
+		})
+	}
+}
diff --git a/tfprotov6/internal/toproto/diagnostic.go b/tfprotov6/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
 package toproto
 
 import (
+	"unicode/utf8"
+
 	"github.com/hashicorp/terraform-plugin-go/tfprotov6"
 	"github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6"
 )
 
 func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) {
 	diag := &tfplugin6.Diagnostic{
 		Severity: Diagnostic_Severity(in.Severity),
-		Summary:  in.Summary,
-		Detail:   in.Detail,
+		Summary:  forceValidUTF8(in.Summary),
+		Detail:   forceValidUTF8(in.Detail),
 	}
 	if in.Attribute != nil {
 		attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) {
 	return diagnostics, nil
 }
 
+// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
+// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
+// the Unicode Replacement Character (\uFFFD).
+//
+// The protobuf serialization library will reject invalid UTF-8 with an
+// unhelpful error message:
+//
+//	string field contains invalid UTF-8
+//
+// Passing a string result through this function makes invalid UTF-8 instead
+// emerge as placeholder characters on the other side of the wire protocol,
+// giving a better chance of still returning a partially-legible message
+// instead of a generic character encoding error.
+//
+// This is intended for user-facing messages such as diagnostic summary and
+// detail messages, where Terraform will just treat the value as opaque and
+// it's ultimately up to the user and their terminal or web browser to
+// interpret the result. Don't use this for strings that have machine-readable
+// meaning.
+func forceValidUTF8(s string) string {
+	// Most strings that pass through here will already be valid UTF-8 and
+	// utf8.ValidString has a fast path which will beat our rune-by-rune
+	// analysis below, so it's worth the cost of walking the string twice
+	// in the rarer invalid case.
+	if utf8.ValidString(s) {
+		return s
+	}
+
+	// If we get down here then we know there's at least one invalid UTF-8
+	// sequence in the string, so in this slow path we'll reconstruct the
+	// string one rune at a time, guaranteeing that we'll only write valid
+	// UTF-8 sequences into the resulting buffer.
+	//
+	// Any invalid string will grow at least a little larger as a result of
+	// this operation because we'll be replacing each invalid byte with
+	// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
+	// the replacement character \uFFFD. 9 is a magic number giving room for
+	// three such expansions without any further allocation.
+	ret := make([]byte, 0, len(s)+9)
+	for {
+		// If the first byte in s is not the start of a valid UTF-8 sequence
+		// then the following will return utf8.RuneError, 1, where
+		// utf8.RuneError is the unicode replacement character.
+		r, advance := utf8.DecodeRuneInString(s)
+		if advance == 0 {
+			break
+		}
+		s = s[advance:]
+		ret = utf8.AppendRune(ret, r)
+	}
+	return string(ret)
+}
+
 // we have to say this next thing to get golint to stop yelling at us about the
 // underscores in the function names. We want the function names to match
 // actually-generated code, so it feels like fair play. It's just a shame we
diff --git a/tfprotov6/internal/toproto/diagnostic_test.go b/tfprotov6/internal/toproto/diagnostic_test.go
@@ -0,0 +1,58 @@
+package toproto
+
+import (
+	"testing"
+)
+
+func TestForceValidUTF8(t *testing.T) {
+	tests := []struct {
+		Input string
+		Want  string
+	}{
+		{
+			"hello",
+			"hello",
+		},
+		{
+			"こんにちは",
+			"こんにちは",
+		},
+		{
+			"baﬄe", // NOTE: "ﬄ" is a single-character ligature
+			"baﬄe", // ligature is preserved exactly
+		},
+		{
+			"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
+			"wé́́é́́é́́!", // diacritics are preserved exactly
+		},
+		{
+			"😸😾", // Astral-plane characters
+			"😸😾", // preserved exactly
+		},
+		{
+			"\xff\xff",     // neither byte is valid UTF-8
+			"\ufffd\ufffd", // both are replaced by replacement character
+		},
+		{
+			"\xff\xff\xff\xff\xff",           // more than three invalid bytes
+			"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
+		},
+		{
+			"t\xffe\xffst",     // invalid bytes interleaved with other content
+			"t\ufffde\ufffdst", // the valid content is preserved
+		},
+		{
+			"\xffこんにちは\xffこんにちは",     // invalid bytes interacting with multibyte sequences
+			"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.Input, func(t *testing.T) {
+			got := forceValidUTF8(test.Input)
+			if got != test.Want {
+				t.Errorf("wrong result\ngot:  %q\nwant: %q", got, test.Want)
+			}
+		})
+	}
+}