Skip to content

Commit 0488e08

Browse files
Tweak diagnostics with invalid UTF-8 so they can pass over the wire (#237)
A correct provider should only ever return valid UTF-8 strings as the diagnostic Summary or Detail, but since diagnostics tend to be describing unexpected situations and are often derived from errors in downstream libraries it's possible that a provider might incorrectly return incorrect garbage as part of a diagnostic message. The protobuf serializer rejects non-UTF8 strings with a generic message that is unhelpful to end-users: string field contains invalid UTF-8 Here we make the compromise that it's better to make a best effort to return a diagnostic that is probably only partially invalid so that the end user has a chance of still getting some clue about what problem occurred. The new helper functions here achieve that by replacing any invalid bytes with a correctly-encoded version of the Unicode Replacement Character, which will then allow the string to pass over the wire protocol successfully and hopefully end up as an obviously-invalid character in the CLI output or web UI that's rendering the diagnostics. This does introduce some slight additional overhead when returning responses, but it should be immaterial for any response that doesn't include any diagnostics, relatively minor for responses that include valid diagnostics, and only markedly expensive for a diagnostic string with invalid bytes that will therefore need to be re-encoded on a rune-by-rune basis.
1 parent 434a0b0 commit 0488e08

File tree

5 files changed

+237
-4
lines changed

5 files changed

+237
-4
lines changed

.changelog/237.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
```release-note:bug
2+
tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
3+
```
4+
5+
```release-note:bug
6+
tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
7+
```

tfprotov5/internal/toproto/diagnostic.go

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
package toproto
22

33
import (
4+
"unicode/utf8"
5+
46
"github.com/hashicorp/terraform-plugin-go/tfprotov5"
57
"github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5"
68
)
79

810
func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) {
911
diag := &tfplugin5.Diagnostic{
1012
Severity: Diagnostic_Severity(in.Severity),
11-
Summary: in.Summary,
12-
Detail: in.Detail,
13+
Summary: forceValidUTF8(in.Summary),
14+
Detail: forceValidUTF8(in.Detail),
1315
}
1416
if in.Attribute != nil {
1517
attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) {
4143
return diagnostics, nil
4244
}
4345

46+
// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
47+
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
48+
// the Unicode Replacement Character (\uFFFD).
49+
//
50+
// The protobuf serialization library will reject invalid UTF-8 with an
51+
// unhelpful error message:
52+
//
53+
// string field contains invalid UTF-8
54+
//
55+
// Passing a string result through this function makes invalid UTF-8 instead
56+
// emerge as placeholder characters on the other side of the wire protocol,
57+
// giving a better chance of still returning a partially-legible message
58+
// instead of a generic character encoding error.
59+
//
60+
// This is intended for user-facing messages such as diagnostic summary and
61+
// detail messages, where Terraform will just treat the value as opaque and
62+
// it's ultimately up to the user and their terminal or web browser to
63+
// interpret the result. Don't use this for strings that have machine-readable
64+
// meaning.
65+
func forceValidUTF8(s string) string {
66+
// Most strings that pass through here will already be valid UTF-8 and
67+
// utf8.ValidString has a fast path which will beat our rune-by-rune
68+
// analysis below, so it's worth the cost of walking the string twice
69+
// in the rarer invalid case.
70+
if utf8.ValidString(s) {
71+
return s
72+
}
73+
74+
// If we get down here then we know there's at least one invalid UTF-8
75+
// sequence in the string, so in this slow path we'll reconstruct the
76+
// string one rune at a time, guaranteeing that we'll only write valid
77+
// UTF-8 sequences into the resulting buffer.
78+
//
79+
// Any invalid string will grow at least a little larger as a result of
80+
// this operation because we'll be replacing each invalid byte with
81+
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
82+
// the replacement character \uFFFD. 9 is a magic number giving room for
83+
// three such expansions without any further allocation.
84+
ret := make([]byte, 0, len(s)+9)
85+
for {
86+
// If the first byte in s is not the start of a valid UTF-8 sequence
87+
// then the following will return utf8.RuneError, 1, where
88+
// utf8.RuneError is the unicode replacement character.
89+
r, advance := utf8.DecodeRuneInString(s)
90+
if advance == 0 {
91+
break
92+
}
93+
s = s[advance:]
94+
ret = utf8.AppendRune(ret, r)
95+
}
96+
return string(ret)
97+
}
98+
4499
// we have to say this next thing to get golint to stop yelling at us about the
45100
// underscores in the function names. We want the function names to match
46101
// actually-generated code, so it feels like fair play. It's just a shame we
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package toproto
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestForceValidUTF8(t *testing.T) {
8+
tests := []struct {
9+
Input string
10+
Want string
11+
}{
12+
{
13+
"hello",
14+
"hello",
15+
},
16+
{
17+
"こんにちは",
18+
"こんにちは",
19+
},
20+
{
21+
"baffle", // NOTE: "ffl" is a single-character ligature
22+
"baffle", // ligature is preserved exactly
23+
},
24+
{
25+
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
26+
"wé́́é́́é́́!", // diacritics are preserved exactly
27+
},
28+
{
29+
"😸😾", // Astral-plane characters
30+
"😸😾", // preserved exactly
31+
},
32+
{
33+
"\xff\xff", // neither byte is valid UTF-8
34+
"\ufffd\ufffd", // both are replaced by replacement character
35+
},
36+
{
37+
"\xff\xff\xff\xff\xff", // more than three invalid bytes
38+
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
39+
},
40+
{
41+
"t\xffe\xffst", // invalid bytes interleaved with other content
42+
"t\ufffde\ufffdst", // the valid content is preserved
43+
},
44+
{
45+
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences
46+
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
47+
},
48+
}
49+
50+
for _, test := range tests {
51+
t.Run(test.Input, func(t *testing.T) {
52+
got := forceValidUTF8(test.Input)
53+
if got != test.Want {
54+
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
55+
}
56+
})
57+
}
58+
}

tfprotov6/internal/toproto/diagnostic.go

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
package toproto
22

33
import (
4+
"unicode/utf8"
5+
46
"github.com/hashicorp/terraform-plugin-go/tfprotov6"
57
"github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6"
68
)
79

810
func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) {
911
diag := &tfplugin6.Diagnostic{
1012
Severity: Diagnostic_Severity(in.Severity),
11-
Summary: in.Summary,
12-
Detail: in.Detail,
13+
Summary: forceValidUTF8(in.Summary),
14+
Detail: forceValidUTF8(in.Detail),
1315
}
1416
if in.Attribute != nil {
1517
attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) {
4143
return diagnostics, nil
4244
}
4345

46+
// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
47+
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
48+
// the Unicode Replacement Character (\uFFFD).
49+
//
50+
// The protobuf serialization library will reject invalid UTF-8 with an
51+
// unhelpful error message:
52+
//
53+
// string field contains invalid UTF-8
54+
//
55+
// Passing a string result through this function makes invalid UTF-8 instead
56+
// emerge as placeholder characters on the other side of the wire protocol,
57+
// giving a better chance of still returning a partially-legible message
58+
// instead of a generic character encoding error.
59+
//
60+
// This is intended for user-facing messages such as diagnostic summary and
61+
// detail messages, where Terraform will just treat the value as opaque and
62+
// it's ultimately up to the user and their terminal or web browser to
63+
// interpret the result. Don't use this for strings that have machine-readable
64+
// meaning.
65+
func forceValidUTF8(s string) string {
66+
// Most strings that pass through here will already be valid UTF-8 and
67+
// utf8.ValidString has a fast path which will beat our rune-by-rune
68+
// analysis below, so it's worth the cost of walking the string twice
69+
// in the rarer invalid case.
70+
if utf8.ValidString(s) {
71+
return s
72+
}
73+
74+
// If we get down here then we know there's at least one invalid UTF-8
75+
// sequence in the string, so in this slow path we'll reconstruct the
76+
// string one rune at a time, guaranteeing that we'll only write valid
77+
// UTF-8 sequences into the resulting buffer.
78+
//
79+
// Any invalid string will grow at least a little larger as a result of
80+
// this operation because we'll be replacing each invalid byte with
81+
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
82+
// the replacement character \uFFFD. 9 is a magic number giving room for
83+
// three such expansions without any further allocation.
84+
ret := make([]byte, 0, len(s)+9)
85+
for {
86+
// If the first byte in s is not the start of a valid UTF-8 sequence
87+
// then the following will return utf8.RuneError, 1, where
88+
// utf8.RuneError is the unicode replacement character.
89+
r, advance := utf8.DecodeRuneInString(s)
90+
if advance == 0 {
91+
break
92+
}
93+
s = s[advance:]
94+
ret = utf8.AppendRune(ret, r)
95+
}
96+
return string(ret)
97+
}
98+
4499
// we have to say this next thing to get golint to stop yelling at us about the
45100
// underscores in the function names. We want the function names to match
46101
// actually-generated code, so it feels like fair play. It's just a shame we
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package toproto
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestForceValidUTF8(t *testing.T) {
8+
tests := []struct {
9+
Input string
10+
Want string
11+
}{
12+
{
13+
"hello",
14+
"hello",
15+
},
16+
{
17+
"こんにちは",
18+
"こんにちは",
19+
},
20+
{
21+
"baffle", // NOTE: "ffl" is a single-character ligature
22+
"baffle", // ligature is preserved exactly
23+
},
24+
{
25+
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
26+
"wé́́é́́é́́!", // diacritics are preserved exactly
27+
},
28+
{
29+
"😸😾", // Astral-plane characters
30+
"😸😾", // preserved exactly
31+
},
32+
{
33+
"\xff\xff", // neither byte is valid UTF-8
34+
"\ufffd\ufffd", // both are replaced by replacement character
35+
},
36+
{
37+
"\xff\xff\xff\xff\xff", // more than three invalid bytes
38+
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
39+
},
40+
{
41+
"t\xffe\xffst", // invalid bytes interleaved with other content
42+
"t\ufffde\ufffdst", // the valid content is preserved
43+
},
44+
{
45+
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences
46+
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
47+
},
48+
}
49+
50+
for _, test := range tests {
51+
t.Run(test.Input, func(t *testing.T) {
52+
got := forceValidUTF8(test.Input)
53+
if got != test.Want {
54+
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
55+
}
56+
})
57+
}
58+
}

0 commit comments

Comments
 (0)