|
1 | 1 | package toproto |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "unicode/utf8" |
| 5 | + |
4 | 6 | "github.com/hashicorp/terraform-plugin-go/tfprotov5" |
5 | 7 | "github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5" |
6 | 8 | ) |
7 | 9 |
|
8 | 10 | func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) { |
9 | 11 | diag := &tfplugin5.Diagnostic{ |
10 | 12 | Severity: Diagnostic_Severity(in.Severity), |
11 | | - Summary: in.Summary, |
12 | | - Detail: in.Detail, |
| 13 | + Summary: forceValidUTF8(in.Summary), |
| 14 | + Detail: forceValidUTF8(in.Detail), |
13 | 15 | } |
14 | 16 | if in.Attribute != nil { |
15 | 17 | attr, err := AttributePath(in.Attribute) |
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) { |
41 | 43 | return diagnostics, nil |
42 | 44 | } |
43 | 45 |
|
// forceValidUTF8 sanitizes s so that the result is always well-formed UTF-8.
// Every byte of s that is not part of a valid UTF-8 sequence is swapped for
// the encoded Unicode Replacement Character (\uFFFD); everything else is
// copied through untouched.
//
// The reason this exists: the protobuf serialization library refuses to
// encode a string containing invalid UTF-8, and all it reports is
//
//	string field contains invalid UTF-8
//
// which tells the user nothing about where the bad data came from. Running
// a message through this function first means the bad bytes show up as
// placeholder characters on the receiving side instead, so the rest of the
// message has a chance of remaining legible.
//
// Only use this on human-facing text (diagnostic summaries and details, for
// example) that Terraform treats as opaque. It must not be applied to
// strings with machine-readable meaning, because it silently alters them.
func forceValidUTF8(s string) string {
	// Fast path: the common case is an already-valid string, which can be
	// handed back as-is with no allocation. Checking first costs a second
	// walk over the string only in the uncommon invalid case.
	if utf8.ValidString(s) {
		return s
	}

	// Slow path: rebuild the string rune by rune so that only valid
	// encodings ever reach the output buffer.
	//
	// Each invalid byte becomes the three-byte sequence \xEF\xBF\xBD, a net
	// growth of two bytes per replacement, so the output is always longer
	// than the input here. The 9 spare bytes of capacity are an arbitrary
	// allowance that absorbs up to four replacements before append has to
	// grow the buffer.
	out := make([]byte, 0, len(s)+9)

	// Ranging over a string decodes one rune per iteration; on a byte that
	// does not begin a valid sequence it yields utf8.RuneError (which is
	// \uFFFD) and moves forward by exactly one byte.
	for _, r := range s {
		out = utf8.AppendRune(out, r)
	}
	return string(out)
}
| 98 | + |
44 | 99 | // we have to say this next thing to get golint to stop yelling at us about the |
45 | 100 | // underscores in the function names. We want the function names to match |
46 | 101 | // actually-generated code, so it feels like fair play. It's just a shame we |
|
0 commit comments