Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read over BOM #277

Merged
merged 1 commit into from
Jun 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 29 additions & 13 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,29 +96,30 @@ func TestUTF16(t *testing.T) {
// a = "b" in UTF-16, without BOM and with the LE and BE BOMs.
{
[]byte{0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
`bare keys cannot contain '\x00'; probably using UTF-16; TOML files must be UTF-8`,
`files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8`,
},
{
[]byte{0xfe, 0xff, 0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
`document starts with UTF-16 byte-order-mark (BOM) 0xfeff; TOML files must be UTF-8`,
},
{
[]byte{0xff, 0xfe, 0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
`document starts with UTF-16 byte-order-mark (BOM) 0xfffe; TOML files must be UTF-8`,
`files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8`,
},
// UTF-8 text prefixed with a two-byte BOM (0xfffe or 0xfeff)
{[]byte("\xff\xfea = \"b\""), ``},
{[]byte("\xfe\xffa = \"b\""), ``},
}

for _, tt := range tests {
t.Run("", func(t *testing.T) {
var s struct {
A string
}
var s struct{ A string }

_, err := Decode(string(tt.in), &s)
if err == nil {
t.Fatal("err is nil")
if !errorContains(err, tt.wantErr) {
t.Fatalf("wrong error\nhave: %q\nwant: %q", err, tt.wantErr)
}
if !strings.Contains(err.Error(), tt.wantErr) {
t.Errorf("wrong error\nhave: %q\nwant: %q", err, tt.wantErr)
if tt.wantErr != "" {
return
}
if s.A != "b" {
t.Errorf(`s.A is not "b" but %q`, s.A)
}
})
}
Expand Down Expand Up @@ -1555,3 +1556,18 @@ cauchy = "cat 2"
Decode(testSimple, &val)
}
}

// errorContains reports whether the message of have includes the substring
// want.
//
// A nil have is accepted: it matches only when want is the empty string, so
// pass "" for want to assert that no error occurred. A non-nil have never
// matches an empty want.
func errorContains(have error, want string) bool {
	switch {
	case have == nil:
		// No error: only a match when the caller expected none.
		return want == ""
	case want == "":
		// Got an error while none was expected.
		return false
	default:
		return strings.Contains(have.Error(), want)
	}
}
4 changes: 0 additions & 4 deletions lex.go
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,6 @@ func lexBareKey(lx *lexer) stateFn {
lx.emit(itemText)
return lexKeyEnd
default:
// NULL bytes probably means it's a UTF-16 file without BOM.
if r == 0 {
return lx.errorf("bare keys cannot contain %q; probably using UTF-16; TOML files must be UTF-8", r)
}
return lx.errorf("bare keys cannot contain %q", r)
}
}
Expand Down
17 changes: 14 additions & 3 deletions parse.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package toml

import (
"errors"
"fmt"
"strconv"
"strings"
Expand Down Expand Up @@ -47,10 +48,20 @@ func parse(data string) (p *parser, err error) {
}
}()

// Read over BOM; do this here as the lexer calls utf8.DecodeRuneInString()
// which mangles stuff.
if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") {
return nil, fmt.Errorf(
"document starts with UTF-16 byte-order-mark (BOM) 0x%x; TOML files must be UTF-8",
data[:2])
data = data[2:]
}
// Examine first few bytes for NULL bytes; this probably means it's a UTF-16
// file (ASCII characters are encoded as 16-bit code units in which one byte
// is NULL). Again, do this here to avoid having to deal with UTF-8/16 stuff
// in the lexer.
ex := 6
if len(data) < 6 {
ex = len(data)
}
if strings.ContainsRune(data[:ex], 0) {
return nil, errors.New("files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8")
}

p = &parser{
Expand Down