BurntSushi · arp242 · Jun 9, 2021 · Jun 9, 2021
diff --git a/decode_test.go b/decode_test.go
@@ -96,29 +96,30 @@ func TestUTF16(t *testing.T) {
 		// a = "b" in UTF-16, without BOM and with the LE and BE BOMs.
 		{
 			[]byte{0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
-			`bare keys cannot contain '\x00'; probably using UTF-16; TOML files must be UTF-8`,
+			`files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8`,
 		},
 		{
 			[]byte{0xfe, 0xff, 0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
-			`document starts with UTF-16 byte-order-mark (BOM) 0xfeff; TOML files must be UTF-8`,
-		},
-		{
-			[]byte{0xff, 0xfe, 0x61, 0x00, 0x20, 0x00, 0x3d, 0x00, 0x20, 0x00, 0x22, 0x00, 0x62, 0x00, 0x22, 0x00, 0x0a, 0x00},
-			`document starts with UTF-16 byte-order-mark (BOM) 0xfffe; TOML files must be UTF-8`,
+			`files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8`,
 		},
+		// UTF-8 content prefixed with a UTF-16 BOM (LE and BE); the BOM is skipped.
+		{[]byte("\xff\xfea = \"b\""), ``},
+		{[]byte("\xfe\xffa = \"b\""), ``},
 	}
 
 	for _, tt := range tests {
 		t.Run("", func(t *testing.T) {
-			var s struct {
-				A string
-			}
+			var s struct{ A string }
+
 			_, err := Decode(string(tt.in), &s)
-			if err == nil {
-				t.Fatal("err is nil")
+			if !errorContains(err, tt.wantErr) {
+				t.Fatalf("wrong error\nhave: %q\nwant: %q", err, tt.wantErr)
 			}
-			if !strings.Contains(err.Error(), tt.wantErr) {
-				t.Errorf("wrong error\nhave: %q\nwant: %q", err, tt.wantErr)
+			if tt.wantErr != "" {
+				return
+			}
+			if s.A != "b" {
+				t.Errorf(`s.A is not "b" but %q`, s.A)
 			}
 		})
 	}
@@ -1555,3 +1556,18 @@ cauchy = "cat 2"
 		Decode(testSimple, &val)
 	}
 }
+
+// errorContains reports whether the error message in have contains the text
+// in want.
+//
+// It is safe to call with a nil have: a nil error matches only an empty want,
+// and a non-nil error never matches an empty want. Pass "" to assert no error.
+func errorContains(have error, want string) bool {
+	if have == nil {
+		return want == ""
+	}
+	if want == "" {
+		return false
+	}
+	return strings.Contains(have.Error(), want)
+}
diff --git a/lex.go b/lex.go
@@ -389,10 +389,6 @@ func lexBareKey(lx *lexer) stateFn {
 		lx.emit(itemText)
 		return lexKeyEnd
 	default:
-		// NULL bytes probably means it's a UTF-16 file without BOM.
-		if r == 0 {
-			return lx.errorf("bare keys cannot contain %q; probably using UTF-16; TOML files must be UTF-8", r)
-		}
 		return lx.errorf("bare keys cannot contain %q", r)
 	}
 }

diff --git a/parse.go b/parse.go
@@ -1,6 +1,7 @@
 package toml
 
 import (
+	"errors"
 	"fmt"
 	"strconv"
 	"strings"
@@ -47,10 +48,20 @@ func parse(data string) (p *parser, err error) {
 		}
 	}()
 
+	// Skip a leading UTF-16 BOM; do this here as the lexer calls
+	// utf8.DecodeRuneInString(), which decodes these bytes as utf8.RuneError.
 	if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") {
-		return nil, fmt.Errorf(
-			"document starts with UTF-16 byte-order-mark (BOM) 0x%x; TOML files must be UTF-8",
-			data[:2])
+		data = data[2:]
+	}
+	// Examine first few bytes for NULL bytes; this probably means it's a UTF-16
+	// file (one byte of each 16-bit code unit is NULL for ASCII text). Again, do
+	// this here to avoid having to deal with UTF-8/16 stuff in the lexer.
+	ex := 6
+	if len(data) < 6 {
+		ex = len(data)
+	}
+	if strings.ContainsRune(data[:ex], 0) {
+		return nil, errors.New("files cannot contain NULL bytes; probably using UTF-16; TOML files must be UTF-8")
 	}
 
 	p = &parser{