unquote should check for invalid UTF-8 code points (#3595)

grobinson-grafana · web-flow · commit f96ba1b58fa3 · 2023-11-13T16:31:15.000Z
Quoted tokens can contain both UTF-8 byte and code point literals
that should be interpreted when quoted. However, we need to check
whether the interpreted literals are valid UTF-8 code points.
This now happens in unquote.

Signed-off-by: George Robinson <george.robinson@grafana.com>
diff --git a/matchers/parse/parse_test.go b/matchers/parse/parse_test.go
@@ -107,6 +107,14 @@ func TestMatchers(t *testing.T) {
 		name:     "equals unicode emoji in quotes",
 		input:    "{\"foo\"=\"🙂\"}",
 		expected: labels.Matchers{mustNewMatcher(t, labels.MatchEqual, "foo", "🙂")},
+	}, {
+		name:     "equals unicode emoji as bytes in quotes",
+		input:    "{\"foo\"=\"\\xf0\\x9f\\x99\\x82\"}",
+		expected: labels.Matchers{mustNewMatcher(t, labels.MatchEqual, "foo", "🙂")},
+	}, {
+		name:     "equals unicode emoji as code points in quotes",
+		input:    "{\"foo\"=\"\\U0001f642\"}",
+		expected: labels.Matchers{mustNewMatcher(t, labels.MatchEqual, "foo", "🙂")},
 	}, {
 		name:     "equals unicode sentence in quotes",
 		input:    "{\"foo\"=\"🙂bar\"}",
@@ -199,6 +207,10 @@ func TestMatchers(t *testing.T) {
 		name:  "no unquoted escape sequences",
 		input: "{foo=bar\\n}",
 		error: "8:9: \\: invalid input: expected a comma or close brace",
+	}, {
+		name:  "invalid unicode",
+		input: "{\"foo\"=\"\\xf0\\x9f\"}",
+		error: "7:17: \"\\xf0\\x9f\": invalid input",
 	}}
 
 	for _, test := range tests {
@@ -244,6 +256,14 @@ func TestMatcher(t *testing.T) {
 		name:     "equals unicode emoji",
 		input:    "{foo=🙂}",
 		expected: mustNewMatcher(t, labels.MatchEqual, "foo", "🙂"),
+	}, {
+		name:     "equals unicode emoji as bytes in quotes",
+		input:    "{\"foo\"=\"\\xf0\\x9f\\x99\\x82\"}",
+		expected: mustNewMatcher(t, labels.MatchEqual, "foo", "🙂"),
+	}, {
+		name:     "equals unicode emoji as code points in quotes",
+		input:    "{\"foo\"=\"\\U0001f642\"}",
+		expected: mustNewMatcher(t, labels.MatchEqual, "foo", "🙂"),
 	}, {
 		name:     "equals unicode sentence",
 		input:    "{foo=🙂bar}",
@@ -331,6 +351,10 @@ func TestMatcher(t *testing.T) {
 		name:  "two or more returns error",
 		input: "foo=bar,bar=baz",
 		error: "expected 1 matcher, found 2",
+	}, {
+		name:  "invalid unicode",
+		input: "foo=\"\\xf0\\x9f\"",
+		error: "4:14: \"\\xf0\\x9f\": invalid input",
 	}}
 
 	for _, test := range tests {
diff --git a/matchers/parse/token.go b/matchers/parse/token.go
@@ -14,8 +14,10 @@
 package parse
 
 import (
+	"errors"
 	"fmt"
 	"strconv"
+	"unicode/utf8"
 )
 
 type tokenKind int
@@ -82,7 +84,14 @@ func (t token) isOneOf(kinds ...tokenKind) bool {
 // unquote the value in token. If unquoted returns it unmodified.
 func (t token) unquote() (string, error) {
 	if t.kind == tokenQuoted {
-		return strconv.Unquote(t.value)
+		unquoted, err := strconv.Unquote(t.value)
+		if err != nil {
+			return "", err
+		}
+		if !utf8.ValidString(unquoted) {
+			return "", errors.New("quoted string contains invalid UTF-8 code points")
+		}
+		return unquoted, nil
 	}
 	return t.value, nil
 }