Skip to content

Commit 63db13b

Browse files
authored
[Strings] Avoid mishandling unicode in interpreter (#6405)
Our interpreter implementations of `stringview_wtf16.length`, `stringview_wtf16.get_codeunit`, and `string.encode_wtf16_array` are not unicode-aware, so they were previously incorrect for strings containing code points that are encoded as more than one byte. As a fix, bail out of the interpretation if there is a non-ascii code point that would make our naive implementation incorrect.
1 parent bfb5ec0 commit 63db13b

File tree

2 files changed

+135
-5
lines changed

2 files changed

+135
-5
lines changed

src/wasm-interpreter.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,6 +1902,16 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {
19021902
Flow visitStringConst(StringConst* curr) {
19031903
return Literal(curr->string.toString());
19041904
}
1905+
1906+
bool hasNonAsciiUpTo(const Literals& values, Index end) {
1907+
for (Index i = 0; i < end; ++i) {
1908+
if (uint32_t(values[i].geti32()) > 127) {
1909+
return true;
1910+
}
1911+
}
1912+
return false;
1913+
}
1914+
19051915
Flow visitStringMeasure(StringMeasure* curr) {
19061916
// For now we only support JS-style strings.
19071917
if (curr->op != StringMeasureWTF16View) {
@@ -1917,6 +1927,13 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {
19171927
if (!data) {
19181928
trap("null ref");
19191929
}
1930+
1931+
// This is only correct if all the bytes stored in `values` correspond to
1932+
// single unicode code points. See `visitStringWTF16Get` for details.
1933+
if (hasNonAsciiUpTo(data->values, data->values.size())) {
1934+
return Flow(NONCONSTANT_FLOW);
1935+
}
1936+
19201937
return Literal(int32_t(data->values.size()));
19211938
}
19221939
Flow visitStringConcat(StringConcat* curr) {
@@ -1980,6 +1997,11 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {
19801997
trap("oob");
19811998
}
19821999

2000+
// We don't handle non-ascii code points correctly yet.
2001+
if (hasNonAsciiUpTo(refValues, refValues.size())) {
2002+
return Flow(NONCONSTANT_FLOW);
2003+
}
2004+
19832005
for (Index i = 0; i < refValues.size(); i++) {
19842006
ptrValues[startVal + i] = refValues[i];
19852007
}
@@ -2095,6 +2117,18 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {
20952117
if (i >= values.size()) {
20962118
trap("string oob");
20972119
}
2120+
2121+
// This naive indexing approach is only correct if the first `i` bytes
2122+
// stored in `values` each corresponds to a single unicode code point. To
2123+
// implement this correctly in general, we would have to reinterpret the
2124+
// bytes as WTF-8, then count up to the `i`th code point, accounting
2125+
// properly for code points that would be represented by surrogate pairs in
2126+
// WTF-16. Alternatively, we could represent string contents as WTF-16 to
2127+
// begin with.
2128+
if (hasNonAsciiUpTo(values, i + 1)) {
2129+
return Flow(NONCONSTANT_FLOW);
2130+
}
2131+
20982132
return Literal(values[i].geti32());
20992133
}
21002134
Flow visitStringIterNext(StringIterNext* curr) {
Lines changed: 101 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited.
22

3-
;; RUN: wasm-opt %s --precompute --fuzz-exec -all -S -o - | filecheck %s
3+
;; RUN: wasm-opt %s --precompute -all -S -o - | filecheck %s
44

55
(module
6+
;; CHECK: (type $array16 (array (mut i16)))
7+
(type $array16 (array (mut i16)))
8+
69
;; CHECK: (func $eq-no (type $0) (result i32)
710
;; CHECK-NEXT: (i32.const 0)
811
;; CHECK-NEXT: )
9-
(func $eq-no (export "eq-no") (result i32)
12+
(func $eq-no (result i32)
1013
(string.eq
1114
(string.const "ab")
1215
(string.const "cdefg")
@@ -16,7 +19,7 @@
1619
;; CHECK: (func $eq-yes (type $0) (result i32)
1720
;; CHECK-NEXT: (i32.const 1)
1821
;; CHECK-NEXT: )
19-
(func $eq-yes (export "eq-yes") (result i32)
22+
(func $eq-yes (result i32)
2023
(string.eq
2124
(string.const "ab")
2225
(string.const "ab")
@@ -26,11 +29,104 @@
2629
;; CHECK: (func $concat (type $0) (result i32)
2730
;; CHECK-NEXT: (i32.const 1)
2831
;; CHECK-NEXT: )
29-
(func $concat (export "concat") (result i32)
32+
(func $concat (result i32)
3033
(string.eq
3134
(string.concat (string.const "a") (string.const "b"))
3235
(string.const "ab")
3336
)
3437
)
35-
)
3638

39+
;; CHECK: (func $length (type $0) (result i32)
40+
;; CHECK-NEXT: (i32.const 7)
41+
;; CHECK-NEXT: )
42+
(func $length (result i32)
43+
(stringview_wtf16.length
44+
(string.as_wtf16
45+
(string.const "1234567")
46+
)
47+
)
48+
)
49+
50+
;; CHECK: (func $length-bad (type $0) (result i32)
51+
;; CHECK-NEXT: (stringview_wtf16.length
52+
;; CHECK-NEXT: (string.as_wtf16
53+
;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
54+
;; CHECK-NEXT: )
55+
;; CHECK-NEXT: )
56+
;; CHECK-NEXT: )
57+
(func $length-bad (result i32)
58+
;; Not precomputable because we don't handle unicode yet.
59+
(stringview_wtf16.length
60+
(string.as_wtf16
61+
;; $_£_€_𐍈
62+
(string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
63+
)
64+
)
65+
)
66+
67+
;; CHECK: (func $get_codepoint (type $0) (result i32)
68+
;; CHECK-NEXT: (i32.const 95)
69+
;; CHECK-NEXT: )
70+
(func $get_codepoint (result i32)
71+
;; This is computable because everything up to the requested index is ascii. Returns 95 ('_').
72+
(stringview_wtf16.get_codeunit
73+
(string.as_wtf16
74+
;; $_£_€_𐍈
75+
(string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
76+
)
77+
(i32.const 1)
78+
)
79+
)
80+
81+
;; CHECK: (func $get_codepoint-bad (type $0) (result i32)
82+
;; CHECK-NEXT: (stringview_wtf16.get_codeunit
83+
;; CHECK-NEXT: (string.as_wtf16
84+
;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
85+
;; CHECK-NEXT: )
86+
;; CHECK-NEXT: (i32.const 2)
87+
;; CHECK-NEXT: )
88+
;; CHECK-NEXT: )
89+
(func $get_codepoint-bad (export "get_codepoint-bad") (result i32)
90+
;; This is not computable because the requested code unit is not ascii.
91+
(stringview_wtf16.get_codeunit
92+
(string.as_wtf16
93+
;; $_£_€_𐍈
94+
(string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
95+
)
96+
(i32.const 2)
97+
)
98+
)
99+
100+
;; CHECK: (func $encode (type $0) (result i32)
101+
;; CHECK-NEXT: (i32.const 2)
102+
;; CHECK-NEXT: )
103+
(func $encode (result i32)
104+
(string.encode_wtf16_array
105+
(string.const "$_")
106+
(array.new_default $array16
107+
(i32.const 20)
108+
)
109+
(i32.const 0)
110+
)
111+
)
112+
113+
;; CHECK: (func $encode-bad (type $0) (result i32)
114+
;; CHECK-NEXT: (string.encode_wtf16_array
115+
;; CHECK-NEXT: (string.const "$_\c2\a3_\e2\82\ac_\f0\90\8d\88")
116+
;; CHECK-NEXT: (array.new_default $array16
117+
;; CHECK-NEXT: (i32.const 20)
118+
;; CHECK-NEXT: )
119+
;; CHECK-NEXT: (i32.const 0)
120+
;; CHECK-NEXT: )
121+
;; CHECK-NEXT: )
122+
(func $encode-bad (result i32)
123+
(string.encode_wtf16_array
124+
;; $_£_€_𐍈
125+
(string.const "$_\C2\A3_\E2\82\AC_\F0\90\8D\88")
126+
(array.new_default $array16
127+
(i32.const 20)
128+
)
129+
(i32.const 0)
130+
)
131+
)
132+
)

0 commit comments

Comments
 (0)