Skip to content

Commit 688630f

Browse files
committed
Validate that names are valid UTF-8
Add an `isUTF8` utility and use it in both the text and binary parsers. Add the missing checks for overlong encodings and overlarge code points to our WTF-8 reader (`takeWTF8CodePoint`), which the new utility relies on. Re-enable the spec tests that exercise UTF-8 validation.
1 parent 3acacac commit 688630f

File tree

6 files changed

+36
-12
lines changed

6 files changed

+36
-12
lines changed

scripts/test/shared.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,6 @@ def get_tests(test_dir, extensions=[], recursive=False):
400400
# expected-output/ if any.
401401
SPEC_TESTS_TO_SKIP = [
402402
# Malformed module accepted
403-
'utf8-custom-section-id.wast',
404-
'utf8-import-field.wast',
405-
'utf8-import-module.wast',
406-
'utf8-invalid-encoding.wast',
407403
'const.wast',
408404
'address.wast',
409405

src/parser/lexer.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "support/name.h"
2727
#include "support/result.h"
28+
#include "support/string.h"
2829

2930
#ifndef parser_lexer_h
3031
#define parser_lexer_h
@@ -124,11 +125,11 @@ struct Lexer {
124125
std::optional<std::string> takeString();
125126

126127
std::optional<Name> takeName() {
127-
// TODO: Validate UTF.
128-
if (auto str = takeString()) {
129-
return Name(*str);
128+
auto str = takeString();
129+
if (!str || !String::isUTF8(*str)) {
130+
return std::nullopt;
130131
}
131-
return std::nullopt;
132+
return Name(*str);
132133
}
133134

134135
bool takeSExprStart(std::string_view expected) {

src/support/string.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,21 @@ std::optional<uint32_t> takeWTF8CodePoint(std::string_view& str) {
195195
}
196196

197197
str = str.substr(1 + trailingBytes);
198+
198199
if (!valid) {
199200
return std::nullopt;
200201
}
202+
203+
size_t expectedTrailing = u < 0x80 ? 0
204+
: u < 0x800 ? 1
205+
: u < 0x10000 ? 2
206+
: u < 0x110000 ? 3
207+
: -1;
208+
if (trailingBytes != expectedTrailing) {
209+
// Overlong encoding or overlarge code point.
210+
return std::nullopt;
211+
}
212+
201213
return u;
202214
}
203215

@@ -404,4 +416,14 @@ std::ostream& printEscapedJSON(std::ostream& os, std::string_view str) {
404416
return os << '"';
405417
}
406418

419+
// Returns true if `str` consists entirely of code points that
// takeWTF8CodePoint can decode and none of them lies in the surrogate range
// [0xD800, 0xE000). An empty string returns true because the loop body never
// runs.
bool isUTF8(std::string_view str) {
420+
while (str.size()) {
421+
// takeWTF8CodePoint takes `str` by reference and consumes the bytes it
// decodes, so the loop makes progress on every iteration.
auto u = takeWTF8CodePoint(str);
422+
// Fail on a decoding error (empty optional) or on a surrogate code point,
// which the WTF-8 reader itself does not reject.
if (!u || (0xD800 <= *u && *u < 0xE000)) {
423+
return false;
424+
}
425+
}
426+
return true;
427+
}
428+
407429
} // namespace wasm::String

src/support/string.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ bool convertWTF16ToWTF8(std::ostream& os, std::string_view str);
9999
// unit. Returns `true` if the input was valid UTF-16.
100100
bool convertUTF16ToUTF8(std::ostream& os, std::string_view str);
101101

102+
// Whether the string is valid UTF-8.
103+
bool isUTF8(std::string_view str);
104+
102105
} // namespace wasm::String
103106

104107
#endif // wasm_support_string_h

src/wasm-binary.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1505,7 +1505,7 @@ class WasmBinaryReader {
15051505
HeapType getIndexedHeapType();
15061506

15071507
Type getConcreteType();
1508-
Name getInlineString();
1508+
Name getInlineString(bool requireValid = true);
15091509
void verifyInt8(int8_t x);
15101510
void verifyInt16(int16_t x);
15111511
void verifyInt32(int32_t x);

src/wasm/wasm-binary.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2201,11 +2201,13 @@ Type WasmBinaryReader::getConcreteType() {
22012201
return type;
22022202
}
22032203

2204-
Name WasmBinaryReader::getInlineString() {
2204+
Name WasmBinaryReader::getInlineString(bool requireValid) {
22052205
BYN_TRACE("<==\n");
22062206
auto len = getU32LEB();
22072207
auto data = getByteView(len);
2208-
2208+
if (requireValid && !String::isUTF8(data)) {
2209+
throwError("invalid UTF-8 string");
2210+
}
22092211
BYN_TRACE("getInlineString: " << data << " ==>\n");
22102212
return Name(data);
22112213
}
@@ -3027,7 +3029,7 @@ void WasmBinaryReader::readStrings() {
30273029
}
30283030
size_t num = getU32LEB();
30293031
for (size_t i = 0; i < num; i++) {
3030-
auto string = getInlineString();
3032+
auto string = getInlineString(false);
30313033
// Re-encode from WTF-8 to WTF-16.
30323034
std::stringstream wtf16;
30333035
if (!String::convertWTF8ToWTF16(wtf16, string.str)) {

0 commit comments

Comments
 (0)