Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix decode_utf8 for codepoints >= U+010000 #1201

Merged
merged 1 commit into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
fix decode_utf8 for codepoints >= U+010000
Fixes #1181.

Add unit test cases to cover UTF-8 decode/encode.
  • Loading branch information
johnbartholomew committed Jan 19, 2025
commit 92b7747a1f61ebd09c9d013364a8f6de77732e0f
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,6 @@ if (BUILD_TESTS)
# target runs tests without building them.
add_custom_target(run_tests COMMAND ${CMAKE_CTEST_COMMAND}
DEPENDS libjsonnet_test libjsonnet_test_file libjsonnet_test_snippet
jsonnet parser_test lexer_test libjsonnet++_test libjsonnet_test_locale
jsonnet unicode_test parser_test lexer_test libjsonnet++_test libjsonnet_test_locale
)
endif()
9 changes: 9 additions & 0 deletions core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,12 @@ cc_test(
"@com_google_googletest//:gtest_main",
],
)

# Test target built from unicode_test.cpp; links against :libjsonnet and uses
# the GoogleTest-provided main() from gtest_main.
cc_test(
name = "unicode_test",
srcs = ["unicode_test.cpp"],
deps = [
":libjsonnet",
"@com_google_googletest//:gtest_main",
],
)
3 changes: 3 additions & 0 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ function(add_test_executable test_name)
endfunction()

if (BUILD_TESTS)
add_test_executable(unicode_test)
add_test(unicode_test ${GLOBAL_OUTPUT_PATH}/unicode_test)

add_test_executable(lexer_test)
add_test(lexer_test ${GLOBAL_OUTPUT_PATH}/lexer_test)

Expand Down
2 changes: 1 addition & 1 deletion core/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ static inline char32_t decode_utf8(const std::string &str, size_t &i)
if ((c3 & 0xC0) != 0x80) {
return JSONNET_CODEPOINT_ERROR;
}
return ((c0 & 0x7) << 24ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
return ((c0 & 0x7) << 18ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
} else {
return JSONNET_CODEPOINT_ERROR;
}
Expand Down
114 changes: 114 additions & 0 deletions core/unicode_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
Copyright 2025 Google Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <array>
#include <string>
#include <sstream>
#include <iostream>
#include "unicode.h"
#include "gtest/gtest.h"

namespace jsonnet::internal {
namespace {

// Checks a single code point against its expected UTF-8 byte sequence in both
// directions.
//
// Encoding `codepoint` into an empty string must yield exactly `expect_utf8`
// and return its length. Decoding `expect_utf8` starting at index 0 must give
// back `codepoint` and leave the index on the last byte of the sequence.
void testEncodeDecode(char32_t codepoint, const std::string &expect_utf8) {
    const size_t expected_size = expect_utf8.size();

    // Encode direction.
    std::string encoded;
    const size_t written = encode_utf8(codepoint, encoded);
    EXPECT_EQ(written, expected_size);
    EXPECT_EQ(encoded, expect_utf8);

    // Decode direction.
    size_t index = 0;
    const char32_t roundtrip = decode_utf8(expect_utf8, index);
    EXPECT_EQ(roundtrip, codepoint);
    EXPECT_EQ(index, expected_size - 1);
}

// Spot-checks encode/decode at the boundaries of each UTF-8 sequence length
// (1 to 4 bytes), plus a few values in between.
TEST(Unicode, TestUTF8)
{
    struct Sample {
        char32_t codepoint;
        std::string utf8;
    };

    const Sample samples[] = {
        // ASCII encodes as itself. The NUL case needs an explicit length,
        // otherwise the literal would be read as an empty C string.
        {0x00, std::string("\x00", 1)},
        {0x41, "A"},
        {0x7f, "\x7f"},

        // Two-byte sequences.
        {0x80, "\xc2\x80"},
        {0x100, "\xc4\x80"},
        {0x7ff, "\xdf\xbf"},

        // Three-byte sequences.
        {0x800, "\xe0\xa0\x80"},
        {0x1482, "\xe1\x92\x82"},
        {0xffff, "\xef\xbf\xbf"},

        // Four-byte sequences.
        {0x010000, "\xf0\x90\x80\x80"},
        {0x01f600, "\xf0\x9f\x98\x80"},  // U+1F600 "Grinning Face"
        {0x0f057e, "\xf3\xb0\x95\xbe"},  // U+F057E Private use area character
        {0x10ffff, "\xf4\x8f\xbf\xbf"},
    };

    for (const Sample &sample : samples) {
        testEncodeDecode(sample.codepoint, sample.utf8);
    }
}

// Each malformed byte sequence below, decoded from offset 0, must produce
// JSONNET_CODEPOINT_ERROR.
TEST(Unicode, TestUTF8RejectBad)
{
    const auto malformed = std::array{
        // Continuation bytes without a leading byte.
        "\x80",
        "\xa0",
        "\xbf",
        // Lone leading bytes (missing tail).
        "\xc0",  // 2-byte sequence
        "\xe0",  // 3-byte sequence
        "\xf0",  // 4-byte sequence
        // Invalid leading byte.
        "\xf8\x83\x83\x83",
        // Truncated multi-byte sequences (missing tail).
        "\xe0\x80",      // 3-byte sequence
        "\xf0\x80",      // 4-byte sequence
        "\xf0\x80\x80",  // 4-byte sequence
        // Leading byte followed by a non-continuation byte (incorrect tail).
        "\xc0\xcf",          // 2-byte sequence
        "\xe0\xcf",          // 3-byte sequence
        "\xf0\xcf",          // 4-byte sequence
        "\xe0\xcf\x80",      // 3-byte sequence
        "\xf0\xcf\x80",      // 4-byte sequence
        "\xe0\x80\xcf",      // 3-byte sequence
        "\xf0\x80\xcf",      // 4-byte sequence
        "\xf0\x80\x80\xcf",  // 4-byte sequence
        "\xf0\x80\xcf\x80",  // 4-byte sequence
        "\xf0\xcf\x80\x80",  // 4-byte sequence
    };

    // Tracked separately so a failure message can name the offending entry.
    size_t case_index = 0;
    for (const char *bytes : malformed) {
        size_t offset = 0;
        const char32_t result = decode_utf8(bytes, offset);

        EXPECT_EQ(result, JSONNET_CODEPOINT_ERROR) << "expect decode to reject. case " << case_index << std::endl;
        ++case_index;
    }
}

// Encodes every code point below JSONNET_CODEPOINT_MAX as UTF-8 and verifies
// that decoding the result yields the same value and consumes the whole
// sequence.
TEST(Unicode, TestUTF8RoundTripExhaustive)
{
    // Reused across iterations to avoid reallocating for every code point.
    std::string buffer;
    // The loop variable is char32_t (not int) so that it matches the type
    // taken by encode_utf8 and returned by decode_utf8; this avoids a
    // signed/unsigned comparison inside EXPECT_EQ.
    for (char32_t x = 0; x < JSONNET_CODEPOINT_MAX; ++x) {
        if (x == JSONNET_CODEPOINT_ERROR) {
            // The error sentinel is itself a code point value; a correct
            // decode of it would trip the EXPECT_NE below, so skip it.
            continue;
        }
        buffer.clear();
        encode_utf8(x, buffer);

        size_t at = 0;
        const char32_t y = decode_utf8(buffer, at);

        // Stream plain unsigned integers in diagnostics: inserting a raw
        // char32_t into a narrow stream is a deleted overload from C++20 on.
        const auto x_value = static_cast<unsigned long>(x);
        const auto y_value = static_cast<unsigned long>(y);

        EXPECT_NE(y, JSONNET_CODEPOINT_ERROR)
            << "UTF-8 roundtrip failed for codepoint " << x_value << " decode rejects";
        EXPECT_EQ(x, y)
            << "UTF-8 roundtrip failed for codepoint " << x_value << " converts to " << y_value;
        // decode_utf8 is expected to leave the index on the last byte consumed.
        EXPECT_EQ(at, buffer.size() - 1)
            << "UTF-8 roundtrip failed for codepoint " << x_value << " decodes incorrect length";
    }
}

} // namespace
} // namespace jsonnet::internal
Loading