Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BugFix] Bump simdjson to 3.9.4 and Fix struct field columns inconsistent when loading from bad json #47775

Merged
merged 3 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions be/src/formats/json/numeric_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,25 @@ static Status add_column_with_numeric_value(FixedLengthColumn<T>* column, const
return Status::OK();
}

case simdjson::ondemand::number_type::big_integer: {
auto s = value->raw_json_token();
StringParser::ParseResult r;
auto in = StringParser::string_to_int<int128_t>(s.data(), s.size(), &r);
if (r != StringParser::PARSE_SUCCESS) {
auto err_msg = strings::Substitute("Fail to convert big integer. column=$0, value=$1", name, s);
return Status::InvalidArgument(err_msg);
}

T out{};
if (!checked_cast(in, &out)) {
column->append_numbers(&out, sizeof(out));
} else {
auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, in);
return Status::InvalidArgument(err_msg);
}
return Status::OK();
}

case simdjson::ondemand::number_type::floating_point_number: {
double in = value->get_double();
T out{};
Expand Down
8 changes: 4 additions & 4 deletions be/src/formats/json/struct_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ Status add_struct_column(Column* column, const TypeDescriptor& type_desc, const
simdjson::ondemand::value* field_value_ptr = nullptr;
if (err == simdjson::SUCCESS) {
field_value_ptr = &field_value;
} else if (err == simdjson::NO_SUCH_FIELD) {
// nullptr
} else {
} else if (err != simdjson::NO_SUCH_FIELD) {
// if returns error, the struct field columns may be inconsistent.
// so fill null if error.
auto err_msg = strings::Substitute("Failed to parse value, field=$0.$1, error=$2", name, field_name,
simdjson::error_message(err));
return Status::DataQualityError(err_msg);
LOG(WARNING) << err_msg;
}
RETURN_IF_ERROR(add_nullable_column(field_column.get(), field_type_desc, name, field_value_ptr, true));
}
Expand Down
2 changes: 1 addition & 1 deletion be/test/formats/json/map_column_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ TEST_F(AddMapColumnTest, test_bad_json) {
auto column = ColumnHelper::create_column(type_desc, false);

simdjson::ondemand::parser parser;
auto json = R"( { "key1": "foo", "key2": "bar", "key3": "baz" )"_padded;
auto json = R"( { "key1": "foo", "key2": "bar" "key3": "baz" } )"_padded;
auto doc = parser.iterate(json);
simdjson::ondemand::value val = doc.get_value();

Expand Down
26 changes: 20 additions & 6 deletions be/test/formats/json/numeric_column_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ TEST_F(AddNumericColumnTest, test_add_int64_overflow) {
auto doc = parser.iterate(json);
simdjson::ondemand::value val = doc.find_field("f_int64");
auto st = add_numeric_column<int64_t>(column.get(), t, "f_int64", &val);
ASSERT_TRUE(st.is_data_quality_error());
ASSERT_TRUE(st.is_invalid_argument());
}

TEST_F(AddNumericColumnTest, test_add_int64_overflow2) {
Expand All @@ -119,6 +119,19 @@ TEST_F(AddNumericColumnTest, test_add_int64_overflow2) {
ASSERT_TRUE(st.is_invalid_argument());
}

TEST_F(AddNumericColumnTest, test_add_int64_overflow3) {
auto column = FixedLengthColumn<int64_t>::create();
TypeDescriptor t(TYPE_BIGINT);

simdjson::ondemand::parser parser;
auto json = R"( { "f_int64": 18446744073709551616} )"_padded;
auto doc = parser.iterate(json);
simdjson::ondemand::value val = doc.find_field("f_int64");

auto st = add_numeric_column<int64_t>(column.get(), t, "f_int64", &val);
ASSERT_TRUE(st.is_invalid_argument());
}

TEST_F(AddNumericColumnTest, test_add_int128) {
auto column = FixedLengthColumn<int128_t>::create();
TypeDescriptor t(TYPE_LARGEINT);
Expand All @@ -134,9 +147,7 @@ TEST_F(AddNumericColumnTest, test_add_int128) {
ASSERT_EQ("[9223372036854775808]", column->debug_string());
}

// Currently simdjson can not parse number < -9223372036854775808 (lower bound of int64_t)
// or > 18446744073709551615 (upper bound of uint64_t)
TEST_F(AddNumericColumnTest, test_add_int128_invalid) {
TEST_F(AddNumericColumnTest, test_add_int128_big_integer) {
auto column = FixedLengthColumn<int128_t>::create();
TypeDescriptor t(TYPE_LARGEINT);

Expand All @@ -146,14 +157,17 @@ TEST_F(AddNumericColumnTest, test_add_int128_invalid) {
simdjson::ondemand::value val = doc.find_field("f_int128");

auto st = add_numeric_column<int128_t>(column.get(), t, "f_int128", &val);
ASSERT_TRUE(st.is_data_quality_error());
ASSERT_TRUE(st.ok());
ASSERT_EQ("[-9223372036854775809]", column->debug_string());

column->reset_column();
json = R"( { "f_int128": 18446744073709551616} )"_padded;
doc = parser.iterate(json);
val = doc.find_field("f_int128");

st = add_numeric_column<int128_t>(column.get(), t, "f_int128", &val);
ASSERT_TRUE(st.is_data_quality_error());
ASSERT_TRUE(st.ok());
ASSERT_EQ("[18446744073709551616]", column->debug_string());
}

} // namespace starrocks
20 changes: 19 additions & 1 deletion be/test/formats/json/struct_column_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,25 @@ TEST_F(AddStructColumnTest, test_bad_json) {
auto doc = parser.iterate(json);
simdjson::ondemand::value val = doc.get_value();

EXPECT_STATUS(Status::DataQualityError(""), add_struct_column(column.get(), type_desc, "root_key", &val));
EXPECT_OK(add_struct_column(column.get(), type_desc, "root_key", &val));

EXPECT_EQ("{key1:'foo',key2:'bar',key3:NULL}", column->debug_string());
}

TEST_F(AddStructColumnTest, test_bad_json2) {
TypeDescriptor type_desc = TypeDescriptor::create_struct_type(
{"key1", "key2", "key3"}, {TypeDescriptor::create_varchar_type(10), TypeDescriptor::create_json_type(),
TypeDescriptor::create_varchar_type(10)});
auto column = ColumnHelper::create_column(type_desc, false);

simdjson::ondemand::parser parser;
auto json = R"( { "key1": "foo", "key2": {a:1,b:2}, "key3": "bar"} )"_padded;
auto doc = parser.iterate(json);
simdjson::ondemand::value val = doc.get_value();

EXPECT_OK(add_struct_column(column.get(), type_desc, "root_key", &val));

EXPECT_EQ("{key1:'foo',key2:NULL,key3:NULL}", column->debug_string());
}

TEST_F(AddStructColumnTest, test_field_not_found) {
Expand Down
10 changes: 5 additions & 5 deletions thirdparty/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ export TP_JAR_DIR=$TP_INSTALL_DIR/lib/jar
# Definitions for architecture-related thirdparty
MACHINE_TYPE=$(uname -m)
# handle mac m1 platform, change arm64 to aarch64
if [[ "${MACHINE_TYPE}" == "arm64" ]]; then
if [[ "${MACHINE_TYPE}" == "arm64" ]]; then
MACHINE_TYPE="aarch64"
fi

Expand Down Expand Up @@ -163,10 +163,10 @@ RAPIDJSON_SOURCE=rapidjson-1.1.0
RAPIDJSON_MD5SUM="badd12c511e081fec6c89c43a7027bce"

# simdjson
SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v2.2.0.tar.gz"
SIMDJSON_NAME=simdjson-v2.2.0.tar.gz
SIMDJSON_SOURCE=simdjson-2.2.0
SIMDJSON_MD5SUM="9bd0ced53281484d8842a9429065943d"
SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.9.4.tar.gz"
SIMDJSON_NAME=simdjson-v3.9.4.tar.gz
SIMDJSON_SOURCE=simdjson-3.9.4
SIMDJSON_MD5SUM="bdc1dfcb2a89dc0c09e8370808a946f5"

# curl
CURL_DOWNLOAD="https://curl.se/download/curl-8.4.0.tar.gz"
Expand Down
Loading