From cf931838ff00c07eea7c5b44628a53cda356e75c Mon Sep 17 00:00:00 2001 From: wyb Date: Thu, 4 Jul 2024 19:06:44 +0800 Subject: [PATCH] Bump simdjson to 3.9.4 Signed-off-by: wyb --- be/src/formats/json/numeric_column.cpp | 19 ++++++++++++++ be/test/formats/json/numeric_column_test.cpp | 26 +++++++++++++++----- be/test/formats/json/struct_column_test.cpp | 16 ++++++++++++ thirdparty/vars.sh | 10 ++++---- 4 files changed, 60 insertions(+), 11 deletions(-) diff --git a/be/src/formats/json/numeric_column.cpp b/be/src/formats/json/numeric_column.cpp index f1e04dcef5358e..f1610f586297af 100644 --- a/be/src/formats/json/numeric_column.cpp +++ b/be/src/formats/json/numeric_column.cpp @@ -87,6 +87,25 @@ static Status add_column_with_numeric_value(FixedLengthColumn* column, const return Status::OK(); } + case simdjson::ondemand::number_type::big_integer: { + auto s = value->raw_json_token(); + StringParser::ParseResult r; + auto in = StringParser::string_to_int(s.data(), s.size(), &r); + if (r != StringParser::PARSE_SUCCESS) { + auto err_msg = strings::Substitute("Fail to convert big integer. column=$0, value=$1", name, s); + return Status::InvalidArgument(err_msg); + } + + T out{}; + if (!checked_cast(in, &out)) { + column->append_numbers(&out, sizeof(out)); + } else { + auto err_msg = strings::Substitute("Value is overflow. column=$0, value=$1", name, in); + return Status::InvalidArgument(err_msg); + } + return Status::OK(); + } + case simdjson::ondemand::number_type::floating_point_number: { double in = value->get_double(); T out{}; diff --git a/be/test/formats/json/numeric_column_test.cpp b/be/test/formats/json/numeric_column_test.cpp index 35c1d3effa9940..e362eacb703dca 100644 --- a/be/test/formats/json/numeric_column_test.cpp +++ b/be/test/formats/json/numeric_column_test.cpp @@ -103,7 +103,7 @@ TEST_F(AddNumericColumnTest, test_add_int64_overflow) { auto doc = parser.iterate(json); simdjson::ondemand::value val = doc.find_field("f_int64"); auto st = add_numeric_column(column.get(), t, "f_int64", &val); - ASSERT_TRUE(st.is_data_quality_error()); + ASSERT_TRUE(st.is_invalid_argument()); } TEST_F(AddNumericColumnTest, test_add_int64_overflow2) { @@ -119,6 +119,19 @@ TEST_F(AddNumericColumnTest, test_add_int64_overflow2) { ASSERT_TRUE(st.is_invalid_argument()); } +TEST_F(AddNumericColumnTest, test_add_int64_overflow3) { + auto column = FixedLengthColumn::create(); + TypeDescriptor t(TYPE_BIGINT); + + simdjson::ondemand::parser parser; + auto json = R"( { "f_int64": 18446744073709551616} )"_padded; + auto doc = parser.iterate(json); + simdjson::ondemand::value val = doc.find_field("f_int64"); + + auto st = add_numeric_column(column.get(), t, "f_int64", &val); + ASSERT_TRUE(st.is_invalid_argument()); +} + TEST_F(AddNumericColumnTest, test_add_int128) { auto column = FixedLengthColumn::create(); TypeDescriptor t(TYPE_LARGEINT); @@ -134,9 +147,7 @@ TEST_F(AddNumericColumnTest, test_add_int128) { ASSERT_EQ("[9223372036854775808]", column->debug_string()); } -// Currently simdjson can not parse number < -9223372036854775808 (lower bound of int64_t) -// or > 18446744073709551615 (upper bound of uint64_t) -TEST_F(AddNumericColumnTest, test_add_int128_invalid) { +TEST_F(AddNumericColumnTest, test_add_int128_big_integer) { auto column = FixedLengthColumn::create(); TypeDescriptor t(TYPE_LARGEINT); @@ -146,14 +157,17 @@ TEST_F(AddNumericColumnTest, test_add_int128_invalid) { simdjson::ondemand::value val = doc.find_field("f_int128"); auto st = add_numeric_column(column.get(), t, "f_int128", &val); - ASSERT_TRUE(st.is_data_quality_error()); + ASSERT_TRUE(st.ok()); + ASSERT_EQ("[-9223372036854775809]", column->debug_string()); + column->reset_column(); json = R"( { "f_int128": 18446744073709551616} )"_padded; doc = parser.iterate(json); val = doc.find_field("f_int128"); st = add_numeric_column(column.get(), t, "f_int128", &val); - ASSERT_TRUE(st.is_data_quality_error()); + ASSERT_TRUE(st.ok()); + ASSERT_EQ("[18446744073709551616]", column->debug_string()); } } // namespace starrocks diff --git a/be/test/formats/json/struct_column_test.cpp b/be/test/formats/json/struct_column_test.cpp index 1f10418e3031b6..a380643ac1d42d 100644 --- a/be/test/formats/json/struct_column_test.cpp +++ b/be/test/formats/json/struct_column_test.cpp @@ -55,6 +55,22 @@ TEST_F(AddStructColumnTest, test_bad_json) { EXPECT_EQ("{key1:'foo',key2:'bar',key3:NULL}", column->debug_string()); } +TEST_F(AddStructColumnTest, test_bad_json2) { + TypeDescriptor type_desc = TypeDescriptor::create_struct_type( + {"key1", "key2", "key3"},{TypeDescriptor::create_varchar_type(10), TypeDescriptor::create_json_type(), + TypeDescriptor::create_varchar_type(10)}); + auto column = ColumnHelper::create_column(type_desc, false); + + simdjson::ondemand::parser parser; + auto json = R"( { "key1": "foo", "key2": {a:1,b:2}, "key3": "bar"} )"_padded; + auto doc = parser.iterate(json); + simdjson::ondemand::value val = doc.get_value(); + + EXPECT_OK(add_struct_column(column.get(), type_desc, "root_key", &val)); + + EXPECT_EQ("{key1:'foo',key2:NULL,key3:NULL}", column->debug_string()); +} + TEST_F(AddStructColumnTest, test_field_not_found) { TypeDescriptor type_desc = TypeDescriptor::create_struct_type( {"key1", "key2"}, {TypeDescriptor::create_varchar_type(10), TypeDescriptor::create_varchar_type(10)}); diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index 6867dbc48e83ac..a5d98353ba3169 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -63,7 +63,7 @@ export TP_JAR_DIR=$TP_INSTALL_DIR/lib/jar # Definitions for architecture-related thirdparty MACHINE_TYPE=$(uname -m) # handle mac m1 platform, change arm64 to aarch64 -if [[ "${MACHINE_TYPE}" == "arm64" ]]; then +if [[ "${MACHINE_TYPE}" == "arm64" ]]; then MACHINE_TYPE="aarch64" fi @@ -163,10 +163,10 @@ RAPIDJSON_SOURCE=rapidjson-1.1.0 RAPIDJSON_MD5SUM="badd12c511e081fec6c89c43a7027bce" # simdjson -SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v2.2.0.tar.gz" -SIMDJSON_NAME=simdjson-v2.2.0.tar.gz -SIMDJSON_SOURCE=simdjson-2.2.0 -SIMDJSON_MD5SUM="9bd0ced53281484d8842a9429065943d" +SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.9.4.tar.gz" +SIMDJSON_NAME=simdjson-v3.9.4.tar.gz +SIMDJSON_SOURCE=simdjson-3.9.4 +SIMDJSON_MD5SUM="bdc1dfcb2a89dc0c09e8370808a946f5" # curl CURL_DOWNLOAD="https://curl.se/download/curl-8.4.0.tar.gz"