From f211eb58c7f825fda220e35e416849bd243474cf Mon Sep 17 00:00:00 2001 From: Ashin Gau Date: Sun, 9 Jun 2024 21:53:03 +0800 Subject: [PATCH] [fix](ubsan) reinterpret_cast fix length types to int8 is not safe (#35912) Fix type check of ubsan. ``` /root/doris/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h:75:78: runtime error: member call on address 0x5582f35db5c0 which does not point to an object of type 'doris::vectorized::ColumnVector' 0x5582f35db5c0: note: object is of type 'doris::vectorized::ColumnVector' 83 55 00 00 78 c0 b0 5a 82 55 00 00 02 00 00 00 00 00 00 00 10 a0 00 d7 83 55 00 00 10 a0 00 d7 ^~~~~~~~~~~~~~~~~~~~~~~ vptr for 'doris::vectorized::ColumnVector' doris::Status doris::vectorized::FixLengthPlainDecoder::_decode_values(COW::mutable_ptr&, std::shared_ptr&, doris::vectorized::ColumnSelectVector&, bool) at fix_length_plain_decoder.h:75:78 ``` --- .../format/parquet/fix_length_dict_decoder.hpp | 17 ++++++++++------- .../format/parquet/fix_length_plain_decoder.h | 13 +++++++------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 115ca68bc1e61d..65e329ae89b5a4 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -60,23 +60,26 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - return _decode_fixed_values(doris_column, select_vector); + return _decode_fixed_values(doris_column, data_type, select_vector); } protected: template - Status _decode_fixed_values(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + _type_length * (select_vector.num_values() - 
select_vector.num_filtered())); + Status _decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector) { + size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t data_index = doris_column->size() * primitive_length; + size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * + (_type_length / primitive_length); + doris_column->resize(doris_column->size() + scale_size); + char* raw_data = const_cast<char*>(doris_column->get_raw_data().data); size_t dict_index = 0; ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - memcpy(column_data.data() + data_index, _dict_items[_indexes[dict_index++]], + memcpy(raw_data + data_index, _dict_items[_indexes[dict_index++]], _type_length); data_index += _type_length; } diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 72cb283f3f9506..40e4c54a822106 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -72,16 +72,17 @@ Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat return Status::IOError("Out-of-bounds access in parquet data decoder"); } - auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + - _type_length * (select_vector.num_values() - select_vector.num_filtered())); + size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t data_index = doris_column->size() * primitive_length; + size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * + (_type_length / primitive_length); + doris_column->resize(doris_column->size() + 
scale_size); + char* raw_data = const_cast<char*>(doris_column->get_raw_data().data); ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { - memcpy(column_data.data() + data_index, _data->data + _offset, - run_length * _type_length); + memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length); _offset += run_length * _type_length; data_index += run_length * _type_length; break;