Skip to content

Commit

Permalink
[fix](ubsan) reinterpret_cast fix length types to int8 is not safe (apache#35912)
Browse files Browse the repository at this point in the history

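Fix a type-check failure reported by UBSan: the fixed-length decoders used reinterpret_cast to treat the destination column as ColumnVector<Int8>, even when the column is actually of another type (ColumnVector<int> in the report below).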
```
/root/doris/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h:75:78: runtime error: member call on address 0x5582f35db5c0 which does not point to an object of type 'doris::vectorized::ColumnVector<signed char>'
0x5582f35db5c0: note: object is of type 'doris::vectorized::ColumnVector<int>'
 83 55 00 00  78 c0 b0 5a 82 55 00 00  02 00 00 00 00 00 00 00  10 a0 00 d7 83 55 00 00  10 a0 00 d7
              ^~~~~~~~~~~~~~~~~~~~~~~
              vptr for 'doris::vectorized::ColumnVector<int>'
doris::Status doris::vectorized::FixLengthPlainDecoder::_decode_values<false>(COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>&, std::shared_ptr<doris::vectorized::IDataType const>&, doris::vectorized::ColumnSelectVector&, bool) at fix_length_plain_decoder.h:75:78
```
  • Loading branch information
AshinGau authored Jun 9, 2024
1 parent 163d4de commit f211eb5
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
17 changes: 10 additions & 7 deletions be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,26 @@ class FixLengthDictDecoder final : public BaseDictDecoder {
return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
}

return _decode_fixed_values<has_filter>(doris_column, select_vector);
return _decode_fixed_values<has_filter>(doris_column, data_type, select_vector);
}

protected:
template <bool has_filter>
Status _decode_fixed_values(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + _type_length * (select_vector.num_values() -
select_vector.num_filtered()));
Status _decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
size_t data_index = doris_column->size() * primitive_length;
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
(_type_length / primitive_length);
doris_column->resize(doris_column->size() + scale_size);
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
size_t dict_index = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
memcpy(column_data.data() + data_index, _dict_items[_indexes[dict_index++]],
memcpy(raw_data + data_index, _dict_items[_indexes[dict_index++]],
_type_length);
data_index += _type_length;
}
Expand Down
13 changes: 7 additions & 6 deletions be/src/vec/exec/format/parquet/fix_length_plain_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,17 @@ Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
return Status::IOError("Out-of-bounds access in parquet data decoder");
}

auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index +
_type_length * (select_vector.num_values() - select_vector.num_filtered()));
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
size_t data_index = doris_column->size() * primitive_length;
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
(_type_length / primitive_length);
doris_column->resize(doris_column->size() + scale_size);
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
memcpy(column_data.data() + data_index, _data->data + _offset,
run_length * _type_length);
memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length);
_offset += run_length * _type_length;
data_index += run_length * _type_length;
break;
Expand Down

0 comments on commit f211eb5

Please sign in to comment.