Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions cpp/src/arrow/compute/row/encode_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "arrow/compute/row/encode_internal.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/int_util_overflow.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -160,8 +161,8 @@ Status RowTableEncoder::EncodeSelected(RowTableImpl* rows, uint32_t num_selected
/*num_extra_bytes_to_append=*/static_cast<uint32_t>(0)));
// Then populate the offsets of the var-length columns, which will be used as the target
// size of the var-length buffers resizing below.
EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, num_selected,
selection);
RETURN_NOT_OK(EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_,
num_selected, selection));
// Last AppendEmpty with zero rows and zero extra bytes to resize the var-length buffers
// based on the populated offsets.
RETURN_NOT_OK(
Expand Down Expand Up @@ -667,12 +668,12 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows,
}
}

void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected,
const uint16_t* selection) {
Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected,
const uint16_t* selection) {
if (rows->metadata().is_fixed_length) {
return;
return Status::OK();
}

uint32_t* row_offsets = rows->mutable_offsets();
Expand Down Expand Up @@ -713,9 +714,18 @@ void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
uint32_t length = row_offsets[i];
length += RowTableMetadata::padding_for_alignment(length, row_alignment);
row_offsets[i] = sum;
sum += length;
uint32_t sum_maybe_overflow = 0;
if (ARROW_PREDICT_FALSE(
arrow::internal::AddWithOverflow(sum, length, &sum_maybe_overflow))) {
return Status::Invalid(
"Offset overflow detected in EncoderOffsets::GetRowOffsetsSelected for row ", i,
" of length ", length, " bytes, current length in total is ", sum, " bytes");
}
sum = sum_maybe_overflow;
}
row_offsets[num_selected] = sum;

return Status::OK();
}

template <bool has_nulls, bool is_first_varbinary>
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/compute/row/encode_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ class EncoderBinaryPair {

class EncoderOffsets {
public:
static void GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);
static Status GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);
static void EncodeSelected(RowTableImpl* rows, const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);

Expand Down
12 changes: 10 additions & 2 deletions cpp/src/arrow/compute/row/row_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "arrow/compute/row/row_internal.h"

#include "arrow/compute/util.h"
#include "arrow/util/int_util_overflow.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -325,14 +326,21 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from,
// Varying-length rows
auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
// TODO(GH-43202): The following two variables are possibly overflowing.
uint32_t total_length = to_offsets[num_rows_];
uint32_t total_length_to_append = 0;
for (uint32_t i = 0; i < num_rows_to_append; ++i) {
uint16_t row_id = source_row_ids ? source_row_ids[i] : i;
uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
total_length_to_append += length;
to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
uint32_t to_offset_maybe_overflow = 0;
if (ARROW_PREDICT_FALSE(arrow::internal::AddWithOverflow(
total_length, total_length_to_append, &to_offset_maybe_overflow))) {
return Status::Invalid(
"Offset overflow detected in RowTableImpl::AppendSelectionFrom for row ",
num_rows_ + i, " of length ", length, " bytes, current length in total is ",
to_offsets[num_rows_ + i], " bytes");
}
to_offsets[num_rows_ + i + 1] = to_offset_maybe_overflow;
}

RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
Expand Down
129 changes: 129 additions & 0 deletions cpp/src/arrow/compute/row/row_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,134 @@ TEST(RowTableMemoryConsumption, Encode) {
}
}

// GH-43202: Ensure that when offset overflow happens in encoding the row table, an
// explicit error is raised instead of a silent wrong result.
TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) {
if constexpr (sizeof(void*) == 4) {
GTEST_SKIP() << "Test only works on 64-bit platforms";
}

// Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table.
constexpr int64_t num_rows = 8;
constexpr int64_t length_per_binary = 512 * 1024 * 1024;
constexpr int64_t row_alignment = sizeof(uint32_t);
constexpr int64_t var_length_alignment = sizeof(uint32_t);

MemoryPool* pool = default_memory_pool();

// The column to encode.
std::vector<KeyColumnArray> columns;
std::vector<Datum> values;
ASSERT_OK_AND_ASSIGN(
auto value, ::arrow::gen::Constant(
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
->Generate(1));
values.push_back(std::move(value));
ExecBatch batch = ExecBatch(std::move(values), 1);
ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));

// The row table.
std::vector<KeyColumnMetadata> column_metadatas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
RowTableMetadata table_metadata;
table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
var_length_alignment);
RowTableImpl row_table;
ASSERT_OK(row_table.Init(pool, table_metadata));
RowTableEncoder row_encoder;
row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);

// The rows to encode.
std::vector<uint16_t> row_ids(num_rows, 0);

// Encoding 7 rows should be fine.
{
row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns);
ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast<uint32_t>(num_rows - 1),
row_ids.data()));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we drop/clear the row_table after this to release a bit of memory?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for looking!

Every time an encoder encodes into a row table, the row table gets effectively cleared. So the memory is only hold til the next EncodeSelected call. Clearing it here doesn't make much difference I guess(?) - the code in between is relatively trivial.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, ok, thank you

}

// Encoding 8 rows should overflow.
{
int64_t length_per_row = table_metadata.fixed_length + length_per_binary;
std::stringstream expected_error_message;
expected_error_message << "Invalid: Offset overflow detected in "
"EncoderOffsets::GetRowOffsetsSelected for row "
<< num_rows - 1 << " of length " << length_per_row
<< " bytes, current length in total is "
<< length_per_row * (num_rows - 1) << " bytes";
row_encoder.PrepareEncodeSelected(0, num_rows, columns);
ASSERT_RAISES_WITH_MESSAGE(
Invalid, expected_error_message.str(),
row_encoder.EncodeSelected(&row_table, static_cast<uint32_t>(num_rows),
row_ids.data()));
}
}

// GH-43202: Ensure that when offset overflow happens in appending to the row table, an
// explicit error is raised instead of a silent wrong result.
TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) {
if constexpr (sizeof(void*) == 4) {
GTEST_SKIP() << "Test only works on 64-bit platforms";
}

// Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table.
constexpr int64_t num_rows = 8;
constexpr int64_t length_per_binary = 512 * 1024 * 1024;
constexpr int64_t num_rows_seed = 1;
constexpr int64_t row_alignment = sizeof(uint32_t);
constexpr int64_t var_length_alignment = sizeof(uint32_t);

MemoryPool* pool = default_memory_pool();

// The column to encode.
std::vector<KeyColumnArray> columns;
std::vector<Datum> values;
ASSERT_OK_AND_ASSIGN(
auto value, ::arrow::gen::Constant(
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
->Generate(num_rows_seed));
values.push_back(std::move(value));
ExecBatch batch = ExecBatch(std::move(values), num_rows_seed);
ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));

// The seed row table.
std::vector<KeyColumnMetadata> column_metadatas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
RowTableMetadata table_metadata;
table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
var_length_alignment);
RowTableImpl row_table_seed;
ASSERT_OK(row_table_seed.Init(pool, table_metadata));
RowTableEncoder row_encoder;
row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);
row_encoder.PrepareEncodeSelected(0, num_rows_seed, columns);
std::vector<uint16_t> row_ids(num_rows_seed, 0);
ASSERT_OK(row_encoder.EncodeSelected(
&row_table_seed, static_cast<uint32_t>(num_rows_seed), row_ids.data()));

// The target row table.
RowTableImpl row_table;
ASSERT_OK(row_table.Init(pool, table_metadata));

// Appending the seed 7 times should be fine.
for (int i = 0; i < num_rows - 1; ++i) {
ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed,
/*source_row_ids=*/NULLPTR));
}

// Appending the seed the 8-th time should overflow.
int64_t length_per_row = table_metadata.fixed_length + length_per_binary;
std::stringstream expected_error_message;
expected_error_message
<< "Invalid: Offset overflow detected in RowTableImpl::AppendSelectionFrom for row "
<< num_rows - 1 << " of length " << length_per_row
<< " bytes, current length in total is " << length_per_row * (num_rows - 1)
<< " bytes";
ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(),
row_table.AppendSelectionFrom(row_table_seed, num_rows_seed,
/*source_row_ids=*/NULLPTR));
}

} // namespace compute
} // namespace arrow