Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
zanmato1984 committed Jul 9, 2024
1 parent daccdc2 commit 185a73c
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
16 changes: 12 additions & 4 deletions cpp/src/arrow/compute/row/encode_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,21 @@ void RowTableEncoder::PrepareEncodeSelected(int64_t start_row, int64_t num_rows,
Status RowTableEncoder::EncodeSelected(RowTableImpl* rows, uint32_t num_selected,
const uint16_t* selection) {
rows->Clean();
RETURN_NOT_OK(
rows->AppendEmpty(static_cast<uint32_t>(num_selected), static_cast<uint32_t>(0)));

// First AppendEmpty with num_selected rows and zero extra bytes to resize the
// fixed-length buffers (including buffer for offsets).
RETURN_NOT_OK(
rows->AppendEmpty(static_cast<uint32_t>(num_selected),
/*num_extra_bytes_to_append=*/static_cast<uint32_t>(0)));
// Then populate the offsets of the var-length columns, which will be used as the target
// size of the var-length buffers resizing below.
EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, num_selected,
selection);

RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(0), static_cast<uint32_t>(0)));
// Last AppendEmpty with zero rows and zero extra bytes to resize the var-length buffers
// based on the populated offsets.
RETURN_NOT_OK(
rows->AppendEmpty(/*num_rows_to_append=*/static_cast<uint32_t>(0),
/*num_extra_bytes_to_append=*/static_cast<uint32_t>(0)));

for (size_t icol = 0; icol < batch_all_cols_.size(); ++icol) {
if (batch_all_cols_[icol].metadata().is_fixed_length) {
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/compute/row/row_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ Result<RowTableImpl> MakeRowTableFromColumn(const std::shared_ptr<Array>& column

} // namespace

// GH-43129: Ensure that the memory consumption of the row table is reasonable, that is,
// with the growth factor of 2, the actual memory usage does not exceed twice the amount
// of memory actually needed.
TEST(RowTableMemoryConsumption, Encode) {
constexpr int64_t num_rows_max = 8192;
constexpr int64_t padding_for_vectors = 64;
Expand All @@ -87,10 +90,12 @@ TEST(RowTableMemoryConsumption, Encode) {

int64_t actual_null_mask_size =
num_rows * row_table.metadata().null_masks_bytes_per_row;
ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors);
ASSERT_GT(actual_null_mask_size * 2,
row_table.buffer_size(0) - padding_for_vectors);

int64_t actual_rows_size = num_rows * uint32()->byte_width();
ASSERT_LE(actual_rows_size, row_table.buffer_size(1) - padding_for_vectors);
ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(1) - padding_for_vectors);
}

Expand All @@ -105,13 +110,16 @@ TEST(RowTableMemoryConsumption, Encode) {

int64_t actual_null_mask_size =
num_rows * row_table.metadata().null_masks_bytes_per_row;
ASSERT_LE(actual_null_mask_size, row_table.buffer_size(0) - padding_for_vectors);
ASSERT_GT(actual_null_mask_size * 2,
row_table.buffer_size(0) - padding_for_vectors);

int64_t actual_offset_size = num_rows * sizeof(uint32_t);
ASSERT_LE(actual_offset_size, row_table.buffer_size(1) - padding_for_vectors);
ASSERT_GT(actual_offset_size * 2, row_table.buffer_size(1) - padding_for_vectors);

int64_t actual_rows_size = num_rows * row_table.offsets()[1];
ASSERT_LE(actual_rows_size, row_table.buffer_size(2) - padding_for_vectors);
ASSERT_GT(actual_rows_size * 2, row_table.buffer_size(2) - padding_for_vectors);
}
}
Expand Down

0 comments on commit 185a73c

Please sign in to comment.