Skip to content

[C++][Compute] Possible overflow of row table offsets #43202

@zanmato1984

Description

@zanmato1984

Describe the bug, including details regarding any error messages, version, and platform.

When debugging the test in #43046, I accidentally found that the offsets within the row table were overflowing (which is much worse than #43046 itself). The suspected code is:

uint32_t total_length = to_offsets[num_rows_];
uint32_t total_length_to_append = 0;

Here is a complete test that exposes this issue (note you'll need #43065 to really replicate this issue, otherwise the test crashes first):

// Compare columns to rows of row ids larger than 2^31 within a row table.
// Certain AVX2 instructions may behave unexpectedly causing troubles like GH-43046.
TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsMany)) {
  if constexpr (sizeof(void*) == 4) {
    GTEST_SKIP() << "Test only works on 64-bit platforms";
  }

  // The idea of this case is to create a row table containing one fixed length column and
  // one var length column (so the row is hence var length and has offset buffer), by
  // appending the same small batch of n rows repeatedly until it has more than 2^31 rows.
  // Then compare the last n rows of the row table with the batch.
  constexpr int64_t num_rows_batch = std::numeric_limits<uint16_t>::max();
  constexpr int64_t num_rows_row_table =
      (std::numeric_limits<int32_t>::max() + 1ll) / num_rows_batch * num_rows_batch +
      num_rows_batch;

  MemoryPool* pool = default_memory_pool();

  // The left side columns with num_rows_batch rows.
  std::vector<KeyColumnArray> columns_left;
  ExecBatch batch_left;
  {
    std::vector<Datum> values;

    // A fixed length array containing random values.
    ASSERT_OK_AND_ASSIGN(auto value_fixed_length,
                         ::arrow::gen::Random(uint32())->Generate(num_rows_batch));
    values.push_back(std::move(value_fixed_length));

    // A var length array containing small var length values ("X").
    ASSERT_OK_AND_ASSIGN(auto value_var_length,
                         ::arrow::gen::Constant(std::make_shared<BinaryScalar>("X"))
                             ->Generate(num_rows_batch));
    values.push_back(std::move(value_var_length));

    batch_left = ExecBatch(std::move(values), num_rows_batch);
    ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
  }

  // The right side row table with num_rows_row_table rows.
  RowTableImpl row_table_right;
  {
    // Encode the row table with the left columns repeatedly.
    std::vector<KeyColumnMetadata> column_metadatas;
    ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas));
    RowTableMetadata table_metadata;
    table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t),
                                            sizeof(uint64_t));
    ASSERT_OK(row_table_right.Init(pool, table_metadata));
    RowTableImpl row_table_batch;
    ASSERT_OK(row_table_batch.Init(pool, table_metadata));
    std::vector<uint16_t> row_ids(num_rows_batch);
    std::iota(row_ids.begin(), row_ids.end(), 0);
    RowTableEncoder row_encoder;
    row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
    row_encoder.PrepareEncodeSelected(0, num_rows_batch, columns_left);
    ASSERT_OK(row_encoder.EncodeSelected(
        &row_table_batch, static_cast<uint32_t>(num_rows_batch), row_ids.data()));
    for (int i = 0; i < num_rows_row_table / num_rows_batch; ++i) {
      ASSERT_OK(row_table_right.AppendSelectionFrom(row_table_batch, num_rows_batch,
                                                    /*source_row_ids=*/NULLPTR));
    }

    // The row table must contain an offset buffer.
    ASSERT_NE(row_table_right.offsets(), NULLPTR);
    ASSERT_EQ(row_table_right.length(), num_rows_row_table);
  }

  // The rows to compare: all rows in the batch to the last num_rows_batch rows of the
  // row table.
  std::vector<uint32_t> row_ids_to_compare(num_rows_batch);
  std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(),
            num_rows_row_table - num_rows_batch);

  TempVectorStack stack;
  ASSERT_OK(
      stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_batch)));
  LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};

  {
    // No selection, output no match row ids.
    uint32_t num_rows_no_match;
    std::vector<uint16_t> row_ids_out(num_rows_batch);
    KeyCompare::CompareColumnsToRows(num_rows_batch, /*sel_left_maybe_null=*/NULLPTR,
                                     row_ids_to_compare.data(), &ctx, &num_rows_no_match,
                                     row_ids_out.data(), columns_left, row_table_right,
                                     /*are_cols_in_encoding_order=*/true,
                                     /*out_match_bitvector_maybe_null=*/NULLPTR);
    ASSERT_EQ(num_rows_no_match, 0);
  }

  {
    // No selection, output match bit vector.
    std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
    KeyCompare::CompareColumnsToRows(
        num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx,
        /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
        row_table_right,
        /*are_cols_in_encoding_order=*/true, match_bitvector.data());
    ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows_batch),
              num_rows_batch);
  }

  std::vector<uint16_t> selection_left(num_rows_batch);
  std::iota(selection_left.begin(), selection_left.end(), 0);

  {
    // With selection, output no match row ids.
    uint32_t num_rows_no_match;
    std::vector<uint16_t> row_ids_out(num_rows_batch);
    KeyCompare::CompareColumnsToRows(num_rows_batch, selection_left.data(),
                                     row_ids_to_compare.data(), &ctx, &num_rows_no_match,
                                     row_ids_out.data(), columns_left, row_table_right,
                                     /*are_cols_in_encoding_order=*/true,
                                     /*out_match_bitvector_maybe_null=*/NULLPTR);
    ASSERT_EQ(num_rows_no_match, 0);
  }

  {
    // With selection, output match bit vector.
    std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
    KeyCompare::CompareColumnsToRows(
        num_rows_batch, selection_left.data(), row_ids_to_compare.data(), &ctx,
        /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
        row_table_right,
        /*are_cols_in_encoding_order=*/true, match_bitvector.data());
    ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows_batch),
              num_rows_batch);
  }
}

Component(s)

C++

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions