Skip to content

Commit

Permalink
Fix PartitionIdGenerator's multiplier initialization for VectorHasher (
Browse files Browse the repository at this point in the history
…#9713)

Summary:
When having N columns, PartitionIdGenerator relies on N VectorHashers
 to generate ValueIds, with each VectorHasher’s multiplier correctly set up.

However when having multiple bool columns, it triggers a corner case that
VectorHasher’s multipliers won’t be set at all.

Add multiplierInitialized flag and properly set it to fix this corner case.

Pull Request resolved: #9713

Reviewed By: xiaoxmeng, spershin

Differential Revision: D57062896

Pulled By: kewang1024

fbshipit-source-id: b02002e1e920735606eb830be2a025fde5a8d101
  • Loading branch information
kewang1024 authored and facebook-github-bot committed May 7, 2024
1 parent 7cfb42f commit ac55339
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 3 deletions.
6 changes: 5 additions & 1 deletion velox/connectors/hive/PartitionIdGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,23 @@ void PartitionIdGenerator::computeValueIds(

bool rehash = false;
for (auto& hasher : hashers_) {
// NOTE: for boolean column type, computeValueIds() always returns true and
// this might cause problem in case of multiple boolean partition columns as
// we might not set the multiplier properly.
auto partitionVector = input->childAt(hasher->channel())->loadedVector();
hasher->decode(*partitionVector, allRows_);
if (!hasher->computeValueIds(allRows_, valueIds)) {
rehash = true;
}
}

if (!rehash) {
if (!rehash && hasMultiplierSet_) {
return;
}

uint64_t multiplier = 1;
for (auto& hasher : hashers_) {
hasMultiplierSet_ = true;
multiplier = hasher->typeKind() == TypeKind::BOOLEAN
? hasher->enableValueRange(multiplier, 50)
: hasher->enableValueIds(multiplier, 50);
Expand Down
4 changes: 2 additions & 2 deletions velox/connectors/hive/PartitionIdGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@

namespace facebook::velox::connector::hive {
/// Generate sequential integer IDs for distinct partition values, which could
/// be used as vector index. Only single partition key is supported at the
/// moment.
/// be used as vector index.
class PartitionIdGenerator {
public:
/// @param inputType RowType of the input.
Expand Down Expand Up @@ -83,6 +82,7 @@ class PartitionIdGenerator {
const bool partitionPathAsLowerCase_;

std::vector<std::unique_ptr<exec::VectorHasher>> hashers_;
bool hasMultiplierSet_ = false;

// A mapping from value ID produced by VectorHashers to a partition ID.
std::unordered_map<uint64_t, uint64_t> partitionIds_;
Expand Down
27 changes: 27 additions & 0 deletions velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ TEST_F(PartitionIdGeneratorTest, consecutiveIdsMultipleKeys) {
numPartitions - 1);
}

TEST_F(PartitionIdGeneratorTest, multipleBoolKeys) {
PartitionIdGenerator idGenerator(
ROW({BOOLEAN(), BOOLEAN()}), {0, 1}, 100, pool(), true);

auto input = makeRowVector({
makeFlatVector<bool>(
1'000, [](vector_size_t row) { return row < 50; }, nullEvery(7)),
makeFlatVector<bool>(
1'000,
[](vector_size_t row) { return (row % 2) == 0; },
nullEvery(3)),
});

raw_vector<uint64_t> ids;
idGenerator.run(input, ids);

// distinctIds contains 9 ids.
const auto numPartitions = 9;

std::unordered_set<uint64_t> distinctIds(ids.begin(), ids.end());
EXPECT_EQ(distinctIds.size(), numPartitions);
EXPECT_EQ(*std::min_element(distinctIds.begin(), distinctIds.end()), 0);
EXPECT_EQ(
*std::max_element(distinctIds.begin(), distinctIds.end()),
numPartitions - 1);
}

TEST_F(PartitionIdGeneratorTest, stableIdsSingleKey) {
PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool(), true);

Expand Down

0 comments on commit ac55339

Please sign in to comment.