diff --git a/velox/connectors/hive/PartitionIdGenerator.cpp b/velox/connectors/hive/PartitionIdGenerator.cpp index 7d4734cd1996..bc5678a46cc5 100644 --- a/velox/connectors/hive/PartitionIdGenerator.cpp +++ b/velox/connectors/hive/PartitionIdGenerator.cpp @@ -112,6 +112,9 @@ void PartitionIdGenerator::computeValueIds( bool rehash = false; for (auto& hasher : hashers_) { + // NOTE: for a boolean column, computeValueIds() always returns true, so + // 'rehash' alone may never trigger the multiplier setup below when there + // are multiple boolean partition columns; see 'hasMultiplierSet_'. auto partitionVector = input->childAt(hasher->channel())->loadedVector(); hasher->decode(*partitionVector, allRows_); if (!hasher->computeValueIds(allRows_, valueIds)) { @@ -119,12 +122,13 @@ void PartitionIdGenerator::computeValueIds( } } - if (!rehash) { + if (!rehash && hasMultiplierSet_) { return; } uint64_t multiplier = 1; for (auto& hasher : hashers_) { + hasMultiplierSet_ = true; multiplier = hasher->typeKind() == TypeKind::BOOLEAN ? hasher->enableValueRange(multiplier, 50) : hasher->enableValueIds(multiplier, 50); diff --git a/velox/connectors/hive/PartitionIdGenerator.h b/velox/connectors/hive/PartitionIdGenerator.h index 2df6a93e8750..01b638c0f3ad 100644 --- a/velox/connectors/hive/PartitionIdGenerator.h +++ b/velox/connectors/hive/PartitionIdGenerator.h @@ -20,8 +20,7 @@ namespace facebook::velox::connector::hive { /// Generate sequential integer IDs for distinct partition values, which could -/// be used as vector index. Only single partition key is supported at the -/// moment. +/// be used as vector index. class PartitionIdGenerator { public: /// @param inputType RowType of the input. @@ -83,6 +82,7 @@ class PartitionIdGenerator { const bool partitionPathAsLowerCase_; std::vector> hashers_; + bool hasMultiplierSet_ = false; // A mapping from value ID produced by VectorHashers to a partition ID. 
std::unordered_map partitionIds_; diff --git a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp index 493a6b3194eb..7f980cffe7b5 100644 --- a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp +++ b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp @@ -80,6 +80,33 @@ TEST_F(PartitionIdGeneratorTest, consecutiveIdsMultipleKeys) { numPartitions - 1); } +TEST_F(PartitionIdGeneratorTest, multipleBoolKeys) { + PartitionIdGenerator idGenerator( + ROW({BOOLEAN(), BOOLEAN()}), {0, 1}, 100, pool(), true); + + auto input = makeRowVector({ + makeFlatVector( + 1'000, [](vector_size_t row) { return row < 50; }, nullEvery(7)), + makeFlatVector( + 1'000, + [](vector_size_t row) { return (row % 2) == 0; }, + nullEvery(3)), + }); + + raw_vector ids; + idGenerator.run(input, ids); + + // Each boolean key may be true, false or null, hence 3 x 3 = 9 distinct ids. + const auto numPartitions = 9; + + std::unordered_set distinctIds(ids.begin(), ids.end()); + EXPECT_EQ(distinctIds.size(), numPartitions); + EXPECT_EQ(*std::min_element(distinctIds.begin(), distinctIds.end()), 0); + EXPECT_EQ( + *std::max_element(distinctIds.begin(), distinctIds.end()), + numPartitions - 1); +} + TEST_F(PartitionIdGeneratorTest, stableIdsSingleKey) { PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool(), true);