diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 01a65077066d..ec087c8657b0 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -257,7 +257,7 @@ inline bool isSynthesizedColumn( const std::string& name, const std::unordered_map>& infoColumns) { - return name == kPath || name == kBucket || infoColumns.count(name) != 0; + return infoColumns.count(name) != 0; } inline bool isRowIndexColumn( diff --git a/velox/connectors/hive/HiveConnectorUtil.h b/velox/connectors/hive/HiveConnectorUtil.h index 3b5f25ad82ce..84a2dc67782d 100644 --- a/velox/connectors/hive/HiveConnectorUtil.h +++ b/velox/connectors/hive/HiveConnectorUtil.h @@ -33,9 +33,6 @@ struct HiveConnectorSplit; using SubfieldFilters = std::unordered_map>; -constexpr const char* kPath = "$path"; -constexpr const char* kBucket = "$bucket"; - const std::string& getColumnName(const common::Subfield& subfield); void checkColumnNameLowerCase(const std::shared_ptr& type); diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 0bea3cc30898..31b596467817 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -337,25 +337,6 @@ std::vector SplitReader::adaptColumns( if (auto it = hiveSplit_->partitionKeys.find(fieldName); it != hiveSplit_->partitionKeys.end()) { setPartitionValue(childSpec, fieldName, it->second); - } else if (fieldName == kPath) { - auto constantVec = std::make_shared>( - connectorQueryCtx_->memoryPool(), - 1, - false, - VARCHAR(), - StringView(hiveSplit_->filePath)); - childSpec->setConstantValue(constantVec); - } else if (fieldName == kBucket) { - if (hiveSplit_->tableBucketNumber.has_value()) { - int32_t bucket = hiveSplit_->tableBucketNumber.value(); - auto constantVec = std::make_shared>( - connectorQueryCtx_->memoryPool(), - 1, - false, - INTEGER(), - std::move(bucket)); - childSpec->setConstantValue(constantVec); - } } else if (auto iter = hiveSplit_->infoColumns.find(fieldName); iter != hiveSplit_->infoColumns.end()) { auto infoColumnType = diff --git a/velox/exec/fuzzer/FuzzerUtil.cpp b/velox/exec/fuzzer/FuzzerUtil.cpp index c13688455918..fa2908c6f8b9 100644 --- a/velox/exec/fuzzer/FuzzerUtil.cpp +++ b/velox/exec/fuzzer/FuzzerUtil.cpp @@ -119,20 +119,32 @@ Split makeSplit( const std::unordered_map>& partitionKeys, std::optional tableBucketNumber) { - return Split{std::make_shared( + return Split{makeConnectorSplit(filePath, partitionKeys, tableBucketNumber)}; +} + +std::shared_ptr makeConnectorSplit( + const std::string& filePath, + const std::unordered_map>& + partitionKeys, + std::optional tableBucketNumber) { + std::unordered_map infoColumns = { + {"$path", filePath}}; + if (tableBucketNumber.has_value()) { + infoColumns["$bucket"] = std::to_string(*tableBucketNumber); + } + return std::make_shared( kHiveConnectorId, filePath, dwio::common::FileFormat::DWRF, 0, std::numeric_limits::max(), partitionKeys, - tableBucketNumber)}; -} - -std::shared_ptr makeConnectorSplit( - const std::string& filePath) { - return std::make_shared( - kHiveConnectorId, filePath, dwio::common::FileFormat::DWRF); + tableBucketNumber, + /*customSplitInfo=*/std::unordered_map{}, + /*extraFileInfo=*/nullptr, + /*serdeParameters=*/std::unordered_map{}, + /*splitWeight=*/0, + infoColumns); } std::vector makeNames(const std::string& prefix, size_t n) { diff --git a/velox/exec/fuzzer/FuzzerUtil.h b/velox/exec/fuzzer/FuzzerUtil.h index fc06df157721..00b0f5a9d656 100644 --- a/velox/exec/fuzzer/FuzzerUtil.h +++ b/velox/exec/fuzzer/FuzzerUtil.h @@ -62,7 +62,10 @@ Split makeSplit( /// Create a connector split from an exsiting file. std::shared_ptr makeConnectorSplit( - const std::string& filePath); + const std::string& filePath, + const std::unordered_map>& + partitionKeys = {}, + std::optional tableBucketNumber = std::nullopt); /// Create column names with the pattern '${prefix}${i}'. std::vector makeNames(const std::string& prefix, size_t n); diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 576e44cbdbcc..bbacb0def04d 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -2723,7 +2723,7 @@ TEST_F(TableScanTest, path) { auto assignments = allRegularColumns(rowType); assignments[kPath] = synthesizedColumn(kPath, VARCHAR()); - auto pathValue = fmt::format("file:{}", filePath->getPath()); + auto& pathValue = filePath->getPath(); auto typeWithPath = ROW({kPath, "a"}, {VARCHAR(), BIGINT()}); auto op = PlanBuilder() .startTableScan() diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h index aa6e5ac5beb9..74dd5223307b 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -225,8 +225,10 @@ class HiveConnectorTestBase : public OperatorTestBase { class HiveConnectorSplitBuilder { public: - HiveConnectorSplitBuilder(std::string filePath) - : filePath_{std::move(filePath)} {} + explicit HiveConnectorSplitBuilder(std::string filePath) + : filePath_{std::move(filePath)} { + infoColumns_["$path"] = filePath_; + } HiveConnectorSplitBuilder& start(uint64_t start) { start_ = start; @@ -264,6 +266,7 @@ class HiveConnectorSplitBuilder { HiveConnectorSplitBuilder& tableBucketNumber(int32_t bucket) { tableBucketNumber_ = bucket; + infoColumns_["$bucket"] = std::to_string(bucket); return *this; } @@ -296,7 +299,7 @@ class HiveConnectorSplitBuilder { static const std::unordered_map serdeParameters; return std::make_shared( connectorId_, - filePath_.find("/") == 0 ? "file:" + filePath_ : filePath_, + filePath_, fileFormat_, start_, length_,