Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build hash table while adding input rows for left semi and anti join #7066

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion velox/common/memory/tests/SharedArbitratorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ DEBUG_ONLY_TEST_P(
folly::EventCount taskPauseWait;
auto taskPauseWaitKey = taskPauseWait.prepareWait();

const auto fakeAllocationSize = kMemoryCapacity - (32L << 20);
const auto fakeAllocationSize = kMemoryCapacity - (2L << 20);

std::atomic<bool> injectAllocationOnce{true};
fakeOperatorFactory_->setAllocationCallback([&](Operator* op) {
Expand Down
14 changes: 14 additions & 0 deletions velox/core/QueryConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ class QueryConfig {
static constexpr const char* kAbandonPartialTopNRowNumberMinPct =
"abandon_partial_topn_row_number_min_pct";

static constexpr const char* kAbandonBuildNoDupHashMinRows =
"abandon_build_no_dup_hash_min_rows";

static constexpr const char* kAbandonBuildNoDupHashMinPct =
"abandon_build_no_dup_hash_min_pct";

static constexpr const char* kMaxPartitionedOutputBufferSize =
"max_page_partitioning_buffer_size";

Expand Down Expand Up @@ -454,6 +460,14 @@ class QueryConfig {
return get<int32_t>(kAbandonPartialTopNRowNumberMinPct, 80);
}

int32_t abandonBuildNoDupHashMinRows() const {
return get<int32_t>(kAbandonBuildNoDupHashMinRows, 100'000);
}

int32_t abandonBuildNoDupHashMinPct() const {
return get<int32_t>(kAbandonBuildNoDupHashMinPct, 80);
}

uint64_t maxSpillRunRows() const {
static constexpr uint64_t kDefault = 12UL << 20;
return get<uint64_t>(kMaxSpillRunRows, kDefault);
Expand Down
73 changes: 57 additions & 16 deletions velox/exec/HashBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ HashBuild::HashBuild(
joinBridge_(operatorCtx_->task()->getHashJoinBridgeLocked(
operatorCtx_->driverCtx()->splitGroupId,
planNodeId())),
keyChannelMap_(joinNode_->rightKeys().size()) {
keyChannelMap_(joinNode_->rightKeys().size()),
abandonBuildNoDupHashMinRows_(
driverCtx->queryConfig().abandonBuildNoDupHashMinRows()),
abandonBuildNoDupHashMinPct_(
driverCtx->queryConfig().abandonBuildNoDupHashMinPct()) {
VELOX_CHECK(pool()->trackUsage());
VELOX_CHECK_NOT_NULL(joinBridge_);

Expand All @@ -91,9 +95,11 @@ HashBuild::HashBuild(
types.emplace_back(inputType->childAt(channel));
}

dropDuplicates_ = canDropDuplicates(joinNode_);

// Identify the non-key build side columns and make a decoder for each.
const int32_t numDependents = inputType->size() - numKeys;
if (numDependents > 0) {
if (!dropDuplicates_ && numDependents > 0) {
// Number of join keys (numKeys) may be greater than the number of input columns
// (inputType->size()). In this case numDependents is negative and cannot be
// used to call 'reserve'. This happens when we join different probe side
Expand All @@ -102,12 +108,16 @@ HashBuild::HashBuild(
dependentChannels_.reserve(numDependents);
decoders_.reserve(numDependents);
}
for (auto i = 0; i < inputType->size(); ++i) {
if (keyChannelMap_.find(i) == keyChannelMap_.end()) {
dependentChannels_.emplace_back(i);
decoders_.emplace_back(std::make_unique<DecodedVector>());
names.emplace_back(inputType->nameOf(i));
types.emplace_back(inputType->childAt(i));
if (!dropDuplicates_) {
// For left semi and anti join with no extra filter, hash table does not
// store dependent columns.
for (auto i = 0; i < inputType->size(); ++i) {
if (keyChannelMap_.find(i) == keyChannelMap_.end()) {
dependentChannels_.emplace_back(i);
decoders_.emplace_back(std::make_unique<DecodedVector>());
names.emplace_back(inputType->nameOf(i));
types.emplace_back(inputType->childAt(i));
}
}
}

Expand Down Expand Up @@ -155,11 +165,6 @@ void HashBuild::setupTable() {
.minTableRowsForParallelJoinBuild(),
pool());
} else {
// (Left) semi and anti join with no extra filter only needs to know whether
// there is a match. Hence, no need to store entries with duplicate keys.
const bool dropDuplicates = !joinNode_->filter() &&
(joinNode_->isLeftSemiFilterJoin() ||
joinNode_->isLeftSemiProjectJoin() || isAntiJoin(joinType_));
// Right semi join needs to tag build rows that were probed.
const bool needProbedFlag = joinNode_->isRightSemiFilterJoin();
if (isLeftNullAwareJoinWithFilter(joinNode_)) {
Expand All @@ -168,7 +173,7 @@ void HashBuild::setupTable() {
table_ = HashTable<false>::createForJoin(
std::move(keyHashers),
dependentTypes,
!dropDuplicates, // allowDuplicates
!dropDuplicates_, // allowDuplicates
needProbedFlag, // hasProbedFlag
operatorCtx_->driverCtx()
->queryConfig()
Expand All @@ -179,14 +184,16 @@ void HashBuild::setupTable() {
table_ = HashTable<true>::createForJoin(
std::move(keyHashers),
dependentTypes,
!dropDuplicates, // allowDuplicates
!dropDuplicates_, // allowDuplicates
needProbedFlag, // hasProbedFlag
operatorCtx_->driverCtx()
->queryConfig()
.minTableRowsForParallelJoinBuild(),
pool());
}
}
lookup_ = std::make_unique<HashLookup>(table_->hashers());
lookup_->reset(1);
analyzeKeys_ = table_->hashMode() != BaseHashTable::HashMode::kHash;
}

Expand Down Expand Up @@ -381,6 +388,32 @@ void HashBuild::addInput(RowVectorPtr input) {
return;
}

numInputRows_ += activeRows_.countSelected();

if (dropDuplicates_ && !abandonBuildNoDupHash_) {
const bool abandonEarly = abandonBuildNoDupHashEarly(table_->numDistinct());
if (abandonEarly) {
// The hash table is no longer directly constructed in addInput. The data
// that was previously inserted into the hash table is already in the
// RowContainer.
addRuntimeStat("abandonBuildNoDupHash", RuntimeCounter(1));
abandonBuildNoDupHash_ = true;
table_->joinTableMayHaveDuplicates();
} else {
table_->prepareForGroupProbe(
*lookup_,
input,
activeRows_,
BaseHashTable::kNoSpillInputStartPartitionBit);
if (lookup_->rows.empty()) {
return;
}
table_->groupProbe(
*lookup_, BaseHashTable::kNoSpillInputStartPartitionBit);
return;
}
}

if (analyzeKeys_ && hashes_.size() < activeRows_.end()) {
hashes_.resize(activeRows_.end());
}
Expand Down Expand Up @@ -771,7 +804,8 @@ bool HashBuild::finishHashBuild() {
isInputFromSpill() ? spillConfig()->startPartitionBit
: BaseHashTable::kNoSpillInputStartPartitionBit,
allowParallelJoinBuild ? operatorCtx_->task()->queryCtx()->executor()
: nullptr);
: nullptr,
dropDuplicates_);
}
stats_.wlock()->addRuntimeStat(
BaseHashTable::kBuildWallNanos,
Expand Down Expand Up @@ -869,6 +903,7 @@ void HashBuild::setupSpillInput(HashJoinBridge::SpillInput spillInput) {
setupTable();
setupSpiller(spillInput.spillPartition.get());
stateCleared_ = false;
numInputRows_ = 0;

// Start to process spill input.
processSpillInput();
Expand Down Expand Up @@ -1171,4 +1206,10 @@ void HashBuild::close() {
table_.reset();
}
}

// Decides whether to stop eagerly building the deduplicating hash table in
// addInput(): once enough input rows have been observed and most of them
// turned out to be distinct, deduplication at build time no longer pays off.
bool HashBuild::abandonBuildNoDupHashEarly(int64_t numDistinct) const {
  VELOX_CHECK(dropDuplicates_);
  // Too little input seen so far to make a reliable decision.
  if (numInputRows_ <= abandonBuildNoDupHashMinRows_) {
    return false;
  }
  // Give up when the share of distinct rows reaches the configured threshold.
  const int64_t distinctPct = 100 * numDistinct / numInputRows_;
  return distinctPct >= abandonBuildNoDupHashMinPct_;
}
} // namespace facebook::velox::exec
22 changes: 22 additions & 0 deletions velox/exec/HashBuild.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,11 @@ class HashBuild final : public Operator {
// not.
bool nonReclaimableState() const;

// True if we have enough rows and not enough duplicate join keys, i.e. more
// than 'abandonBuildNoDupHashMinRows_' rows and at least
// 'abandonBuildNoDupHashMinPct_' % of rows are unique.
bool abandonBuildNoDupHashEarly(int64_t numDistinct) const;

const std::shared_ptr<const core::HashJoinNode> joinNode_;

const core::JoinType joinType_;
Expand Down Expand Up @@ -237,6 +242,7 @@ class HashBuild final : public Operator {

// Container for the rows being accumulated.
std::unique_ptr<BaseHashTable> table_;
std::unique_ptr<HashLookup> lookup_;

// Key channels in 'input_'
std::vector<column_index_t> keyChannels_;
Expand Down Expand Up @@ -265,6 +271,11 @@ class HashBuild final : public Operator {
// at least one entry with null join keys.
bool joinHasNullKeys_{false};

// Indicates whether to drop duplicate rows. Rows containing duplicate keys
// can be removed for left semi and anti joins.
bool dropDuplicates_{false};
bool abandonBuildNoDupHash_{false};

// The type used to spill hash table which might attach a boolean column to
// record the probed flag if 'needProbedFlagSpill_' is true.
RowTypePtr spillType_;
Expand Down Expand Up @@ -303,6 +314,17 @@ class HashBuild final : public Operator {

// Maps key channel in 'input_' to channel in key.
folly::F14FastMap<column_index_t, column_index_t> keyChannelMap_;

// Count the number of input rows.
int64_t numInputRows_ = 0;

// Minimum number of rows to see before deciding to give up building the
// no-duplicates hash table.
const int32_t abandonBuildNoDupHashMinRows_;
// Minimum percentage of unique rows at which building the no-duplicates hash
// table is given up. If at least this percentage of rows is unique, building
// the hash table in the addInput phase is not worthwhile.
const int32_t abandonBuildNoDupHashMinPct_;
};

inline std::ostream& operator<<(std::ostream& os, HashBuild::State state) {
Expand Down
9 changes: 9 additions & 0 deletions velox/exec/HashJoinBridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ bool isLeftNullAwareJoinWithFilter(
joinNode->isNullAware() && (joinNode->filter() != nullptr);
}

// Returns true when the hash build for 'joinNode' may discard rows whose join
// keys duplicate an already-stored row. A (left) semi or anti join without an
// extra filter only needs to know whether a match exists, so keeping one row
// per distinct key is sufficient.
bool canDropDuplicates(
    const std::shared_ptr<const core::HashJoinNode>& joinNode) {
  if (joinNode->filter() != nullptr) {
    // A filter may need to inspect every build-side row, so duplicates must
    // be retained.
    return false;
  }
  return joinNode->isLeftSemiFilterJoin() ||
      joinNode->isLeftSemiProjectJoin() || joinNode->isAntiJoin();
}

uint64_t HashJoinMemoryReclaimer::reclaim(
memory::MemoryPool* pool,
uint64_t targetBytes,
Expand Down
5 changes: 5 additions & 0 deletions velox/exec/HashJoinBridge.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ class HashJoinBridge : public JoinBridge {
bool isLeftNullAwareJoinWithFilter(
const std::shared_ptr<const core::HashJoinNode>& joinNode);

// Indicates if 'joinNode' can drop duplicate rows with the same join key. For
// left semi and anti joins, it is not necessary to store duplicate rows.
bool canDropDuplicates(
const std::shared_ptr<const core::HashJoinNode>& joinNode);

class HashJoinMemoryReclaimer final : public MemoryReclaimer {
public:
static std::unique_ptr<memory::MemoryReclaimer> create() {
Expand Down
15 changes: 10 additions & 5 deletions velox/exec/HashProbe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1979,13 +1979,18 @@ void HashProbe::prepareTableSpill(
names.emplace_back(tableInputType->nameOf(channel));
types.emplace_back(tableInputType->childAt(channel));
}
const auto numDependents = tableInputType->size() - numKeys;
for (auto i = 0; i < tableInputType->size(); ++i) {
if (keyChannelMap.find(i) == keyChannelMap.end()) {
names.emplace_back(tableInputType->nameOf(i));
types.emplace_back(tableInputType->childAt(i));
if (!canDropDuplicates(joinNode_)) {
// For left semi and anti join with no extra filter, hash table does not
// store dependent columns.
const auto numDependents = tableInputType->size() - numKeys;
for (auto i = 0; i < tableInputType->size(); ++i) {
if (keyChannelMap.find(i) == keyChannelMap.end()) {
names.emplace_back(tableInputType->nameOf(i));
types.emplace_back(tableInputType->childAt(i));
}
}
}

tableSpillType_ = hashJoinTableSpillType(
ROW(std::move(names), std::move(types)), joinType_);
}
Expand Down
31 changes: 28 additions & 3 deletions velox/exec/HashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ HashTable<ignoreNullKeys>::HashTable(
memory::MemoryPool* pool)
: BaseHashTable(std::move(hashers)),
minTableSizeForParallelJoinBuild_(minTableSizeForParallelJoinBuild),
isJoinBuild_(isJoinBuild) {
isJoinBuild_(isJoinBuild),
joinBuildNoDuplicates_(!allowDuplicates) {
std::vector<TypePtr> keys;
for (auto& hasher : hashers_) {
keys.push_back(hasher->type());
Expand Down Expand Up @@ -1512,7 +1513,9 @@ void HashTable<ignoreNullKeys>::decideHashMode(
return;
}
disableRangeArrayHash_ |= disableRangeArrayHash;
if (numDistinct_ && !isJoinBuild_) {
if (numDistinct_ && (!isJoinBuild_ || joinBuildNoDuplicates_)) {
// For left semi and anti joins duplicates are dropped (the table was created
// with allowDuplicates == false), and the join build constructs the hash
// table while adding input rows.
if (!analyze()) {
setHashMode(HashMode::kHash, numNew, spillInputStartPartitionBit);
return;
Expand Down Expand Up @@ -1731,8 +1734,21 @@ template <bool ignoreNullKeys>
void HashTable<ignoreNullKeys>::prepareJoinTable(
std::vector<std::unique_ptr<BaseHashTable>> tables,
int8_t spillInputStartPartitionBit,
folly::Executor* executor) {
folly::Executor* executor,
bool dropDuplicates) {
buildExecutor_ = executor;
if (dropDuplicates) {
if (table_ != nullptr) {
// Set table_ to nullptr to trigger rehash.
rows_->pool()->freeContiguous(tableAllocation_);
table_ = nullptr;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to clean up capacity_ and hashMode_ variables?

Copy link
Contributor Author

@liujiayi771 liujiayi771 Oct 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to clean up other variables. Setting table_ = nullptr here is to allow entering the rehash process in the checkSize, and it is not intended for data cleanup.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// Regression test: the table's capacity bookkeeping must be reset when the
// build abandons eager deduplication mid-stream.
TEST_F(HashJoinTest, semiJoinDeduplicateResetCapacity) {
  // The initial size of HashTable is 2048. After seeing
  // kAbandonBuildNoDupHashMinRows input rows, the build finds the duplication
  // rate lower than kAbandonBuildNoDupHashMinPct, so it stops inserting into
  // the HashTable (the capacity_ variable no longer changes afterwards).
  // However, rows keep accumulating in the RowContainer, so numDistinct_
  // grows beyond 2048 and, without resetting capacity_, an error is
  // eventually reported in HashTable<ignoreNullKeys>::checkSize.
  const int vectorSize = 10, batches = 210;
  auto probeVectors = makeBatches(batches, [&](int32_t /*unused*/) {
    return makeRowVector({
        // Join key is double -> VectorHasher::typeKindSupportsValueIds
        // returns false -> HashMode is kHash.
        makeFlatVector<double>(
            vectorSize, [&](vector_size_t /*row*/) { return rand(); }),
        makeFlatVector<int64_t>(
            vectorSize, [&](vector_size_t /*row*/) { return rand(); }),
    });
  });

  // Build side has the same shape as the probe side; 'batch' is unused.
  auto buildVectors = makeBatches(batches, [&](int32_t batch) {
    return makeRowVector({
        makeFlatVector<double>(
            vectorSize, [&](vector_size_t /*row*/) { return rand(); }),
        makeFlatVector<int64_t>(
            vectorSize, [&](vector_size_t /*row*/) { return rand(); }),
    });
  });

  createDuckDbTable("t", probeVectors);
  createDuckDbTable("u", buildVectors);

  // Left semi project join on the double key triggers the drop-duplicates
  // build path exercised by this test.
  auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
  auto plan = PlanBuilder(planNodeIdGenerator)
                  .values(probeVectors)
                  .project({"c0 AS t0", "c1 AS t1"})
                  .hashJoin(
                      {"t0"},
                      {"u0"},
                      PlanBuilder(planNodeIdGenerator)
                          .values(buildVectors)
                          .project({"c0 AS u0", "c1 AS u1"})
                          .planNode(),
                      "",
                      {"t0", "t1", "match"},
                      core::JoinType::kLeftSemiProject)
                  .planNode();

  // Low thresholds force the abandon path early with this small input.
  HashJoinBuilder(*pool_, duckDbQueryRunner_, driverExecutor_.get())
      .config(core::QueryConfig::kAbandonBuildNoDupHashMinRows, "10")
      .config(core::QueryConfig::kAbandonBuildNoDupHashMinPct, "50")
      .numDrivers(1)
      .checkSpillStats(false)
      .planNode(plan)
      .referenceQuery(
          "SELECT t.c0, t.c1, EXISTS (SELECT * FROM u WHERE t.c0 = u.c0) FROM t")
      .run();
}

This case will not work without resetting the capacity variable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@XinShuoWang Thank you for your review. I understand that when "abandon" is triggered, the capacity_ is not accurate and needs to be reset. I will make the necessary modifications and incorporate your test cases.

}
// Call analyze to insert all unique values in row container to the
// table hashers' uniqueValues_;
if (!analyze()) {
setHashMode(HashMode::kHash, 0, spillInputStartPartitionBit);
}
}
otherTables_.reserve(tables.size());
for (auto& table : tables) {
otherTables_.emplace_back(std::unique_ptr<HashTable<ignoreNullKeys>>(
Expand Down Expand Up @@ -1762,6 +1778,15 @@ void HashTable<ignoreNullKeys>::prepareJoinTable(
}
if (useValueIds) {
for (auto& other : otherTables_) {
if (dropDuplicates) {
// Before merging with the current hashers, all values in the row
// containers of other table need to be inserted into uniqueValues_.
if (!other->analyze()) {
other->setHashMode(HashMode::kHash, 0, spillInputStartPartitionBit);
useValueIds = false;
break;
}
}
for (auto i = 0; i < hashers_.size(); ++i) {
hashers_[i]->merge(*other->hashers_[i]);
if (!hashers_[i]->mayUseValueIds()) {
Expand Down
Loading
Loading