@@ -60,7 +60,6 @@ bool TKqpPlanner::UseMockEmptyPlanner = false;

// Task can allocate extra memory during execution.
// So, we estimate total memory amount required for task as apriori task size multiplied by this constant.
constexpr ui32 MEMORY_ESTIMATION_OVERFLOW = 2;
-constexpr ui32 MAX_NON_PARALLEL_TASKS_EXECUTION_LIMIT = 8;

TKqpPlanner::TKqpPlanner(TKqpPlanner::TArgs&& args)
    : TxId(args.TxId)
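The comment above the surviving constant describes a simple safety formula: required memory = apriori task size x MEMORY_ESTIMATION_OVERFLOW. A minimal self-contained sketch of that estimate (the 512 MiB figure and the helper name are invented for illustration, not taken from the source):

    #include <cstdint>

    constexpr uint32_t MEMORY_ESTIMATION_OVERFLOW = 2;  // mirrors the constant kept above

    // A task estimated at 512 MiB reserves 1 GiB, leaving headroom for
    // extra allocations the task may make during execution.
    uint64_t RequiredMemory(uint64_t aprioriTaskSize) {
        return aprioriTaskSize * MEMORY_ESTIMATION_OVERFLOW;
    }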
@@ -256,9 +255,18 @@ std::unique_ptr<IEventHandle> TKqpPlanner::AssignTasksToNodes() {

    auto localResources = ResourceManager_->GetLocalResources();
    Y_UNUSED(MEMORY_ESTIMATION_OVERFLOW);
+
+    auto placingOptions = ResourceManager_->GetPlacingOptions();
+
+    bool singleNodeExecutionMakeSence = (
+        ResourceEstimations.size() <= placingOptions.MaxNonParallelTasksExecutionLimit ||
+        // all readers are located on one node.
+        TasksPerNode.size() == 1
+    );
+
    if (LocalRunMemoryEst * MEMORY_ESTIMATION_OVERFLOW <= localResources.Memory[NRm::EKqpMemoryPool::ScanQuery] &&
        ResourceEstimations.size() <= localResources.ExecutionUnits &&
-        ResourceEstimations.size() <= MAX_NON_PARALLEL_TASKS_EXECUTION_LIMIT)
+        singleNodeExecutionMakeSence)
    {
        ui64 selfNodeId = ExecuterId.NodeId();
        for (ui64 taskId: ComputeTasks) {
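This hunk replaces the hard-coded cap of 8 tasks with the configurable placingOptions.MaxNonParallelTasksExecutionLimit, and additionally allows single-node execution when every reading task already landed on one node. A self-contained sketch of the predicate — TPlacingOptions and SingleNodeExecutionMakesSense are stand-ins mirroring the fields used above, not the real declarations:

    #include <cstddef>
    #include <cstdint>

    // Stand-in for the options returned by GetPlacingOptions(); field names mirror the diff.
    struct TPlacingOptions {
        uint64_t MaxNonParallelTasksExecutionLimit = 8;
        uint64_t MaxNonParallelTopStageExecutionLimit = 1;
    };

    // Single-node execution makes sense when the query is small enough,
    // or when all reading tasks are already pinned to the same node.
    bool SingleNodeExecutionMakesSense(size_t taskCount, size_t nodesWithTasks,
                                       const TPlacingOptions& opts) {
        return taskCount <= opts.MaxNonParallelTasksExecutionLimit || nodesWithTasks == 1;
    }

Note that the memory and execution-unit guards in the surrounding if still have to pass; only the fixed task-count cap became configurable.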
@@ -293,6 +301,41 @@ std::unique_ptr<IEventHandle> TKqpPlanner::AssignTasksToNodes() {
        return std::make_unique<IEventHandle>(ExecuterId, ExecuterId, ev.Release());
    }

+    std::vector<ui64> deepestTasks;
+    ui64 maxLevel = 0;
+    for (auto& task : TasksGraph.GetTasks()) {
+        // const auto& task = TasksGraph.GetTask(taskId);
+        const auto& stageInfo = TasksGraph.GetStageInfo(task.StageId);
+        const NKqpProto::TKqpPhyStage& stage = stageInfo.Meta.GetStage(stageInfo.Id);
+        const ui64 stageLevel = stage.GetProgram().GetSettings().GetStageLevel();
+
+        if (stageLevel > maxLevel) {
+            maxLevel = stageLevel;
+            deepestTasks.clear();
+        }
+
+        if (stageLevel == maxLevel) {
+            deepestTasks.push_back(task.Id);
+        }
+    }
+
+    THashMap<ui64, ui64> alreadyAssigned;
+    for (auto& [nodeId, tasks] : TasksPerNode) {
+        for (ui64 taskId: tasks) {
+            alreadyAssigned.emplace(taskId, nodeId);
+        }
+    }
+
+    if (deepestTasks.size() <= placingOptions.MaxNonParallelTopStageExecutionLimit) {
+        // looks like the merge / union all connection
+        for (ui64 taskId: deepestTasks) {
+            auto [it, success] = alreadyAssigned.emplace(taskId, ExecuterId.NodeId());
+            if (success) {
+                TasksPerNode[ExecuterId.NodeId()].push_back(taskId);
+            }
+        }
+    }
+
    auto planner = (UseMockEmptyPlanner ? CreateKqpMockEmptyPlanner() : CreateKqpGreedyPlanner()); // KqpMockEmptyPlanner is a mock planner for tests

    auto ctx = TlsActivationContext->AsActorContext();
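The added block pins the tasks of the deepest stage — per the inline comment, likely the producers feeding a final merge / union-all connection — to the executer's node, and records them in alreadyAssigned (moved up from below the Plan call, see the last hunk) so the greedy planner respects those pins. A minimal standalone sketch of the max-level scan; TTaskStub and CollectDeepestTasks are hypothetical names, not the graph's real types:

    #include <cstdint>
    #include <vector>

    struct TTaskStub { uint64_t Id; uint64_t StageLevel; };  // stand-in for TasksGraph's task

    // One pass: track the maximum stage level seen so far and keep only the
    // task ids at that level, resetting the result whenever a deeper level appears.
    std::vector<uint64_t> CollectDeepestTasks(const std::vector<TTaskStub>& tasks) {
        std::vector<uint64_t> deepest;
        uint64_t maxLevel = 0;
        for (const auto& task : tasks) {
            if (task.StageLevel > maxLevel) {
                maxLevel = task.StageLevel;
                deepest.clear();
            }
            if (task.StageLevel == maxLevel) {
                deepest.push_back(task.Id);
            }
        }
        return deepest;
    }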
@@ -309,13 +352,6 @@ std::unique_ptr<IEventHandle> TKqpPlanner::AssignTasksToNodes() {

    auto plan = planner->Plan(ResourcesSnapshot, ResourceEstimations);

-    THashMap<ui64, ui64> alreadyAssigned;
-    for (auto& [nodeId, tasks] : TasksPerNode) {
-        for (ui64 taskId: tasks) {
-            alreadyAssigned.emplace(taskId, nodeId);
-        }
-    }
-
    if (!plan.empty()) {
        for (auto& group : plan) {
            for (ui64 taskId: group.TaskIds) {