pingcap · ti-chi-bot · May 10, 2023 · Apr 26, 2023 · Apr 26, 2023 · Apr 26, 2023
diff --git a/...ine/Schedule/TaskQueues/FiFOTaskQueue.cpp → ...ine/Schedule/TaskQueues/FIFOTaskQueue.cpp b/...ine/Schedule/TaskQueues/FiFOTaskQueue.cpp → ...ine/Schedule/TaskQueues/FIFOTaskQueue.cpp
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <Flash/Pipeline/Schedule/TaskQueues/FiFOTaskQueue.h>
+#include <Flash/Pipeline/Schedule/TaskQueues/FIFOTaskQueue.h>
 #include <assert.h>
 #include <common/likely.h>
 

diff --git a/...eline/Schedule/TaskQueues/FiFOTaskQueue.h → ...eline/Schedule/TaskQueues/FIFOTaskQueue.h b/...eline/Schedule/TaskQueues/FiFOTaskQueue.h → ...eline/Schedule/TaskQueues/FIFOTaskQueue.h
@@ -30,6 +30,8 @@ class FIFOTaskQueue : public TaskQueue
 
     bool take(TaskPtr & task) noexcept override;
 
+    // No-op override: both arguments are ignored, so nothing is recorded for this queue.
+    void updateStatistics(const TaskPtr &, size_t) noexcept override {}
+
     bool empty() noexcept override;
 
     void close() override;

diff --git a/dbms/src/Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.cpp b/dbms/src/Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.cpp
@@ -0,0 +1,199 @@
+// Copyright 2023 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h>
+#include <assert.h>
+#include <common/likely.h>
-#include <Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h>
-#include <assert.h>
-#include <common/likely.h>
+#include <Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h>
+#include <common/likely.h>
+
+#include <assert.h>
-#include <Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h>
-#include <assert.h>
-#include <common/likely.h>
+#include <Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h>
+#include <common/likely.h>
+
+#include <assert.h>
+
+namespace DB
+{
+// Removes the task at the front of this queue and hands it to the caller through `task`.
+// Preconditions (checked by assert only): `task` is empty on entry and the queue is not empty.
+void UnitQueue::take(TaskPtr & task)
+{
+    assert(!task);
+    assert(!empty());
+    auto & head = task_queue.front();
+    task = std::move(head);
+    // The front slot has been moved from; drop it from the deque.
+    task_queue.pop_front();
+    assert(task);
+}
+
+// Returns true when no task is currently stored in this queue.
+bool UnitQueue::empty()
+{
+    return task_queue.empty();
+}
+
+// Appends `task` to the back of this queue, taking ownership of it.
+// `task` must be non-empty (checked by assert only).
+void UnitQueue::submit(TaskPtr && task)
+{
+    assert(task);
+    task_queue.push_back(std::move(task));
+}
+
+// Returns the accumulated consumed time of this queue divided by its normalization factor.
+// `MultiLevelFeedbackQueue::take` compares this value across levels; the smallest one is served first.
+double UnitQueue::accuTimeAfterDivisor()
+{
+    // Read the atomic counter once, then normalize in floating point.
+    const auto consumed = accu_consume_time.load();
+    return static_cast<double>(consumed) / info.factor_for_normal;
+}
+
+// Builds the QUEUE_SIZE unit queues, giving each level its time-slice threshold and normalization factor.
+template <typename TimeGetter>
+MultiLevelFeedbackQueue<TimeGetter>::MultiLevelFeedbackQueue()
+{
+    static constexpr double RATIO_OF_ADJACENT_QUEUE = 1.2;
+
+    // Thresholds are cumulative: level k adds (k + 1) * LEVEL_TIME_SLICE_BASE_NS to the threshold of level k - 1.
+    std::array<UInt64, QUEUE_SIZE> slice_thresholds{};
+    UInt64 accumulated = 0;
+    for (size_t level = 0; level < QUEUE_SIZE; ++level)
+    {
+        accumulated += LEVEL_TIME_SLICE_BASE_NS * (level + 1);
+        slice_thresholds[level] = accumulated;
+    }
+
+    // Initialize the factor of every unit queue, walking from the last level (factor 1) to the first.
+    // Higher priority queues have more execution time,
+    // so they should have a larger factor: each earlier level is RATIO_OF_ADJACENT_QUEUE times the next one.
+    std::array<double, QUEUE_SIZE> normal_factors{};
+    double current_factor = 1;
+    for (size_t remaining = QUEUE_SIZE; remaining > 0; --remaining)
+    {
+        normal_factors[remaining - 1] = current_factor;
+        current_factor *= RATIO_OF_ADJACENT_QUEUE;
+    }
+
+    for (size_t level = 0; level < QUEUE_SIZE; ++level)
+        level_queues[level] = std::make_unique<UnitQueue>(slice_thresholds[level], normal_factors[level]);
+}
+
+// Updates `task->mlfq_level` from the execution time reported by `TimeGetter::get(task)`.
+// Starting at the task's current level, the task is placed in the first level whose `time_slice`
+// is greater than that time; if there is none, it is placed in the last level.
+template <typename TimeGetter>
+void MultiLevelFeedbackQueue<TimeGetter>::computeQueueLevel(const TaskPtr & task)
+{
+    const auto elapsed = TimeGetter::get(task);
+    // The search starts at the current level, so the level never decreases.
+    size_t level = task->mlfq_level;
+    while (level < QUEUE_SIZE && !(elapsed < getUnitQueueInfo(level).time_slice))
+        ++level;
+    task->mlfq_level = (level < QUEUE_SIZE) ? level : QUEUE_SIZE - 1;
+}
+
+// Submits a single task: recomputes its level, enqueues it into that level under `mu`,
+// then wakes one thread waiting in `take`. `task` is left empty on return.
+template <typename TimeGetter>
+void MultiLevelFeedbackQueue<TimeGetter>::submit(TaskPtr && task) noexcept
+{
+    assert(task);
+    computeQueueLevel(task);
+    {
+        std::lock_guard lock(mu);
+        auto & target_queue = level_queues[task->mlfq_level];
+        target_queue->submit(std::move(task));
+    }
+    // Ownership has been transferred to the unit queue.
+    assert(!task);
+    // The mutex is already released when the waiter is notified.
+    cv.notify_one();
+}
+
+// Submits a batch of tasks. Every element of `tasks` is moved into its level's queue,
+// so the vector keeps its size but its elements are left moved-from.
+// Does nothing when `tasks` is empty.
+template <typename TimeGetter>
+void MultiLevelFeedbackQueue<TimeGetter>::submit(std::vector<TaskPtr> & tasks) noexcept
+{
+    if (tasks.empty())
+        return;
+
+    // Levels are computed for the whole batch before `mu` is acquired.
+    for (const auto & pending : tasks)
+        computeQueueLevel(pending);
+
+    std::lock_guard lock(mu);
+    for (auto & pending : tasks)
+    {
+        auto & target_queue = level_queues[pending->mlfq_level];
+        target_queue->submit(std::move(pending));
+        // One wake-up per enqueued task.
+        cv.notify_one();
+    }
+}
+
+// Blocks until a task is available or the queue has been closed.
+// On success, moves the front task of the selected level into `task` and returns true.
+// Returns false, leaving `task` untouched, once the queue is closed; `is_closed` is checked
+// before the levels are scanned, so this happens even if tasks are still queued.
+// Selection: among the non-empty levels, the one with the smallest `accuTimeAfterDivisor()`;
+// because the comparison is strict, ties go to the lowest level index.
+// `task` must be empty on entry (checked by assert only).
+template <typename TimeGetter>
+bool MultiLevelFeedbackQueue<TimeGetter>::take(TaskPtr & task) noexcept
+{
+    assert(!task);
+    {
+        // -1 means no candidate has been found yet; otherwise it is the index of the chosen level.
+        int queue_idx = -1;
+        double target_accu_time = 0;
+        std::unique_lock lock(mu);
+        while (true)
+        {
+            if (unlikely(is_closed))
+                return false;
+
+            // Find the non-empty queue with the smallest normalized accumulated execution time.
+            for (size_t i = 0; i < QUEUE_SIZE; ++i)
+            {
+                // Only queues that currently hold at least one task are candidates.
+                const auto & cur_queue = level_queues[i];
+                if (!cur_queue->empty())
+                {
+                    double local_target_time = cur_queue->accuTimeAfterDivisor();
+                    if (queue_idx < 0 || local_target_time < target_accu_time)
+                    {
+                        target_accu_time = local_target_time;
+                        queue_idx = i;
+                    }
+                }
+            }
+
+            if (queue_idx >= 0)
+                break;
+            // Every level is empty: sleep until `submit` or `close` notifies `cv`, then re-check from the top.
+            cv.wait(lock);
+        }
+        level_queues[queue_idx]->take(task);
+    } // `lock` is released here, before the final assert.
+
+    assert(task);
+    return true;
+}
+
+// Adds `inc_value` to the accumulated consumed time of the level recorded in `task->mlfq_level`.
+// No mutex is taken here; the counter itself is atomic.
+template <typename TimeGetter>
+void MultiLevelFeedbackQueue<TimeGetter>::updateStatistics(const TaskPtr & task, size_t inc_value) noexcept
+{
+    assert(task);
+    auto & owning_queue = level_queues[task->mlfq_level];
+    owning_queue->accu_consume_time.fetch_add(inc_value);
+}
+
+// Returns true only when every level queue is empty. The check is done while holding `mu`.
+template <typename TimeGetter>
+bool MultiLevelFeedbackQueue<TimeGetter>::empty() noexcept
+{
+    std::lock_guard lock(mu);
+    bool all_empty = true;
+    // Stop at the first level that still holds a task.
+    for (size_t level = 0; all_empty && level < QUEUE_SIZE; ++level)
+        all_empty = level_queues[level]->empty();
+    return all_empty;
+}
+
+// Marks the queue as closed and wakes every thread waiting in `take`, which then returns false.
+template <typename TimeGetter>
+void MultiLevelFeedbackQueue<TimeGetter>::close() noexcept
+{
+    std::unique_lock lock(mu);
+    is_closed = true;
+    // Release the mutex before notifying the waiters.
+    lock.unlock();
+    cv.notify_all();
+}
+
+// Returns the parameters (time slice and normalization factor) of the given level.
+// `level` must be smaller than QUEUE_SIZE (checked by assert only).
+template <typename TimeGetter>
+const UnitQueueInfo & MultiLevelFeedbackQueue<TimeGetter>::getUnitQueueInfo(size_t level)
+{
+    assert(level < QUEUE_SIZE);
+    const auto & requested_queue = level_queues[level];
+    return requested_queue->info;
+}
+
+// The member functions of the template are defined in this .cpp file,
+// so the specializations for CPUTimeGetter and IOTimeGetter are explicitly instantiated here.
+template class MultiLevelFeedbackQueue<CPUTimeGetter>;
+template class MultiLevelFeedbackQueue<IOTimeGetter>;
+
+} // namespace DB
diff --git a/dbms/src/Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h b/dbms/src/Flash/Pipeline/Schedule/TaskQueues/MultiLevelFeedbackQueue.h
@@ -0,0 +1,160 @@
+// Copyright 2023 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Common/Logger.h>
+#include <Flash/Pipeline/Schedule/TaskQueues/TaskQueue.h>
+
+#include <array>
+#include <atomic>
+#include <deque>
+#include <mutex>
+
+namespace DB
+{
+///    +-----------+       +-----------+       +-----------+            +-----------+
+///    |UnitQueue 1|       |UnitQueue 2|       |UnitQueue 3|    ...     |UnitQueue 8|
+///    +-----------+       +-----------+       +-----------+            +-----------+
+///          ^                   ^                   ^                        ^
+///          |                   |                   |                        |
+/// +--------+--------+  +-------+--------+  +-------+--------+       +-------+--------+
+/// | Task 1          |  | Task 6         |  | Task 11        |       | Task 16        |
+/// +-----------------+  +----------------+  +----------------+       +----------------+
+///          ^                   ^                   ^                        ^
+///          |                   |                   |                        |
+/// +--------+--------+  +-------+--------+  +-------+--------+       +-------+--------+
+/// | Task 2          |  | Task 7         |  | Task 12        |       | Task 17        |
+/// +-----------------+  +----------------+  +----------------+       +----------------+
+///          ^                   ^                   ^                        ^
+///          |                   |                   |                        |
+/// +--------+--------+  +-------+--------+  +-------+--------+       +-------+--------+
+/// | Task 3          |  | Task 8         |  | Task 13        |       | Task 18        |
+/// +-----------------+  +----------------+  +----------------+       +----------------+
+///          ^                   ^                   ^                        ^
+///          |                   |                   |                        |
+/// +--------+--------+  +-------+--------+  +-------+--------+       +-------+--------+
+/// | Task 4          |  | Task 9         |  | Task 14        |       | Task 19        |
+/// +-----------------+  +----------------+  +----------------+       +----------------+
+
+// Scheduling parameters of one level (UnitQueue) of the multi-level feedback queue.
+struct UnitQueueInfo
+{
+    // Both arguments must be strictly positive; this is only checked by assert.
+    UnitQueueInfo(UInt64 time_slice_, double factor_for_normal_)
+        : time_slice(time_slice_)
+        , factor_for_normal(factor_for_normal_)
+    {
+        assert(time_slice > 0);
+        assert(factor_for_normal > 0);
+    }
+
+    // Exclusive upper bound on a task's execution time for the task to be placed in this level:
+    // `MultiLevelFeedbackQueue::computeQueueLevel` puts a task into the first level, starting from
+    // the task's current one, whose `time_slice` is greater than the time reported by `TimeGetter`.
+    UInt64 time_slice;
+
+    // factor for normalization.
+    // The priority value is equal to `accu_consume_time / factor_for_normal`.
+    // The smaller the value, the higher the priority.
+    // Therefore, the higher the priority of the queue, the larger the value of factor_for_normal.
+    double factor_for_normal;
+};
+
+// One level of the multi-level feedback queue: a deque of tasks served front-first,
+// together with the level's parameters and its accumulated consumed time.
+// UnitQueue itself takes no lock; MultiLevelFeedbackQueue calls submit/take/empty/accuTimeAfterDivisor
+// while holding its own mutex.
+class UnitQueue
+{
+public:
+    UnitQueue(UInt64 time_slice_, double factor_for_normal_)
+        : info(time_slice_, factor_for_normal_)
+    {}
+
+    // Appends a non-empty task to the back of the queue.
+    void submit(TaskPtr && task);
+
+    // Moves the front task into `task`; the queue must not be empty and `task` must be empty.
+    void take(TaskPtr & task);
+
+    // Returns true when the queue holds no task.
+    bool empty();
+
+    // Returns `accu_consume_time / info.factor_for_normal`, the value compared across levels when taking a task.
+    double accuTimeAfterDivisor();
+
+public:
+    // Time slice and normalization factor of this level; fixed at construction.
+    const UnitQueueInfo info;
+    // Sum of the values added through MultiLevelFeedbackQueue::updateStatistics for tasks of this level.
+    // Atomic because that update is done without holding the queue mutex.
+    // NOTE(review): presumably in nanoseconds, like `time_slice` — confirm against the caller.
+    std::atomic_uint64_t accu_consume_time{0};
+
+private:
+    // Pending tasks: pushed at the back, taken from the front.
+    std::deque<TaskPtr> task_queue;
+};
+using UnitQueuePtr = std::unique_ptr<UnitQueue>;
+
+// A task queue made of QUEUE_SIZE levels (UnitQueue).
+// On submit, a task is assigned a level from the execution time reported by `TimeGetter::get(task)`.
+// On take, the non-empty level with the smallest `accu_consume_time / factor_for_normal` is served.
+// `TimeGetter` must provide a static `get(const TaskPtr &)`; see CPUTimeGetter and IOTimeGetter.
+template <typename TimeGetter>
+class MultiLevelFeedbackQueue : public TaskQueue
+{
+public:
+    // Creates all levels with their time slices and normalization factors.
+    MultiLevelFeedbackQueue();
+
+    // Enqueues one task into its level and wakes one waiting taker. `task` is left empty.
+    void submit(TaskPtr && task) noexcept override;
+
+    // Enqueues every task of `tasks`; the elements are moved from, the vector is not resized.
+    void submit(std::vector<TaskPtr> & tasks) noexcept override;
+
+    // Blocks until a task can be taken (returns true) or the queue is closed (returns false).
+    bool take(TaskPtr & task) noexcept override;
+
+    // Adds `inc_value` to the accumulated consumed time of the level recorded in `task->mlfq_level`.
+    void updateStatistics(const TaskPtr & task, size_t inc_value) noexcept override;
+
+    // Returns true when every level is empty.
+    bool empty() noexcept override;
+
+    // Marks the queue as closed and wakes all waiting takers.
+    void close() noexcept override;
+
+    // Returns the parameters of the given level; `level` must be smaller than QUEUE_SIZE.
+    const UnitQueueInfo & getUnitQueueInfo(size_t level);
+
+public:
+    // Number of levels.
+    static constexpr size_t QUEUE_SIZE = 8;
+
+    // Level i (0-based) adds (i+1)*LEVEL_TIME_SLICE_BASE_NS ns to the time slice of level i-1,
+    // so the time slices are the cumulative sums 0.2s, 0.6s, 1.2s, 2s, 3s, 4.2s, 5.6s, and 7.2s.
+    // When a task's execution time reaches the time slice of its level, it moves to a lower-priority level;
+    // a task that has reached the last threshold stays in the last level.
+    static constexpr int64_t LEVEL_TIME_SLICE_BASE_NS = 200'000'000L;
+
+private:
+    // Recomputes `task->mlfq_level` from `TimeGetter::get(task)`; the level never decreases.
+    void computeQueueLevel(const TaskPtr & task);
+
+private:
+    // `mu` guards `is_closed` and the task deques of `level_queues`; `cv` is signalled on submit and close.
+    std::mutex mu;
+    std::condition_variable cv;
+    bool is_closed = false;
+
+    LoggerPtr logger = Logger::get("MultiLevelFeedbackQueue");
+
+    // From high priority to low priority.
+    // The higher the priority of the queue,
+    // the longer the total execution time of all tasks in the queue,
+    // and the shorter the execution time of each individual task.
+    std::array<UnitQueuePtr, QUEUE_SIZE> level_queues;
+};
+
+// TimeGetter policy for MultiLevelFeedbackQueue: levels tasks by `profile_info.cpu_execute_time`.
+struct CPUTimeGetter
+{
+    // Returns the task's recorded `cpu_execute_time`; `task` must be non-empty (checked by assert only).
+    // NOTE(review): presumably nanoseconds, since it is compared against the ns-based time slices — confirm.
+    static UInt64 get(const TaskPtr & task)
+    {
+        assert(task);
+        return task->profile_info.cpu_execute_time;
+    }
+};
+// Multi-level feedback queue whose levels are driven by CPUTimeGetter.
+using CPUMultiLevelFeedbackQueue = MultiLevelFeedbackQueue<CPUTimeGetter>;
+
+// TimeGetter policy for MultiLevelFeedbackQueue: levels tasks by `profile_info.io_execute_time`.
+struct IOTimeGetter
+{
+    // Returns the task's recorded `io_execute_time`; `task` must be non-empty (checked by assert only).
+    // NOTE(review): presumably nanoseconds, since it is compared against the ns-based time slices — confirm.
+    static UInt64 get(const TaskPtr & task)
+    {
+        assert(task);
+        return task->profile_info.io_execute_time;
+    }
+};
+// Multi-level feedback queue whose levels are driven by IOTimeGetter.
+using IOMultiLevelFeedbackQueue = MultiLevelFeedbackQueue<IOTimeGetter>;
+
+} // namespace DB
diff --git a/dbms/src/Flash/Pipeline/Schedule/TaskQueues/TaskQueue.h b/dbms/src/Flash/Pipeline/Schedule/TaskQueues/TaskQueue.h
@@ -37,6 +37,10 @@ class TaskQueue
     // return false if the queue had been closed.
     virtual bool take(TaskPtr & task) noexcept = 0;
 
+    // Update the execution metrics of the task taken from the queue.
+    // Used to adjust the priority of tasks within a queue.
+    virtual void updateStatistics(const TaskPtr & task, size_t inc_value) noexcept = 0;
+
     virtual bool empty() noexcept = 0;
 
     virtual void close() = 0;

diff --git a/dbms/src/Flash/Pipeline/Schedule/TaskQueues/tests/gtest_fifo.cpp b/dbms/src/Flash/Pipeline/Schedule/TaskQueues/tests/gtest_fifo.cpp
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include <Common/ThreadManager.h>
-#include <Flash/Pipeline/Schedule/TaskQueues/FiFOTaskQueue.h>
+#include <Flash/Pipeline/Schedule/TaskQueues/FIFOTaskQueue.h>
 #include <TestUtils/TiFlashTestBasic.h>
 #include <gtest/gtest.h>